diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
| -rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 1926 |
1 files changed, 1003 insertions, 923 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 8e054d24ab..09cb43769f 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -5,30 +5,30 @@ * * * IDENTIFICATION - * $Id: nbtsort.c,v 1.19 1997/08/19 21:29:46 momjian Exp $ + * $Id: nbtsort.c,v 1.20 1997/09/07 04:39:02 momjian Exp $ * * NOTES * * what we do is: * - generate a set of initial one-block runs, distributed round-robin - * between the output tapes. + * between the output tapes. * - for each pass, - * - swap input and output tape sets, rewinding both and truncating - * the output tapes. - * - merge the current run in each input tape to the current output - * tape. - * - when each input run has been exhausted, switch to another output - * tape and start processing another run. + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start - * merging into the btree leaf pages. (i.e., we do not have to wait - * until we have exactly one tape.) + * merging into the btree leaf pages. (i.e., we do not have to wait + * until we have exactly one tape.) * - as we extract tuples from the final runs, we build the pages for - * each level. when we have only one page on a level, it must be the - * root -- it can be attached to the btree metapage and we are done. + * each level. when we have only one page on a level, it must be the + * root -- it can be attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their - * opaque handles. this is for modularity reasons. + * opaque handles. this is for modularity reasons. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on @@ -58,20 +58,21 @@ #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif #ifdef BTREE_BUILD_STATS #include <tcop/tcopprot.h> -extern int ShowExecutorStats; +extern int ShowExecutorStats; + #endif -static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags); -static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); -static void *_bt_pagestate(Relation index, int flags, int level, bool doupper); -static void _bt_uppershutdown(Relation index, BTPageState *state); +static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags); +static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); +static void *_bt_pagestate(Relation index, int flags, int level, bool doupper); +static void _bt_uppershutdown(Relation index, BTPageState * state); /* * turn on debugging output. @@ -83,18 +84,18 @@ static void _bt_uppershutdown(Relation index, BTPageState *state); #define FASTBUILD_SPOOL #define FASTBUILD_MERGE -#define MAXTAPES (7) -#define TAPEBLCKSZ (MAXBLCKSZ << 2) -#define TAPETEMP "pg_btsortXXXXXX" +#define MAXTAPES (7) +#define TAPEBLCKSZ (MAXBLCKSZ << 2) +#define TAPETEMP "pg_btsortXXXXXX" -extern int NDirectFileRead; -extern int NDirectFileWrite; -extern char *mktemp(char *template); +extern int NDirectFileRead; +extern int NDirectFileWrite; +extern char *mktemp(char *template); /* - * this is what we use to shovel BTItems in and out of memory. it's + * this is what we use to shovel BTItems in and out of memory. it's * bigger than a standard block because we are doing a lot of strictly - * sequential i/o. this is obviously something of a tradeoff since we + * sequential i/o. this is obviously something of a tradeoff since we * are potentially reading a bunch of zeroes off of disk in many * cases. * @@ -104,14 +105,15 @@ extern char *mktemp(char *template); * the only thing like that so i'm not going to worry about wasting a * few bytes. */ -typedef struct { - int bttb_magic; /* magic number */ - int bttb_fd; /* file descriptor */ - int bttb_top; /* top of free space within bttb_data */ - short bttb_ntup; /* number of tuples in this block */ - short bttb_eor; /* End-Of-Run marker */ - char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; -} BTTapeBlock; +typedef struct +{ + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; /* * this structure holds the bookkeeping for a simple balanced multiway @@ -120,13 +122,14 @@ typedef struct { * right now. though if psort was in a condition that i could hack it * to do this, you bet i would.) */ -typedef struct { - int bts_ntapes; - int bts_tape; - BTTapeBlock **bts_itape; /* input tape blocks */ - BTTapeBlock **bts_otape; /* output tape blocks */ - bool isunique; -} BTSpool; +typedef struct +{ + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ + bool isunique; +} BTSpool; /*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether @@ -146,101 +149,102 @@ typedef struct { * what the heck. * *------------------------------------------------------------------------- */ -typedef struct { - Datum *btsk_datum; - char *btsk_nulls; - BTItem btsk_item; -} BTSortKey; +typedef struct +{ + Datum *btsk_datum; + char *btsk_nulls; + BTItem btsk_item; +} BTSortKey; static Relation _bt_sortrel; -static int _bt_nattr; -static BTSpool * _bt_inspool; +static int _bt_nattr; +static BTSpool *_bt_inspool; static void -_bt_isortcmpinit(Relation index, BTSpool *spool) +_bt_isortcmpinit(Relation index, BTSpool * spool) { - _bt_sortrel = index; - _bt_inspool = spool; - _bt_nattr = index->rd_att->natts; + _bt_sortrel = index; + _bt_inspool = spool; + _bt_nattr = index->rd_att->natts; } static int -_bt_isortcmp(BTSortKey *k1, BTSortKey *k2) +_bt_isortcmp(BTSortKey * k1, BTSortKey * k2) { - Datum *k1_datum = k1->btsk_datum; - Datum *k2_datum = k2->btsk_datum; - char *k1_nulls = k1->btsk_nulls; - char *k2_nulls = k2->btsk_nulls; - bool equal_isnull = false; - int i; - - if (k1->btsk_item == (BTItem) NULL) - { - if (k2->btsk_item == (BTItem) NULL) - return(0); /* 1 = 2 */ - return(1); /* 1 > 2 */ - } - else if (k2->btsk_item == (BTItem) NULL) - return(-1); /* 1 < 2 */ - - for (i = 0; i < _bt_nattr; i++) - { - if ( k1_nulls[i] != ' ' ) /* k1 attr is NULL */ + Datum *k1_datum = k1->btsk_datum; + Datum *k2_datum = k2->btsk_datum; + char *k1_nulls = k1->btsk_nulls; + char *k2_nulls = k2->btsk_nulls; + bool equal_isnull = false; + int i; + + if (k1->btsk_item == (BTItem) NULL) { - if ( k2_nulls[i] != ' ' ) /* the same for k2 */ - { - equal_isnull = true; - continue; - } - return (1); /* NULL ">" NOT_NULL */ + if (k2->btsk_item == (BTItem) NULL) + return (0); /* 1 = 2 */ + return (1); /* 1 > 2 */ } - else if ( k2_nulls[i] != ' ' ) /* k2 attr is NULL */ - return (-1); /* NOT_NULL "<" NULL */ - - if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber, - k1_datum[i], k2_datum[i])) - return(1); /* 1 > 2 */ - else if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber, - k2_datum[i], k1_datum[i])) - return(-1); /* 1 < 2 */ - } - - if ( _bt_inspool->isunique && !equal_isnull ) - { - _bt_spooldestroy ((void*)_bt_inspool); - elog (WARN, "Cannot create unique index. Table contains non-unique values"); - } - return(0); /* 1 = 2 */ + else if (k2->btsk_item == (BTItem) NULL) + return (-1); /* 1 < 2 */ + + for (i = 0; i < _bt_nattr; i++) + { + if (k1_nulls[i] != ' ') /* k1 attr is NULL */ + { + if (k2_nulls[i] != ' ') /* the same for k2 */ + { + equal_isnull = true; + continue; + } + return (1); /* NULL ">" NOT_NULL */ + } + else if (k2_nulls[i] != ' ') /* k2 attr is NULL */ + return (-1); /* NOT_NULL "<" NULL */ + + if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, + k1_datum[i], k2_datum[i])) + return (1); /* 1 > 2 */ + else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, + k2_datum[i], k1_datum[i])) + return (-1); /* 1 < 2 */ + } + + if (_bt_inspool->isunique && !equal_isnull) + { + _bt_spooldestroy((void *) _bt_inspool); + elog(WARN, "Cannot create unique index. Table contains non-unique values"); + } + return (0); /* 1 = 2 */ } static void -_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) +_bt_setsortkey(Relation index, BTItem bti, BTSortKey * sk) { - sk->btsk_item = (BTItem) NULL; - sk->btsk_datum = (Datum*) NULL; - sk->btsk_nulls = (char*) NULL; - - if (bti != (BTItem) NULL) - { - IndexTuple it = &(bti->bti_itup); - TupleDesc itdesc = index->rd_att; - Datum *dp = (Datum*) palloc (_bt_nattr * sizeof (Datum)); - char *np = (char*) palloc (_bt_nattr * sizeof (char)); - bool isnull; - int i; - - for (i = 0; i < _bt_nattr; i++) - { - dp[i] = index_getattr(it, i+1, itdesc, &isnull); - if ( isnull ) - np[i] = 'n'; - else - np[i] = ' '; + sk->btsk_item = (BTItem) NULL; + sk->btsk_datum = (Datum *) NULL; + sk->btsk_nulls = (char *) NULL; + + if (bti != (BTItem) NULL) + { + IndexTuple it = &(bti->bti_itup); + TupleDesc itdesc = index->rd_att; + Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum)); + char *np = (char *) palloc(_bt_nattr * sizeof(char)); + bool isnull; + int i; + + for (i = 0; i < _bt_nattr; i++) + { + dp[i] = index_getattr(it, i + 1, itdesc, &isnull); + if (isnull) + np[i] = 'n'; + else + np[i] = ' '; + } + sk->btsk_item = bti; + sk->btsk_datum = dp; + sk->btsk_nulls = np; } - sk->btsk_item = bti; - sk->btsk_datum = dp; - sk->btsk_nulls = np; - } } /*------------------------------------------------------------------------- @@ -254,84 +258,100 @@ _bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) * XXX these probably ought to be generic library functions. *------------------------------------------------------------------------- */ -typedef struct { - int btpqe_tape; /* tape identifier */ - BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */ -} BTPriQueueElem; - -#define MAXELEM MAXTAPES -typedef struct { - int btpq_nelem; - BTPriQueueElem btpq_queue[MAXELEM]; - Relation btpq_rel; -} BTPriQueue; +typedef struct +{ + int btpqe_tape; /* tape identifier */ + BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct +{ + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; /* be sure to call _bt_isortcmpinit first */ #define GREATER(a, b) \ - (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) static void -_bt_pqsift(BTPriQueue *q, int parent) +_bt_pqsift(BTPriQueue * q, int parent) { - int child; - BTPriQueueElem e; - - for (child = parent * 2 + 1; - child < q->btpq_nelem; - child = parent * 2 + 1) { - if (child < q->btpq_nelem - 1) { - if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { - ++child; - } - } - if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { - e = q->btpq_queue[child]; /* struct = */ - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - q->btpq_queue[parent] = e; /* struct = */ - parent = child; - } else { - parent = child + 1; + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) + { + if (child < q->btpq_nelem - 1) + { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1]))) + { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) + { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } + else + { + parent = child + 1; + } } - } } static int -_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +_bt_pqnext(BTPriQueue * q, BTPriQueueElem * e) { - if (q->btpq_nelem < 1) { /* already empty */ - return(-1); - } - *e = q->btpq_queue[0]; /* struct = */ - - if (--q->btpq_nelem < 1) { /* now empty, don't sift */ - return(0); - } - q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ - _bt_pqsift(q, 0); - return(0); + if (q->btpq_nelem < 1) + { /* already empty */ + return (-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) + { /* now empty, don't sift */ + return (0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return (0); } static void -_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +_bt_pqadd(BTPriQueue * q, BTPriQueueElem * e) { - int child, parent; - - if (q->btpq_nelem >= MAXELEM) { - elog(WARN, "_bt_pqadd: queue overflow"); - } - - child = q->btpq_nelem++; - while (child > 0) { - parent = child / 2; - if (GREATER(e, &(q->btpq_queue[parent]))) { - break; - } else { - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - child = parent; + int child, + parent; + + if (q->btpq_nelem >= MAXELEM) + { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) + { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) + { + break; + } + else + { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } } - } - q->btpq_queue[child] = *e; /* struct = */ + q->btpq_queue[child] = *e; /* struct = */ } /*------------------------------------------------------------------------- @@ -339,37 +359,37 @@ _bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) *------------------------------------------------------------------------- */ -#define BTITEMSZ(btitem) \ - ((btitem) ? \ - (IndexTupleDSize((btitem)->bti_itup) + \ - (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ - 0) -#define SPCLEFT(tape) \ - (sizeof((tape)->bttb_data) - (tape)->bttb_top) -#define EMPTYTAPE(tape) \ - ((tape)->bttb_ntup <= 0) -#define BTTAPEMAGIC 0x19660226 +#define BTITEMSZ(btitem) \ + ((btitem) ? \ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 /* * reset the tape header for its next use without doing anything to - * the physical tape file. (setting bttb_top to 0 makes the block + * the physical tape file. (setting bttb_top to 0 makes the block * empty.) */ static void -_bt_tapereset(BTTapeBlock *tape) +_bt_tapereset(BTTapeBlock * tape) { - tape->bttb_eor = 0; - tape->bttb_top = 0; - tape->bttb_ntup = 0; + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; } /* * rewind the physical tape file. */ static void -_bt_taperewind(BTTapeBlock *tape) +_bt_taperewind(BTTapeBlock * tape) { - FileSeek(tape->bttb_fd, 0, SEEK_SET); + FileSeek(tape->bttb_fd, 0, SEEK_SET); } /* @@ -382,17 +402,17 @@ _bt_taperewind(BTTapeBlock *tape) * least you don't have to delete and reinsert the directory entries. */ static void -_bt_tapeclear(BTTapeBlock *tape) +_bt_tapeclear(BTTapeBlock * tape) { - /* blow away the contents of the old file */ - _bt_taperewind(tape); + /* blow away the contents of the old file */ + _bt_taperewind(tape); #if 0 - FileSync(tape->bttb_fd); + FileSync(tape->bttb_fd); #endif - FileTruncate(tape->bttb_fd, 0); + FileTruncate(tape->bttb_fd, 0); - /* reset the buffer */ - _bt_tapereset(tape); + /* reset the buffer */ + _bt_tapereset(tape); } /* @@ -402,43 +422,44 @@ _bt_tapeclear(BTTapeBlock *tape) static BTTapeBlock * _bt_tapecreate(char *fname) { - BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); - if (tape == (BTTapeBlock *) NULL) { - elog(WARN, "_bt_tapecreate: out of memory"); - } + if (tape == (BTTapeBlock *) NULL) + { + elog(WARN, "_bt_tapecreate: out of memory"); + } - tape->bttb_magic = BTTAPEMAGIC; + tape->bttb_magic = BTTAPEMAGIC; - tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); - Assert(tape->bttb_fd >= 0); + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR | O_CREAT | O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); - /* initialize the buffer */ - _bt_tapereset(tape); + /* initialize the buffer */ + _bt_tapereset(tape); - return(tape); + return (tape); } /* * destroy the BTTapeBlock structure and its physical tape file. */ static void -_bt_tapedestroy(BTTapeBlock *tape) +_bt_tapedestroy(BTTapeBlock * tape) { - FileUnlink(tape->bttb_fd); - pfree((void *) tape); + FileUnlink(tape->bttb_fd); + pfree((void *) tape); } /* * flush the tape block to the file, marking End-Of-Run if requested. */ static void -_bt_tapewrite(BTTapeBlock *tape, int eor) +_bt_tapewrite(BTTapeBlock * tape, int eor) { - tape->bttb_eor = eor; - FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); - NDirectFileWrite += TAPEBLCKSZ/MAXBLCKSZ; - _bt_tapereset(tape); + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); + NDirectFileWrite += TAPEBLCKSZ / MAXBLCKSZ; + _bt_tapereset(tape); } /* @@ -447,34 +468,36 @@ _bt_tapewrite(BTTapeBlock *tape, int eor) * * returns: * - 0 if there are no more blocks in the tape or in this run (call - * _bt_tapereset to clear the End-Of-Run marker) + * _bt_tapereset to clear the End-Of-Run marker) * - 1 if a valid block was read */ static int -_bt_taperead(BTTapeBlock *tape) +_bt_taperead(BTTapeBlock * tape) { - int fd; - int nread; - - if (tape->bttb_eor) { - return(0); /* we are already at End-Of-Run */ - } - - /* - * we're clobbering the old tape block, but we do need to save the - * VFD (the one in the block we're reading is bogus). - */ - fd = tape->bttb_fd; - nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); - tape->bttb_fd = fd; - - if (nread != TAPEBLCKSZ) { - Assert(nread == 0); /* we are at EOF */ - return(0); - } - Assert(tape->bttb_magic == BTTAPEMAGIC); - NDirectFileRead += TAPEBLCKSZ/MAXBLCKSZ; - return(1); + int fd; + int nread; + + if (tape->bttb_eor) + { + return (0); /* we are already at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the VFD + * (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) + { + Assert(nread == 0); /* we are at EOF */ + return (0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + NDirectFileRead += TAPEBLCKSZ / MAXBLCKSZ; + return (1); } /* @@ -487,19 +510,20 @@ _bt_taperead(BTTapeBlock *tape) * side effects: * - sets 'pos' to the current position within the block. */ -static BTItem -_bt_tapenext(BTTapeBlock *tape, char **pos) +static BTItem +_bt_tapenext(BTTapeBlock * tape, char **pos) { - Size itemsz; - BTItem bti; - - if (*pos >= tape->bttb_data + tape->bttb_top) { - return((BTItem) NULL); - } - bti = (BTItem) *pos; - itemsz = BTITEMSZ(bti); - *pos += DOUBLEALIGN(itemsz); - return(bti); + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) + { + return ((BTItem) NULL); + } + bti = (BTItem) * pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return (bti); } /* @@ -514,11 +538,11 @@ _bt_tapenext(BTTapeBlock *tape, char **pos) * the beginning of free space. */ static void -_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +_bt_tapeadd(BTTapeBlock * tape, BTItem item, int itemsz) { - memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); - ++tape->bttb_ntup; - tape->bttb_top += DOUBLEALIGN(itemsz); + memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); } /*------------------------------------------------------------------------- @@ -530,41 +554,44 @@ _bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) * create and initialize a spool structure, including the underlying * files. */ -void * +void * _bt_spoolinit(Relation index, int ntapes, bool isunique) { - BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); - int i; - char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); - - if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { - elog(WARN, "_bt_spoolinit: out of memory"); - } - memset((char *) btspool, 0, sizeof(BTSpool)); - btspool->bts_ntapes = ntapes; - btspool->bts_tape = 0; - btspool->isunique = isunique; - - btspool->bts_itape = - (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - btspool->bts_otape = - (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - if (btspool->bts_itape == (BTTapeBlock **) NULL || - btspool->bts_otape == (BTTapeBlock **) NULL) { - elog(WARN, "_bt_spoolinit: out of memory"); - } - - for (i = 0; i < ntapes; ++i) { - btspool->bts_itape[i] = - _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); - btspool->bts_otape[i] = - _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); - } - pfree((void *) fname); - - _bt_isortcmpinit(index, btspool); - - return((void *) btspool); + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) + { + elog(WARN, "_bt_spoolinit: out of memory"); + } + memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + btspool->isunique = isunique; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) + { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) + { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index, btspool); + + return ((void *) btspool); } /* @@ -573,29 +600,32 @@ _bt_spoolinit(Relation index, int ntapes, bool isunique) void _bt_spooldestroy(void *spool) { - BTSpool *btspool = (BTSpool *) spool; - int i; - - for (i = 0; i < btspool->bts_ntapes; ++i) { - _bt_tapedestroy(btspool->bts_otape[i]); - _bt_tapedestroy(btspool->bts_itape[i]); - } - pfree((void *) btspool); + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) + { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); } /* * flush out any dirty output tape blocks */ static void -_bt_spoolflush(BTSpool *btspool) +_bt_spoolflush(BTSpool * btspool) { - int i; + int i; - for (i = 0; i < btspool->bts_ntapes; ++i) { - if (!EMPTYTAPE(btspool->bts_otape[i])) { - _bt_tapewrite(btspool->bts_otape[i], 1); + for (i = 0; i < btspool->bts_ntapes; ++i) + { + if (!EMPTYTAPE(btspool->bts_otape[i])) + { + _bt_tapewrite(btspool->bts_otape[i], 1); + } } - } } /* @@ -605,36 +635,37 @@ _bt_spoolflush(BTSpool *btspool) * output tapes. */ static void -_bt_spoolswap(BTSpool *btspool) +_bt_spoolswap(BTSpool * btspool) { - File tmpfd; - BTTapeBlock *itape; - BTTapeBlock *otape; - int i; + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; - for (i = 0; i < btspool->bts_ntapes; ++i) { - itape = btspool->bts_itape[i]; - otape = btspool->bts_otape[i]; + for (i = 0; i < btspool->bts_ntapes; ++i) + { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; - /* - * swap the input and output VFDs. - */ - tmpfd = itape->bttb_fd; - itape->bttb_fd = otape->bttb_fd; - otape->bttb_fd = tmpfd; + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; - /* - * rewind the new input tape. - */ - _bt_taperewind(itape); - _bt_tapereset(itape); + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); - /* - * clear the new output tape -- it's ok to throw away the old - * inputs. - */ - _bt_tapeclear(otape); - } + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. + */ + _bt_tapeclear(otape); + } } /*------------------------------------------------------------------------- @@ -643,7 +674,7 @@ _bt_spoolswap(BTSpool *btspool) */ /* - * spool 'btitem' into an initial run. as tape blocks are filled, the + * spool 'btitem' into an initial run. as tape blocks are filled, the * block BTItems are qsorted and written into some output tape (it * doesn't matter which; we go round-robin for simplicity). the * initial runs are therefore always just one block. @@ -651,134 +682,137 @@ _bt_spoolswap(BTSpool *btspool) void _bt_spool(Relation index, BTItem btitem, void *spool) { - BTSpool *btspool = (BTSpool *) spool; - BTTapeBlock *itape; - Size itemsz; - - _bt_isortcmpinit (index, btspool); - - itape = btspool->bts_itape[btspool->bts_tape]; - itemsz = BTITEMSZ(btitem); - itemsz = DOUBLEALIGN(itemsz); - - /* - * if this buffer is too full for this BTItemData, or if we have - * run out of BTItems, we need to sort the buffer and write it - * out. in this case, the BTItemData will go into the next tape's - * buffer. - */ - if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { - BTSortKey *parray = (BTSortKey *) NULL; - BTTapeBlock *otape; - BTItem bti; - char *pos; - int btisz; - int it_ntup = itape->bttb_ntup; - int i; + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; - /* - * build an array of pointers to the BTItemDatas on the input - * block. - */ - if (it_ntup > 0) { - parray = - (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey)); - pos = itape->bttb_data; - for (i = 0; i < it_ntup; ++i) { - _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); - } - - /* - * qsort the pointer array. - */ - qsort((void *) parray, it_ntup, sizeof(BTSortKey), - (int (*)(const void *,const void *))_bt_isortcmp); - } + _bt_isortcmpinit(index, btspool); + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); /* - * write the spooled run into the output tape. we copy the - * BTItemDatas in the order dictated by the sorted array of - * BTItems, not the original order. - * - * (since everything was DOUBLEALIGN'd and is all on a single - * tape block, everything had *better* still fit on one tape - * block..) + * if this buffer is too full for this BTItemData, or if we have run + * out of BTItems, we need to sort the buffer and write it out. in + * this case, the BTItemData will go into the next tape's buffer. */ - otape = btspool->bts_otape[btspool->bts_tape]; - for (i = 0; i < it_ntup; ++i) { - bti = parray[i].btsk_item; - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - _bt_tapeadd(otape, bti, btisz); + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) + { + BTSortKey *parray = (BTSortKey *) NULL; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int it_ntup = itape->bttb_ntup; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + if (it_ntup > 0) + { + parray = + (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey)); + pos = itape->bttb_data; + for (i = 0; i < it_ntup; ++i) + { + _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); + } + + /* + * qsort the pointer array. + */ + qsort((void *) parray, it_ntup, sizeof(BTSortKey), + (int (*) (const void *, const void *)) _bt_isortcmp); + } + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single tape + * block, everything had *better* still fit on one tape block..) + */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < it_ntup; ++i) + { + bti = parray[i].btsk_item; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, - &isnull); - printf("_bt_spool: inserted <%x> into output tape %d\n", - d, btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ - } + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, + &isnull); - /* - * the initial runs are always single tape blocks. flush the - * output block, marking End-Of-Run. - */ - _bt_tapewrite(otape, 1); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ + } - /* - * reset the input buffer for the next run. we don't have to - * write it out or anything -- we only use it to hold the - * unsorted BTItemDatas, the output tape contains all the - * sorted stuff. - * - * changing bts_tape changes the output tape and input tape; - * we change itape for the code below. - */ - _bt_tapereset(itape); - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - itape = btspool->bts_itape[btspool->bts_tape]; + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); - /* - * destroy the pointer array. - */ - if (parray != (BTSortKey *) NULL) - { - for (i = 0; i < it_ntup; i++) - { - if ( parray[i].btsk_datum != (Datum*) NULL ) - pfree ((void*)(parray[i].btsk_datum)); - if ( parray[i].btsk_nulls != (char*) NULL ) - pfree ((void*)(parray[i].btsk_nulls)); - } - pfree((void *) parray); + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the unsorted + * BTItemDatas, the output tape contains all the sorted stuff. + * + * changing bts_tape changes the output tape and input tape; we + * change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + if (parray != (BTSortKey *) NULL) + { + for (i = 0; i < it_ntup; i++) + { + if (parray[i].btsk_datum != (Datum *) NULL) + pfree((void *) (parray[i].btsk_datum)); + if (parray[i].btsk_nulls != (char *) NULL) + pfree((void *) (parray[i].btsk_nulls)); + } + pfree((void *) parray); + } } - } - /* insert this item into the current buffer */ - if (btitem != (BTItem) NULL) { - _bt_tapeadd(itape, btitem, itemsz); - } + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) + { + _bt_tapeadd(itape, btitem, itemsz); + } } /* * allocate a new, clean btree page, not linked to any siblings. */ static void -_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +_bt_blnewpage(Relation index, Buffer * buf, Page * page, int flags) { - BTPageOpaque opaque; + BTPageOpaque opaque; - *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); #if 0 - printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); + printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); #endif - *page = BufferGetPage(*buf); - _bt_pageinit(*page, BufferGetPageSize(*buf)); - opaque = (BTPageOpaque) PageGetSpecialPointer(*page); - opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo_flags = flags; + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; } /* @@ -790,42 +824,44 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) static void _bt_slideleft(Relation index, Buffer buf, Page page) { - OffsetNumber off; - OffsetNumber maxoff; - ItemId previi; - ItemId thisii; - - if (!PageIsEmpty(page)) { - maxoff = PageGetMaxOffsetNumber(page); - previi = PageGetItemId(page, P_HIKEY); - for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { - thisii = PageGetItemId(page, off); - *previi = *thisii; - previi = thisii; + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + if (!PageIsEmpty(page)) + { + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) + { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); } - ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); - } } /* * allocate and initialize a new BTPageState. the returned structure * is suitable for immediate use by _bt_buildadd. */ -static void * +static void * _bt_pagestate(Relation index, int flags, int level, bool doupper) { - BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); - - memset((char *) state, 0, sizeof(BTPageState)); - _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); - state->btps_firstoff = InvalidOffsetNumber; - state->btps_lastoff = P_HIKEY; - state->btps_lastbti = (BTItem) NULL; - state->btps_next = (BTPageState *) NULL; - state->btps_level = level; - state->btps_doupper = doupper; - - return((void *) state); + BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); + + memset((char *) state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); + state->btps_firstoff = InvalidOffsetNumber; + state->btps_lastoff = P_HIKEY; + state->btps_lastbti = (BTItem) NULL; + state->btps_next = (BTPageState *) NULL; + state->btps_level = level; + state->btps_doupper = doupper; + + return ((void *) state); } /* @@ -834,19 +870,19 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper) * the page to which the item used to point, e.g., a heap page if * 'opage' is a leaf page). */ -static BTItem +static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend) { - OffsetNumber off; - BTItem obti; - BTItem nbti; + OffsetNumber off; + BTItem obti; + BTItem nbti; - off = atend ? P_HIKEY : P_FIRSTKEY; - obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); - nbti = _bt_formitem(&(obti->bti_itup)); - ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); + off = atend ? P_HIKEY : P_FIRSTKEY; + obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); + nbti = _bt_formitem(&(obti->bti_itup)); + ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); - return(nbti); + return (nbti); } /* @@ -855,26 +891,26 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * we must be careful to observe the following restrictions, placed * upon us by the conventions in nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at - * P_FIRSTKEY. + * P_FIRSTKEY. * - duplicates cannot be split among pages unless the chain of - * duplicates starts at the first data item. + * duplicates starts at the first data item. * * a leaf page being built looks like: * * +----------------+---------------------------------+ - * | PageHeaderData | linp0 linp1 linp2 ... | + * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ - * | ... linpN | ^ first | + * | ... linpN | ^ first | * +-----------+--------------------------------------+ - * | ^ last | - * | | - * | v last | + * | ^ last | + * | | + * | v last | * +-------------+------------------------------------+ - * | | itemN ... | + * | | itemN ... | * +-------------+------------------+-----------------+ - * | ... item3 item2 item1 | "special space" | + * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ - * ^ first + * ^ first * * contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. this is because we reserve linp0 as a @@ -888,216 +924,230 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * * if all keys are unique, 'first' will always be the same as 'last'. */ -static BTItem +static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) { - BTPageState *state = (BTPageState *) pstate; - Buffer nbuf; - Page npage; - BTItem last_bti; - OffsetNumber first_off; - OffsetNumber last_off; - OffsetNumber off; - Size pgspc; - Size btisz; - - nbuf = state->btps_buf; - npage = state->btps_page; - first_off = state->btps_firstoff; - last_off = state->btps_lastoff; - last_bti = state->btps_lastbti; - - pgspc = PageGetFreeSpace(npage); - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - if (pgspc < btisz) { - Buffer obuf = nbuf; - Page opage = npage; - OffsetNumber o, n; - ItemId ii; - ItemId hii; - - _bt_blnewpage(index, &nbuf, &npage, flags); + BTPageState *state = (BTPageState *) pstate; + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) + { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, + n; + ItemId ii; + ItemId hii; - /* - * if 'last' is part of a chain of duplicates that does not - * start at the beginning of the old page, the entire chain is - * copied to the new page; we delete all of the duplicates - * from the old page except the first, which becomes the high - * key item of the old page. - * - * if the chain starts at the beginning of the page or there - * is no chain ('first' == 'last'), we need only copy 'last' - * to the new page. again, 'first' (== 'last') becomes the - * high key of the old page. - * - * note that in either case, we copy at least one item to the - * new page, so 'last_bti' will always be valid. 'bti' will - * never be the first data item on the new page. - */ - if (first_off == P_FIRSTKEY) { - Assert(last_off != P_FIRSTKEY); - first_off = last_off; - } - for (o = first_off, n = P_FIRSTKEY; - o <= last_off; - o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { - ii = PageGetItemId(opage, o); - if ( PageAddItem(npage, PageGetItem(opage, ii), - ii->lp_len, n, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the page in _bt_sort (1)"); + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not start + * at the beginning of the old page, the entire chain is copied to + * the new page; we delete all of the duplicates from the old page + * except the first, which becomes the high key item of the old + * page. + * + * if the chain starts at the beginning of the page or there is no + * chain ('first' == 'last'), we need only copy 'last' to the new + * page. again, 'first' (== 'last') becomes the high key of the + * old page. + * + * note that in either case, we copy at least one item to the new + * page, so 'last_bti' will always be valid. 'bti' will never be + * the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) + { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) + { + ii = PageGetItemId(opage, o); + if (PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); #if 0 #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - BTItem tmpbti = - (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); - Datum d = index_getattr(&(tmpbti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", - d, n, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", + d, n, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif - } - /* - * this loop is backward because PageIndexTupleDelete shuffles - * the tuples to fill holes in the page -- by starting at the - * end and working back, we won't create holes (and thereby - * avoid shuffling). - */ - for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { - PageIndexTupleDelete(opage, o); - } - hii = PageGetItemId(opage, P_HIKEY); - ii = PageGetItemId(opage, first_off); - *hii = *ii; - ii->lp_flags &= ~LP_USED; - ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + } - first_off = P_FIRSTKEY; - last_off = PageGetMaxOffsetNumber(npage); - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + /* + * this loop is backward because PageIndexTupleDelete shuffles the + * tuples to fill holes in the page -- by starting at the end and + * working back, we won't create holes (and thereby avoid + * shuffling). + */ + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) + { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - /* - * set the page (side link) pointers. - */ - { - BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); - BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); - - oopaque->btpo_next = BufferGetBlockNumber(nbuf); - nopaque->btpo_prev = BufferGetBlockNumber(obuf); - nopaque->btpo_next = P_NONE; - - if ( _bt_itemcmp(index, _bt_nattr, - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), - BTEqualStrategyNumber) ) - oopaque->btpo_flags |= BTP_CHAIN; - } + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); - /* - * copy the old buffer's minimum key to its parent. if we - * don't have a parent, we have to create one; this adds a new - * btree level. - */ - if (state->btps_doupper) { - BTItem nbti; - - if (state->btps_next == (BTPageState *) NULL) { - state->btps_next = - _bt_pagestate(index, 0, state->btps_level + 1, true); - } - nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); - _bt_buildadd(index, state->btps_next, nbti, 0); - pfree((void *) nbti); + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + + if (_bt_itemcmp(index, _bt_nattr, + (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), + (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), + BTEqualStrategyNumber)) + oopaque->btpo_flags |= BTP_CHAIN; + } + + /* + * copy the old buffer's minimum key to its parent. if we don't + * have a parent, we have to create one; this adds a new btree + * level. + */ + if (state->btps_doupper) + { + BTItem nbti; + + if (state->btps_next == (BTPageState *) NULL) + { + state->btps_next = + _bt_pagestate(index, 0, state->btps_level + 1, true); + } + nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); + _bt_buildadd(index, state->btps_next, nbti, 0); + pfree((void *) nbti); + } + + /* + * write out the old stuff. we never want to see it again, so we + * can give up our lock (if we had one; BuildingBtree is set, so + * we aren't locking). + */ + _bt_wrtbuf(index, obuf); } /* - * write out the old stuff. we never want to see it again, so - * we can give up our lock (if we had one; BuildingBtree is - * set, so we aren't locking). + * if this item is different from the last item added, we start a new + * chain of duplicates. */ - _bt_wrtbuf(index, obuf); - } - - /* - * if this item is different from the last item added, we start a - * new chain of duplicates. - */ - off = OffsetNumberNext(last_off); - if ( PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the page in _bt_sort (2)"); + off = OffsetNumberNext(last_off); + if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)"); #if 0 #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); - printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", - d, off, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); + + printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", + d, off, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif - if (last_bti == (BTItem) NULL) - { - first_off = P_FIRSTKEY; - } - else if ( !_bt_itemcmp(index, _bt_nattr, - bti, last_bti, BTEqualStrategyNumber) ) - { - first_off = off; - } - last_off = off; - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); - - state->btps_buf = nbuf; - state->btps_page = npage; - state->btps_lastbti = last_bti; - state->btps_lastoff = last_off; - state->btps_firstoff = first_off; - - return(last_bti); + if (last_bti == (BTItem) NULL) + { + first_off = P_FIRSTKEY; + } + else if (!_bt_itemcmp(index, _bt_nattr, + bti, last_bti, BTEqualStrategyNumber)) + { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; + + return (last_bti); } static void -_bt_uppershutdown(Relation index, BTPageState *state) +_bt_uppershutdown(Relation index, BTPageState * state) { - BTPageState *s; - BlockNumber blkno; - BTPageOpaque opaque; - BTItem bti; + BTPageState *s; + BlockNumber blkno; + BTPageOpaque opaque; + BTItem bti; - for (s = state; s != (BTPageState *) NULL; s = s->btps_next) { - blkno = BufferGetBlockNumber(s->btps_buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); + for (s = state; s != (BTPageState *) NULL; s = s->btps_next) + { + blkno = BufferGetBlockNumber(s->btps_buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); - /* - * if this is the root, attach it to the metapage. otherwise, - * stick the minimum key of the last page on this level (which - * has not been split, or else it wouldn't be the last page) - * into its parent. this may cause the last page of upper - * levels to split, but that's not a problem -- we haven't - * gotten to them yet. - */ - if (s->btps_doupper) { - if (s->btps_next == (BTPageState *) NULL) { - opaque->btpo_flags |= BTP_ROOT; - _bt_metaproot(index, blkno, s->btps_level + 1); - } else { - bti = _bt_minitem(s->btps_page, blkno, 0); - _bt_buildadd(index, s->btps_next, bti, 0); - pfree((void *) bti); - } - } + /* + * if this is the root, attach it to the metapage. otherwise, + * stick the minimum key of the last page on this level (which has + * not been split, or else it wouldn't be the last page) into its + * parent. this may cause the last page of upper levels to split, + * but that's not a problem -- we haven't gotten to them yet. + */ + if (s->btps_doupper) + { + if (s->btps_next == (BTPageState *) NULL) + { + opaque->btpo_flags |= BTP_ROOT; + _bt_metaproot(index, blkno, s->btps_level + 1); + } + else + { + bti = _bt_minitem(s->btps_page, blkno, 0); + _bt_buildadd(index, s->btps_next, bti, 0); + pfree((void *) bti); + } + } - /* - * this is the rightmost page, so the ItemId array needs to be - * slid back one slot. - */ - _bt_slideleft(index, s->btps_buf, s->btps_page); - _bt_wrtbuf(index, s->btps_buf); - } + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, s->btps_buf, s->btps_page); + _bt_wrtbuf(index, s->btps_buf); + } } /* @@ -1105,203 +1155,230 @@ _bt_uppershutdown(Relation index, BTPageState *state) * merging passes until at most one run is left in each tape. at that * point, merge the final tape runs into a set of btree leaves. * - * XXX three nested loops? gross. cut me up into smaller routines. + * XXX three nested loops? gross. cut me up into smaller routines. */ static void -_bt_merge(Relation index, BTSpool *btspool) +_bt_merge(Relation index, BTSpool * btspool) { - BTPageState *state; - BTPriQueue q; - BTPriQueueElem e; - BTSortKey btsk; - BTItem bti; - BTTapeBlock *itape; - BTTapeBlock *otape; - char *tapepos[MAXTAPES]; - int tapedone[MAXTAPES]; - int t; - int goodtapes; - int npass; - int nruns; - Size btisz; - bool doleaf = false; - - /* - * initialize state needed for the merge into the btree leaf pages. - */ - state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); - - npass = 0; - do { /* pass */ + BTPageState *state; + BTPriQueue q; + BTPriQueueElem e; + BTSortKey btsk; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int npass; + int nruns; + Size btisz; + bool doleaf = false; + /* - * each pass starts by flushing the previous outputs and - * swapping inputs and outputs. flushing sets End-of-Run for - * any dirty output tapes. swapping clears the new output - * tapes and rewinds the new input tapes. + * initialize state needed for the merge into the btree leaf pages. */ - btspool->bts_tape = btspool->bts_ntapes - 1; - _bt_spoolflush(btspool); - _bt_spoolswap(btspool); - - ++npass; - nruns = 0; - - for (;;) { /* run */ - /* - * each run starts by selecting a new output tape. the - * merged results of a given run are always sent to this - * one tape. - */ - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - otape = btspool->bts_otape[btspool->bts_tape]; - - /* - * initialize the priority queue by loading it with the - * first element of the given run in each tape. since we - * are starting a new run, we reset the tape (clearing the - * End-Of-Run marker) before reading it. this means that - * _bt_taperead will return 0 only if the tape is actually - * at EOF. - */ - memset((char *) &q, 0, sizeof(BTPriQueue)); - goodtapes = 0; - for (t = 0; t < btspool->bts_ntapes; ++t) { - itape = btspool->bts_itape[t]; - tapepos[t] = itape->bttb_data; - tapedone[t] = 0; - _bt_tapereset(itape); - do { - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) { - ++goodtapes; - e.btpqe_tape = t; - _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), - &(e.btpqe_item)); - if (e.btpqe_item.btsk_item != (BTItem) NULL) { - _bt_pqadd(&q, &e); - } - } - } - /* - * if we don't have any tapes with any input (i.e., they - * are all at EOF), there is no work to do in this run -- - * we must be done with this pass. - */ - if (goodtapes == 0) { - break; /* for */ - } - ++nruns; - - /* - * output the smallest element from the queue until there - * are no more. - */ - while (_bt_pqnext(&q, &e) >= 0) { /* item */ + state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); + + npass = 0; + do + { /* pass */ + /* - * replace the element taken from priority queue, - * fetching a new block if needed. a tape can run out - * if it hits either End-Of-Run or EOF. + * each pass starts by flushing the previous outputs and swapping + * inputs and outputs. flushing sets End-of-Run for any dirty + * output tapes. swapping clears the new output tapes and rewinds + * the new input tapes. */ - t = e.btpqe_tape; - btsk = e.btpqe_item; - bti = btsk.btsk_item; - if (bti != (BTItem) NULL) { - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - if (doleaf) { - _bt_buildadd(index, state, bti, BTP_LEAF); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + ++npass; + nruns = 0; + + for (;;) + { /* run */ + + /* + * each run starts by selecting a new output tape. the merged + * results of a given run are always sent to this one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the first + * element of the given run in each tape. since we are + * starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually at + * EOF. + */ + memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", - npass, nruns, d, t, - BufferGetBlockNumber(state->btps_buf)); + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + tapedone[t] = 0; + _bt_tapereset(itape); + do + { + if (_bt_taperead(itape) == 0) + { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) + { + ++goodtapes; + e.btpqe_tape = t; + _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), + &(e.btpqe_item)); + if (e.btpqe_item.btsk_item != (BTItem) NULL) + { + _bt_pqadd(&q, &e); + } + } } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } else { - if (SPCLEFT(otape) < btisz) { - /* - * if it's full, write it out and add the - * item to the next block. (since we will - * be adding another tuple immediately - * after this, we can be sure that there - * will be at least one more block in this - * run and so we know we do *not* want to - * set End-Of-Run here.) - */ - _bt_tapewrite(otape, 0); - } - _bt_tapeadd(otape, bti, btisz); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + + /* + * if we don't have any tapes with any input (i.e., they are + * all at EOF), there is no work to do in this run -- we must + * be done with this pass. + */ + if (goodtapes == 0) { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", - npass, nruns, d, t, - btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } - - if ( btsk.btsk_datum != (Datum*) NULL ) - pfree ((void*)(btsk.btsk_datum)); - if ( btsk.btsk_nulls != (char*) NULL ) - pfree ((void*)(btsk.btsk_nulls)); - - } - itape = btspool->bts_itape[t]; - if (!tapedone[t]) { - BTItem newbti = _bt_tapenext(itape, &tapepos[t]); - - if (newbti == (BTItem) NULL) { - do { - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) { - tapepos[t] = itape->bttb_data; - newbti = _bt_tapenext(itape, &tapepos[t]); + break; /* for */ } - } - if (newbti != (BTItem) NULL) { - BTPriQueueElem nexte; - - nexte.btpqe_tape = t; - _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); - _bt_pqadd(&q, &nexte); - } + ++nruns; + + /* + * output the smallest element from the queue until there are + * no more. + */ + while (_bt_pqnext(&q, &e) >= 0) + { /* item */ + + /* + * replace the element taken from priority queue, fetching + * a new block if needed. a tape can run out if it hits + * either End-Of-Run or EOF. + */ + t = e.btpqe_tape; + btsk = e.btpqe_item; + bti = btsk.btsk_item; + if (bti != (BTItem) NULL) + { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) + { + _bt_buildadd(index, state, bti, BTP_LEAF); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", + npass, nruns, d, t, + BufferGetBlockNumber(state->btps_buf)); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + } + else + { + if (SPCLEFT(otape) < btisz) + { + + /* + * if it's full, write it out and add the item + * to the next block. (since we will be + * adding another tuple immediately after + * this, we can be sure that there will be at + * least one more block in this run and so we + * know we do *not* want to set End-Of-Run + * here.) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", + npass, nruns, d, t, + btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + } + + if (btsk.btsk_datum != (Datum *) NULL) + pfree((void *) (btsk.btsk_datum)); + if (btsk.btsk_nulls != (char *) NULL) + pfree((void *) (btsk.btsk_nulls)); + + } + itape = btspool->bts_itape[t]; + if (!tapedone[t]) + { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) + { + do + { + if (_bt_taperead(itape) == 0) + { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) + { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) + { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + + /* + * that's it for this run. flush the output tape, marking + * End-of-Run. + */ + _bt_tapewrite(otape, 1); + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than we have + * tapes, we know we have at most one run in each tape. this + * means that we are ready to merge into the final btree leaf + * pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) + { + doleaf = true; } - } /* item */ - - /* - * that's it for this run. flush the output tape, marking - * End-of-Run. - */ - _bt_tapewrite(otape, 1); - } /* run */ - - /* - * we are here because we ran out of input on all of the input - * tapes. - * - * if this pass did not generate more actual output runs than - * we have tapes, we know we have at most one run in each - * tape. this means that we are ready to merge into the final - * btree leaf pages instead of merging into a tape file. - */ - if (nruns <= btspool->bts_ntapes) { - doleaf = true; - } - } while (nruns > 0); /* pass */ + } while (nruns > 0); /* pass */ - _bt_uppershutdown(index, state); + _bt_uppershutdown(index, state); } @@ -1320,62 +1397,65 @@ _bt_merge(Relation index, BTSpool *btspool) void _bt_upperbuild(Relation index) { - Buffer rbuf; - BlockNumber blk; - Page rpage; - BTPageOpaque ropaque; - BTPageState *state; - BTItem nbti; - - /* - * find the first leaf block. while we're at it, clear the - * BTP_ROOT flag that we set while building it (so we could find - * it later). - */ - rbuf = _bt_getroot(index, BT_WRITE); - blk = BufferGetBlockNumber(rbuf); - rpage = BufferGetPage(rbuf); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - ropaque->btpo_flags &= ~BTP_ROOT; - _bt_wrtbuf(index, rbuf); - - state = (BTPageState *) _bt_pagestate(index, 0, 0, true); - - /* for each page... */ - do { -#if 0 - printf("\t\tblk=%d\n", blk); -#endif - rbuf = _bt_getbuf(index, blk, BT_READ); + Buffer rbuf; + BlockNumber blk; + Page rpage; + BTPageOpaque ropaque; + BTPageState *state; + BTItem nbti; + + /* + * find the first leaf block. while we're at it, clear the BTP_ROOT + * flag that we set while building it (so we could find it later). + */ + rbuf = _bt_getroot(index, BT_WRITE); + blk = BufferGetBlockNumber(rbuf); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - - /* for each item... */ - if (!PageIsEmpty(rpage)) { - /* - * form a new index tuple corresponding to the minimum key - * of the lower page and insert it into a page at this - * level. - */ - nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); + ropaque->btpo_flags &= ~BTP_ROOT; + _bt_wrtbuf(index, rbuf); + + state = (BTPageState *) _bt_pagestate(index, 0, 0, true); + + /* for each page... */ + do + { +#if 0 + printf("\t\tblk=%d\n", blk); +#endif + rbuf = _bt_getbuf(index, blk, BT_READ); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* for each item... */ + if (!PageIsEmpty(rpage)) + { + + /* + * form a new index tuple corresponding to the minimum key of + * the lower page and insert it into a page at this level. + */ + nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, - &isnull); - printf("_bt_upperbuild: inserting <%x> at %d\n", - d, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - _bt_buildadd(index, state, nbti, 0); - pfree((void *) nbti); - } - blk = ropaque->btpo_next; - _bt_relbuf(index, rbuf, BT_READ); - } while (blk != P_NONE); - - _bt_uppershutdown(index, state); + { + bool isnull; + Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, + &isnull); + + printf("_bt_upperbuild: inserting <%x> at %d\n", + d, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + _bt_buildadd(index, state, nbti, 0); + pfree((void *) nbti); + } + blk = ropaque->btpo_next; + _bt_relbuf(index, rbuf, BT_READ); + } while (blk != P_NONE); + + _bt_uppershutdown(index, state); } + #endif /* @@ -1385,17 +1465,17 @@ _bt_upperbuild(Relation index) void _bt_leafbuild(Relation index, void *spool) { - _bt_isortcmpinit (index, (BTSpool *) spool); + _bt_isortcmpinit(index, (BTSpool *) spool); #ifdef BTREE_BUILD_STATS - if ( ShowExecutorStats ) - { - fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); - ShowUsage (); - ResetUsage (); - } + if (ShowExecutorStats) + { + fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); + ShowUsage(); + ResetUsage(); + } #endif - _bt_merge(index, (BTSpool *) spool); + _bt_merge(index, (BTSpool *) spool); } |
