/*------------------------------------------------------------------------- * btsort.c-- * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Id: nbtsort.c,v 1.4 1996/10/20 10:53:13 scrappy Exp $ * * NOTES * * what we do is: * - generate a set of initial one-block runs, distributed round-robin * between the output tapes. * - for each pass, * - swap input and output tape sets, rewinding both and truncating * the output tapes. * - merge the current run in each input tape to the current output * tape. * - when each input run has been exhausted, switch to another output * tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start * merging into the btree leaf pages. * - every time we complete a level of the btree, we can construct the * next level up. when we have only one page on a level, it can be * attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their * opaque handles. this is for modularity reasons (i prefer not to * export these structures without good reason). * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random * probes into the btree thrash the buffer pool. * * this code currently packs the pages to 100% of capacity. this is * not wise, since *any* insertion will cause splitting. filling to * something like the standard 70% steady-state load factor for btrees * would probably be better. * * somebody desperately needs to figure out how to do a better job of * balancing the merge passes -- the fan-in on the final merges can be * pretty poor, which is bad for performance. *------------------------------------------------------------------------- */ #include "postgres.h" #include "catalog/pg_attribute.h" #include "access/attnum.h" #include "nodes/pg_list.h" #include "access/tupdesc.h" #include "storage/fd.h" #include "catalog/pg_am.h" #include "catalog/pg_class.h" #include "nodes/nodes.h" #include "rewrite/prs2lock.h" #include "access/skey.h" #include "access/strat.h" #include "utils/rel.h" #include "storage/block.h" #include "storage/off.h" #include "storage/itemptr.h" #include "access/itup.h" #include "access/funcindex.h" #include "storage/itemid.h" #include "storage/item.h" #include "storage/buf.h" #include "storage/bufpage.h" #include #include "utils/nabstime.h" #include "access/htup.h" #include "utils/tqual.h" #include "access/relscan.h" #include "access/sdir.h" #include "access/nbtree.h" #include #include #include "storage/ipc.h" #include "storage/bufmgr.h" #ifdef FASTBUILD #define MAXTAPES (7) #define TAPEBLCKSZ (BLCKSZ << 2) #define TAPETEMP "pg_btsortXXXXXX" /*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether * the key in the left BTItem is {<,=,>} the key in the right BTItem. * * we want to use _bt_isortcmp as a comparison function for qsort(3), * but it needs extra arguments, so we "pass them in" as global * variables. ick. fortunately, they are the same throughout the * build, so we need do this only once. this is why you must call * _bt_isortcmpinit before the call to qsort(3). * * a NULL BTItem is always assumed to be greater than any actual * value; our heap routines (see below) assume that the smallest * element in the heap is returned. that way, NULL values from the * exhausted tapes can sift down to the bottom of the heap. in point * of fact we just don't replace the elements of exhausted tapes, but * what the heck. * *------------------------------------------------------------------------- */ static Relation _bt_sortrel; static void _bt_isortcmpinit(Relation index) { _bt_sortrel = index; } static int _bt_isortcmp(const void *bti1p,const void *bti2p) { BTItem bti1 = *(BTItem *)bti1p; BTItem bti2 = *(BTItem *)bti2p; if (bti1 == (BTItem) NULL) { if (bti2 == (BTItem) NULL) { return(0); /* 1 = 2 */ } return(1); /* 1 > 2 */ } else if (bti2 == (BTItem) NULL) { return(-1); /* 1 < 2 */ } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2, BTGreaterStrategyNumber)) { return(1); /* 1 > 2 */ } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1, BTGreaterStrategyNumber)) { return(-1); /* 1 < 2 */ } return(0); /* 1 = 2 */ } /*------------------------------------------------------------------------- * priority queue methods * * these were more-or-less lifted from the heap section of the 1984 * edition of gonnet's book on algorithms and data structures. they * are coded so that the smallest element in the heap is returned (we * use them for merging sorted runs). * * XXX these probably ought to be generic library functions. *------------------------------------------------------------------------- */ typedef struct { int btpqe_tape; /* tape identifier */ BTItem btpqe_item; /* pointer to BTItem in tape buffer */ } BTPriQueueElem; #define MAXELEM MAXTAPES typedef struct { int btpq_nelem; BTPriQueueElem btpq_queue[MAXELEM]; Relation btpq_rel; } BTPriQueue; /* be sure to call _bt_isortcmpinit first */ #define GREATER(a, b) \ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) static void _bt_pqsift(BTPriQueue *q, int parent) { int child; BTPriQueueElem e; for (child = parent * 2 + 1; child < q->btpq_nelem; child = parent * 2 + 1) { if (child < q->btpq_nelem - 1) { if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { ++child; } } if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { e = q->btpq_queue[child]; /* struct = */ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ q->btpq_queue[parent] = e; /* struct = */ parent = child; } else { parent = child + 1; } } } static int _bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) { if (q->btpq_nelem < 1) { /* already empty */ return(-1); } *e = q->btpq_queue[0]; /* struct = */ if (--q->btpq_nelem < 1) { /* now empty, don't sift */ return(0); } q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ _bt_pqsift(q, 0); return(0); } static void _bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) { int child, parent; if (q->btpq_nelem >= MAXELEM) { elog(WARN, "_bt_pqadd: queue overflow"); } child = q->btpq_nelem++; while (child > 0) { parent = child / 2; if (GREATER(e, &(q->btpq_queue[parent]))) { break; } else { q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ child = parent; } } q->btpq_queue[child] = *e; /* struct = */ } /*------------------------------------------------------------------------- * tape methods *------------------------------------------------------------------------- */ #define BTITEMSZ(btitem) \ ((btitem) ? \ (IndexTupleDSize((btitem)->bti_itup) + \ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ 0) #define SPCLEFT(tape) \ (sizeof((tape)->bttb_data) - (tape)->bttb_top) #define EMPTYTAPE(tape) \ ((tape)->bttb_ntup <= 0) #define BTTAPEMAGIC 0x19660226 /* * this is what we use to shovel BTItems in and out of memory. it's * bigger than a standard block because we are doing a lot of strictly * sequential i/o. this is obviously something of a tradeoff since we * are potentially reading a bunch of zeroes off of disk in many * cases. * * BTItems are packed in and DOUBLEALIGN'd. * * the fd should not be going out to disk, strictly speaking, but it's * the only thing like that so i'm not going to worry about wasting a * few bytes. */ typedef struct { int bttb_magic; /* magic number */ int bttb_fd; /* file descriptor */ int bttb_top; /* top of free space within bttb_data */ short bttb_ntup; /* number of tuples in this block */ short bttb_eor; /* End-Of-Run marker */ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; } BTTapeBlock; /* * reset the tape header for its next use without doing anything to * the physical tape file. (setting bttb_top to 0 makes the block * empty.) */ static void _bt_tapereset(BTTapeBlock *tape) { tape->bttb_eor = 0; tape->bttb_top = 0; tape->bttb_ntup = 0; } /* * rewind the physical tape file. */ static void _bt_taperewind(BTTapeBlock *tape) { (void) FileSeek(tape->bttb_fd, 0, SEEK_SET); } /* * destroy the contents of the physical tape file without destroying * the tape data structure or removing the physical tape file. * * we use the VFD version of ftruncate(2) to do this rather than * unlinking and recreating the file. you still have to wait while * the OS frees up all of the file system blocks and stuff, but at * least you don't have to delete and reinsert the directory entries. */ static void _bt_tapeclear(BTTapeBlock *tape) { /* blow away the contents of the old file */ _bt_taperewind(tape); #if 0 FileSync(tape->bttb_fd); #endif FileTruncate(tape->bttb_fd, 0); /* reset the buffer */ _bt_tapereset(tape); } /* * create a new BTTapeBlock, allocating memory for the data structure * as well as opening a physical tape file. */ static BTTapeBlock * _bt_tapecreate(char *fname) { BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); if (tape == (BTTapeBlock *) NULL) { elog(WARN, "_bt_tapecreate: out of memory"); } tape->bttb_magic = BTTAPEMAGIC; tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); Assert(tape->bttb_fd >= 0); /* initialize the buffer */ _bt_tapereset(tape); return(tape); } /* * destroy the BTTapeBlock structure and its physical tape file. */ static void _bt_tapedestroy(BTTapeBlock *tape) { FileUnlink(tape->bttb_fd); pfree((void *) tape); } /* * flush the tape block to the file, marking End-Of-Run if requested. */ static void _bt_tapewrite(BTTapeBlock *tape, int eor) { tape->bttb_eor = eor; FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); _bt_tapereset(tape); } /* * read a tape block from the file, overwriting the current contents * of the buffer. * * returns: * - 0 if there are no more blocks in the tape or in this run (call * _bt_tapereset to clear the End-Of-Run marker) * - 1 if a valid block was read */ static int _bt_taperead(BTTapeBlock *tape) { int fd; int nread; if (tape->bttb_eor) { return(0); /* we are at End-Of-Run */ } /* * we're clobbering the old tape block, but we do need to save the * VFD (the one in the block we're reading is bogus). */ fd = tape->bttb_fd; nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); tape->bttb_fd = fd; if (nread != TAPEBLCKSZ) { Assert(nread == 0); /* we are at EOF */ return(0); } Assert(tape->bttb_magic == BTTAPEMAGIC); return(1); } /* * get the next BTItem from a tape block. * * returns: * - NULL if we have run out of BTItems * - a pointer to the BTItemData in the block otherwise * * side effects: * - sets 'pos' to the current position within the block. */ static BTItem _bt_tapenext(BTTapeBlock *tape, char **pos) { Size itemsz; BTItem bti; if (*pos >= tape->bttb_data + tape->bttb_top) { return((BTItem) NULL); } bti = (BTItem) *pos; itemsz = BTITEMSZ(bti); *pos += DOUBLEALIGN(itemsz); return(bti); } /* * copy a BTItem into a tape block. * * assumes that we have already checked to see if the block has enough * space for the item. * * side effects: * * - advances the 'top' pointer in the tape block header to point to * the beginning of free space. */ static void _bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) { (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); ++tape->bttb_ntup; tape->bttb_top += DOUBLEALIGN(itemsz); } /*------------------------------------------------------------------------- * spool methods *------------------------------------------------------------------------- */ /* * this structure holds the bookkeeping for a simple balanced multiway * merge. (polyphase merging is hairier than i want to get into right * now, and i don't see why i have to care how many "tapes" i use * right now. though if psort was in a condition that i could hack it * to do this, you bet i would.) */ typedef struct { int bts_ntapes; int bts_tape; BTTapeBlock **bts_itape; /* input tape blocks */ BTTapeBlock **bts_otape; /* output tape blocks */ } BTSpool; /* * create and initialize a spool structure, including the underlying * files. */ void * _bt_spoolinit(Relation index, int ntapes) { char *mktemp(); BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); int i; char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { elog(WARN, "_bt_spoolinit: out of memory"); } (void) memset((char *) btspool, 0, sizeof(BTSpool)); btspool->bts_ntapes = ntapes; btspool->bts_tape = 0; btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); if (btspool->bts_itape == (BTTapeBlock **) NULL || btspool->bts_otape == (BTTapeBlock **) NULL) { elog(WARN, "_bt_spoolinit: out of memory"); } for (i = 0; i < ntapes; ++i) { btspool->bts_itape[i] = _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); btspool->bts_otape[i] = _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); } pfree((void *) fname); _bt_isortcmpinit(index); return((void *) btspool); } /* * clean up a spool structure and its substructures. */ void _bt_spooldestroy(void *spool) { BTSpool *btspool = (BTSpool *) spool; int i; for (i = 0; i < btspool->bts_ntapes; ++i) { _bt_tapedestroy(btspool->bts_otape[i]); _bt_tapedestroy(btspool->bts_itape[i]); } pfree((void *) btspool); } /* * flush out any dirty output tape blocks */ static void _bt_spoolflush(BTSpool *btspool) { int i; for (i = 0; i < btspool->bts_ntapes; ++i) { if (!EMPTYTAPE(btspool->bts_otape[i])) { _bt_tapewrite(btspool->bts_otape[i], 1); } } } /* * swap input tapes and output tapes by swapping their file * descriptors. additional preparation for the next merge pass * includes rewinding the new input tapes and clearing out the new * output tapes. */ static void _bt_spoolswap(BTSpool *btspool) { File tmpfd; BTTapeBlock *itape; BTTapeBlock *otape; int i; for (i = 0; i < btspool->bts_ntapes; ++i) { itape = btspool->bts_itape[i]; otape = btspool->bts_otape[i]; /* * swap the input and output VFDs. */ tmpfd = itape->bttb_fd; itape->bttb_fd = otape->bttb_fd; otape->bttb_fd = tmpfd; /* * rewind the new input tape. */ _bt_taperewind(itape); _bt_tapereset(itape); /* * clear the new output tape -- it's ok to throw away the old * inputs. */ _bt_tapeclear(otape); } } /*------------------------------------------------------------------------- * sorting routines *------------------------------------------------------------------------- */ /* * spool 'btitem' into an initial run. as tape blocks are filled, the * block BTItems are qsorted and written into some output tape (it * doesn't matter which; we go round-robin for simplicity). the * initial runs are therefore always just one block. */ void _bt_spool(Relation index, BTItem btitem, void *spool) { BTSpool *btspool = (BTSpool *) spool; BTTapeBlock *itape; Size itemsz; itape = btspool->bts_itape[btspool->bts_tape]; itemsz = BTITEMSZ(btitem); itemsz = DOUBLEALIGN(itemsz); /* * if this buffer is too full for this BTItemData, or if we have * run out of BTItems, we need to sort the buffer and write it * out. in this case, the BTItemData will go into the next tape's * buffer. */ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { BTItem *parray; BTTapeBlock *otape; BTItem bti; char *pos; int btisz; int i; /* * build an array of pointers to the BTItemDatas on the input * block. */ parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); if (parray == (BTItem *) NULL) { elog(WARN, "_bt_spool: out of memory"); } pos = itape->bttb_data; for (i = 0; i < itape->bttb_ntup; ++i) { parray[i] = _bt_tapenext(itape, &pos); } /* * qsort the pointer array. */ _bt_isortcmpinit(index); qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); /* * write the spooled run into the output tape. we copy the * BTItemDatas in the order dictated by the sorted array of * BTItems, not the original order. * * (since everything was DOUBLEALIGN'd and is all on a single * page, everything had *better* still fit on one page..) */ otape = btspool->bts_otape[btspool->bts_tape]; for (i = 0; i < itape->bttb_ntup; ++i) { bti = parray[i]; btisz = BTITEMSZ(bti); btisz = DOUBLEALIGN(btisz); _bt_tapeadd(otape, bti, btisz); #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_spool: inserted <%x> into output tape %d\n", d, btspool->bts_tape); } #endif /* FASTBUILD_DEBUG */ } /* * the initial runs are always single tape blocks. flush the * output block, marking End-Of-Run. */ _bt_tapewrite(otape, 1); /* * reset the input buffer for the next run. we don't have to * write it out or anything -- we only use it to hold the * unsorted BTItemDatas, the output tape contains all the * sorted stuff. * * changing bts_tape changes the output tape and input tape; * we change itape for the code below. */ _bt_tapereset(itape); btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; itape = btspool->bts_itape[btspool->bts_tape]; /* * destroy the pointer array. */ pfree((void *) parray); } /* insert this item into the current buffer */ if (btitem != (BTItem) NULL) { _bt_tapeadd(itape, btitem, itemsz); } } /* * allocate a new, clean btree page, not linked to any siblings. */ static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) { BTPageOpaque opaque; *buf = _bt_getbuf(index, P_NEW, BT_WRITE); *page = BufferGetPage(*buf); _bt_pageinit(*page, BufferGetPageSize(*buf)); opaque = (BTPageOpaque) PageGetSpecialPointer(*page); opaque->btpo_prev = opaque->btpo_next = P_NONE; opaque->btpo_flags = flags; } /* * slide an array of ItemIds back one slot (from P_FIRSTKEY to * P_HIKEY). we need to do this when we discover that we have built * an ItemId array in what has turned out to be a P_RIGHTMOST page. */ static void _bt_slideleft(Relation index, Buffer buf, Page page) { OffsetNumber off; OffsetNumber maxoff; ItemId previi; ItemId thisii; maxoff = PageGetMaxOffsetNumber(page); previi = PageGetItemId(page, P_HIKEY); for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { thisii = PageGetItemId(page, off); *previi = *thisii; previi = thisii; } ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); } typedef struct { Buffer btps_buf; Page btps_page; BTItem btps_lastbti; OffsetNumber btps_lastoff; OffsetNumber btps_firstoff; } BTPageState; /* * add an item to a disk page from a merge tape block. * * we must be careful to observe the following restrictions, placed * upon us by the conventions in nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at * P_FIRSTKEY. * - duplicates cannot be split among pages unless the chain of * duplicates starts at the first data item. * * a leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ * | ... linpN | ^ first | * +-----------+--------------------------------------+ * | ^ last | * | | * | v last | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * ^ first * * contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. this is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear * linpN. * * 'last' pointers indicate the last offset/item added to the page. * 'first' pointers indicate the first offset/item that is part of a * chain of duplicates extending from 'first' to 'last'. * * if all keys are unique, 'first' will always be the same as 'last'. */ static void _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) { Buffer nbuf; Page npage; BTItem last_bti; OffsetNumber first_off; OffsetNumber last_off; OffsetNumber off; Size pgspc; Size btisz; nbuf = state->btps_buf; npage = state->btps_page; first_off = state->btps_firstoff; last_off = state->btps_lastoff; last_bti = state->btps_lastbti; pgspc = PageGetFreeSpace(npage); btisz = BTITEMSZ(bti); btisz = DOUBLEALIGN(btisz); if (pgspc < btisz) { Buffer obuf = nbuf; Page opage = npage; OffsetNumber o, n; ItemId ii; ItemId hii; _bt_blnewpage(index, &nbuf, &npage, flags); /* * if 'last' is part of a chain of duplicates that does not * start at the beginning of the old page, the entire chain is * copied to the new page; we delete all of the duplicates * from the old page except the first, which becomes the high * key item of the old page. * * if the chain starts at the beginning of the page or there * is no chain ('first' == 'last'), we need only copy 'last' * to the new page. again, 'first' (== 'last') becomes the * high key of the old page. * * note that in either case, we copy at least one item to the * new page, so 'last_bti' will always be valid. 'bti' will * never be the first data item on the new page. */ if (first_off == P_FIRSTKEY) { Assert(last_off != P_FIRSTKEY); first_off = last_off; } for (o = first_off, n = P_FIRSTKEY; o <= last_off; o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { ii = PageGetItemId(opage, o); (void) PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, n, LP_USED); #ifdef FASTBUILD_DEBUG { bool isnull; BTItem tmpbti = (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); Datum d = index_getattr(&(tmpbti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_buildadd: moved <%x> to offset %d\n", d, n); } #endif /* FASTBUILD_DEBUG */ } for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { PageIndexTupleDelete(opage, o); } hii = PageGetItemId(opage, P_HIKEY); ii = PageGetItemId(opage, first_off); *hii = *ii; ii->lp_flags &= ~LP_USED; ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); first_off = P_FIRSTKEY; last_off = PageGetMaxOffsetNumber(npage); last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); /* * set the page (side link) pointers. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); oopaque->btpo_next = BufferGetBlockNumber(nbuf); nopaque->btpo_prev = BufferGetBlockNumber(obuf); nopaque->btpo_next = P_NONE; } /* * write out the old stuff. we never want to see it again, so * we can give up our lock (if we had one; BuildingBtree is * set, so we aren't locking). */ _bt_wrtbuf(index, obuf); } /* * if this item is different from the last item added, we start a * new chain of duplicates. */ off = OffsetNumberNext(last_off); (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_buildadd: inserted <%x> at offset %d\n", d, off); } #endif /* FASTBUILD_DEBUG */ if (last_bti == (BTItem) NULL) { first_off = P_FIRSTKEY; } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { first_off = off; } last_off = off; last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); state->btps_buf = nbuf; state->btps_page = npage; state->btps_lastbti = last_bti; state->btps_lastoff = last_off; state->btps_firstoff = first_off; } /* * take the input tapes stored by 'btspool' and perform successive * merging passes until at most one run is left in each tape. at that * point, merge the final tape runs into a set of btree leaves. * * XXX three nested loops? gross. cut me up into smaller routines. */ static BlockNumber _bt_merge(Relation index, BTSpool *btspool) { BTPageState state; BlockNumber firstblk; BTPriQueue q; BTPriQueueElem e; BTItem bti; BTTapeBlock *itape; BTTapeBlock *otape; char *tapepos[MAXTAPES]; int tapedone[MAXTAPES]; int t; int goodtapes; int nruns; Size btisz; bool doleaf = false; /* * initialize state needed for the merge into the btree leaf pages. */ (void) memset((char *) &state, 0, sizeof(BTPageState)); _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); state.btps_lastoff = P_HIKEY; state.btps_lastbti = (BTItem) NULL; firstblk = BufferGetBlockNumber(state.btps_buf); do { /* pass */ /* * each pass starts by flushing the previous outputs and * swapping inputs and outputs. this process also clears the * new output tapes and rewinds the new input tapes. */ btspool->bts_tape = btspool->bts_ntapes - 1; _bt_spoolflush(btspool); _bt_spoolswap(btspool); nruns = 0; for (;;) { /* run */ /* * each run starts by selecting a new output tape. the * merged results of a given run are always sent to this * one tape. */ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; otape = btspool->bts_otape[btspool->bts_tape]; /* * initialize the priority queue by loading it with the * first element of the given run in each tape. since we * are starting a new run, we reset the tape (clearing the * End-Of-Run marker) before reading it. this means that * _bt_taperead will return 0 only if the tape is actually * at EOF. */ (void) memset((char *) &q, 0, sizeof(BTPriQueue)); goodtapes = 0; for (t = 0; t < btspool->bts_ntapes; ++t) { itape = btspool->bts_itape[t]; tapepos[t] = itape->bttb_data; _bt_tapereset(itape); if (_bt_taperead(itape) == 0) { tapedone[t] = 1; } else { ++goodtapes; tapedone[t] = 0; e.btpqe_tape = t; e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); if (e.btpqe_item != (BTItem) NULL) { _bt_pqadd(&q, &e); } } } /* * if we don't have any tapes with any input (i.e., they * are all at EOF), we must be done with this pass. */ if (goodtapes == 0) { break; /* for */ } ++nruns; /* * output the smallest element from the queue until there are no * more. */ while (_bt_pqnext(&q, &e) >= 0) { /* item */ /* * replace the element taken from priority queue, * fetching a new block if needed. a tape can run out * if it hits either End-Of-Run or EOF. */ t = e.btpqe_tape; bti = e.btpqe_item; if (bti != (BTItem) NULL) { btisz = BTITEMSZ(bti); btisz = DOUBLEALIGN(btisz); if (doleaf) { _bt_buildadd(index, &state, bti, BTP_LEAF); #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_merge: inserted <%x> into block %d\n", d, BufferGetBlockNumber(state.btps_buf)); } #endif /* FASTBUILD_DEBUG */ } else { if (SPCLEFT(otape) < btisz) { /* * if it's full, write it out and add the * item to the next block. (since we know * there will be at least one more block, * we know we do *not* want to set * End-Of-Run here!) */ _bt_tapewrite(otape, 0); } _bt_tapeadd(otape, bti, btisz); #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_merge: inserted <%x> into tape %d\n", d, btspool->bts_tape); } #endif /* FASTBUILD_DEBUG */ } } #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_merge: got <%x> from tape %d\n", d, t); } #endif /* FASTBUILD_DEBUG */ itape = btspool->bts_itape[t]; if (!tapedone[t]) { BTItem newbti = _bt_tapenext(itape, &tapepos[t]); if (newbti == (BTItem) NULL) { if (_bt_taperead(itape) == 0) { tapedone[t] = 1; } else { tapepos[t] = itape->bttb_data; newbti = _bt_tapenext(itape, &tapepos[t]); } } if (newbti != (BTItem) NULL) { BTPriQueueElem nexte; nexte.btpqe_tape = t; nexte.btpqe_item = newbti; _bt_pqadd(&q, &nexte); } } } /* item */ } /* run */ /* * we are here because we ran out of input on all of the input * tapes. * * if this pass did not generate more actual output runs than * we have tapes, we know we have at most one run in each * tape. this means that we are ready to merge into the final * btree leaf pages instead of merging into a tape file. */ if (nruns <= btspool->bts_ntapes) { doleaf = true; } } while (nruns > 0); /* pass */ /* * this is the rightmost page, so the ItemId array needs to be * slid back one slot. */ _bt_slideleft(index, state.btps_buf, state.btps_page); _bt_wrtbuf(index, state.btps_buf); return(firstblk); } /* * given the block number 'blk' of the first page of a set of linked * siblings (i.e., the start of an entire level of the btree), * construct the corresponding next level of the btree. we do this by * placing minimum keys from each page into this page. the format of * the internal pages is otherwise the same as for leaf pages. */ void _bt_upperbuild(Relation index, BlockNumber blk, int level) { Buffer rbuf; Page rpage; BTPageOpaque ropaque; BTPageState state; BlockNumber firstblk; BTItem bti; BTItem nbti; OffsetNumber off; rbuf = _bt_getbuf(index, blk, BT_WRITE); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); /* * if we only have one page on a level, we can just make it the * root. */ if (P_RIGHTMOST(ropaque)) { ropaque->btpo_flags |= BTP_ROOT; _bt_wrtbuf(index, rbuf); _bt_metaproot(index, blk); return; } _bt_relbuf(index, rbuf, BT_WRITE); (void) memset((char *) &state, 0, sizeof(BTPageState)); _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0); state.btps_lastoff = P_HIKEY; state.btps_lastbti = (BTItem) NULL; firstblk = BufferGetBlockNumber(state.btps_buf); /* for each page... */ do { rbuf = _bt_getbuf(index, blk, BT_READ); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); /* for each item... */ if (!PageIsEmpty(rpage)) { /* * form a new index tuple corresponding to the minimum key * of the lower page and insert it into a page at this * level. */ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY; bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off)); nbti = _bt_formitem(&(bti->bti_itup)); ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY); #ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(nbti->bti_itup), 1, RelationGetTupleDescriptor(index), &isnull); printf("_bt_upperbuild: inserting <%x> at %d\n", d, level); } #endif /* FASTBUILD_DEBUG */ _bt_buildadd(index, &state, nbti, 0); pfree((void *) nbti); } blk = ropaque->btpo_next; _bt_relbuf(index, rbuf, BT_READ); } while (blk != P_NONE); /* * this is the rightmost page, so the ItemId array needs to be * slid back one slot. */ _bt_slideleft(index, state.btps_buf, state.btps_page); _bt_wrtbuf(index, state.btps_buf); _bt_upperbuild(index, firstblk, level + 1); } /* * given a spool loading by successive calls to _bt_spool, create an * entire btree. */ void _bt_leafbuild(Relation index, void *spool) { BTSpool *btspool = (BTSpool *) spool; BlockNumber firstblk; /* * merge the runs into btree leaf pages. */ firstblk = _bt_merge(index, btspool); /* * build the upper levels of the btree. */ _bt_upperbuild(index, firstblk, 0); } #else /* !FASTBUILD */ void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); } void _bt_spooldestroy(void *spool) { } void _bt_spool(Relation index, BTItem btitem, void *spool) { } void _bt_upperbuild(Relation index, BlockNumber blk, int level) { } void _bt_leafbuild(Relation index, void *spool) { } #endif /* !FASTBUILD */