/*-------------------------------------------------------------------------
 *
 * gistbuildbuffers.c
 *	  node buffer management functions for GiST buffering build algorithm.
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistbuildbuffers.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/gist_private.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/buffile.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"

static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
								GISTNodeBuffer *nodeBuffer);
static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
							   GISTNodeBuffer *nodeBuffer);
static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
								 GISTNodeBuffer *nodeBuffer);
static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
								IndexTuple itup);
static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
								IndexTuple *itup);
static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);

static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
static void WriteTempFileBlock(BufFile *file, long blknum, const void *ptr);


/*
 * Initialize GiST build buffers.
 *
 * pagesPerBuffer and levelStep are stored as given, and maxLevel is recorded
 * as the root level.  The returned structure, its arrays and its hash table
 * are all allocated in the memory context that is current at call time; that
 * context is also remembered in gfbb->context for later allocations.
 */
GISTBuildBuffers *
gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
{
	GISTBuildBuffers *gfbb;
	HASHCTL		hashCtl;

	gfbb = palloc(sizeof(GISTBuildBuffers));
	gfbb->pagesPerBuffer = pagesPerBuffer;
	gfbb->levelStep = levelStep;

	/*
	 * Create a temporary file to hold buffer pages that are swapped out of
	 * memory.
	 */
	gfbb->pfile = BufFileCreateTemp(false);
	gfbb->nFileBlocks = 0;

	/* Initialize free page management. */
	gfbb->nFreeBlocks = 0;
	gfbb->freeBlocksLen = 32;
	gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));

	/*
	 * Current memory context will be used for all in-memory data structures
	 * of buffers which are persistent during buffering build.
	 */
	gfbb->context = CurrentMemoryContext;

	/*
	 * nodeBuffersTab hash is the association between index blocks and their
	 * buffers.  Entries are keyed by block number.
	 */
	hashCtl.keysize = sizeof(BlockNumber);
	hashCtl.entrysize = sizeof(GISTNodeBuffer);
	hashCtl.hcxt = CurrentMemoryContext;
	gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
									   1024,
									   &hashCtl,
									   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	gfbb->bufferEmptyingQueue = NIL;

	/*
	 * Per-level node buffers lists for final buffers emptying process. Node
	 * buffers are inserted here when they are created.  The array starts
	 * with a single level and is enlarged on demand by gistGetNodeBuffer().
	 */
	gfbb->buffersOnLevelsLen = 1;
	gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
											 gfbb->buffersOnLevelsLen);
	gfbb->buffersOnLevels[0] = NIL;

	/*
	 * Node buffers whose last pages are currently loaded into main memory.
	 */
	gfbb->loadedBuffersLen = 32;
	gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
													 sizeof(GISTNodeBuffer *));
	gfbb->loadedBuffersCount = 0;

	gfbb->rootlevel = maxLevel;

	return gfbb;
}

/*
 * Returns a node buffer for given block. The buffer is created if it
 * doesn't exist yet.
 *
 * nodeBlocknum is the hash key identifying the buffer; level is recorded in
 * a newly created buffer and selects the per-level list it is added to.
 * For an already existing buffer, level is ignored.  giststate is not used
 * by this function.
 */
GISTNodeBuffer *
gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
				  BlockNumber nodeBlocknum, int level)
{
	GISTNodeBuffer *nodeBuffer;
	bool		found;

	/* Find node buffer in hash table, creating the entry if it is missing */
	nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
												&nodeBlocknum,
												HASH_ENTER,
												&found);
	if (!found)
	{
		/*
		 * Node buffer wasn't found. Initialize the new buffer as empty.
		 */
		MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);

		/* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
		nodeBuffer->blocksCount = 0;
		nodeBuffer->pageBlocknum = InvalidBlockNumber;
		nodeBuffer->pageBuffer = NULL;
		nodeBuffer->queuedForEmptying = false;
		nodeBuffer->isTemp = false;
		nodeBuffer->level = level;

		/*
		 * Add this buffer to the list of buffers on this level. Enlarge
		 * buffersOnLevels array if needed.
		 */
		if (level >= gfbb->buffersOnLevelsLen)
		{
			int			i;

			gfbb->buffersOnLevels =
				(List **) repalloc(gfbb->buffersOnLevels,
								   (level + 1) * sizeof(List *));

			/* initialize the enlarged portion */
			for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
				gfbb->buffersOnLevels[i] = NIL;
			gfbb->buffersOnLevelsLen = level + 1;
		}

		/*
		 * Prepend the new buffer to the list of buffers on this level. It's
		 * not arbitrary that the new buffer is put to the beginning of the
		 * list: in the final emptying phase we loop through all buffers at
		 * each level, and flush them. If a page is split during the emptying,
		 * it's more efficient to flush the newly split pages first, before
		 * moving on to pre-existing pages on the level. The buffers just
		 * created during the page split are likely still in cache, so
		 * flushing them immediately is more efficient than putting them to
		 * the end of the queue.
		 */
		gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
											 gfbb->buffersOnLevels[level]);

		MemoryContextSwitchTo(oldcxt);
	}

	return nodeBuffer;
}

/*
 * Allocate memory for a buffer page.
 *
 * The page is BLCKSZ bytes, zero-filled, and allocated in gfbb->context.
 * It is returned empty: no previous page is linked and all the space past
 * BUFFER_PAGE_DATA_OFFSET is marked free.
 */
static GISTNodeBufferPage *
gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
{
	GISTNodeBufferPage *pageBuffer;

	pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
															   BLCKSZ);
	pageBuffer->prev = InvalidBlockNumber;

	/* Set page free space */
	PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
	return pageBuffer;
}

/*
 * Add specified buffer into loadedBuffers array.
*/ static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) { /* Never add a temporary buffer to the array */ if (nodeBuffer->isTemp) return; /* Enlarge the array if needed */ if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) { gfbb->loadedBuffersLen *= 2; gfbb->loadedBuffers = (GISTNodeBuffer **) repalloc(gfbb->loadedBuffers, gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); } gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; gfbb->loadedBuffersCount++; } /* * Load last page of node buffer into main memory. */ static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) { /* Check if we really should load something */ if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) { /* Allocate memory for page */ nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); /* Read block from temporary file */ ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, nodeBuffer->pageBuffer); /* Mark file block as free */ gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); /* Mark node buffer as loaded */ gistAddLoadedBuffer(gfbb, nodeBuffer); nodeBuffer->pageBlocknum = InvalidBlockNumber; } } /* * Write last page of node buffer to the disk. */ static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) { /* Check if we have something to write */ if (nodeBuffer->pageBuffer) { BlockNumber blkno; /* Get free file block */ blkno = gistBuffersGetFreeBlock(gfbb); /* Write block to the temporary file */ WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); /* Free memory of that page */ pfree(nodeBuffer->pageBuffer); nodeBuffer->pageBuffer = NULL; /* Save block number */ nodeBuffer->pageBlocknum = blkno; } } /* * Write last pages of all node buffers to the disk. */ void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) { int i; /* Unload all the buffers that have a page loaded in memory. 
*/ for (i = 0; i < gfbb->loadedBuffersCount; i++) gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); /* Now there are no node buffers with loaded last page */ gfbb->loadedBuffersCount = 0; } /* * Add index tuple to buffer page. */ static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) { Size itupsz = IndexTupleSize(itup); char *ptr; /* There should be enough of space. */ Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); /* Reduce free space value of page to reserve a spot for the tuple. */ PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); /* Get pointer to the spot we reserved (ie. end of free space). */ ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET + PAGE_FREE_SPACE(pageBuffer); /* Copy the index tuple there. */ memcpy(ptr, itup, itupsz); } /* * Get last item from buffer page and remove it from page. */ static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) { IndexTuple ptr; Size itupsz; Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ /* Get pointer to last index tuple */ ptr = (IndexTuple) ((char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET + PAGE_FREE_SPACE(pageBuffer)); itupsz = IndexTupleSize(ptr); /* Make a copy of the tuple */ *itup = (IndexTuple) palloc(itupsz); memcpy(*itup, ptr, itupsz); /* Mark the space used by the tuple as free */ PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); } /* * Push an index tuple to node buffer. */ void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple itup) { /* * Most part of memory operations will be in buffering build persistent * context. So, let's switch to it. */ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); /* * If the buffer is currently empty, create the first page. 
*/ if (nodeBuffer->blocksCount == 0) { nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); nodeBuffer->blocksCount = 1; gistAddLoadedBuffer(gfbb, nodeBuffer); } /* Load last page of node buffer if it wasn't in memory already */ if (!nodeBuffer->pageBuffer) gistLoadNodeBuffer(gfbb, nodeBuffer); /* * Check if there is enough space on the last page for the tuple. */ if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) { /* * Nope. Swap previous block to disk and allocate a new one. */ BlockNumber blkno; /* Write filled page to the disk */ blkno = gistBuffersGetFreeBlock(gfbb); WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); /* * Reset the in-memory page as empty, and link the previous block to * the new page by storing its block number in the prev-link. */ PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); nodeBuffer->pageBuffer->prev = blkno; /* We've just added one more page */ nodeBuffer->blocksCount++; } gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); /* * If the buffer just overflowed, add it to the emptying queue. */ if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) { gfbb->bufferEmptyingQueue = lcons(nodeBuffer, gfbb->bufferEmptyingQueue); nodeBuffer->queuedForEmptying = true; } /* Restore memory context */ MemoryContextSwitchTo(oldcxt); } /* * Removes one index tuple from node buffer. Returns true if success and false * if node buffer is empty. */ bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple *itup) { /* * If node buffer is empty then return false. */ if (nodeBuffer->blocksCount <= 0) return false; /* Load last page of node buffer if needed */ if (!nodeBuffer->pageBuffer) gistLoadNodeBuffer(gfbb, nodeBuffer); /* * Get index tuple from last non-empty page. */ gistGetItupFromPage(nodeBuffer->pageBuffer, itup); /* * If we just removed the last tuple from the page, fetch previous page on * this node buffer (if any). 
*/ if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) { BlockNumber prevblkno; /* * blocksCount includes the page in pageBuffer, so decrease it now. */ nodeBuffer->blocksCount--; /* * If there's more pages, fetch previous one. */ prevblkno = nodeBuffer->pageBuffer->prev; if (prevblkno != InvalidBlockNumber) { /* There is a previous page. Fetch it. */ Assert(nodeBuffer->blocksCount > 0); ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); /* * Now that we've read the block in memory, we can release its * on-disk block for reuse. */ gistBuffersReleaseBlock(gfbb, prevblkno); } else { /* No more pages. Free memory. */ Assert(nodeBuffer->blocksCount == 0); pfree(nodeBuffer->pageBuffer); nodeBuffer->pageBuffer = NULL; } } return true; } /* * Select a currently unused block for writing to. */ static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) { /* * If there are multiple free blocks, we select the one appearing last in * freeBlocks[]. If there are none, assign the next block at the end of * the file (causing the file to be extended). */ if (gfbb->nFreeBlocks > 0) return gfbb->freeBlocks[--gfbb->nFreeBlocks]; else return gfbb->nFileBlocks++; } /* * Return a block# to the freelist. */ static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) { int ndx; /* Enlarge freeBlocks array if full. */ if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) { gfbb->freeBlocksLen *= 2; gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, gfbb->freeBlocksLen * sizeof(long)); } /* Add blocknum to array */ ndx = gfbb->nFreeBlocks++; gfbb->freeBlocks[ndx] = blocknum; } /* * Free buffering build data structure. */ void gistFreeBuildBuffers(GISTBuildBuffers *gfbb) { /* Close buffers file. */ BufFileClose(gfbb->pfile); /* All other things will be freed on memory context release */ } /* * Data structure representing information about node buffer for index tuples * relocation from splitted node buffer. 
*/
typedef struct
{
	/* Decompressed downlink key of the page, one entry per column */
	GISTENTRY	entry[INDEX_MAX_KEYS];
	/* Null flags matching entry[] */
	bool		isnull[INDEX_MAX_KEYS];
	/* Split information of the page, including its downlink tuple */
	GISTPageSplitInfo *splitinfo;
	/* Node buffer that receives the tuples relocated to the page */
	GISTNodeBuffer *nodeBuffer;
} RelocationBufferInfo;

/*
 * At page split, distribute tuples from the buffer of the split page to
 * new buffers for the created page halves. This also adjusts the downlinks
 * in 'splitinfo' to include the tuples in the buffers.
 *
 * 'buffer' is the page that was split and 'level' its level; 'splitinfo' is
 * a list of GISTPageSplitInfo, one per page produced by the split.  Nothing
 * is done if the level has no buffers or the split page has no node buffer.
 */
void
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
								Relation r, int level,
								Buffer buffer, List *splitinfo)
{
	RelocationBufferInfo *relocationBuffersInfos;
	bool		found;
	GISTNodeBuffer *nodeBuffer;
	BlockNumber blocknum;
	IndexTuple	itup;
	int			splitPagesCount = 0;
	GISTENTRY	entry[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	GISTNodeBuffer oldBuf;
	ListCell   *lc;

	/* If the split page doesn't have buffers, we have nothing to do. */
	if (!LEVEL_HAS_BUFFERS(level, gfbb))
		return;

	/*
	 * Get the node buffer of the split page.
	 */
	blocknum = BufferGetBlockNumber(buffer);
	nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
							 HASH_FIND, &found);
	if (!found)
	{
		/* The page has no buffer, so we have nothing to do. */
		return;
	}

	/*
	 * Make a copy of the old buffer, as we're going to reuse it as the
	 * buffer for the new left page, which is on the same block as the old
	 * page. That's not true for the root page, but that's fine because we
	 * never have a buffer on the root page anyway. The original algorithm as
	 * described by Arge et al did, but it's of no use, as you might as well
	 * read the tuples straight from the heap instead of the root buffer.
	 *
	 * The copy is flagged as temporary; gistAddLoadedBuffer() skips
	 * temporary buffers, so this stack copy never ends up in loadedBuffers.
	 */
	Assert(blocknum != GIST_ROOT_BLKNO);
	memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
	oldBuf.isTemp = true;

	/* Reset the old buffer, used for the new left page from now on */
	nodeBuffer->blocksCount = 0;
	nodeBuffer->pageBuffer = NULL;
	nodeBuffer->pageBlocknum = InvalidBlockNumber;

	/*
	 * Allocate memory for information about relocation buffers, one element
	 * per page produced by the split.
	 */
	splitPagesCount = list_length(splitinfo);
	relocationBuffersInfos =
		(RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
										splitPagesCount);

	/*
	 * Fill relocation buffers information for node buffers of pages produced
	 * by split.
	 */
	foreach(lc, splitinfo)
	{
		GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
		GISTNodeBuffer *newNodeBuffer;
		int			i = foreach_current_index(lc);

		/* Decompress parent index tuple of node buffer page. */
		gistDeCompressAtt(giststate, r,
						  si->downlink, NULL, (OffsetNumber) 0,
						  relocationBuffersInfos[i].entry,
						  relocationBuffersInfos[i].isnull);

		/*
		 * Create a node buffer for the page. The leftmost half is on the same
		 * block as the old page before split, so for the leftmost half this
		 * will return the original buffer. The tuples on the original buffer
		 * were relinked to the temporary buffer, so the original one is now
		 * empty.
		 */
		newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);

		relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
		relocationBuffersInfos[i].splitinfo = si;
	}

	/*
	 * Loop through all index tuples in the buffer of the page being split,
	 * moving them to buffers for the new pages.  We try to move each tuple to
	 * the page that will result in the lowest penalty for the leading column
	 * or, in the case of a tie, the lowest penalty for the earliest column
	 * that is not tied.
	 *
	 * The page searching logic is very similar to gistchoose().
	 */
	while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
	{
		float		best_penalty[INDEX_MAX_KEYS];
		int			i,
					which;
		IndexTuple	newtup;
		RelocationBufferInfo *targetBufferInfo;

		/* Decompress the tuple's keys so they can be fed to gistpenalty() */
		gistDeCompressAtt(giststate, r,
						  itup, NULL, (OffsetNumber) 0, entry, isnull);

		/* default to using first page (shouldn't matter) */
		which = 0;

		/*
		 * best_penalty[j] is the best penalty we have seen so far for column
		 * j, or -1 when we haven't yet examined column j.  Array entries to
		 * the right of the first -1 are undefined.
		 */
		best_penalty[0] = -1;

		/*
		 * Loop over possible target pages, looking for one to move this tuple
		 * to.
		 */
		for (i = 0; i < splitPagesCount; i++)
		{
			RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
			bool		zero_penalty;
			int			j;

			/* Stays true only if every column of this page costs nothing */
			zero_penalty = true;

			/* Loop over index attributes. */
			for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
			{
				float		usize;

				/* Compute penalty for this column. */
				usize = gistpenalty(giststate, j,
									&splitPageInfo->entry[j],
									splitPageInfo->isnull[j],
									&entry[j], isnull[j]);
				if (usize > 0)
					zero_penalty = false;

				if (best_penalty[j] < 0 || usize < best_penalty[j])
				{
					/*
					 * New best penalty for column.  Tentatively select this
					 * page as the target, and record the best penalty.  Then
					 * reset the next column's penalty to "unknown" (and
					 * indirectly, the same for all the ones to its right).
					 * This will force us to adopt this page's penalty values
					 * as the best for all the remaining columns during
					 * subsequent loop iterations.
					 */
					which = i;
					best_penalty[j] = usize;

					if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
						best_penalty[j + 1] = -1;
				}
				else if (best_penalty[j] == usize)
				{
					/*
					 * The current page is exactly as good for this column as
					 * the best page seen so far.  The next iteration of this
					 * loop will compare the next column.
					 */
				}
				else
				{
					/*
					 * The current page is worse for this column than the best
					 * page seen so far.  Skip the remaining columns and move
					 * on to the next page, if any.
					 */
					zero_penalty = false;	/* so outer loop won't exit */
					break;
				}
			}

			/*
			 * If we find a page with zero penalty for all columns, there's no
			 * need to examine remaining pages; just break out of the loop and
			 * return it.
			 */
			if (zero_penalty)
				break;
		}

		/* OK, "which" is the page index to push the tuple to */
		targetBufferInfo = &relocationBuffersInfos[which];

		/* Push item to selected node buffer */
		gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);

		/*
		 * Adjust the downlink for this page, if needed.  A non-NULL result
		 * replaces the downlink, and the cached decompressed key is
		 * refreshed so later penalty computations see the new downlink.
		 */
		newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
								 itup, giststate);
		if (newtup)
		{
			gistDeCompressAtt(giststate, r,
							  newtup, NULL, (OffsetNumber) 0,
							  targetBufferInfo->entry,
							  targetBufferInfo->isnull);

			targetBufferInfo->splitinfo->downlink = newtup;
		}
	}

	pfree(relocationBuffersInfos);
}


/*
 * Wrappers around BufFile operations. Both wrappers position the file at
 * block 'blknum' first and raise elog(ERROR) if that seek fails, so that
 * the callers don't need to check a return code.
 *
 * NOTE(review): no result of BufFileReadExact()/BufFileWrite() is examined
 * here; presumably those routines report failures themselves -- confirm
 * against buffile.c.
 */

/* Read one BLCKSZ-sized block from the temporary file into 'ptr'. */
static void
ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
{
	if (BufFileSeekBlock(file, blknum) != 0)
		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
	BufFileReadExact(file, ptr, BLCKSZ);
}

/* Write one BLCKSZ-sized block from 'ptr' into the temporary file. */
static void
WriteTempFileBlock(BufFile *file, long blknum, const void *ptr)
{
	if (BufFileSeekBlock(file, blknum) != 0)
		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
	BufFileWrite(file, ptr, BLCKSZ);
}