diff --git a/doc/src/sgml/gist.sgml b/doc/src/sgml/gist.sgml index f9226e7a35..192338be88 100644 --- a/doc/src/sgml/gist.sgml +++ b/doc/src/sgml/gist.sgml @@ -259,6 +259,8 @@ CREATE INDEX ON my_table USING GIST (my_inet_column inet_ops); compress method is omitted. The optional tenth method options is needed if the operator class provides user-specified parameters. + The sortsupport method is also optional and is used to + speed up building a GiST index. @@ -1065,6 +1067,74 @@ my_compress(PG_FUNCTION_ARGS) + + + sortsupport + + + Returns a comparator function to sort data in a way that preserves + locality. It is used by CREATE INDEX and + REINDEX commands. The quality of the created index + depends on how well the sort order determined by the comparator function + preserves locality of the inputs. + + + The sortsupport method is optional. If it is not + provided, CREATE INDEX builds the index by inserting + each tuple into the tree using the penalty and + picksplit functions, which is much slower. + + + + The SQL declaration of the function must look like + this: + + +CREATE OR REPLACE FUNCTION my_sortsupport(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + + + The argument is a pointer to a SortSupport + struct. At a minimum, the function must fill in its comparator field. + The comparator takes three arguments: two Datums to compare, and + a pointer to the SortSupport struct. The + Datums are the two indexed values in the format in which they are stored + in the index; that is, in the format returned by the + compress method. The full API is defined in + src/include/utils/sortsupport.h. + + + + The matching code in the C module could then follow this skeleton: + + +PG_FUNCTION_INFO_V1(my_sortsupport); + +static int +my_fastcmp(Datum x, Datum y, SortSupport ssup) +{ + /* establish order between x and y by computing some sorting value z */ + + int z1 = ComputeSpatialCode(x); + int z2 = ComputeSpatialCode(y); + + return z1 == z2 ? 0 : z1 > z2 ? 1 : -1; +} + +Datum +my_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = my_fastcmp; + PG_RETURN_VOID(); +} + + + + diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 671b5e9186..230625cf1e 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -3,6 +3,24 @@ * gistbuild.c * build algorithm for GiST indexes implementation. * + * There are two different strategies: + * + * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted + * order, and create downlinks and internal pages as we go. This builds + * the index from the bottom up, similar to how B-tree index build + * works. + * + * 2. Start with an empty index, and insert all tuples one by one. + * + * The sorted method is used if the operator classes for all columns have + * a 'sortsupport' function defined. Otherwise, we resort to the second + * strategy. + * + * The second strategy can optionally use buffers at different levels of + * the tree to reduce I/O; see "Buffering build algorithm" in the README + * for a more detailed explanation. It initially calls insert over and + * over, but switches to the buffered algorithm after a certain number of + * tuples (unless buffering mode is disabled).
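+ * + * (In code terms: the sorted strategy is chosen exactly when + * index_getprocid(index, col, GIST_SORTSUPPORT_PROC) returns a valid OID + * for every key column; see the strategy selection in gistbuild() below.)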
+ * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -28,6 +46,7 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/tuplesort.h" /* Step of index tuples for check whether to switch to buffering build mode */ #define BUFFERING_MODE_SWITCH_CHECK_STEP 256 @@ -40,8 +59,14 @@ */ #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096 +/* + * Strategy used to build the index. It can change between the + * GIST_BUFFERING_* modes on the fly, but the sorted method must be chosen + * up-front and cannot be changed afterwards. + */ typedef enum { + GIST_SORTED_BUILD, /* bottom-up build by sorting */ GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to * switch */ GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to @@ -51,7 +76,7 @@ typedef enum * before switching to the buffering build * mode */ GIST_BUFFERING_ACTIVE /* in buffering build mode */ -} GistBufferingMode; +} GistBuildMode; /* Working state for gistbuild and its callback */ typedef struct { @@ -60,23 +85,58 @@ typedef struct Relation heaprel; GISTSTATE *giststate; - int64 indtuples; /* number of tuples indexed */ - int64 indtuplesSize; /* total size of all indexed tuples */ - Size freespace; /* amount of free space to leave on pages */ + Size freespace; /* amount of free space to leave on pages */ + GistBuildMode buildMode; + + int64 indtuples; /* number of tuples indexed */ + /* * Extra data structures used during a buffering build. 'gfbb' contains * information related to managing the build buffers. 'parentMap' is a * lookup table of the parent of each internal page. */ + int64 indtuplesSize; /* total size of all indexed tuples */ GISTBuildBuffers *gfbb; HTAB *parentMap; - GistBufferingMode bufferingMode; + /* + * Extra data structures used during a sorting build. + */ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + + BlockNumber pages_allocated; + BlockNumber pages_written; + + int ready_num_pages; + BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; + Page ready_pages[XLR_MAX_BLOCK_ID]; } GISTBuildState; +/* + * In sorted build, we use a stack of these structs, one for each level, + * to hold an in-memory buffer of the rightmost page at the level. When the + * page fills up, it is written out and a new page is allocated.
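(Illustration only, not part of the patch: the toy program below models this stack of per-level buffers using integer keys and a fixed fanout; page unions, WAL, and the final flush of partial pages are omitted. Each time a "page" fills, it is emitted and a downlink, here simply the page's first key, is pushed to the parent level, which is created on demand.)

#include <stdio.h>
#include <stdlib.h>

#define FANOUT 4

typedef struct Level
{
    int     keys[FANOUT];
    int     nkeys;
    int     depth;
    struct Level *parent;       /* upper level, if any */
} Level;

static void add_key(Level *lv, int key);

static void
flush_page(Level *lv)
{
    printf("level %d page:", lv->depth);
    for (int i = 0; i < lv->nkeys; i++)
        printf(" %d", lv->keys[i]);
    printf("\n");

    /* push a downlink into the parent level, creating it on demand */
    if (lv->parent == NULL)
    {
        lv->parent = calloc(1, sizeof(Level));
        lv->parent->depth = lv->depth + 1;
    }
    add_key(lv->parent, lv->keys[0]);
    lv->nkeys = 0;
}

static void
add_key(Level *lv, int key)
{
    if (lv->nkeys == FANOUT)
        flush_page(lv);         /* may cascade to upper levels */
    lv->keys[lv->nkeys++] = key;
}

int
main(void)
{
    Level   leaf = {0};

    for (int k = 1; k <= 20; k++)
        add_key(&leaf, k);      /* keys arrive in sorted order */
    /* a real build would now flush the partial pages and write the root */
    return 0;
}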
+ */ +typedef struct GistSortedBuildPageState +{ + Page page; + struct GistSortedBuildPageState *parent; /* Upper level, if any */ +} GistSortedBuildPageState; + /* prototypes for private functions */ + +static void gistSortedBuildCallback(Relation index, ItemPointer tid, + Datum *values, bool *isnull, + bool tupleIsAlive, void *state); +static void gist_indexsortbuild(GISTBuildState *state); +static void gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup); +static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate); +static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); + static void gistInitBuffering(GISTBuildState *buildstate); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); static void gistBuildCallback(Relation index, @@ -107,10 +167,9 @@ static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent); static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); + /* - * Main entry point to GiST index build. Initially calls insert over and over, - * but switches to more efficient buffering build algorithm after a certain - * number of tuples (unless buffering mode is disabled). + * Main entry point to GiST index build. */ IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) @@ -118,39 +177,12 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) IndexBuildResult *result; double reltuples; GISTBuildState buildstate; - Buffer buffer; - Page page; MemoryContext oldcxt = CurrentMemoryContext; int fillfactor; - - buildstate.indexrel = index; - buildstate.heaprel = heap; - - if (index->rd_options) - { - /* Get buffering mode from the options string */ - GiSTOptions *options = (GiSTOptions *) index->rd_options; - - if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) - buildstate.bufferingMode = GIST_BUFFERING_STATS; - else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) - buildstate.bufferingMode = GIST_BUFFERING_DISABLED; - else - buildstate.bufferingMode = GIST_BUFFERING_AUTO; - - fillfactor = options->fillfactor; - } - else - { - /* - * By default, switch to buffering mode when the index grows too large - * to fit in cache. - */ - buildstate.bufferingMode = GIST_BUFFERING_AUTO; - fillfactor = GIST_DEFAULT_FILLFACTOR; - } - /* Calculate target amount of free space to leave on pages */ - buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; + Oid SortSupportFnOids[INDEX_MAX_KEYS]; + bool hasallsortsupports; + int keyscount = IndexRelationGetNumberOfKeyAttributes(index); + GiSTOptions *options = NULL; /* * We expect to be called exactly once for any index relation. 
If that's @@ -160,7 +192,12 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); - /* no locking is needed */ + if (index->rd_options) + options = (GiSTOptions *) index->rd_options; + + buildstate.indexrel = index; + buildstate.heaprel = heap; + buildstate.sortstate = NULL; buildstate.giststate = initGISTstate(index); /* @@ -170,42 +207,129 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ buildstate.giststate->tempCxt = createTempGistContext(); - /* initialize the root page */ - buffer = gistNewBuffer(index); - Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); - page = BufferGetPage(buffer); + /* + * Choose build strategy. If all keys support sorting, do that. Otherwise + * the default strategy is to switch to buffering mode when the index + * grows too large to fit in cache. + */ + hasallsortsupports = true; + for (int i = 0; i < keyscount; i++) + { + SortSupportFnOids[i] = index_getprocid(index, i + 1, + GIST_SORTSUPPORT_PROC); + if (!OidIsValid(SortSupportFnOids[i])) + { + hasallsortsupports = false; + break; + } + } - START_CRIT_SECTION(); + if (hasallsortsupports) + { + buildstate.buildMode = GIST_SORTED_BUILD; + } + else if (options) + { + if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) + buildstate.buildMode = GIST_BUFFERING_STATS; + else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) + buildstate.buildMode = GIST_BUFFERING_DISABLED; + else + buildstate.buildMode = GIST_BUFFERING_AUTO; + } + else + { + buildstate.buildMode = GIST_BUFFERING_AUTO; + } - GISTInitBuffer(buffer, F_LEAF); + /* + * Calculate target amount of free space to leave on pages. + */ + fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; + buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; - MarkBufferDirty(buffer); - PageSetLSN(page, GistBuildLSN); - - UnlockReleaseBuffer(buffer); - - END_CRIT_SECTION(); - - /* build the index */ + /* + * Build the index using the chosen strategy. + */ buildstate.indtuples = 0; buildstate.indtuplesSize = 0; - /* - * Do the heap scan. - */ - reltuples = table_index_build_scan(heap, index, indexInfo, true, true, - gistBuildCallback, - (void *) &buildstate, NULL); - - /* - * If buffering was used, flush out all the tuples that are still in the - * buffers. - */ - if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE) + if (buildstate.buildMode == GIST_SORTED_BUILD) { - elog(DEBUG1, "all tuples processed, emptying buffers"); - gistEmptyAllBuffers(&buildstate); - gistFreeBuildBuffers(buildstate.gfbb); + /* + * Sort all data, build the index from bottom up. + */ + buildstate.sortstate = tuplesort_begin_index_gist(heap, + index, + maintenance_work_mem, + NULL, + false); + + /* Scan the table, adding all tuples to the tuplesort */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistSortedBuildCallback, + (void *) &buildstate, NULL); + + /* + * Perform the sort and build index pages. + */ + tuplesort_performsort(buildstate.sortstate); + + gist_indexsortbuild(&buildstate); + + tuplesort_end(buildstate.sortstate); + } + else + { + /* + * Initialize an empty index and insert all tuples, possibly using + * buffers on intermediate levels.
+ */ + Buffer buffer; + Page page; + + /* initialize the root page */ + buffer = gistNewBuffer(index); + Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); + page = BufferGetPage(buffer); + + START_CRIT_SECTION(); + + GISTInitBuffer(buffer, F_LEAF); + + MarkBufferDirty(buffer); + PageSetLSN(page, GistBuildLSN); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* Scan the table, inserting all the tuples into the index. */ + reltuples = table_index_build_scan(heap, index, indexInfo, true, true, + gistBuildCallback, + (void *) &buildstate, NULL); + + /* + * If buffering was used, flush out all the tuples that are still in + * the buffers. + */ + if (buildstate.buildMode == GIST_BUFFERING_ACTIVE) + { + elog(DEBUG1, "all tuples processed, emptying buffers"); + gistEmptyAllBuffers(&buildstate); + gistFreeBuildBuffers(buildstate.gfbb); + } + + /* + * We didn't write WAL records as we built the index, so if + * WAL-logging is required, write all pages to the WAL now. + */ + if (RelationNeedsWAL(index)) + { + log_newpage_range(index, MAIN_FORKNUM, + 0, RelationGetNumberOfBlocks(index), + true); + } } /* okay, all heap tuples are indexed */ @@ -214,17 +338,6 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) freeGISTstate(buildstate.giststate); - /* - * We didn't write WAL records as we built the index, so if WAL-logging is - * required, write all pages to the WAL now. - */ - if (RelationNeedsWAL(index)) - { - log_newpage_range(index, MAIN_FORKNUM, - 0, RelationGetNumberOfBlocks(index), - true); - } - /* * Return statistics */ @@ -236,6 +349,235 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) return result; } +/*------------------------------------------------------------------------- + * Routines for sorted build + *------------------------------------------------------------------------- + */ + +/* + * Per-tuple callback for table_index_build_scan. + */ +static void +gistSortedBuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + MemoryContext oldCtx; + Datum compressed_values[INDEX_MAX_KEYS]; + + oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); + + /* Compress the key values and pass the tuple to the tuplesort */ + gistCompressValues(buildstate->giststate, index, + values, isnull, + true, compressed_values); + + tuplesort_putindextuplevalues(buildstate->sortstate, + buildstate->indexrel, + tid, + compressed_values, isnull); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->giststate->tempCxt); + + /* Update tuple count. */ + buildstate->indtuples += 1; +} + +/* + * Build a GiST index bottom-up from pre-sorted tuples. + */ +static void +gist_indexsortbuild(GISTBuildState *state) +{ + IndexTuple itup; + GistSortedBuildPageState *leafstate; + GistSortedBuildPageState *pagestate; + Page page; + + state->pages_allocated = 0; + state->pages_written = 0; + state->ready_num_pages = 0; + + /* + * Write an empty page as a placeholder for the root page. It will be + * replaced with the real root page at the end. + */ + page = palloc0(BLCKSZ); + smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + page, true); + state->pages_allocated++; + state->pages_written++; + + /* Allocate a temporary buffer for the first leaf page.
*/ + leafstate = palloc(sizeof(GistSortedBuildPageState)); + leafstate->page = page; + leafstate->parent = NULL; + gistinitpage(page, F_LEAF); + + /* + * Fill index pages with tuples in the sorted order. + */ + while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL) + { + gist_indexsortbuild_pagestate_add(state, leafstate, itup); + MemoryContextReset(state->giststate->tempCxt); + } + + /* + * Write out the partially full non-root pages. + * + * Keep in mind that a flush can build a new root. + */ + pagestate = leafstate; + while (pagestate->parent != NULL) + { + GistSortedBuildPageState *parent; + + gist_indexsortbuild_pagestate_flush(state, pagestate); + parent = pagestate->parent; + pfree(pagestate->page); + pfree(pagestate); + pagestate = parent; + } + + gist_indexsortbuild_flush_ready_pages(state); + + /* Write out the root */ + PageSetLSN(pagestate->page, GistBuildLSN); + smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + if (RelationNeedsWAL(state->indexrel)) + log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + pagestate->page, true); + + pfree(pagestate->page); + pfree(pagestate); +} + +/* + * Add a tuple to a page. If the page is full, write it out and initialize + * a new page first. + */ +static void +gist_indexsortbuild_pagestate_add(GISTBuildState *state, + GistSortedBuildPageState *pagestate, + IndexTuple itup) +{ + Size sizeNeeded; + + /* Does the tuple fit? If not, flush */ + sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace; + if (PageGetFreeSpace(pagestate->page) < sizeNeeded) + gist_indexsortbuild_pagestate_flush(state, pagestate); + + gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber); +} + +static void +gist_indexsortbuild_pagestate_flush(GISTBuildState *state, + GistSortedBuildPageState *pagestate) +{ + GistSortedBuildPageState *parent; + IndexTuple *itvec; + IndexTuple union_tuple; + int vect_len; + bool isleaf; + BlockNumber blkno; + MemoryContext oldCtx; + + /* check once per page */ + CHECK_FOR_INTERRUPTS(); + + if (state->ready_num_pages == XLR_MAX_BLOCK_ID) + gist_indexsortbuild_flush_ready_pages(state); + + /* + * The page is now complete. Assign a block number to it, and add it to + * the list of finished pages. (We don't write it out immediately, because + * we want to WAL-log the pages in batches.) + */ + blkno = state->pages_allocated++; + state->ready_blknos[state->ready_num_pages] = blkno; + state->ready_pages[state->ready_num_pages] = pagestate->page; + state->ready_num_pages++; + + isleaf = GistPageIsLeaf(pagestate->page); + + /* + * Form a downlink tuple to represent all the tuples on the page. + */ + oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); + itvec = gistextractpage(pagestate->page, &vect_len); + union_tuple = gistunion(state->indexrel, itvec, vect_len, + state->giststate); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); + MemoryContextSwitchTo(oldCtx); + + /* + * Insert the downlink into the parent page. If this was the root, create + * a new page as the parent, which becomes the new root. + */ + parent = pagestate->parent; + if (parent == NULL) + { + parent = palloc(sizeof(GistSortedBuildPageState)); + parent->page = (Page) palloc(BLCKSZ); + parent->parent = NULL; + gistinitpage(parent->page, 0); + + pagestate->parent = parent; + } + gist_indexsortbuild_pagestate_add(state, parent, union_tuple); + + /* Re-initialize the page buffer for the next page on this level.
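+ * Note that the downlink insertion above may itself have flushed the parent + * page, so flushes can cascade up the stack.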
*/ + pagestate->page = palloc(BLCKSZ); + gistinitpage(pagestate->page, isleaf ? F_LEAF : 0); +} + +static void +gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) +{ + if (state->ready_num_pages == 0) + return; + + for (int i = 0; i < state->ready_num_pages; i++) + { + Page page = state->ready_pages[i]; + + /* Currently, the blocks must be buffered in the order in which they + * will be written. */ + if (state->ready_blknos[i] != state->pages_written) + elog(ERROR, "unexpected block number in GiST sorted build"); + + PageSetLSN(page, GistBuildLSN); + + smgrextend(state->indexrel->rd_smgr, + MAIN_FORKNUM, + state->pages_written++, + page, + true); + } + + if (RelationNeedsWAL(state->indexrel)) + log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages, + state->ready_blknos, state->ready_pages, true); + + for (int i = 0; i < state->ready_num_pages; i++) + pfree(state->ready_pages[i]); + + state->ready_num_pages = 0; +} + + +/*------------------------------------------------------------------------- + * Routines for non-sorted build + *------------------------------------------------------------------------- + */ + /* * Attempt to switch to buffering mode. * @@ -375,7 +717,7 @@ gistInitBuffering(GISTBuildState *buildstate) if (levelStep <= 0) { elog(DEBUG1, "failed to switch to buffered GiST build"); - buildstate->bufferingMode = GIST_BUFFERING_DISABLED; + buildstate->buildMode = GIST_BUFFERING_DISABLED; return; } @@ -392,7 +734,7 @@ gistInitBuffering(GISTBuildState *buildstate) gistInitParentMap(buildstate); - buildstate->bufferingMode = GIST_BUFFERING_ACTIVE; + buildstate->buildMode = GIST_BUFFERING_ACTIVE; elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", levelStep, pagesPerBuffer); @@ -453,10 +795,12 @@ gistBuildCallback(Relation index, oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); /* form an index tuple and point it at the heap tuple */ - itup = gistFormTuple(buildstate->giststate, index, values, isnull, true); + itup = gistFormTuple(buildstate->giststate, index, + values, isnull, + true); itup->t_tid = *tid; - if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE) + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) { /* We have buffers, so use them.
*/ gistBufferingBuildInsert(buildstate, itup); @@ -478,7 +822,7 @@ gistBuildCallback(Relation index, MemoryContextSwitchTo(oldCtx); MemoryContextReset(buildstate->giststate->tempCxt); - if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE && + if (buildstate->buildMode == GIST_BUFFERING_ACTIVE && buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0) { /* Adjust the target buffer size now */ @@ -493,10 +837,10 @@ gistBuildCallback(Relation index, * To avoid excessive calls to smgrnblocks(), only check this every * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples */ - if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO && + if ((buildstate->buildMode == GIST_BUFFERING_AUTO && buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) || - (buildstate->bufferingMode == GIST_BUFFERING_STATS && + (buildstate->buildMode == GIST_BUFFERING_STATS && buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) { /* diff --git a/src/backend/access/gist/gistproc.c b/src/backend/access/gist/gistproc.c index 9ace64c3c4..27d9c0f77c 100644 --- a/src/backend/access/gist/gistproc.c +++ b/src/backend/access/gist/gistproc.c @@ -24,6 +24,7 @@ #include "utils/builtins.h" #include "utils/float.h" #include "utils/geo_decls.h" +#include "utils/sortsupport.h" static bool gist_box_leaf_consistent(BOX *key, BOX *query, @@ -31,6 +32,15 @@ static bool gist_box_leaf_consistent(BOX *key, BOX *query, static bool rtree_internal_consistent(BOX *key, BOX *query, StrategyNumber strategy); +static uint64 point_zorder_internal(float4 x, float4 y); +static uint64 part_bits32_by2(uint32 x); +static uint32 ieee_float32_to_uint32(float f); +static int gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup); +static Datum gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup); +static int gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup); +static bool gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup); + + /* Minimum accepted ratio of split */ #define LIMIT_RATIO 0.3 @@ -1540,3 +1550,222 @@ gist_poly_distance(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(distance); } + +/* + * Z-order routines for fast index build + */ + +/* + * Compute Z-value of a point + * + * Z-order (also known as Morton Code) maps a two-dimensional point to a + * single integer, in a way that preserves locality. Points that are close in + * the two-dimensional space are mapped to integers that are not far from each + * other. We do that by interleaving the bits in the X and Y components. + * + * Morton Code is normally defined only for integers, but the X and Y values + * of a point are floating point. We expect floats to be in IEEE format.
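+ * + * For example, interleaving x = 0b1011 with y = 0b0110 yields z = 0b01101101: + * the y bits occupy the odd bit positions and the x bits the even ones.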
+ */ +static uint64 +point_zorder_internal(float4 x, float4 y) +{ + uint32 ix = ieee_float32_to_uint32(x); + uint32 iy = ieee_float32_to_uint32(y); + + /* Interleave the bits */ + return part_bits32_by2(ix) | (part_bits32_by2(iy) << 1); +} + +/* Interleave 32 bits with zeroes */ +static uint64 +part_bits32_by2(uint32 x) +{ + uint64 n = x; + + n = (n | (n << 16)) & UINT64CONST(0x0000FFFF0000FFFF); + n = (n | (n << 8)) & UINT64CONST(0x00FF00FF00FF00FF); + n = (n | (n << 4)) & UINT64CONST(0x0F0F0F0F0F0F0F0F); + n = (n | (n << 2)) & UINT64CONST(0x3333333333333333); + n = (n | (n << 1)) & UINT64CONST(0x5555555555555555); + + return n; +} + +/* + * Convert a 32-bit IEEE float to uint32 in a way that preserves the ordering + */ +static uint32 +ieee_float32_to_uint32(float f) +{ + /*---- + * + * IEEE 754 floating point format + * ------------------------------ + * + * IEEE 754 floating point numbers have this format: + * + * exponent (8 bits) + * | + * s eeeeeeee mmmmmmmmmmmmmmmmmmmmmmm + * | | + * sign mantissa (23 bits) + * + * Infinity has all bits in the exponent set and the mantissa is all + * zeros. Negative infinity is the same but with the sign bit set. + * + * NaNs are represented with all bits in the exponent set and a non-zero + * mantissa. The mantissa bits can be used to distinguish different kinds + * of NaNs. + * + * The IEEE format has the nice property that when you take the bit + * representation and interpret it as an integer, the order is preserved, + * except for the sign. That holds for the +-Infinity values too. + * + * Mapping to uint32 + * ----------------- + * + * In order to have a smooth transition from negative to positive numbers, + * we map floats to unsigned integers like this: + * + * x < 0 to range 0-7FFFFFFF + * x = 0 to value 80000000 (negative zero maps to 7FFFFFFF, just below it) + * x > 0 to range 80000001-FFFFFFFF + * + * We don't care to distinguish different kinds of NaNs, so they are all + * mapped to the same arbitrary value, FFFFFFFF. Because of the IEEE bit + * representation of NaNs, there aren't any non-NaN values that would be + * mapped to FFFFFFFF. In fact, there is a range of unused values on both + * ends of the uint32 space. + */ + if (isnan(f)) + return 0xFFFFFFFF; + else + { + union + { + float f; + uint32 i; + } u; + + u.f = f; + + /* Check the sign bit */ + if ((u.i & 0x80000000) != 0) + { + /* + * Map the negative value to range 0-7FFFFFFF. This flips the sign + * bit to 0 in the same instruction. + */ + Assert(f <= 0); /* can be -0 */ + u.i ^= 0xFFFFFFFF; + } + else + { + /* Map the positive value (or 0) to range 80000000-FFFFFFFF */ + u.i |= 0x80000000; + } + + return u.i; + } +} + +/* + * Compare the Z-order of points + */ +static int +gist_bbox_zorder_cmp(Datum a, Datum b, SortSupport ssup) +{ + Point *p1 = &(DatumGetBoxP(a)->low); + Point *p2 = &(DatumGetBoxP(b)->low); + uint64 z1; + uint64 z2; + + /* + * Do a quick check for equality first. It's not clear if this is worth it + * in general, but it certainly is when used as a tie-breaker with + * abbreviated keys. + */ + if (p1->x == p2->x && p1->y == p2->y) + return 0; + + z1 = point_zorder_internal(p1->x, p1->y); + z2 = point_zorder_internal(p2->x, p2->y); + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * Abbreviated version of Z-order comparison + * + * The abbreviated format is a Z-order value computed from the two 32-bit + * floats.
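(Illustration only, not part of the patch: the order-preserving claim for ieee_float32_to_uint32() above can be spot-checked in a standalone program; the helper name map_float and the test values below are invented for this sketch.)

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* same mapping as ieee_float32_to_uint32(), without the PostgreSQL types */
static uint32_t
map_float(float f)
{
    uint32_t    i;

    if (isnan(f))
        return 0xFFFFFFFF;
    memcpy(&i, &f, sizeof(i));  /* well-defined way to read the bits */
    return (i & 0x80000000) ? (i ^ 0xFFFFFFFF) : (i | 0x80000000);
}

int
main(void)
{
    /* ascending inputs must map to ascending unsigned integers */
    float       v[] = {-INFINITY, -1.5f, -0.0f, 0.0f, 1.5f, INFINITY};
    int         n = sizeof(v) / sizeof(v[0]);

    for (int i = 0; i + 1 < n; i++)
        assert(map_float(v[i]) <= map_float(v[i + 1]));
    return 0;
}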
If SIZEOF_DATUM == 8, the 64-bit Z-order value fits fully in the + * abbreviated Datum, otherwise use its most significant bits. + */ +static Datum +gist_bbox_zorder_abbrev_convert(Datum original, SortSupport ssup) +{ + Point *p = &(DatumGetBoxP(original)->low); + uint64 z; + + z = point_zorder_internal(p->x, p->y); + +#if SIZEOF_DATUM == 8 + return (Datum) z; +#else + return (Datum) (z >> 32); +#endif +} + +static int +gist_bbox_zorder_cmp_abbrev(Datum z1, Datum z2, SortSupport ssup) +{ + /* + * Compare the pre-computed Z-orders as unsigned integers. Datum is a + * typedef for 'uintptr_t', so no casting is required. + */ + if (z1 > z2) + return 1; + else if (z1 < z2) + return -1; + else + return 0; +} + +/* + * We never consider aborting the abbreviation. + * + * On 64-bit systems, the abbreviation is not lossy so it is always + * worthwhile. (Perhaps it's not on 32-bit systems, but we don't bother + * with logic to decide.) + */ +static bool +gist_bbox_zorder_abbrev_abort(int memtupcount, SortSupport ssup) +{ + return false; +} + +/* + * Sort support routine for fast GiST index build by sorting. + */ +Datum +gist_point_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + if (ssup->abbreviate) + { + ssup->comparator = gist_bbox_zorder_cmp_abbrev; + ssup->abbrev_converter = gist_bbox_zorder_abbrev_convert; + ssup->abbrev_abort = gist_bbox_zorder_abbrev_abort; + ssup->abbrev_full_comparator = gist_bbox_zorder_cmp; + } + else + { + ssup->comparator = gist_bbox_zorder_cmp; + } + PG_RETURN_VOID(); +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 0516059e3d..615b5ade23 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -572,12 +572,31 @@ gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, - Datum attdata[], bool isnull[], bool isleaf) + Datum *attdata, bool *isnull, bool isleaf) { Datum compatt[INDEX_MAX_KEYS]; - int i; IndexTuple res; + gistCompressValues(giststate, r, attdata, isnull, isleaf, compatt); + + res = index_form_tuple(isleaf ? giststate->leafTupdesc : + giststate->nonLeafTupdesc, + compatt, isnull); + + /* + * The offset number on tuples on internal pages is unused. For historical + * reasons, it is set to 0xffff. + */ + ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); + return res; +} + +void +gistCompressValues(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf, Datum *compatt) +{ + int i; + /* * Call the compress method on each attribute. */ @@ -617,17 +636,6 @@ gistFormTuple(GISTSTATE *giststate, Relation r, compatt[i] = attdata[i]; } } - - res = index_form_tuple(isleaf ? giststate->leafTupdesc : - giststate->nonLeafTupdesc, - compatt, isnull); - - /* - * The offset number on tuples on internal pages is unused. For historical - * reasons, it is set to 0xffff. 
- */ - ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff); - return res; } /* @@ -745,14 +753,11 @@ gistpenalty(GISTSTATE *giststate, int attno, * Initialize a new index page */ void -GISTInitBuffer(Buffer b, uint32 f) +gistinitpage(Page page, uint32 f) { GISTPageOpaque opaque; - Page page; - Size pageSize; + Size pageSize = BLCKSZ; - pageSize = BufferGetPageSize(b); - page = BufferGetPage(b); PageInit(page, pageSize, sizeof(GISTPageOpaqueData)); opaque = GistPageGetOpaque(page); @@ -763,6 +768,18 @@ GISTInitBuffer(Buffer b, uint32 f) opaque->gist_page_id = GIST_PAGE_ID; } +/* + * Initialize a new index buffer + */ +void +GISTInitBuffer(Buffer b, uint32 f) +{ + Page page; + + page = BufferGetPage(b); + gistinitpage(page, f); +} + /* * Verify that a freshly-read page looks sane. */ diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index 2b9ab693be..8a14620fab 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -143,6 +143,10 @@ gistvalidate(Oid opclassoid) case GIST_OPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case GIST_SORTSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -263,7 +267,7 @@ gistvalidate(Oid opclassoid) continue; /* got it */ if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || - i == GIST_OPTIONS_PROC) + i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC) continue; /* optional methods */ ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index c526bb1928..1f0e4e01e6 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1019,6 +1019,63 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, return recptr; } +/* + * Like log_newpage(), but allows logging multiple pages in one operation. + * It is more efficient than calling log_newpage() for each page separately, + * because we can write multiple pages in a single WAL record. + */ +void +log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, + BlockNumber *blknos, Page *pages, bool page_std) +{ + int flags; + XLogRecPtr recptr; + int i; + int j; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages. They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + i = 0; + while (i < num_pages) + { + int batch_start = i; + int nbatch; + + XLogBeginInsert(); + + nbatch = 0; + while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages) + { + XLogRegisterBlock(nbatch, rnode, forkNum, blknos[i], pages[i], flags); + i++; + nbatch++; + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (j = batch_start; j < i; j++) + { + /* + * The page may be uninitialized. If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(pages[j])) + { + PageSetLSN(pages[j], recptr); + } + } + } +} + /* * Write a WAL record containing a full image of a page. 
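(Illustration only: with XLR_MAX_BLOCK_ID = 32, the batching loop above splits, say, 70 pages into WAL records covering 32, 32, and 6 pages. A standalone rendition of just that arithmetic:)

#include <stdio.h>

#define XLR_MAX_BLOCK_ID 32     /* value used by PostgreSQL */

int
main(void)
{
    int         num_pages = 70;
    int         i = 0;

    while (i < num_pages)
    {
        int         nbatch = 0;

        /* mirrors the inner registration loop of log_newpages() */
        while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages)
        {
            i++;
            nbatch++;
        }
        printf("batch of %d pages\n", nbatch);  /* prints 32, 32, 6 */
    }
    return 0;
}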
* diff --git a/src/backend/utils/sort/sortsupport.c b/src/backend/utils/sort/sortsupport.c index fcfe6e831a..c436fbb4ce 100644 --- a/src/backend/utils/sort/sortsupport.c +++ b/src/backend/utils/sort/sortsupport.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "access/gist.h" #include "access/nbtree.h" #include "catalog/pg_am.h" #include "fmgr.h" @@ -175,3 +176,36 @@ PrepareSortSupportFromIndexRel(Relation indexRel, int16 strategy, FinishSortSupportFunction(opfamily, opcintype, ssup); } + +/* + * Fill in SortSupport given a GiST index relation + * + * Caller must previously have zeroed the SortSupportData structure and then + * filled in ssup_cxt, ssup_attno, ssup_collation, and ssup_nulls_first. This + * will fill in ssup_reverse (always false for GiST index build), as well as + * the comparator function pointer. + */ +void +PrepareSortSupportFromGistIndexRel(Relation indexRel, SortSupport ssup) +{ + Oid opfamily = indexRel->rd_opfamily[ssup->ssup_attno - 1]; + Oid opcintype = indexRel->rd_opcintype[ssup->ssup_attno - 1]; + Oid sortSupportFunction; + + Assert(ssup->comparator == NULL); + + if (indexRel->rd_rel->relam != GIST_AM_OID) + elog(ERROR, "unexpected non-gist AM: %u", indexRel->rd_rel->relam); + ssup->ssup_reverse = false; + + /* + * Look up the sort support function. This is simpler than for B-tree + * indexes because we don't support the old-style btree comparators. + */ + sortSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype, + GIST_SORTSUPPORT_PROC); + if (!OidIsValid(sortSupportFunction)) + elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", + GIST_SORTSUPPORT_PROC, opcintype, opcintype, opfamily); + OidFunctionCall1(sortSupportFunction, PointerGetDatum(ssup)); +} diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index cbda911f46..d0cc04a878 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1167,6 +1167,63 @@ tuplesort_begin_index_hash(Relation heapRel, return state; } +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = indexRel->rd_indcollation[i]; + sortKey->ssup_nulls_first = false; + sortKey->ssup_attno = i + 1; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + /* Look for a sort support function */ + PrepareSortSupportFromGistIndexRel(indexRel, sortKey); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 4994351697..4f6dae9a76 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -37,7 +37,8 @@ #define GIST_DISTANCE_PROC 8 #define GIST_FETCH_PROC 9 #define GIST_OPTIONS_PROC 10 -#define GISTNProcs 10 +#define GIST_SORTSUPPORT_PROC 11 +#define GISTNProcs 11 /* * Page opaque data in a GiST index page. diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 02e985549f..b68c01a5f2 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -501,12 +501,15 @@ extern IndexTuple gistgetadjusted(Relation r, GISTSTATE *giststate); extern IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf); +extern void gistCompressValues(GISTSTATE *giststate, Relation r, + Datum *attdata, bool *isnull, bool isleaf, Datum *compatt); extern OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate); extern void GISTInitBuffer(Buffer b, uint32 f); +extern void gistinitpage(Page page, uint32 f); extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, Datum k, Relation r, Page pg, OffsetNumber o, bool l, bool isNull); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 63df25ae90..4ba2c56be6 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -54,6 +54,8 @@ extern bool XLogCheckBufferNeedsBackup(Buffer buffer); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, char *page, bool page_std); +extern void log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, + BlockNumber *blknos, char **pages, bool page_std); extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); extern void log_newpage_range(Relation rel, ForkNumber forkNum, BlockNumber startblk, BlockNumber endblk, bool page_std); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 0bbe0a122a..06ddb1f16b 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,7 @@ */ /* yyyymmddN */ +/* FIXME: bump this before pushing! 
*/ #define CATALOG_VERSION_NO 202009031 #endif diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 37b580883f..a8e0c4ff8a 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -480,6 +480,8 @@ amproc => 'gist_point_distance' }, { amprocfamily => 'gist/point_ops', amproclefttype => 'point', amprocrighttype => 'point', amprocnum => '9', amproc => 'gist_point_fetch' }, +{ amprocfamily => 'gist/point_ops', amproclefttype => 'point', + amprocrighttype => 'point', amprocnum => '11', amproc => 'gist_point_sortsupport' }, { amprocfamily => 'gist/box_ops', amproclefttype => 'box', amprocrighttype => 'box', amprocnum => '1', amproc => 'gist_box_consistent' }, { amprocfamily => 'gist/box_ops', amproclefttype => 'box', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 687509ba92..96d7efd427 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8062,6 +8062,9 @@ proname => 'gist_poly_distance', prorettype => 'float8', proargtypes => 'internal polygon int2 oid internal', prosrc => 'gist_poly_distance' }, +{ oid => '3435', descr => 'sort support', + proname => 'gist_point_sortsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'gist_point_sortsupport' }, # GIN array support { oid => '2743', descr => 'GIN array support', diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h index 264aec820b..fb262c6e8d 100644 --- a/src/include/utils/sortsupport.h +++ b/src/include/utils/sortsupport.h @@ -272,5 +272,6 @@ extern void PrepareSortSupportComparisonShim(Oid cmpFunc, SortSupport ssup); extern void PrepareSortSupportFromOrderingOp(Oid orderingOp, SortSupport ssup); extern void PrepareSortSupportFromIndexRel(Relation indexRel, int16 strategy, SortSupport ssup); +extern void PrepareSortSupportFromGistIndexRel(Relation indexRel, SortSupport ssup); #endif /* SORTSUPPORT_H */ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 9e76666fe9..c69b36e209 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -217,6 +217,10 @@ extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, uint32 max_buckets, int workMem, SortCoordinate coordinate, bool randomAccess); +extern Tuplesortstate *tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, SortCoordinate coordinate, + bool randomAccess); extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 64c0c66859..6ace7662ee 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -523,8 +523,8 @@ SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; f1 ------------------- - (0,0) (1e-300,-1e-300) + (0,0) (-3,4) (-10,0) (10,10) @@ -561,8 +561,8 @@ SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; f1 ------------------- - (0,0) (1e-300,-1e-300) + (0,0) (-3,4) (-10,0) (10,10) @@ -584,8 +584,8 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; f1 ------------------ - (0,0) (1e-300,-1e-300) + (0,0) (-3,4) (-10,0) (10,10)
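(A note on the reordered expected output above: (0,0) and (1e-300,-1e-300) lie at exactly the same float8 distance from the query point (0,1), because 1e-300 vanishes next to 1 at double precision. ORDER BY f1 <-> '0,1' therefore ties on these two rows, and the tie is broken by the order in which the scan returns them, which the sorted build changes. A standalone check, outside the patch:)

#include <math.h>
#include <stdio.h>

int
main(void)
{
    double      d1 = hypot(0.0, 0.0 - 1.0);         /* (0,0) to (0,1) */
    double      d2 = hypot(1e-300, -1e-300 - 1.0);  /* (1e-300,-1e-300) */

    printf("%d\n", d1 == d2);   /* prints 1: the distances tie exactly */
    return 0;
}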