/*-------------------------------------------------------------------------
 *
 * spginsert.c
 *	  Externally visible index creation/insertion routines
 *
 * All the actual insertion logic is in spgdoinsert.c.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/spgist/spginsert.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "access/genam.h"
#include "access/spgist_private.h"
#include "access/spgxlog.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
typedef struct
{
    SpGistState spgstate;       /* SPGiST's working state */
    MemoryContext tmpCtx;       /* per-tuple temporary context */
} SpGistBuildState;

/* Callback to process one heap tuple during IndexBuildHeapScan */
static void
spgistBuildCallback(Relation index, HeapTuple htup, Datum *values,
                    bool *isnull, bool tupleIsAlive, void *state)
{
    SpGistBuildState *buildstate = (SpGistBuildState *) state;
    MemoryContext oldCtx;

    /* Work in temp context, and reset it after each tuple */
    oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

    /*
     * Even though no concurrent insertions can be happening, we still might
     * get a buffer-locking failure due to bgwriter or checkpointer taking a
     * lock on some buffer.  So we need to be willing to retry.  We can flush
     * any temp data when retrying.
     */
    while (!spgdoinsert(index, &buildstate->spgstate, &htup->t_self,
                        *values, *isnull))
    {
        MemoryContextReset(buildstate->tmpCtx);
    }

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->tmpCtx);
}

/*
 * Build an SP-GiST index.
 */
IndexBuildResult *
spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    SpGistBuildState buildstate;
    Buffer      metabuffer,
                rootbuffer,
                nullbuffer;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /*
     * Initialize the meta page and root pages
     */
    metabuffer = SpGistNewBuffer(index);
    rootbuffer = SpGistNewBuffer(index);
    nullbuffer = SpGistNewBuffer(index);

    Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
    Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
    Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);
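
    /*
     * Since the index was empty, the three buffers just allocated must be
     * the fixed pages of an SP-GiST index: the metapage (block 0) and the
     * root pages for regular entries (block 1) and null entries (block 2).
     */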

    START_CRIT_SECTION();

    SpGistInitMetapage(BufferGetPage(metabuffer));
    MarkBufferDirty(metabuffer);
    SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
    MarkBufferDirty(rootbuffer);
    SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS);
    MarkBufferDirty(nullbuffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        /*
         * Replay will re-initialize the pages, so don't take full-page
         * images.  No other data to log.
         */
        XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
        XLogRegisterBuffer(1, rootbuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);
        XLogRegisterBuffer(2, nullbuffer, REGBUF_WILL_INIT | REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX);

        PageSetLSN(BufferGetPage(metabuffer), recptr);
        PageSetLSN(BufferGetPage(rootbuffer), recptr);
        PageSetLSN(BufferGetPage(nullbuffer), recptr);
    }

    END_CRIT_SECTION();

    UnlockReleaseBuffer(metabuffer);
    UnlockReleaseBuffer(rootbuffer);
    UnlockReleaseBuffer(nullbuffer);

    /*
     * Now insert all the heap data into the index
     */
    initSpGistState(&buildstate.spgstate, index);
    buildstate.spgstate.isBuild = true;

    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "SP-GiST build temporary context",
                                              ALLOCSET_DEFAULT_SIZES);

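    /*
     * Scan the heap, calling spgistBuildCallback() for each tuple to be
     * indexed; the return value is the number of heap tuples scanned.
     */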
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   spgistBuildCallback, (void *) &buildstate,
                                   NULL);

    MemoryContextDelete(buildstate.tmpCtx);

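    /* Write the locally cached last-used-page data back to the metapage */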
    SpGistUpdateMetaPage(index);

    result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
    result->heap_tuples = result->index_tuples = reltuples;

    return result;
}

/*
 * Build an empty SPGiST index in the initialization fork
 */
void
spgbuildempty(Relation index)
{
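    /*
     * This is called only to set up the init fork of an unlogged index;
     * after a crash, recovery copies the init fork over the main fork,
     * restoring the index to its empty state.
     */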
    Page        page;

    /* Construct metapage. */
    page = (Page) palloc(BLCKSZ);
    SpGistInitMetapage(page);

    /*
     * Write the page and log it unconditionally.  This is important
     * particularly for indexes created on tablespaces and databases whose
     * creation happened after the last redo pointer, since recovery removes
     * any of their existing content when the corresponding create records
     * are replayed.
     */
    PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
              (char *) page, true);
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                SPGIST_METAPAGE_BLKNO, page, true);

    /* Likewise for the root page. */
    SpGistInitPage(page, SPGIST_LEAF);
    PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
              (char *) page, true);
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                SPGIST_ROOT_BLKNO, page, true);

    /* Likewise for the null-tuples root page. */
    SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
    PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
              (char *) page, true);
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                SPGIST_NULL_BLKNO, page, true);

    /*
     * An immediate sync is required even if we xlog'd the pages, because the
     * writes did not go through shared buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 * Insert one new tuple into an SPGiST index.
 */
bool
spginsert(Relation index, Datum *values, bool *isnull,
          ItemPointer ht_ctid, Relation heapRel,
          IndexUniqueCheck checkUnique,
          IndexInfo *indexInfo)
{
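    /* SP-GiST does not support unique indexes, so checkUnique is ignored */
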
    SpGistState spgstate;
    MemoryContext oldCtx;
    MemoryContext insertCtx;

    insertCtx = AllocSetContextCreate(CurrentMemoryContext,
                                      "SP-GiST insert temporary context",
                                      ALLOCSET_DEFAULT_SIZES);
    oldCtx = MemoryContextSwitchTo(insertCtx);

    initSpGistState(&spgstate, index);

    /*
     * We might have to repeat spgdoinsert() multiple times, if conflicts
     * occur with concurrent insertions.  If so, reset the insertCtx each time
     * to avoid cumulative memory consumption.  That means we also have to
     * redo initSpGistState(), but it's cheap enough not to matter.
     */
    while (!spgdoinsert(index, &spgstate, ht_ctid, *values, *isnull))
    {
        MemoryContextReset(insertCtx);
        initSpGistState(&spgstate, index);
    }

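    /* Opportunistically update the metapage's last-used-page cache */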
    SpGistUpdateMetaPage(index);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextDelete(insertCtx);

    /* return false since we've not done any unique check */
    return false;
}