/*
 * Listing captured from a web source viewer.
 * Path: postgresql/src/backend/access/gist/gist.c (1145 lines, 34 KiB, C)
 */

/*-------------------------------------------------------------------------
*
* gist.c
* interface routines for the postgres GiST index access method.
*
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.122 2005/06/27 12:45:21 teodor Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/gist_private.h"
#include "access/gistscan.h"
#include "access/heapam.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "utils/memutils.h"
/*
 * LSN value stamped on pages of temporary relations.  In this file every
 * XLogInsert() call is guarded by !rd_istemp; the matching else-branches
 * call PageSetLSN() with this constant instead of a real record pointer.
 * NOTE(review): presumably chosen to be a non-"invalid" LSN so that the
 * LSN checks done while descending the tree still work -- confirm.
 */
const XLogRecPtr XLogRecPtrForTemp = { 1, 1 };
/* Working state for gistbuild and its callback */
typedef struct
{
	GISTSTATE	giststate;		/* filled in by initGISTstate() in gistbuild */
	int			numindexattrs;	/* copied from indexInfo->ii_NumIndexAttrs */
	double		indtuples;		/* incremented once per tuple inserted by the
								 * build callback */
	MemoryContext tmpCtx;		/* scratch context; the callback switches into
								 * it and resets it after every tuple */
} GISTBuildState;
/* non-export function prototypes */
static void gistbuildCallback(Relation index,
HeapTuple htup,
Datum *values,
bool *isnull,
bool tupleIsAlive,
void *state);
2001-03-22 05:01:46 +01:00
static void gistdoinsert(Relation r,
IndexTuple itup,
GISTSTATE *GISTstate);
static void gistfindleaf(GISTInsertState *state,
2001-03-22 05:01:46 +01:00
GISTSTATE *giststate);
/*
 * Push a new, zero-filled SplitedPageLayout onto the front of the list
 * whose head is (d).  The argument is evaluated twice, so it must be a
 * side-effect-free lvalue.
 */
#define ROTATEDIST(d) do { \
	SplitedPageLayout *tmp = \
		(SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout)); \
	tmp->next = (d); \
	(d) = tmp; \
} while(0)
/*
 * Build a scratch memory context for GiST work, as a child of whatever
 * context is current at the time of the call.
 *
 * User-supplied support methods are _always_ run inside such a context so
 * that anything they leak is reclaimed when the context is reset or
 * deleted.  The GiST code itself also leans on these contexts to avoid
 * fiddly manual freeing.
 *
 * The caller owns the returned context and must reset/delete it.
 */
MemoryContext
createTempGistContext(void)
{
	MemoryContext gistCtx;

	gistCtx = AllocSetContextCreate(CurrentMemoryContext,
									"GiST temporary context",
									ALLOCSET_DEFAULT_MINSIZE,
									ALLOCSET_DEFAULT_INITSIZE,
									ALLOCSET_DEFAULT_MAXSIZE);

	return gistCtx;
}
/*
* Routine to build an index. Basically calls insert over and over.
*
* XXX: it would be nice to implement some sort of bulk-loading
* algorithm, but it is not clear how to do that.
*/
Datum
gistbuild(PG_FUNCTION_ARGS)
{
2001-03-22 05:01:46 +01:00
Relation heap = (Relation) PG_GETARG_POINTER(0);
Relation index = (Relation) PG_GETARG_POINTER(1);
IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
double reltuples;
GISTBuildState buildstate;
Buffer buffer;
/*
* We expect to be called exactly once for any index relation. If
* that's not the case, big trouble's what we have.
*/
if (RelationGetNumberOfBlocks(index) != 0)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
/* no locking is needed */
initGISTstate(&buildstate.giststate, index);
/* initialize the root page */
buffer = gistNewBuffer(index);
GISTInitBuffer(buffer, F_LEAF);
if ( !index->rd_istemp ) {
XLogRecPtr recptr;
XLogRecData rdata;
Page page;
rdata.buffer = InvalidBuffer;
rdata.data = (char*)&(index->rd_node);
rdata.len = sizeof(RelFileNode);
rdata.next = NULL;
page = BufferGetPage(buffer);
START_CRIT_SECTION();
recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
END_CRIT_SECTION();
} else
PageSetLSN(BufferGetPage(buffer), XLogRecPtrForTemp);
LockBuffer(buffer, GIST_UNLOCK);
WriteBuffer(buffer);
/* build the index */
buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
buildstate.indtuples = 0;
/*
* create a temporary memory context that is reset once for each
* tuple inserted into the index
*/
buildstate.tmpCtx = createTempGistContext();
/* do the heap scan */
reltuples = IndexBuildHeapScan(heap, index, indexInfo,
gistbuildCallback, (void *) &buildstate);
/* okay, all heap tuples are indexed */
MemoryContextDelete(buildstate.tmpCtx);
/* since we just counted the # of tuples, may as well update stats */
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
freeGISTstate(&buildstate.giststate);
PG_RETURN_VOID();
}
/*
 * Per-tuple callback from IndexBuildHeapScan
 *
 * 'state' is the GISTBuildState set up by gistbuild.  Heap tuples whose
 * first index column is NULL are skipped.  For every other tuple the
 * non-null keys are compressed (values[] is overwritten in place), an
 * index tuple is formed and inserted, and the per-tuple scratch context
 * is reset afterwards.
 */
static void
gistbuildCallback(Relation index,
				  HeapTuple htup,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	GISTBuildState *bs = (GISTBuildState *) state;
	MemoryContext callerCtx;
	IndexTuple	newtup;
	int			attno;

	/* GiST cannot index tuples with leading NULLs */
	if (isnull[0])
		return;

	/* everything allocated from here on lives in the scratch context */
	callerCtx = MemoryContextSwitchTo(bs->tmpCtx);

	/* immediately compress keys to normalize */
	for (attno = 0; attno < bs->numindexattrs; attno++)
	{
		GISTENTRY	compressed;

		if (!isnull[attno])
		{
			gistcentryinit(&bs->giststate, attno, &compressed, values[attno],
						   NULL, NULL, (OffsetNumber) 0,
						   -1 /* size is currently bogus */, TRUE, FALSE);
			values[attno] = compressed.key;
		}
		else
			values[attno] = (Datum) 0;
	}

	/* form an index tuple and point it at the heap tuple */
	newtup = index_form_tuple(bs->giststate.tupdesc, values, isnull);
	newtup->t_tid = htup->t_self;

	/*
	 * The index relation is already locked by the build, so go straight
	 * to gistdoinsert instead of dispatching through gistinsert, which is
	 * meant for inserting single tuples into a live index.
	 */
	gistdoinsert(index, newtup, &bs->giststate);

	bs->indtuples += 1;

	MemoryContextSwitchTo(callerCtx);
	MemoryContextReset(bs->tmpCtx);
}
/*
* gistinsert -- wrapper for GiST tuple insertion.
*
* This is the public interface routine for tuple insertion in GiSTs.
* It doesn't do any work; just locks the relation and passes the buck.
*/
Datum
gistinsert(PG_FUNCTION_ARGS)
{
2001-03-22 05:01:46 +01:00
Relation r = (Relation) PG_GETARG_POINTER(0);
Datum *values = (Datum *) PG_GETARG_POINTER(1);
bool *isnull = (bool *) PG_GETARG_POINTER(2);
2001-03-22 05:01:46 +01:00
ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
#ifdef NOT_USED
2001-03-22 05:01:46 +01:00
Relation heapRel = (Relation) PG_GETARG_POINTER(4);
bool checkUnique = PG_GETARG_BOOL(5);
#endif
IndexTuple itup;
GISTSTATE giststate;
GISTENTRY tmpentry;
int i;
MemoryContext oldCtx;
MemoryContext insertCtx;
/* GiST cannot index tuples with leading NULLs */
if (isnull[0])
PG_RETURN_BOOL(false);
insertCtx = createTempGistContext();
oldCtx = MemoryContextSwitchTo(insertCtx);
initGISTstate(&giststate, r);
/* immediately compress keys to normalize */
for (i = 0; i < r->rd_att->natts; i++)
{
if (isnull[i])
values[i] = (Datum) 0;
else
{
gistcentryinit(&giststate, i, &tmpentry, values[i],
NULL, NULL, (OffsetNumber) 0,
-1 /* size is currently bogus */, TRUE, FALSE);
values[i] = tmpentry.key;
}
}
itup = index_form_tuple(giststate.tupdesc, values, isnull);
itup->t_tid = *ht_ctid;
gistdoinsert(r, itup, &giststate);
/* cleanup */
freeGISTstate(&giststate);
MemoryContextSwitchTo(oldCtx);
MemoryContextDelete(insertCtx);
PG_RETURN_BOOL(true);
}
/*
* Workhouse routine for doing insertion into a GiST index. Note that
* this routine assumes it is invoked in a short-lived memory context,
* so it does not bother releasing palloc'd allocations.
*/
2001-03-22 05:01:46 +01:00
static void
gistdoinsert(Relation r, IndexTuple itup, GISTSTATE *giststate)
2001-03-22 05:01:46 +01:00
{
GISTInsertState state;
2001-03-22 05:01:46 +01:00
memset(&state, 0, sizeof(GISTInsertState));
state.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
state.itup[0] = (IndexTuple) palloc(IndexTupleSize(itup));
memcpy(state.itup[0], itup, IndexTupleSize(itup));
state.ituplen=1;
state.r = r;
state.key = itup->t_tid;
state.needInsertComplete = true;
state.stack = (GISTInsertStack*)palloc0(sizeof(GISTInsertStack));
state.stack->blkno=GIST_ROOT_BLKNO;
gistfindleaf(&state, giststate);
gistmakedeal(&state, giststate);
}
/*
 * Put state->itup[0 .. ituplen-1] onto the page at the top of the insert
 * stack (state->stack), splitting that page when the tuples do not fit.
 *
 * On an inner page the old downlink at stack->childoffnum is deleted
 * first, because the tuples being placed replace it.
 *
 * On return state->itup / state->ituplen describe what has to go into the
 * parent:
 *	- after a split: one tuple per resulting page (ituplen >= 2);
 *	- otherwise: a single tuple (the union of what was placed, or on a leaf
 *	  the inserted tuple itself) whose block number is this page.
 *
 * The page buffer is written but stays pinned and locked; releasing it is
 * left to the caller.
 *
 * Returns true if the page was split.
 */
static bool
gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) {
	bool		is_splitted = false;
	bool		is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false;

	if ( !is_leaf )
		/*
		 * This node's key has been modified, either because a child
		 * split occurred or because we needed to adjust our key for
		 * an insert in a child node. Therefore, remove the old
		 * version of this node's key.
		 */
		PageIndexTupleDelete(state->stack->page, state->stack->childoffnum);

	if (gistnospace(state->stack->page, state->itup, state->ituplen))
	{
		/* no space for insertion */
		IndexTuple *itvec,
				   *newitup;
		int			tlen;
		SplitedPageLayout	*dist=NULL, *ptr;

		is_splitted = true;

		/* gather existing tuples plus the new ones, then split them */
		itvec = gistextractbuffer(state->stack->buffer, &tlen);
		itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen);
		newitup = gistSplit(state->r, state->stack->buffer, itvec, &tlen, &dist, giststate);

		if ( !state->r->rd_istemp ) {
			XLogRecPtr		recptr;
			XLogRecData		*rdata;

			rdata = formSplitRdata(state->r->rd_node, state->stack->blkno,
				&(state->key), dist);

			START_CRIT_SECTION();

			/* one WAL record covers every page produced by the split */
			recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata);
			ptr = dist;
			while(ptr) {
				PageSetLSN(BufferGetPage(ptr->buffer), recptr);
				PageSetTLI(BufferGetPage(ptr->buffer), ThisTimeLineID);
				ptr=ptr->next;
			}

			END_CRIT_SECTION();
		} else {
			ptr = dist;
			while(ptr) {
				PageSetLSN(BufferGetPage(ptr->buffer), XLogRecPtrForTemp);
				ptr=ptr->next;
			}
		}

		state->itup = newitup;
		state->ituplen = tlen;			/* now tlen >= 2 */

		if ( state->stack->blkno == GIST_ROOT_BLKNO ) {
			/* root split: the downlinks go into a rebuilt root page */
			gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key));
			state->needInsertComplete=false;

			/* chain the new pages left-to-right and write them out */
			ptr = dist;
			while(ptr) {
				Page page = (Page)BufferGetPage(ptr->buffer);
				GistPageGetOpaque(page)->rightlink = ( ptr->next ) ?
					ptr->next->block.blkno : InvalidBlockNumber;
				LockBuffer( ptr->buffer, GIST_UNLOCK );
				WriteBuffer(ptr->buffer);
				ptr=ptr->next;
			}
		} else {
			Page	page;
			BlockNumber	rightrightlink = InvalidBlockNumber;
			SplitedPageLayout	*ourpage=NULL;
			GistNSN		oldnsn;
			GISTPageOpaque	opaque;

			/* move origpage to first in chain */
			if ( dist->block.blkno != state->stack->blkno ) {
				ptr = dist;
				while(ptr->next) {
					if ( ptr->next->block.blkno == state->stack->blkno ) {
						ourpage = ptr->next;
						ptr->next = ptr->next->next;
						ourpage->next = dist;
						dist = ourpage;
						break;
					}
					ptr=ptr->next;
				}
				Assert( ourpage != NULL );
			} else
				ourpage = dist;

			/* now gets all needed data, and sets nsn's */
			page = (Page)BufferGetPage(ourpage->buffer);
			opaque = GistPageGetOpaque(page);
			rightrightlink = opaque->rightlink;
			oldnsn = opaque->nsn;
			opaque->nsn = PageGetLSN(page);
			opaque->rightlink = ourpage->next->block.blkno;

			/*
			 * Fill in and write all the new pages.  They are not linked
			 * into the tree yet (no downlinks exist in the parent).
			 */
			ptr = ourpage->next;
			while(ptr) {
				page = (Page)BufferGetPage(ptr->buffer);
				GistPageGetOpaque(page)->rightlink = ( ptr->next ) ?
					ptr->next->block.blkno : rightrightlink;
				/* only the last page in the chain inherits the old nsn */
				GistPageGetOpaque(page)->nsn = ( ptr->next ) ?
					opaque->nsn : oldnsn;

				LockBuffer(ptr->buffer, GIST_UNLOCK);
				WriteBuffer(ptr->buffer);
				ptr=ptr->next;
			}
		}
		WriteNoReleaseBuffer( state->stack->buffer );
	}
	else
	{
		/* enough space */
		OffsetNumber off;
		XLogRecPtr	oldlsn;

		/* append after the last existing item (or at the first slot) */
		off = ( PageIsEmpty(state->stack->page) ) ?
			FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(state->stack->page));
		gistfillbuffer(state->r, state->stack->page, state->itup, state->ituplen, off);

		oldlsn = PageGetLSN(state->stack->page);
		if ( !state->r->rd_istemp ) {
			OffsetNumber	noffs=0, offs[ MAXALIGN( sizeof(OffsetNumber) ) / sizeof(OffsetNumber) ];
			XLogRecPtr		recptr;
			XLogRecData		*rdata;

			if ( !is_leaf ) {
				/* only on inner page we should delete previous version */
				offs[0] = state->stack->childoffnum;
				noffs=1;
			}

			rdata = formUpdateRdata(state->r->rd_node, state->stack->blkno,
				offs, noffs, false, state->itup, state->ituplen,
				&(state->key));

			START_CRIT_SECTION();

			recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_ENTRY_UPDATE, rdata);
			PageSetLSN(state->stack->page, recptr);
			PageSetTLI(state->stack->page, ThisTimeLineID);

			END_CRIT_SECTION();
		} else
			PageSetLSN(state->stack->page, XLogRecPtrForTemp);

		if ( state->stack->blkno == GIST_ROOT_BLKNO )
			state->needInsertComplete=false;
		WriteNoReleaseBuffer(state->stack->buffer);

		if (!is_leaf)	/* small optimization: inform scans about the deletion */
			gistadjscans(state->r, GISTOP_DEL, state->stack->blkno,
				state->stack->childoffnum, PageGetLSN(state->stack->page), oldlsn );

		if (state->ituplen > 1)
		{						/* previous is_splitted==true */
			/*
			 * child was split, so we must form union for insertion in
			 * parent
			 */
			IndexTuple	newtup = gistunion(state->r, state->itup, state->ituplen, giststate);
			ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno);
			state->itup[0] = newtup;
			state->ituplen = 1;
		} else if (is_leaf) {
			/*
			 * itup[0] holds the key used to adjust the parent; mark it
			 * valid so the GistTupleIsInvalid check in gistgetadjusted()
			 * behaves correctly.
			 */
			ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno);
			GistTupleSetValid( state->itup[0] );
		}
	}
	return is_splitted;
}
/*
 * Walk down from state->stack (initially the root) to the leaf page where
 * state->itup[0] should be inserted, building the insert stack on the way.
 *
 * On return every page in the stack is pinned and the leaf at the top of
 * the stack (state->stack) is X-locked; inner pages are left unlocked.
 */
static void
gistfindleaf(GISTInsertState *state, GISTSTATE *giststate)
{
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTPageOpaque	opaque;

	/*
	 * Walk down.  Pages are not kept locked for long, so we must be ready
	 * to recheck the path if something changed underneath us.  Remember
	 * that a page's LSN should never be invalid.
	 */
	while( true ) {

		/*
		 * A stack item whose lsn is still invalid has not been visited
		 * yet, so its buffer must be read; when we come back to an item
		 * (after stepping up or retrying) its pin is reused.
		 */
		if ( XLogRecPtrIsInvalid( state->stack->lsn ) )
			state->stack->buffer = ReadBuffer(state->r, state->stack->blkno);
		LockBuffer( state->stack->buffer, GIST_SHARE );

		state->stack->page = (Page) BufferGetPage(state->stack->buffer);
		opaque = GistPageGetOpaque(state->stack->page);

		state->stack->lsn = PageGetLSN(state->stack->page);
		Assert( state->r->rd_istemp || !XLogRecPtrIsInvalid( state->stack->lsn ) );

		if ( state->stack->blkno != GIST_ROOT_BLKNO &&
				XLByteLT( state->stack->parent->lsn, opaque->nsn) ) {
			/*
			 * This non-root page was split after we looked at its parent;
			 * go back up to the parent to choose the best child again.
			 */
			LockBuffer( state->stack->buffer, GIST_UNLOCK );
			ReleaseBuffer( state->stack->buffer );
			state->stack = state->stack->parent;
			continue;
		}


		if (!GistPageIsLeaf(state->stack->page))
		{
			/*
			 * This is an internal page, so continue to walk down the
			 * tree.  Pick the child with the minimum insertion penalty,
			 * push a new stack item for it and loop.  Adjusting this
			 * page's key (because the child split, or because its key
			 * must grow to cover the new tuple) is not done here; it
			 * happens later, on the way back up the stack.
			 */
			GISTInsertStack	*item=(GISTInsertStack*)palloc0(sizeof(GISTInsertStack));

			state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate);

			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
			idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid);
			item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			LockBuffer( state->stack->buffer, GIST_UNLOCK );

			item->parent = state->stack;
			item->child = NULL;
			/* state->stack was already dereferenced above, so this always holds */
			if ( state->stack )
				state->stack->child = item;
			state->stack = item;
		} else {
			/*
			 * Leaf reached: swap the share lock for an exclusive one.  Be
			 * careful, the page may change between the unlock and the
			 * lock, so it is rechecked below.
			 */
			LockBuffer( state->stack->buffer, GIST_UNLOCK );
			LockBuffer( state->stack->buffer, GIST_EXCLUSIVE );
			state->stack->page = (Page) BufferGetPage(state->stack->buffer);
			opaque = GistPageGetOpaque(state->stack->page);

			if ( state->stack->blkno == GIST_ROOT_BLKNO ) {
				/*
				 * The root is the only page that can turn from a leaf
				 * into an inner page, so recheck it.
				 */
				if ( !GistPageIsLeaf(state->stack->page) ) {
					/*
					 * Very rare: while the lock was dropped, a one-page
					 * index grew.  Retry on the (still pinned) root.
					 */
					LockBuffer( state->stack->buffer, GIST_UNLOCK );
					continue;
				}
				/*
				 * No separate split check is needed for the root: the
				 * leaf/inner test is enough to recognize a root split.
				 */
			} else if ( XLByteLT( state->stack->parent->lsn, opaque->nsn) ) {
				/*
				 * The leaf was split during the unlock/lock window, so
				 * find a better child from the parent.
				 */

				/* forget buffer */
				LockBuffer( state->stack->buffer, GIST_UNLOCK );
				ReleaseBuffer( state->stack->buffer );
				state->stack = state->stack->parent;
				continue;
			}

			state->stack->lsn = PageGetLSN( state->stack->page );

			/* ok, we found a leaf page and it is X-locked */
			break;
		}
	}

	/* now state->stack->(page, buffer and blkno) points to leaf page */
}
/*
 * Read block 'blkno' of relation 'r' and take a share lock on it.
 *
 * The signature should stay the same as XLogReadBuffer's so that either
 * function can be passed to gistFindPath; the first argument is ignored.
 */
static Buffer
gistReadAndLockBuffer( bool unused, Relation r, BlockNumber blkno ) {
	Buffer		buf;

	buf = ReadBuffer( r, blkno );
	LockBuffer( buf, GIST_SHARE );

	return buf;
}
/*
 * Traverse the tree from the root to find the path to the page holding
 * the downlink to block 'child'.
 *
 * To prevent deadlocks only one page is locked at a time.  The function
 * is used both in recovery and in normal operation, so the page reader is
 * passed in (gistReadAndLockBuffer or XLogReadBuffer); it must return the
 * buffer already locked.
 *
 * Pages are visited in queue order: 'top' is the item being scanned and
 * new items are appended after 'tail' through their 'next' links.  While
 * queued, an item's childoffnum is the offset of its own downlink on its
 * parent's page; once the child is found these values are shifted up one
 * level so that every item's childoffnum is the offset, on its own page,
 * of the downlink to the next item on the path.
 *
 * Returns the stack item for the closest parent of 'child' (with parent
 * links leading to the root and child links leading back down), or NULL
 * if no downlink to 'child' was found.  No buffer is left pinned or
 * locked.
 */
GISTInsertStack*
gistFindPath( Relation r, BlockNumber child, Buffer (*myReadBuffer)(bool, Relation, BlockNumber) ) {
	Page		page;
	Buffer		buffer;
	OffsetNumber i, maxoff;
	ItemId		iid;
	IndexTuple	idxtuple;
	GISTInsertStack	*top, *tail, *ptr;
	BlockNumber	blkno;

	top = tail = (GISTInsertStack*)palloc0( sizeof(GISTInsertStack) );
	top->blkno = GIST_ROOT_BLKNO;

	while( top && top->blkno != child ) {
		buffer = myReadBuffer(false, r, top->blkno); /* buffer locked */
		page = (Page)BufferGetPage( buffer );

		if ( GistPageIsLeaf(page) ) {
			/*
			 * Children of any page with level > 0 get queued below, which
			 * includes leaf pages.  Tuples on a leaf carry heap TIDs, not
			 * downlinks, so scanning them could produce a bogus match on
			 * 'child'.  Skip the page; inner pages queued later (e.g. via
			 * a rightlink) are still visited.
			 */
			LockBuffer( buffer, GIST_UNLOCK );
			ReleaseBuffer( buffer );
			top = top->next;
			continue;
		}

		top->lsn = PageGetLSN(page);

		if ( top->parent && XLByteLT( top->parent->lsn, GistPageGetOpaque(page)->nsn) &&
				GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */) {
			/*
			 * The page was split after its parent was read: queue its
			 * right sibling too.  The sibling's parent pointer is set to
			 * this page and its childoffnum is left invalid.
			 */
			ptr = (GISTInsertStack*)palloc0( sizeof(GISTInsertStack) );
			ptr->blkno = GistPageGetOpaque(page)->rightlink;
			ptr->childoffnum = InvalidOffsetNumber;
			ptr->parent = top;
			ptr->next = NULL;
			tail->next = ptr;
			tail = ptr;
		}

		maxoff = PageGetMaxOffsetNumber(page);

		for(i = FirstOffsetNumber; i<= maxoff; i = OffsetNumberNext(i)) {
			iid = PageGetItemId(page, i);
			idxtuple = (IndexTuple) PageGetItem(page, iid);
			blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			if ( blkno == child ) {
				OffsetNumber	poff = InvalidOffsetNumber;

				/* make child links and shift childoffnum up one level */
				ptr = top;
				while( ptr->parent ) {
					/* set child link */
					ptr->parent->child = ptr;
					/* move childoffnum.. */
					if ( ptr == top ) {
						/* first iteration */
						poff = ptr->parent->childoffnum;
						ptr->parent->childoffnum = ptr->childoffnum;
					} else {
						OffsetNumber tmp = ptr->parent->childoffnum;
						ptr->parent->childoffnum = poff;
						poff = tmp;
					}
					ptr = ptr->parent;
				}
				top->childoffnum = i;
				LockBuffer( buffer, GIST_UNLOCK );
				ReleaseBuffer( buffer );
				return top;
			} else if ( GistPageGetOpaque(page)->level> 0 ) {
				/* Install next inner page to the end of stack */
				ptr = (GISTInsertStack*)palloc0( sizeof(GISTInsertStack) );
				ptr->blkno = blkno;
				ptr->childoffnum = i; /* offset of the downlink to this child, on the parent page */
				ptr->parent = top;
				ptr->next = NULL;
				tail->next = ptr;
				tail = ptr;
			}
		}

		LockBuffer( buffer, GIST_UNLOCK );
		ReleaseBuffer( buffer );
		top = top->next;
	}

	return NULL;
}
/*
 * Find and X-lock the page that currently holds the downlink to
 * child->blkno.
 *
 * On return child->parent describes that page: its buffer is pinned and
 * exclusively locked, and parent->childoffnum is the offset of the
 * downlink.  Nothing is returned; the result is delivered through the
 * stack.
 *
 * Expects child->parent (and its ancestors) to be pinned on entry.
 */
static void
gistFindCorrectParent( Relation r, GISTInsertStack *child ) {
	GISTInsertStack	*parent = child->parent;

	LockBuffer( parent->buffer, GIST_EXCLUSIVE );
	parent->page = (Page)BufferGetPage( parent->buffer );


	/* here we don't need to distinguish between split and page update */
	if ( parent->childoffnum == InvalidOffsetNumber || !XLByteEQ( parent->lsn, PageGetLSN(parent->page) ) ) {
		/* parent is changed, look for the child along right links until found */
		OffsetNumber i, maxoff;
		ItemId		iid;
		IndexTuple	idxtuple;
		GISTInsertStack	*ptr;

		while(true) {
			maxoff = PageGetMaxOffsetNumber(parent->page);
			for(i = FirstOffsetNumber; i<= maxoff; i = OffsetNumberNext(i)) {
				iid = PageGetItemId(parent->page, i);
				idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
				if ( ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno ) {
					/* found it; the page stays X-locked for the caller */
					parent->childoffnum = i;
					return;
				}
			}

			/* not on this page: note the right sibling before unlocking */
			parent->blkno = GistPageGetOpaque( parent->page )->rightlink;
			LockBuffer( parent->buffer, GIST_UNLOCK );
			ReleaseBuffer( parent->buffer );
			if ( parent->blkno == InvalidBlockNumber )
				/*
				 * End of the chain and the parent still was not found.
				 * This is the very rare case where the root was split.
				 */
				break;
			parent->buffer = ReadBuffer( r, parent->blkno );
			LockBuffer( parent->buffer, GIST_EXCLUSIVE );
			parent->page = (Page)BufferGetPage( parent->buffer );
		}

		/*
		 * Worst case: the tree must be searched to find the parent.
		 * First drop the pins on all the old ancestors.
		 */

		ptr = child->parent->parent; /* child->parent already released above */
		while(ptr) {
			ReleaseBuffer( ptr->buffer );
			ptr = ptr->parent;
		}

		/* ok, find new path */
		ptr = parent = gistFindPath(r, child->blkno, gistReadAndLockBuffer);
		Assert( ptr!=NULL );

		/* pin (without locking) every page on the new path, as the caller expects */
		while( ptr ) {
			ptr->buffer = ReadBuffer( r, ptr->blkno );
			ptr->page = (Page)BufferGetPage( ptr->buffer );
			ptr = ptr->parent;
		}

		/* install new chain of parents to stack */
		child->parent = parent;
		parent->child = child;

		/* recurse to lock the new parent and re-verify it the normal way */
		gistFindCorrectParent( r, child );
	}

	return;
}
/*
 * Perform the insertion described by 'state' and propagate its effects up
 * the insert stack.
 *
 * Expects state->stack to be the target page, pinned and X-locked, with
 * its ancestors pinned.  At each level the parent is X-locked first
 * (gistFindCorrectParent), the tuples are placed on the current page
 * (gistplacetopage), and the current page is then unlocked and released.
 * The walk stops at the root, or as soon as a parent's key needs no
 * adjustment.  All remaining pins are dropped before returning, and for
 * non-temporary relations an insert-completion WAL record is written if
 * state->needInsertComplete is still set.
 */
void
gistmakedeal(GISTInsertState *state, GISTSTATE *giststate) {
	bool			is_splitted;
	ItemId			iid;
	IndexTuple		oldtup, newtup;

	/* walk up */
	while( true ) {
		if ( state->stack->parent ) {
			/*
			 * X-lock the parent page before processing the child;
			 * gistFindCorrectParent finds and locks it.
			 */
			gistFindCorrectParent( state->r, state->stack );
		}

		/*
		 * After this call: 1. if the page was split, then itup contains
		 * keys for each resulting page; 2. if the page wasn't split, then
		 * itup contains the key to be used for adjusting the parent.
		 */
		is_splitted = gistplacetopage(state, giststate);

		/* parent locked above, so release child buffer */
		LockBuffer(state->stack->buffer, GIST_UNLOCK );
		ReleaseBuffer( state->stack->buffer );

		/* pop parent page from stack */
		state->stack = state->stack->parent;

		/* stack is empty: the root has been processed */
		if ( ! state->stack )
			break;

		/* child did not split, so check whether the parent tuple needs updating */
		if (!is_splitted)
		{
			/* parent's tuple */
			iid = PageGetItemId(state->stack->page, state->stack->childoffnum);
			oldtup = (IndexTuple) PageGetItem(state->stack->page, iid);
			newtup = gistgetadjusted(state->r, oldtup, state->itup[0], giststate);

			if (!newtup) { /* no need to update key */
				LockBuffer( state->stack->buffer, GIST_UNLOCK );
				break;
			}

			state->itup[0] = newtup;
		}
	} /* while */

	/* release all parent buffers */
	while( state->stack ) {
		ReleaseBuffer(state->stack->buffer);
		state->stack = state->stack->parent;
	}

	/* say to xlog that insert is completed */
	if ( state->needInsertComplete && !state->r->rd_istemp )
		gistxlogInsertCompletion(state->r->rd_node, &(state->key), 1);
}
/*
 * Translate, in place, each of the 'len' entries of 'arr' through the
 * lookup table 'reasloffset': every entry is replaced by the table slot
 * it used to index.
 */
static void
gistToRealOffset(OffsetNumber *arr, int len, OffsetNumber *reasloffset) {
	OffsetNumber   *slot = arr;
	int				remaining = len;

	while (remaining-- > 0) {
		*slot = reasloffset[ *slot ];
		slot++;
	}
}
/*
 *	gistSplit -- split a page in the tree.
 *
 *	r			index relation
 *	buffer		buffer of the page being split
 *	itup		all tuples to distribute (compressed entries)
 *	len			in: number of tuples in itup;
 *				out: number of tuples in the returned array
 *	dist		list head; a SplitedPageLayout describing each resulting
 *				page is pushed onto it (used for WAL logging)
 *	giststate	support-function state for the index
 *
 * Distributes the tuples between a left and a right page, recursing when
 * one side still does not fit.  Invalid tuples (possible on inner pages
 * only) are kept away from the user's picksplit method and are moved to
 * the right page where possible.
 *
 * Returns a palloc'd array with one tuple per resulting page; each
 * tuple's block number points at its page.  These are the downlinks to
 * be placed in the parent.
 */
IndexTuple *
gistSplit(Relation r,
		  Buffer buffer,
		  IndexTuple *itup,		/* contains compressed entry */
		  int *len,
		  SplitedPageLayout	**dist,
		  GISTSTATE *giststate)
{
	Page		p;
	Buffer		leftbuf,
				rightbuf;
	Page		left,
				right;
	IndexTuple *lvectup,
			   *rvectup,
			   *newtup;
	BlockNumber lbknum,
				rbknum;
	GISTPageOpaque opaque;
	GIST_SPLITVEC v;
	GistEntryVector *entryvec;
	int			i, fakeoffset,
				nlen;
	OffsetNumber	*realoffset;
	IndexTuple	*cleaneditup = itup;
	int			lencleaneditup = *len;
	int			level;

	p = (Page) BufferGetPage(buffer);
	opaque = GistPageGetOpaque(p);
	level = opaque->level;

	/*
	 * The root of the tree is the first block in the relation.  If we're
	 * about to split the root, we need to do some hocus-pocus to enforce
	 * this guarantee: both halves go to newly allocated pages.
	 */
	if (BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
	{
		leftbuf = gistNewBuffer(r);
		GISTInitBuffer(leftbuf, opaque->flags&F_LEAF);
		lbknum = BufferGetBlockNumber(leftbuf);
		left = (Page) BufferGetPage(leftbuf);
		GistPageGetOpaque(left)->level = level;
	}
	else
	{
		/* the left half is built in a temp page and copied back later */
		leftbuf = buffer;
		/* IncrBufferRefCount(buffer); */
		lbknum = BufferGetBlockNumber(buffer);
		left = (Page) PageGetTempPage(p, sizeof(GISTPageOpaqueData));
	}

	rightbuf = gistNewBuffer(r);
	GISTInitBuffer(rightbuf, opaque->flags&F_LEAF);
	rbknum = BufferGetBlockNumber(rightbuf);
	right = (Page) BufferGetPage(rightbuf);
	GistPageGetOpaque(right)->level = level;

	/*
	 * Generate the item array.  Valid tuples fill entryvec from slot 1
	 * upward; realoffset[] maps an entryvec slot back to the tuple's
	 * 1-based position in itup.  Positions of invalid tuples are stored
	 * at the tail of realoffset[], from index *len downward.
	 */
	realoffset = palloc((*len + 1) * sizeof(OffsetNumber));
	entryvec = palloc(GEVHDRSZ + (*len + 1) * sizeof(GISTENTRY));
	entryvec->n = *len + 1;

	fakeoffset = FirstOffsetNumber;
	for (i = 1; i <= *len; i++)
	{
		Datum		datum;
		bool		IsNull;

		if (!GistPageIsLeaf(p) && GistTupleIsInvalid( itup[i - 1] )) {
			entryvec->n--;
			/* remember position of invalid tuple */
			realoffset[ entryvec->n ] = i;
			continue;
		}

		datum = index_getattr(itup[i - 1], 1, giststate->tupdesc, &IsNull);
		gistdentryinit(giststate, 0, &(entryvec->vector[fakeoffset]),
					   datum, r, p, i,
					   ATTSIZE(datum, giststate->tupdesc, 1, IsNull),
					   FALSE, IsNull);
		realoffset[ fakeoffset ] = i;
		fakeoffset++;
	}

	/*
	 * If there were invalid tuples we need special processing.  Where
	 * possible, all invalid tuples are moved to the right page.  Keep in
	 * mind that a union with invalid tuples is itself an invalid tuple.
	 */
	if ( entryvec->n != *len + 1 ) {
		lencleaneditup = entryvec->n-1;
		cleaneditup = (IndexTuple*)palloc(lencleaneditup * sizeof(IndexTuple));
		for(i=1;i<entryvec->n;i++)
			cleaneditup[i-1] = itup[ realoffset[ i ]-1 ];

		if ( gistnospace( left, cleaneditup, lencleaneditup ) ) {
			/* no space on left to put all good tuples, so picksplit */
			gistUserPicksplit(r, entryvec, &v, cleaneditup, lencleaneditup, giststate);
			v.spl_leftvalid = true;
			v.spl_rightvalid = false;
			gistToRealOffset( v.spl_left, v.spl_nleft, realoffset );
			gistToRealOffset( v.spl_right, v.spl_nright, realoffset );
		 } else {
			/* we can try to store all valid tuples on one page */
			v.spl_right = (OffsetNumber*)palloc( entryvec->n * sizeof(OffsetNumber) );
			v.spl_left = (OffsetNumber*)palloc( entryvec->n * sizeof(OffsetNumber) );

			if ( lencleaneditup==0 ) {
				/* all tuples are invalid, so move half of them to the right */
				v.spl_leftvalid = v.spl_rightvalid = false;
				v.spl_nright = 0;
				v.spl_nleft = 0;
				for(i=1;i<=*len;i++)
					if ( i-1<*len/2 )
						v.spl_left[ v.spl_nleft++ ] = i;
					else
						v.spl_right[ v.spl_nright++ ] = i;
			} else {
				/*
				 * We will not call gistUserPicksplit; just put the good
				 * tuples on the left and the invalid ones on the right.
				 */
				v.spl_nleft = lencleaneditup;
				v.spl_nright = 0;
				for(i=1;i<entryvec->n;i++)
					v.spl_left[i-1] = i;
				gistToRealOffset( v.spl_left, v.spl_nleft, realoffset );
				v.spl_lattr[0] = v.spl_ldatum = (Datum)0;
				v.spl_rattr[0] = v.spl_rdatum = (Datum)0;
				v.spl_lisnull[0] = true;
				v.spl_risnull[0] = true;
				gistunionsubkey(r, giststate, itup, &v, true);
				v.spl_leftvalid = true;
				v.spl_rightvalid = false;
			}
		}
	} else {
		/* there are no invalid tuples, so usual processing */
		gistUserPicksplit(r, entryvec, &v, itup, *len, giststate);
		v.spl_leftvalid = v.spl_rightvalid = true;
	}


	/* form left and right vector */
	lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (*len+1));
	rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (*len+1));

	for (i = 0; i < v.spl_nleft; i++)
		lvectup[i] = itup[v.spl_left[i] - 1];

	for (i = 0; i < v.spl_nright; i++)
		rvectup[i] = itup[v.spl_right[i] - 1];

	/* place invalid tuples on right page if that isn't done yet */
	for (fakeoffset = entryvec->n; fakeoffset < *len+1 && lencleaneditup; fakeoffset++) {
		rvectup[v.spl_nright++] = itup[realoffset[fakeoffset] - 1];
	}

	/* write on disk (may need another split) */
	if (gistnospace(right, rvectup, v.spl_nright))
	{
		nlen = v.spl_nright;
		newtup = gistSplit(r, rightbuf, rvectup, &nlen, dist, giststate);
		/* ReleaseBuffer(rightbuf); */
	}
	else
	{
		char *ptr;

		gistfillbuffer(r, right, rvectup, v.spl_nright, FirstOffsetNumber);
		/* XLOG stuff: record the right page's layout and tuple images */
		ROTATEDIST(*dist);
		(*dist)->block.blkno = BufferGetBlockNumber(rightbuf);
		(*dist)->block.num = v.spl_nright;
		(*dist)->list = (IndexTupleData*)palloc( BLCKSZ );
		ptr = (char*) ( (*dist)->list );
		for(i=0;i<v.spl_nright;i++) {
			memcpy( ptr, rvectup[i], IndexTupleSize( rvectup[i] ) );
			ptr += IndexTupleSize( rvectup[i] );
		}
		(*dist)->lenlist = ptr - ( (char*) ( (*dist)->list ) );
		(*dist)->buffer = rightbuf;

		/* downlink for the right page */
		nlen = 1;
		newtup = (IndexTuple *) palloc(sizeof(IndexTuple) * 1);
		newtup[0] = ( v.spl_rightvalid ) ? gistFormTuple(giststate, r, v.spl_rattr, v.spl_rattrsize, v.spl_risnull)
				: gist_form_invalid_tuple( rbknum );
		ItemPointerSetBlockNumber(&(newtup[0]->t_tid), rbknum);
	}

	if (gistnospace(left, lvectup, v.spl_nleft))
	{
		int			llen = v.spl_nleft;
		IndexTuple *lntup;

		lntup = gistSplit(r, leftbuf, lvectup, &llen, dist, giststate);
		/* ReleaseBuffer(leftbuf); */

		newtup = gistjoinvector(newtup, &nlen, lntup, llen);
	}
	else
	{
		char *ptr;

		gistfillbuffer(r, left, lvectup, v.spl_nleft, FirstOffsetNumber);
		/* XLOG stuff: record the left page's layout and tuple images */
		ROTATEDIST(*dist);
		(*dist)->block.blkno = BufferGetBlockNumber(leftbuf);
		(*dist)->block.num = v.spl_nleft;
		(*dist)->list = (IndexTupleData*)palloc( BLCKSZ );
		ptr = (char*) ( (*dist)->list );
		for(i=0;i<v.spl_nleft;i++) {
			memcpy( ptr, lvectup[i], IndexTupleSize( lvectup[i] ) );
			ptr += IndexTupleSize( lvectup[i] );
		}
		(*dist)->lenlist = ptr - ( (char*) ( (*dist)->list ) );
		(*dist)->buffer = leftbuf;

		/* for a non-root split, copy the temp page over the original */
		if (BufferGetBlockNumber(buffer) != GIST_ROOT_BLKNO)
			PageRestoreTempPage(left, p);

		/* append the downlink for the left page */
		nlen += 1;
		newtup = (IndexTuple *) repalloc(newtup, sizeof(IndexTuple) * nlen);
		newtup[nlen - 1] = ( v.spl_leftvalid ) ? gistFormTuple(giststate, r, v.spl_lattr, v.spl_lattrsize, v.spl_lisnull)
				: gist_form_invalid_tuple( lbknum );
		ItemPointerSetBlockNumber(&(newtup[nlen - 1]->t_tid), lbknum);
	}

	GistClearTuplesDeleted(p);

	*len = nlen;
	return newtup;
}
/*
 * Rebuild the root page as an inner page holding the 'len' tuples in
 * 'itup'.
 *
 * 'buffer' must be the root block.  The page is re-initialized with no
 * flags, its level becomes one more than before, and the tuples are
 * placed starting at the first offset.  For non-temporary relations the
 * change is WAL-logged as XLOG_GIST_NEW_ROOT ('key' is passed on to the
 * record builder); temporary relations just get the placeholder LSN.
 */
void
gistnewroot(Relation r, Buffer buffer, IndexTuple *itup, int len, ItemPointer key)
{
	Page		rootpage;
	int			oldlevel;

	Assert( BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO );

	rootpage = BufferGetPage(buffer);

	/* remember the level, since re-initializing the page wipes it */
	oldlevel = GistPageGetOpaque(rootpage)->level;
	GISTInitBuffer(buffer, 0);
	GistPageGetOpaque(rootpage)->level = oldlevel + 1;

	gistfillbuffer(r, rootpage, itup, len, FirstOffsetNumber);

	if ( r->rd_istemp )
		PageSetLSN(rootpage, XLogRecPtrForTemp);
	else
	{
		XLogRecData	   *walchain;
		XLogRecPtr		walpos;

		walchain = formUpdateRdata(r->rd_node, GIST_ROOT_BLKNO,
								   NULL, 0, false, itup, len, key);

		START_CRIT_SECTION();

		walpos = XLogInsert(RM_GIST_ID, XLOG_GIST_NEW_ROOT, walchain);
		PageSetLSN(rootpage, walpos);
		PageSetTLI(rootpage, ThisTimeLineID);

		END_CRIT_SECTION();
	}
}
/*
 * Fill in 'giststate' for the given index: remember its tuple descriptor
 * and, for every index column, copy the lookup info of the seven GiST
 * support procedures into the current memory context.
 *
 * Raises ERROR if the index has more than INDEX_MAX_KEYS columns.
 */
void
initGISTstate(GISTSTATE *giststate, Relation index)
{
	int			natts = index->rd_att->natts;
	int			attno;

	if (natts > INDEX_MAX_KEYS)
		elog(ERROR, "numberOfAttributes %d > %d",
			 natts, INDEX_MAX_KEYS);

	giststate->tupdesc = index->rd_att;

	/* support procedures are looked up by 1-based attribute number */
	for (attno = 1; attno <= natts; attno++)
	{
		int			slot = attno - 1;

		fmgr_info_copy(&(giststate->consistentFn[slot]),
					   index_getprocinfo(index, attno, GIST_CONSISTENT_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->unionFn[slot]),
					   index_getprocinfo(index, attno, GIST_UNION_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->compressFn[slot]),
					   index_getprocinfo(index, attno, GIST_COMPRESS_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->decompressFn[slot]),
					   index_getprocinfo(index, attno, GIST_DECOMPRESS_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->penaltyFn[slot]),
					   index_getprocinfo(index, attno, GIST_PENALTY_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->picksplitFn[slot]),
					   index_getprocinfo(index, attno, GIST_PICKSPLIT_PROC),
					   CurrentMemoryContext);
		fmgr_info_copy(&(giststate->equalFn[slot]),
					   index_getprocinfo(index, attno, GIST_EQUAL_PROC),
					   CurrentMemoryContext);
	}
}
/*
 * Counterpart of initGISTstate.  Currently does nothing: this function
 * releases no memory itself.
 */
void
freeGISTstate(GISTSTATE *giststate)
{
	/* no work */
}