postgresql/src/backend/access/nbtree/nbtree.c

/*-------------------------------------------------------------------------
 *
 * nbtree.c--
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.17 1997/03/24 08:48:11 vadim Exp $
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include <postgres.h>

#include <access/genam.h>
#include <storage/bufpage.h>
#include <storage/bufmgr.h>
#include <access/nbtree.h>
#include <executor/executor.h>
#include <access/heapam.h>
#include <catalog/index.h>
#include <miscadmin.h>

#ifndef HAVE_MEMMOVE
# include <regex/utils.h>
#else
# include <string.h>
#endif

/* true only while btbuild() is running; see comment in btbuild() */
bool BuildingBtree = false;

/* when true, use sort/build instead of insertion build */
bool FastBuild = true;

/*
 *  btbuild() -- build a new btree index.
 *
 *	We use a global variable to record the fact that we're creating
 *	a new index.  This is used to avoid high-concurrency locking,
 *	since the index won't be visible until this transaction commits
 *	and since building is guaranteed to be single-threaded.
 *
 *	heap	 - relation whose tuples are scanned and indexed
 *	index	 - btree relation being built (or extended)
 *	natts	 - number of index attributes extracted per heap tuple
 *	attnum	 - handed to GetIndexValue() for every attribute
 *	istrat, pcount, params - not referenced by this routine
 *	finfo	 - handed to GetIndexValue() for every attribute
 *	predInfo - supplies the partial-index predicate (pred) and, when
 *		   an existing partial index is being extended, its
 *		   previous predicate (oldPred)
 *
 *	BuildingBtree is true for the duration of the call.  In normal
 *	processing mode both relations are closed before we return and
 *	the tuple counts are recorded through UpdateStats().
 */
void
btbuild(Relation heap,
	Relation index,
	int natts,
	AttrNumber *attnum,
	IndexStrategy istrat,
	uint16 pcount,
	Datum *params,
	FuncIndexInfo *finfo,
	PredInfo *predInfo)
{
    HeapScanDesc hscan;
    Buffer buffer;
    HeapTuple htup;
    IndexTuple itup;
    TupleDesc htupdesc, itupdesc;
    Datum *attdata;
    char *nulls;		/* 'n' / ' ' flags for index_formtuple() */
    InsertIndexResult res = 0;
    int nhtups, nitups;
    int i;
    BTItem btitem;
#ifndef OMIT_PARTIAL_INDEX
    ExprContext *econtext = (ExprContext *) NULL;
    TupleTable tupleTable = (TupleTable) NULL;
    TupleTableSlot *slot = (TupleTableSlot *) NULL;
#endif
    Oid hrelid, irelid;
    Node *pred, *oldPred;
    void *spool = (void *) NULL;
    bool isunique;
    bool usefast;

#if 0
    ResetBufferUsage();
#endif

    /* note that this is a new btree */
    BuildingBtree = true;
    
    pred = predInfo->pred;
    oldPred = predInfo->oldPred;

    /*
     * bootstrap processing does something strange, so don't use
     * sort/build for initial catalog indices.  at some point i need
     * to look harder at this.  (there is some kind of incremental
     * processing going on there.) -- pma 08/29/95
     */
    usefast = (FastBuild && IsNormalProcessingMode());

    /* see if index is unique */
    isunique = IndexIsUniqueNoCache(RelationGetRelationId(index));

    /* initialize the btree index metadata page (if this is a new index) */
    if (oldPred == NULL)
	_bt_metapinit(index);
    
    /* get tuple descriptors for heap and index relations */
    htupdesc = RelationGetTupleDescriptor(heap);
    itupdesc = RelationGetTupleDescriptor(index);
    
    /*
     * get space for data items that'll appear in the index tuple.
     * The null flags are characters ('n' = null, ' ' = not null), so
     * the array is a char array, the same type btinsert() passes to
     * index_formtuple().
     */
    attdata = (Datum *) palloc(natts * sizeof(Datum));
    nulls = (char *) palloc(natts * sizeof(char));
    
    /*
     * If this is a predicate (partial) index, we will need to evaluate the
     * predicate using ExecQual, which requires the current tuple to be in a
     * slot of a TupleTable.  In addition, ExecQual must have an ExprContext
     * referring to that slot.  Here, we initialize dummy TupleTable and
     * ExprContext objects for this purpose. --Nels, Feb '92
     */
#ifndef OMIT_PARTIAL_INDEX
    if (pred != NULL || oldPred != NULL) {
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	econtext = makeNode(ExprContext);
	FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);

	/*
	 * we never want to use sort/build if we are extending an
	 * existing partial index -- it works by inserting the
	 * newly-qualifying tuples into the existing index.
	 * (sort/build would overwrite the existing index with one
	 * consisting of the newly-qualifying tuples.)
	 */
	usefast = false;
    }
#endif /* OMIT_PARTIAL_INDEX */
    
    /* start a heap scan */
    hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
    htup = heap_getnext(hscan, 0, &buffer);
    
    /* build the index */
    nhtups = nitups = 0;
    
    if (usefast) {
	/*
	 * NOTE(review): the literal 7 is the second argument of
	 * _bt_spoolinit(); presumably the number of spool tapes --
	 * confirm against the spool implementation.
	 */
	spool = _bt_spoolinit(index, 7, isunique);
	res = (InsertIndexResult) NULL;
    }

    for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
	
	nhtups++;
	
	/*
	 * If oldPred != NULL, this is an EXTEND INDEX command, so skip
	 * this tuple if it was already in the existing partial index
	 * (it still counts toward nitups).
	 */
	if (oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX

	    /*SetSlotContents(slot, htup);*/
	    slot->val = htup;
	    if (ExecQual((List*)oldPred, econtext) == true) {
		nitups++;
		continue;
	    }
#endif /* OMIT_PARTIAL_INDEX */    	
	}
	
	/* Skip this tuple if it doesn't satisfy the partial-index predicate */
	if (pred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	    /* SetSlotContents(slot, htup); */
	    slot->val = htup;
	    if (ExecQual((List*)pred, econtext) == false)
		continue;
#endif /* OMIT_PARTIAL_INDEX */    	
	}
	
	nitups++;
	
	/*
	 *  For the current heap tuple, extract all the attributes
	 *  we use in this index, and note which are null.
	 */
	
	for (i = 1; i <= natts; i++) {
	    int  attoff;
	    bool attnull;
	    
	    /*
	     *  Offsets are from the start of the tuple, and are
	     *  zero-based; indices are one-based.  The next call
	     *  returns i - 1.  That's data hiding for you.
	     */
	    
	    attoff = AttrNumberGetAttrOffset(i);
	    attdata[attoff] = GetIndexValue(htup, 
					    htupdesc,
					    attoff, 
					    attnum, 
					    finfo, 
					    &attnull,
					    buffer);
	    nulls[attoff] = (attnull ? 'n' : ' ');
	}
	
	/* form an index tuple and point it at the heap tuple */
	itup = index_formtuple(itupdesc, attdata, nulls);
	
	/*
	 *  If the single index key is null, we don't insert it into
	 *  the index.  Btrees support scans on <, <=, =, >=, and >.
	 *  Relational algebra says that A op B (where op is one of the
	 *  operators above) returns null if either A or B is null.  This
	 *  means that no qualification used in an index scan could ever
	 *  return true on a null attribute.  It also means that indices
	 *  can't be used by ISNULL or NOTNULL scans, but that's an
	 *  artifact of the strategy map architecture chosen in 1986, not
	 *  of the way nulls are handled here.
	 */
	/*
	 * New comments: NULLs handling.
	 * While we can't do NULL comparison, we can follow simple
	 * rule for ordering items on btree pages - NULLs greater
	 * NOT_NULLs and NULL = NULL is TRUE. Sure, it's just rule
	 * for placing/finding items and no more - keytest'll return
	 * FALSE for a = 5 for items having 'a' isNULL.	
	 * Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for
	 * how it works.		- vadim 03/23/97
	 
	if (itup->t_info & INDEX_NULL_MASK) {
	    pfree(itup);
	    continue;
	}
	 */
	
	itup->t_tid = htup->t_ctid;
	btitem = _bt_formitem(itup);

	/*
	 * if we are doing bottom-up btree build, we insert the index
	 * into a spool page for subsequent processing.  otherwise, we
	 * insert into the btree.
	 */
	if (usefast) {
	    _bt_spool(index, btitem, spool);
	} else {
	    res = _bt_doinsert(index, btitem, isunique, heap);
	}

	pfree(btitem);
	pfree(itup);
	/* res stays NULL on the sort/build path, so only free a real result */
	if (res) {
	    pfree(res);
	}
    }
    
    /* okay, all heap tuples are indexed */
    heap_endscan(hscan);
    
    if (pred != NULL || oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	ExecDestroyTupleTable(tupleTable, true);
	pfree(econtext);
#endif /* OMIT_PARTIAL_INDEX */    	
    }
    
    /*
     * if we are doing bottom-up btree build, we now have a bunch of
     * sorted runs in the spool pages.  finish the build by (1)
     * merging the runs, (2) inserting the sorted tuples into btree
     * pages and (3) building the upper levels.
     */
    if (usefast) {
	_bt_spool(index, (BTItem) NULL, spool);	/* flush the spool */
	_bt_leafbuild(index, spool);
	_bt_spooldestroy(spool);
    }

#if 0
    {
	extern int ReadBufferCount, BufferHitCount, BufferFlushCount;
	extern long NDirectFileRead, NDirectFileWrite;

	printf("buffer(%d): r=%d w=%d\n", heap->rd_rel->relblocksz,
	       ReadBufferCount - BufferHitCount, BufferFlushCount);
	printf("direct(%d): r=%d w=%d\n", LocalBlockSize,
	       NDirectFileRead, NDirectFileWrite);
    }
#endif

    /*
     *  Since we just counted the tuples in the heap, we update its
     *  stats in pg_class to guarantee that the planner takes advantage
     *  of the index we just created. Finally, only update statistics
     *  during normal index definitions, not for indices on system catalogs
     *  created during bootstrap processing.  We must close the relations
     *  before updating statistics to guarantee that the relcache entries
     *  are flushed when we increment the command counter in UpdateStats().
     */
    if (IsNormalProcessingMode())
	{
	    /* remember the relation ids; the relations are closed next */
	    hrelid = heap->rd_id;
	    irelid = index->rd_id;
	    heap_close(heap);
	    index_close(index);
	    UpdateStats(hrelid, nhtups, true);
	    UpdateStats(irelid, nitups, false);
	    if (oldPred != NULL) {
		/* every heap tuple is indexed: the predicate is dropped */
		if (nitups == nhtups) pred = NULL;
		UpdateIndexPredicate(irelid, oldPred, pred);
	    }  
	}
    
    pfree(nulls);
    pfree(attdata);
    
    /* all done */
    BuildingBtree = false;
}

/*
 *  btinsert() -- insert an index tuple into a btree.
 *
 *	Builds an index tuple from datum/nulls using the index's tuple
 *	descriptor, points it at the heap tuple identified by ht_ctid,
 *	and hands it to _bt_doinsert().  The InsertIndexResult produced
 *	by _bt_doinsert() is returned to the caller after active scans
 *	have been adjusted for the insertion.
 */
InsertIndexResult
btinsert(Relation rel, Datum *datum, char *nulls, ItemPointer ht_ctid, Relation heapRel)
{
    InsertIndexResult result;
    IndexTuple newtup;
    BTItem newitem;

    /* build the index tuple and aim it at the heap tuple */
    newtup = index_formtuple(RelationGetTupleDescriptor(rel), datum, nulls);
    newtup->t_tid = *ht_ctid;

    /*
     * Tuples with NULL keys are no longer rejected here (the old
     * INDEX_NULL_MASK test is disabled); see the NULLs-handling
     * comments in btbuild.
     */

    /* wrap the tuple in a btree item and insert it */
    newitem = _bt_formitem(newtup);
    result = _bt_doinsert(rel,
			  newitem,
			  IndexIsUnique(RelationGetRelationId(rel)),
			  heapRel);

    /* the working copies are not needed once the insert is done */
    pfree(newitem);
    pfree(newtup);

    /* adjust any active scans that will be affected by this insertion */
    _bt_adjscans(rel, &(result->pointerData), BT_INSERT);

    return (result);
}

/*
 *  btgettuple() -- Get the next tuple in the scan.
 *
 *	A valid current-item pointer means the scan has already been
 *	positioned, so it is simply advanced with _bt_next(); otherwise
 *	_bt_first() is called to fetch the first item.  The result is
 *	returned cast to char *.
 */
char *
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
    RetrieveIndexResult item;

    item = ItemPointerIsValid(&(scan->currentItemData))
	? _bt_next(scan, dir)
	: _bt_first(scan, dir);

    return ((char *) item);
}

/*
 *  btbeginscan() -- start a scan on a btree index
 *
 *	Obtains a scan descriptor from RelationGetIndexScan(), registers
 *	it with _bt_regscan() (in case we change pages it's using), and
 *	returns the descriptor cast to char *.
 */
char *
btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
{
    IndexScanDesc newscan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);

    _bt_regscan(newscan);

    return ((char *) newscan);
}

/*
 *  btrescan() -- rescan an index relation
 *
 *	On the first call for a scan (scan->opaque is NULL) the private
 *	BTScanOpaque state is allocated.  On later calls the read locks
 *	held on the current and marked pages are released and both
 *	positions are invalidated.  The scan keys are then reloaded from
 *	scankey, ordered by _bt_orderkeys(), and scan->scanFromEnd is
 *	recomputed.
 *
 *	scan	- scan descriptor to (re)initialize
 *	fromEnd	- not consulted here; scan->scanFromEnd is derived from
 *		  the strategy of the first ordered key instead
 *	scankey	- array of scan->numberOfKeys keys to install
 */
void
btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
{
    ItemPointer iptr;
    BTScanOpaque so;
    StrategyNumber strat;
    
    so = (BTScanOpaque) scan->opaque;
    
    if ( so == NULL )
    {
	/*
	 * First call for this scan: there is no private state yet, so
	 * there are no buffers to release.  Test for this before touching
	 * so->..., since the release code below dereferences so.
	 */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
	so->keyData = (ScanKey) NULL;
	if ( scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc (scan->numberOfKeys * sizeof(ScanKeyData));
	scan->opaque = so;
	scan->flags = 0x0;
    }
    else
    {
	/* we hold a read lock on the current page in the scan */
	if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
	    _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
	    so->btso_curbuf = InvalidBuffer;
	    ItemPointerSetInvalid(iptr);
	}
    
	/* and we hold a read lock on the last marked item in the scan */
	if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
	    _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
	    so->btso_mrkbuf = InvalidBuffer;
	    ItemPointerSetInvalid(iptr);
	}
    }
    
    /* reset the scan key */
    so->numberOfKeys = scan->numberOfKeys;
    so->numberOfFirstKeys = 0;		/* may be changed by _bt_orderkeys */
    so->qual_ok = 1;			/* may be changed by _bt_orderkeys */
    if (scan->numberOfKeys > 0) {
	/* memmove: scankey may be the scan's own key array */
	memmove(scan->keyData,
		scankey,
		scan->numberOfKeys * sizeof(ScanKeyData));
	memmove(so->keyData,
		scankey,
		so->numberOfKeys * sizeof(ScanKeyData));
	/* order the keys in the qualification */
	_bt_orderkeys(scan->relation, so);
    }
    
    /* finally, be sure that the scan exploits the tree order */
    scan->scanFromEnd = false;
    if ( so->numberOfKeys > 0 ) {
	/* attribute number 1 is hard-wired here (XXX) */
	strat = _bt_getstrat(scan->relation, 1 /* XXX */,
			     so->keyData[0].sk_procedure);
	
	if (strat == BTLessStrategyNumber
	    || strat == BTLessEqualStrategyNumber)
	    scan->scanFromEnd = true;
    } else {
	/* no keys left after ordering: scan from the end */
	scan->scanFromEnd = true;
    }

}

/*
 *  btmovescan() -- install a new argument for the first scan key
 *
 *	Releases the read lock on the scan's current page (if the scan
 *	has a valid current position), invalidates that position, and
 *	stores v as the argument of the first key in the scan's private
 *	key array.
 */
void
btmovescan(IndexScanDesc scan, Datum v)
{
    BTScanOpaque opaque = (BTScanOpaque) scan->opaque;
    ItemPointer curpos = &(scan->currentItemData);

    /* release any locks we still hold */
    if (ItemPointerIsValid(curpos)) {
	_bt_relbuf(scan->relation, opaque->btso_curbuf, BT_READ);
	opaque->btso_curbuf = InvalidBuffer;
	ItemPointerSetInvalid(curpos);
    }

    /* only the private copy is updated; scan->keyData[0] is left alone */
    opaque->keyData[0].sk_argument = v;
}

/*
 *  btendscan() -- close down a scan
 *
 *	Releases the read locks still held on the current and marked
 *	pages (when the corresponding buffer is valid), invalidates both
 *	positions, frees the private key array and scan state, and
 *	finally drops the scan with _bt_dropscan().
 */
void
btendscan(IndexScanDesc scan)
{
    BTScanOpaque opaque = (BTScanOpaque) scan->opaque;
    ItemPointer curpos = &(scan->currentItemData);
    ItemPointer markpos = &(scan->currentMarkData);

    /* give up the current position */
    if (ItemPointerIsValid(curpos)) {
	if (BufferIsValid(opaque->btso_curbuf))
	    _bt_relbuf(scan->relation, opaque->btso_curbuf, BT_READ);
	opaque->btso_curbuf = InvalidBuffer;
	ItemPointerSetInvalid(curpos);
    }

    /* give up the marked position */
    if (ItemPointerIsValid(markpos)) {
	if (BufferIsValid(opaque->btso_mrkbuf))
	    _bt_relbuf(scan->relation, opaque->btso_mrkbuf, BT_READ);
	opaque->btso_mrkbuf = InvalidBuffer;
	ItemPointerSetInvalid(markpos);
    }

    /* free the private scan state */
    if (opaque->keyData != (ScanKey) NULL)
	pfree(opaque->keyData);
    pfree(opaque);

    _bt_dropscan(scan);
}

/*
 *  btmarkpos() -- save current scan position
 *
 *	Any previously marked position is released first.  If the scan
 *	has a valid current position, the current page is fetched again
 *	with a read lock for the mark and the current item pointer is
 *	copied into the mark.
 */
void
btmarkpos(IndexScanDesc scan)
{
    BTScanOpaque opaque = (BTScanOpaque) scan->opaque;
    ItemPointer markpos = &(scan->currentMarkData);

    /* release lock on old marked data, if any */
    if (ItemPointerIsValid(markpos)) {
	_bt_relbuf(scan->relation, opaque->btso_mrkbuf, BT_READ);
	opaque->btso_mrkbuf = InvalidBuffer;
	ItemPointerSetInvalid(markpos);
    }

    /* nothing to mark unless the scan is positioned */
    if (ItemPointerIsValid(&(scan->currentItemData))) {
	/* bump lock on the current page and copy the position */
	opaque->btso_mrkbuf = _bt_getbuf(scan->relation,
					 BufferGetBlockNumber(opaque->btso_curbuf),
					 BT_READ);
	scan->currentMarkData = scan->currentItemData;
    }
}

/*
 *  btrestrpos() -- restore scan to last saved position
 *
 *	The current position is released first.  If a marked position
 *	exists, the marked page is fetched again with a read lock for
 *	the current position and the mark is copied into the current
 *	item pointer.
 */
void
btrestrpos(IndexScanDesc scan)
{
    BTScanOpaque opaque = (BTScanOpaque) scan->opaque;
    ItemPointer curpos = &(scan->currentItemData);

    /* release lock on current data, if any */
    if (ItemPointerIsValid(curpos)) {
	_bt_relbuf(scan->relation, opaque->btso_curbuf, BT_READ);
	opaque->btso_curbuf = InvalidBuffer;
	ItemPointerSetInvalid(curpos);
    }

    /* nothing to restore unless a position was marked */
    if (ItemPointerIsValid(&(scan->currentMarkData))) {
	/* bump lock on the marked page and copy the position */
	opaque->btso_curbuf = _bt_getbuf(scan->relation,
					 BufferGetBlockNumber(opaque->btso_mrkbuf),
					 BT_READ);
	scan->currentItemData = scan->currentMarkData;
    }
}

/*
 *  btdelete() -- delete the item that tid points at from a btree.
 *
 *	Active scans on rel are adjusted first (_bt_adjscans with
 *	BT_DELETE); only then is the item removed through _bt_pagedel().
 */
void
btdelete(Relation rel, ItemPointer tid)
{
    /* adjust any active scans that will be affected by this deletion */
    _bt_adjscans(rel, tid, BT_DELETE);
    
    /* delete the data from the page */
    _bt_pagedel(rel, tid);
}