postgresql/src/backend/access/heap/hio.c

/*-------------------------------------------------------------------------
 *
 * hio.c
 *	  POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Id: hio.c,v 1.41 2001/06/29 21:08:23 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "storage/freespace.h"


/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! ELOG(ERROR) IS DISALLOWED HERE !!!
 *
 * Note - we assume that caller hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 *
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple)
{
	Page		pageHeader;
	OffsetNumber offnum;
	Size		len;
	ItemId		itemId;
	Item		item;

	/*
	 * increment access statistics
	 */
	IncrHeapAccessStat(local_RelationPutHeapTuple);
	IncrHeapAccessStat(global_RelationPutHeapTuple);

	pageHeader = (Page) BufferGetPage(buffer);
	len = MAXALIGN(tuple->t_len);		/* be conservative */
	Assert(len <= PageGetFreeSpace(pageHeader));

	offnum = PageAddItem((Page) pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, LP_USED);

	if (offnum == InvalidOffsetNumber)
		elog(STOP, "RelationPutHeapTuple: failed to add tuple");

	itemId = PageGetItemId((Page) pageHeader, offnum);
	item = PageGetItem((Page) pageHeader, itemId);

	ItemPointerSet(&((HeapTupleHeader) item)->t_ctid,
				   BufferGetBlockNumber(buffer), offnum);

	/* return an accurate tuple */
	ItemPointerSet(&tuple->t_self, BufferGetBlockNumber(buffer), offnum);
}

/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).  In that case, the page will be pinned and locked only once.
 *
 *	Note that we use LockPage(rel, 0) to lock relation for extension.
 *	We can do this as long as in all other places we use page-level locking
 *	for indices only. Alternatively, we could define pseudo-table as
 *	we do for transactions with XactLockTable.
 *
 *	ELOG(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer)
{
	Buffer		buffer = InvalidBuffer;
	Page		pageHeader;
	Size		pageFreeSpace;
	BlockNumber targetBlock,
				otherBlock;

	len = MAXALIGN(len);		/* be conservative */

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxTupleSize)
		elog(ERROR, "Tuple is too big: size %lu, max size %ld",
			 (unsigned long) len, MaxTupleSize);

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a
	 * tuple on, as cached in the relcache entry.  If that doesn't work,
	 * we ask the shared Free Space Map to locate a suitable page.  Since
	 * the FSM's info might be out of date, we have to be prepared to loop
	 * around and retry multiple times.  (To insure this isn't an infinite
	 * loop, we must update the FSM with the correct amount of free space on
	 * each page that proves not to be suitable.)  If the FSM has no record of
	 * a page with enough free space, we give up and extend the relation.
	 */

	targetBlock = relation->rd_targblock;

	if (targetBlock == InvalidBlockNumber)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(&relation->rd_node, len);
		/*
		 * If the FSM knows nothing of the rel, try the last page before
		 * we give up and extend.  This avoids one-tuple-per-page syndrome
		 * during bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber	nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the
		 * other block if one was given, taking suitable care with
		 * lock ordering and the possibility they are the same block.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}
		/*
		 * Now we can check to see if there's enough free space here.
		 * If so, we're done.
		 */
		pageHeader = (Page) BufferGetPage(buffer);
		pageFreeSpace = PageGetFreeSpace(pageHeader);
		if (len <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			relation->rd_targblock = targetBlock;
			return buffer;
		}
		/*
		 * Not enough space, so we must give up our page locks and
		 * pin (if any) and prepare to look elsewhere.  We don't care
		 * which order we unlock the two buffers in, so this can be
		 * slightly simpler than the code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
		{
			ReleaseBuffer(buffer);
		}
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}
		/*
		 * Update FSM as to condition of this page, and ask for another
		 * page to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
													targetBlock,
													pageFreeSpace,
													len);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the
	 * rel at the same time, else we will both try to initialize the
	 * same new page.
	 */
	if (!relation->rd_myxactonly)
		LockPage(relation, 0, ExclusiveLock);

	/*
	 * XXX This does an lseek - rather expensive - but at the moment it is
	 * the only way to accurately determine how many blocks are in a
	 * relation.  Is it worth keeping an accurate file length in shared
	 * memory someplace, rather than relying on the kernel to do it for us?
	 */
	buffer = ReadBuffer(relation, P_NEW);

	/*
	 * Release the file-extension lock; it's now OK for someone else
	 * to extend the relation some more.
	 */
	if (!relation->rd_myxactonly)
		UnlockPage(relation, 0, ExclusiveLock);

	/*
	 * We can be certain that locking the otherBuffer first is OK,
	 * since it must have a lower page number.
	 */
	if (otherBuffer != InvalidBuffer)
		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * We need to initialize the empty new page.
	 */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
	pageHeader = (Page) BufferGetPage(buffer);
	Assert(PageIsNew((PageHeader) pageHeader));
	PageInit(pageHeader, BufferGetPageSize(buffer), 0);

	if (len > PageGetFreeSpace(pageHeader))
	{
		/* We should not get here given the test at the top */
		elog(STOP, "Tuple is too big: size %lu", (unsigned long) len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	relation->rd_targblock = BufferGetBlockNumber(buffer);

	return buffer;
}