/*-------------------------------------------------------------------------
 *
 * hio.c
 *	  POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/hio.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"


/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple)
{
	Page		pageHeader;
	OffsetNumber offnum;
	ItemId		itemId;
	Item		item;

	/* Add the tuple to the page */
	pageHeader = BufferGetPage(buffer);
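
	/*
	 * The two booleans passed to PageAddItem are "overwrite" (false: do not
	 * replace an existing line pointer) and "is_heap" (true: enforce the
	 * MaxHeapTuplesPerPage limit); see its declaration in storage/bufpage.h.
	 */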
	offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, false, true);

	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "failed to add tuple to page");

	/* Update tuple->t_self to the actual position where it was stored */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

	/* Insert the correct position into CTID of the stored tuple, too */
	itemId = PageGetItemId(pageHeader, offnum);
	item = PageGetItem(pageHeader, itemId);
	((HeapTupleHeader) item)->t_ctid = tuple->t_self;
}

/*
 * Read in a buffer, using bulk-insert strategy if bistate isn't NULL.
 */
static Buffer
ReadBufferBI(Relation relation, BlockNumber targetBlock,
			 BulkInsertState bistate)
{
	Buffer		buffer;

	/* If not bulk-insert, exactly like ReadBuffer */
	if (!bistate)
		return ReadBuffer(relation, targetBlock);

	/* If we have the desired block already pinned, re-pin and return it */
	if (bistate->current_buf != InvalidBuffer)
	{
		if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
		{
			IncrBufferRefCount(bistate->current_buf);
			return bistate->current_buf;
		}
		/* ... else drop the old buffer */
		ReleaseBuffer(bistate->current_buf);
		bistate->current_buf = InvalidBuffer;
	}

	/* Perform a read using the buffer strategy */
	buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
								RBM_NORMAL, bistate->strategy);

	/* Save the selected block as target for future inserts */
	IncrBufferRefCount(buffer);
	bistate->current_buf = buffer;
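
	/*
	 * The buffer is now doubly pinned: one pin belongs to the caller, and
	 * one is retained via bistate->current_buf until the cached target
	 * changes (at which point the ReleaseBuffer call above drops it).
	 */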

	return buffer;
}

/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).  In that case, the page will be pinned and locked only once.
 *
 *	We normally use FSM to help us find free space.  However,
 *	if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to
 *	the end of the relation if the tuple won't fit on the current target page.
 *	This can save some cycles when we know the relation is new and doesn't
 *	contain useful amounts of free space.
 *
 *	HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a
 *	relation, if the caller holds exclusive lock and is careful to invalidate
 *	relation's smgr_targblock before the first insertion --- that ensures that
 *	all insertions will occur into newly added pages and not be intermixed
 *	with tuples from other transactions.  That way, a crash can't risk losing
 *	any committed data of other transactions.  (See heap_insert's comments
 *	for additional constraints needed for safe usage of this behavior.)
 *
 *	The caller can also provide a BulkInsertState object to optimize many
 *	insertions into the same relation.  This keeps a pin on the current
 *	insertion target page (to save pin/unpin cycles) and also passes a
 *	BULKWRITE buffer selection strategy object to the buffer manager.
 *	Passing NULL for bistate selects the default behavior.
 *
 *	We always try to avoid filling existing pages further than the fillfactor.
 *	This is OK since this routine is not consulted when updating a tuple and
 *	keeping it on the same page, which is the scenario fillfactor is meant
 *	to reserve space for.
 *
 *	ereport(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
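/*
 * A minimal sketch of the expected calling pattern, loosely following
 * heap_insert (WAL logging and error handling omitted; not a verbatim
 * excerpt from the caller):
 *
 *		vmbuffer = InvalidBuffer;
 *		buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 *										   InvalidBuffer, options,
 *										   bistate, &vmbuffer);
 *		RelationPutHeapTuple(relation, buffer, heaptup);
 *		... if the page was all-visible, clear the flag and the VM bit ...
 *		UnlockReleaseBuffer(buffer);
 *		if (vmbuffer != InvalidBuffer)
 *			ReleaseBuffer(vmbuffer);
 */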
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer, int options,
						  struct BulkInsertStateData * bistate,
						  Buffer *vmbuffer)
{
	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
	Buffer		buffer = InvalidBuffer;
	Page		page;
	Size		pageFreeSpace,
				saveFreeSpace;
	BlockNumber targetBlock,
				otherBlock;
	bool		needLock;
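
	/*
	 * Heap tuples are stored on pages in MAXALIGN quanta, so rounding len up
	 * front keeps every free-space comparison below on the safe side.
	 */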
	len = MAXALIGN(len);		/* be conservative */

	/* Bulk insert is not supported for updates, only inserts. */
	Assert(otherBuffer == InvalidBuffer || !bistate);

	/*
	 * If we're going to fail for an oversize tuple, do it right away.
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %lu, maximum size %lu",
						(unsigned long) len,
						(unsigned long) MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the BulkInsertState or relcache entry.  If that
	 * doesn't work, we ask the Free Space Map to locate a suitable page.
	 * Since the FSM's info might be out of date, we have to be prepared to
	 * loop around and retry multiple times.  (To ensure this isn't an
	 * infinite loop, we must update the FSM with the correct amount of free
	 * space on each page that proves not to be suitable.)  If the FSM has no
	 * record of a page with enough free space, we give up and extend the
	 * relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (len + saveFreeSpace > MaxHeapTupleSize)
	{
		/* can't fit, don't bother asking FSM */
		targetBlock = InvalidBlockNumber;
		use_fsm = false;
	}
	else if (bistate && bistate->current_buf != InvalidBuffer)
		targetBlock = BufferGetBlockNumber(bistate->current_buf);
	else
		targetBlock = RelationGetTargetBlock(relation);
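
	/*
	 * The cached target block (smgr_targblock, mentioned above) is only a
	 * hint and may be stale; that's harmless, because the chosen page's free
	 * space is rechecked under lock in the loop below.
	 */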

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);

		/*
		 * If the FSM knows nothing of the rel, try the last page before we
		 * give up and extend.  This avoids one-tuple-per-page syndrome during
		 * bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 *
		 * If the page-level all-visible flag is set, caller will need to
		 * clear both that and the corresponding visibility map bit.  However,
		 * by the time we return, we'll have x-locked the buffer, and we don't
		 * want to do any I/O while in that state.  So we check the bit here
		 * before taking the lock, and pin the page if it appears necessary.
		 * Checking without the lock creates a risk of getting the wrong
		 * answer, so we'll have to recheck after acquiring the lock.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBufferBI(relation, targetBlock, bistate);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}
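
		/*
		 * At this point we hold exclusive lock on the target buffer (and on
		 * otherBuffer, if given), acquired in increasing block-number order;
		 * and if the all-visible flag appeared set above, *vmbuffer now pins
		 * the matching visibility map page.
		 */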

		/*
		 * If the page is all visible but we don't have the right visibility
		 * map page pinned, then give up our locks, go get the pin, and
		 * re-lock.  This is pretty painful, but hopefully shouldn't happen
		 * often.  Note that there's a small possibility that we didn't pin
		 * the page above but still have the correct page pinned anyway,
		 * either because we've already made a previous pass through this
		 * loop, or because caller passed us the right page anyway.
		 *
		 * Note also that it's possible that by the time we get the pin and
		 * retake the buffer locks, the visibility map bit will have been
		 * cleared by some other backend anyway.  In that case, we'll have
		 * done a bit of extra work for no gain, but there's no real harm
		 * done.
		 */
		if (PageIsAllVisible(BufferGetPage(buffer))
			&& !visibilitymap_pin_ok(targetBlock, *vmbuffer))
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			if (otherBlock != targetBlock)
				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			visibilitymap_pin(relation, targetBlock, vmbuffer);
			if (otherBuffer != InvalidBuffer && otherBlock < targetBlock)
				LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			if (otherBuffer != InvalidBuffer && otherBlock > targetBlock)
				LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * Now we can check to see if there's enough free space here.  If so,
		 * we're done.
		 */
		page = BufferGetPage(buffer);
		pageFreeSpace = PageGetHeapFreeSpace(page);
		if (len + saveFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			RelationSetTargetBlock(relation, targetBlock);
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Without FSM, always fall out of the loop and extend */
		if (!use_fsm)
			break;

		/*
		 * Update FSM as to condition of this page, and ask for another page
		 * to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(relation,
													targetBlock,
													pageFreeSpace,
													len + saveFreeSpace);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the rel at the
	 * same time, else we will both try to initialize the same new page.  We
	 * can skip locking for new or temp relations, however, since no one else
	 * could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(relation);

	if (needLock)
		LockRelationForExtension(relation, ExclusiveLock);

	/*
	 * XXX This does an lseek - rather expensive - but at the moment it is the
	 * only way to accurately determine how many blocks are in a relation.  Is
	 * it worth keeping an accurate file length in shared memory someplace,
	 * rather than relying on the kernel to do it for us?
	 */
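	/*
	 * P_NEW asks the buffer manager to extend the relation by one block;
	 * the buffer comes back pinned but not yet locked.
	 */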
	buffer = ReadBufferBI(relation, P_NEW, bistate);

	/*
	 * We can be certain that locking the otherBuffer first is OK, since it
	 * must have a lower page number.
	 */
	if (otherBuffer != InvalidBuffer)
		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Now acquire lock on the new page.
	 */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.  Note that we cannot release this lock before
	 * we have buffer lock on the new page, or we risk a race condition
	 * against vacuumlazy.c --- see comments therein.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	/*
	 * We need to initialize the empty new page.  Double-check that it really
	 * is empty (this should never happen, but if it does we don't want to
	 * risk wiping out valid data).
	 */
	page = BufferGetPage(buffer);

	if (!PageIsNew(page))
		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
			 BufferGetBlockNumber(buffer),
			 RelationGetRelationName(relation));

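	/*
	 * Heap pages reserve no "special space", hence the zero passed as
	 * PageInit's third argument.
	 */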
	PageInit(page, BufferGetPageSize(buffer), 0);

	if (len > PageGetHeapFreeSpace(page))
	{
		/* We should not get here given the test at the top */
		elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));

	return buffer;
}