/*-------------------------------------------------------------------------
 *
 * gistvacuum.c
 *	  vacuuming routines for the postgres GiST index access method.
 *
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistvacuum.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"

#include "access/genam.h"
#include "access/gist_private.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
|
2005-06-20 12:29:37 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
 * VACUUM cleanup: update FSM
 *
 * Scans every page of the index (except the root), records never-initialized
 * and deleted pages in the free space map, vacuums the FSM itself, and fills
 * in the statistics struct returned to VACUUM.
 */
Datum
gistvacuumcleanup(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	Relation	rel = info->index;
	BlockNumber npages,
				blkno;
	BlockNumber totFreePages;
	bool		needLock;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		PG_RETURN_POINTER(stats);

	/* Set up all-zero stats if gistbulkdelete wasn't called */
	if (stats == NULL)
	{
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		/* use heap's tuple count */
		stats->num_index_tuples = info->num_heap_tuples;
		stats->estimated_count = info->estimated_count;

		/*
		 * XXX the above is wrong if index is partial.  Would it be OK to just
		 * return NULL, or is there work we must do below?
		 */
	}

	/*
	 * Need lock unless it's local to this backend.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	/* try to find deleted pages */
	if (needLock)
		LockRelationForExtension(rel, ExclusiveLock);
	npages = RelationGetNumberOfBlocks(rel);
	if (needLock)
		UnlockRelationForExtension(rel, ExclusiveLock);

	totFreePages = 0;
	/* start past GIST_ROOT_BLKNO: the root is never recyclable */
	for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
	{
		Buffer		buffer;
		Page		page;

		/* allow query cancel and cost-based vacuum delay */
		vacuum_delay_point();

		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
									info->strategy);
		LockBuffer(buffer, GIST_SHARE);
		page = (Page) BufferGetPage(buffer);

		if (PageIsNew(page) || GistPageIsDeleted(page))
		{
			/* never-initialized or deleted page: record it as free space */
			totFreePages++;
			RecordFreeIndexPage(rel, blkno);
		}
		UnlockReleaseBuffer(buffer);
	}

	/* Finally, vacuum the FSM */
	IndexFreeSpaceMapVacuum(info->index);

	/* return statistics */
	stats->pages_free = totFreePages;
	/* take the extension lock again so num_pages is an up-to-date count */
	if (needLock)
		LockRelationForExtension(rel, ExclusiveLock);
	stats->num_pages = RelationGetNumberOfBlocks(rel);
	if (needLock)
		UnlockRelationForExtension(rel, ExclusiveLock);

	PG_RETURN_POINTER(stats);
}
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
/*
 * Work-list item for gistbulkdelete's traversal: a page to visit, plus the
 * LSN of its parent page at the time the downlink was read.  The parent LSN
 * is compared against the page's NSN to detect concurrent page splits
 * (see pushStackIfSplited).
 */
typedef struct GistBDItem
{
	GistNSN		parentlsn;		/* parent page's LSN when downlink was read */
	BlockNumber blkno;			/* block number of the page to process */
	struct GistBDItem *next;	/* next item in the singly-linked stack */
} GistBDItem;
|
|
|
|
|
2005-06-27 14:45:23 +02:00
|
|
|
static void
|
2005-09-22 22:44:36 +02:00
|
|
|
pushStackIfSplited(Page page, GistBDItem *stack)
|
|
|
|
{
|
2005-06-27 14:45:23 +02:00
|
|
|
GISTPageOpaque opaque = GistPageGetOpaque(page);
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
|
2012-12-28 17:06:15 +01:00
|
|
|
(GistFollowRight(page) || stack->parentlsn < opaque->nsn) &&
|
2005-09-22 22:44:36 +02:00
|
|
|
opaque->rightlink != InvalidBlockNumber /* sanity check */ )
|
|
|
|
{
|
2005-06-27 14:45:23 +02:00
|
|
|
/* split page detected, install right link to the stack */
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
|
|
|
|
|
2005-06-27 14:45:23 +02:00
|
|
|
ptr->blkno = opaque->rightlink;
|
|
|
|
ptr->parentlsn = stack->parentlsn;
|
|
|
|
ptr->next = stack->next;
|
|
|
|
stack->next = ptr;
|
|
|
|
}
|
2005-09-22 22:44:36 +02:00
|
|
|
}
|
2005-06-27 14:45:23 +02:00
|
|
|
|
|
|
|
|
2005-06-20 12:29:37 +02:00
|
|
|
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples and
 * check invalid tuples left after upgrade.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
|
|
|
|
Datum
|
2005-09-22 22:44:36 +02:00
|
|
|
gistbulkdelete(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2006-05-03 00:25:10 +02:00
|
|
|
IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
|
2006-05-03 00:25:10 +02:00
|
|
|
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
|
|
|
|
void *callback_state = (void *) PG_GETARG_POINTER(3);
|
|
|
|
Relation rel = info->index;
|
2005-09-22 22:44:36 +02:00
|
|
|
GistBDItem *stack,
|
|
|
|
*ptr;
|
|
|
|
|
2006-05-03 00:25:10 +02:00
|
|
|
/* first time through? */
|
|
|
|
if (stats == NULL)
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
|
2006-05-03 00:25:10 +02:00
|
|
|
/* we'll re-count the tuples each time */
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
stats->estimated_count = false;
|
|
|
|
stats->num_index_tuples = 0;
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2006-05-03 00:25:10 +02:00
|
|
|
stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
|
|
|
|
stack->blkno = GIST_ROOT_BLKNO;
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
while (stack)
|
|
|
|
{
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There's three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happend if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
Buffer buffer;
|
2005-09-22 22:44:36 +02:00
|
|
|
Page page;
|
|
|
|
OffsetNumber i,
|
|
|
|
maxoff;
|
2005-06-20 12:29:37 +02:00
|
|
|
IndexTuple idxtuple;
|
|
|
|
ItemId iid;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
Unite ReadBufferWithFork, ReadBufferWithStrategy, and ZeroOrReadBuffer
functions into one ReadBufferExtended function, that takes the strategy
and mode as argument. There's three modes, RBM_NORMAL which is the default
used by plain ReadBuffer(), RBM_ZERO, which replaces ZeroOrReadBuffer, and
a new mode RBM_ZERO_ON_ERROR, which allows callers to read corrupt pages
without throwing an error. The FSM needs the new mode to recover from
corrupt pages, which could happend if we crash after extending an FSM file,
and the new page is "torn".
Add fork number to some error messages in bufmgr.c, that still lacked it.
2008-10-31 16:05:00 +01:00
|
|
|
buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
|
|
|
|
RBM_NORMAL, info->strategy);
|
2005-09-22 22:44:36 +02:00
|
|
|
LockBuffer(buffer, GIST_SHARE);
|
2005-11-06 23:39:21 +01:00
|
|
|
gistcheckpage(rel, buffer);
|
2005-09-22 22:44:36 +02:00
|
|
|
page = (Page) BufferGetPage(buffer);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
if (GistPageIsLeaf(page))
|
|
|
|
{
|
2005-09-02 21:02:20 +02:00
|
|
|
OffsetNumber todelete[MaxOffsetNumber];
|
2005-09-22 22:44:36 +02:00
|
|
|
int ntodelete = 0;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
|
|
|
LockBuffer(buffer, GIST_UNLOCK);
|
|
|
|
LockBuffer(buffer, GIST_EXCLUSIVE);
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
page = (Page) BufferGetPage(buffer);
|
|
|
|
if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
|
|
|
|
{
|
2006-02-14 17:39:32 +01:00
|
|
|
/* only the root can become non-leaf during relock */
|
2006-04-01 01:32:07 +02:00
|
|
|
UnlockReleaseBuffer(buffer);
|
2005-06-27 14:45:23 +02:00
|
|
|
/* one more check */
|
|
|
|
continue;
|
|
|
|
}
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
/*
|
|
|
|
* check for split proceeded after look at parent, we should check
|
|
|
|
* it after relock
|
|
|
|
*/
|
2005-06-27 14:45:23 +02:00
|
|
|
pushStackIfSplited(page, stack);
|
|
|
|
|
2006-03-31 01:03:10 +02:00
|
|
|
/*
|
|
|
|
* Remove deletable tuples from page
|
|
|
|
*/
|
|
|
|
|
2005-06-27 14:45:23 +02:00
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
|
|
|
|
{
|
|
|
|
iid = PageGetItemId(page, i);
|
2005-06-20 12:29:37 +02:00
|
|
|
idxtuple = (IndexTuple) PageGetItem(page, iid);
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
if (callback(&(idxtuple->t_tid), callback_state))
|
|
|
|
{
|
2006-10-04 02:30:14 +02:00
|
|
|
todelete[ntodelete] = i - ntodelete;
|
2005-09-22 22:44:36 +02:00
|
|
|
ntodelete++;
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
stats->tuples_removed += 1;
|
2005-09-22 22:44:36 +02:00
|
|
|
}
|
|
|
|
else
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
stats->num_index_tuples += 1;
|
2005-06-20 12:29:37 +02:00
|
|
|
}
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
if (ntodelete)
|
|
|
|
{
|
2006-05-10 11:19:54 +02:00
|
|
|
START_CRIT_SECTION();
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
|
2006-10-04 02:30:14 +02:00
|
|
|
for (i = 0; i < ntodelete; i++)
|
2006-05-10 11:19:54 +02:00
|
|
|
PageIndexTupleDelete(page, todelete[i]);
|
|
|
|
GistMarkTuplesDeleted(page);
|
|
|
|
|
2010-12-13 18:34:26 +01:00
|
|
|
if (RelationNeedsWAL(rel))
|
2005-09-22 22:44:36 +02:00
|
|
|
{
|
|
|
|
XLogRecPtr recptr;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
recptr = gistXLogUpdate(rel->rd_node, buffer,
|
2006-10-04 02:30:14 +02:00
|
|
|
todelete, ntodelete,
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
NULL, 0, InvalidBuffer);
|
2005-06-27 14:45:23 +02:00
|
|
|
PageSetLSN(page, recptr);
|
|
|
|
PageSetTLI(page, ThisTimeLineID);
|
2005-09-22 22:44:36 +02:00
|
|
|
}
|
|
|
|
else
|
2010-11-16 10:02:11 +01:00
|
|
|
PageSetLSN(page, GetXLogRecPtrForTemp());
|
2006-05-10 11:19:54 +02:00
|
|
|
|
|
|
|
END_CRIT_SECTION();
|
2005-06-27 14:45:23 +02:00
|
|
|
}
|
2006-03-31 01:03:10 +02:00
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-06-27 14:45:23 +02:00
|
|
|
/* check for split proceeded after look at parent */
|
|
|
|
pushStackIfSplited(page, stack);
|
|
|
|
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
|
|
|
|
{
|
2005-06-20 12:29:37 +02:00
|
|
|
iid = PageGetItemId(page, i);
|
|
|
|
idxtuple = (IndexTuple) PageGetItem(page, iid);
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
|
|
|
|
ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
|
|
|
|
ptr->parentlsn = PageGetLSN(page);
|
2005-06-20 12:29:37 +02:00
|
|
|
ptr->next = stack->next;
|
|
|
|
stack->next = ptr;
|
|
|
|
|
2005-09-22 22:44:36 +02:00
|
|
|
if (GistTupleIsInvalid(idxtuple))
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
ereport(LOG,
|
|
|
|
(errmsg("index \"%s\" contains an inner tuple marked as invalid",
|
|
|
|
RelationGetRelationName(rel)),
|
2011-06-21 23:33:20 +02:00
|
|
|
errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
errhint("Please REINDEX it.")));
|
2005-06-20 12:29:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
UnlockReleaseBuffer(buffer);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
|
|
|
ptr = stack->next;
|
2005-09-22 22:44:36 +02:00
|
|
|
pfree(stack);
|
2005-06-20 12:29:37 +02:00
|
|
|
stack = ptr;
|
2005-06-20 17:22:38 +02:00
|
|
|
|
|
|
|
vacuum_delay_point();
|
2005-06-20 12:29:37 +02:00
|
|
|
}
|
|
|
|
|
2006-05-03 00:25:10 +02:00
|
|
|
PG_RETURN_POINTER(stats);
|
2005-06-20 12:29:37 +02:00
|
|
|
}
|