Repair still another bug in the btree page split WAL reduction patch:

it failed for splits of non-leaf pages because in such pages the first
data key on a page is suppressed, and so we can't just copy the first
key from the right page to reconstitute the left page's high key.
Problem found by Koichi Suzuki, patch by Heikki.
This commit is contained in:
Tom Lane 2007-11-16 19:53:50 +00:00
parent bb4a78c070
commit 93190c3098
3 changed files with 69 additions and 31 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.161 2007/11/15 21:14:32 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.162 2007/11/16 19:53:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -371,13 +371,13 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* removing any LP_DEAD tuples.
*
* On entry, *buf and *offsetptr point to the first legal position
* where the new tuple could be inserted. The caller should hold an
* exclusive lock on *buf. *offsetptr can also be set to
* InvalidOffsetNumber, in which case the function will search the right
* location within the page if needed. On exit, they point to the chosen
* insert location. If findinsertloc decided to move right, the lock and
* pin on the original page will be released and the new page returned to
* the caller is exclusively locked instead.
* where the new tuple could be inserted. The caller should hold an
* exclusive lock on *buf. *offsetptr can also be set to
* InvalidOffsetNumber, in which case the function will search for the
* right location within the page if needed. On exit, they point to the
* chosen insert location. If _bt_findinsertloc decides to move right,
* the lock and pin on the original page will be released and the new
* page returned to the caller is exclusively locked instead.
*
* newtup is the new tuple we're inserting, and scankey is an insertion
* type scan key for it.
@ -422,8 +422,6 @@ _bt_findinsertloc(Relation rel,
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));
/*----------
* If we will need to split the page to put the item on this page,
* check whether we can put the tuple somewhere to the right,
@ -1004,7 +1002,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
XLogRecData rdata[6];
XLogRecData rdata[7];
XLogRecData *lastrdata;
xlrec.node = rel->rd_node;
@ -1020,15 +1018,32 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata = &rdata[0];
/* Log downlink on non-leaf pages. */
if (ropaque->btpo.level > 0)
{
/* Log downlink on non-leaf pages */
lastrdata->next = lastrdata + 1;
lastrdata++;
lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
lastrdata->len = sizeof(BlockIdData);
lastrdata->buffer = InvalidBuffer;
/*
* We must also log the left page's high key, because the right
* page's leftmost key is suppressed on non-leaf levels. Show it
* as belonging to the left page buffer, so that it is not stored
* if XLogInsert decides it needs a full-page image of the left
* page.
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
itemid = PageGetItemId(origpage, P_HIKEY);
item = (IndexTuple) PageGetItem(origpage, itemid);
lastrdata->data = (char *) item;
lastrdata->len = MAXALIGN(IndexTupleSize(item));
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
/*
@ -1057,7 +1072,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
else
else if (ropaque->btpo.level == 0)
{
/*
* Although we don't need to WAL-log the new item, we still need

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.48 2007/11/15 22:25:15 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.49 2007/11/16 19:53:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -273,6 +273,8 @@ btree_xlog_split(bool onleft, bool isroot,
OffsetNumber newitemoff = 0;
Item newitem = NULL;
Size newitemsz = 0;
Item left_hikey = NULL;
Size left_hikeysz = 0;
reln = XLogOpenRelation(xlrec->node);
@ -289,6 +291,17 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= sizeof(BlockIdData);
forget_matching_split(xlrec->node, downlink, false);
/* Extract left hikey and its size (still assuming 16-bit alignment) */
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
/* We assume 16-bit alignment is enough for IndexTupleSize */
left_hikey = (Item) datapos;
left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
datapos += left_hikeysz;
datalen -= left_hikeysz;
}
}
/* Extract newitem and newitemoff, if present */
@ -302,17 +315,13 @@ btree_xlog_split(bool onleft, bool isroot,
if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
{
IndexTupleData itupdata;
/*
* We need to copy the tuple header to apply IndexTupleDSize, because
* of alignment considerations. However, we assume that PageAddItem
* doesn't care about the alignment of the newitem pointer it's given.
* We assume that 16-bit alignment is enough to apply IndexTupleSize
* (since it's fetching from a uint16 field) and also enough for
* PageAddItem to insert the tuple.
*/
newitem = datapos;
memcpy(&itupdata, datapos, sizeof(IndexTupleData));
newitemsz = IndexTupleDSize(itupdata);
newitemsz = MAXALIGN(newitemsz);
newitem = (Item) datapos;
newitemsz = MAXALIGN(IndexTupleSize(newitem));
datapos += newitemsz;
datalen -= newitemsz;
}
@ -333,6 +342,18 @@ btree_xlog_split(bool onleft, bool isroot,
_bt_restore_page(rpage, datapos, datalen);
/*
* On leaf level, the high key of the left page is equal to the
* first key on the right page.
*/
if (xlrec->level == 0)
{
ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
left_hikey = PageGetItem(rpage, hiItemId);
left_hikeysz = ItemIdGetLength(hiItemId);
}
PageSetLSN(rpage, lsn);
PageSetTLI(rpage, ThisTimeLineID);
MarkBufferDirty(rbuf);
@ -360,8 +381,6 @@ btree_xlog_split(bool onleft, bool isroot,
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
ItemId hiItemId;
Item hiItem;
/*
* Remove the items from the left page that were copied to the
@ -394,11 +413,8 @@ btree_xlog_split(bool onleft, bool isroot,
elog(PANIC, "failed to add new item to left page after split");
}
/* Set high key equal to the first key on the right page */
hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
hiItem = PageGetItem(rpage, hiItemId);
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
/* Set high key */
if (PageAddItem(lpage, left_hikey, left_hikeysz,
P_HIKEY, false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add high key to left page after split");

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.114 2007/11/15 21:14:42 momjian Exp $
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.115 2007/11/16 19:53:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -289,8 +289,15 @@ typedef struct xl_btree_split
* than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
* aligned.)
*
* If level > 0, an IndexTuple representing the HIKEY of the left page
* follows. We don't need this on leaf pages, because it's the same
* as the leftmost key in the new right page. Also, it's suppressed if
* XLogInsert chooses to store the left page's whole page image.
*
* In the _L variants, next are OffsetNumber newitemoff and the new item.
* (In the _R variants, the new item is one of the right page's tuples.)
* The new item, but not newitemoff, is suppressed if XLogInsert chooses
* to store the left page's whole page image.
*
* Last are the right page's tuples in the form used by _bt_restore_page.
*/