postgresql/src/include/access/spgxlog.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

260 lines
7.5 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* spgxlog.h
* xlog declarations for SP-GiST access method.
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/spgxlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef SPGXLOG_H
#define SPGXLOG_H
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
/* XLOG record types for SPGiST */
/* #define XLOG_SPGIST_CREATE_INDEX 0x00 */ /* not used anymore */
#define XLOG_SPGIST_ADD_LEAF 0x10
#define XLOG_SPGIST_MOVE_LEAFS 0x20
#define XLOG_SPGIST_ADD_NODE 0x30
#define XLOG_SPGIST_SPLIT_TUPLE 0x40
#define XLOG_SPGIST_PICKSPLIT 0x50
#define XLOG_SPGIST_VACUUM_LEAF 0x60
#define XLOG_SPGIST_VACUUM_ROOT 0x70
#define XLOG_SPGIST_VACUUM_REDIRECT 0x80
/*
* Some redo functions need an SpGistState, although only a few of its fields
* need to be valid. spgxlogState carries the required info in xlog records.
* (See fillFakeState in spgxlog.c for more comments.)
*/
typedef struct spgxlogState
{
TransactionId myXid;
bool isBuild;
} spgxlogState;
/*
* Backup Blk 0: destination page for leaf tuple
* Backup Blk 1: parent page (if any)
*/
typedef struct spgxlogAddLeaf
{
bool newPage; /* init dest page? */
bool storesNulls; /* page is in the nulls tree? */
OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */
OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */
OffsetNumber offnumParent; /* where the parent downlink is, if any */
uint16 nodeI;
/* new leaf tuple follows (unaligned!) */
} spgxlogAddLeaf;
/*
* Backup Blk 0: source leaf page
* Backup Blk 1: destination leaf page
* Backup Blk 2: parent page
*/
typedef struct spgxlogMoveLeafs
{
uint16 nMoves; /* number of tuples moved from source page */
bool newPage; /* init dest page? */
bool replaceDead; /* are we replacing a DEAD source tuple? */
bool storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is */
OffsetNumber offnumParent;
uint16 nodeI;
spgxlogState stateSrc;
/*----------
* data follows:
* array of deleted tuple numbers, length nMoves
* array of inserted tuple numbers, length nMoves + 1 or 1
* list of leaf tuples, length nMoves + 1 or 1 (unaligned!)
*
* Note: if replaceDead is true then there is only one inserted tuple
* number and only one leaf tuple in the data, because we are not copying
* the dead tuple from the source
*----------
*/
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogMoveLeafs;
#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets)
/*
* Backup Blk 0: original page
* Backup Blk 1: where new tuple goes, if not same place
* Backup Blk 2: where parent downlink is, if updated and different from
* the old and new
*/
typedef struct spgxlogAddNode
{
/*
* Offset of the original inner tuple, in the original page (on backup
* block 0).
*/
OffsetNumber offnum;
/*
* Offset of the new tuple, on the new page (on backup block 1). Invalid,
* if we overwrote the old tuple in the original page).
*/
OffsetNumber offnumNew;
bool newPage; /* init new page? */
/*----
* Where is the parent downlink? parentBlk indicates which page it's on,
* and offnumParent is the offset within the page. The possible values for
* parentBlk are:
*
* 0: parent == original page
* 1: parent == new page
* 2: parent == different page (blk ref 2)
* -1: parent not updated
*----
*/
int8 parentBlk;
OffsetNumber offnumParent; /* offset within the parent page */
uint16 nodeI;
spgxlogState stateSrc;
/*
* updated inner tuple follows (unaligned!)
*/
} spgxlogAddNode;
/*
* Backup Blk 0: where the prefix tuple goes
* Backup Blk 1: where the postfix tuple goes (if different page)
*/
typedef struct spgxlogSplitTuple
{
/* where the prefix tuple goes */
OffsetNumber offnumPrefix;
/* where the postfix tuple goes */
OffsetNumber offnumPostfix;
bool newPage; /* need to init that page? */
bool postfixBlkSame; /* was postfix tuple put on same page as
* prefix? */
/*
* new prefix inner tuple follows, then new postfix inner tuple (both are
* unaligned!)
*/
} spgxlogSplitTuple;
/*
* Buffer references in the rdata array are:
* Backup Blk 0: Src page (only if not root)
* Backup Blk 1: Dest page (if used)
* Backup Blk 2: Inner page
* Backup Blk 3: Parent page (if any, and different from Inner)
*/
typedef struct spgxlogPickSplit
{
bool isRootSplit;
uint16 nDelete; /* n to delete from Src */
uint16 nInsert; /* n to insert on Src and/or Dest */
bool initSrc; /* re-init the Src page? */
bool initDest; /* re-init the Dest page? */
/* where to put new inner tuple */
OffsetNumber offnumInner;
bool initInner; /* re-init the Inner page? */
bool storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is, if any */
bool innerIsParent; /* is parent the same as inner page? */
OffsetNumber offnumParent;
uint16 nodeI;
spgxlogState stateSrc;
/*----------
* data follows:
* array of deleted tuple numbers, length nDelete
* array of inserted tuple numbers, length nInsert
* array of page selector bytes for inserted tuples, length nInsert
* new inner tuple (unaligned!)
* list of leaf tuples, length nInsert (unaligned!)
*----------
*/
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogPickSplit;
#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets)
typedef struct spgxlogVacuumLeaf
{
uint16 nDead; /* number of tuples to become DEAD */
uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */
uint16 nMove; /* number of tuples to move */
uint16 nChain; /* number of tuples to re-chain */
spgxlogState stateSrc;
/*----------
* data follows:
* tuple numbers to become DEAD
* tuple numbers to become PLACEHOLDER
* tuple numbers to move from (and replace with PLACEHOLDER)
* tuple numbers to move to (replacing what is there)
* tuple numbers to update nextOffset links of
* tuple numbers to insert in nextOffset links
*----------
*/
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogVacuumLeaf;
#define SizeOfSpgxlogVacuumLeaf offsetof(spgxlogVacuumLeaf, offsets)
typedef struct spgxlogVacuumRoot
{
/* vacuum a root page when it is also a leaf */
uint16 nDelete; /* number of tuples to delete */
spgxlogState stateSrc;
/* offsets of tuples to delete follow */
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogVacuumRoot;
#define SizeOfSpgxlogVacuumRoot offsetof(spgxlogVacuumRoot, offsets)
typedef struct spgxlogVacuumRedirect
{
uint16 nToPlaceholder; /* number of redirects to make placeholders */
OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */
TransactionId snapshotConflictHorizon; /* newest XID of removed redirects */
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
bool isCatalogRel; /* to handle recovery conflict during logical
* decoding on standby */
/* offsets of redirect tuples to make placeholders follow */
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogVacuumRedirect;
#define SizeOfSpgxlogVacuumRedirect offsetof(spgxlogVacuumRedirect, offsets)
extern void spg_redo(XLogReaderState *record);
extern void spg_desc(StringInfo buf, XLogReaderState *record);
extern const char *spg_identify(uint8 info);
extern void spg_xlog_startup(void);
extern void spg_xlog_cleanup(void);
extern void spg_mask(char *pagedata, BlockNumber blkno);
#endif /* SPGXLOG_H */