/*-------------------------------------------------------------------------
 *
 * itup.h
 *	  POSTGRES index tuple definitions.
 *
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/itup.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef ITUP_H
#define ITUP_H

#include "access/tupdesc.h"
#include "access/tupmacs.h"
#include "storage/bufpage.h"
#include "storage/itemptr.h"
/*
 * Index tuple header structure
 *
 * All index tuples start with IndexTupleData.  If the HasNulls bit is set,
 * this is followed by an IndexAttributeBitMapData.  The index attribute
 * values follow, beginning at a MAXALIGN boundary.
 *
 * Note that the space allocated for the bitmap does not vary with the number
 * of attributes; that is because we don't have room to store the number of
 * attributes in the header.  Given the MAXALIGN constraint there's no space
 * savings to be had anyway, for usual values of INDEX_MAX_KEYS.
 */
typedef struct IndexTupleData
{
	ItemPointerData t_tid;		/* reference TID to heap tuple */

	/* ---------------
	 * t_info is laid out in the following fashion:
	 *
	 * 15th (high) bit: has nulls
	 * 14th bit: has var-width attributes
	 * 13th bit: unused
	 * 12-0 bit: size of tuple
	 * ---------------
	 */
	unsigned short t_info;		/* various info about tuple */
} IndexTupleData;				/* MORE DATA FOLLOWS AT END OF STRUCT */

typedef IndexTupleData *IndexTuple;

typedef struct IndexAttributeBitMapData
{
	bits8		bits[(INDEX_MAX_KEYS + 8 - 1) / 8];
} IndexAttributeBitMapData;

typedef IndexAttributeBitMapData * IndexAttributeBitMap;
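
/*
 * Worked example (assuming the default INDEX_MAX_KEYS of 32): bits[] is
 * (32 + 8 - 1) / 8 = 4 bytes long, one bit per possible index column,
 * regardless of how many columns this particular index actually has.
 */
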
/*
 * t_info manipulation macros
 */
#define INDEX_SIZE_MASK 0x1FFF
/* bit 0x2000 is reserved for index-AM specific usage */
#define INDEX_VAR_MASK 0x4000
#define INDEX_NULL_MASK 0x8000
#define IndexTupleSize(itup) ((Size) ((itup)->t_info & INDEX_SIZE_MASK))
#define IndexTupleHasNulls(itup) ((((IndexTuple) (itup))->t_info & INDEX_NULL_MASK))
#define IndexTupleHasVarwidths(itup) ((((IndexTuple) (itup))->t_info & INDEX_VAR_MASK))
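
/*
 * Illustrative sketch (hypothetical helper, not part of this header's API;
 * assumes postgres.h's elog() is available): reading the three t_info
 * fields through the macros above.
 */
static inline void
itup_describe(IndexTuple itup)
{
	Size		len = IndexTupleSize(itup); /* low 13 bits */
	bool		hasnulls = (IndexTupleHasNulls(itup) != 0); /* bit 0x8000 */
	bool		hasvar = (IndexTupleHasVarwidths(itup) != 0);	/* bit 0x4000 */

	elog(DEBUG1, "index tuple: %zu bytes, nulls=%d, varwidth=%d",
		 (size_t) len, hasnulls, hasvar);
}
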
/*
 * Takes an infomask as argument (primarily because this needs to be usable
 * at index_form_tuple time so enough space is allocated).
 */
#define IndexInfoFindDataOffset(t_info) \
( \
	(!((t_info) & INDEX_NULL_MASK)) ? \
	( \
		(Size)MAXALIGN(sizeof(IndexTupleData)) \
	) \
	: \
	( \
		(Size)MAXALIGN(sizeof(IndexTupleData) + sizeof(IndexAttributeBitMapData)) \
	) \
)
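
/*
 * Illustrative sketch (hypothetical helper, not part of this header):
 * locating an index tuple's attribute data.  With 8-byte MAXALIGN,
 * sizeof(IndexTupleData) is 8, so the data offset is 8 when there is no
 * null bitmap and MAXALIGN(8 + 4) = 16 when there is one (assuming the
 * default INDEX_MAX_KEYS of 32, which makes the bitmap 4 bytes).
 */
static inline char *
itup_data_start(IndexTuple itup)
{
	/*
	 * If the tuple has nulls, the bitmap lives immediately after the fixed
	 * header, at (char *) itup + sizeof(IndexTupleData); either way, the
	 * attribute values begin at the next MAXALIGN boundary.
	 */
	return (char *) itup + IndexInfoFindDataOffset(itup->t_info);
}
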
/* ----------------
 *		index_getattr
 *
 *		This gets called many times, so we macro the cacheable and NULL
 *		lookups, and call nocache_index_getattr() for the rest.
 *
 * ----------------
 */
#define index_getattr(tup, attnum, tupleDesc, isnull) \
( \
	AssertMacro(PointerIsValid(isnull) && (attnum) > 0), \
	*(isnull) = false, \
	!IndexTupleHasNulls(tup) ? \
	( \
		TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff >= 0 ? \
		( \
			fetchatt(TupleDescAttr((tupleDesc), (attnum)-1), \
					 (char *) (tup) + IndexInfoFindDataOffset((tup)->t_info) \
					 + TupleDescAttr((tupleDesc), (attnum)-1)->attcacheoff) \
		) \
		: \
			nocache_index_getattr((tup), (attnum), (tupleDesc)) \
	) \
	: \
	( \
		(att_isnull((attnum)-1, (char *)(tup) + sizeof(IndexTupleData))) ? \
		( \
			*(isnull) = true, \
			(Datum)NULL \
		) \
		: \
		( \
			nocache_index_getattr((tup), (attnum), (tupleDesc)) \
		) \
	) \
)
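
/*
 * Usage sketch (hypothetical function; in practice index_deform_tuple(),
 * declared below, performs exactly this loop): extracting every column of
 * an index tuple into parallel values/isnull arrays.  Attribute numbers
 * passed to index_getattr() are 1-based.
 */
static inline void
itup_extract_all(IndexTuple itup, TupleDesc tupdesc,
				 Datum *values, bool *isnull)
{
	int			i;

	for (i = 1; i <= tupdesc->natts; i++)
		values[i - 1] = index_getattr(itup, i, tupdesc, &isnull[i - 1]);
}
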
/*
 * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
 * fit on one index page.  An index tuple must have either data or a null
 * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
 * IndexTupleData struct.  We arrive at the divisor because each tuple
 * must be maxaligned, and it must have an associated item pointer.
 */
#define MinIndexTupleSize	MAXALIGN(sizeof(IndexTupleData) + 1)
#define MaxIndexTuplesPerPage \
	((int) ((BLCKSZ - SizeOfPageHeaderData) / \
			(MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
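
/*
 * Worked example (assuming the default 8192-byte BLCKSZ, a 24-byte page
 * header, 8-byte MAXALIGN, a 6-byte ItemPointerData, and a 4-byte
 * ItemIdData): sizeof(IndexTupleData) = 8, so each tuple costs at least
 * MAXALIGN(8 + 1) + 4 = 16 + 4 = 20 bytes, giving
 * (8192 - 24) / 20 = 408 tuples per page.
 */
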
/* routines in indextuple.c */
extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
				 Datum *values, bool *isnull);
extern Datum nocache_index_getattr(IndexTuple tup, int attnum,
					  TupleDesc tupleDesc);
extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
				   Datum *values, bool *isnull);
extern IndexTuple CopyIndexTuple(IndexTuple source);
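
/*
 * Usage sketch (hypothetical function; assumes tupdesc describes a single
 * int4 key column and that a palloc'able memory context is current):
 * round-tripping one tuple through index_form_tuple() and
 * index_deform_tuple().
 */
static inline IndexTuple
itup_roundtrip_example(TupleDesc tupdesc)
{
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	IndexTuple	itup;

	/* form a single-column tuple: integer 42, not null */
	values[0] = Int32GetDatum(42);
	isnull[0] = false;
	itup = index_form_tuple(tupdesc, values, isnull);

	/* unpack it again; values/isnull are overwritten in place */
	index_deform_tuple(itup, tupdesc, values, isnull);

	return itup;				/* caller should eventually pfree() this */
}
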
#endif /* ITUP_H */