postgresql/src/backend/access/common/heaptuple.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1502 lines
39 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* heaptuple.c
* This file contains heap tuple accessor and mutator routines, as well
* as various tuple utilities.
*
* Some notes about varlenas and this code:
*
* Before Postgres 8.3 varlenas always had a 4-byte length header, and
* therefore always needed 4-byte alignment (at least). This wasted space
* for short varlenas, for example CHAR(1) took 5 bytes and could need up to
* 3 additional padding bytes for alignment.
*
* Now, a short varlena (up to 126 data bytes) is reduced to a 1-byte header
* and we don't align it. To hide this from datatype-specific functions that
* don't want to deal with it, such a datum is considered "toasted" and will
* be expanded back to the normal 4-byte-header format by pg_detoast_datum.
* (In performance-critical code paths we can use pg_detoast_datum_packed
* and the appropriate access macros to avoid that overhead.) Note that this
* conversion is performed directly in heap_form_tuple, without invoking
* heaptoast.c.
*
* This change will break any code that assumes it needn't detoast values
* that have been put into a tuple but never sent to disk. Hopefully there
* are few such places.
*
* Varlenas still have alignment INT (or DOUBLE) in pg_type/pg_attribute, since
* that's the normal requirement for the untoasted format. But we ignore that
* for the 1-byte-header format. This means that the actual start position
* of a varlena datum may vary depending on which format it has. To determine
* what is stored, we have to require that alignment padding bytes be zero.
* (Postgres actually has always zeroed them, but now it's required!) Since
* the first byte of a 1-byte-header varlena can never be zero, we can examine
* the first byte after the previous datum to tell if it's a pad byte or the
* start of a 1-byte-header varlena.
*
* Note that while formerly we could rely on the first varlena column of a
* system catalog to be at the offset suggested by the C struct for the
* catalog, this is now risky: it's only safe if the preceding field is
* word-aligned, so that there will never be any padding.
*
* We don't pack varlenas whose attstorage is PLAIN, since the data type
* isn't expecting to have to detoast values. This is used in particular
* by oidvector and int2vector, which are used in the system catalogs
* and we'd like to still refer to them via C struct offsets.
*
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/access/common/heaptuple.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heaptoast.h"
#include "access/sysattr.h"
#include "access/tupdesc_details.h"
#include "executor/tuptable.h"
Support "expanded" objects, particularly arrays, for better performance. This patch introduces the ability for complex datatypes to have an in-memory representation that is different from their on-disk format. On-disk formats are typically optimized for minimal size, and in any case they can't contain pointers, so they are often not well-suited for computation. Now a datatype can invent an "expanded" in-memory format that is better suited for its operations, and then pass that around among the C functions that operate on the datatype. There are also provisions (rudimentary as yet) to allow an expanded object to be modified in-place under suitable conditions, so that operations like assignment to an element of an array need not involve copying the entire array. The initial application for this feature is arrays, but it is not hard to foresee using it for other container types like JSON, XML and hstore. I have hopes that it will be useful to PostGIS as well. In this initial implementation, a few heuristics have been hard-wired into plpgsql to improve performance for arrays that are stored in plpgsql variables. We would like to generalize those hacks so that other datatypes can obtain similar improvements, but figuring out some appropriate APIs is left as a task for future work. (The heuristics themselves are probably not optimal yet, either, as they sometimes force expansion of arrays that would be better left alone.) Preliminary performance testing shows impressive speed gains for plpgsql functions that do element-by-element access or update of large arrays. There are other cases that get a little slower, as a result of added array format conversions; but we can hope to improve anything that's annoyingly bad. In any case most applications should see a net win. Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
#include "utils/expandeddatum.h"
/* Does att's datatype allow packing into the 1-byte-header varlena format? */
#define ATT_IS_PACKABLE(att) \
((att)->attlen == -1 && (att)->attstorage != TYPSTORAGE_PLAIN)
/* Use this if it's already known varlena */
#define VARLENA_ATT_IS_PACKABLE(att) \
((att)->attstorage != TYPSTORAGE_PLAIN)
/* ----------------------------------------------------------------
* misc support routines
* ----------------------------------------------------------------
*/
/*
* Return the missing value of an attribute, or NULL if there isn't one.
*/
Datum
getmissingattr(TupleDesc tupleDesc,
int attnum, bool *isnull)
{
Form_pg_attribute att;
Assert(attnum <= tupleDesc->natts);
Assert(attnum > 0);
att = TupleDescAttr(tupleDesc, attnum - 1);
if (att->atthasmissing)
{
AttrMissing *attrmiss;
Assert(tupleDesc->constr);
Assert(tupleDesc->constr->missing);
attrmiss = tupleDesc->constr->missing + (attnum - 1);
if (attrmiss->am_present)
{
*isnull = false;
return attrmiss->am_value;
}
}
*isnull = true;
return PointerGetDatum(NULL);
}
/*
* heap_compute_data_size
* Determine size of the data area of a tuple to be constructed
*/
Size
heap_compute_data_size(TupleDesc tupleDesc,
Datum *values,
bool *isnull)
{
Size data_length = 0;
int i;
int numberOfAttributes = tupleDesc->natts;
for (i = 0; i < numberOfAttributes; i++)
{
Datum val;
Support "expanded" objects, particularly arrays, for better performance. This patch introduces the ability for complex datatypes to have an in-memory representation that is different from their on-disk format. On-disk formats are typically optimized for minimal size, and in any case they can't contain pointers, so they are often not well-suited for computation. Now a datatype can invent an "expanded" in-memory format that is better suited for its operations, and then pass that around among the C functions that operate on the datatype. There are also provisions (rudimentary as yet) to allow an expanded object to be modified in-place under suitable conditions, so that operations like assignment to an element of an array need not involve copying the entire array. The initial application for this feature is arrays, but it is not hard to foresee using it for other container types like JSON, XML and hstore. I have hopes that it will be useful to PostGIS as well. In this initial implementation, a few heuristics have been hard-wired into plpgsql to improve performance for arrays that are stored in plpgsql variables. We would like to generalize those hacks so that other datatypes can obtain similar improvements, but figuring out some appropriate APIs is left as a task for future work. (The heuristics themselves are probably not optimal yet, either, as they sometimes force expansion of arrays that would be better left alone.) Preliminary performance testing shows impressive speed gains for plpgsql functions that do element-by-element access or update of large arrays. There are other cases that get a little slower, as a result of added array format conversions; but we can hope to improve anything that's annoyingly bad. In any case most applications should see a net win. Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
Form_pg_attribute atti;
if (isnull[i])
continue;
val = values[i];
atti = TupleDescAttr(tupleDesc, i);
Support "expanded" objects, particularly arrays, for better performance. This patch introduces the ability for complex datatypes to have an in-memory representation that is different from their on-disk format. On-disk formats are typically optimized for minimal size, and in any case they can't contain pointers, so they are often not well-suited for computation. Now a datatype can invent an "expanded" in-memory format that is better suited for its operations, and then pass that around among the C functions that operate on the datatype. There are also provisions (rudimentary as yet) to allow an expanded object to be modified in-place under suitable conditions, so that operations like assignment to an element of an array need not involve copying the entire array. The initial application for this feature is arrays, but it is not hard to foresee using it for other container types like JSON, XML and hstore. I have hopes that it will be useful to PostGIS as well. In this initial implementation, a few heuristics have been hard-wired into plpgsql to improve performance for arrays that are stored in plpgsql variables. We would like to generalize those hacks so that other datatypes can obtain similar improvements, but figuring out some appropriate APIs is left as a task for future work. (The heuristics themselves are probably not optimal yet, either, as they sometimes force expansion of arrays that would be better left alone.) Preliminary performance testing shows impressive speed gains for plpgsql functions that do element-by-element access or update of large arrays. There are other cases that get a little slower, as a result of added array format conversions; but we can hope to improve anything that's annoyingly bad. In any case most applications should see a net win. Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
if (ATT_IS_PACKABLE(atti) &&
VARATT_CAN_MAKE_SHORT(DatumGetPointer(val)))
{
/*
* we're anticipating converting to a short varlena header, so
* adjust length and don't count any alignment
*/
data_length += VARATT_CONVERTED_SHORT_SIZE(DatumGetPointer(val));
}
Support "expanded" objects, particularly arrays, for better performance. This patch introduces the ability for complex datatypes to have an in-memory representation that is different from their on-disk format. On-disk formats are typically optimized for minimal size, and in any case they can't contain pointers, so they are often not well-suited for computation. Now a datatype can invent an "expanded" in-memory format that is better suited for its operations, and then pass that around among the C functions that operate on the datatype. There are also provisions (rudimentary as yet) to allow an expanded object to be modified in-place under suitable conditions, so that operations like assignment to an element of an array need not involve copying the entire array. The initial application for this feature is arrays, but it is not hard to foresee using it for other container types like JSON, XML and hstore. I have hopes that it will be useful to PostGIS as well. In this initial implementation, a few heuristics have been hard-wired into plpgsql to improve performance for arrays that are stored in plpgsql variables. We would like to generalize those hacks so that other datatypes can obtain similar improvements, but figuring out some appropriate APIs is left as a task for future work. (The heuristics themselves are probably not optimal yet, either, as they sometimes force expansion of arrays that would be better left alone.) Preliminary performance testing shows impressive speed gains for plpgsql functions that do element-by-element access or update of large arrays. There are other cases that get a little slower, as a result of added array format conversions; but we can hope to improve anything that's annoyingly bad. In any case most applications should see a net win. Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
else if (atti->attlen == -1 &&
VARATT_IS_EXTERNAL_EXPANDED(DatumGetPointer(val)))
{
/*
* we want to flatten the expanded value so that the constructed
* tuple doesn't depend on it
*/
data_length = att_align_nominal(data_length, atti->attalign);
data_length += EOH_get_flat_size(DatumGetEOHP(val));
}
else
{
Support "expanded" objects, particularly arrays, for better performance. This patch introduces the ability for complex datatypes to have an in-memory representation that is different from their on-disk format. On-disk formats are typically optimized for minimal size, and in any case they can't contain pointers, so they are often not well-suited for computation. Now a datatype can invent an "expanded" in-memory format that is better suited for its operations, and then pass that around among the C functions that operate on the datatype. There are also provisions (rudimentary as yet) to allow an expanded object to be modified in-place under suitable conditions, so that operations like assignment to an element of an array need not involve copying the entire array. The initial application for this feature is arrays, but it is not hard to foresee using it for other container types like JSON, XML and hstore. I have hopes that it will be useful to PostGIS as well. In this initial implementation, a few heuristics have been hard-wired into plpgsql to improve performance for arrays that are stored in plpgsql variables. We would like to generalize those hacks so that other datatypes can obtain similar improvements, but figuring out some appropriate APIs is left as a task for future work. (The heuristics themselves are probably not optimal yet, either, as they sometimes force expansion of arrays that would be better left alone.) Preliminary performance testing shows impressive speed gains for plpgsql functions that do element-by-element access or update of large arrays. There are other cases that get a little slower, as a result of added array format conversions; but we can hope to improve anything that's annoyingly bad. In any case most applications should see a net win. Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
data_length = att_align_datum(data_length, atti->attalign,
atti->attlen, val);
data_length = att_addlength_datum(data_length, atti->attlen,
val);
}
}
return data_length;
}
/*
* Per-attribute helper for heap_fill_tuple and other routines building tuples.
*
* Fill in either a data value or a bit in the null bitmask
*/
static inline void
fill_val(Form_pg_attribute att,
bits8 **bit,
int *bitmask,
char **dataP,
uint16 *infomask,
Datum datum,
bool isnull)
{
Size data_length;
char *data = *dataP;
/*
* If we're building a null bitmap, set the appropriate bit for the
* current column value here.
*/
if (bit != NULL)
{
if (*bitmask != HIGHBIT)
*bitmask <<= 1;
else
{
*bit += 1;
**bit = 0x0;
*bitmask = 1;
}
if (isnull)
{
*infomask |= HEAP_HASNULL;
return;
}
**bit |= *bitmask;
}
/*
* XXX we use the att_align macros on the pointer value itself, not on an
* offset. This is a bit of a hack.
*/
if (att->attbyval)
{
/* pass-by-value */
data = (char *) att_align_nominal(data, att->attalign);
store_att_byval(data, datum, att->attlen);
data_length = att->attlen;
}
else if (att->attlen == -1)
{
/* varlena */
Pointer val = DatumGetPointer(datum);
*infomask |= HEAP_HASVARWIDTH;
if (VARATT_IS_EXTERNAL(val))
{
if (VARATT_IS_EXTERNAL_EXPANDED(val))
{
/*
* we want to flatten the expanded value so that the
* constructed tuple doesn't depend on it
*/
ExpandedObjectHeader *eoh = DatumGetEOHP(datum);
data = (char *) att_align_nominal(data,
att->attalign);
data_length = EOH_get_flat_size(eoh);
EOH_flatten_into(eoh, data, data_length);
}
else
{
*infomask |= HEAP_HASEXTERNAL;
/* no alignment, since it's short by definition */
data_length = VARSIZE_EXTERNAL(val);
memcpy(data, val, data_length);
}
}
else if (VARATT_IS_SHORT(val))
{
/* no alignment for short varlenas */
data_length = VARSIZE_SHORT(val);
memcpy(data, val, data_length);
}
else if (VARLENA_ATT_IS_PACKABLE(att) &&
VARATT_CAN_MAKE_SHORT(val))
{
/* convert to short varlena -- no alignment */
data_length = VARATT_CONVERTED_SHORT_SIZE(val);
SET_VARSIZE_SHORT(data, data_length);
memcpy(data + 1, VARDATA(val), data_length - 1);
}
else
{
/* full 4-byte header varlena */
data = (char *) att_align_nominal(data,
att->attalign);
data_length = VARSIZE(val);
memcpy(data, val, data_length);
}
}
else if (att->attlen == -2)
{
/* cstring ... never needs alignment */
*infomask |= HEAP_HASVARWIDTH;
Assert(att->attalign == TYPALIGN_CHAR);
data_length = strlen(DatumGetCString(datum)) + 1;
memcpy(data, DatumGetPointer(datum), data_length);
}
else
{
/* fixed-length pass-by-reference */
data = (char *) att_align_nominal(data, att->attalign);
Assert(att->attlen > 0);
data_length = att->attlen;
memcpy(data, DatumGetPointer(datum), data_length);
}
data += data_length;
*dataP = data;
}
/*
* heap_fill_tuple
* Load data portion of a tuple from values/isnull arrays
*
* We also fill the null bitmap (if any) and set the infomask bits
* that reflect the tuple's data contents.
*
* NOTE: it is now REQUIRED that the caller have pre-zeroed the data area.
*/
void
heap_fill_tuple(TupleDesc tupleDesc,
Datum *values, bool *isnull,
char *data, Size data_size,
uint16 *infomask, bits8 *bit)
{
bits8 *bitP;
int bitmask;
int i;
int numberOfAttributes = tupleDesc->natts;
2007-11-15 22:14:46 +01:00
#ifdef USE_ASSERT_CHECKING
char *start = data;
#endif
if (bit != NULL)
{
bitP = &bit[-1];
bitmask = HIGHBIT;
}
else
{
/* just to keep compiler quiet */
bitP = NULL;
bitmask = 0;
}
*infomask &= ~(HEAP_HASNULL | HEAP_HASVARWIDTH | HEAP_HASEXTERNAL);
for (i = 0; i < numberOfAttributes; i++)
{
Form_pg_attribute attr = TupleDescAttr(tupleDesc, i);
fill_val(attr,
bitP ? &bitP : NULL,
&bitmask,
&data,
infomask,
values ? values[i] : PointerGetDatum(NULL),
isnull ? isnull[i] : true);
}
Assert((data - start) == data_size);
}
/* ----------------------------------------------------------------
* heap tuple interface
* ----------------------------------------------------------------
*/
/* ----------------
* heap_attisnull - returns true iff tuple attribute is not present
* ----------------
*/
bool
heap_attisnull(HeapTuple tup, int attnum, TupleDesc tupleDesc)
{
/*
* We allow a NULL tupledesc for relations not expected to have missing
* values, such as catalog relations and indexes.
*/
Assert(!tupleDesc || attnum <= tupleDesc->natts);
if (attnum > (int) HeapTupleHeaderGetNatts(tup->t_data))
{
if (tupleDesc && TupleDescAttr(tupleDesc, attnum - 1)->atthasmissing)
return false;
else
return true;
}
if (attnum > 0)
{
if (HeapTupleNoNulls(tup))
return false;
1998-11-27 20:52:36 +01:00
return att_isnull(attnum - 1, tup->t_data->t_bits);
}
switch (attnum)
{
case TableOidAttributeNumber:
case SelfItemPointerAttributeNumber:
case MinTransactionIdAttributeNumber:
case MinCommandIdAttributeNumber:
case MaxTransactionIdAttributeNumber:
case MaxCommandIdAttributeNumber:
/* these are never null */
break;
default:
elog(ERROR, "invalid attnum: %d", attnum);
}
return false;
}
/* ----------------
* nocachegetattr
*
* This only gets called from fastgetattr() macro, in cases where
* we can't use a cacheoffset and the value is not null.
*
* This caches attribute offsets in the attribute descriptor.
*
* An alternative way to speed things up would be to cache offsets
* with the tuple, but that seems more difficult unless you take
* the storage hit of actually putting those offsets into the
* tuple you send to disk. Yuck.
*
* This scheme will be slightly slower than that, but should
* perform well for queries which hit large #'s of tuples. After
* you cache the offsets once, examining all the other tuples using
* the same attribute descriptor will go much quicker. -cim 5/4/91
*
* NOTE: if you need to change this code, see also heap_deform_tuple.
* Also see nocache_index_getattr, which is the same code for index
* tuples.
* ----------------
*/
Datum
nocachegetattr(HeapTuple tup,
int attnum,
TupleDesc tupleDesc)
{
HeapTupleHeader td = tup->t_data;
char *tp; /* ptr to data part of tuple */
bits8 *bp = td->t_bits; /* ptr to null bitmap in tuple */
bool slow = false; /* do we have to walk attrs? */
int off; /* current offset within data */
/* ----------------
* Three cases:
*
* 1: No nulls and no variable-width attributes.
* 2: Has a null or a var-width AFTER att.
* 3: Has nulls or var-widths BEFORE att.
* ----------------
*/
attnum--;
if (!HeapTupleNoNulls(tup))
{
/*
* there's a null somewhere in the tuple
*
* check to see if any preceding bits are null...
*/
int byte = attnum >> 3;
int finalbit = attnum & 0x07;
/* check for nulls "before" final bit of last byte */
if ((~bp[byte]) & ((1 << finalbit) - 1))
slow = true;
else
{
/* check for nulls in any "earlier" bytes */
int i;
for (i = 0; i < byte; i++)
{
if (bp[i] != 0xFF)
{
slow = true;
break;
}
}
}
}
tp = (char *) td + td->t_hoff;
if (!slow)
{
Form_pg_attribute att;
/*
* If we get here, there are no nulls up to and including the target
* attribute. If we have a cached offset, we can use it.
*/
att = TupleDescAttr(tupleDesc, attnum);
if (att->attcacheoff >= 0)
return fetchatt(att, tp + att->attcacheoff);
/*
* Otherwise, check for non-fixed-length attrs up to and including
* target. If there aren't any, it's safe to cheaply initialize the
* cached offsets for these attrs.
*/
if (HeapTupleHasVarWidth(tup))
{
int j;
1999-05-25 18:15:34 +02:00
for (j = 0; j <= attnum; j++)
{
if (TupleDescAttr(tupleDesc, j)->attlen <= 0)
{
slow = true;
break;
}
}
}
}
if (!slow)
{
int natts = tupleDesc->natts;
int j = 1;
/*
* If we get here, we have a tuple with no nulls or var-widths up to
* and including the target attribute, so we can use the cached offset
* ... only we don't have it yet, or we'd not have got here. Since
* it's cheap to compute offsets for fixed-width columns, we take the
* opportunity to initialize the cached offsets for *all* the leading
* fixed-width columns, in hope of avoiding future visits to this
* routine.
*/
TupleDescAttr(tupleDesc, 0)->attcacheoff = 0;
/* we might have set some offsets in the slow path previously */
while (j < natts && TupleDescAttr(tupleDesc, j)->attcacheoff > 0)
j++;
off = TupleDescAttr(tupleDesc, j - 1)->attcacheoff +
TupleDescAttr(tupleDesc, j - 1)->attlen;
for (; j < natts; j++)
{
Form_pg_attribute att = TupleDescAttr(tupleDesc, j);
if (att->attlen <= 0)
break;
off = att_align_nominal(off, att->attalign);
att->attcacheoff = off;
off += att->attlen;
}
Assert(j > attnum);
off = TupleDescAttr(tupleDesc, attnum)->attcacheoff;
}
else
{
bool usecache = true;
int i;
/*
* Now we know that we have to walk the tuple CAREFULLY. But we still
* might be able to cache some offsets for next time.
*
* Note - This loop is a little tricky. For each non-null attribute,
* we have to first account for alignment padding before the attr,
* then advance over the attr based on its length. Nulls have no
* storage and no alignment padding either. We can use/set
* attcacheoff until we reach either a null or a var-width attribute.
*/
off = 0;
for (i = 0;; i++) /* loop exit is at "break" */
{
Form_pg_attribute att = TupleDescAttr(tupleDesc, i);
if (HeapTupleHasNulls(tup) && att_isnull(i, bp))
{
usecache = false;
continue; /* this cannot be the target att */
}
/* If we know the next offset, we can skip the rest */
if (usecache && att->attcacheoff >= 0)
off = att->attcacheoff;
else if (att->attlen == -1)
{
/*
* We can only cache the offset for a varlena attribute if the
* offset is already suitably aligned, so that there would be
* no pad bytes in any case: then the offset will be valid for
* either an aligned or unaligned value.
*/
if (usecache &&
off == att_align_nominal(off, att->attalign))
att->attcacheoff = off;
else
{
off = att_align_pointer(off, att->attalign, -1,
tp + off);
usecache = false;
}
}
else
{
/* not varlena, so safe to use att_align_nominal */
off = att_align_nominal(off, att->attalign);
if (usecache)
att->attcacheoff = off;
}
if (i == attnum)
break;
off = att_addlength_pointer(off, att->attlen, tp + off);
if (usecache && att->attlen <= 0)
usecache = false;
}
}
return fetchatt(TupleDescAttr(tupleDesc, attnum), tp + off);
}
/* ----------------
* heap_getsysattr
*
* Fetch the value of a system attribute for a tuple.
*
* This is a support routine for the heap_getattr macro. The macro
* has already determined that the attnum refers to a system attribute.
* ----------------
*/
Datum
heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
{
Datum result;
Assert(tup);
/* Currently, no sys attribute ever reads as NULL. */
*isnull = false;
switch (attnum)
{
case SelfItemPointerAttributeNumber:
/* pass-by-reference datatype */
result = PointerGetDatum(&(tup->t_self));
break;
case MinTransactionIdAttributeNumber:
result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data));
break;
case MaxTransactionIdAttributeNumber:
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data));
break;
case MinCommandIdAttributeNumber:
case MaxCommandIdAttributeNumber:
2007-11-15 22:14:46 +01:00
/*
* cmin and cmax are now both aliases for the same field, which
* can in fact also be a combo command id. XXX perhaps we should
* return the "real" cmin or cmax if possible, that is if we are
* inside the originating transaction?
*/
result = CommandIdGetDatum(HeapTupleHeaderGetRawCommandId(tup->t_data));
break;
case TableOidAttributeNumber:
result = ObjectIdGetDatum(tup->t_tableOid);
break;
default:
elog(ERROR, "invalid attnum: %d", attnum);
result = 0; /* keep compiler quiet */
break;
}
return result;
}
/* ----------------
* heap_copytuple
*
* returns a copy of an entire tuple
*
* The HeapTuple struct, tuple header, and tuple data are all allocated
* as a single palloc() block.
* ----------------
*/
HeapTuple
heap_copytuple(HeapTuple tuple)
{
HeapTuple newTuple;
1998-11-27 20:52:36 +01:00
if (!HeapTupleIsValid(tuple) || tuple->t_data == NULL)
return NULL;
1998-11-27 20:52:36 +01:00
newTuple = (HeapTuple) palloc(HEAPTUPLESIZE + tuple->t_len);
newTuple->t_len = tuple->t_len;
newTuple->t_self = tuple->t_self;
newTuple->t_tableOid = tuple->t_tableOid;
1998-11-27 20:52:36 +01:00
newTuple->t_data = (HeapTupleHeader) ((char *) newTuple + HEAPTUPLESIZE);
memcpy((char *) newTuple->t_data, (char *) tuple->t_data, tuple->t_len);
return newTuple;
}
1998-11-27 20:52:36 +01:00
/* ----------------
* heap_copytuple_with_tuple
*
* copy a tuple into a caller-supplied HeapTuple management struct
*
* Note that after calling this function, the "dest" HeapTuple will not be
* allocated as a single palloc() block (unlike with heap_copytuple()).
1998-11-27 20:52:36 +01:00
* ----------------
*/
void
heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest)
{
if (!HeapTupleIsValid(src) || src->t_data == NULL)
{
dest->t_data = NULL;
return;
}
1999-05-25 18:15:34 +02:00
1998-11-27 20:52:36 +01:00
dest->t_len = src->t_len;
dest->t_self = src->t_self;
dest->t_tableOid = src->t_tableOid;
1998-11-27 20:52:36 +01:00
dest->t_data = (HeapTupleHeader) palloc(src->t_len);
memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len);
1998-11-27 20:52:36 +01:00
}
/*
* Expand a tuple which has fewer attributes than required. For each attribute
* not present in the sourceTuple, if there is a missing value that will be
* used. Otherwise the attribute will be set to NULL.
*
* The source tuple must have fewer attributes than the required number.
*
* Only one of targetHeapTuple and targetMinimalTuple may be supplied. The
* other argument must be NULL.
*/
static void
expand_tuple(HeapTuple *targetHeapTuple,
MinimalTuple *targetMinimalTuple,
HeapTuple sourceTuple,
TupleDesc tupleDesc)
{
AttrMissing *attrmiss = NULL;
int attnum;
int firstmissingnum;
bool hasNulls = HeapTupleHasNulls(sourceTuple);
HeapTupleHeader targetTHeader;
HeapTupleHeader sourceTHeader = sourceTuple->t_data;
int sourceNatts = HeapTupleHeaderGetNatts(sourceTHeader);
int natts = tupleDesc->natts;
int sourceNullLen;
int targetNullLen;
Size sourceDataLen = sourceTuple->t_len - sourceTHeader->t_hoff;
Size targetDataLen;
Size len;
int hoff;
bits8 *nullBits = NULL;
int bitMask = 0;
char *targetData;
uint16 *infoMask;
Assert((targetHeapTuple && !targetMinimalTuple)
|| (!targetHeapTuple && targetMinimalTuple));
Assert(sourceNatts < natts);
sourceNullLen = (hasNulls ? BITMAPLEN(sourceNatts) : 0);
targetDataLen = sourceDataLen;
if (tupleDesc->constr &&
tupleDesc->constr->missing)
{
/*
* If there are missing values we want to put them into the tuple.
* Before that we have to compute the extra length for the values
* array and the variable length data.
*/
attrmiss = tupleDesc->constr->missing;
/*
* Find the first item in attrmiss for which we don't have a value in
* the source. We can ignore all the missing entries before that.
*/
for (firstmissingnum = sourceNatts;
firstmissingnum < natts;
firstmissingnum++)
{
if (attrmiss[firstmissingnum].am_present)
break;
else
hasNulls = true;
}
/*
* Now walk the missing attributes. If there is a missing value make
* space for it. Otherwise, it's going to be NULL.
*/
for (attnum = firstmissingnum;
attnum < natts;
attnum++)
{
if (attrmiss[attnum].am_present)
{
Form_pg_attribute att = TupleDescAttr(tupleDesc, attnum);
targetDataLen = att_align_datum(targetDataLen,
att->attalign,
att->attlen,
attrmiss[attnum].am_value);
targetDataLen = att_addlength_pointer(targetDataLen,
att->attlen,
attrmiss[attnum].am_value);
}
else
{
/* no missing value, so it must be null */
hasNulls = true;
}
}
} /* end if have missing values */
else
{
/*
* If there are no missing values at all then NULLS must be allowed,
* since some of the attributes are known to be absent.
*/
hasNulls = true;
}
len = 0;
if (hasNulls)
{
targetNullLen = BITMAPLEN(natts);
len += targetNullLen;
}
else
targetNullLen = 0;
/*
* Allocate and zero the space needed. Note that the tuple body and
* HeapTupleData management structure are allocated in one chunk.
*/
if (targetHeapTuple)
{
len += offsetof(HeapTupleHeaderData, t_bits);
hoff = len = MAXALIGN(len); /* align user data safely */
len += targetDataLen;
*targetHeapTuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
(*targetHeapTuple)->t_data
= targetTHeader
= (HeapTupleHeader) ((char *) *targetHeapTuple + HEAPTUPLESIZE);
(*targetHeapTuple)->t_len = len;
(*targetHeapTuple)->t_tableOid = sourceTuple->t_tableOid;
(*targetHeapTuple)->t_self = sourceTuple->t_self;
targetTHeader->t_infomask = sourceTHeader->t_infomask;
targetTHeader->t_hoff = hoff;
HeapTupleHeaderSetNatts(targetTHeader, natts);
HeapTupleHeaderSetDatumLength(targetTHeader, len);
HeapTupleHeaderSetTypeId(targetTHeader, tupleDesc->tdtypeid);
HeapTupleHeaderSetTypMod(targetTHeader, tupleDesc->tdtypmod);
/* We also make sure that t_ctid is invalid unless explicitly set */
ItemPointerSetInvalid(&(targetTHeader->t_ctid));
if (targetNullLen > 0)
nullBits = (bits8 *) ((char *) (*targetHeapTuple)->t_data
+ offsetof(HeapTupleHeaderData, t_bits));
targetData = (char *) (*targetHeapTuple)->t_data + hoff;
infoMask = &(targetTHeader->t_infomask);
}
else
{
len += SizeofMinimalTupleHeader;
hoff = len = MAXALIGN(len); /* align user data safely */
len += targetDataLen;
*targetMinimalTuple = (MinimalTuple) palloc0(len);
(*targetMinimalTuple)->t_len = len;
(*targetMinimalTuple)->t_hoff = hoff + MINIMAL_TUPLE_OFFSET;
(*targetMinimalTuple)->t_infomask = sourceTHeader->t_infomask;
/* Same macro works for MinimalTuples */
HeapTupleHeaderSetNatts(*targetMinimalTuple, natts);
if (targetNullLen > 0)
nullBits = (bits8 *) ((char *) *targetMinimalTuple
+ offsetof(MinimalTupleData, t_bits));
targetData = (char *) *targetMinimalTuple + hoff;
infoMask = &((*targetMinimalTuple)->t_infomask);
}
if (targetNullLen > 0)
{
if (sourceNullLen > 0)
{
/* if bitmap pre-existed copy in - all is set */
memcpy(nullBits,
((char *) sourceTHeader)
+ offsetof(HeapTupleHeaderData, t_bits),
sourceNullLen);
nullBits += sourceNullLen - 1;
}
else
{
sourceNullLen = BITMAPLEN(sourceNatts);
/* Set NOT NULL for all existing attributes */
memset(nullBits, 0xff, sourceNullLen);
nullBits += sourceNullLen - 1;
if (sourceNatts & 0x07)
{
/* build the mask (inverted!) */
bitMask = 0xff << (sourceNatts & 0x07);
/* Voila */
*nullBits = ~bitMask;
}
}
bitMask = (1 << ((sourceNatts - 1) & 0x07));
} /* End if have null bitmap */
memcpy(targetData,
((char *) sourceTuple->t_data) + sourceTHeader->t_hoff,
sourceDataLen);
targetData += sourceDataLen;
/* Now fill in the missing values */
for (attnum = sourceNatts; attnum < natts; attnum++)
{
Form_pg_attribute attr = TupleDescAttr(tupleDesc, attnum);
if (attrmiss && attrmiss[attnum].am_present)
{
fill_val(attr,
nullBits ? &nullBits : NULL,
&bitMask,
&targetData,
infoMask,
attrmiss[attnum].am_value,
false);
}
else
{
fill_val(attr,
&nullBits,
&bitMask,
&targetData,
infoMask,
(Datum) 0,
true);
}
} /* end loop over missing attributes */
}
/*
* Fill in the missing values for a minimal HeapTuple
*/
MinimalTuple
minimal_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc)
{
MinimalTuple minimalTuple;
expand_tuple(NULL, &minimalTuple, sourceTuple, tupleDesc);
return minimalTuple;
}
/*
* Fill in the missing values for an ordinary HeapTuple
*/
HeapTuple
heap_expand_tuple(HeapTuple sourceTuple, TupleDesc tupleDesc)
{
HeapTuple heapTuple;
expand_tuple(&heapTuple, NULL, sourceTuple, tupleDesc);
return heapTuple;
}
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
/* ----------------
* heap_copy_tuple_as_datum
*
* copy a tuple as a composite-type Datum
* ----------------
*/
Datum
heap_copy_tuple_as_datum(HeapTuple tuple, TupleDesc tupleDesc)
{
HeapTupleHeader td;
/*
* If the tuple contains any external TOAST pointers, we have to inline
* those fields to meet the conventions for composite-type Datums.
*/
if (HeapTupleHasExternal(tuple))
return toast_flatten_tuple_to_datum(tuple->t_data,
tuple->t_len,
tupleDesc);
/*
* Fast path for easy case: just make a palloc'd copy and insert the
* correct composite-Datum header fields (since those may not be set if
* the given tuple came from disk, rather than from heap_form_tuple).
*/
td = (HeapTupleHeader) palloc(tuple->t_len);
memcpy((char *) td, (char *) tuple->t_data, tuple->t_len);
HeapTupleHeaderSetDatumLength(td, tuple->t_len);
HeapTupleHeaderSetTypeId(td, tupleDesc->tdtypeid);
HeapTupleHeaderSetTypMod(td, tupleDesc->tdtypmod);
return PointerGetDatum(td);
}
/*
* heap_form_tuple
* construct a tuple from the given values[] and isnull[] arrays,
* which are of the length indicated by tupleDescriptor->natts
*
* The result is allocated in the current memory context.
*/
HeapTuple
heap_form_tuple(TupleDesc tupleDescriptor,
Datum *values,
bool *isnull)
{
HeapTuple tuple; /* return tuple */
HeapTupleHeader td; /* tuple data */
Size len,
data_len;
int hoff;
bool hasnull = false;
int numberOfAttributes = tupleDescriptor->natts;
int i;
if (numberOfAttributes > MaxTupleAttributeNumber)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("number of columns (%d) exceeds limit (%d)",
numberOfAttributes, MaxTupleAttributeNumber)));
/*
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
{
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
hasnull = true;
break;
}
}
/*
* Determine total space needed
*/
len = offsetof(HeapTupleHeaderData, t_bits);
if (hasnull)
len += BITMAPLEN(numberOfAttributes);
hoff = len = MAXALIGN(len); /* align user data safely */
data_len = heap_compute_data_size(tupleDescriptor, values, isnull);
len += data_len;
/*
* Allocate and zero the space needed. Note that the tuple body and
* HeapTupleData management structure are allocated in one chunk.
*/
tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
/*
* And fill in the information. Note we fill the Datum fields even though
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum
* identify the tuple type if needed.
*/
tuple->t_len = len;
ItemPointerSetInvalid(&(tuple->t_self));
tuple->t_tableOid = InvalidOid;
HeapTupleHeaderSetDatumLength(td, len);
HeapTupleHeaderSetTypeId(td, tupleDescriptor->tdtypeid);
HeapTupleHeaderSetTypMod(td, tupleDescriptor->tdtypmod);
Fix postgres_fdw to return the right ctid value in EvalPlanQual cases. If a postgres_fdw foreign table is a non-locked source relation in an UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, and the query selects its ctid column, the wrong value would be returned if an EvalPlanQual recheck occurred. This happened because the foreign table's result row was copied via the ROW_MARK_COPY code path, and EvalPlanQualFetchRowMarks just unconditionally set the reconstructed tuple's t_self to "invalid". To fix that, we can have EvalPlanQualFetchRowMarks copy the composite datum's t_ctid field, and be sure to initialize that along with t_self when postgres_fdw constructs a tuple to return. If we just did that much then EvalPlanQualFetchRowMarks would start returning "(0,0)" as ctid for all other ROW_MARK_COPY cases, which perhaps does not matter much, but then again maybe it might. The cause of that is that heap_form_tuple, which is the ultimate source of all composite datums, simply leaves t_ctid as zeroes in newly constructed tuples. That seems like a bad idea on general principles: a field that's really not been initialized shouldn't appear to have a valid value. So let's eat the trivial additional overhead of doing "ItemPointerSetInvalid(&(td->t_ctid))" in heap_form_tuple. This closes out our handling of Etsuro Fujita's report that tableoid and ctid weren't correctly set in postgres_fdw EvalPlanQual cases. Along the way we did a great deal of work to improve FDWs' ability to control row locking behavior; which was not wasted effort by any means, but it didn't end up being a fix for this problem because that feature would be too expensive for postgres_fdw to use all the time. Although the fix for the tableoid misbehavior was back-patched, I'm hesitant to do so here; it seems far less likely that people would care about remote ctid than tableoid, and even such a minor behavioral change as this in heap_form_tuple is perhaps best not back-patched. So commit to HEAD only, at least for the moment. Etsuro Fujita, with some adjustments by me
2015-05-13 20:05:17 +02:00
/* We also make sure that t_ctid is invalid unless explicitly set */
ItemPointerSetInvalid(&(td->t_ctid));
HeapTupleHeaderSetNatts(td, numberOfAttributes);
td->t_hoff = hoff;
heap_fill_tuple(tupleDescriptor,
values,
isnull,
(char *) td + hoff,
data_len,
&td->t_infomask,
(hasnull ? td->t_bits : NULL));
return tuple;
}
/*
* heap_modify_tuple
* form a new tuple from an old tuple and a set of replacement values.
*
* The replValues, replIsnull, and doReplace arrays must be of the length
* indicated by tupleDesc->natts. The new tuple is constructed using the data
* from replValues/replIsnull at columns where doReplace is true, and using
* the data from the old tuple at columns where doReplace is false.
*
* The result is allocated in the current memory context.
*/
HeapTuple
heap_modify_tuple(HeapTuple tuple,
TupleDesc tupleDesc,
Datum *replValues,
bool *replIsnull,
bool *doReplace)
{
int numberOfAttributes = tupleDesc->natts;
int attoff;
Datum *values;
bool *isnull;
HeapTuple newTuple;
/*
* allocate and fill values and isnull arrays from either the tuple or the
* repl information, as appropriate.
*
* NOTE: it's debatable whether to use heap_deform_tuple() here or just
* heap_getattr() only the non-replaced columns. The latter could win if
* there are many replaced columns and few non-replaced ones. However,
* heap_deform_tuple costs only O(N) while the heap_getattr way would cost
* O(N^2) if there are many non-replaced columns, so it seems better to
* err on the side of linear cost.
*/
values = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
isnull = (bool *) palloc(numberOfAttributes * sizeof(bool));
heap_deform_tuple(tuple, tupleDesc, values, isnull);
for (attoff = 0; attoff < numberOfAttributes; attoff++)
{
if (doReplace[attoff])
{
values[attoff] = replValues[attoff];
isnull[attoff] = replIsnull[attoff];
}
}
/*
* create a new tuple from the values and isnull arrays
*/
newTuple = heap_form_tuple(tupleDesc, values, isnull);
pfree(values);
pfree(isnull);
/*
Remove WITH OIDS support, change oid catalog column visibility. Previously tables declared WITH OIDS, including a significant fraction of the catalog tables, stored the oid column not as a normal column, but as part of the tuple header. This special column was not shown by default, which was somewhat odd, as it's often (consider e.g. pg_class.oid) one of the more important parts of a row. Neither pg_dump nor COPY included the contents of the oid column by default. The fact that the oid column was not an ordinary column necessitated a significant amount of special case code to support oid columns. That already was painful for the existing, but upcoming work aiming to make table storage pluggable, would have required expanding and duplicating that "specialness" significantly. WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0). Remove it. Removing includes: - CREATE TABLE and ALTER TABLE syntax for declaring the table to be WITH OIDS has been removed (WITH (oids[ = true]) will error out) - pg_dump does not support dumping tables declared WITH OIDS and will issue a warning when dumping one (and ignore the oid column). - restoring an pg_dump archive with pg_restore will warn when restoring a table with oid contents (and ignore the oid column) - COPY will refuse to load binary dump that includes oids. - pg_upgrade will error out when encountering tables declared WITH OIDS, they have to be altered to remove the oid column first. - Functionality to access the oid of the last inserted row (like plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed. The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false) for CREATE TABLE) is still supported. While that requires a bit of support code, it seems unnecessary to break applications / dumps that do not use oids, and are explicit about not using them. The biggest user of WITH OID columns was postgres' catalog. This commit changes all 'magic' oid columns to be columns that are normally declared and stored. To reduce unnecessary query breakage all the newly added columns are still named 'oid', even if a table's column naming scheme would indicate 'reloid' or such. This obviously requires adapting a lot code, mostly replacing oid access via HeapTupleGetOid() with access to the underlying Form_pg_*->oid column. The bootstrap process now assigns oids for all oid columns in genbki.pl that do not have an explicit value (starting at the largest oid previously used), only oids assigned later by oids will be above FirstBootstrapObjectId. As the oid column now is a normal column the special bootstrap syntax for oids has been removed. Oids are not automatically assigned during insertion anymore, all backend code explicitly assigns oids with GetNewOidWithIndex(). For the rare case that insertions into the catalog via SQL are called for the new pg_nextoid() function can be used (which only works on catalog tables). The fact that oid columns on system tables are now normal columns means that they will be included in the set of columns expanded by * (i.e. SELECT * FROM pg_class will now include the table's oid, previously it did not). It'd not technically be hard to hide oid column by default, but that'd mean confusing behavior would either have to be carried forward forever, or it'd cause breakage down the line. While it's not unlikely that further adjustments are needed, the scope/invasiveness of the patch makes it worthwhile to get merge this now. It's painful to maintain externally, too complicated to commit after the code code freeze, and a dependency of a number of other patches. Catversion bump, for obvious reasons. Author: Andres Freund, with contributions by John Naylor Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
* copy the identification info of the old tuple: t_ctid, t_self
*/
newTuple->t_data->t_ctid = tuple->t_data->t_ctid;
newTuple->t_self = tuple->t_self;
newTuple->t_tableOid = tuple->t_tableOid;
return newTuple;
}
/*
* heap_modify_tuple_by_cols
* form a new tuple from an old tuple and a set of replacement values.
*
* This is like heap_modify_tuple, except that instead of specifying which
* column(s) to replace by a boolean map, an array of target column numbers
* is used. This is often more convenient when a fixed number of columns
* are to be replaced. The replCols, replValues, and replIsnull arrays must
* be of length nCols. Target column numbers are indexed from 1.
*
* The result is allocated in the current memory context.
*/
HeapTuple
heap_modify_tuple_by_cols(HeapTuple tuple,
TupleDesc tupleDesc,
int nCols,
int *replCols,
Datum *replValues,
bool *replIsnull)
{
int numberOfAttributes = tupleDesc->natts;
Datum *values;
bool *isnull;
HeapTuple newTuple;
int i;
/*
* allocate and fill values and isnull arrays from the tuple, then replace
* selected columns from the input arrays.
*/
values = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
isnull = (bool *) palloc(numberOfAttributes * sizeof(bool));
heap_deform_tuple(tuple, tupleDesc, values, isnull);
for (i = 0; i < nCols; i++)
{
int attnum = replCols[i];
if (attnum <= 0 || attnum > numberOfAttributes)
elog(ERROR, "invalid column number %d", attnum);
values[attnum - 1] = replValues[i];
isnull[attnum - 1] = replIsnull[i];
}
/*
* create a new tuple from the values and isnull arrays
*/
newTuple = heap_form_tuple(tupleDesc, values, isnull);
pfree(values);
pfree(isnull);
/*
Remove WITH OIDS support, change oid catalog column visibility. Previously tables declared WITH OIDS, including a significant fraction of the catalog tables, stored the oid column not as a normal column, but as part of the tuple header. This special column was not shown by default, which was somewhat odd, as it's often (consider e.g. pg_class.oid) one of the more important parts of a row. Neither pg_dump nor COPY included the contents of the oid column by default. The fact that the oid column was not an ordinary column necessitated a significant amount of special case code to support oid columns. That already was painful for the existing, but upcoming work aiming to make table storage pluggable, would have required expanding and duplicating that "specialness" significantly. WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0). Remove it. Removing includes: - CREATE TABLE and ALTER TABLE syntax for declaring the table to be WITH OIDS has been removed (WITH (oids[ = true]) will error out) - pg_dump does not support dumping tables declared WITH OIDS and will issue a warning when dumping one (and ignore the oid column). - restoring an pg_dump archive with pg_restore will warn when restoring a table with oid contents (and ignore the oid column) - COPY will refuse to load binary dump that includes oids. - pg_upgrade will error out when encountering tables declared WITH OIDS, they have to be altered to remove the oid column first. - Functionality to access the oid of the last inserted row (like plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed. The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false) for CREATE TABLE) is still supported. While that requires a bit of support code, it seems unnecessary to break applications / dumps that do not use oids, and are explicit about not using them. The biggest user of WITH OID columns was postgres' catalog. This commit changes all 'magic' oid columns to be columns that are normally declared and stored. To reduce unnecessary query breakage all the newly added columns are still named 'oid', even if a table's column naming scheme would indicate 'reloid' or such. This obviously requires adapting a lot code, mostly replacing oid access via HeapTupleGetOid() with access to the underlying Form_pg_*->oid column. The bootstrap process now assigns oids for all oid columns in genbki.pl that do not have an explicit value (starting at the largest oid previously used), only oids assigned later by oids will be above FirstBootstrapObjectId. As the oid column now is a normal column the special bootstrap syntax for oids has been removed. Oids are not automatically assigned during insertion anymore, all backend code explicitly assigns oids with GetNewOidWithIndex(). For the rare case that insertions into the catalog via SQL are called for the new pg_nextoid() function can be used (which only works on catalog tables). The fact that oid columns on system tables are now normal columns means that they will be included in the set of columns expanded by * (i.e. SELECT * FROM pg_class will now include the table's oid, previously it did not). It'd not technically be hard to hide oid column by default, but that'd mean confusing behavior would either have to be carried forward forever, or it'd cause breakage down the line. While it's not unlikely that further adjustments are needed, the scope/invasiveness of the patch makes it worthwhile to get merge this now. It's painful to maintain externally, too complicated to commit after the code code freeze, and a dependency of a number of other patches. Catversion bump, for obvious reasons. Author: Andres Freund, with contributions by John Naylor Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
* copy the identification info of the old tuple: t_ctid, t_self
*/
newTuple->t_data->t_ctid = tuple->t_data->t_ctid;
newTuple->t_self = tuple->t_self;
newTuple->t_tableOid = tuple->t_tableOid;
return newTuple;
}
/*
* heap_deform_tuple
* Given a tuple, extract data into values/isnull arrays; this is
* the inverse of heap_form_tuple.
*
* Storage for the values/isnull arrays is provided by the caller;
* it should be sized according to tupleDesc->natts not
* HeapTupleHeaderGetNatts(tuple->t_data).
*
* Note that for pass-by-reference datatypes, the pointer placed
* in the Datum will point into the given tuple.
*
* When all or most of a tuple's fields need to be extracted,
* this routine will be significantly quicker than a loop around
* heap_getattr; the loop will become O(N^2) as soon as any
* noncacheable attribute offsets are involved.
*/
void
heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
Datum *values, bool *isnull)
{
HeapTupleHeader tup = tuple->t_data;
bool hasnulls = HeapTupleHasNulls(tuple);
int tdesc_natts = tupleDesc->natts;
int natts; /* number of atts to extract */
int attnum;
char *tp; /* ptr to tuple data */
uint32 off; /* offset in tuple data */
bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */
bool slow = false; /* can we use/set attcacheoff? */
natts = HeapTupleHeaderGetNatts(tup);
/*
* In inheritance situations, it is possible that the given tuple actually
* has more fields than the caller is expecting. Don't run off the end of
* the caller's arrays.
*/
natts = Min(natts, tdesc_natts);
tp = (char *) tup + tup->t_hoff;
off = 0;
for (attnum = 0; attnum < natts; attnum++)
{
Form_pg_attribute thisatt = TupleDescAttr(tupleDesc, attnum);
if (hasnulls && att_isnull(attnum, bp))
{
values[attnum] = (Datum) 0;
isnull[attnum] = true;
slow = true; /* can't use attcacheoff anymore */
continue;
}
isnull[attnum] = false;
if (!slow && thisatt->attcacheoff >= 0)
off = thisatt->attcacheoff;
else if (thisatt->attlen == -1)
{
/*
* We can only cache the offset for a varlena attribute if the
* offset is already suitably aligned, so that there would be no
* pad bytes in any case: then the offset will be valid for either
* an aligned or unaligned value.
*/
if (!slow &&
off == att_align_nominal(off, thisatt->attalign))
thisatt->attcacheoff = off;
else
{
off = att_align_pointer(off, thisatt->attalign, -1,
tp + off);
slow = true;
}
}
else
{
/* not varlena, so safe to use att_align_nominal */
off = att_align_nominal(off, thisatt->attalign);
if (!slow)
thisatt->attcacheoff = off;
}
values[attnum] = fetchatt(thisatt, tp + off);
off = att_addlength_pointer(off, thisatt->attlen, tp + off);
if (thisatt->attlen <= 0)
slow = true; /* can't use attcacheoff anymore */
}
/*
* If tuple doesn't have all the atts indicated by tupleDesc, read the
* rest as nulls or missing values as appropriate.
*/
for (; attnum < tdesc_natts; attnum++)
values[attnum] = getmissingattr(tupleDesc, attnum + 1, &isnull[attnum]);
}
/*
* heap_freetuple
*/
void
heap_freetuple(HeapTuple htup)
{
pfree(htup);
}
/*
* heap_form_minimal_tuple
* construct a MinimalTuple from the given values[] and isnull[] arrays,
* which are of the length indicated by tupleDescriptor->natts
*
* This is exactly like heap_form_tuple() except that the result is a
* "minimal" tuple lacking a HeapTupleData header as well as room for system
* columns.
*
* The result is allocated in the current memory context.
*/
MinimalTuple
heap_form_minimal_tuple(TupleDesc tupleDescriptor,
Datum *values,
bool *isnull)
{
MinimalTuple tuple; /* return tuple */
Size len,
data_len;
int hoff;
bool hasnull = false;
int numberOfAttributes = tupleDescriptor->natts;
int i;
if (numberOfAttributes > MaxTupleAttributeNumber)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("number of columns (%d) exceeds limit (%d)",
numberOfAttributes, MaxTupleAttributeNumber)));
/*
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
{
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
hasnull = true;
break;
}
}
/*
* Determine total space needed
*/
len = SizeofMinimalTupleHeader;
if (hasnull)
len += BITMAPLEN(numberOfAttributes);
hoff = len = MAXALIGN(len); /* align user data safely */
data_len = heap_compute_data_size(tupleDescriptor, values, isnull);
len += data_len;
/*
* Allocate and zero the space needed.
*/
tuple = (MinimalTuple) palloc0(len);
/*
* And fill in the information.
*/
tuple->t_len = len;
HeapTupleHeaderSetNatts(tuple, numberOfAttributes);
tuple->t_hoff = hoff + MINIMAL_TUPLE_OFFSET;
heap_fill_tuple(tupleDescriptor,
values,
isnull,
(char *) tuple + hoff,
data_len,
&tuple->t_infomask,
(hasnull ? tuple->t_bits : NULL));
return tuple;
}
/*
* heap_free_minimal_tuple
*/
void
heap_free_minimal_tuple(MinimalTuple mtup)
{
pfree(mtup);
}
/*
* heap_copy_minimal_tuple
* copy a MinimalTuple
*
* The result is allocated in the current memory context.
*/
MinimalTuple
heap_copy_minimal_tuple(MinimalTuple mtup)
{
MinimalTuple result;
result = (MinimalTuple) palloc(mtup->t_len);
memcpy(result, mtup, mtup->t_len);
return result;
}
/*
* heap_tuple_from_minimal_tuple
* create a HeapTuple by copying from a MinimalTuple;
* system columns are filled with zeroes
*
* The result is allocated in the current memory context.
* The HeapTuple struct, tuple header, and tuple data are all allocated
* as a single palloc() block.
*/
HeapTuple
heap_tuple_from_minimal_tuple(MinimalTuple mtup)
{
HeapTuple result;
uint32 len = mtup->t_len + MINIMAL_TUPLE_OFFSET;
result = (HeapTuple) palloc(HEAPTUPLESIZE + len);
result->t_len = len;
ItemPointerSetInvalid(&(result->t_self));
result->t_tableOid = InvalidOid;
result->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE);
memcpy((char *) result->t_data + MINIMAL_TUPLE_OFFSET, mtup, mtup->t_len);
memset(result->t_data, 0, offsetof(HeapTupleHeaderData, t_infomask2));
return result;
}
/*
* minimal_tuple_from_heap_tuple
* create a MinimalTuple by copying from a HeapTuple
*
* The result is allocated in the current memory context.
*/
MinimalTuple
minimal_tuple_from_heap_tuple(HeapTuple htup)
{
MinimalTuple result;
uint32 len;
Assert(htup->t_len > MINIMAL_TUPLE_OFFSET);
len = htup->t_len - MINIMAL_TUPLE_OFFSET;
result = (MinimalTuple) palloc(len);
memcpy(result, (char *) htup->t_data + MINIMAL_TUPLE_OFFSET, len);
result->t_len = len;
return result;
}
/*
* This mainly exists so JIT can inline the definition, but it's also
* sometimes useful in debugging sessions.
*/
size_t
varsize_any(void *p)
{
return VARSIZE_ANY(p);
}