postgresql/src/backend/access/heap/tuptoaster.c

2180 lines
58 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* tuptoaster.c
* Support routines for external and compressed storage of
* variable size attributes.
*
* Copyright (c) 2000-2014, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/access/heap/tuptoaster.c
*
*
* INTERFACE ROUTINES
* toast_insert_or_update -
* Try to make a given tuple fit into one page by compressing
* or moving off attributes
*
* toast_delete -
* Reclaim toast storage when a tuple is deleted
*
* heap_tuple_untoast_attr -
* Fetch back a given value from the "secondary" relation
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include "access/genam.h"
#include "access/heapam.h"
#include "access/tuptoaster.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "utils/fmgroids.h"
#include "utils/pg_lzcompress.h"
#include "utils/rel.h"
#include "utils/typcache.h"
#include "utils/tqual.h"
#undef TOAST_DEBUG
2001-03-22 05:01:46 +01:00
static void toast_delete_datum(Relation rel, Datum value);
static Datum toast_save_datum(Relation rel, Datum value,
struct varlena * oldexternal, int options);
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
static bool toastrel_valueid_exists(Relation toastrel, Oid valueid);
static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);
2007-11-15 22:14:46 +01:00
static struct varlena *toast_fetch_datum(struct varlena * attr);
static struct varlena *toast_fetch_datum_slice(struct varlena * attr,
2002-09-04 22:31:48 +02:00
int32 sliceoffset, int32 length);
static int toast_open_indexes(Relation toastrel,
LOCKMODE lock,
Relation **toastidxs,
int *num_indexes);
static void toast_close_indexes(Relation *toastidxs, int num_indexes,
LOCKMODE lock);
/* ----------
* heap_tuple_fetch_attr -
*
* Public entry point to get back a toasted value from
* external source (possibly still in compressed format).
*
* This will return a datum that contains all the data internally, ie, not
* relying on external storage or memory, but it can still be compressed or
* have a short header.
----------
*/
struct varlena *
2007-11-15 22:14:46 +01:00
heap_tuple_fetch_attr(struct varlena * attr)
{
2007-11-15 22:14:46 +01:00
struct varlena *result;
if (VARATT_IS_EXTERNAL_ONDISK(attr))
{
/*
* This is an external stored plain value
*/
result = toast_fetch_datum(attr);
}
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
{
/*
* copy into the caller's memory context. That's not required in all
* cases but sufficient for now since this is mainly used when we need
* to persist a Datum for unusually long time, like in a HOLD cursor.
*/
struct varatt_indirect redirect;
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
attr = (struct varlena *) redirect.pointer;
/* nested indirect Datums aren't allowed */
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
/* doesn't make much sense, but better handle it */
if (VARATT_IS_EXTERNAL_ONDISK(attr))
return heap_tuple_fetch_attr(attr);
/* copy datum verbatim */
result = (struct varlena *) palloc(VARSIZE_ANY(attr));
memcpy(result, attr, VARSIZE_ANY(attr));
}
else
{
/*
2005-10-15 04:49:52 +02:00
* This is a plain value inside of the main tuple - why am I called?
*/
result = attr;
2001-03-22 05:01:46 +01:00
}
return result;
}
/* ----------
* heap_tuple_untoast_attr -
*
* Public entry point to get back a toasted value from compression
* or external storage.
* ----------
*/
struct varlena *
2007-11-15 22:14:46 +01:00
heap_tuple_untoast_attr(struct varlena * attr)
{
if (VARATT_IS_EXTERNAL_ONDISK(attr))
{
/*
* This is an externally stored datum --- fetch it back from there
*/
attr = toast_fetch_datum(attr);
/* If it's compressed, decompress it */
if (VARATT_IS_COMPRESSED(attr))
{
PGLZ_Header *tmp = (PGLZ_Header *) attr;
attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
pglz_decompress(tmp, VARDATA(attr));
pfree(tmp);
}
}
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
{
struct varatt_indirect redirect;
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
attr = (struct varlena *) redirect.pointer;
/* nested indirect Datums aren't allowed */
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
attr = heap_tuple_untoast_attr(attr);
}
else if (VARATT_IS_COMPRESSED(attr))
{
/*
* This is a compressed value inside of the main tuple
*/
PGLZ_Header *tmp = (PGLZ_Header *) attr;
attr = (struct varlena *) palloc(PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
SET_VARSIZE(attr, PGLZ_RAW_SIZE(tmp) + VARHDRSZ);
pglz_decompress(tmp, VARDATA(attr));
}
else if (VARATT_IS_SHORT(attr))
{
/*
* This is a short-header varlena --- convert to 4-byte header format
*/
2007-11-15 22:14:46 +01:00
Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT;
Size new_size = data_size + VARHDRSZ;
struct varlena *new_attr;
new_attr = (struct varlena *) palloc(new_size);
SET_VARSIZE(new_attr, new_size);
memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size);
attr = new_attr;
}
return attr;
}
/* ----------
* heap_tuple_untoast_attr_slice -
*
2002-09-04 22:31:48 +02:00
* Public entry point to get back part of a toasted value
* from compression or external storage.
* ----------
*/
struct varlena *
2007-11-15 22:14:46 +01:00
heap_tuple_untoast_attr_slice(struct varlena * attr,
int32 sliceoffset, int32 slicelength)
{
struct varlena *preslice;
struct varlena *result;
2007-11-15 22:14:46 +01:00
char *attrdata;
2002-09-04 22:31:48 +02:00
int32 attrsize;
if (VARATT_IS_EXTERNAL_ONDISK(attr))
{
struct varatt_external toast_pointer;
2002-09-04 22:31:48 +02:00
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
/* fast path for non-compressed external datums */
if (!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
return toast_fetch_datum_slice(attr, sliceoffset, slicelength);
2002-09-04 22:31:48 +02:00
/* fetch it back (compressed marker will get set automatically) */
preslice = toast_fetch_datum(attr);
}
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
{
struct varatt_indirect redirect;
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
/* nested indirect Datums aren't allowed */
Assert(!VARATT_IS_EXTERNAL_INDIRECT(redirect.pointer));
return heap_tuple_untoast_attr_slice(redirect.pointer,
sliceoffset, slicelength);
}
else
preslice = attr;
if (VARATT_IS_COMPRESSED(preslice))
{
PGLZ_Header *tmp = (PGLZ_Header *) preslice;
2007-11-15 22:14:46 +01:00
Size size = PGLZ_RAW_SIZE(tmp) + VARHDRSZ;
preslice = (struct varlena *) palloc(size);
SET_VARSIZE(preslice, size);
pglz_decompress(tmp, VARDATA(preslice));
2002-09-04 22:31:48 +02:00
if (tmp != (PGLZ_Header *) attr)
pfree(tmp);
}
if (VARATT_IS_SHORT(preslice))
{
attrdata = VARDATA_SHORT(preslice);
attrsize = VARSIZE_SHORT(preslice) - VARHDRSZ_SHORT;
}
2002-09-04 22:31:48 +02:00
else
{
attrdata = VARDATA(preslice);
attrsize = VARSIZE(preslice) - VARHDRSZ;
}
2002-09-04 22:31:48 +02:00
/* slicing of datum for compressed cases and plain value */
2002-09-04 22:31:48 +02:00
if (sliceoffset >= attrsize)
{
sliceoffset = 0;
slicelength = 0;
}
2002-09-04 22:31:48 +02:00
if (((sliceoffset + slicelength) > attrsize) || slicelength < 0)
slicelength = attrsize - sliceoffset;
2002-09-04 22:31:48 +02:00
result = (struct varlena *) palloc(slicelength + VARHDRSZ);
SET_VARSIZE(result, slicelength + VARHDRSZ);
2002-09-04 22:31:48 +02:00
memcpy(VARDATA(result), attrdata + sliceoffset, slicelength);
2002-09-04 22:31:48 +02:00
if (preslice != attr)
pfree(preslice);
return result;
}
/* ----------
* toast_raw_datum_size -
*
* Return the raw (detoasted) size of a varlena datum
* (including the VARHDRSZ header)
* ----------
*/
Size
toast_raw_datum_size(Datum value)
{
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
Size result;
if (VARATT_IS_EXTERNAL_ONDISK(attr))
{
/* va_rawsize is the size of the original datum -- including header */
struct varatt_external toast_pointer;
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
result = toast_pointer.va_rawsize;
}
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
{
struct varatt_indirect toast_pointer;
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
/* nested indirect Datums aren't allowed */
Assert(!VARATT_IS_EXTERNAL_INDIRECT(toast_pointer.pointer));
return toast_raw_datum_size(PointerGetDatum(toast_pointer.pointer));
}
else if (VARATT_IS_COMPRESSED(attr))
{
/* here, va_rawsize is just the payload size */
result = VARRAWSIZE_4B_C(attr) + VARHDRSZ;
}
else if (VARATT_IS_SHORT(attr))
{
/*
* we have to normalize the header length to VARHDRSZ or else the
* callers of this function will be confused.
*/
result = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT + VARHDRSZ;
}
else
{
/* plain untoasted datum */
result = VARSIZE(attr);
}
return result;
}
/* ----------
* toast_datum_size
*
* Return the physical storage size (possibly compressed) of a varlena datum
* ----------
*/
2005-10-15 04:49:52 +02:00
Size
toast_datum_size(Datum value)
{
2007-11-15 22:14:46 +01:00
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
Size result;
if (VARATT_IS_EXTERNAL_ONDISK(attr))
{
/*
* Attribute is stored externally - return the extsize whether
2005-10-15 04:49:52 +02:00
* compressed or not. We do not count the size of the toast pointer
* ... should we?
*/
struct varatt_external toast_pointer;
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
result = toast_pointer.va_extsize;
}
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
{
struct varatt_indirect toast_pointer;
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
/* nested indirect Datums aren't allowed */
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
return toast_datum_size(PointerGetDatum(toast_pointer.pointer));
}
else if (VARATT_IS_SHORT(attr))
{
result = VARSIZE_SHORT(attr);
}
else
{
/*
2005-10-15 04:49:52 +02:00
* Attribute is stored inline either compressed or not, just calculate
* the size of the datum in either case.
*/
result = VARSIZE(attr);
}
return result;
}
/* ----------
* toast_delete -
*
* Cascaded delete toast-entries on DELETE
* ----------
*/
void
toast_delete(Relation rel, HeapTuple oldtup)
{
2001-03-22 05:01:46 +01:00
TupleDesc tupleDesc;
Form_pg_attribute *att;
int numAttrs;
int i;
Datum toast_values[MaxHeapAttributeNumber];
bool toast_isnull[MaxHeapAttributeNumber];
/*
* We should only ever be called for tuples of plain relations or
* materialized views --- recursing on a toast rel is bad news.
*/
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW);
/*
* Get the tuple descriptor and break down the tuple into fields.
*
* NOTE: it's debatable whether to use heap_deform_tuple() here or just
2005-10-15 04:49:52 +02:00
* heap_getattr() only the varlena columns. The latter could win if there
* are few varlena columns and many non-varlena ones. However,
* heap_deform_tuple costs only O(N) while the heap_getattr way would cost
2005-10-15 04:49:52 +02:00
* O(N^2) if there are many varlena columns, so it seems better to err on
* the side of linear cost. (We won't even be here unless there's at
* least one varlena column, by the way.)
*/
2001-03-22 05:01:46 +01:00
tupleDesc = rel->rd_att;
att = tupleDesc->attrs;
numAttrs = tupleDesc->natts;
Assert(numAttrs <= MaxHeapAttributeNumber);
heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull);
/*
2005-10-15 04:49:52 +02:00
* Check for external stored attributes and delete them from the secondary
* relation.
*/
for (i = 0; i < numAttrs; i++)
{
if (att[i]->attlen == -1)
{
2004-08-29 07:07:03 +02:00
Datum value = toast_values[i];
if (toast_isnull[i])
continue;
else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value)))
toast_delete_datum(rel, value);
else if (VARATT_IS_EXTERNAL_INDIRECT(PointerGetDatum(value)))
elog(ERROR, "attempt to delete tuple containing indirect datums");
}
}
}
/* ----------
* toast_insert_or_update -
*
* Delete no-longer-used toast-entries and create new ones to
* make the new tuple fit on INSERT or UPDATE
*
* Inputs:
* newtup: the candidate new tuple to be inserted
* oldtup: the old row version for UPDATE, or NULL for INSERT
* options: options to be passed to heap_insert() for toast rows
* Result:
* either newtup if no toasting is needed, or a palloc'd modified tuple
* that is what should actually get stored
*
* NOTE: neither newtup nor oldtup will be modified. This is a change
* from the pre-8.1 API of this routine.
* ----------
*/
HeapTuple
toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
int options)
{
HeapTuple result_tuple;
2001-03-22 05:01:46 +01:00
TupleDesc tupleDesc;
Form_pg_attribute *att;
int numAttrs;
int i;
bool need_change = false;
bool need_free = false;
bool need_delold = false;
bool has_nulls = false;
Size maxDataLen;
Size hoff;
2001-03-22 05:01:46 +01:00
char toast_action[MaxHeapAttributeNumber];
bool toast_isnull[MaxHeapAttributeNumber];
bool toast_oldisnull[MaxHeapAttributeNumber];
2001-03-22 05:01:46 +01:00
Datum toast_values[MaxHeapAttributeNumber];
Datum toast_oldvalues[MaxHeapAttributeNumber];
struct varlena *toast_oldexternal[MaxHeapAttributeNumber];
2001-03-22 05:01:46 +01:00
int32 toast_sizes[MaxHeapAttributeNumber];
bool toast_free[MaxHeapAttributeNumber];
bool toast_delold[MaxHeapAttributeNumber];
/*
* We should only ever be called for tuples of plain relations or
* materialized views --- recursing on a toast rel is bad news.
*/
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW);
/*
* Get the tuple descriptor and break down the tuple(s) into fields.
*/
2001-03-22 05:01:46 +01:00
tupleDesc = rel->rd_att;
att = tupleDesc->attrs;
numAttrs = tupleDesc->natts;
Assert(numAttrs <= MaxHeapAttributeNumber);
heap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull);
if (oldtup != NULL)
heap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull);
/* ----------
* Then collect information about the values given
*
* NOTE: toast_action[i] can have these values:
* ' ' default handling
* 'p' already processed --- don't touch it
* 'x' incompressible, but OK to move off
*
* NOTE: toast_sizes[i] is only made valid for varlena attributes with
* toast_action[i] different from 'p'.
* ----------
*/
2001-03-22 05:01:46 +01:00
memset(toast_action, ' ', numAttrs * sizeof(char));
memset(toast_oldexternal, 0, numAttrs * sizeof(struct varlena *));
2001-03-22 05:01:46 +01:00
memset(toast_free, 0, numAttrs * sizeof(bool));
memset(toast_delold, 0, numAttrs * sizeof(bool));
for (i = 0; i < numAttrs; i++)
{
2007-11-15 22:14:46 +01:00
struct varlena *old_value;
struct varlena *new_value;
if (oldtup != NULL)
{
/*
* For UPDATE get the old and new values of this attribute
*/
old_value = (struct varlena *) DatumGetPointer(toast_oldvalues[i]);
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
/*
* If the old value is stored on disk, check if it has changed so
* we have to delete it later.
*/
if (att[i]->attlen == -1 && !toast_oldisnull[i] &&
VARATT_IS_EXTERNAL_ONDISK(old_value))
{
if (toast_isnull[i] || !VARATT_IS_EXTERNAL_ONDISK(new_value) ||
2007-11-15 22:14:46 +01:00
memcmp((char *) old_value, (char *) new_value,
VARSIZE_EXTERNAL(old_value)) != 0)
{
/*
* The old external stored value isn't needed any more
* after the update
*/
toast_delold[i] = true;
need_delold = true;
}
else
{
/*
2005-10-15 04:49:52 +02:00
* This attribute isn't changed by this update so we reuse
* the original reference to the old value in the new
* tuple.
*/
toast_action[i] = 'p';
continue;
}
}
}
else
{
/*
* For INSERT simply get the new value
*/
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
}
/*
* Handle NULL attributes
*/
if (toast_isnull[i])
{
toast_action[i] = 'p';
has_nulls = true;
continue;
}
/*
* Now look at varlena attributes
*/
if (att[i]->attlen == -1)
{
/*
* If the table's attribute says PLAIN always, force it so.
*/
if (att[i]->attstorage == 'p')
toast_action[i] = 'p';
/*
* We took care of UPDATE above, so any external value we find
* still in the tuple must be someone else's we cannot reuse.
* Fetch it back (without decompression, unless we are forcing
* PLAIN storage). If necessary, we'll push it out as a new
* external value below.
*/
if (VARATT_IS_EXTERNAL(new_value))
{
toast_oldexternal[i] = new_value;
if (att[i]->attstorage == 'p')
new_value = heap_tuple_untoast_attr(new_value);
else
new_value = heap_tuple_fetch_attr(new_value);
toast_values[i] = PointerGetDatum(new_value);
toast_free[i] = true;
need_change = true;
need_free = true;
}
/*
* Remember the size of this attribute
*/
toast_sizes[i] = VARSIZE_ANY(new_value);
}
else
{
/*
* Not a varlena attribute, plain storage always
*/
toast_action[i] = 'p';
}
}
/* ----------
* Compress and/or save external until data fits into target length
*
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
* 1: Inline compress attributes with attstorage 'x', and store very
* large attributes with attstorage 'x' or 'e' external immediately
* 2: Store attributes with attstorage 'x' or 'e' external
2001-03-22 05:01:46 +01:00
* 3: Inline compress attributes with attstorage 'm'
* 4: Store attributes with attstorage 'm' external
* ----------
*/
/* compute header overhead --- this should match heap_form_tuple() */
hoff = offsetof(HeapTupleHeaderData, t_bits);
if (has_nulls)
hoff += BITMAPLEN(numAttrs);
if (newtup->t_data->t_infomask & HEAP_HASOID)
hoff += sizeof(Oid);
hoff = MAXALIGN(hoff);
/* now convert to a limit on the tuple data size */
maxDataLen = TOAST_TUPLE_TARGET - hoff;
/*
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
* Look for attributes with attstorage 'x' to compress. Also find large
* attributes with attstorage 'x' or 'e', and store them external.
*/
while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen)
{
2001-03-22 05:01:46 +01:00
int biggest_attno = -1;
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
2001-03-22 05:01:46 +01:00
Datum old_value;
Datum new_value;
/*
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
* Search for the biggest yet unprocessed internal attribute
*/
for (i = 0; i < numAttrs; i++)
{
if (toast_action[i] != ' ')
continue;
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
continue; /* can't happen, toast_action would be 'p' */
if (VARATT_IS_COMPRESSED(DatumGetPointer(toast_values[i])))
continue;
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
continue;
if (toast_sizes[i] > biggest_size)
{
biggest_attno = i;
2001-03-22 05:01:46 +01:00
biggest_size = toast_sizes[i];
}
}
if (biggest_attno < 0)
break;
/*
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
* Attempt to compress it inline, if it has attstorage 'x'
*/
2001-03-22 05:01:46 +01:00
i = biggest_attno;
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
if (att[i]->attstorage == 'x')
{
old_value = toast_values[i];
new_value = toast_compress_datum(old_value);
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
if (DatumGetPointer(new_value) != NULL)
{
/* successful compression */
if (toast_free[i])
pfree(DatumGetPointer(old_value));
toast_values[i] = new_value;
toast_free[i] = true;
toast_sizes[i] = VARSIZE(DatumGetPointer(toast_values[i]));
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
need_change = true;
need_free = true;
}
else
{
/* incompressible, ignore on subsequent compression passes */
toast_action[i] = 'x';
}
}
else
{
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
/* has attstorage 'e', ignore on subsequent compression passes */
toast_action[i] = 'x';
}
/*
* If this value is by itself more than maxDataLen (after compression
* if any), push it out to the toast table immediately, if possible.
* This avoids uselessly compressing other fields in the common case
* where we have one long field and several short ones.
*
* XXX maybe the threshold should be less than maxDataLen?
*/
if (toast_sizes[i] > maxDataLen &&
rel->rd_rel->reltoastrelid != InvalidOid)
{
old_value = toast_values[i];
toast_action[i] = 'p';
toast_values[i] = toast_save_datum(rel, toast_values[i],
toast_oldexternal[i], options);
if (toast_free[i])
pfree(DatumGetPointer(old_value));
2001-03-22 05:01:46 +01:00
toast_free[i] = true;
need_change = true;
need_free = true;
}
}
/*
2005-10-15 04:49:52 +02:00
* Second we look for attributes of attstorage 'x' or 'e' that are still
* inline. But skip this if there's no toast table to push them to.
*/
while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen &&
rel->rd_rel->reltoastrelid != InvalidOid)
{
2001-03-22 05:01:46 +01:00
int biggest_attno = -1;
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
2001-03-22 05:01:46 +01:00
Datum old_value;
/*------
* Search for the biggest yet inlined attribute with
* attstorage equals 'x' or 'e'
*------
*/
for (i = 0; i < numAttrs; i++)
{
if (toast_action[i] == 'p')
continue;
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
continue; /* can't happen, toast_action would be 'p' */
if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
continue;
if (toast_sizes[i] > biggest_size)
{
biggest_attno = i;
2001-03-22 05:01:46 +01:00
biggest_size = toast_sizes[i];
}
}
if (biggest_attno < 0)
break;
/*
* Store this external
*/
2001-03-22 05:01:46 +01:00
i = biggest_attno;
old_value = toast_values[i];
toast_action[i] = 'p';
toast_values[i] = toast_save_datum(rel, toast_values[i],
toast_oldexternal[i], options);
if (toast_free[i])
pfree(DatumGetPointer(old_value));
2001-03-22 05:01:46 +01:00
toast_free[i] = true;
need_change = true;
2001-03-22 05:01:46 +01:00
need_free = true;
}
/*
* Round 3 - this time we take attributes with storage 'm' into
* compression
*/
while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen)
{
2001-03-22 05:01:46 +01:00
int biggest_attno = -1;
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
2001-03-22 05:01:46 +01:00
Datum old_value;
Datum new_value;
/*
* Search for the biggest yet uncompressed internal attribute
*/
for (i = 0; i < numAttrs; i++)
{
if (toast_action[i] != ' ')
continue;
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
continue; /* can't happen, toast_action would be 'p' */
if (VARATT_IS_COMPRESSED(DatumGetPointer(toast_values[i])))
continue;
if (att[i]->attstorage != 'm')
continue;
if (toast_sizes[i] > biggest_size)
{
biggest_attno = i;
2001-03-22 05:01:46 +01:00
biggest_size = toast_sizes[i];
}
}
if (biggest_attno < 0)
break;
/*
* Attempt to compress it inline
*/
2001-03-22 05:01:46 +01:00
i = biggest_attno;
old_value = toast_values[i];
new_value = toast_compress_datum(old_value);
if (DatumGetPointer(new_value) != NULL)
{
/* successful compression */
if (toast_free[i])
pfree(DatumGetPointer(old_value));
2001-03-22 05:01:46 +01:00
toast_values[i] = new_value;
toast_free[i] = true;
toast_sizes[i] = VARSIZE(DatumGetPointer(toast_values[i]));
2001-03-22 05:01:46 +01:00
need_change = true;
need_free = true;
}
else
{
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
/* incompressible, ignore on subsequent compression passes */
toast_action[i] = 'x';
}
}
/*
* Finally we store attributes of type 'm' externally. At this point we
2010-02-26 03:01:40 +01:00
* increase the target tuple size, so that 'm' attributes aren't stored
* externally unless really necessary.
*/
maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff;
while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen &&
rel->rd_rel->reltoastrelid != InvalidOid)
{
2001-03-22 05:01:46 +01:00
int biggest_attno = -1;
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
2001-03-22 05:01:46 +01:00
Datum old_value;
/*--------
* Search for the biggest yet inlined attribute with
* attstorage = 'm'
*--------
*/
for (i = 0; i < numAttrs; i++)
{
if (toast_action[i] == 'p')
continue;
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
continue; /* can't happen, toast_action would be 'p' */
if (att[i]->attstorage != 'm')
continue;
if (toast_sizes[i] > biggest_size)
{
biggest_attno = i;
2001-03-22 05:01:46 +01:00
biggest_size = toast_sizes[i];
}
}
if (biggest_attno < 0)
break;
/*
* Store this external
*/
2001-03-22 05:01:46 +01:00
i = biggest_attno;
old_value = toast_values[i];
toast_action[i] = 'p';
toast_values[i] = toast_save_datum(rel, toast_values[i],
toast_oldexternal[i], options);
if (toast_free[i])
pfree(DatumGetPointer(old_value));
2001-03-22 05:01:46 +01:00
toast_free[i] = true;
need_change = true;
2001-03-22 05:01:46 +01:00
need_free = true;
}
/*
2005-10-15 04:49:52 +02:00
* In the case we toasted any values, we need to build a new heap tuple
* with the changed values.
*/
if (need_change)
{
HeapTupleHeader olddata = newtup->t_data;
HeapTupleHeader new_data;
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
int32 new_header_len;
int32 new_data_len;
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
int32 new_tuple_len;
/*
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
* Calculate the new size of the tuple.
*
* Note: we used to assume here that the old tuple's t_hoff must equal
* the new_header_len value, but that was incorrect. The old tuple
* might have a smaller-than-current natts, if there's been an ALTER
* TABLE ADD COLUMN since it was stored; and that would lead to a
* different conclusion about the size of the null bitmap, or even
* whether there needs to be one at all.
*/
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len = offsetof(HeapTupleHeaderData, t_bits);
if (has_nulls)
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len += BITMAPLEN(numAttrs);
if (olddata->t_infomask & HEAP_HASOID)
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len += sizeof(Oid);
new_header_len = MAXALIGN(new_header_len);
new_data_len = heap_compute_data_size(tupleDesc,
toast_values, toast_isnull);
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_tuple_len = new_header_len + new_data_len;
/*
* Allocate and zero the space needed, and fill HeapTupleData fields.
*/
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_tuple_len);
result_tuple->t_len = new_tuple_len;
result_tuple->t_self = newtup->t_self;
result_tuple->t_tableOid = newtup->t_tableOid;
new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE);
result_tuple->t_data = new_data;
/*
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
* Copy the existing tuple header, but adjust natts and t_hoff.
*/
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
memcpy(new_data, olddata, offsetof(HeapTupleHeaderData, t_bits));
HeapTupleHeaderSetNatts(new_data, numAttrs);
new_data->t_hoff = new_header_len;
if (olddata->t_infomask & HEAP_HASOID)
HeapTupleHeaderSetOid(new_data, HeapTupleHeaderGetOid(olddata));
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
/* Copy over the data, and fill the null bitmap if needed */
heap_fill_tuple(tupleDesc,
toast_values,
toast_isnull,
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
(char *) new_data + new_header_len,
new_data_len,
&(new_data->t_infomask),
has_nulls ? new_data->t_bits : NULL);
}
else
result_tuple = newtup;
/*
* Free allocated temp values
*/
if (need_free)
for (i = 0; i < numAttrs; i++)
if (toast_free[i])
pfree(DatumGetPointer(toast_values[i]));
/*
* Delete external values from the old tuple
*/
if (need_delold)
for (i = 0; i < numAttrs; i++)
if (toast_delold[i])
toast_delete_datum(rel, toast_oldvalues[i]);
return result_tuple;
}
Fix race condition with toast table access from a stale syscache entry. If a tuple in a syscache contains an out-of-line toasted field, and we try to fetch that field shortly after some other transaction has committed an update or deletion of the tuple, there is a race condition: vacuum could come along and remove the toast tuples before we can fetch them. This leads to transient failures like "missing chunk number 0 for toast value NNNNN in pg_toast_2619", as seen in recent reports from Andrew Hammond and Tim Uckun. The design idea of syscache is that access to stale syscache entries should be prevented by relation-level locks, but that fails for at least two cases where toasted fields are possible: ANALYZE updates pg_statistic rows without locking out sessions that might want to plan queries on the same table, and CREATE OR REPLACE FUNCTION updates pg_proc rows without any meaningful lock at all. The least risky fix seems to be an idea that Heikki suggested when we were dealing with a related problem back in August: forcibly detoast any out-of-line fields before putting a tuple into syscache in the first place. This avoids the problem because at the time we fetch the parent tuple from the catalog, we should be holding an MVCC snapshot that will prevent removal of the toast tuples, even if the parent tuple is outdated immediately after we fetch it. (Note: I'm not convinced that this statement holds true at every instant where we could be fetching a syscache entry at all, but it does appear to hold true at the times where we could fetch an entry that could have a toasted field. We will need to be a bit wary of adding toast tables to low-level catalogs that don't have them already.) An additional benefit is that subsequent uses of the syscache entry should be faster, since they won't have to detoast the field. Back-patch to all supported versions. The problem is significantly harder to reproduce in pre-9.0 releases, because of their willingness to flush every entry in a syscache whenever the underlying catalog is vacuumed (cf CatalogCacheFlushRelation); but there is still a window for trouble.
2011-11-02 00:48:37 +01:00
/* ----------
* toast_flatten_tuple -
*
* "Flatten" a tuple to contain no out-of-line toasted fields.
* (This does not eliminate compressed or short-header datums.)
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
*
* Note: we expect the caller already checked HeapTupleHasExternal(tup),
* so there is no need for a short-circuit path.
Fix race condition with toast table access from a stale syscache entry. If a tuple in a syscache contains an out-of-line toasted field, and we try to fetch that field shortly after some other transaction has committed an update or deletion of the tuple, there is a race condition: vacuum could come along and remove the toast tuples before we can fetch them. This leads to transient failures like "missing chunk number 0 for toast value NNNNN in pg_toast_2619", as seen in recent reports from Andrew Hammond and Tim Uckun. The design idea of syscache is that access to stale syscache entries should be prevented by relation-level locks, but that fails for at least two cases where toasted fields are possible: ANALYZE updates pg_statistic rows without locking out sessions that might want to plan queries on the same table, and CREATE OR REPLACE FUNCTION updates pg_proc rows without any meaningful lock at all. The least risky fix seems to be an idea that Heikki suggested when we were dealing with a related problem back in August: forcibly detoast any out-of-line fields before putting a tuple into syscache in the first place. This avoids the problem because at the time we fetch the parent tuple from the catalog, we should be holding an MVCC snapshot that will prevent removal of the toast tuples, even if the parent tuple is outdated immediately after we fetch it. (Note: I'm not convinced that this statement holds true at every instant where we could be fetching a syscache entry at all, but it does appear to hold true at the times where we could fetch an entry that could have a toasted field. We will need to be a bit wary of adding toast tables to low-level catalogs that don't have them already.) An additional benefit is that subsequent uses of the syscache entry should be faster, since they won't have to detoast the field. Back-patch to all supported versions. The problem is significantly harder to reproduce in pre-9.0 releases, because of their willingness to flush every entry in a syscache whenever the underlying catalog is vacuumed (cf CatalogCacheFlushRelation); but there is still a window for trouble.
2011-11-02 00:48:37 +01:00
* ----------
*/
HeapTuple
toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
{
HeapTuple new_tuple;
Form_pg_attribute *att = tupleDesc->attrs;
int numAttrs = tupleDesc->natts;
int i;
Datum toast_values[MaxTupleAttributeNumber];
bool toast_isnull[MaxTupleAttributeNumber];
bool toast_free[MaxTupleAttributeNumber];
/*
* Break down the tuple into fields.
*/
Assert(numAttrs <= MaxTupleAttributeNumber);
heap_deform_tuple(tup, tupleDesc, toast_values, toast_isnull);
memset(toast_free, 0, numAttrs * sizeof(bool));
for (i = 0; i < numAttrs; i++)
{
/*
* Look at non-null varlena attributes
*/
if (!toast_isnull[i] && att[i]->attlen == -1)
{
struct varlena *new_value;
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
if (VARATT_IS_EXTERNAL(new_value))
{
new_value = toast_fetch_datum(new_value);
toast_values[i] = PointerGetDatum(new_value);
toast_free[i] = true;
}
}
}
/*
* Form the reconfigured tuple.
*/
new_tuple = heap_form_tuple(tupleDesc, toast_values, toast_isnull);
/*
* Be sure to copy the tuple's OID and identity fields. We also make a
* point of copying visibility info, just in case anybody looks at those
* fields in a syscache entry.
*/
if (tupleDesc->tdhasoid)
HeapTupleSetOid(new_tuple, HeapTupleGetOid(tup));
new_tuple->t_self = tup->t_self;
new_tuple->t_tableOid = tup->t_tableOid;
new_tuple->t_data->t_choice = tup->t_data->t_choice;
new_tuple->t_data->t_ctid = tup->t_data->t_ctid;
new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
new_tuple->t_data->t_infomask |=
tup->t_data->t_infomask & HEAP_XACT_MASK;
new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
new_tuple->t_data->t_infomask2 |=
tup->t_data->t_infomask2 & HEAP2_XACT_MASK;
/*
* Free allocated temp values
*/
for (i = 0; i < numAttrs; i++)
if (toast_free[i])
pfree(DatumGetPointer(toast_values[i]));
return new_tuple;
}
/* ----------
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* toast_flatten_tuple_to_datum -
*
* "Flatten" a tuple containing out-of-line toasted fields into a Datum.
* The result is always palloc'd in the current memory context.
*
* We have a general rule that Datums of container types (rows, arrays,
* ranges, etc) must not contain any external TOAST pointers. Without
* this rule, we'd have to look inside each Datum when preparing a tuple
* for storage, which would be expensive and would fail to extend cleanly
* to new sorts of container types.
*
* However, we don't want to say that tuples represented as HeapTuples
* can't contain toasted fields, so instead this routine should be called
* when such a HeapTuple is being converted into a Datum.
*
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* While we're at it, we decompress any compressed fields too. This is not
* necessary for correctness, but reflects an expectation that compression
* will be more effective if applied to the whole tuple not individual
* fields. We are not so concerned about that that we want to deconstruct
* and reconstruct tuples just to get rid of compressed fields, however.
* So callers typically won't call this unless they see that the tuple has
* at least one external field.
*
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
* On the other hand, in-line short-header varlena fields are left alone.
* If we "untoasted" them here, they'd just get changed back to short-header
* format anyway within heap_fill_tuple.
* ----------
*/
Datum
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
toast_flatten_tuple_to_datum(HeapTupleHeader tup,
uint32 tup_len,
TupleDesc tupleDesc)
{
HeapTupleHeader new_data;
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
int32 new_header_len;
int32 new_data_len;
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
int32 new_tuple_len;
HeapTupleData tmptup;
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
Form_pg_attribute *att = tupleDesc->attrs;
int numAttrs = tupleDesc->natts;
int i;
bool has_nulls = false;
Datum toast_values[MaxTupleAttributeNumber];
bool toast_isnull[MaxTupleAttributeNumber];
bool toast_free[MaxTupleAttributeNumber];
/* Build a temporary HeapTuple control structure */
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
tmptup.t_len = tup_len;
ItemPointerSetInvalid(&(tmptup.t_self));
tmptup.t_tableOid = InvalidOid;
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
tmptup.t_data = tup;
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
/*
* Break down the tuple into fields.
*/
Assert(numAttrs <= MaxTupleAttributeNumber);
heap_deform_tuple(&tmptup, tupleDesc, toast_values, toast_isnull);
memset(toast_free, 0, numAttrs * sizeof(bool));
for (i = 0; i < numAttrs; i++)
{
/*
* Look at non-null varlena attributes
*/
if (toast_isnull[i])
has_nulls = true;
else if (att[i]->attlen == -1)
{
2007-11-15 22:14:46 +01:00
struct varlena *new_value;
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
if (VARATT_IS_EXTERNAL(new_value) ||
VARATT_IS_COMPRESSED(new_value))
{
new_value = heap_tuple_untoast_attr(new_value);
toast_values[i] = PointerGetDatum(new_value);
toast_free[i] = true;
}
}
}
/*
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
* Calculate the new size of the tuple.
*
* This should match the reconstruction code in toast_insert_or_update.
*/
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len = offsetof(HeapTupleHeaderData, t_bits);
if (has_nulls)
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len += BITMAPLEN(numAttrs);
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
if (tup->t_infomask & HEAP_HASOID)
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_header_len += sizeof(Oid);
new_header_len = MAXALIGN(new_header_len);
new_data_len = heap_compute_data_size(tupleDesc,
toast_values, toast_isnull);
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_tuple_len = new_header_len + new_data_len;
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
new_data = (HeapTupleHeader) palloc0(new_tuple_len);
/*
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
* Copy the existing tuple header, but adjust natts and t_hoff.
*/
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
memcpy(new_data, tup, offsetof(HeapTupleHeaderData, t_bits));
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
HeapTupleHeaderSetNatts(new_data, numAttrs);
new_data->t_hoff = new_header_len;
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
if (tup->t_infomask & HEAP_HASOID)
HeapTupleHeaderSetOid(new_data, HeapTupleHeaderGetOid(tup));
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
/* Set the composite-Datum header fields correctly */
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
HeapTupleHeaderSetDatumLength(new_data, new_tuple_len);
Fix failure to detoast fields in composite elements of structured types. If we have an array of records stored on disk, the individual record fields cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are only prepared to deal with TOAST pointers appearing in top-level fields of a stored row. The same applies for ranges over composite types, nested composites, etc. However, the existing code only took care of expanding sub-field TOAST pointers for the case of nested composites, not for other structured types containing composites. For example, given a command such as UPDATE tab SET arraycol = ARRAY[(ROW(x,42)::mycompositetype] ... where x is a direct reference to a field of an on-disk tuple, if that field is long enough to be toasted out-of-line then the TOAST pointer would be inserted as-is into the array column. If the source record for x is later deleted, the array field value would become a dangling pointer, leading to errors along the line of "missing chunk number 0 for toast value ..." when the value is referenced. A reproducible test case for this was provided by Jan Pecek, but it seems likely that some of the "missing chunk number" reports we've heard in the past were caused by similar issues. Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to produce a self-contained Datum value if the Datum is of composite type. Seen in this light, the problem is not just confined to arrays and ranges, but could also affect some other places where detoasting is done in that way, for example form_index_tuple(). I tried teaching the array code to apply toast_flatten_tuple_attribute() along with PG_DETOAST_DATUM() when the array element type is composite, but this was messy and imposed extra cache lookup costs whether or not any TOAST pointers were present, indeed sometimes when the array element type isn't even composite (since sometimes it takes a typcache lookup to find that out). The idea of extending that approach to all the places that currently use PG_DETOAST_DATUM() wasn't attractive at all. This patch instead solves the problem by decreeing that composite Datum values must not contain any out-of-line TOAST pointers in the first place; that is, we expand out-of-line fields at the point of constructing a composite Datum, not at the point where we're about to insert it into a larger tuple. This rule is applied only to true composite Datums, not to tuples that are being passed around the system as tuples, so it's not as invasive as it might sound at first. With this approach, the amount of code that has to be touched for a full solution is greatly reduced, and added cache lookup costs are avoided except when there actually is a TOAST pointer that needs to be inlined. The main drawback of this approach is that we might sometimes dereference a TOAST pointer that will never actually be used by the query, imposing a rather large cost that wasn't there before. On the other side of the coin, if the field value is used multiple times then we'll come out ahead by avoiding repeat detoastings. Experimentation suggests that common SQL coding patterns are unaffected either way, though. Applications that are very negatively affected could be advised to modify their code to not fetch columns they won't be using. In future, we might consider reverting this solution in favor of detoasting only at the point where data is about to be stored to disk, using some method that can drill down into multiple levels of nested structured types. That will require defining new APIs for structured types, though, so it doesn't seem feasible as a back-patchable fix. Note that this patch changes HeapTupleGetDatum() from a macro to a function call; this means that any third-party code using that macro will not get protection against creating TOAST-pointer-containing Datums until it's recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER(). It seems likely that this is not a big problem in practice: most of the tuple-returning functions in core and contrib produce outputs that could not possibly be toasted anyway, and the same probably holds for third-party extensions. This bug has existed since TOAST was invented, so back-patch to all supported branches.
2014-05-01 21:19:06 +02:00
HeapTupleHeaderSetTypeId(new_data, tupleDesc->tdtypeid);
HeapTupleHeaderSetTypMod(new_data, tupleDesc->tdtypmod);
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
/* Copy over the data, and fill the null bitmap if needed */
heap_fill_tuple(tupleDesc,
toast_values,
toast_isnull,
Don't assume that a tuple's header size is unchanged during toasting. This assumption can be wrong when the toaster is passed a raw on-disk tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation that added columns without rewriting the table. In such a case the tuple's natts value is smaller than what we expect from the tuple descriptor, and so its t_hoff value could be smaller too. In fact, the tuple might not have a null bitmap at all, and yet our current opinion of it is that it contains some trailing nulls. In such a situation, toast_insert_or_update did the wrong thing, because to save a few lines of code it would use the old t_hoff value as the offset where heap_fill_tuple should start filling data. This did not leave enough room for the new nulls bitmap, with the result that the first few bytes of data could be overwritten with null flag bits, as in a recent report from Hubert Depesz Lubaczewski. The particular case reported requires ALTER TABLE ADD COLUMN followed by CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and further requires that there be some out-of-line toasted fields in one of the tuples to be copied; else we'll not reach the troublesome code. The problem can only manifest in this form in 8.4 and later, because before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or INSERT/SELECT wouldn't result in raw disk tuples getting passed directly to heap_insert --- there would always have been at least a junkfilter in between, and that would reconstitute the tuple header with an up-to-date t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all the way anyway, because I'm not convinced there are no older code paths that present a similar risk.
2011-11-05 04:22:50 +01:00
(char *) new_data + new_header_len,
new_data_len,
&(new_data->t_infomask),
has_nulls ? new_data->t_bits : NULL);
/*
* Free allocated temp values
*/
for (i = 0; i < numAttrs; i++)
if (toast_free[i])
pfree(DatumGetPointer(toast_values[i]));
return PointerGetDatum(new_data);
}
/* ----------
* toast_compress_datum -
*
* Create a compressed version of a varlena datum
*
* If we fail (ie, compressed result is actually bigger than original)
* then return NULL. We must not use compressed data if it'd expand
* the tuple!
*
* We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
* copying them. But we can't handle external or compressed datums.
* ----------
*/
Datum
toast_compress_datum(Datum value)
{
struct varlena *tmp;
int32 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));
/*
* No point in wasting a palloc cycle if value size is out of the allowed
* range for compression
*/
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
if (valsize < PGLZ_strategy_default->min_input_size ||
valsize > PGLZ_strategy_default->max_input_size)
return PointerGetDatum(NULL);
tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize));
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
/*
* We recheck the actual size even if pglz_compress() reports success,
* because it might be satisfied with having saved as little as one byte
* in the compressed data --- which could turn into a net loss once you
* consider header and alignment padding. Worst case, the compressed
* format might require three padding bytes (plus header, which is
* included in VARSIZE(tmp)), whereas the uncompressed format would take
* only one header byte and no padding if the value is short enough. So
* we insist on a savings of more than 2 bytes to ensure we have a gain.
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
*/
if (pglz_compress(VARDATA_ANY(DatumGetPointer(value)), valsize,
(PGLZ_Header *) tmp, PGLZ_strategy_default) &&
This patch addresses some issues in TOAST compression strategy that were discussed last year, but we felt it was too late in the 8.3 cycle to change the code immediately. Specifically, the patch: * Reduces the minimum datum size to be considered for compression from 256 to 32 bytes, as suggested by Greg Stark. * Increases the required compression rate for compressed storage from 20% to 25%, again per Greg's suggestion. * Replaces force_input_size (size above which compression is forced) with a maximum size to be considered for compression. It was agreed that allowing large inputs to escape the minimum-compression-rate requirement was not bright, and that indeed we'd rather have a knob that acted in the other direction. I set this value to 1MB for the moment, but it could use some performance studies to tune it. * Adds an early-failure path to the compressor as suggested by Jan: if it's been unable to find even one compressible substring in the first 1KB (parameterizable), assume we're looking at incompressible input and give up. (Possibly this logic can be improved, but I'll commit it as-is for now.) * Improves the toasting heuristics so that when we have very large fields with attstorage 'x' or 'e', we will push those out to toast storage before considering inline compression of shorter fields. This also responds to a suggestion of Greg's, though my original proposal for a solution was a bit off base because it didn't fix the problem for large 'e' fields. There was some discussion in the earlier threads of exposing some of the compression knobs to users, perhaps even on a per-column basis. I have not done anything about that here. It seems to me that if we are changing around the parameters, we'd better get some experience and be sure we are happy with the design before we set things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
VARSIZE(tmp) < valsize - 2)
{
/* successful compression */
return PointerGetDatum(tmp);
}
else
{
/* incompressible data */
pfree(tmp);
return PointerGetDatum(NULL);
}
}
/* ----------
* toast_get_valid_index
*
* Get OID of valid index associated to given toast relation. A toast
* relation can have only one valid index at the same time.
*/
Oid
toast_get_valid_index(Oid toastoid, LOCKMODE lock)
{
int num_indexes;
int validIndex;
Oid validIndexOid;
Relation *toastidxs;
Relation toastrel;
/* Open the toast relation */
toastrel = heap_open(toastoid, lock);
/* Look for the valid index of the toast relation */
validIndex = toast_open_indexes(toastrel,
lock,
&toastidxs,
&num_indexes);
validIndexOid = RelationGetRelid(toastidxs[validIndex]);
/* Close the toast relation and all its indexes */
toast_close_indexes(toastidxs, num_indexes, lock);
heap_close(toastrel, lock);
return validIndexOid;
}
/* ----------
* toast_save_datum -
*
* Save one single datum into the secondary relation and return
* a Datum reference for it.
*
* rel: the main relation we're working with (not the toast rel!)
* value: datum to be pushed to toast storage
* oldexternal: if not NULL, toast pointer previously representing the datum
* options: options to be passed to heap_insert() for toast rows
* ----------
*/
static Datum
toast_save_datum(Relation rel, Datum value,
struct varlena * oldexternal, int options)
{
2001-03-22 05:01:46 +01:00
Relation toastrel;
Relation *toastidxs;
2001-03-22 05:01:46 +01:00
HeapTuple toasttup;
TupleDesc toasttupDesc;
Datum t_values[3];
bool t_isnull[3];
CommandId mycid = GetCurrentCommandId(true);
struct varlena *result;
struct varatt_external toast_pointer;
struct
{
struct varlena hdr;
char data[TOAST_MAX_CHUNK_SIZE]; /* make struct big enough */
int32 align_it; /* ensure struct is aligned well enough */
} chunk_data;
2001-03-22 05:01:46 +01:00
int32 chunk_size;
int32 chunk_seq = 0;
char *data_p;
int32 data_todo;
Pointer dval = DatumGetPointer(value);
int num_indexes;
int validIndex;
Assert(!VARATT_IS_EXTERNAL(value));
/*
* Open the toast relation and its indexes. We can use the index to check
2005-10-15 04:49:52 +02:00
* uniqueness of the OID we assign to the toasted item, even though it has
* additional columns besides OID.
*/
toastrel = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock);
toasttupDesc = toastrel->rd_att;
2013-07-04 19:47:49 +02:00
/* Open all the toast indexes and look for the valid one */
validIndex = toast_open_indexes(toastrel,
RowExclusiveLock,
&toastidxs,
&num_indexes);
/*
* Get the data pointer and length, and compute va_rawsize and va_extsize.
*
2007-11-15 22:14:46 +01:00
* va_rawsize is the size of the equivalent fully uncompressed datum, so
* we have to adjust for short headers.
*
* va_extsize is the actual size of the data payload in the toast records.
*/
if (VARATT_IS_SHORT(dval))
{
data_p = VARDATA_SHORT(dval);
data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT;
2007-11-15 22:14:46 +01:00
toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */
toast_pointer.va_extsize = data_todo;
}
else if (VARATT_IS_COMPRESSED(dval))
{
data_p = VARDATA(dval);
data_todo = VARSIZE(dval) - VARHDRSZ;
/* rawsize in a compressed datum is just the size of the payload */
toast_pointer.va_rawsize = VARRAWSIZE_4B_C(dval) + VARHDRSZ;
toast_pointer.va_extsize = data_todo;
/* Assert that the numbers look like it's compressed */
Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));
}
else
{
data_p = VARDATA(dval);
data_todo = VARSIZE(dval) - VARHDRSZ;
toast_pointer.va_rawsize = VARSIZE(dval);
toast_pointer.va_extsize = data_todo;
}
2001-03-22 05:01:46 +01:00
/*
* Insert the correct table OID into the result TOAST pointer.
*
* Normally this is the actual OID of the target toast table, but during
* table-rewriting operations such as CLUSTER, we have to insert the OID
2010-02-26 03:01:40 +01:00
* of the table's real permanent toast table instead. rd_toastoid is set
* if we have to substitute such an OID.
*/
if (OidIsValid(rel->rd_toastoid))
toast_pointer.va_toastrelid = rel->rd_toastoid;
else
toast_pointer.va_toastrelid = RelationGetRelid(toastrel);
/*
* Choose an OID to use as the value ID for this toast value.
*
* Normally we just choose an unused OID within the toast table. But
* during table-rewriting operations where we are preserving an existing
* toast table OID, we want to preserve toast value OIDs too. So, if
* rd_toastoid is set and we had a prior external value from that same
* toast table, re-use its value ID. If we didn't have a prior external
* value (which is a corner case, but possible if the table's attstorage
* options have been changed), we have to pick a value ID that doesn't
* conflict with either new or existing toast value OIDs.
*/
if (!OidIsValid(rel->rd_toastoid))
{
/* normal case: just choose an unused OID */
toast_pointer.va_valueid =
GetNewOidWithIndex(toastrel,
RelationGetRelid(toastidxs[validIndex]),
(AttrNumber) 1);
}
else
{
/* rewrite case: check to see if value was in old toast table */
toast_pointer.va_valueid = InvalidOid;
if (oldexternal != NULL)
{
struct varatt_external old_toast_pointer;
Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal));
/* Must copy to access aligned fields */
VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal);
if (old_toast_pointer.va_toastrelid == rel->rd_toastoid)
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
{
/* This value came from the old toast table; reuse its OID */
toast_pointer.va_valueid = old_toast_pointer.va_valueid;
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
/*
* There is a corner case here: the table rewrite might have
* to copy both live and recently-dead versions of a row, and
* those versions could easily reference the same toast value.
* When we copy the second or later version of such a row,
* reusing the OID will mean we select an OID that's already
* in the new toast table. Check for that, and if so, just
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
* fall through without writing the data again.
*
* While annoying and ugly-looking, this is a good thing
* because it ensures that we wind up with only one copy of
* the toast value when there is only one copy in the old
* toast table. Before we detected this case, we'd have made
* multiple copies, wasting space; and what's worse, the
* copies belonging to already-deleted heap tuples would not
* be reclaimed by VACUUM.
*/
if (toastrel_valueid_exists(toastrel,
toast_pointer.va_valueid))
{
/* Match, so short-circuit the data storage loop below */
data_todo = 0;
}
}
}
if (toast_pointer.va_valueid == InvalidOid)
{
/*
* new value; must choose an OID that doesn't conflict in either
* old or new toast table
*/
do
{
toast_pointer.va_valueid =
GetNewOidWithIndex(toastrel,
RelationGetRelid(toastidxs[validIndex]),
(AttrNumber) 1);
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
} while (toastid_valueid_exists(rel->rd_toastoid,
toast_pointer.va_valueid));
}
}
/*
* Initialize constant parts of the tuple data
*/
t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid);
t_values[2] = PointerGetDatum(&chunk_data);
t_isnull[0] = false;
t_isnull[1] = false;
t_isnull[2] = false;
/*
2001-03-22 05:01:46 +01:00
* Split up the item into chunks
*/
while (data_todo > 0)
{
int i;
CHECK_FOR_INTERRUPTS();
/*
* Calculate the size of this chunk
*/
chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo);
/*
* Build a tuple and store it
*/
t_values[1] = Int32GetDatum(chunk_seq++);
SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ);
memcpy(VARDATA(&chunk_data), data_p, chunk_size);
toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull);
heap_insert(toastrel, toasttup, mycid, options, NULL);
/*
* Create the index entry. We cheat a little here by not using
2005-10-15 04:49:52 +02:00
* FormIndexDatum: this relies on the knowledge that the index columns
* are the same as the initial columns of the table for all the
* indexes.
*
* Note also that there had better not be any user-created index on
* the TOAST table, since we don't bother to update anything else.
*/
for (i = 0; i < num_indexes; i++)
{
2013-07-04 19:47:49 +02:00
/* Only index relations marked as ready can be updated */
if (IndexIsReady(toastidxs[i]->rd_index))
index_insert(toastidxs[i], t_values, t_isnull,
&(toasttup->t_self),
toastrel,
toastidxs[i]->rd_index->indisunique ?
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO);
}
/*
* Free memory
*/
heap_freetuple(toasttup);
/*
* Move on to next chunk
*/
data_todo -= chunk_size;
data_p += chunk_size;
}
/*
* Done - close toast relation and its indexes
*/
toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
heap_close(toastrel, RowExclusiveLock);
/*
* Create the TOAST pointer value that we'll return
*/
result = (struct varlena *) palloc(TOAST_POINTER_SIZE);
SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK);
memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer));
return PointerGetDatum(result);
}
/* ----------
* toast_delete_datum -
*
* Delete a single external stored value.
* ----------
*/
static void
toast_delete_datum(Relation rel, Datum value)
{
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
struct varatt_external toast_pointer;
2001-03-22 05:01:46 +01:00
Relation toastrel;
Relation *toastidxs;
2001-03-22 05:01:46 +01:00
ScanKeyData toastkey;
SysScanDesc toastscan;
HeapTuple toasttup;
int num_indexes;
int validIndex;
if (!VARATT_IS_EXTERNAL_ONDISK(attr))
return;
/* Must copy to access aligned fields */
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
/*
* Open the toast relation and its indexes
*/
toastrel = heap_open(toast_pointer.va_toastrelid, RowExclusiveLock);
/* Fetch valid relation used for process */
validIndex = toast_open_indexes(toastrel,
RowExclusiveLock,
&toastidxs,
&num_indexes);
/*
* Setup a scan key to find chunks with matching va_valueid
*/
ScanKeyInit(&toastkey,
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(toast_pointer.va_valueid));
/*
* Find all the chunks. (We don't actually care whether we see them in
* sequence or not, but since we've already locked the index we might as
* well use systable_beginscan_ordered.)
*/
toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
SnapshotToast, 1, &toastkey);
while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
{
/*
* Have a chunk, delete it
*/
simple_heap_delete(toastrel, &toasttup->t_self);
}
/*
* End scan and close relations
*/
systable_endscan_ordered(toastscan);
toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
heap_close(toastrel, RowExclusiveLock);
}
/* ----------
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
* toastrel_valueid_exists -
*
* Test whether a toast value with the given ID exists in the toast relation
* ----------
*/
static bool
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
toastrel_valueid_exists(Relation toastrel, Oid valueid)
{
bool result = false;
ScanKeyData toastkey;
SysScanDesc toastscan;
int num_indexes;
int validIndex;
Relation *toastidxs;
/* Fetch a valid index relation */
validIndex = toast_open_indexes(toastrel,
RowExclusiveLock,
&toastidxs,
&num_indexes);
/*
* Setup a scan key to find chunks with matching va_valueid
*/
ScanKeyInit(&toastkey,
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(valueid));
/*
* Is there any such chunk?
*/
toastscan = systable_beginscan(toastrel,
RelationGetRelid(toastidxs[validIndex]),
true, SnapshotToast, 1, &toastkey);
if (systable_getnext(toastscan) != NULL)
result = true;
systable_endscan(toastscan);
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
/* Clean up */
toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows. In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and VACUUM FULL try to preserve toast value OIDs from the original toast table to the new one. However, if we have to copy both live and recently-dead versions of a row that has a toasted column, those versions may well reference the same toast value with the same OID. The patch then led to duplicate-key failures as we tried to insert the toast value twice with the same OID. (The previous behavior was not very desirable either, since it would have silently inserted the same value twice with different OIDs. That wastes space, but what's worse is that the toast values inserted for already-dead heap rows would not be reclaimed by subsequent ordinary VACUUMs, since they go into the new toast table marked live not deleted.) To fix, check if the copied OID already exists in the new toast table, and if so, assume that it stores the desired value. This is reasonably safe since the only case where we will copy an OID from a previous toast pointer is when toast_insert_or_update was given that toast pointer and so we just pulled the data from the old table; if we got two different values that way then we have big problems anyway. We do have to assume that no other backend is inserting items into the new toast table concurrently, but that's surely safe for CLUSTER and VACUUM FULL. Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous patch.
2012-01-12 22:40:14 +01:00
return result;
}
/* ----------
* toastid_valueid_exists -
*
* As above, but work from toast rel's OID not an open relation
* ----------
*/
static bool
toastid_valueid_exists(Oid toastrelid, Oid valueid)
{
bool result;
Relation toastrel;
toastrel = heap_open(toastrelid, AccessShareLock);
result = toastrel_valueid_exists(toastrel, valueid);
heap_close(toastrel, AccessShareLock);
return result;
}
/* ----------
* toast_fetch_datum -
*
* Reconstruct an in memory Datum from the chunks saved
* in the toast relation
* ----------
*/
static struct varlena *
2007-11-15 22:14:46 +01:00
toast_fetch_datum(struct varlena * attr)
{
2001-03-22 05:01:46 +01:00
Relation toastrel;
Relation *toastidxs;
2001-03-22 05:01:46 +01:00
ScanKeyData toastkey;
SysScanDesc toastscan;
2001-03-22 05:01:46 +01:00
HeapTuple ttup;
TupleDesc toasttupDesc;
struct varlena *result;
struct varatt_external toast_pointer;
2001-03-22 05:01:46 +01:00
int32 ressize;
int32 residx,
nextidx;
int32 numchunks;
2001-03-22 05:01:46 +01:00
Pointer chunk;
bool isnull;
2007-11-15 22:14:46 +01:00
char *chunkdata;
2001-03-22 05:01:46 +01:00
int32 chunksize;
int num_indexes;
int validIndex;
2001-03-22 05:01:46 +01:00
if (VARATT_IS_EXTERNAL_INDIRECT(attr))
elog(ERROR, "shouldn't be called for indirect tuples");
/* Must copy to access aligned fields */
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
ressize = toast_pointer.va_extsize;
2001-03-22 05:01:46 +01:00
numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;
result = (struct varlena *) palloc(ressize + VARHDRSZ);
if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
SET_VARSIZE_COMPRESSED(result, ressize + VARHDRSZ);
else
SET_VARSIZE(result, ressize + VARHDRSZ);
/*
* Open the toast relation and its indexes
*/
toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
toasttupDesc = toastrel->rd_att;
/* Look for the valid index of the toast relation */
validIndex = toast_open_indexes(toastrel,
AccessShareLock,
&toastidxs,
&num_indexes);
/*
* Setup a scan key to fetch from the index by va_valueid
*/
ScanKeyInit(&toastkey,
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(toast_pointer.va_valueid));
/*
* Read the chunks by index
*
* Note that because the index is actually on (valueid, chunkidx) we will
* see the chunks in chunkidx order, even though we didn't explicitly ask
* for it.
*/
nextidx = 0;
toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
SnapshotToast, 1, &toastkey);
while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
{
/*
* Have a chunk, extract the sequence number and the data
*/
residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
Assert(!isnull);
chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
Assert(!isnull);
if (!VARATT_IS_EXTENDED(chunk))
{
chunksize = VARSIZE(chunk) - VARHDRSZ;
chunkdata = VARDATA(chunk);
}
else if (VARATT_IS_SHORT(chunk))
{
/* could happen due to heap_form_tuple doing its thing */
chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
chunkdata = VARDATA_SHORT(chunk);
}
else
{
/* should never happen */
elog(ERROR, "found toasted toast chunk for toast value %u in %s",
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
2007-11-15 22:14:46 +01:00
chunksize = 0; /* keep compiler quiet */
chunkdata = NULL;
}
/*
* Some checks on the data we've found
*/
if (residx != nextidx)
elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
residx, nextidx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
2001-03-22 05:01:46 +01:00
if (residx < numchunks - 1)
{
if (chunksize != TOAST_MAX_CHUNK_SIZE)
elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s",
chunksize, (int) TOAST_MAX_CHUNK_SIZE,
residx, numchunks,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
}
2007-11-15 22:14:46 +01:00
else if (residx == numchunks - 1)
{
if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize)
elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s",
chunksize,
2007-11-15 22:14:46 +01:00
(int) (ressize - residx * TOAST_MAX_CHUNK_SIZE),
residx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
}
else
elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
residx,
0, numchunks - 1,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
/*
* Copy the data into proper place in our result
*/
memcpy(VARDATA(result) + residx * TOAST_MAX_CHUNK_SIZE,
chunkdata,
chunksize);
nextidx++;
}
/*
* Final checks that we successfully fetched the datum
*/
if (nextidx != numchunks)
elog(ERROR, "missing chunk number %d for toast value %u in %s",
nextidx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
/*
* End scan and close relations
*/
systable_endscan_ordered(toastscan);
toast_close_indexes(toastidxs, num_indexes, AccessShareLock);
heap_close(toastrel, AccessShareLock);
return result;
}
/* ----------
* toast_fetch_datum_slice -
*
* Reconstruct a segment of a Datum from the chunks saved
* in the toast relation
* ----------
*/
static struct varlena *
2007-11-15 22:14:46 +01:00
toast_fetch_datum_slice(struct varlena * attr, int32 sliceoffset, int32 length)
{
Relation toastrel;
Relation *toastidxs;
ScanKeyData toastkey[3];
int nscankeys;
SysScanDesc toastscan;
HeapTuple ttup;
TupleDesc toasttupDesc;
struct varlena *result;
struct varatt_external toast_pointer;
int32 attrsize;
int32 residx;
2002-09-04 22:31:48 +02:00
int32 nextidx;
int numchunks;
int startchunk;
int endchunk;
int32 startoffset;
int32 endoffset;
2002-09-04 22:31:48 +02:00
int totalchunks;
Pointer chunk;
bool isnull;
2007-11-15 22:14:46 +01:00
char *chunkdata;
int32 chunksize;
2002-09-04 22:31:48 +02:00
int32 chcpystrt;
int32 chcpyend;
int num_indexes;
int validIndex;
Assert(VARATT_IS_EXTERNAL_ONDISK(attr));
/* Must copy to access aligned fields */
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
/*
* It's nonsense to fetch slices of a compressed datum -- this isn't lo_*
* we can't return a compressed datum which is meaningful to toast later
*/
Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));
attrsize = toast_pointer.va_extsize;
totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;
2002-09-04 22:31:48 +02:00
if (sliceoffset >= attrsize)
{
2002-09-04 22:31:48 +02:00
sliceoffset = 0;
length = 0;
}
if (((sliceoffset + length) > attrsize) || length < 0)
2002-09-04 22:31:48 +02:00
length = attrsize - sliceoffset;
result = (struct varlena *) palloc(length + VARHDRSZ);
if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
SET_VARSIZE_COMPRESSED(result, length + VARHDRSZ);
else
SET_VARSIZE(result, length + VARHDRSZ);
2002-09-04 22:31:48 +02:00
if (length == 0)
2006-10-04 02:30:14 +02:00
return result; /* Can save a lot of work at this point! */
startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
2002-09-04 22:31:48 +02:00
numchunks = (endchunk - startchunk) + 1;
startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;
/*
* Open the toast relation and its indexes
*/
toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
toasttupDesc = toastrel->rd_att;
/* Look for the valid index of toast relation */
validIndex = toast_open_indexes(toastrel,
AccessShareLock,
&toastidxs,
&num_indexes);
/*
2005-10-15 04:49:52 +02:00
* Setup a scan key to fetch from the index. This is either two keys or
* three depending on the number of chunks.
*/
ScanKeyInit(&toastkey[0],
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(toast_pointer.va_valueid));
2002-09-04 22:31:48 +02:00
/*
* Use equality condition for one chunk, a range condition otherwise:
*/
2002-09-04 22:31:48 +02:00
if (numchunks == 1)
{
ScanKeyInit(&toastkey[1],
(AttrNumber) 2,
BTEqualStrategyNumber, F_INT4EQ,
Int32GetDatum(startchunk));
2002-09-04 22:31:48 +02:00
nscankeys = 2;
}
else
{
ScanKeyInit(&toastkey[1],
(AttrNumber) 2,
BTGreaterEqualStrategyNumber, F_INT4GE,
Int32GetDatum(startchunk));
ScanKeyInit(&toastkey[2],
(AttrNumber) 2,
BTLessEqualStrategyNumber, F_INT4LE,
Int32GetDatum(endchunk));
2002-09-04 22:31:48 +02:00
nscankeys = 3;
}
/*
* Read the chunks by index
*
* The index is on (valueid, chunkidx) so they will come in order
*/
nextidx = startchunk;
toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
SnapshotToast, nscankeys, toastkey);
while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
{
/*
* Have a chunk, extract the sequence number and the data
*/
residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
Assert(!isnull);
chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
Assert(!isnull);
if (!VARATT_IS_EXTENDED(chunk))
{
chunksize = VARSIZE(chunk) - VARHDRSZ;
chunkdata = VARDATA(chunk);
}
else if (VARATT_IS_SHORT(chunk))
{
/* could happen due to heap_form_tuple doing its thing */
chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
chunkdata = VARDATA_SHORT(chunk);
}
else
{
/* should never happen */
elog(ERROR, "found toasted toast chunk for toast value %u in %s",
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
2007-11-15 22:14:46 +01:00
chunksize = 0; /* keep compiler quiet */
chunkdata = NULL;
}
/*
* Some checks on the data we've found
*/
if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
residx, nextidx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
if (residx < totalchunks - 1)
{
if (chunksize != TOAST_MAX_CHUNK_SIZE)
elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s when fetching slice",
chunksize, (int) TOAST_MAX_CHUNK_SIZE,
residx, totalchunks,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
}
2007-11-15 22:14:46 +01:00
else if (residx == totalchunks - 1)
{
if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s when fetching slice",
chunksize,
(int) (attrsize - residx * TOAST_MAX_CHUNK_SIZE),
residx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
}
else
elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
residx,
0, totalchunks - 1,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
/*
* Copy the data into proper place in our result
*/
chcpystrt = 0;
chcpyend = chunksize - 1;
2002-09-04 22:31:48 +02:00
if (residx == startchunk)
chcpystrt = startoffset;
if (residx == endchunk)
chcpyend = endoffset;
memcpy(VARDATA(result) +
2002-09-04 22:31:48 +02:00
(residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
chunkdata + chcpystrt,
(chcpyend - chcpystrt) + 1);
2002-09-04 22:31:48 +02:00
nextidx++;
}
/*
* Final checks that we successfully fetched the datum
*/
2002-09-04 22:31:48 +02:00
if (nextidx != (endchunk + 1))
elog(ERROR, "missing chunk number %d for toast value %u in %s",
nextidx,
toast_pointer.va_valueid,
RelationGetRelationName(toastrel));
/*
* End scan and close relations
*/
systable_endscan_ordered(toastscan);
toast_close_indexes(toastidxs, num_indexes, AccessShareLock);
heap_close(toastrel, AccessShareLock);
return result;
}
/* ----------
* toast_open_indexes
*
* Get an array of the indexes associated to the given toast relation
* and return as well the position of the valid index used by the toast
* relation in this array. It is the responsibility of the caller of this
* function to close the indexes as well as free them.
*/
static int
toast_open_indexes(Relation toastrel,
LOCKMODE lock,
Relation **toastidxs,
int *num_indexes)
{
int i = 0;
int res = 0;
bool found = false;
List *indexlist;
ListCell *lc;
/* Get index list of the toast relation */
indexlist = RelationGetIndexList(toastrel);
Assert(indexlist != NIL);
*num_indexes = list_length(indexlist);
/* Open all the index relations */
*toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation));
foreach(lc, indexlist)
(*toastidxs)[i++] = index_open(lfirst_oid(lc), lock);
/* Fetch the first valid index in list */
for (i = 0; i < *num_indexes; i++)
{
Relation toastidx = (*toastidxs)[i];
if (toastidx->rd_index->indisvalid)
{
res = i;
found = true;
break;
}
}
/*
* Free index list, not necessary anymore as relations are opened and a
* valid index has been found.
*/
list_free(indexlist);
/*
* The toast relation should have one valid index, so something is going
* wrong if there is nothing.
*/
if (!found)
elog(ERROR, "no valid index found for toast relation with Oid %d",
RelationGetRelid(toastrel));
return res;
}
/* ----------
* toast_close_indexes
*
* Close an array of indexes for a toast relation and free it. This should
* be called for a set of indexes opened previously with toast_open_indexes.
*/
static void
toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock)
{
int i;
/* Close relations and clean up things */
for (i = 0; i < num_indexes; i++)
index_close(toastidxs[i], lock);
pfree(toastidxs);
}