1999-12-21 01:06:44 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* tuptoaster.c
|
|
|
|
* Support routines for external and compressed storage of
|
2000-04-12 19:17:23 +02:00
|
|
|
* variable size attributes.
|
1999-12-21 01:06:44 +01:00
|
|
|
*
|
2015-01-06 17:43:47 +01:00
|
|
|
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
|
1999-12-21 01:06:44 +01:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/access/heap/tuptoaster.c
|
1999-12-21 01:06:44 +01:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* INTERFACE ROUTINES
|
2005-11-20 19:38:20 +01:00
|
|
|
* toast_insert_or_update -
|
1999-12-21 01:06:44 +01:00
|
|
|
* Try to make a given tuple fit into one page by compressing
|
|
|
|
* or moving off attributes
|
|
|
|
*
|
2005-11-20 19:38:20 +01:00
|
|
|
* toast_delete -
|
|
|
|
* Reclaim toast storage when a tuple is deleted
|
|
|
|
*
|
2000-07-04 01:10:14 +02:00
|
|
|
* heap_tuple_untoast_attr -
|
|
|
|
* Fetch back a given value from the "secondary" relation
|
|
|
|
*
|
1999-12-21 01:06:44 +01:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
2000-08-04 06:16:17 +02:00
|
|
|
#include "postgres.h"
|
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
#include <unistd.h>
|
|
|
|
#include <fcntl.h>
|
1999-12-21 01:06:44 +01:00
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
#include "access/genam.h"
|
2006-07-11 20:26:11 +02:00
|
|
|
#include "access/heapam.h"
|
2000-07-04 01:10:14 +02:00
|
|
|
#include "access/tuptoaster.h"
|
2007-03-29 02:15:39 +02:00
|
|
|
#include "access/xact.h"
|
2000-07-04 01:10:14 +02:00
|
|
|
#include "catalog/catalog.h"
|
2015-02-09 07:15:24 +01:00
|
|
|
#include "common/pg_lzcompress.h"
|
2014-06-24 03:45:21 +02:00
|
|
|
#include "miscadmin.h"
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
#include "utils/expandeddatum.h"
|
2000-07-04 01:10:14 +02:00
|
|
|
#include "utils/fmgroids.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "utils/rel.h"
|
2004-06-05 03:55:05 +02:00
|
|
|
#include "utils/typcache.h"
|
2008-03-26 22:10:39 +01:00
|
|
|
#include "utils/tqual.h"
|
1999-12-21 01:06:44 +01:00
|
|
|
|
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
#undef TOAST_DEBUG
|
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
/*
|
|
|
|
* The information at the start of the compressed toast data.
|
|
|
|
*/
|
|
|
|
typedef struct toast_compress_header
|
|
|
|
{
|
2015-02-09 18:30:52 +01:00
|
|
|
int32 vl_len_; /* varlena header (do not touch directly!) */
|
2015-02-09 07:15:24 +01:00
|
|
|
int32 rawsize;
|
|
|
|
} toast_compress_header;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Utilities for manipulation of header information for compressed
|
|
|
|
* toast entries.
|
|
|
|
*/
|
2015-02-09 18:30:52 +01:00
|
|
|
#define TOAST_COMPRESS_HDRSZ ((int32) sizeof(toast_compress_header))
|
|
|
|
#define TOAST_COMPRESS_RAWSIZE(ptr) (((toast_compress_header *) (ptr))->rawsize)
|
2015-02-09 07:15:24 +01:00
|
|
|
#define TOAST_COMPRESS_RAWDATA(ptr) \
|
2015-02-09 18:30:52 +01:00
|
|
|
(((char *) (ptr)) + TOAST_COMPRESS_HDRSZ)
|
2015-02-09 07:15:24 +01:00
|
|
|
#define TOAST_COMPRESS_SET_RAWSIZE(ptr, len) \
|
2015-02-09 18:30:52 +01:00
|
|
|
(((toast_compress_header *) (ptr))->rawsize = (len))
|
2015-02-09 07:15:24 +01:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
static void toast_delete_datum(Relation rel, Datum value);
|
2011-08-16 19:48:04 +02:00
|
|
|
static Datum toast_save_datum(Relation rel, Datum value,
|
2012-06-10 21:20:04 +02:00
|
|
|
struct varlena * oldexternal, int options);
|
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows.
In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and
VACUUM FULL try to preserve toast value OIDs from the original toast table
to the new one. However, if we have to copy both live and recently-dead
versions of a row that has a toasted column, those versions may well
reference the same toast value with the same OID. The patch then led to
duplicate-key failures as we tried to insert the toast value twice with the
same OID. (The previous behavior was not very desirable either, since it
would have silently inserted the same value twice with different OIDs.
That wastes space, but what's worse is that the toast values inserted for
already-dead heap rows would not be reclaimed by subsequent ordinary
VACUUMs, since they go into the new toast table marked live not deleted.)
To fix, check if the copied OID already exists in the new toast table, and
if so, assume that it stores the desired value. This is reasonably safe
since the only case where we will copy an OID from a previous toast pointer
is when toast_insert_or_update was given that toast pointer and so we just
pulled the data from the old table; if we got two different values that way
then we have big problems anyway. We do have to assume that no other
backend is inserting items into the new toast table concurrently, but
that's surely safe for CLUSTER and VACUUM FULL.
Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous
patch.
2012-01-12 22:40:14 +01:00
|
|
|
static bool toastrel_valueid_exists(Relation toastrel, Oid valueid);
|
|
|
|
static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);
|
2007-11-15 22:14:46 +01:00
|
|
|
static struct varlena *toast_fetch_datum(struct varlena * attr);
|
|
|
|
static struct varlena *toast_fetch_datum_slice(struct varlena * attr,
|
2002-09-04 22:31:48 +02:00
|
|
|
int32 sliceoffset, int32 length);
|
2015-02-09 07:15:24 +01:00
|
|
|
static struct varlena *toast_decompress_datum(struct varlena * attr);
|
2013-07-03 20:24:09 +02:00
|
|
|
static int toast_open_indexes(Relation toastrel,
|
2014-05-06 18:12:18 +02:00
|
|
|
LOCKMODE lock,
|
|
|
|
Relation **toastidxs,
|
|
|
|
int *num_indexes);
|
2013-07-03 20:24:09 +02:00
|
|
|
static void toast_close_indexes(Relation *toastidxs, int num_indexes,
|
2014-05-06 18:12:18 +02:00
|
|
|
LOCKMODE lock);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
1999-12-21 01:06:44 +01:00
|
|
|
|
2000-07-22 13:18:47 +02:00
|
|
|
/* ----------
|
|
|
|
* heap_tuple_fetch_attr -
|
|
|
|
*
|
2007-04-06 06:21:44 +02:00
|
|
|
* Public entry point to get back a toasted value from
|
2013-07-02 19:35:14 +02:00
|
|
|
* external source (possibly still in compressed format).
|
2007-04-06 06:21:44 +02:00
|
|
|
*
|
|
|
|
* This will return a datum that contains all the data internally, ie, not
|
2013-07-02 19:35:14 +02:00
|
|
|
* relying on external storage or memory, but it can still be compressed or
|
2015-02-09 18:30:52 +01:00
|
|
|
* have a short header. Note some callers assume that if the input is an
|
|
|
|
* EXTERNAL datum, the result will be a pfree'able chunk.
|
|
|
|
* ----------
|
2000-07-22 13:18:47 +02:00
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *
|
2007-11-15 22:14:46 +01:00
|
|
|
heap_tuple_fetch_attr(struct varlena * attr)
|
2000-07-21 12:31:31 +02:00
|
|
|
{
|
2007-11-15 22:14:46 +01:00
|
|
|
struct varlena *result;
|
2000-07-22 13:18:47 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (VARATT_IS_EXTERNAL_ONDISK(attr))
|
2000-07-22 13:18:47 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-22 13:18:47 +02:00
|
|
|
* This is an external stored plain value
|
|
|
|
*/
|
|
|
|
result = toast_fetch_datum(attr);
|
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
|
|
|
{
|
|
|
|
/*
|
2015-02-09 18:30:52 +01:00
|
|
|
* This is an indirect pointer --- dereference it
|
2013-07-02 19:35:14 +02:00
|
|
|
*/
|
|
|
|
struct varatt_indirect redirect;
|
2014-05-06 18:12:18 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
2014-05-06 18:12:18 +02:00
|
|
|
attr = (struct varlena *) redirect.pointer;
|
2013-07-02 19:35:14 +02:00
|
|
|
|
|
|
|
/* nested indirect Datums aren't allowed */
|
|
|
|
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
|
|
|
|
|
2015-02-09 18:30:52 +01:00
|
|
|
/* recurse if value is still external in some other way */
|
|
|
|
if (VARATT_IS_EXTERNAL(attr))
|
2013-07-02 19:35:14 +02:00
|
|
|
return heap_tuple_fetch_attr(attr);
|
|
|
|
|
2015-02-09 18:30:52 +01:00
|
|
|
/*
|
|
|
|
* Copy into the caller's memory context, in case caller tries to
|
|
|
|
* pfree the result.
|
|
|
|
*/
|
2013-07-02 19:35:14 +02:00
|
|
|
result = (struct varlena *) palloc(VARSIZE_ANY(attr));
|
|
|
|
memcpy(result, attr, VARSIZE_ANY(attr));
|
|
|
|
}
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_EXPANDED(attr))
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This is an expanded-object pointer --- get flat format
|
|
|
|
*/
|
|
|
|
ExpandedObjectHeader *eoh;
|
|
|
|
Size resultsize;
|
|
|
|
|
|
|
|
eoh = DatumGetEOHP(PointerGetDatum(attr));
|
|
|
|
resultsize = EOH_get_flat_size(eoh);
|
|
|
|
result = (struct varlena *) palloc(resultsize);
|
|
|
|
EOH_flatten_into(eoh, (void *) result, resultsize);
|
|
|
|
}
|
2000-07-21 12:31:31 +02:00
|
|
|
else
|
2000-07-22 13:18:47 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* This is a plain value inside of the main tuple - why am I called?
|
2000-07-22 13:18:47 +02:00
|
|
|
*/
|
|
|
|
result = attr;
|
2001-03-22 05:01:46 +01:00
|
|
|
}
|
2000-07-22 13:18:47 +02:00
|
|
|
|
|
|
|
return result;
|
2000-07-21 12:31:31 +02:00
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* heap_tuple_untoast_attr -
|
|
|
|
*
|
|
|
|
* Public entry point to get back a toasted value from compression
|
2015-02-09 18:30:52 +01:00
|
|
|
* or external storage. The result is always non-extended varlena form.
|
|
|
|
*
|
|
|
|
* Note some callers assume that if the input is an EXTERNAL or COMPRESSED
|
|
|
|
* datum, the result will be a pfree'able chunk.
|
2000-07-04 01:10:14 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *
|
2007-11-15 22:14:46 +01:00
|
|
|
heap_tuple_untoast_attr(struct varlena * attr)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2013-07-02 19:35:14 +02:00
|
|
|
if (VARATT_IS_EXTERNAL_ONDISK(attr))
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
/*
|
|
|
|
* This is an externally stored datum --- fetch it back from there
|
|
|
|
*/
|
|
|
|
attr = toast_fetch_datum(attr);
|
|
|
|
/* If it's compressed, decompress it */
|
2000-07-04 01:10:14 +02:00
|
|
|
if (VARATT_IS_COMPRESSED(attr))
|
|
|
|
{
|
2015-02-09 07:15:24 +01:00
|
|
|
struct varlena *tmp = attr;
|
2015-02-09 18:30:52 +01:00
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
attr = toast_decompress_datum(tmp);
|
2000-07-04 01:10:14 +02:00
|
|
|
pfree(tmp);
|
|
|
|
}
|
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
|
|
|
{
|
2015-02-09 18:30:52 +01:00
|
|
|
/*
|
|
|
|
* This is an indirect pointer --- dereference it
|
|
|
|
*/
|
2013-07-02 19:35:14 +02:00
|
|
|
struct varatt_indirect redirect;
|
2014-05-06 18:12:18 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
2014-05-06 18:12:18 +02:00
|
|
|
attr = (struct varlena *) redirect.pointer;
|
2013-07-02 19:35:14 +02:00
|
|
|
|
|
|
|
/* nested indirect Datums aren't allowed */
|
|
|
|
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
|
|
|
|
|
2015-02-09 18:30:52 +01:00
|
|
|
/* recurse in case value is still extended in some other way */
|
2013-07-02 19:35:14 +02:00
|
|
|
attr = heap_tuple_untoast_attr(attr);
|
2015-02-09 18:30:52 +01:00
|
|
|
|
|
|
|
/* if it isn't, we'd better copy it */
|
|
|
|
if (attr == (struct varlena *) redirect.pointer)
|
|
|
|
{
|
|
|
|
struct varlena *result;
|
|
|
|
|
|
|
|
result = (struct varlena *) palloc(VARSIZE_ANY(attr));
|
|
|
|
memcpy(result, attr, VARSIZE_ANY(attr));
|
|
|
|
attr = result;
|
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
}
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_EXPANDED(attr))
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This is an expanded-object pointer --- get flat format
|
|
|
|
*/
|
|
|
|
attr = heap_tuple_fetch_attr(attr);
|
|
|
|
/* flatteners are not allowed to produce compressed/short output */
|
|
|
|
Assert(!VARATT_IS_EXTENDED(attr));
|
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
else if (VARATT_IS_COMPRESSED(attr))
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* This is a compressed value inside of the main tuple
|
|
|
|
*/
|
2015-02-09 07:15:24 +01:00
|
|
|
attr = toast_decompress_datum(attr);
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
2007-04-06 06:21:44 +02:00
|
|
|
else if (VARATT_IS_SHORT(attr))
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2007-04-06 06:21:44 +02:00
|
|
|
* This is a short-header varlena --- convert to 4-byte header format
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2007-11-15 22:14:46 +01:00
|
|
|
Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT;
|
|
|
|
Size new_size = data_size + VARHDRSZ;
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *new_attr;
|
|
|
|
|
|
|
|
new_attr = (struct varlena *) palloc(new_size);
|
|
|
|
SET_VARSIZE(new_attr, new_size);
|
|
|
|
memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size);
|
|
|
|
attr = new_attr;
|
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
return attr;
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2002-03-05 06:33:31 +01:00
|
|
|
/* ----------
|
|
|
|
* heap_tuple_untoast_attr_slice -
|
|
|
|
*
|
2002-09-04 22:31:48 +02:00
|
|
|
* Public entry point to get back part of a toasted value
|
|
|
|
* from compression or external storage.
|
2002-03-05 06:33:31 +01:00
|
|
|
* ----------
|
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *
|
2007-11-15 22:14:46 +01:00
|
|
|
heap_tuple_untoast_attr_slice(struct varlena * attr,
|
2007-04-06 06:21:44 +02:00
|
|
|
int32 sliceoffset, int32 slicelength)
|
2002-03-05 06:33:31 +01:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *preslice;
|
|
|
|
struct varlena *result;
|
2007-11-15 22:14:46 +01:00
|
|
|
char *attrdata;
|
2002-09-04 22:31:48 +02:00
|
|
|
int32 attrsize;
|
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (VARATT_IS_EXTERNAL_ONDISK(attr))
|
2002-03-05 06:33:31 +01:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varatt_external toast_pointer;
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2007-09-30 21:54:58 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
2007-04-06 06:21:44 +02:00
|
|
|
|
|
|
|
/* fast path for non-compressed external datums */
|
|
|
|
if (!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
|
|
|
|
return toast_fetch_datum_slice(attr, sliceoffset, slicelength);
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
/* fetch it back (compressed marker will get set automatically) */
|
|
|
|
preslice = toast_fetch_datum(attr);
|
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
|
|
|
{
|
|
|
|
struct varatt_indirect redirect;
|
2014-05-06 18:12:18 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
|
|
|
|
|
|
|
/* nested indirect Datums aren't allowed */
|
|
|
|
Assert(!VARATT_IS_EXTERNAL_INDIRECT(redirect.pointer));
|
|
|
|
|
|
|
|
return heap_tuple_untoast_attr_slice(redirect.pointer,
|
|
|
|
sliceoffset, slicelength);
|
|
|
|
}
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_EXPANDED(attr))
|
|
|
|
{
|
|
|
|
/* pass it off to heap_tuple_fetch_attr to flatten */
|
|
|
|
preslice = heap_tuple_fetch_attr(attr);
|
|
|
|
}
|
2007-04-06 06:21:44 +02:00
|
|
|
else
|
|
|
|
preslice = attr;
|
|
|
|
|
2015-02-09 18:30:52 +01:00
|
|
|
Assert(!VARATT_IS_EXTERNAL(preslice));
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
if (VARATT_IS_COMPRESSED(preslice))
|
|
|
|
{
|
2015-02-09 07:15:24 +01:00
|
|
|
struct varlena *tmp = preslice;
|
2015-02-09 18:30:52 +01:00
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
preslice = toast_decompress_datum(tmp);
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
if (tmp != attr)
|
2002-03-05 06:33:31 +01:00
|
|
|
pfree(tmp);
|
|
|
|
}
|
2007-04-06 06:21:44 +02:00
|
|
|
|
|
|
|
if (VARATT_IS_SHORT(preslice))
|
|
|
|
{
|
|
|
|
attrdata = VARDATA_SHORT(preslice);
|
|
|
|
attrsize = VARSIZE_SHORT(preslice) - VARHDRSZ_SHORT;
|
|
|
|
}
|
2002-09-04 22:31:48 +02:00
|
|
|
else
|
2002-03-05 06:33:31 +01:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
attrdata = VARDATA(preslice);
|
|
|
|
attrsize = VARSIZE(preslice) - VARHDRSZ;
|
2002-03-05 06:33:31 +01:00
|
|
|
}
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2002-03-05 06:33:31 +01:00
|
|
|
/* slicing of datum for compressed cases and plain value */
|
2002-09-04 22:31:48 +02:00
|
|
|
|
|
|
|
if (sliceoffset >= attrsize)
|
2002-03-05 06:33:31 +01:00
|
|
|
{
|
|
|
|
sliceoffset = 0;
|
|
|
|
slicelength = 0;
|
|
|
|
}
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2002-03-05 06:33:31 +01:00
|
|
|
if (((sliceoffset + slicelength) > attrsize) || slicelength < 0)
|
|
|
|
slicelength = attrsize - sliceoffset;
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
result = (struct varlena *) palloc(slicelength + VARHDRSZ);
|
2007-02-28 00:48:10 +01:00
|
|
|
SET_VARSIZE(result, slicelength + VARHDRSZ);
|
2002-09-04 22:31:48 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
memcpy(VARDATA(result), attrdata + sliceoffset, slicelength);
|
2002-09-04 22:31:48 +02:00
|
|
|
|
|
|
|
if (preslice != attr)
|
|
|
|
pfree(preslice);
|
|
|
|
|
2002-03-05 06:33:31 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2001-05-07 02:43:27 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_raw_datum_size -
|
|
|
|
*
|
|
|
|
* Return the raw (detoasted) size of a varlena datum
|
2007-04-06 06:21:44 +02:00
|
|
|
* (including the VARHDRSZ header)
|
2001-05-07 02:43:27 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
Size
|
|
|
|
toast_raw_datum_size(Datum value)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
|
2001-05-07 02:43:27 +02:00
|
|
|
Size result;
|
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (VARATT_IS_EXTERNAL_ONDISK(attr))
|
2001-05-07 02:43:27 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
/* va_rawsize is the size of the original datum -- including header */
|
|
|
|
struct varatt_external toast_pointer;
|
|
|
|
|
2007-09-30 21:54:58 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
2007-04-06 06:21:44 +02:00
|
|
|
result = toast_pointer.va_rawsize;
|
2001-05-07 02:43:27 +02:00
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
|
|
|
{
|
|
|
|
struct varatt_indirect toast_pointer;
|
2014-05-06 18:12:18 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
|
|
|
|
|
|
|
/* nested indirect Datums aren't allowed */
|
|
|
|
Assert(!VARATT_IS_EXTERNAL_INDIRECT(toast_pointer.pointer));
|
|
|
|
|
|
|
|
return toast_raw_datum_size(PointerGetDatum(toast_pointer.pointer));
|
|
|
|
}
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_EXPANDED(attr))
|
|
|
|
{
|
|
|
|
result = EOH_get_flat_size(DatumGetEOHP(value));
|
|
|
|
}
|
2007-04-06 06:21:44 +02:00
|
|
|
else if (VARATT_IS_COMPRESSED(attr))
|
|
|
|
{
|
|
|
|
/* here, va_rawsize is just the payload size */
|
|
|
|
result = VARRAWSIZE_4B_C(attr) + VARHDRSZ;
|
|
|
|
}
|
|
|
|
else if (VARATT_IS_SHORT(attr))
|
2001-05-07 02:43:27 +02:00
|
|
|
{
|
|
|
|
/*
|
2007-04-06 06:21:44 +02:00
|
|
|
* we have to normalize the header length to VARHDRSZ or else the
|
|
|
|
* callers of this function will be confused.
|
2001-05-07 02:43:27 +02:00
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
result = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT + VARHDRSZ;
|
2001-05-07 02:43:27 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* plain untoasted datum */
|
|
|
|
result = VARSIZE(attr);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2005-08-02 18:11:57 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_datum_size
|
|
|
|
*
|
|
|
|
* Return the physical storage size (possibly compressed) of a varlena datum
|
|
|
|
* ----------
|
|
|
|
*/
|
2005-10-15 04:49:52 +02:00
|
|
|
Size
|
2005-08-02 18:11:57 +02:00
|
|
|
toast_datum_size(Datum value)
|
|
|
|
{
|
2007-11-15 22:14:46 +01:00
|
|
|
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
|
2005-08-02 18:11:57 +02:00
|
|
|
Size result;
|
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (VARATT_IS_EXTERNAL_ONDISK(attr))
|
2005-08-02 18:11:57 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Attribute is stored externally - return the extsize whether
|
2005-10-15 04:49:52 +02:00
|
|
|
* compressed or not. We do not count the size of the toast pointer
|
|
|
|
* ... should we?
|
2005-08-02 18:11:57 +02:00
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varatt_external toast_pointer;
|
|
|
|
|
2007-09-30 21:54:58 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
2007-04-06 06:21:44 +02:00
|
|
|
result = toast_pointer.va_extsize;
|
|
|
|
}
|
2013-07-02 19:35:14 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
|
|
|
{
|
|
|
|
struct varatt_indirect toast_pointer;
|
2014-05-06 18:12:18 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
|
|
|
|
|
|
|
/* nested indirect Datums aren't allowed */
|
|
|
|
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
|
|
|
|
|
|
|
|
return toast_datum_size(PointerGetDatum(toast_pointer.pointer));
|
|
|
|
}
|
Support "expanded" objects, particularly arrays, for better performance.
This patch introduces the ability for complex datatypes to have an
in-memory representation that is different from their on-disk format.
On-disk formats are typically optimized for minimal size, and in any case
they can't contain pointers, so they are often not well-suited for
computation. Now a datatype can invent an "expanded" in-memory format
that is better suited for its operations, and then pass that around among
the C functions that operate on the datatype. There are also provisions
(rudimentary as yet) to allow an expanded object to be modified in-place
under suitable conditions, so that operations like assignment to an element
of an array need not involve copying the entire array.
The initial application for this feature is arrays, but it is not hard
to foresee using it for other container types like JSON, XML and hstore.
I have hopes that it will be useful to PostGIS as well.
In this initial implementation, a few heuristics have been hard-wired
into plpgsql to improve performance for arrays that are stored in
plpgsql variables. We would like to generalize those hacks so that
other datatypes can obtain similar improvements, but figuring out some
appropriate APIs is left as a task for future work. (The heuristics
themselves are probably not optimal yet, either, as they sometimes
force expansion of arrays that would be better left alone.)
Preliminary performance testing shows impressive speed gains for plpgsql
functions that do element-by-element access or update of large arrays.
There are other cases that get a little slower, as a result of added array
format conversions; but we can hope to improve anything that's annoyingly
bad. In any case most applications should see a net win.
Tom Lane, reviewed by Andres Freund
2015-05-14 18:08:40 +02:00
|
|
|
else if (VARATT_IS_EXTERNAL_EXPANDED(attr))
|
|
|
|
{
|
|
|
|
result = EOH_get_flat_size(DatumGetEOHP(value));
|
|
|
|
}
|
2007-04-06 06:21:44 +02:00
|
|
|
else if (VARATT_IS_SHORT(attr))
|
|
|
|
{
|
|
|
|
result = VARSIZE_SHORT(attr);
|
2005-08-02 18:11:57 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Attribute is stored inline either compressed or not, just calculate
|
|
|
|
* the size of the datum in either case.
|
2005-08-02 18:11:57 +02:00
|
|
|
*/
|
|
|
|
result = VARSIZE(attr);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2001-05-07 02:43:27 +02:00
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_delete -
|
|
|
|
*
|
|
|
|
* Cascaded delete toast-entries on DELETE
|
|
|
|
* ----------
|
|
|
|
*/
|
2005-11-20 19:38:20 +01:00
|
|
|
void
|
2000-07-04 01:10:14 +02:00
|
|
|
toast_delete(Relation rel, HeapTuple oldtup)
|
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
TupleDesc tupleDesc;
|
|
|
|
Form_pg_attribute *att;
|
|
|
|
int numAttrs;
|
|
|
|
int i;
|
2004-06-04 22:35:21 +02:00
|
|
|
Datum toast_values[MaxHeapAttributeNumber];
|
2005-03-21 02:24:04 +01:00
|
|
|
bool toast_isnull[MaxHeapAttributeNumber];
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2007-04-03 06:14:26 +02:00
|
|
|
/*
|
2013-03-04 01:23:31 +01:00
|
|
|
* We should only ever be called for tuples of plain relations or
|
|
|
|
* materialized views --- recursing on a toast rel is bad news.
|
2007-04-03 06:14:26 +02:00
|
|
|
*/
|
2013-03-04 01:23:31 +01:00
|
|
|
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
|
|
|
|
rel->rd_rel->relkind == RELKIND_MATVIEW);
|
2007-04-03 06:14:26 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-06-04 22:35:21 +02:00
|
|
|
* Get the tuple descriptor and break down the tuple into fields.
|
|
|
|
*
|
2008-11-02 02:45:28 +01:00
|
|
|
* NOTE: it's debatable whether to use heap_deform_tuple() here or just
|
2005-10-15 04:49:52 +02:00
|
|
|
* heap_getattr() only the varlena columns. The latter could win if there
|
|
|
|
* are few varlena columns and many non-varlena ones. However,
|
2008-11-02 02:45:28 +01:00
|
|
|
* heap_deform_tuple costs only O(N) while the heap_getattr way would cost
|
2005-10-15 04:49:52 +02:00
|
|
|
* O(N^2) if there are many varlena columns, so it seems better to err on
|
|
|
|
* the side of linear cost. (We won't even be here unless there's at
|
|
|
|
* least one varlena column, by the way.)
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
tupleDesc = rel->rd_att;
|
|
|
|
att = tupleDesc->attrs;
|
2004-06-04 22:35:21 +02:00
|
|
|
numAttrs = tupleDesc->natts;
|
|
|
|
|
|
|
|
Assert(numAttrs <= MaxHeapAttributeNumber);
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Check for external stored attributes and delete them from the secondary
|
|
|
|
* relation.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
2001-01-15 06:29:19 +01:00
|
|
|
if (att[i]->attlen == -1)
|
|
|
|
{
|
2004-08-29 07:07:03 +02:00
|
|
|
Datum value = toast_values[i];
|
2004-06-04 22:35:21 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (toast_isnull[i])
|
|
|
|
continue;
|
|
|
|
else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value)))
|
2000-07-04 01:10:14 +02:00
|
|
|
toast_delete_datum(rel, value);
|
2001-01-15 06:29:19 +01:00
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* toast_insert_or_update -
|
|
|
|
*
|
2000-08-04 06:16:17 +02:00
|
|
|
* Delete no-longer-used toast-entries and create new ones to
|
2000-07-04 01:10:14 +02:00
|
|
|
* make the new tuple fit on INSERT or UPDATE
|
2005-11-20 19:38:20 +01:00
|
|
|
*
|
|
|
|
* Inputs:
|
|
|
|
* newtup: the candidate new tuple to be inserted
|
|
|
|
* oldtup: the old row version for UPDATE, or NULL for INSERT
|
2008-11-06 21:51:15 +01:00
|
|
|
* options: options to be passed to heap_insert() for toast rows
|
2005-11-20 19:38:20 +01:00
|
|
|
* Result:
|
|
|
|
* either newtup if no toasting is needed, or a palloc'd modified tuple
|
|
|
|
* that is what should actually get stored
|
|
|
|
*
|
|
|
|
* NOTE: neither newtup nor oldtup will be modified. This is a change
|
|
|
|
* from the pre-8.1 API of this routine.
|
2000-07-04 01:10:14 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
2005-11-20 19:38:20 +01:00
|
|
|
HeapTuple
|
2007-03-29 02:15:39 +02:00
|
|
|
toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
|
2008-11-06 21:51:15 +01:00
|
|
|
int options)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2005-11-20 19:38:20 +01:00
|
|
|
HeapTuple result_tuple;
|
2001-03-22 05:01:46 +01:00
|
|
|
TupleDesc tupleDesc;
|
|
|
|
Form_pg_attribute *att;
|
|
|
|
int numAttrs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
bool need_change = false;
|
|
|
|
bool need_free = false;
|
|
|
|
bool need_delold = false;
|
|
|
|
bool has_nulls = false;
|
|
|
|
|
|
|
|
Size maxDataLen;
|
2007-04-03 06:14:26 +02:00
|
|
|
Size hoff;
|
2001-03-22 05:01:46 +01:00
|
|
|
|
|
|
|
char toast_action[MaxHeapAttributeNumber];
|
2005-03-21 02:24:04 +01:00
|
|
|
bool toast_isnull[MaxHeapAttributeNumber];
|
|
|
|
bool toast_oldisnull[MaxHeapAttributeNumber];
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum toast_values[MaxHeapAttributeNumber];
|
2004-06-04 22:35:21 +02:00
|
|
|
Datum toast_oldvalues[MaxHeapAttributeNumber];
|
2011-08-16 19:48:04 +02:00
|
|
|
struct varlena *toast_oldexternal[MaxHeapAttributeNumber];
|
2001-03-22 05:01:46 +01:00
|
|
|
int32 toast_sizes[MaxHeapAttributeNumber];
|
|
|
|
bool toast_free[MaxHeapAttributeNumber];
|
|
|
|
bool toast_delold[MaxHeapAttributeNumber];
|
2000-07-04 01:10:14 +02:00
|
|
|
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
/*
|
|
|
|
* Ignore the INSERT_SPECULATIVE option. Speculative insertions/super
|
|
|
|
* deletions just normally insert/delete the toast values. It seems
|
|
|
|
* easiest to deal with that here, instead on, potentially, multiple
|
|
|
|
* callers.
|
|
|
|
*/
|
|
|
|
options &= ~HEAP_INSERT_SPECULATIVE;
|
|
|
|
|
2007-04-03 06:14:26 +02:00
|
|
|
/*
|
2013-07-05 21:25:51 +02:00
|
|
|
* We should only ever be called for tuples of plain relations or
|
|
|
|
* materialized views --- recursing on a toast rel is bad news.
|
2007-04-03 06:14:26 +02:00
|
|
|
*/
|
2013-03-04 01:23:31 +01:00
|
|
|
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
|
|
|
|
rel->rd_rel->relkind == RELKIND_MATVIEW);
|
2007-04-03 06:14:26 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-06-04 22:35:21 +02:00
|
|
|
* Get the tuple descriptor and break down the tuple(s) into fields.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
tupleDesc = rel->rd_att;
|
|
|
|
att = tupleDesc->attrs;
|
2004-06-04 22:35:21 +02:00
|
|
|
numAttrs = tupleDesc->natts;
|
|
|
|
|
|
|
|
Assert(numAttrs <= MaxHeapAttributeNumber);
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull);
|
2004-06-04 22:35:21 +02:00
|
|
|
if (oldtup != NULL)
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* Then collect information about the values given
|
2000-10-24 01:42:04 +02:00
|
|
|
*
|
|
|
|
* NOTE: toast_action[i] can have these values:
|
|
|
|
* ' ' default handling
|
|
|
|
* 'p' already processed --- don't touch it
|
|
|
|
* 'x' incompressible, but OK to move off
|
2004-06-04 22:35:21 +02:00
|
|
|
*
|
|
|
|
* NOTE: toast_sizes[i] is only made valid for varlena attributes with
|
|
|
|
* toast_action[i] different from 'p'.
|
2000-07-04 01:10:14 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
memset(toast_action, ' ', numAttrs * sizeof(char));
|
2011-08-16 19:48:04 +02:00
|
|
|
memset(toast_oldexternal, 0, numAttrs * sizeof(struct varlena *));
|
2001-03-22 05:01:46 +01:00
|
|
|
memset(toast_free, 0, numAttrs * sizeof(bool));
|
|
|
|
memset(toast_delold, 0, numAttrs * sizeof(bool));
|
2004-06-04 22:35:21 +02:00
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
2007-11-15 22:14:46 +01:00
|
|
|
struct varlena *old_value;
|
|
|
|
struct varlena *new_value;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
|
|
|
if (oldtup != NULL)
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* For UPDATE get the old and new values of this attribute
|
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
old_value = (struct varlena *) DatumGetPointer(toast_oldvalues[i]);
|
|
|
|
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2013-07-02 19:35:14 +02:00
|
|
|
* If the old value is stored on disk, check if it has changed so
|
|
|
|
* we have to delete it later.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2005-03-21 02:24:04 +01:00
|
|
|
if (att[i]->attlen == -1 && !toast_oldisnull[i] &&
|
2013-07-02 19:35:14 +02:00
|
|
|
VARATT_IS_EXTERNAL_ONDISK(old_value))
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2013-07-02 19:35:14 +02:00
|
|
|
if (toast_isnull[i] || !VARATT_IS_EXTERNAL_ONDISK(new_value) ||
|
2007-11-15 22:14:46 +01:00
|
|
|
memcmp((char *) old_value, (char *) new_value,
|
2007-09-30 21:54:58 +02:00
|
|
|
VARSIZE_EXTERNAL(old_value)) != 0)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-06-04 22:35:21 +02:00
|
|
|
* The old external stored value isn't needed any more
|
2001-03-22 07:16:21 +01:00
|
|
|
* after the update
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
toast_delold[i] = true;
|
|
|
|
need_delold = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* This attribute isn't changed by this update so we reuse
|
|
|
|
* the original reference to the old value in the new
|
|
|
|
* tuple.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
toast_action[i] = 'p';
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* For INSERT simply get the new value
|
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Handle NULL attributes
|
|
|
|
*/
|
2005-03-21 02:24:04 +01:00
|
|
|
if (toast_isnull[i])
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
|
|
|
toast_action[i] = 'p';
|
|
|
|
has_nulls = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-06-04 22:35:21 +02:00
|
|
|
* Now look at varlena attributes
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
if (att[i]->attlen == -1)
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-08-01 00:39:17 +02:00
|
|
|
* If the table's attribute says PLAIN always, force it so.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
if (att[i]->attstorage == 'p')
|
|
|
|
toast_action[i] = 'p';
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2001-02-09 18:30:03 +01:00
|
|
|
* We took care of UPDATE above, so any external value we find
|
2015-02-09 18:30:52 +01:00
|
|
|
* still in the tuple must be someone else's that we cannot reuse
|
|
|
|
* (this includes the case of an out-of-line in-memory datum).
|
2007-09-27 01:29:10 +02:00
|
|
|
* Fetch it back (without decompression, unless we are forcing
|
2014-05-06 18:12:18 +02:00
|
|
|
* PLAIN storage). If necessary, we'll push it out as a new
|
2007-09-27 01:29:10 +02:00
|
|
|
* external value below.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2004-06-05 03:55:05 +02:00
|
|
|
if (VARATT_IS_EXTERNAL(new_value))
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2011-08-16 19:48:04 +02:00
|
|
|
toast_oldexternal[i] = new_value;
|
2007-09-27 01:29:10 +02:00
|
|
|
if (att[i]->attstorage == 'p')
|
|
|
|
new_value = heap_tuple_untoast_attr(new_value);
|
|
|
|
else
|
|
|
|
new_value = heap_tuple_fetch_attr(new_value);
|
2004-06-05 03:55:05 +02:00
|
|
|
toast_values[i] = PointerGetDatum(new_value);
|
2000-07-04 01:10:14 +02:00
|
|
|
toast_free[i] = true;
|
|
|
|
need_change = true;
|
|
|
|
need_free = true;
|
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Remember the size of this attribute
|
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
toast_sizes[i] = VARSIZE_ANY(new_value);
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-06-04 22:35:21 +02:00
|
|
|
* Not a varlena attribute, plain storage always
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
toast_action[i] = 'p';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
2000-08-04 06:16:17 +02:00
|
|
|
* Compress and/or save external until data fits into target length
|
2000-07-04 01:10:14 +02:00
|
|
|
*
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
* 1: Inline compress attributes with attstorage 'x', and store very
|
|
|
|
* large attributes with attstorage 'x' or 'e' external immediately
|
2000-07-04 01:10:14 +02:00
|
|
|
* 2: Store attributes with attstorage 'x' or 'e' external
|
2001-03-22 05:01:46 +01:00
|
|
|
* 3: Inline compress attributes with attstorage 'm'
|
2000-07-04 01:10:14 +02:00
|
|
|
* 4: Store attributes with attstorage 'm' external
|
|
|
|
* ----------
|
|
|
|
*/
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
|
|
|
|
/* compute header overhead --- this should match heap_form_tuple() */
|
2015-02-21 21:13:06 +01:00
|
|
|
hoff = SizeofHeapTupleHeader;
|
2000-07-04 01:10:14 +02:00
|
|
|
if (has_nulls)
|
2007-04-03 06:14:26 +02:00
|
|
|
hoff += BITMAPLEN(numAttrs);
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
if (newtup->t_data->t_infomask & HEAP_HASOID)
|
2007-04-03 06:14:26 +02:00
|
|
|
hoff += sizeof(Oid);
|
|
|
|
hoff = MAXALIGN(hoff);
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
/* now convert to a limit on the tuple data size */
|
2007-04-03 06:14:26 +02:00
|
|
|
maxDataLen = TOAST_TUPLE_TARGET - hoff;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
* Look for attributes with attstorage 'x' to compress. Also find large
|
|
|
|
* attributes with attstorage 'x' or 'e', and store them external.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
while (heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull) > maxDataLen)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
int biggest_attno = -1;
|
2007-10-01 18:25:56 +02:00
|
|
|
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum old_value;
|
|
|
|
Datum new_value;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
* Search for the biggest yet unprocessed internal attribute
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
2000-10-24 01:42:04 +02:00
|
|
|
if (toast_action[i] != ' ')
|
2000-07-04 01:10:14 +02:00
|
|
|
continue;
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
|
2007-09-27 01:29:10 +02:00
|
|
|
continue; /* can't happen, toast_action would be 'p' */
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_COMPRESSED(DatumGetPointer(toast_values[i])))
|
2000-07-04 01:10:14 +02:00
|
|
|
continue;
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
|
2000-07-04 01:10:14 +02:00
|
|
|
continue;
|
|
|
|
if (toast_sizes[i] > biggest_size)
|
|
|
|
{
|
|
|
|
biggest_attno = i;
|
2001-03-22 05:01:46 +01:00
|
|
|
biggest_size = toast_sizes[i];
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (biggest_attno < 0)
|
|
|
|
break;
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
* Attempt to compress it inline, if it has attstorage 'x'
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
i = biggest_attno;
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
if (att[i]->attstorage == 'x')
|
|
|
|
{
|
|
|
|
old_value = toast_values[i];
|
|
|
|
new_value = toast_compress_datum(old_value);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
if (DatumGetPointer(new_value) != NULL)
|
|
|
|
{
|
|
|
|
/* successful compression */
|
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(old_value));
|
|
|
|
toast_values[i] = new_value;
|
|
|
|
toast_free[i] = true;
|
2008-04-17 23:37:28 +02:00
|
|
|
toast_sizes[i] = VARSIZE(DatumGetPointer(toast_values[i]));
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
need_change = true;
|
|
|
|
need_free = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* incompressible, ignore on subsequent compression passes */
|
|
|
|
toast_action[i] = 'x';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2000-10-24 01:42:04 +02:00
|
|
|
{
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
/* has attstorage 'e', ignore on subsequent compression passes */
|
|
|
|
toast_action[i] = 'x';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this value is by itself more than maxDataLen (after compression
|
|
|
|
* if any), push it out to the toast table immediately, if possible.
|
|
|
|
* This avoids uselessly compressing other fields in the common case
|
|
|
|
* where we have one long field and several short ones.
|
|
|
|
*
|
|
|
|
* XXX maybe the threshold should be less than maxDataLen?
|
|
|
|
*/
|
|
|
|
if (toast_sizes[i] > maxDataLen &&
|
|
|
|
rel->rd_rel->reltoastrelid != InvalidOid)
|
|
|
|
{
|
|
|
|
old_value = toast_values[i];
|
|
|
|
toast_action[i] = 'p';
|
2011-08-16 19:48:04 +02:00
|
|
|
toast_values[i] = toast_save_datum(rel, toast_values[i],
|
|
|
|
toast_oldexternal[i], options);
|
2000-10-24 01:42:04 +02:00
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(old_value));
|
2001-03-22 05:01:46 +01:00
|
|
|
toast_free[i] = true;
|
|
|
|
need_change = true;
|
|
|
|
need_free = true;
|
2000-10-24 01:42:04 +02:00
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Second we look for attributes of attstorage 'x' or 'e' that are still
|
2014-05-06 18:12:18 +02:00
|
|
|
* inline. But skip this if there's no toast table to push them to.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
while (heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull) > maxDataLen &&
|
|
|
|
rel->rd_rel->reltoastrelid != InvalidOid)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
int biggest_attno = -1;
|
2007-10-01 18:25:56 +02:00
|
|
|
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum old_value;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-23 05:49:58 +01:00
|
|
|
/*------
|
|
|
|
* Search for the biggest yet inlined attribute with
|
|
|
|
* attstorage equals 'x' or 'e'
|
|
|
|
*------
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
|
|
|
if (toast_action[i] == 'p')
|
|
|
|
continue;
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
|
2007-09-27 01:29:10 +02:00
|
|
|
continue; /* can't happen, toast_action would be 'p' */
|
2000-07-04 01:10:14 +02:00
|
|
|
if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
|
|
|
|
continue;
|
|
|
|
if (toast_sizes[i] > biggest_size)
|
|
|
|
{
|
|
|
|
biggest_attno = i;
|
2001-03-22 05:01:46 +01:00
|
|
|
biggest_size = toast_sizes[i];
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (biggest_attno < 0)
|
|
|
|
break;
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Store this external
|
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
i = biggest_attno;
|
|
|
|
old_value = toast_values[i];
|
|
|
|
toast_action[i] = 'p';
|
2011-08-16 19:48:04 +02:00
|
|
|
toast_values[i] = toast_save_datum(rel, toast_values[i],
|
|
|
|
toast_oldexternal[i], options);
|
2000-07-04 01:10:14 +02:00
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(old_value));
|
2001-03-22 05:01:46 +01:00
|
|
|
toast_free[i] = true;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
|
|
|
need_change = true;
|
2001-03-22 05:01:46 +01:00
|
|
|
need_free = true;
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
|
|
|
* Round 3 - this time we take attributes with storage 'm' into
|
|
|
|
* compression
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
while (heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull) > maxDataLen)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
int biggest_attno = -1;
|
2007-10-01 18:25:56 +02:00
|
|
|
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum old_value;
|
|
|
|
Datum new_value;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Search for the biggest yet uncompressed internal attribute
|
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
2000-10-24 01:42:04 +02:00
|
|
|
if (toast_action[i] != ' ')
|
2000-07-04 01:10:14 +02:00
|
|
|
continue;
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
|
2007-09-27 01:29:10 +02:00
|
|
|
continue; /* can't happen, toast_action would be 'p' */
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_COMPRESSED(DatumGetPointer(toast_values[i])))
|
2000-07-04 01:10:14 +02:00
|
|
|
continue;
|
|
|
|
if (att[i]->attstorage != 'm')
|
|
|
|
continue;
|
|
|
|
if (toast_sizes[i] > biggest_size)
|
|
|
|
{
|
|
|
|
biggest_attno = i;
|
2001-03-22 05:01:46 +01:00
|
|
|
biggest_size = toast_sizes[i];
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (biggest_attno < 0)
|
|
|
|
break;
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-10-24 01:42:04 +02:00
|
|
|
* Attempt to compress it inline
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
i = biggest_attno;
|
|
|
|
old_value = toast_values[i];
|
|
|
|
new_value = toast_compress_datum(old_value);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2000-10-24 01:42:04 +02:00
|
|
|
if (DatumGetPointer(new_value) != NULL)
|
|
|
|
{
|
|
|
|
/* successful compression */
|
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(old_value));
|
2001-03-22 05:01:46 +01:00
|
|
|
toast_values[i] = new_value;
|
|
|
|
toast_free[i] = true;
|
2008-04-17 23:37:28 +02:00
|
|
|
toast_sizes[i] = VARSIZE(DatumGetPointer(toast_values[i]));
|
2001-03-22 05:01:46 +01:00
|
|
|
need_change = true;
|
|
|
|
need_free = true;
|
2000-10-24 01:42:04 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
/* incompressible, ignore on subsequent compression passes */
|
2000-10-24 01:42:04 +02:00
|
|
|
toast_action[i] = 'x';
|
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Finally we store attributes of type 'm' externally. At this point we
|
2010-02-26 03:01:40 +01:00
|
|
|
* increase the target tuple size, so that 'm' attributes aren't stored
|
|
|
|
* externally unless really necessary.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2009-07-22 03:21:22 +02:00
|
|
|
maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff;
|
|
|
|
|
Don't MAXALIGN in the checks to decide whether a tuple is over TOAST's
threshold for tuple length. On 4-byte-MAXALIGN machines, the toast code
creates tuples that have t_len exactly TOAST_TUPLE_THRESHOLD ... but this
number is not itself maxaligned, so if heap_insert maxaligns t_len before
comparing to TOAST_TUPLE_THRESHOLD, it'll uselessly recurse back to
tuptoaster.c, wasting cycles. (It turns out that this does not happen on
8-byte-MAXALIGN machines, because for them the outer MAXALIGN in the
TOAST_MAX_CHUNK_SIZE macro reduces TOAST_MAX_CHUNK_SIZE so that toast tuples
will be less than TOAST_TUPLE_THRESHOLD in size. That MAXALIGN is really
incorrect, but we can't remove it now, see below.) There isn't any particular
value in maxaligning before comparing to the thresholds, so just don't do
that, which saves a small number of cycles in itself.
These numbers should be rejiggered to minimize wasted space on toast-relation
pages, but we can't do that in the back branches because changing
TOAST_MAX_CHUNK_SIZE would force an initdb (by changing the contents of toast
tables). We can move the toast decision thresholds a bit, though, which is
what this patch effectively does.
Thanks to Pavan Deolasee for discovering the unintended recursion.
Back-patch into 8.2, but not further, pending more testing. (HEAD is about
to get a further patch modifying the thresholds, so it won't help much
for testing this form of the patch.)
2007-02-04 21:00:37 +01:00
|
|
|
while (heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull) > maxDataLen &&
|
|
|
|
rel->rd_rel->reltoastrelid != InvalidOid)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
int biggest_attno = -1;
|
2007-10-01 18:25:56 +02:00
|
|
|
int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
|
2001-03-22 05:01:46 +01:00
|
|
|
Datum old_value;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-23 05:49:58 +01:00
|
|
|
/*--------
|
|
|
|
* Search for the biggest yet inlined attribute with
|
|
|
|
* attstorage = 'm'
|
|
|
|
*--------
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
|
|
|
if (toast_action[i] == 'p')
|
|
|
|
continue;
|
2008-04-17 23:37:28 +02:00
|
|
|
if (VARATT_IS_EXTERNAL(DatumGetPointer(toast_values[i])))
|
2007-09-27 01:29:10 +02:00
|
|
|
continue; /* can't happen, toast_action would be 'p' */
|
2000-07-04 01:10:14 +02:00
|
|
|
if (att[i]->attstorage != 'm')
|
|
|
|
continue;
|
|
|
|
if (toast_sizes[i] > biggest_size)
|
|
|
|
{
|
|
|
|
biggest_attno = i;
|
2001-03-22 05:01:46 +01:00
|
|
|
biggest_size = toast_sizes[i];
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (biggest_attno < 0)
|
|
|
|
break;
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Store this external
|
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
i = biggest_attno;
|
|
|
|
old_value = toast_values[i];
|
|
|
|
toast_action[i] = 'p';
|
2011-08-16 19:48:04 +02:00
|
|
|
toast_values[i] = toast_save_datum(rel, toast_values[i],
|
|
|
|
toast_oldexternal[i], options);
|
2000-07-04 01:10:14 +02:00
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(old_value));
|
2001-03-22 05:01:46 +01:00
|
|
|
toast_free[i] = true;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
|
|
|
need_change = true;
|
2001-03-22 05:01:46 +01:00
|
|
|
need_free = true;
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* In the case we toasted any values, we need to build a new heap tuple
|
|
|
|
* with the changed values.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
|
|
|
if (need_change)
|
|
|
|
{
|
2002-05-27 21:53:33 +02:00
|
|
|
HeapTupleHeader olddata = newtup->t_data;
|
2005-11-20 19:38:20 +01:00
|
|
|
HeapTupleHeader new_data;
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
int32 new_header_len;
|
2007-04-06 06:21:44 +02:00
|
|
|
int32 new_data_len;
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
int32 new_tuple_len;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
* Calculate the new size of the tuple.
|
|
|
|
*
|
|
|
|
* Note: we used to assume here that the old tuple's t_hoff must equal
|
|
|
|
* the new_header_len value, but that was incorrect. The old tuple
|
|
|
|
* might have a smaller-than-current natts, if there's been an ALTER
|
|
|
|
* TABLE ADD COLUMN since it was stored; and that would lead to a
|
|
|
|
* different conclusion about the size of the null bitmap, or even
|
|
|
|
* whether there needs to be one at all.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2015-02-21 21:13:06 +01:00
|
|
|
new_header_len = SizeofHeapTupleHeader;
|
2000-07-04 01:10:14 +02:00
|
|
|
if (has_nulls)
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_header_len += BITMAPLEN(numAttrs);
|
2002-09-02 03:05:06 +02:00
|
|
|
if (olddata->t_infomask & HEAP_HASOID)
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_header_len += sizeof(Oid);
|
|
|
|
new_header_len = MAXALIGN(new_header_len);
|
2007-04-06 06:21:44 +02:00
|
|
|
new_data_len = heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull);
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_tuple_len = new_header_len + new_data_len;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2005-11-20 19:38:20 +01:00
|
|
|
* Allocate and zero the space needed, and fill HeapTupleData fields.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_tuple_len);
|
|
|
|
result_tuple->t_len = new_tuple_len;
|
2005-11-20 19:38:20 +01:00
|
|
|
result_tuple->t_self = newtup->t_self;
|
|
|
|
result_tuple->t_tableOid = newtup->t_tableOid;
|
|
|
|
new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE);
|
|
|
|
result_tuple->t_data = new_data;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
* Copy the existing tuple header, but adjust natts and t_hoff.
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2015-02-21 21:13:06 +01:00
|
|
|
memcpy(new_data, olddata, SizeofHeapTupleHeader);
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
HeapTupleHeaderSetNatts(new_data, numAttrs);
|
|
|
|
new_data->t_hoff = new_header_len;
|
|
|
|
if (olddata->t_infomask & HEAP_HASOID)
|
|
|
|
HeapTupleHeaderSetOid(new_data, HeapTupleHeaderGetOid(olddata));
|
2000-07-04 01:10:14 +02:00
|
|
|
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
/* Copy over the data, and fill the null bitmap if needed */
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_fill_tuple(tupleDesc,
|
|
|
|
toast_values,
|
|
|
|
toast_isnull,
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
(char *) new_data + new_header_len,
|
2007-04-06 06:21:44 +02:00
|
|
|
new_data_len,
|
2005-11-20 19:38:20 +01:00
|
|
|
&(new_data->t_infomask),
|
|
|
|
has_nulls ? new_data->t_bits : NULL);
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
2005-11-20 19:38:20 +01:00
|
|
|
else
|
|
|
|
result_tuple = newtup;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Free allocated temp values
|
|
|
|
*/
|
|
|
|
if (need_free)
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(toast_values[i]));
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Delete external values from the old tuple
|
|
|
|
*/
|
|
|
|
if (need_delold)
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
if (toast_delold[i])
|
2004-06-04 22:35:21 +02:00
|
|
|
toast_delete_datum(rel, toast_oldvalues[i]);
|
2005-11-20 19:38:20 +01:00
|
|
|
|
|
|
|
return result_tuple;
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
Fix race condition with toast table access from a stale syscache entry.
If a tuple in a syscache contains an out-of-line toasted field, and we
try to fetch that field shortly after some other transaction has committed
an update or deletion of the tuple, there is a race condition: vacuum
could come along and remove the toast tuples before we can fetch them.
This leads to transient failures like "missing chunk number 0 for toast
value NNNNN in pg_toast_2619", as seen in recent reports from Andrew
Hammond and Tim Uckun.
The design idea of syscache is that access to stale syscache entries
should be prevented by relation-level locks, but that fails for at least
two cases where toasted fields are possible: ANALYZE updates pg_statistic
rows without locking out sessions that might want to plan queries on the
same table, and CREATE OR REPLACE FUNCTION updates pg_proc rows without
any meaningful lock at all.
The least risky fix seems to be an idea that Heikki suggested when we
were dealing with a related problem back in August: forcibly detoast any
out-of-line fields before putting a tuple into syscache in the first place.
This avoids the problem because at the time we fetch the parent tuple from
the catalog, we should be holding an MVCC snapshot that will prevent
removal of the toast tuples, even if the parent tuple is outdated
immediately after we fetch it. (Note: I'm not convinced that this
statement holds true at every instant where we could be fetching a syscache
entry at all, but it does appear to hold true at the times where we could
fetch an entry that could have a toasted field. We will need to be a bit
wary of adding toast tables to low-level catalogs that don't have them
already.) An additional benefit is that subsequent uses of the syscache
entry should be faster, since they won't have to detoast the field.
Back-patch to all supported versions. The problem is significantly harder
to reproduce in pre-9.0 releases, because of their willingness to flush
every entry in a syscache whenever the underlying catalog is vacuumed
(cf CatalogCacheFlushRelation); but there is still a window for trouble.
2011-11-02 00:48:37 +01:00
|
|
|
/* ----------
|
|
|
|
* toast_flatten_tuple -
|
|
|
|
*
|
|
|
|
* "Flatten" a tuple to contain no out-of-line toasted fields.
|
|
|
|
* (This does not eliminate compressed or short-header datums.)
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
*
|
|
|
|
* Note: we expect the caller already checked HeapTupleHasExternal(tup),
|
|
|
|
* so there is no need for a short-circuit path.
|
Fix race condition with toast table access from a stale syscache entry.
If a tuple in a syscache contains an out-of-line toasted field, and we
try to fetch that field shortly after some other transaction has committed
an update or deletion of the tuple, there is a race condition: vacuum
could come along and remove the toast tuples before we can fetch them.
This leads to transient failures like "missing chunk number 0 for toast
value NNNNN in pg_toast_2619", as seen in recent reports from Andrew
Hammond and Tim Uckun.
The design idea of syscache is that access to stale syscache entries
should be prevented by relation-level locks, but that fails for at least
two cases where toasted fields are possible: ANALYZE updates pg_statistic
rows without locking out sessions that might want to plan queries on the
same table, and CREATE OR REPLACE FUNCTION updates pg_proc rows without
any meaningful lock at all.
The least risky fix seems to be an idea that Heikki suggested when we
were dealing with a related problem back in August: forcibly detoast any
out-of-line fields before putting a tuple into syscache in the first place.
This avoids the problem because at the time we fetch the parent tuple from
the catalog, we should be holding an MVCC snapshot that will prevent
removal of the toast tuples, even if the parent tuple is outdated
immediately after we fetch it. (Note: I'm not convinced that this
statement holds true at every instant where we could be fetching a syscache
entry at all, but it does appear to hold true at the times where we could
fetch an entry that could have a toasted field. We will need to be a bit
wary of adding toast tables to low-level catalogs that don't have them
already.) An additional benefit is that subsequent uses of the syscache
entry should be faster, since they won't have to detoast the field.
Back-patch to all supported versions. The problem is significantly harder
to reproduce in pre-9.0 releases, because of their willingness to flush
every entry in a syscache whenever the underlying catalog is vacuumed
(cf CatalogCacheFlushRelation); but there is still a window for trouble.
2011-11-02 00:48:37 +01:00
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
HeapTuple
|
|
|
|
toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
|
|
|
|
{
|
|
|
|
HeapTuple new_tuple;
|
|
|
|
Form_pg_attribute *att = tupleDesc->attrs;
|
|
|
|
int numAttrs = tupleDesc->natts;
|
|
|
|
int i;
|
|
|
|
Datum toast_values[MaxTupleAttributeNumber];
|
|
|
|
bool toast_isnull[MaxTupleAttributeNumber];
|
|
|
|
bool toast_free[MaxTupleAttributeNumber];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Break down the tuple into fields.
|
|
|
|
*/
|
|
|
|
Assert(numAttrs <= MaxTupleAttributeNumber);
|
|
|
|
heap_deform_tuple(tup, tupleDesc, toast_values, toast_isnull);
|
|
|
|
|
|
|
|
memset(toast_free, 0, numAttrs * sizeof(bool));
|
|
|
|
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Look at non-null varlena attributes
|
|
|
|
*/
|
|
|
|
if (!toast_isnull[i] && att[i]->attlen == -1)
|
|
|
|
{
|
|
|
|
struct varlena *new_value;
|
|
|
|
|
|
|
|
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
|
|
|
|
if (VARATT_IS_EXTERNAL(new_value))
|
|
|
|
{
|
2015-02-09 18:30:52 +01:00
|
|
|
new_value = heap_tuple_fetch_attr(new_value);
|
Fix race condition with toast table access from a stale syscache entry.
If a tuple in a syscache contains an out-of-line toasted field, and we
try to fetch that field shortly after some other transaction has committed
an update or deletion of the tuple, there is a race condition: vacuum
could come along and remove the toast tuples before we can fetch them.
This leads to transient failures like "missing chunk number 0 for toast
value NNNNN in pg_toast_2619", as seen in recent reports from Andrew
Hammond and Tim Uckun.
The design idea of syscache is that access to stale syscache entries
should be prevented by relation-level locks, but that fails for at least
two cases where toasted fields are possible: ANALYZE updates pg_statistic
rows without locking out sessions that might want to plan queries on the
same table, and CREATE OR REPLACE FUNCTION updates pg_proc rows without
any meaningful lock at all.
The least risky fix seems to be an idea that Heikki suggested when we
were dealing with a related problem back in August: forcibly detoast any
out-of-line fields before putting a tuple into syscache in the first place.
This avoids the problem because at the time we fetch the parent tuple from
the catalog, we should be holding an MVCC snapshot that will prevent
removal of the toast tuples, even if the parent tuple is outdated
immediately after we fetch it. (Note: I'm not convinced that this
statement holds true at every instant where we could be fetching a syscache
entry at all, but it does appear to hold true at the times where we could
fetch an entry that could have a toasted field. We will need to be a bit
wary of adding toast tables to low-level catalogs that don't have them
already.) An additional benefit is that subsequent uses of the syscache
entry should be faster, since they won't have to detoast the field.
Back-patch to all supported versions. The problem is significantly harder
to reproduce in pre-9.0 releases, because of their willingness to flush
every entry in a syscache whenever the underlying catalog is vacuumed
(cf CatalogCacheFlushRelation); but there is still a window for trouble.
2011-11-02 00:48:37 +01:00
|
|
|
toast_values[i] = PointerGetDatum(new_value);
|
|
|
|
toast_free[i] = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Form the reconfigured tuple.
|
|
|
|
*/
|
|
|
|
new_tuple = heap_form_tuple(tupleDesc, toast_values, toast_isnull);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Be sure to copy the tuple's OID and identity fields. We also make a
|
|
|
|
* point of copying visibility info, just in case anybody looks at those
|
|
|
|
* fields in a syscache entry.
|
|
|
|
*/
|
|
|
|
if (tupleDesc->tdhasoid)
|
|
|
|
HeapTupleSetOid(new_tuple, HeapTupleGetOid(tup));
|
|
|
|
|
|
|
|
new_tuple->t_self = tup->t_self;
|
|
|
|
new_tuple->t_tableOid = tup->t_tableOid;
|
|
|
|
|
|
|
|
new_tuple->t_data->t_choice = tup->t_data->t_choice;
|
|
|
|
new_tuple->t_data->t_ctid = tup->t_data->t_ctid;
|
|
|
|
new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
|
|
|
|
new_tuple->t_data->t_infomask |=
|
|
|
|
tup->t_data->t_infomask & HEAP_XACT_MASK;
|
|
|
|
new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
|
|
|
|
new_tuple->t_data->t_infomask2 |=
|
|
|
|
tup->t_data->t_infomask2 & HEAP2_XACT_MASK;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free allocated temp values
|
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(toast_values[i]));
|
|
|
|
|
|
|
|
return new_tuple;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-06-05 03:55:05 +02:00
|
|
|
/* ----------
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
* toast_flatten_tuple_to_datum -
|
|
|
|
*
|
|
|
|
* "Flatten" a tuple containing out-of-line toasted fields into a Datum.
|
|
|
|
* The result is always palloc'd in the current memory context.
|
|
|
|
*
|
|
|
|
* We have a general rule that Datums of container types (rows, arrays,
|
|
|
|
* ranges, etc) must not contain any external TOAST pointers. Without
|
|
|
|
* this rule, we'd have to look inside each Datum when preparing a tuple
|
|
|
|
* for storage, which would be expensive and would fail to extend cleanly
|
|
|
|
* to new sorts of container types.
|
|
|
|
*
|
|
|
|
* However, we don't want to say that tuples represented as HeapTuples
|
|
|
|
* can't contain toasted fields, so instead this routine should be called
|
|
|
|
* when such a HeapTuple is being converted into a Datum.
|
2004-06-05 03:55:05 +02:00
|
|
|
*
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
* While we're at it, we decompress any compressed fields too. This is not
|
|
|
|
* necessary for correctness, but reflects an expectation that compression
|
|
|
|
* will be more effective if applied to the whole tuple not individual
|
|
|
|
* fields. We are not so concerned about that that we want to deconstruct
|
|
|
|
* and reconstruct tuples just to get rid of compressed fields, however.
|
|
|
|
* So callers typically won't call this unless they see that the tuple has
|
|
|
|
* at least one external field.
|
2007-04-06 06:21:44 +02:00
|
|
|
*
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
* On the other hand, in-line short-header varlena fields are left alone.
|
|
|
|
* If we "untoasted" them here, they'd just get changed back to short-header
|
|
|
|
* format anyway within heap_fill_tuple.
|
2004-06-05 03:55:05 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
Datum
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
toast_flatten_tuple_to_datum(HeapTupleHeader tup,
|
|
|
|
uint32 tup_len,
|
|
|
|
TupleDesc tupleDesc)
|
2004-06-05 03:55:05 +02:00
|
|
|
{
|
|
|
|
HeapTupleHeader new_data;
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
int32 new_header_len;
|
2007-04-06 06:21:44 +02:00
|
|
|
int32 new_data_len;
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
int32 new_tuple_len;
|
2004-06-05 03:55:05 +02:00
|
|
|
HeapTupleData tmptup;
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
Form_pg_attribute *att = tupleDesc->attrs;
|
|
|
|
int numAttrs = tupleDesc->natts;
|
2004-06-05 03:55:05 +02:00
|
|
|
int i;
|
|
|
|
bool has_nulls = false;
|
|
|
|
Datum toast_values[MaxTupleAttributeNumber];
|
2005-03-21 02:24:04 +01:00
|
|
|
bool toast_isnull[MaxTupleAttributeNumber];
|
2004-06-05 03:55:05 +02:00
|
|
|
bool toast_free[MaxTupleAttributeNumber];
|
|
|
|
|
|
|
|
/* Build a temporary HeapTuple control structure */
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
tmptup.t_len = tup_len;
|
2004-06-05 03:55:05 +02:00
|
|
|
ItemPointerSetInvalid(&(tmptup.t_self));
|
|
|
|
tmptup.t_tableOid = InvalidOid;
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
tmptup.t_data = tup;
|
2004-06-05 03:55:05 +02:00
|
|
|
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
/*
|
|
|
|
* Break down the tuple into fields.
|
|
|
|
*/
|
2004-06-05 03:55:05 +02:00
|
|
|
Assert(numAttrs <= MaxTupleAttributeNumber);
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_deform_tuple(&tmptup, tupleDesc, toast_values, toast_isnull);
|
2004-06-05 03:55:05 +02:00
|
|
|
|
|
|
|
memset(toast_free, 0, numAttrs * sizeof(bool));
|
|
|
|
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Look at non-null varlena attributes
|
|
|
|
*/
|
2005-03-21 02:24:04 +01:00
|
|
|
if (toast_isnull[i])
|
2004-06-05 03:55:05 +02:00
|
|
|
has_nulls = true;
|
|
|
|
else if (att[i]->attlen == -1)
|
|
|
|
{
|
2007-11-15 22:14:46 +01:00
|
|
|
struct varlena *new_value;
|
2004-06-05 03:55:05 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
new_value = (struct varlena *) DatumGetPointer(toast_values[i]);
|
|
|
|
if (VARATT_IS_EXTERNAL(new_value) ||
|
|
|
|
VARATT_IS_COMPRESSED(new_value))
|
2004-06-05 03:55:05 +02:00
|
|
|
{
|
|
|
|
new_value = heap_tuple_untoast_attr(new_value);
|
|
|
|
toast_values[i] = PointerGetDatum(new_value);
|
|
|
|
toast_free[i] = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
* Calculate the new size of the tuple.
|
|
|
|
*
|
|
|
|
* This should match the reconstruction code in toast_insert_or_update.
|
2004-06-05 03:55:05 +02:00
|
|
|
*/
|
2015-02-21 21:13:06 +01:00
|
|
|
new_header_len = SizeofHeapTupleHeader;
|
2004-06-05 03:55:05 +02:00
|
|
|
if (has_nulls)
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_header_len += BITMAPLEN(numAttrs);
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
if (tup->t_infomask & HEAP_HASOID)
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_header_len += sizeof(Oid);
|
|
|
|
new_header_len = MAXALIGN(new_header_len);
|
2007-04-06 06:21:44 +02:00
|
|
|
new_data_len = heap_compute_data_size(tupleDesc,
|
|
|
|
toast_values, toast_isnull);
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_tuple_len = new_header_len + new_data_len;
|
2004-06-05 03:55:05 +02:00
|
|
|
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
new_data = (HeapTupleHeader) palloc0(new_tuple_len);
|
2004-06-05 03:55:05 +02:00
|
|
|
|
|
|
|
/*
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
* Copy the existing tuple header, but adjust natts and t_hoff.
|
2004-06-05 03:55:05 +02:00
|
|
|
*/
|
2015-02-21 21:13:06 +01:00
|
|
|
memcpy(new_data, tup, SizeofHeapTupleHeader);
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
HeapTupleHeaderSetNatts(new_data, numAttrs);
|
|
|
|
new_data->t_hoff = new_header_len;
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
if (tup->t_infomask & HEAP_HASOID)
|
|
|
|
HeapTupleHeaderSetOid(new_data, HeapTupleHeaderGetOid(tup));
|
2004-06-05 03:55:05 +02:00
|
|
|
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
/* Set the composite-Datum header fields correctly */
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
HeapTupleHeaderSetDatumLength(new_data, new_tuple_len);
|
Fix failure to detoast fields in composite elements of structured types.
If we have an array of records stored on disk, the individual record fields
cannot contain out-of-line TOAST pointers: the tuptoaster.c mechanisms are
only prepared to deal with TOAST pointers appearing in top-level fields of
a stored row. The same applies for ranges over composite types, nested
composites, etc. However, the existing code only took care of expanding
sub-field TOAST pointers for the case of nested composites, not for other
structured types containing composites. For example, given a command such
as
UPDATE tab SET arraycol = ARRAY[ROW(x,42)::mycompositetype] ...
where x is a direct reference to a field of an on-disk tuple, if that field
is long enough to be toasted out-of-line then the TOAST pointer would be
inserted as-is into the array column. If the source record for x is later
deleted, the array field value would become a dangling pointer, leading
to errors along the line of "missing chunk number 0 for toast value ..."
when the value is referenced. A reproducible test case for this was
provided by Jan Pecek, but it seems likely that some of the "missing chunk
number" reports we've heard in the past were caused by similar issues.
Code-wise, the problem is that PG_DETOAST_DATUM() is not adequate to
produce a self-contained Datum value if the Datum is of composite type.
Seen in this light, the problem is not just confined to arrays and ranges,
but could also affect some other places where detoasting is done in that
way, for example form_index_tuple().
I tried teaching the array code to apply toast_flatten_tuple_attribute()
along with PG_DETOAST_DATUM() when the array element type is composite,
but this was messy and imposed extra cache lookup costs whether or not any
TOAST pointers were present, indeed sometimes when the array element type
isn't even composite (since sometimes it takes a typcache lookup to find
that out). The idea of extending that approach to all the places that
currently use PG_DETOAST_DATUM() wasn't attractive at all.
This patch instead solves the problem by decreeing that composite Datum
values must not contain any out-of-line TOAST pointers in the first place;
that is, we expand out-of-line fields at the point of constructing a
composite Datum, not at the point where we're about to insert it into a
larger tuple. This rule is applied only to true composite Datums, not
to tuples that are being passed around the system as tuples, so it's not
as invasive as it might sound at first. With this approach, the amount
of code that has to be touched for a full solution is greatly reduced,
and added cache lookup costs are avoided except when there actually is
a TOAST pointer that needs to be inlined.
The main drawback of this approach is that we might sometimes dereference
a TOAST pointer that will never actually be used by the query, imposing a
rather large cost that wasn't there before. On the other side of the coin,
if the field value is used multiple times then we'll come out ahead by
avoiding repeat detoastings. Experimentation suggests that common SQL
coding patterns are unaffected either way, though. Applications that are
very negatively affected could be advised to modify their code to not fetch
columns they won't be using.
In future, we might consider reverting this solution in favor of detoasting
only at the point where data is about to be stored to disk, using some
method that can drill down into multiple levels of nested structured types.
That will require defining new APIs for structured types, though, so it
doesn't seem feasible as a back-patchable fix.
Note that this patch changes HeapTupleGetDatum() from a macro to a function
call; this means that any third-party code using that macro will not get
protection against creating TOAST-pointer-containing Datums until it's
recompiled. The same applies to any uses of PG_RETURN_HEAPTUPLEHEADER().
It seems likely that this is not a big problem in practice: most of the
tuple-returning functions in core and contrib produce outputs that could
not possibly be toasted anyway, and the same probably holds for third-party
extensions.
This bug has existed since TOAST was invented, so back-patch to all
supported branches.
2014-05-01 21:19:06 +02:00
|
|
|
HeapTupleHeaderSetTypeId(new_data, tupleDesc->tdtypeid);
|
|
|
|
HeapTupleHeaderSetTypMod(new_data, tupleDesc->tdtypmod);
|
2004-06-05 03:55:05 +02:00
|
|
|
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
/* Copy over the data, and fill the null bitmap if needed */
|
2005-03-21 02:24:04 +01:00
|
|
|
heap_fill_tuple(tupleDesc,
|
|
|
|
toast_values,
|
|
|
|
toast_isnull,
|
Don't assume that a tuple's header size is unchanged during toasting.
This assumption can be wrong when the toaster is passed a raw on-disk
tuple, because the tuple might pre-date an ALTER TABLE ADD COLUMN operation
that added columns without rewriting the table. In such a case the tuple's
natts value is smaller than what we expect from the tuple descriptor, and
so its t_hoff value could be smaller too. In fact, the tuple might not
have a null bitmap at all, and yet our current opinion of it is that it
contains some trailing nulls.
In such a situation, toast_insert_or_update did the wrong thing, because
to save a few lines of code it would use the old t_hoff value as the offset
where heap_fill_tuple should start filling data. This did not leave enough
room for the new nulls bitmap, with the result that the first few bytes of
data could be overwritten with null flag bits, as in a recent report from
Hubert Depesz Lubaczewski.
The particular case reported requires ALTER TABLE ADD COLUMN followed by
CREATE TABLE AS SELECT * FROM ... or INSERT ... SELECT * FROM ..., and
further requires that there be some out-of-line toasted fields in one of
the tuples to be copied; else we'll not reach the troublesome code.
The problem can only manifest in this form in 8.4 and later, because
before commit a77eaa6a95009a3441e0d475d1980259d45da072, CREATE TABLE AS or
INSERT/SELECT wouldn't result in raw disk tuples getting passed directly
to heap_insert --- there would always have been at least a junkfilter in
between, and that would reconstitute the tuple header with an up-to-date
t_natts and hence t_hoff. But I'm backpatching the tuptoaster change all
the way anyway, because I'm not convinced there are no older code paths
that present a similar risk.
2011-11-05 04:22:50 +01:00
|
|
|
(char *) new_data + new_header_len,
|
2007-04-06 06:21:44 +02:00
|
|
|
new_data_len,
|
2005-03-21 02:24:04 +01:00
|
|
|
&(new_data->t_infomask),
|
|
|
|
has_nulls ? new_data->t_bits : NULL);
|
2004-06-05 03:55:05 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Free allocated temp values
|
|
|
|
*/
|
|
|
|
for (i = 0; i < numAttrs; i++)
|
|
|
|
if (toast_free[i])
|
|
|
|
pfree(DatumGetPointer(toast_values[i]));
|
|
|
|
|
|
|
|
return PointerGetDatum(new_data);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_compress_datum -
|
|
|
|
*
|
2000-08-04 06:16:17 +02:00
|
|
|
* Create a compressed version of a varlena datum
|
2000-10-24 01:42:04 +02:00
|
|
|
*
|
|
|
|
* If we fail (ie, compressed result is actually bigger than original)
|
|
|
|
* then return NULL. We must not use compressed data if it'd expand
|
|
|
|
* the tuple!
|
2007-04-06 06:21:44 +02:00
|
|
|
*
|
|
|
|
* We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
|
|
|
|
* copying them. But we can't handle external or compressed datums.
|
2000-07-04 01:10:14 +02:00
|
|
|
* ----------
|
|
|
|
*/
|
2001-02-15 21:57:01 +01:00
|
|
|
Datum
|
2000-07-04 01:10:14 +02:00
|
|
|
toast_compress_datum(Datum value)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *tmp;
|
2008-04-17 23:37:28 +02:00
|
|
|
int32 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
|
2015-02-09 07:15:24 +01:00
|
|
|
int32 len;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2008-04-17 23:37:28 +02:00
|
|
|
Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
|
|
|
|
Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));
|
2007-04-06 06:21:44 +02:00
|
|
|
|
2007-09-27 01:29:10 +02:00
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* No point in wasting a palloc cycle if value size is out of the allowed
|
|
|
|
* range for compression
|
2007-09-27 01:29:10 +02:00
|
|
|
*/
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
if (valsize < PGLZ_strategy_default->min_input_size ||
|
|
|
|
valsize > PGLZ_strategy_default->max_input_size)
|
2007-09-27 01:29:10 +02:00
|
|
|
return PointerGetDatum(NULL);
|
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize) +
|
|
|
|
TOAST_COMPRESS_HDRSZ);
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We recheck the actual size even if pglz_compress() reports success,
|
|
|
|
* because it might be satisfied with having saved as little as one byte
|
|
|
|
* in the compressed data --- which could turn into a net loss once you
|
|
|
|
* consider header and alignment padding. Worst case, the compressed
|
2009-06-11 16:49:15 +02:00
|
|
|
* format might require three padding bytes (plus header, which is
|
|
|
|
* included in VARSIZE(tmp)), whereas the uncompressed format would take
|
|
|
|
* only one header byte and no padding if the value is short enough. So
|
|
|
|
* we insist on a savings of more than 2 bytes to ensure we have a gain.
|
This patch addresses some issues in TOAST compression strategy that
were discussed last year, but we felt it was too late in the 8.3 cycle to
change the code immediately. Specifically, the patch:
* Reduces the minimum datum size to be considered for compression from
256 to 32 bytes, as suggested by Greg Stark.
* Increases the required compression rate for compressed storage from
20% to 25%, again per Greg's suggestion.
* Replaces force_input_size (size above which compression is forced)
with a maximum size to be considered for compression. It was agreed
that allowing large inputs to escape the minimum-compression-rate
requirement was not bright, and that indeed we'd rather have a knob
that acted in the other direction. I set this value to 1MB for the
moment, but it could use some performance studies to tune it.
* Adds an early-failure path to the compressor as suggested by Jan:
if it's been unable to find even one compressible substring in the
first 1KB (parameterizable), assume we're looking at incompressible
input and give up. (Possibly this logic can be improved, but I'll
commit it as-is for now.)
* Improves the toasting heuristics so that when we have very large
fields with attstorage 'x' or 'e', we will push those out to toast
storage before considering inline compression of shorter fields.
This also responds to a suggestion of Greg's, though my original
proposal for a solution was a bit off base because it didn't fix
the problem for large 'e' fields.
There was some discussion in the earlier threads of exposing some
of the compression knobs to users, perhaps even on a per-column
basis. I have not done anything about that here. It seems to me
that if we are changing around the parameters, we'd better get some
experience and be sure we are happy with the design before we set
things in stone by providing user-visible knobs.
2008-03-08 00:20:21 +01:00
|
|
|
*/
|
2015-02-09 07:15:24 +01:00
|
|
|
len = pglz_compress(VARDATA_ANY(DatumGetPointer(value)),
|
|
|
|
valsize,
|
|
|
|
TOAST_COMPRESS_RAWDATA(tmp),
|
|
|
|
PGLZ_strategy_default);
|
|
|
|
if (len >= 0 &&
|
|
|
|
len + TOAST_COMPRESS_HDRSZ < valsize - 2)
|
2000-10-24 01:42:04 +02:00
|
|
|
{
|
2015-02-09 07:15:24 +01:00
|
|
|
TOAST_COMPRESS_SET_RAWSIZE(tmp, valsize);
|
|
|
|
SET_VARSIZE_COMPRESSED(tmp, len + TOAST_COMPRESS_HDRSZ);
|
2000-10-24 01:42:04 +02:00
|
|
|
/* successful compression */
|
|
|
|
return PointerGetDatum(tmp);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* incompressible data */
|
|
|
|
pfree(tmp);
|
|
|
|
return PointerGetDatum(NULL);
|
|
|
|
}
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-07-03 20:24:09 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_get_valid_index
|
|
|
|
*
|
|
|
|
* Get OID of valid index associated to given toast relation. A toast
|
|
|
|
* relation can have only one valid index at the same time.
|
|
|
|
*/
|
|
|
|
Oid
|
|
|
|
toast_get_valid_index(Oid toastoid, LOCKMODE lock)
|
|
|
|
{
|
|
|
|
int num_indexes;
|
|
|
|
int validIndex;
|
|
|
|
Oid validIndexOid;
|
|
|
|
Relation *toastidxs;
|
|
|
|
Relation toastrel;
|
|
|
|
|
|
|
|
/* Open the toast relation */
|
|
|
|
toastrel = heap_open(toastoid, lock);
|
|
|
|
|
|
|
|
/* Look for the valid index of the toast relation */
|
|
|
|
validIndex = toast_open_indexes(toastrel,
|
|
|
|
lock,
|
|
|
|
&toastidxs,
|
|
|
|
&num_indexes);
|
|
|
|
validIndexOid = RelationGetRelid(toastidxs[validIndex]);
|
|
|
|
|
|
|
|
/* Close the toast relation and all its indexes */
|
|
|
|
toast_close_indexes(toastidxs, num_indexes, lock);
|
|
|
|
heap_close(toastrel, lock);
|
|
|
|
|
|
|
|
return validIndexOid;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
/* ----------
 * toast_save_datum -
 *
 *	Save one single datum into the secondary relation and return
 *	a Datum reference for it.
 *
 * rel: the main relation we're working with (not the toast rel!)
 * value: datum to be pushed to toast storage
 * oldexternal: if not NULL, toast pointer previously representing the datum
 * options: options to be passed to heap_insert() for toast rows
 * ----------
 */
static Datum
toast_save_datum(Relation rel, Datum value,
				 struct varlena * oldexternal, int options)
{
	Relation	toastrel;
	Relation   *toastidxs;
	HeapTuple	toasttup;
	TupleDesc	toasttupDesc;
	Datum		t_values[3];
	bool		t_isnull[3];
	CommandId	mycid = GetCurrentCommandId(true);
	struct varlena *result;
	struct varatt_external toast_pointer;
	union
	{
		struct varlena hdr;
		/* this is to make the union big enough for a chunk: */
		char		data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ];
		/* ensure union is aligned well enough: */
		int32		align_it;
	}			chunk_data;
	int32		chunk_size;
	int32		chunk_seq = 0;
	char	   *data_p;
	int32		data_todo;
	Pointer		dval = DatumGetPointer(value);
	int			num_indexes;
	int			validIndex;

	/* Callers must never hand us an already-toasted-out-of-line value */
	Assert(!VARATT_IS_EXTERNAL(value));

	/*
	 * Open the toast relation and its indexes.  We can use the index to check
	 * uniqueness of the OID we assign to the toasted item, even though it has
	 * additional columns besides OID.
	 */
	toastrel = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock);
	toasttupDesc = toastrel->rd_att;

	/* Open all the toast indexes and look for the valid one */
	validIndex = toast_open_indexes(toastrel,
									RowExclusiveLock,
									&toastidxs,
									&num_indexes);

	/*
	 * Get the data pointer and length, and compute va_rawsize and va_extsize.
	 *
	 * va_rawsize is the size of the equivalent fully uncompressed datum, so
	 * we have to adjust for short headers.
	 *
	 * va_extsize is the actual size of the data payload in the toast records.
	 */
	if (VARATT_IS_SHORT(dval))
	{
		data_p = VARDATA_SHORT(dval);
		data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT;
		toast_pointer.va_rawsize = data_todo + VARHDRSZ;	/* as if not short */
		toast_pointer.va_extsize = data_todo;
	}
	else if (VARATT_IS_COMPRESSED(dval))
	{
		data_p = VARDATA(dval);
		data_todo = VARSIZE(dval) - VARHDRSZ;
		/* rawsize in a compressed datum is just the size of the payload */
		toast_pointer.va_rawsize = VARRAWSIZE_4B_C(dval) + VARHDRSZ;
		toast_pointer.va_extsize = data_todo;
		/* Assert that the numbers look like it's compressed */
		Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));
	}
	else
	{
		data_p = VARDATA(dval);
		data_todo = VARSIZE(dval) - VARHDRSZ;
		toast_pointer.va_rawsize = VARSIZE(dval);
		toast_pointer.va_extsize = data_todo;
	}

	/*
	 * Insert the correct table OID into the result TOAST pointer.
	 *
	 * Normally this is the actual OID of the target toast table, but during
	 * table-rewriting operations such as CLUSTER, we have to insert the OID
	 * of the table's real permanent toast table instead.  rd_toastoid is set
	 * if we have to substitute such an OID.
	 */
	if (OidIsValid(rel->rd_toastoid))
		toast_pointer.va_toastrelid = rel->rd_toastoid;
	else
		toast_pointer.va_toastrelid = RelationGetRelid(toastrel);

	/*
	 * Choose an OID to use as the value ID for this toast value.
	 *
	 * Normally we just choose an unused OID within the toast table.  But
	 * during table-rewriting operations where we are preserving an existing
	 * toast table OID, we want to preserve toast value OIDs too.  So, if
	 * rd_toastoid is set and we had a prior external value from that same
	 * toast table, re-use its value ID.  If we didn't have a prior external
	 * value (which is a corner case, but possible if the table's attstorage
	 * options have been changed), we have to pick a value ID that doesn't
	 * conflict with either new or existing toast value OIDs.
	 */
	if (!OidIsValid(rel->rd_toastoid))
	{
		/* normal case: just choose an unused OID */
		toast_pointer.va_valueid =
			GetNewOidWithIndex(toastrel,
							   RelationGetRelid(toastidxs[validIndex]),
							   (AttrNumber) 1);
	}
	else
	{
		/* rewrite case: check to see if value was in old toast table */
		toast_pointer.va_valueid = InvalidOid;
		if (oldexternal != NULL)
		{
			struct varatt_external old_toast_pointer;

			Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal));
			/* Must copy to access aligned fields */
			VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal);
			if (old_toast_pointer.va_toastrelid == rel->rd_toastoid)
			{
				/* This value came from the old toast table; reuse its OID */
				toast_pointer.va_valueid = old_toast_pointer.va_valueid;

				/*
				 * There is a corner case here: the table rewrite might have
				 * to copy both live and recently-dead versions of a row, and
				 * those versions could easily reference the same toast value.
				 * When we copy the second or later version of such a row,
				 * reusing the OID will mean we select an OID that's already
				 * in the new toast table.  Check for that, and if so, just
				 * fall through without writing the data again.
				 *
				 * While annoying and ugly-looking, this is a good thing
				 * because it ensures that we wind up with only one copy of
				 * the toast value when there is only one copy in the old
				 * toast table.  Before we detected this case, we'd have made
				 * multiple copies, wasting space; and what's worse, the
				 * copies belonging to already-deleted heap tuples would not
				 * be reclaimed by VACUUM.
				 */
				if (toastrel_valueid_exists(toastrel,
											toast_pointer.va_valueid))
				{
					/* Match, so short-circuit the data storage loop below */
					data_todo = 0;
				}
			}
		}
		if (toast_pointer.va_valueid == InvalidOid)
		{
			/*
			 * new value; must choose an OID that doesn't conflict in either
			 * old or new toast table
			 */
			do
			{
				toast_pointer.va_valueid =
					GetNewOidWithIndex(toastrel,
							  RelationGetRelid(toastidxs[validIndex]),
									   (AttrNumber) 1);
			} while (toastid_valueid_exists(rel->rd_toastoid,
											toast_pointer.va_valueid));
		}
	}

	/*
	 * Initialize constant parts of the tuple data
	 */
	t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid);
	t_values[2] = PointerGetDatum(&chunk_data);
	t_isnull[0] = false;
	t_isnull[1] = false;
	t_isnull[2] = false;

	/*
	 * Split up the item into chunks
	 */
	while (data_todo > 0)
	{
		int			i;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Calculate the size of this chunk
		 */
		chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo);

		/*
		 * Build a tuple and store it
		 */
		t_values[1] = Int32GetDatum(chunk_seq++);
		SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ);
		memcpy(VARDATA(&chunk_data), data_p, chunk_size);
		toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull);

		heap_insert(toastrel, toasttup, mycid, options, NULL);

		/*
		 * Create the index entry.  We cheat a little here by not using
		 * FormIndexDatum: this relies on the knowledge that the index columns
		 * are the same as the initial columns of the table for all the
		 * indexes.
		 *
		 * Note also that there had better not be any user-created index on
		 * the TOAST table, since we don't bother to update anything else.
		 */
		for (i = 0; i < num_indexes; i++)
		{
			/* Only index relations marked as ready can be updated */
			if (IndexIsReady(toastidxs[i]->rd_index))
				index_insert(toastidxs[i], t_values, t_isnull,
							 &(toasttup->t_self),
							 toastrel,
							 toastidxs[i]->rd_index->indisunique ?
							 UNIQUE_CHECK_YES : UNIQUE_CHECK_NO);
		}

		/*
		 * Free memory
		 */
		heap_freetuple(toasttup);

		/*
		 * Move on to next chunk
		 */
		data_todo -= chunk_size;
		data_p += chunk_size;
	}

	/*
	 * Done - close toast relation and its indexes
	 */
	toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
	heap_close(toastrel, RowExclusiveLock);

	/*
	 * Create the TOAST pointer value that we'll return
	 */
	result = (struct varlena *) palloc(TOAST_POINTER_SIZE);
	SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK);
	memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer));

	return PointerGetDatum(result);
}
|
|
|
|
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* toast_delete_datum -
|
|
|
|
*
|
|
|
|
* Delete a single external stored value.
|
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
toast_delete_datum(Relation rel, Datum value)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
struct varlena *attr = (struct varlena *) DatumGetPointer(value);
|
|
|
|
struct varatt_external toast_pointer;
|
2001-03-22 05:01:46 +01:00
|
|
|
Relation toastrel;
|
2013-07-03 20:24:09 +02:00
|
|
|
Relation *toastidxs;
|
2001-03-22 05:01:46 +01:00
|
|
|
ScanKeyData toastkey;
|
2008-04-13 01:14:21 +02:00
|
|
|
SysScanDesc toastscan;
|
2002-05-21 01:51:44 +02:00
|
|
|
HeapTuple toasttup;
|
2013-07-03 20:24:09 +02:00
|
|
|
int num_indexes;
|
|
|
|
int validIndex;
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2013-07-02 19:35:14 +02:00
|
|
|
if (!VARATT_IS_EXTERNAL_ONDISK(attr))
|
2000-07-04 01:10:14 +02:00
|
|
|
return;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
/* Must copy to access aligned fields */
|
2007-09-30 21:54:58 +02:00
|
|
|
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
2007-04-06 06:21:44 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2013-07-03 20:24:09 +02:00
|
|
|
* Open the toast relation and its indexes
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2007-04-06 06:21:44 +02:00
|
|
|
toastrel = heap_open(toast_pointer.va_toastrelid, RowExclusiveLock);
|
2013-07-03 20:24:09 +02:00
|
|
|
|
|
|
|
/* Fetch valid relation used for process */
|
|
|
|
validIndex = toast_open_indexes(toastrel,
|
|
|
|
RowExclusiveLock,
|
|
|
|
&toastidxs,
|
|
|
|
&num_indexes);
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2008-04-13 01:14:21 +02:00
|
|
|
* Setup a scan key to find chunks with matching va_valueid
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2003-11-12 22:15:59 +01:00
|
|
|
ScanKeyInit(&toastkey,
|
|
|
|
(AttrNumber) 1,
|
|
|
|
BTEqualStrategyNumber, F_OIDEQ,
|
2007-04-06 06:21:44 +02:00
|
|
|
ObjectIdGetDatum(toast_pointer.va_valueid));
|
2000-07-04 01:10:14 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2008-04-13 01:14:21 +02:00
|
|
|
* Find all the chunks. (We don't actually care whether we see them in
|
2009-06-11 16:49:15 +02:00
|
|
|
* sequence or not, but since we've already locked the index we might as
|
|
|
|
* well use systable_beginscan_ordered.)
|
2000-07-04 01:10:14 +02:00
|
|
|
*/
|
2013-07-03 20:24:09 +02:00
|
|
|
toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
|
2008-04-13 01:14:21 +02:00
|
|
|
SnapshotToast, 1, &toastkey);
|
|
|
|
while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
|
2000-07-04 01:10:14 +02:00
|
|
|
{
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* Have a chunk, delete it
|
|
|
|
*/
|
2002-05-21 01:51:44 +02:00
|
|
|
simple_heap_delete(toastrel, &toasttup->t_self);
|
2000-07-04 01:10:14 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2000-07-04 01:10:14 +02:00
|
|
|
* End scan and close relations
|
|
|
|
*/
|
2008-04-13 01:14:21 +02:00
|
|
|
systable_endscan_ordered(toastscan);
|
2013-07-03 20:24:09 +02:00
|
|
|
toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
|
2000-07-04 01:10:14 +02:00
|
|
|
heap_close(toastrel, RowExclusiveLock);
|
1999-12-21 01:06:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2011-08-16 19:48:04 +02:00
|
|
|
/* ----------
|
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows.
In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and
VACUUM FULL try to preserve toast value OIDs from the original toast table
to the new one. However, if we have to copy both live and recently-dead
versions of a row that has a toasted column, those versions may well
reference the same toast value with the same OID. The patch then led to
duplicate-key failures as we tried to insert the toast value twice with the
same OID. (The previous behavior was not very desirable either, since it
would have silently inserted the same value twice with different OIDs.
That wastes space, but what's worse is that the toast values inserted for
already-dead heap rows would not be reclaimed by subsequent ordinary
VACUUMs, since they go into the new toast table marked live not deleted.)
To fix, check if the copied OID already exists in the new toast table, and
if so, assume that it stores the desired value. This is reasonably safe
since the only case where we will copy an OID from a previous toast pointer
is when toast_insert_or_update was given that toast pointer and so we just
pulled the data from the old table; if we got two different values that way
then we have big problems anyway. We do have to assume that no other
backend is inserting items into the new toast table concurrently, but
that's surely safe for CLUSTER and VACUUM FULL.
Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous
patch.
2012-01-12 22:40:14 +01:00
|
|
|
* toastrel_valueid_exists -
|
2011-08-16 19:48:04 +02:00
|
|
|
*
|
|
|
|
* Test whether a toast value with the given ID exists in the toast relation
|
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
static bool
|
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows.
In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and
VACUUM FULL try to preserve toast value OIDs from the original toast table
to the new one. However, if we have to copy both live and recently-dead
versions of a row that has a toasted column, those versions may well
reference the same toast value with the same OID. The patch then led to
duplicate-key failures as we tried to insert the toast value twice with the
same OID. (The previous behavior was not very desirable either, since it
would have silently inserted the same value twice with different OIDs.
That wastes space, but what's worse is that the toast values inserted for
already-dead heap rows would not be reclaimed by subsequent ordinary
VACUUMs, since they go into the new toast table marked live not deleted.)
To fix, check if the copied OID already exists in the new toast table, and
if so, assume that it stores the desired value. This is reasonably safe
since the only case where we will copy an OID from a previous toast pointer
is when toast_insert_or_update was given that toast pointer and so we just
pulled the data from the old table; if we got two different values that way
then we have big problems anyway. We do have to assume that no other
backend is inserting items into the new toast table concurrently, but
that's surely safe for CLUSTER and VACUUM FULL.
Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous
patch.
2012-01-12 22:40:14 +01:00
|
|
|
toastrel_valueid_exists(Relation toastrel, Oid valueid)
|
2011-08-16 19:48:04 +02:00
|
|
|
{
|
|
|
|
bool result = false;
|
|
|
|
ScanKeyData toastkey;
|
|
|
|
SysScanDesc toastscan;
|
2013-07-03 20:24:09 +02:00
|
|
|
int num_indexes;
|
|
|
|
int validIndex;
|
|
|
|
Relation *toastidxs;
|
|
|
|
|
|
|
|
/* Fetch a valid index relation */
|
|
|
|
validIndex = toast_open_indexes(toastrel,
|
|
|
|
RowExclusiveLock,
|
|
|
|
&toastidxs,
|
|
|
|
&num_indexes);
|
2011-08-16 19:48:04 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup a scan key to find chunks with matching va_valueid
|
|
|
|
*/
|
|
|
|
ScanKeyInit(&toastkey,
|
|
|
|
(AttrNumber) 1,
|
|
|
|
BTEqualStrategyNumber, F_OIDEQ,
|
|
|
|
ObjectIdGetDatum(valueid));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Is there any such chunk?
|
|
|
|
*/
|
2013-07-03 20:24:09 +02:00
|
|
|
toastscan = systable_beginscan(toastrel,
|
2014-05-06 18:12:18 +02:00
|
|
|
RelationGetRelid(toastidxs[validIndex]),
|
|
|
|
true, SnapshotToast, 1, &toastkey);
|
2011-08-16 19:48:04 +02:00
|
|
|
|
|
|
|
if (systable_getnext(toastscan) != NULL)
|
|
|
|
result = true;
|
|
|
|
|
|
|
|
systable_endscan(toastscan);
|
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows.
In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and
VACUUM FULL try to preserve toast value OIDs from the original toast table
to the new one. However, if we have to copy both live and recently-dead
versions of a row that has a toasted column, those versions may well
reference the same toast value with the same OID. The patch then led to
duplicate-key failures as we tried to insert the toast value twice with the
same OID. (The previous behavior was not very desirable either, since it
would have silently inserted the same value twice with different OIDs.
That wastes space, but what's worse is that the toast values inserted for
already-dead heap rows would not be reclaimed by subsequent ordinary
VACUUMs, since they go into the new toast table marked live not deleted.)
To fix, check if the copied OID already exists in the new toast table, and
if so, assume that it stores the desired value. This is reasonably safe
since the only case where we will copy an OID from a previous toast pointer
is when toast_insert_or_update was given that toast pointer and so we just
pulled the data from the old table; if we got two different values that way
then we have big problems anyway. We do have to assume that no other
backend is inserting items into the new toast table concurrently, but
that's surely safe for CLUSTER and VACUUM FULL.
Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous
patch.
2012-01-12 22:40:14 +01:00
|
|
|
|
2013-07-03 20:24:09 +02:00
|
|
|
/* Clean up */
|
|
|
|
toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock);
|
|
|
|
|
Fix CLUSTER/VACUUM FULL for toast values owned by recently-updated rows.
In commit 7b0d0e9356963d5c3e4d329a917f5fbb82a2ef05, I made CLUSTER and
VACUUM FULL try to preserve toast value OIDs from the original toast table
to the new one. However, if we have to copy both live and recently-dead
versions of a row that has a toasted column, those versions may well
reference the same toast value with the same OID. The patch then led to
duplicate-key failures as we tried to insert the toast value twice with the
same OID. (The previous behavior was not very desirable either, since it
would have silently inserted the same value twice with different OIDs.
That wastes space, but what's worse is that the toast values inserted for
already-dead heap rows would not be reclaimed by subsequent ordinary
VACUUMs, since they go into the new toast table marked live not deleted.)
To fix, check if the copied OID already exists in the new toast table, and
if so, assume that it stores the desired value. This is reasonably safe
since the only case where we will copy an OID from a previous toast pointer
is when toast_insert_or_update was given that toast pointer and so we just
pulled the data from the old table; if we got two different values that way
then we have big problems anyway. We do have to assume that no other
backend is inserting items into the new toast table concurrently, but
that's surely safe for CLUSTER and VACUUM FULL.
Per bug #6393 from Maxim Boguk. Back-patch to 9.0, same as the previous
patch.
2012-01-12 22:40:14 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* toastid_valueid_exists -
|
|
|
|
*
|
|
|
|
* As above, but work from toast rel's OID not an open relation
|
|
|
|
* ----------
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
toastid_valueid_exists(Oid toastrelid, Oid valueid)
|
|
|
|
{
|
|
|
|
bool result;
|
|
|
|
Relation toastrel;
|
|
|
|
|
|
|
|
toastrel = heap_open(toastrelid, AccessShareLock);
|
|
|
|
|
|
|
|
result = toastrel_valueid_exists(toastrel, valueid);
|
|
|
|
|
2011-08-16 19:48:04 +02:00
|
|
|
heap_close(toastrel, AccessShareLock);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2000-07-04 01:10:14 +02:00
|
|
|
/* ----------
 * toast_fetch_datum -
 *
 *	Reconstruct an in memory Datum from the chunks saved
 *	in the toast relation
 *
 *	attr must be an on-disk external TOAST pointer.  The return value is a
 *	palloc'd varlena containing all the chunk data concatenated; if the
 *	external data was stored compressed, the result is still compressed
 *	(its header is marked accordingly) and must be decompressed by caller.
 * ----------
 */
static struct varlena *
toast_fetch_datum(struct varlena * attr)
{
	Relation	toastrel;
	Relation   *toastidxs;
	ScanKeyData toastkey;
	SysScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	struct varlena *result;
	struct varatt_external toast_pointer;
	int32		ressize;
	int32		residx,
				nextidx;
	int32		numchunks;
	Pointer		chunk;
	bool		isnull;
	char	   *chunkdata;
	int32		chunksize;
	int			num_indexes;
	int			validIndex;

	if (!VARATT_IS_EXTERNAL_ONDISK(attr))
		elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums");

	/* Must copy to access aligned fields */
	VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

	/* Total stored size, and the number of chunks it was split into */
	ressize = toast_pointer.va_extsize;
	numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	result = (struct varlena *) palloc(ressize + VARHDRSZ);

	/* Preserve the compressed-ness flag in the reconstructed header */
	if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
		SET_VARSIZE_COMPRESSED(result, ressize + VARHDRSZ);
	else
		SET_VARSIZE(result, ressize + VARHDRSZ);

	/*
	 * Open the toast relation and its indexes
	 */
	toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;

	/* Look for the valid index of the toast relation */
	validIndex = toast_open_indexes(toastrel,
									AccessShareLock,
									&toastidxs,
									&num_indexes);

	/*
	 * Setup a scan key to fetch from the index by va_valueid
	 */
	ScanKeyInit(&toastkey,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(toast_pointer.va_valueid));

	/*
	 * Read the chunks by index
	 *
	 * Note that because the index is actually on (valueid, chunkidx) we will
	 * see the chunks in chunkidx order, even though we didn't explicitly ask
	 * for it.
	 */
	nextidx = 0;

	toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
										   SnapshotToast, 1, &toastkey);
	while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		if (!VARATT_IS_EXTENDED(chunk))
		{
			chunksize = VARSIZE(chunk) - VARHDRSZ;
			chunkdata = VARDATA(chunk);
		}
		else if (VARATT_IS_SHORT(chunk))
		{
			/* could happen due to heap_form_tuple doing its thing */
			chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
			chunkdata = VARDATA_SHORT(chunk);
		}
		else
		{
			/* should never happen: chunks are stored uncompressed, inline */
			elog(ERROR, "found toasted toast chunk for toast value %u in %s",
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
			chunksize = 0;		/* keep compiler quiet */
			chunkdata = NULL;
		}

		/*
		 * Some checks on the data we've found
		 */
		/* Chunks must arrive in strictly consecutive order with no gaps */
		if (residx != nextidx)
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
				 residx, nextidx,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
		if (residx < numchunks - 1)
		{
			/* All chunks but the last must be exactly full-size */
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s",
					 chunksize, (int) TOAST_MAX_CHUNK_SIZE,
					 residx, numchunks,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else if (residx == numchunks - 1)
		{
			/* The last chunk must hold exactly the remaining bytes */
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize)
				elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s",
					 chunksize,
					 (int) (ressize - residx * TOAST_MAX_CHUNK_SIZE),
					 residx,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else
			elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
				 residx,
				 0, numchunks - 1,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));

		/*
		 * Copy the data into proper place in our result
		 */
		memcpy(VARDATA(result) + residx * TOAST_MAX_CHUNK_SIZE,
			   chunkdata,
			   chunksize);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != numchunks)
		elog(ERROR, "missing chunk number %d for toast value %u in %s",
			 nextidx,
			 toast_pointer.va_valueid,
			 RelationGetRelationName(toastrel));

	/*
	 * End scan and close relations
	 */
	systable_endscan_ordered(toastscan);
	toast_close_indexes(toastidxs, num_indexes, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return result;
}
|
|
|
|
|
2002-03-05 06:33:31 +01:00
|
|
|
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a Datum from the chunks saved
 *	in the toast relation
 *
 *	attr must be an uncompressed on-disk external TOAST pointer (slicing a
 *	compressed datum would be meaningless; see Assert below).  sliceoffset
 *	is the zero-based starting byte; length is the number of bytes wanted.
 *	Out-of-range requests are clamped: an offset past the end yields an
 *	empty result, and a length that is negative or runs past the end is
 *	truncated to the available bytes.  Returns a palloc'd varlena holding
 *	just the requested slice.
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena * attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation   *toastidxs;
	ScanKeyData toastkey[3];
	int			nscankeys;
	SysScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	struct varlena *result;
	struct varatt_external toast_pointer;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	char	   *chunkdata;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;
	int			num_indexes;
	int			validIndex;

	if (!VARATT_IS_EXTERNAL_ONDISK(attr))
		elog(ERROR, "toast_fetch_datum_slice shouldn't be called for non-ondisk datums");

	/* Must copy to access aligned fields */
	VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

	/*
	 * It's nonsense to fetch slices of a compressed datum -- this isn't lo_*
	 * we can't return a compressed datum which is meaningful to toast later
	 */
	Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));

	attrsize = toast_pointer.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	/* Clamp an offset beyond the end of the datum to an empty slice */
	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	/* Truncate negative or over-long lengths to what's actually available */
	if (((sliceoffset + length) > attrsize) || length < 0)
		length = attrsize - sliceoffset;

	result = (struct varlena *) palloc(length + VARHDRSZ);

	/* Dead branch in practice given the Assert above; kept for safety */
	if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
		SET_VARSIZE_COMPRESSED(result, length + VARHDRSZ);
	else
		SET_VARSIZE(result, length + VARHDRSZ);

	if (length == 0)
		return result;			/* Can save a lot of work at this point! */

	/* Which chunks, and which byte ranges within them, cover the slice */
	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its indexes
	 */
	toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;

	/* Look for the valid index of toast relation */
	validIndex = toast_open_indexes(toastrel,
									AccessShareLock,
									&toastidxs,
									&num_indexes);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys or
	 * three depending on the number of chunks.
	 */
	ScanKeyInit(&toastkey[0],
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(toast_pointer.va_valueid));

	/*
	 * Use equality condition for one chunk, a range condition otherwise:
	 */
	if (numchunks == 1)
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTEqualStrategyNumber, F_INT4EQ,
					Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTGreaterEqualStrategyNumber, F_INT4GE,
					Int32GetDatum(startchunk));
		ScanKeyInit(&toastkey[2],
					(AttrNumber) 2,
					BTLessEqualStrategyNumber, F_INT4LE,
					Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;
	toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex],
										   SnapshotToast, nscankeys, toastkey);
	while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		if (!VARATT_IS_EXTENDED(chunk))
		{
			chunksize = VARSIZE(chunk) - VARHDRSZ;
			chunkdata = VARDATA(chunk);
		}
		else if (VARATT_IS_SHORT(chunk))
		{
			/* could happen due to heap_form_tuple doing its thing */
			chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
			chunkdata = VARDATA_SHORT(chunk);
		}
		else
		{
			/* should never happen: chunks are stored uncompressed, inline */
			elog(ERROR, "found toasted toast chunk for toast value %u in %s",
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
			chunksize = 0;		/* keep compiler quiet */
			chunkdata = NULL;
		}

		/*
		 * Some checks on the data we've found
		 */
		/* Chunks must arrive consecutively and stay within the slice range */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
				 residx, nextidx,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
		if (residx < totalchunks - 1)
		{
			/* All chunks but the datum's last must be exactly full-size */
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s when fetching slice",
					 chunksize, (int) TOAST_MAX_CHUNK_SIZE,
					 residx, totalchunks,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else if (residx == totalchunks - 1)
		{
			/* The datum's last chunk must hold exactly the remaining bytes */
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s when fetching slice",
					 chunksize,
					 (int) (attrsize - residx * TOAST_MAX_CHUNK_SIZE),
					 residx,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else
			elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
				 residx,
				 0, totalchunks - 1,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));

		/*
		 * Copy the data into proper place in our result
		 */
		/* Trim the copy window at the slice's first and last chunks */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(VARDATA(result) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   chunkdata + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u in %s",
			 nextidx,
			 toast_pointer.va_valueid,
			 RelationGetRelationName(toastrel));

	/*
	 * End scan and close relations
	 */
	systable_endscan_ordered(toastscan);
	toast_close_indexes(toastidxs, num_indexes, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return result;
}
|
2013-07-03 20:24:09 +02:00
|
|
|
|
2015-02-09 07:15:24 +01:00
|
|
|
/* ----------
|
|
|
|
* toast_decompress_datum -
|
|
|
|
*
|
|
|
|
* Decompress a compressed version of a varlena datum
|
|
|
|
*/
|
|
|
|
static struct varlena *
|
|
|
|
toast_decompress_datum(struct varlena * attr)
|
|
|
|
{
|
|
|
|
struct varlena *result;
|
|
|
|
|
|
|
|
Assert(VARATT_IS_COMPRESSED(attr));
|
|
|
|
|
|
|
|
result = (struct varlena *)
|
|
|
|
palloc(TOAST_COMPRESS_RAWSIZE(attr) + VARHDRSZ);
|
|
|
|
SET_VARSIZE(result, TOAST_COMPRESS_RAWSIZE(attr) + VARHDRSZ);
|
|
|
|
|
|
|
|
if (pglz_decompress(TOAST_COMPRESS_RAWDATA(attr),
|
|
|
|
VARSIZE(attr) - TOAST_COMPRESS_HDRSZ,
|
|
|
|
VARDATA(result),
|
|
|
|
TOAST_COMPRESS_RAWSIZE(attr)) < 0)
|
|
|
|
elog(ERROR, "compressed data is corrupted");
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-07-03 20:24:09 +02:00
|
|
|
/* ----------
|
|
|
|
* toast_open_indexes
|
|
|
|
*
|
|
|
|
* Get an array of the indexes associated to the given toast relation
|
|
|
|
* and return as well the position of the valid index used by the toast
|
|
|
|
* relation in this array. It is the responsibility of the caller of this
|
|
|
|
* function to close the indexes as well as free them.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
toast_open_indexes(Relation toastrel,
|
|
|
|
LOCKMODE lock,
|
|
|
|
Relation **toastidxs,
|
|
|
|
int *num_indexes)
|
|
|
|
{
|
|
|
|
int i = 0;
|
|
|
|
int res = 0;
|
|
|
|
bool found = false;
|
|
|
|
List *indexlist;
|
|
|
|
ListCell *lc;
|
|
|
|
|
|
|
|
/* Get index list of the toast relation */
|
|
|
|
indexlist = RelationGetIndexList(toastrel);
|
|
|
|
Assert(indexlist != NIL);
|
|
|
|
|
|
|
|
*num_indexes = list_length(indexlist);
|
|
|
|
|
|
|
|
/* Open all the index relations */
|
|
|
|
*toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation));
|
|
|
|
foreach(lc, indexlist)
|
|
|
|
(*toastidxs)[i++] = index_open(lfirst_oid(lc), lock);
|
|
|
|
|
|
|
|
/* Fetch the first valid index in list */
|
|
|
|
for (i = 0; i < *num_indexes; i++)
|
|
|
|
{
|
2014-05-06 18:12:18 +02:00
|
|
|
Relation toastidx = (*toastidxs)[i];
|
|
|
|
|
2013-07-03 20:24:09 +02:00
|
|
|
if (toastidx->rd_index->indisvalid)
|
|
|
|
{
|
|
|
|
res = i;
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Free index list, not necessary anymore as relations are opened and a
|
|
|
|
* valid index has been found.
|
2013-07-03 20:24:09 +02:00
|
|
|
*/
|
|
|
|
list_free(indexlist);
|
|
|
|
|
|
|
|
/*
|
2014-05-06 18:12:18 +02:00
|
|
|
* The toast relation should have one valid index, so something is going
|
|
|
|
* wrong if there is nothing.
|
2013-07-03 20:24:09 +02:00
|
|
|
*/
|
|
|
|
if (!found)
|
2014-12-11 21:41:15 +01:00
|
|
|
elog(ERROR, "no valid index found for toast relation with Oid %u",
|
2013-07-03 20:24:09 +02:00
|
|
|
RelationGetRelid(toastrel));
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ----------
|
|
|
|
* toast_close_indexes
|
|
|
|
*
|
|
|
|
* Close an array of indexes for a toast relation and free it. This should
|
|
|
|
* be called for a set of indexes opened previously with toast_open_indexes.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock)
|
|
|
|
{
|
2014-05-06 18:12:18 +02:00
|
|
|
int i;
|
2013-07-03 20:24:09 +02:00
|
|
|
|
|
|
|
/* Close relations and clean up things */
|
|
|
|
for (i = 0; i < num_indexes; i++)
|
|
|
|
index_close(toastidxs[i], lock);
|
|
|
|
pfree(toastidxs);
|
|
|
|
}
|