postgresql/src/backend/catalog/toasting.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

415 lines
12 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* toasting.c
* This file contains routines to support creation of toast tables
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/catalog/toasting.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
2019-01-15 00:54:18 +01:00
#include "access/heapam.h"
Allow configurable LZ4 TOAST compression. There is now a per-column COMPRESSION option which can be set to pglz (the default, and the only option in up until now) or lz4. Or, if you like, you can set the new default_toast_compression GUC to lz4, and then that will be the default for new table columns for which no value is specified. We don't have lz4 support in the PostgreSQL code, so to use lz4 compression, PostgreSQL must be built --with-lz4. In general, TOAST compression means compression of individual column values, not the whole tuple, and those values can either be compressed inline within the tuple or compressed and then stored externally in the TOAST table, so those properties also apply to this feature. Prior to this commit, a TOAST pointer has two unused bits as part of the va_extsize field, and a compessed datum has two unused bits as part of the va_rawsize field. These bits are unused because the length of a varlena is limited to 1GB; we now use them to indicate the compression type that was used. This means we only have bit space for 2 more built-in compresison types, but we could work around that problem, if necessary, by introducing a new vartag_external value for any further types we end up wanting to add. Hopefully, it won't be too important to offer a wide selection of algorithms here, since each one we add not only takes more coding but also adds a build dependency for every packager. Nevertheless, it seems worth doing at least this much, because LZ4 gets better compression than PGLZ with less CPU usage. It's possible for LZ4-compressed datums to leak into composite type values stored on disk, just as it is for PGLZ. It's also possible for LZ4-compressed attributes to be copied into a different table via SQL commands such as CREATE TABLE AS or INSERT .. SELECT. It would be expensive to force such values to be decompressed, so PostgreSQL has never done so. 
For the same reasons, we also don't force recompression of already-compressed values even if the target table prefers a different compression method than was used for the source data. These architectural decisions are perhaps arguable but revisiting them is well beyond the scope of what seemed possible to do as part of this project. However, it's relatively cheap to recompress as part of VACUUM FULL or CLUSTER, so this commit adjusts those commands to do so, if the configured compression method of the table happens not to match what was used for some column value stored therein. Dilip Kumar. The original patches on which this work was based were written by Ildus Kurbangaliev, and those were patches were based on even earlier work by Nikita Glukhov, but the design has since changed very substantially, since allow a potentially large number of compression methods that could be added and dropped on a running system proved too problematic given some of the architectural issues mentioned above; the choice of which specific compression method to add first is now different; and a lot of the code has been heavily refactored. More recently, Justin Przyby helped quite a bit with testing and reviewing and this version also includes some code contributions from him. Other design input and review from Tomas Vondra, Álvaro Herrera, Andres Freund, Oleg Bartunov, Alexander Korotkov, and me. Discussion: http://postgr.es/m/20170907194236.4cefce96%40wp.localdomain Discussion: http://postgr.es/m/CAFiTN-uUpX3ck%3DK0mLEk-G_kUQY%3DSNOTeqdaNRR9FMdQrHKebw%40mail.gmail.com
2021-03-19 20:10:38 +01:00
#include "access/toast_compression.h"
#include "access/xact.h"
#include "catalog/binary_upgrade.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/namespace.h"
#include "catalog/pg_am.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_type.h"
#include "catalog/toasting.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/*
 * Prototypes for the file-private helpers defined further down.
 */

/* Opens relOid with lockmode and hands it to create_toast_table. */
static void CheckAndCreateToastTable(Oid relOid, Datum reloptions,
LOCKMODE lockmode, bool check,
Oid OIDOldToast);
/* Does the actual work; returns true only if a toast table was created. */
static bool create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid,
Datum reloptions, LOCKMODE lockmode, bool check,
Oid OIDOldToast);
/* Decides whether rel should get a toast table at all. */
static bool needs_toast_table(Relation rel);
/*
* CreateToastTable variants
* If the table needs a toast table, and doesn't already have one,
* then create a toast table for it.
*
* reloptions for the toast table can be passed, too. Pass (Datum) 0
* for default reloptions.
*
* We expect the caller to have verified that the relation is a table and have
* already done any necessary permission checks. Callers expect this function
* to end with CommandCounterIncrement if it makes any changes.
*/
void
AlterTableCreateToastTable(Oid relOid, Datum reloptions, LOCKMODE lockmode)
{
	/*
	 * This variant asks for the lock-strength cross-check and has no old
	 * toast table OID to pass along.
	 */
	const bool	verify_lockmode = true;

	CheckAndCreateToastTable(relOid, reloptions, lockmode,
							 verify_lockmode, InvalidOid);
}
void
NewHeapCreateToastTable(Oid relOid, Datum reloptions, LOCKMODE lockmode,
						Oid OIDOldToast)
{
	/*
	 * This variant skips the lock-strength cross-check and forwards the
	 * caller-supplied OIDOldToast untouched.
	 */
	const bool	verify_lockmode = false;

	CheckAndCreateToastTable(relOid, reloptions, lockmode,
							 verify_lockmode, OIDOldToast);
}
void
NewRelationCreateToastTable(Oid relOid, Datum reloptions)
{
	/*
	 * This variant always opens the relation with AccessExclusiveLock, skips
	 * the lock-strength cross-check, and has no old toast table OID.
	 */
	const bool	verify_lockmode = false;

	CheckAndCreateToastTable(relOid, reloptions, AccessExclusiveLock,
							 verify_lockmode, InvalidOid);
}
/*
 * CheckAndCreateToastTable
 *		Common body of the CreateToastTable variants.
 *
 * Opens relOid with the given lockmode, lets create_toast_table decide
 * whether anything needs doing (its result is deliberately ignored), and
 * closes the relation again without releasing the lock.
 */
static void
CheckAndCreateToastTable(Oid relOid, Datum reloptions, LOCKMODE lockmode,
						 bool check, Oid OIDOldToast)
{
	Relation	rel = table_open(relOid, lockmode);

	/* No hand-assigned OIDs here: both toast OIDs are InvalidOid. */
	(void) create_toast_table(rel,
							  InvalidOid,
							  InvalidOid,
							  reloptions,
							  lockmode,
							  check,
							  OIDOldToast);

	table_close(rel, NoLock);
}
/*
 * BootstrapToastTable
 *		Give the named relation a toast table while bootstrapping.
 *
 * Unlike the other entry points, the caller prespecifies the OIDs to use
 * for the toast table (toastOid) and for its index (toastIndexOid).
 *
 * The relation is opened with AccessExclusiveLock, and the lock is kept
 * when the relation is closed.  An error is raised if the relation is
 * neither a plain table nor a materialized view, or if create_toast_table
 * reports that it did not create a toast table.
 */
void
BootstrapToastTable(char *relName, Oid toastOid, Oid toastIndexOid)
{
	Relation	rel = table_openrv(makeRangeVar(NULL, relName, -1),
								   AccessExclusiveLock);
	char		relkind = rel->rd_rel->relkind;

	/* Only plain tables and materialized views are acceptable. */
	if (!(relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW))
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a table or materialized view",
						relName)));

	/* The workhorse does everything; a false result is unexpected here. */
	if (!create_toast_table(rel,
							toastOid,
							toastIndexOid,
							(Datum) 0,
							AccessExclusiveLock,
							false,
							InvalidOid))
		elog(ERROR, "\"%s\" does not require a toast table",
			 relName);

	table_close(rel, NoLock);
}
/*
 * create_toast_table --- internal workhorse
 *
 * Creates the toast relation "pg_toast_<relOid>" together with its unique
 * btree index "pg_toast_<relOid>_index" on (chunk_id, chunk_seq), records
 * the new relation's OID in rel's pg_class row, and (outside bootstrap
 * mode) registers an internal dependency of the toast table on rel.
 *
 * rel is already opened and locked
 * toastOid and toastIndexOid are normally InvalidOid, but during
 * bootstrap they can be nonzero to specify hand-assigned OIDs
 * reloptions is passed through to heap_create_with_catalog for the toast
 * table
 * lockmode is only examined when check is true; in that case anything
 * other than AccessExclusiveLock raises an error
 * OIDOldToast is passed through to heap_create_with_catalog unchanged
 *
 * Returns false, without changing anything, if rel already has a toast
 * table, if it is judged not to need one, or (in binary-upgrade mode) if
 * no toast table OID has been provided.  Returns true after creating the
 * toast table; in that case the function ends with
 * CommandCounterIncrement.
 */
static bool
create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid,
				   Datum reloptions, LOCKMODE lockmode, bool check,
				   Oid OIDOldToast)
{
	Oid			relOid = RelationGetRelid(rel);
	HeapTuple	reltup;
	TupleDesc	tupdesc;
	bool		shared_relation;
	bool		mapped_relation;
	Relation	toast_rel;
	Relation	class_rel;
	Oid			toast_relid;
	Oid			namespaceid;
	char		toast_relname[NAMEDATALEN];
	char		toast_idxname[NAMEDATALEN];
	IndexInfo  *indexInfo;
	Oid			collationObjectId[2];
	Oid			classObjectId[2];
	int16		coloptions[2];
	ObjectAddress baseobject,
				toastobject;

	/*
	 * Is it already toasted?
	 */
	if (rel->rd_rel->reltoastrelid != InvalidOid)
		return false;

	/*
	 * Check to see whether the table actually needs a TOAST table.
	 */
	if (!IsBinaryUpgrade)
	{
		/* Normal mode, normal check */
		if (!needs_toast_table(rel))
			return false;
	}
	else
	{
		/*
		 * In binary-upgrade mode, create a TOAST table if and only if
		 * pg_upgrade told us to (ie, a TOAST table OID has been provided).
		 *
		 * This indicates that the old cluster had a TOAST table for the
		 * current table.  We must create a TOAST table to receive the old
		 * TOAST file, even if the table seems not to need one.
		 *
		 * Contrariwise, if the old cluster did not have a TOAST table, we
		 * should be able to get along without one even if the new version's
		 * needs_toast_table rules suggest we should have one.  There is a lot
		 * of daylight between where we will create a TOAST table and where
		 * one is really necessary to avoid failures, so small cross-version
		 * differences in the when-to-create heuristic shouldn't be a problem.
		 * If we tried to create a TOAST table anyway, we would have the
		 * problem that it might take up an OID that will conflict with some
		 * old-cluster table we haven't seen yet.
		 */
		if (!OidIsValid(binary_upgrade_next_toast_pg_class_oid))
			return false;
	}

	/*
	 * If requested check lockmode is sufficient. This is a cross check in
	 * case of errors or conflicting decisions in earlier code.
	 */
	if (check && lockmode != AccessExclusiveLock)
		elog(ERROR, "AccessExclusiveLock required to add toast table.");

	/*
	 * Create the toast table and its index
	 */
	snprintf(toast_relname, sizeof(toast_relname),
			 "pg_toast_%u", relOid);
	snprintf(toast_idxname, sizeof(toast_idxname),
			 "pg_toast_%u_index", relOid);

	/* this is pretty painful... need a tuple descriptor */
	tupdesc = CreateTemplateTupleDesc(3);
	TupleDescInitEntry(tupdesc, (AttrNumber) 1,
					   "chunk_id",
					   OIDOID,
					   -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 2,
					   "chunk_seq",
					   INT4OID,
					   -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 3,
					   "chunk_data",
					   BYTEAOID,
					   -1, 0);

	/*
	 * Ensure that the toast table doesn't itself get toasted, or we'll be
	 * toast :-(.  This is essential for chunk_data because type bytea is
	 * toastable; hit the other two just to be sure.
	 */
	TupleDescAttr(tupdesc, 0)->attstorage = TYPSTORAGE_PLAIN;
	TupleDescAttr(tupdesc, 1)->attstorage = TYPSTORAGE_PLAIN;
	TupleDescAttr(tupdesc, 2)->attstorage = TYPSTORAGE_PLAIN;

	/* Toast field should not be compressed */
	TupleDescAttr(tupdesc, 0)->attcompression = InvalidCompressionMethod;
	TupleDescAttr(tupdesc, 1)->attcompression = InvalidCompressionMethod;
	TupleDescAttr(tupdesc, 2)->attcompression = InvalidCompressionMethod;

	/*
	 * Toast tables for regular relations go in pg_toast; those for temp
	 * relations go into the per-backend temp-toast-table namespace.
	 */
	if (isTempOrTempToastNamespace(rel->rd_rel->relnamespace))
		namespaceid = GetTempToastNamespace();
	else
		namespaceid = PG_TOAST_NAMESPACE;

	/* Toast table is shared if and only if its parent is. */
	shared_relation = rel->rd_rel->relisshared;

	/* It's mapped if and only if its parent is, too */
	mapped_relation = RelationIsMapped(rel);

	/* Tablespace, owner and persistence are all copied from the parent. */
	toast_relid = heap_create_with_catalog(toast_relname,
										   namespaceid,
										   rel->rd_rel->reltablespace,
										   toastOid,
										   InvalidOid,
										   InvalidOid,
										   rel->rd_rel->relowner,
										   table_relation_toast_am(rel),
										   tupdesc,
										   NIL,
										   RELKIND_TOASTVALUE,
										   rel->rd_rel->relpersistence,
										   shared_relation,
										   mapped_relation,
										   ONCOMMIT_NOOP,
										   reloptions,
										   false,
										   true,
										   true,
										   OIDOldToast,
										   NULL);
	Assert(toast_relid != InvalidOid);

	/* make the toast relation visible, else table_open will fail */
	CommandCounterIncrement();

	/* ShareLock is not really needed here, but take it anyway */
	toast_rel = table_open(toast_relid, ShareLock);

	/*
	 * Create unique index on chunk_id, chunk_seq.
	 *
	 * NOTE: the normal TOAST access routines could actually function with a
	 * single-column index on chunk_id only. However, the slice access
	 * routines use both columns for faster access to an individual chunk. In
	 * addition, we want it to be unique as a check against the possibility of
	 * duplicate TOAST chunk OIDs. The index might also be a little more
	 * efficient this way, since btree isn't all that happy with large numbers
	 * of equal keys.
	 */

	indexInfo = makeNode(IndexInfo);
	indexInfo->ii_NumIndexAttrs = 2;
	indexInfo->ii_NumIndexKeyAttrs = 2;
	indexInfo->ii_IndexAttrNumbers[0] = 1;
	indexInfo->ii_IndexAttrNumbers[1] = 2;
	indexInfo->ii_Expressions = NIL;
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_Predicate = NIL;
	indexInfo->ii_PredicateState = NULL;
	indexInfo->ii_ExclusionOps = NULL;
	indexInfo->ii_ExclusionProcs = NULL;
	indexInfo->ii_ExclusionStrats = NULL;
	indexInfo->ii_OpclassOptions = NULL;
	indexInfo->ii_Unique = true;
	indexInfo->ii_ReadyForInserts = true;
	indexInfo->ii_Concurrent = false;
	indexInfo->ii_BrokenHotChain = false;
	indexInfo->ii_ParallelWorkers = 0;
	indexInfo->ii_Am = BTREE_AM_OID;
	indexInfo->ii_AmCache = NULL;
	indexInfo->ii_Context = CurrentMemoryContext;

	collationObjectId[0] = InvalidOid;
	collationObjectId[1] = InvalidOid;

	classObjectId[0] = OID_BTREE_OPS_OID;
	classObjectId[1] = INT4_BTREE_OPS_OID;

	coloptions[0] = 0;
	coloptions[1] = 0;

	index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid,
				 InvalidOid, InvalidOid,
				 indexInfo,
				 list_make2("chunk_id", "chunk_seq"),
				 BTREE_AM_OID,
				 rel->rd_rel->reltablespace,
				 collationObjectId, classObjectId, coloptions, (Datum) 0,
				 INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL);

	table_close(toast_rel, NoLock);

	/*
	 * Store the toast table's OID in the parent relation's pg_class row
	 */
	class_rel = table_open(RelationRelationId, RowExclusiveLock);

	/* Work on a private copy of the tuple so it can be modified and freed. */
	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid));
	if (!HeapTupleIsValid(reltup))
		elog(ERROR, "cache lookup failed for relation %u", relOid);

	((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid;

	if (!IsBootstrapProcessingMode())
	{
		/* normal case, use a transactional update */
		CatalogTupleUpdate(class_rel, &reltup->t_self, reltup);
	}
	else
	{
		/* While bootstrapping, we cannot UPDATE, so overwrite in-place */
		heap_inplace_update(class_rel, reltup);
	}

	heap_freetuple(reltup);

	table_close(class_rel, RowExclusiveLock);

	/*
	 * Register dependency from the toast table to the main, so that the toast
	 * table will be deleted if the main is.  Skip this in bootstrap mode.
	 */
	if (!IsBootstrapProcessingMode())
	{
		baseobject.classId = RelationRelationId;
		baseobject.objectId = relOid;
		baseobject.objectSubId = 0;
		toastobject.classId = RelationRelationId;
		toastobject.objectId = toast_relid;
		toastobject.objectSubId = 0;

		recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
	}

	/*
	 * Make changes visible
	 */
	CommandCounterIncrement();

	return true;
}
/*
 * needs_toast_table
 *		Decide whether rel should be given a TOAST table.
 *
 * A few cases are refused outright; everything else is left to the
 * relation's table access method.
 */
static bool
needs_toast_table(Relation rel)
{
	bool		past_bootstrap = !IsBootstrapProcessingMode();

	/* Partitioned tables never get a TOAST table. */
	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
		return false;

	/*
	 * Once initdb is over, a shared relation cannot be toasted, because
	 * there's no way to mark it toasted in other databases' pg_class.
	 */
	if (past_bootstrap && rel->rd_rel->relisshared)
		return false;

	/*
	 * Likewise, catalog tables are left alone after initdb: which catalogs
	 * get toast tables is explicitly chosen in catalog/pg_*.h.  (Some ALTER
	 * TABLE command could bring us here for a catalog that has no toast
	 * table.)
	 */
	if (past_bootstrap && IsCatalogRelation(rel))
		return false;

	/* No special case applies, so the AM makes the call. */
	return table_relation_needs_toast_table(rel);
}