postgresql/contrib/pgrowlocks/pgrowlocks.c
Andres Freund 5db6df0c01 tableam: Add tuple_{insert, delete, update, lock} and use.
This adds new, required, table AM callbacks for insert/delete/update
and lock_tuple. To be able to reasonably use those, the EvalPlanQual
mechanism had to be adapted, moving more logic into the AM.

Previously both delete/update/lock call-sites and the EPQ mechanism had
to have awareness of the specific tuple format to be able to fetch the
latest version of a tuple. Obviously that needs to be abstracted
away. To do so, move the logic that find the latest row version into
the AM. lock_tuple has a new flag argument,
TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last
version, rather than the current one.  It'd have been possible to do
so via a separate callback as well, but finding the last version
usually also necessitates locking the newest version, making it
sensible to combine the two. This replaces the previous use of
EvalPlanQualFetch().  Additionally HeapTupleUpdated, which previously
signaled either a concurrent update or delete, is now split into two,
to avoid callers needing AM specific knowledge to differentiate.

The move of finding the latest row version into tuple_lock means that
encountering a row concurrently moved into another partition will now
raise an error about "tuple to be locked" rather than "tuple to be
updated/deleted" - which is accurate, as that always happens when
locking rows. While possible slightly less helpful for users, it seems
like an acceptable trade-off.

As part of this commit HTSU_Result has been renamed to TM_Result, and
its members been expanded to differentiated between updating and
deleting. HeapUpdateFailureData has been renamed to TM_FailureData.

The interface to speculative insertion is changed so nodeModifyTable.c
does not have to set the speculative token itself anymore. Instead
there's a version of tuple_insert, tuple_insert_speculative, that
performs the speculative insertion (without requiring a flag to signal
that fact), and the speculative insertion is either made permanent
with table_complete_speculative(succeeded = true) or aborted with
succeeded = false).

Note that multi_insert is not yet routed through tableam, nor is
COPY. Changing multi_insert requires changes to copy.c that are large
enough to better be done separately.

Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED
VIEW are also only going to be adjusted in a later commit.

Author: Andres Freund and Haribabu Kommi
Discussion:
    https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
    https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de
    https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-23 19:55:57 -07:00

317 lines
8.6 KiB
C

/*
* contrib/pgrowlocks/pgrowlocks.c
*
* Copyright (c) 2005-2006 Tatsuo Ishii
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose, without fee, and without a
* written agreement is hereby granted, provided that the above
* copyright notice and this paragraph and the following two
* paragraphs appear in all copies.
*
* IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
* INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
* LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
* DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
* IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
* SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/namespace.h"
#include "catalog/pg_authid.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/varlena.h"
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(pgrowlocks);
/* ----------
* pgrowlocks:
* returns tids of rows being locked
* ----------
*/
#define NCHARS 32
typedef struct
{
Relation rel;
TableScanDesc scan;
int ncolumns;
} MyData;
#define Atnum_tid 0
#define Atnum_xmax 1
#define Atnum_ismulti 2
#define Atnum_xids 3
#define Atnum_modes 4
#define Atnum_pids 5
Datum
pgrowlocks(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
TableScanDesc scan;
HeapScanDesc hscan;
HeapTuple tuple;
TupleDesc tupdesc;
AttInMetadata *attinmeta;
Datum result;
MyData *mydata;
Relation rel;
if (SRF_IS_FIRSTCALL())
{
text *relname;
RangeVar *relrv;
MemoryContext oldcontext;
AclResult aclresult;
funcctx = SRF_FIRSTCALL_INIT();
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
attinmeta = TupleDescGetAttInMetadata(tupdesc);
funcctx->attinmeta = attinmeta;
relname = PG_GETARG_TEXT_PP(0);
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is a partitioned table",
RelationGetRelationName(rel)),
errdetail("Partitioned tables do not contain rows.")));
else if (rel->rd_rel->relkind != RELKIND_RELATION)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is not a table",
RelationGetRelationName(rel))));
/*
* check permissions: must have SELECT on table or be in
* pg_stat_scan_tables
*/
aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(),
ACL_SELECT);
if (aclresult != ACLCHECK_OK)
aclresult = is_member_of_role(GetUserId(), DEFAULT_ROLE_STAT_SCAN_TABLES) ? ACLCHECK_OK : ACLCHECK_NO_PRIV;
if (aclresult != ACLCHECK_OK)
aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind),
RelationGetRelationName(rel));
scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL);
hscan = (HeapScanDesc) scan;
mydata = palloc(sizeof(*mydata));
mydata->rel = rel;
mydata->scan = scan;
mydata->ncolumns = tupdesc->natts;
funcctx->user_fctx = mydata;
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
attinmeta = funcctx->attinmeta;
mydata = (MyData *) funcctx->user_fctx;
scan = mydata->scan;
hscan = (HeapScanDesc) scan;
/* scan the relation */
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
TM_Result htsu;
TransactionId xmax;
uint16 infomask;
/* must hold a buffer lock to call HeapTupleSatisfiesUpdate */
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
htsu = HeapTupleSatisfiesUpdate(tuple,
GetCurrentCommandId(false),
hscan->rs_cbuf);
xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
infomask = tuple->t_data->t_infomask;
/*
* A tuple is locked if HTSU returns BeingModified.
*/
if (htsu == TM_BeingModified)
{
char **values;
values = (char **) palloc(mydata->ncolumns * sizeof(char *));
values[Atnum_tid] = (char *) DirectFunctionCall1(tidout,
PointerGetDatum(&tuple->t_self));
values[Atnum_xmax] = palloc(NCHARS * sizeof(char));
snprintf(values[Atnum_xmax], NCHARS, "%d", xmax);
if (infomask & HEAP_XMAX_IS_MULTI)
{
MultiXactMember *members;
int nmembers;
bool first = true;
bool allow_old;
values[Atnum_ismulti] = pstrdup("true");
allow_old = HEAP_LOCKED_UPGRADED(infomask);
nmembers = GetMultiXactIdMembers(xmax, &members, allow_old,
false);
if (nmembers == -1)
{
values[Atnum_xids] = "{0}";
values[Atnum_modes] = "{transient upgrade status}";
values[Atnum_pids] = "{0}";
}
else
{
int j;
values[Atnum_xids] = palloc(NCHARS * nmembers);
values[Atnum_modes] = palloc(NCHARS * nmembers);
values[Atnum_pids] = palloc(NCHARS * nmembers);
strcpy(values[Atnum_xids], "{");
strcpy(values[Atnum_modes], "{");
strcpy(values[Atnum_pids], "{");
for (j = 0; j < nmembers; j++)
{
char buf[NCHARS];
if (!first)
{
strcat(values[Atnum_xids], ",");
strcat(values[Atnum_modes], ",");
strcat(values[Atnum_pids], ",");
}
snprintf(buf, NCHARS, "%d", members[j].xid);
strcat(values[Atnum_xids], buf);
switch (members[j].status)
{
case MultiXactStatusUpdate:
snprintf(buf, NCHARS, "Update");
break;
case MultiXactStatusNoKeyUpdate:
snprintf(buf, NCHARS, "No Key Update");
break;
case MultiXactStatusForUpdate:
snprintf(buf, NCHARS, "For Update");
break;
case MultiXactStatusForNoKeyUpdate:
snprintf(buf, NCHARS, "For No Key Update");
break;
case MultiXactStatusForShare:
snprintf(buf, NCHARS, "Share");
break;
case MultiXactStatusForKeyShare:
snprintf(buf, NCHARS, "Key Share");
break;
}
strcat(values[Atnum_modes], buf);
snprintf(buf, NCHARS, "%d",
BackendXidGetPid(members[j].xid));
strcat(values[Atnum_pids], buf);
first = false;
}
strcat(values[Atnum_xids], "}");
strcat(values[Atnum_modes], "}");
strcat(values[Atnum_pids], "}");
}
}
else
{
values[Atnum_ismulti] = pstrdup("false");
values[Atnum_xids] = palloc(NCHARS * sizeof(char));
snprintf(values[Atnum_xids], NCHARS, "{%d}", xmax);
values[Atnum_modes] = palloc(NCHARS);
if (infomask & HEAP_XMAX_LOCK_ONLY)
{
if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
snprintf(values[Atnum_modes], NCHARS, "{For Share}");
else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
snprintf(values[Atnum_modes], NCHARS, "{For Key Share}");
else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
{
if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
snprintf(values[Atnum_modes], NCHARS, "{For Update}");
else
snprintf(values[Atnum_modes], NCHARS, "{For No Key Update}");
}
else
/* neither keyshare nor exclusive bit it set */
snprintf(values[Atnum_modes], NCHARS,
"{transient upgrade status}");
}
else
{
if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
snprintf(values[Atnum_modes], NCHARS, "{Update}");
else
snprintf(values[Atnum_modes], NCHARS, "{No Key Update}");
}
values[Atnum_pids] = palloc(NCHARS * sizeof(char));
snprintf(values[Atnum_pids], NCHARS, "{%d}",
BackendXidGetPid(xmax));
}
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/* build a tuple */
tuple = BuildTupleFromCStrings(attinmeta, values);
/* make the tuple into a datum */
result = HeapTupleGetDatum(tuple);
/*
* no need to pfree what we allocated; it's on a short-lived
* memory context anyway
*/
SRF_RETURN_NEXT(funcctx, result);
}
else
{
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
}
}
table_endscan(scan);
table_close(mydata->rel, AccessShareLock);
SRF_RETURN_DONE(funcctx);
}