postgresql/contrib/pgrowlocks/pgrowlocks.c
Alvaro Herrera e3ad3ffa68 Fix handling of multixacts predating pg_upgrade
After pg_upgrade, it is possible that some tuples' Xmax have multixacts
corresponding to the old installation; such multixacts cannot have
running members anymore.  In many code sites we already know not to read
them and clobber them silently, but at least when VACUUM tries to freeze
a multixact or determine whether one needs freezing, there's an attempt
to resolve it to its member transactions by calling GetMultiXactIdMembers,
and if the multixact value is "in the future" with regards to the
current valid multixact range, an error like this is raised:
    ERROR:  MultiXactId 123 has not been created yet -- apparent wraparound
and vacuuming fails.  Per discussion with Andrew Gierth, it is completely
bogus to try to resolve multixacts coming from before a pg_upgrade,
regardless of where they stand with regards to the current valid
multixact range.

It's possible to get out from under this problem by doing SELECT FOR UPDATE
of the problem tuples, but if tables are large, this is slow and
tedious, so a more thorough solution is desirable.

To fix, we realize that multixacts in xmax created in 9.2 and previous
have a specific bit pattern that is never used in 9.3 and later (we
already knew this, per comments and infomask tests sprinkled in various
places, but we weren't leveraging this knowledge appropriately).
Whenever the infomask of the tuple matches that bit pattern, we just
ignore the multixact completely as if Xmax wasn't set; or, in the case
of tuple freezing, we act as if an unwanted value is set and clobber it
without decoding.  This guarantees that no errors will be raised, and
that the values will be progressively removed until all tables are
clean.  Most callers of GetMultiXactIdMembers are patched to recognize
directly that the value is a removable "empty" multixact and avoid
calling GetMultiXactIdMembers altogether.

To avoid changing the signature of GetMultiXactIdMembers() in back
branches, we keep the "allow_old" boolean flag but rename it to
"from_pgupgrade"; if the flag is true, we always return an empty set
instead of looking up the multixact.  (I suppose we could remove the
argument in the master branch, but I chose not to do so in this commit).

This was broken all along, but the error-facing message appeared first
because of commit 8e9a16ab8f and was partially fixed in a25c2b7c4d.
This fix, backpatched all the way back to 9.3, goes approximately in the
same direction as a25c2b7c4d but should cover all cases.

Bug analysis by Andrew Gierth and Álvaro Herrera.

A number of public reports match this bug:
  https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net
  https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com
  https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk
  https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com
  https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-24 18:29:28 -04:00

293 lines
7.8 KiB
C

/*
* contrib/pgrowlocks/pgrowlocks.c
*
* Copyright (c) 2005-2006 Tatsuo Ishii
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose, without fee, and without a
* written agreement is hereby granted, provided that the above
* copyright notice and this paragraph and the following two
* paragraphs appear in all copies.
*
* IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
* INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
* LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
* DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
* IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
* SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*/
#include "postgres.h"
#include "access/multixact.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "catalog/namespace.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(pgrowlocks);
/* ----------
* pgrowlocks:
* returns tids of rows being locked
* ----------
*/
/*
 * Size, in bytes, of each scratch buffer used to format a single output
 * value (an xid, a lock mode name or a pid).  For multixacts the per-column
 * output buffers are sized as NCHARS times the number of members.
 */
#define NCHARS 32
/*
 * Per-query state kept in funcctx->user_fctx across calls of pgrowlocks().
 */
typedef struct
{
/* relation being examined; opened on the first call, closed when the scan ends */
Relation rel;
/* heap scan over rel, started on the first call */
HeapScanDesc scan;
/* number of attributes in the result tuple descriptor; sizes the values[] array */
int ncolumns;
} MyData;
/*
 * Indexes into the values[] array of C strings that pgrowlocks() hands to
 * BuildTupleFromCStrings(), one per output column.
 */
#define Atnum_tid 0
#define Atnum_xmax 1
#define Atnum_ismulti 2
#define Atnum_xids 3
#define Atnum_modes 4
#define Atnum_pids 5
/*
 * pgrowlocks
 *      Set-returning function that reports the locked rows of a table.
 *
 * Argument 0 is the (possibly schema-qualified) relation name as text.
 *
 * On the first call the relation is opened with AccessShareLock, the caller
 * is checked for SELECT privilege on it, and a heap scan using the active
 * snapshot is started.  That state is stashed in the multi-call memory
 * context.
 *
 * Each call then advances the scan until it finds a tuple for which
 * HeapTupleSatisfiesUpdate() reports HeapTupleBeingUpdated, and returns one
 * row for it: tid, raw xmax, whether xmax is a multixact, and the arrays of
 * locker xids, lock modes and backend pids (as text in "{...}" form).  When
 * the scan is exhausted the scan is ended, the relation is closed and the
 * result set is finished.
 *
 * Transaction ids are unsigned 32-bit values, so they are formatted with
 * "%u"; using "%d" would show xids >= 2^31 as negative numbers.
 */
Datum
pgrowlocks(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    HeapScanDesc scan;
    HeapTuple   tuple;
    TupleDesc   tupdesc;
    AttInMetadata *attinmeta;
    Datum       result;
    MyData     *mydata;
    Relation    rel;

    if (SRF_IS_FIRSTCALL())
    {
        text       *relname;
        RangeVar   *relrv;
        MemoryContext oldcontext;
        AclResult   aclresult;

        funcctx = SRF_FIRSTCALL_INIT();

        /* everything set up here must survive across calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Build a tuple descriptor for our result type */
        if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        attinmeta = TupleDescGetAttInMetadata(tupdesc);
        funcctx->attinmeta = attinmeta;

        relname = PG_GETARG_TEXT_P(0);
        relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
        rel = heap_openrv(relrv, AccessShareLock);

        /* check permissions: must have SELECT on table */
        aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(),
                                      ACL_SELECT);
        if (aclresult != ACLCHECK_OK)
            aclcheck_error(aclresult, ACL_KIND_CLASS,
                           RelationGetRelationName(rel));

        scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);

        mydata = palloc(sizeof(*mydata));
        mydata->rel = rel;
        mydata->scan = scan;
        mydata->ncolumns = tupdesc->natts;
        funcctx->user_fctx = mydata;

        MemoryContextSwitchTo(oldcontext);
    }

    funcctx = SRF_PERCALL_SETUP();
    attinmeta = funcctx->attinmeta;
    mydata = (MyData *) funcctx->user_fctx;
    scan = mydata->scan;

    /* scan the relation */
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        HTSU_Result htsu;
        TransactionId xmax;
        uint16      infomask;

        /* must hold a buffer lock to call HeapTupleSatisfiesUpdate */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        htsu = HeapTupleSatisfiesUpdate(tuple,
                                        GetCurrentCommandId(false),
                                        scan->rs_cbuf);
        xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
        infomask = tuple->t_data->t_infomask;

        /*
         * A tuple is locked if HTSU returns BeingUpdated.
         */
        if (htsu == HeapTupleBeingUpdated)
        {
            char      **values;

            values = (char **) palloc(mydata->ncolumns * sizeof(char *));

            values[Atnum_tid] = (char *) DirectFunctionCall1(tidout,
                                            PointerGetDatum(&tuple->t_self));

            values[Atnum_xmax] = palloc(NCHARS * sizeof(char));
            /* TransactionId is unsigned: print with %u, not %d */
            snprintf(values[Atnum_xmax], NCHARS, "%u", xmax);

            if (infomask & HEAP_XMAX_IS_MULTI)
            {
                MultiXactMember *members;
                int         nmembers;
                bool        first = true;
                bool        allow_old;

                values[Atnum_ismulti] = pstrdup("true");

                allow_old = HEAP_LOCKED_UPGRADED(infomask);
                nmembers = GetMultiXactIdMembers(xmax, &members, allow_old,
                                                 false);
                if (nmembers == -1)
                {
                    /* no members were returned; emit placeholder values */
                    values[Atnum_xids] = "{0}";
                    values[Atnum_modes] = "{transient upgrade status}";
                    values[Atnum_pids] = "{0}";
                }
                else
                {
                    int         j;

                    /* one NCHARS-sized slot per member in each column */
                    values[Atnum_xids] = palloc(NCHARS * nmembers);
                    values[Atnum_modes] = palloc(NCHARS * nmembers);
                    values[Atnum_pids] = palloc(NCHARS * nmembers);

                    strcpy(values[Atnum_xids], "{");
                    strcpy(values[Atnum_modes], "{");
                    strcpy(values[Atnum_pids], "{");

                    for (j = 0; j < nmembers; j++)
                    {
                        char        buf[NCHARS];

                        /* comma-separate every element after the first */
                        if (!first)
                        {
                            strcat(values[Atnum_xids], ",");
                            strcat(values[Atnum_modes], ",");
                            strcat(values[Atnum_pids], ",");
                        }

                        /* member xid, unsigned */
                        snprintf(buf, NCHARS, "%u", members[j].xid);
                        strcat(values[Atnum_xids], buf);

                        switch (members[j].status)
                        {
                            case MultiXactStatusUpdate:
                                snprintf(buf, NCHARS, "Update");
                                break;
                            case MultiXactStatusNoKeyUpdate:
                                snprintf(buf, NCHARS, "No Key Update");
                                break;
                            case MultiXactStatusForUpdate:
                                snprintf(buf, NCHARS, "For Update");
                                break;
                            case MultiXactStatusForNoKeyUpdate:
                                snprintf(buf, NCHARS, "For No Key Update");
                                break;
                            case MultiXactStatusForShare:
                                snprintf(buf, NCHARS, "Share");
                                break;
                            case MultiXactStatusForKeyShare:
                                snprintf(buf, NCHARS, "Key Share");
                                break;
                        }
                        strcat(values[Atnum_modes], buf);

                        /* pid is a plain int, so %d is correct here */
                        snprintf(buf, NCHARS, "%d",
                                 BackendXidGetPid(members[j].xid));
                        strcat(values[Atnum_pids], buf);

                        first = false;
                    }

                    strcat(values[Atnum_xids], "}");
                    strcat(values[Atnum_modes], "}");
                    strcat(values[Atnum_pids], "}");
                }
            }
            else
            {
                /* xmax is a single plain transaction id */
                values[Atnum_ismulti] = pstrdup("false");

                values[Atnum_xids] = palloc(NCHARS * sizeof(char));
                snprintf(values[Atnum_xids], NCHARS, "{%u}", xmax);

                values[Atnum_modes] = palloc(NCHARS);
                if (infomask & HEAP_XMAX_LOCK_ONLY)
                {
                    if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
                        snprintf(values[Atnum_modes], NCHARS, "{For Share}");
                    else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
                        snprintf(values[Atnum_modes], NCHARS, "{For Key Share}");
                    else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
                    {
                        if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
                            snprintf(values[Atnum_modes], NCHARS, "{For Update}");
                        else
                            snprintf(values[Atnum_modes], NCHARS, "{For No Key Update}");
                    }
                    else
                        /* neither keyshare nor exclusive bit is set */
                        snprintf(values[Atnum_modes], NCHARS,
                                 "{transient upgrade status}");
                }
                else
                {
                    if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
                        snprintf(values[Atnum_modes], NCHARS, "{Update}");
                    else
                        snprintf(values[Atnum_modes], NCHARS, "{No Key Update}");
                }

                values[Atnum_pids] = palloc(NCHARS * sizeof(char));
                snprintf(values[Atnum_pids], NCHARS, "{%d}",
                         BackendXidGetPid(xmax));
            }

            /* done reading the tuple header; release the buffer lock */
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

            /* build a tuple */
            tuple = BuildTupleFromCStrings(attinmeta, values);

            /* make the tuple into a datum */
            result = HeapTupleGetDatum(tuple);

            /*
             * no need to pfree what we allocated; it's on a short-lived
             * memory context anyway
             */

            SRF_RETURN_NEXT(funcctx, result);
        }
        else
        {
            /* not locked: drop the buffer lock and look at the next tuple */
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
        }
    }

    /* scan exhausted: release the scan and the relation */
    heap_endscan(scan);
    heap_close(mydata->rel, AccessShareLock);

    SRF_RETURN_DONE(funcctx);
}