Modify the isolation tester so that multiple sessions can wait.

This allows testing of deadlock scenarios.  Scenarios that would
previously have been considered invalid are now simply taken as a
scenario in which more than one backend will wait.
This commit is contained in:
Robert Haas 2016-02-11 08:36:30 -05:00
parent c9882c60f4
commit 38f8bdcac4
2 changed files with 200 additions and 151 deletions

View File

@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken
(most likely, some other session runs a step that unblocks it or causes a (most likely, some other session runs a step that unblocks it or causes a
deadlock). A test that uses this ability must manually specify valid deadlock). A test that uses this ability must manually specify valid
permutations, i.e. those that would not expect a blocked session to execute a permutations, i.e. those that would not expect a blocked session to execute a
command. If the test fails to follow that rule, the test is aborted. command. If a test fails to follow that rule, isolationtester will cancel it
after 60 seconds. If the cancel doesn't work, isolationtester will exit
Currently, at most one step can be waiting at a time. As long as one uncleanly after a total of 75 seconds of wait time. Testing invalid
step is waiting, subsequent steps are run to completion synchronously. permutations should be avoided because they can make the isolation tests take
a very long time to run, and they serve no useful testing purpose.
Note that isolationtester recognizes that a command has blocked by looking Note that isolationtester recognizes that a command has blocked by looking
to see if it is shown as waiting in the pg_locks view; therefore, only to see if it is shown as waiting in the pg_locks view; therefore, only

View File

@ -15,6 +15,7 @@
#include <sys/select.h> #include <sys/select.h>
#endif #endif
#include "datatype/timestamp.h"
#include "libpq-fe.h" #include "libpq-fe.h"
#include "pqexpbuffer.h" #include "pqexpbuffer.h"
#include "pg_getopt.h" #include "pg_getopt.h"
@ -237,54 +238,6 @@ main(int argc, char **argv)
appendPQExpBufferStr(&wait_query, appendPQExpBufferStr(&wait_query,
") " ") "
"AND holder.mode = ANY (CASE waiter.mode "
"WHEN 'AccessShareLock' THEN ARRAY["
"'AccessExclusiveLock'] "
"WHEN 'RowShareLock' THEN ARRAY["
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'RowExclusiveLock' THEN ARRAY["
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareUpdateExclusiveLock' THEN ARRAY["
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareLock' THEN ARRAY["
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareRowExclusiveLock' THEN ARRAY["
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ExclusiveLock' THEN ARRAY["
"'RowShareLock',"
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'AccessExclusiveLock' THEN ARRAY["
"'AccessShareLock',"
"'RowShareLock',"
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] END) "
"AND holder.locktype IS NOT DISTINCT FROM waiter.locktype " "AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
"AND holder.database IS NOT DISTINCT FROM waiter.database " "AND holder.database IS NOT DISTINCT FROM waiter.database "
"AND holder.relation IS NOT DISTINCT FROM waiter.relation " "AND holder.relation IS NOT DISTINCT FROM waiter.relation "
@ -467,34 +420,48 @@ report_error_message(Step *step)
} }
/* /*
* As above, but reports messages possibly emitted by two steps. This is * As above, but reports messages possibly emitted by multiple steps. This is
* useful when we have a blocked command awakened by another one; we want to * useful when we have a blocked command awakened by another one; we want to
* report both messages identically, for the case where we don't care which * report all messages identically, for the case where we don't care which
* one fails due to a timeout such as deadlock timeout. * one fails due to a timeout such as deadlock timeout.
*/ */
static void static void
report_two_error_messages(Step *step1, Step *step2) report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
{ {
char *prefix; PQExpBufferData buffer;
int n;
prefix = psprintf("%s %s", step1->name, step2->name); if (nextra == 0)
if (step1->errormsg)
{ {
fprintf(stdout, "error in steps %s: %s\n", prefix, report_error_message(step);
step1->errormsg); return;
free(step1->errormsg);
step1->errormsg = NULL;
}
if (step2->errormsg)
{
fprintf(stdout, "error in steps %s: %s\n", prefix,
step2->errormsg);
free(step2->errormsg);
step2->errormsg = NULL;
} }
free(prefix); initPQExpBuffer(&buffer);
appendPQExpBufferStr(&buffer, step->name);
for (n = 0; n < nextra; ++n)
appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
if (step->errormsg)
{
fprintf(stdout, "error in steps %s: %s\n", buffer.data,
step->errormsg);
free(step->errormsg);
step->errormsg = NULL;
}
for (n = 0; n < nextra; ++n)
{
if (extrastep[n]->errormsg == NULL)
continue;
fprintf(stdout, "error in steps %s: %s\n",
buffer.data, extrastep[n]->errormsg);
free(extrastep[n]->errormsg);
extrastep[n]->errormsg = NULL;
}
termPQExpBuffer(&buffer);
} }
/* /*
@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
{ {
PGresult *res; PGresult *res;
int i; int i;
Step *waiting = NULL; int w;
int nwaiting = 0;
int nerrorstep = 0;
Step **waiting;
Step **errorstep;
waiting = malloc(sizeof(Step *) * testspec->nsessions);
errorstep = malloc(sizeof(Step *) * testspec->nsessions);
/* /*
* In dry run mode, just display the permutation in the same format used * In dry run mode, just display the permutation in the same format used
@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
for (i = 0; i < nsteps; i++) for (i = 0; i < nsteps; i++)
{ {
Step *step = steps[i]; Step *step = steps[i];
Step *oldstep = NULL;
PGconn *conn = conns[1 + step->session]; PGconn *conn = conns[1 + step->session];
bool mustwait;
if (waiting != NULL && step->session == waiting->session)
{
PGcancel *cancel;
PGresult *res;
int j;
/* /*
* This permutation is invalid: it can never happen in real life. * Check whether the session that needs to perform the next step
* is still blocked on an earlier step. If so, wait for it to finish.
* *
* A session is blocked on an earlier step (waiting) and no * In older versions of this tool, when allowed precisely one session
* further steps from this session can run until it is unblocked, * to be waiting at a time. If we reached a step which required that
* but it can only be unblocked by running steps from other * session to execute the next command, we would declare the whole
* sessions. * permutation invalid, cancel everything, and move on to the next one.
* Unfortunately, that made it impossible to test the deadlock detector
* using this framework unless the numebr of processes involved in the
* deadlock was precisely two. We now assume that if we reach a step
* that is still blocked, we need to wait for it to unblock itself.
*/ */
fprintf(stderr, "invalid permutation detected\n"); for (w = 0; w < nwaiting; ++w)
/* Cancel the waiting statement from this session. */
cancel = PQgetCancel(conn);
if (cancel != NULL)
{ {
char buf[256]; if (step->session == waiting[w]->session)
{
if (!PQcancel(cancel, buf, sizeof(buf))) oldstep = waiting[w];
fprintf(stderr, "PQcancel failed: %s\n", buf); break;
/* Be sure to consume the error message. */
while ((res = PQgetResult(conn)) != NULL)
PQclear(res);
PQfreeCancel(cancel);
} }
}
if (oldstep != NULL)
{
/* Wait for previous step on this connection. */
try_complete_step(oldstep, STEP_RETRY);
/* /*
* Now we really have to complete all the running transactions to * Attempt to complete any steps that were previously waiting.
* make sure teardown doesn't block. * Remove any that have completed, and also remove the one for
* which we just waited.
*/ */
for (j = 1; j < nconns; j++) w = 0;
nerrorstep = 0;
while (w < nwaiting)
{ {
res = PQexec(conns[j], "ROLLBACK"); if (waiting[w] == oldstep)
if (res != NULL) {
PQclear(res); /* We just waited for this; remove it. */
if (w + 1 < nwaiting)
memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
continue;
} }
goto teardown; if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
{
/* Still blocked on a lock, leave it alone. */
w++;
}
else
{
/* This one finished, too! */
errorstep[nerrorstep++] = waiting[w];
if (w + 1 < nwaiting)
memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
}
} }
/* Report all errors together. */
report_multiple_error_messages(oldstep, nerrorstep, errorstep);
}
/* Send the query for this step. */
if (!PQsendQuery(conn, step->sql)) if (!PQsendQuery(conn, step->sql))
{ {
fprintf(stdout, "failed to send query for step %s: %s\n", fprintf(stdout, "failed to send query for step %s: %s\n",
@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
exit_nicely(); exit_nicely();
} }
if (waiting != NULL) /* Try to complete this step without blocking. */
{ mustwait = try_complete_step(step, STEP_NONBLOCK);
/* Some other step is already waiting: just block. */
try_complete_step(step, 0);
/* /* Attempt to complete any steps that were previously waiting. */
* See if this step unblocked the waiting step; report both error w = 0;
* messages together if so. nerrorstep = 0;
*/ while (w < nwaiting)
if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY))
{ {
report_two_error_messages(step, waiting); if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
waiting = NULL; w++;
}
else
report_error_message(step);
}
else else
{ {
if (try_complete_step(step, STEP_NONBLOCK)) errorstep[nerrorstep++] = waiting[w];
waiting = step; if (w + 1 < nwaiting)
report_error_message(step); memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
} }
} }
/* Report any error from this step, and any steps that it unblocked. */
report_multiple_error_messages(step, nerrorstep, errorstep);
/* If this step is waiting, add it to the array of waiters. */
if (mustwait)
waiting[nwaiting++] = step;
}
/* Finish any waiting query. */ /* Finish any waiting query. */
if (waiting != NULL) for (w = 0; w < nwaiting; ++w)
{ {
try_complete_step(waiting, STEP_RETRY); try_complete_step(waiting[w], STEP_RETRY);
report_error_message(waiting); report_error_message(waiting[w]);
} }
teardown:
/* Perform per-session teardown */ /* Perform per-session teardown */
for (i = 0; i < testspec->nsessions; i++) for (i = 0; i < testspec->nsessions; i++)
{ {
@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags)
{ {
PGconn *conn = conns[1 + step->session]; PGconn *conn = conns[1 + step->session];
fd_set read_set; fd_set read_set;
struct timeval start_time;
struct timeval timeout; struct timeval timeout;
int sock = PQsocket(conn); int sock = PQsocket(conn);
int ret; int ret;
PGresult *res; PGresult *res;
bool canceled = false;
gettimeofday(&start_time, NULL);
FD_ZERO(&read_set); FD_ZERO(&read_set);
while ((flags & STEP_NONBLOCK) && PQisBusy(conn)) while (PQisBusy(conn))
{ {
FD_SET(sock, &read_set); FD_SET(sock, &read_set);
timeout.tv_sec = 0; timeout.tv_sec = 0;
@ -736,6 +737,12 @@ try_complete_step(Step *step, int flags)
exit_nicely(); exit_nicely();
} }
else if (ret == 0) /* select() timeout: check for lock wait */ else if (ret == 0) /* select() timeout: check for lock wait */
{
struct timeval current_time;
int64 td;
/* If it's OK for the step to block, check whether it has. */
if (flags & STEP_NONBLOCK)
{ {
int ntuples; int ntuples;
@ -758,7 +765,48 @@ try_complete_step(Step *step, int flags)
step->name, step->sql); step->name, step->sql);
return true; return true;
} }
/* else, not waiting: give it more time */ /* else, not waiting */
}
/* Figure out how long we've been waiting for this step. */
gettimeofday(&current_time, NULL);
td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
td *= USECS_PER_SEC;
td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
/*
* After 60 seconds, try to cancel the query.
*
* If the user tries to test an invalid permutation, we don't
* want to hang forever, especially when this is running in the
* buildfarm. So try to cancel it after a minute. This will
* presumably lead to this permutation failing, but remaining
* permutations and tests should still be OK.
*/
if (td > 60 * USECS_PER_SEC && !canceled)
{
PGcancel *cancel = PQgetCancel(conn);
char buf[256];
if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf)))
fprintf(stderr, "PQcancel failed: %s\n", buf);
if (cancel != NULL)
PQfreeCancel(cancel);
}
/*
* After 75 seconds, just give up and die.
*
* Since cleanup steps won't be run in this case, this may cause
* later tests to fail. That stinks, but it's better than waiting
* forever for the server to respond to the cancel.
*/
if (td > 75 * USECS_PER_SEC)
{
fprintf(stderr, "step %s timed out after 75 seconds\n",
step->name);
exit_nicely();
}
} }
else if (!PQconsumeInput(conn)) /* select(): data available */ else if (!PQconsumeInput(conn)) /* select(): data available */
{ {