Modify the isolation tester so that multiple sessions can wait.

This allows testing of deadlock scenarios.  Scenarios that would
previously have been considered invalid are now simply taken as a
scenario in which more than one backend will wait.
This commit is contained in:
Robert Haas 2016-02-11 08:36:30 -05:00
parent c9882c60f4
commit 38f8bdcac4
2 changed files with 200 additions and 151 deletions

View File

@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken
(most likely, some other session runs a step that unblocks it or causes a
deadlock). A test that uses this ability must manually specify valid
permutations, i.e. those that would not expect a blocked session to execute a
command. If the test fails to follow that rule, the test is aborted.
Currently, at most one step can be waiting at a time. As long as one
step is waiting, subsequent steps are run to completion synchronously.
command. If a test fails to follow that rule, isolationtester will cancel it
after 60 seconds. If the cancel doesn't work, isolationtester will exit
uncleanly after a total of 75 seconds of wait time. Testing invalid
permutations should be avoided because they can make the isolation tests take
a very long time to run, and they serve no useful testing purpose.
Note that isolationtester recognizes that a command has blocked by looking
to see if it is shown as waiting in the pg_locks view; therefore, only

View File

@ -15,6 +15,7 @@
#include <sys/select.h>
#endif
#include "datatype/timestamp.h"
#include "libpq-fe.h"
#include "pqexpbuffer.h"
#include "pg_getopt.h"
@ -237,54 +238,6 @@ main(int argc, char **argv)
appendPQExpBufferStr(&wait_query,
") "
"AND holder.mode = ANY (CASE waiter.mode "
"WHEN 'AccessShareLock' THEN ARRAY["
"'AccessExclusiveLock'] "
"WHEN 'RowShareLock' THEN ARRAY["
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'RowExclusiveLock' THEN ARRAY["
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareUpdateExclusiveLock' THEN ARRAY["
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareLock' THEN ARRAY["
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ShareRowExclusiveLock' THEN ARRAY["
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'ExclusiveLock' THEN ARRAY["
"'RowShareLock',"
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] "
"WHEN 'AccessExclusiveLock' THEN ARRAY["
"'AccessShareLock',"
"'RowShareLock',"
"'RowExclusiveLock',"
"'ShareUpdateExclusiveLock',"
"'ShareLock',"
"'ShareRowExclusiveLock',"
"'ExclusiveLock',"
"'AccessExclusiveLock'] END) "
"AND holder.locktype IS NOT DISTINCT FROM waiter.locktype "
"AND holder.database IS NOT DISTINCT FROM waiter.database "
"AND holder.relation IS NOT DISTINCT FROM waiter.relation "
@ -467,34 +420,48 @@ report_error_message(Step *step)
}
/*
* As above, but reports messages possibly emitted by two steps. This is
* As above, but reports messages possibly emitted by multiple steps. This is
* useful when we have a blocked command awakened by another one; we want to
* report both messages identically, for the case where we don't care which
* report all messages identically, for the case where we don't care which
* one fails due to a timeout such as deadlock timeout.
*/
static void
report_two_error_messages(Step *step1, Step *step2)
report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
{
char *prefix;
PQExpBufferData buffer;
int n;
prefix = psprintf("%s %s", step1->name, step2->name);
if (step1->errormsg)
if (nextra == 0)
{
fprintf(stdout, "error in steps %s: %s\n", prefix,
step1->errormsg);
free(step1->errormsg);
step1->errormsg = NULL;
}
if (step2->errormsg)
{
fprintf(stdout, "error in steps %s: %s\n", prefix,
step2->errormsg);
free(step2->errormsg);
step2->errormsg = NULL;
report_error_message(step);
return;
}
free(prefix);
initPQExpBuffer(&buffer);
appendPQExpBufferStr(&buffer, step->name);
for (n = 0; n < nextra; ++n)
appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
if (step->errormsg)
{
fprintf(stdout, "error in steps %s: %s\n", buffer.data,
step->errormsg);
free(step->errormsg);
step->errormsg = NULL;
}
for (n = 0; n < nextra; ++n)
{
if (extrastep[n]->errormsg == NULL)
continue;
fprintf(stdout, "error in steps %s: %s\n",
buffer.data, extrastep[n]->errormsg);
free(extrastep[n]->errormsg);
extrastep[n]->errormsg = NULL;
}
termPQExpBuffer(&buffer);
}
/*
@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
{
PGresult *res;
int i;
Step *waiting = NULL;
int w;
int nwaiting = 0;
int nerrorstep = 0;
Step **waiting;
Step **errorstep;
waiting = malloc(sizeof(Step *) * testspec->nsessions);
errorstep = malloc(sizeof(Step *) * testspec->nsessions);
/*
* In dry run mode, just display the permutation in the same format used
@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
for (i = 0; i < nsteps; i++)
{
Step *step = steps[i];
Step *oldstep = NULL;
PGconn *conn = conns[1 + step->session];
bool mustwait;
if (waiting != NULL && step->session == waiting->session)
/*
* Check whether the session that needs to perform the next step
* is still blocked on an earlier step. If so, wait for it to finish.
*
* In older versions of this tool, when allowed precisely one session
* to be waiting at a time. If we reached a step which required that
* session to execute the next command, we would declare the whole
* permutation invalid, cancel everything, and move on to the next one.
* Unfortunately, that made it impossible to test the deadlock detector
* using this framework unless the numebr of processes involved in the
* deadlock was precisely two. We now assume that if we reach a step
* that is still blocked, we need to wait for it to unblock itself.
*/
for (w = 0; w < nwaiting; ++w)
{
PGcancel *cancel;
PGresult *res;
int j;
if (step->session == waiting[w]->session)
{
oldstep = waiting[w];
break;
}
}
if (oldstep != NULL)
{
/* Wait for previous step on this connection. */
try_complete_step(oldstep, STEP_RETRY);
/*
* This permutation is invalid: it can never happen in real life.
*
* A session is blocked on an earlier step (waiting) and no
* further steps from this session can run until it is unblocked,
* but it can only be unblocked by running steps from other
* sessions.
* Attempt to complete any steps that were previously waiting.
* Remove any that have completed, and also remove the one for
* which we just waited.
*/
fprintf(stderr, "invalid permutation detected\n");
/* Cancel the waiting statement from this session. */
cancel = PQgetCancel(conn);
if (cancel != NULL)
w = 0;
nerrorstep = 0;
while (w < nwaiting)
{
char buf[256];
if (waiting[w] == oldstep)
{
/* We just waited for this; remove it. */
if (w + 1 < nwaiting)
memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
continue;
}
if (!PQcancel(cancel, buf, sizeof(buf)))
fprintf(stderr, "PQcancel failed: %s\n", buf);
/* Be sure to consume the error message. */
while ((res = PQgetResult(conn)) != NULL)
PQclear(res);
PQfreeCancel(cancel);
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
{
/* Still blocked on a lock, leave it alone. */
w++;
}
else
{
/* This one finished, too! */
errorstep[nerrorstep++] = waiting[w];
if (w + 1 < nwaiting)
memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
}
}
/*
* Now we really have to complete all the running transactions to
* make sure teardown doesn't block.
*/
for (j = 1; j < nconns; j++)
{
res = PQexec(conns[j], "ROLLBACK");
if (res != NULL)
PQclear(res);
}
goto teardown;
/* Report all errors together. */
report_multiple_error_messages(oldstep, nerrorstep, errorstep);
}
/* Send the query for this step. */
if (!PQsendQuery(conn, step->sql))
{
fprintf(stdout, "failed to send query for step %s: %s\n",
@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
exit_nicely();
}
if (waiting != NULL)
{
/* Some other step is already waiting: just block. */
try_complete_step(step, 0);
/* Try to complete this step without blocking. */
mustwait = try_complete_step(step, STEP_NONBLOCK);
/*
* See if this step unblocked the waiting step; report both error
* messages together if so.
*/
if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY))
{
report_two_error_messages(step, waiting);
waiting = NULL;
}
else
report_error_message(step);
}
else
/* Attempt to complete any steps that were previously waiting. */
w = 0;
nerrorstep = 0;
while (w < nwaiting)
{
if (try_complete_step(step, STEP_NONBLOCK))
waiting = step;
report_error_message(step);
if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
w++;
else
{
errorstep[nerrorstep++] = waiting[w];
if (w + 1 < nwaiting)
memcpy(&waiting[w], &waiting[w + 1],
(nwaiting - (w + 1)) * sizeof(Step *));
nwaiting--;
}
}
/* Report any error from this step, and any steps that it unblocked. */
report_multiple_error_messages(step, nerrorstep, errorstep);
/* If this step is waiting, add it to the array of waiters. */
if (mustwait)
waiting[nwaiting++] = step;
}
/* Finish any waiting query. */
if (waiting != NULL)
for (w = 0; w < nwaiting; ++w)
{
try_complete_step(waiting, STEP_RETRY);
report_error_message(waiting);
try_complete_step(waiting[w], STEP_RETRY);
report_error_message(waiting[w]);
}
teardown:
/* Perform per-session teardown */
for (i = 0; i < testspec->nsessions; i++)
{
@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags)
{
PGconn *conn = conns[1 + step->session];
fd_set read_set;
struct timeval start_time;
struct timeval timeout;
int sock = PQsocket(conn);
int ret;
PGresult *res;
bool canceled = false;
gettimeofday(&start_time, NULL);
FD_ZERO(&read_set);
while ((flags & STEP_NONBLOCK) && PQisBusy(conn))
while (PQisBusy(conn))
{
FD_SET(sock, &read_set);
timeout.tv_sec = 0;
@ -737,28 +738,75 @@ try_complete_step(Step *step, int flags)
}
else if (ret == 0) /* select() timeout: check for lock wait */
{
int ntuples;
struct timeval current_time;
int64 td;
res = PQexecPrepared(conns[0], PREP_WAITING, 1,
&backend_pids[step->session + 1],
NULL, NULL, 0);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
/* If it's OK for the step to block, check whether it has. */
if (flags & STEP_NONBLOCK)
{
fprintf(stderr, "lock wait query failed: %s",
PQerrorMessage(conn));
int ntuples;
res = PQexecPrepared(conns[0], PREP_WAITING, 1,
&backend_pids[step->session + 1],
NULL, NULL, 0);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
fprintf(stderr, "lock wait query failed: %s",
PQerrorMessage(conn));
exit_nicely();
}
ntuples = PQntuples(res);
PQclear(res);
if (ntuples >= 1) /* waiting to acquire a lock */
{
if (!(flags & STEP_RETRY))
printf("step %s: %s <waiting ...>\n",
step->name, step->sql);
return true;
}
/* else, not waiting */
}
/* Figure out how long we've been waiting for this step. */
gettimeofday(&current_time, NULL);
td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
td *= USECS_PER_SEC;
td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
/*
* After 60 seconds, try to cancel the query.
*
* If the user tries to test an invalid permutation, we don't
* want to hang forever, especially when this is running in the
* buildfarm. So try to cancel it after a minute. This will
* presumably lead to this permutation failing, but remaining
* permutations and tests should still be OK.
*/
if (td > 60 * USECS_PER_SEC && !canceled)
{
PGcancel *cancel = PQgetCancel(conn);
char buf[256];
if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf)))
fprintf(stderr, "PQcancel failed: %s\n", buf);
if (cancel != NULL)
PQfreeCancel(cancel);
}
/*
* After 75 seconds, just give up and die.
*
* Since cleanup steps won't be run in this case, this may cause
* later tests to fail. That stinks, but it's better than waiting
* forever for the server to respond to the cancel.
*/
if (td > 75 * USECS_PER_SEC)
{
fprintf(stderr, "step %s timed out after 75 seconds\n",
step->name);
exit_nicely();
}
ntuples = PQntuples(res);
PQclear(res);
if (ntuples >= 1) /* waiting to acquire a lock */
{
if (!(flags & STEP_RETRY))
printf("step %s: %s <waiting ...>\n",
step->name, step->sql);
return true;
}
/* else, not waiting: give it more time */
}
else if (!PQconsumeInput(conn)) /* select(): data available */
{