Fix a performance problem in pg_dump's dump order selection logic.

findDependencyLoops() was not bright about cases where there are multiple
dependency paths between the same two dumpable objects.  In most scenarios
this did not hurt us too badly; but since the introduction of section
boundary pseudo-objects in commit a1ef01fe16,
it was possible for this code to take unreasonable amounts of time (tens
of seconds on a database with a couple thousand objects), as reported in
bug #11033 from Joe Van Dyk.  Joe's particular problem scenario involved
"pg_dump -a" mode with long chains of foreign key constraints, but I think
that similar problems could arise with other situations as long as there
were enough objects.  To fix, add a flag array that lets us notice when we
arrive at the same object again while searching from a given start object.
This simple change seems to be enough to eliminate the performance problem.

Back-patch to 9.1, like the patch that introduced section boundary objects.
This commit is contained in:
Tom Lane 2014-07-25 19:48:45 -04:00
parent df58a17df2
commit a8dddcaf75

View File

@ -137,6 +137,7 @@ static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs);
static int findLoop(DumpableObject *obj,
DumpId startPoint,
bool *processed,
DumpId *searchFailed,
DumpableObject **workspace,
int depth);
static void repairDependencyLoop(DumpableObject **loop,
@ -633,21 +634,35 @@ static void
findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
{
/*
* We use two data structures here. One is a bool array processed[],
* which is indexed by dump ID and marks the objects already processed
* during this invocation of findDependencyLoops(). The other is a
* workspace[] array of DumpableObject pointers, in which we try to build
* lists of objects constituting loops. We make workspace[] large enough
* to hold all the objects, which is huge overkill in most cases but could
* theoretically be necessary if there is a single dependency chain
* linking all the objects.
* We use three data structures here:
*
* processed[] is a bool array indexed by dump ID, marking the objects
* already processed during this invocation of findDependencyLoops().
*
* searchFailed[] is another array indexed by dump ID. searchFailed[j] is
* set to dump ID k if we have proven that there is no dependency path
* leading from object j back to start point k. This allows us to skip
* useless searching when there are multiple dependency paths from k to j,
* which is a common situation. We could use a simple bool array for
* this, but then we'd need to re-zero it for each start point, resulting
* in O(N^2) zeroing work. Using the start point's dump ID as the "true"
* value lets us skip clearing the array before we consider the next start
* point.
*
* workspace[] is an array of DumpableObject pointers, in which we try to
* build lists of objects constituting loops. We make workspace[] large
* enough to hold all the objects in TopoSort's output, which is huge
* overkill in most cases but could theoretically be necessary if there is
* a single dependency chain linking all the objects.
*/
bool *processed;
DumpId *searchFailed;
DumpableObject **workspace;
bool fixedloop;
int i;
processed = (bool *) pg_malloc0((getMaxDumpId() + 1) * sizeof(bool));
searchFailed = (DumpId *) pg_malloc0((getMaxDumpId() + 1) * sizeof(DumpId));
workspace = (DumpableObject **) pg_malloc(totObjs * sizeof(DumpableObject *));
fixedloop = false;
@ -657,7 +672,12 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
int looplen;
int j;
looplen = findLoop(obj, obj->dumpId, processed, workspace, 0);
looplen = findLoop(obj,
obj->dumpId,
processed,
searchFailed,
workspace,
0);
if (looplen > 0)
{
@ -685,6 +705,7 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
exit_horribly(modulename, "could not identify dependency loop\n");
free(workspace);
free(searchFailed);
free(processed);
}
@ -695,6 +716,7 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
* obj: object we are examining now
* startPoint: dumpId of starting object for the hoped-for circular loop
* processed[]: flag array marking already-processed objects
* searchFailed[]: flag array marking already-unsuccessfully-visited objects
* workspace[]: work array in which we are building list of loop members
* depth: number of valid entries in workspace[] at call
*
@ -708,6 +730,7 @@ static int
findLoop(DumpableObject *obj,
DumpId startPoint,
bool *processed,
DumpId *searchFailed,
DumpableObject **workspace,
int depth)
{
@ -720,6 +743,13 @@ findLoop(DumpableObject *obj,
if (processed[obj->dumpId])
return 0;
/*
* If we've already proven there is no path from this object back to the
* startPoint, forget it.
*/
if (searchFailed[obj->dumpId] == startPoint)
return 0;
/*
* Reject if obj is already present in workspace. This test prevents us
* from going into infinite recursion if we are given a startPoint object
@ -759,12 +789,18 @@ findLoop(DumpableObject *obj,
newDepth = findLoop(nextobj,
startPoint,
processed,
searchFailed,
workspace,
depth);
if (newDepth > 0)
return newDepth;
}
/*
* Remember there is no path from here back to startPoint
*/
searchFailed[obj->dumpId] = startPoint;
return 0;
}