Commit d5881c03 authored by Tom Lane's avatar Tom Lane

Fix O(N^2) behavior in pg_dump when many objects are in dependency loops.

Combining the loop workspace with the record of already-processed objects
might have been a cute trick, but it behaves horridly if there are many
dependency loops to repair: the time spent in the first step of findLoop()
grows as O(N^2).  Instead use a separate flag array indexed by dump ID,
which we can check in constant time.  The length of the workspace array
is now never more than the actual length of a dependency chain, which
should be reasonably short in all cases of practical interest.  The code
is noticeably easier to understand this way, too.

Per gripe from Mike Roest.  Since this is a longstanding performance bug,
backpatch to all supported versions.
parent 0d8117ab
...@@ -111,11 +111,11 @@ static bool TopoSort(DumpableObject **objs, ...@@ -111,11 +111,11 @@ static bool TopoSort(DumpableObject **objs,
static void addHeapElement(int val, int *heap, int heapLength); static void addHeapElement(int val, int *heap, int heapLength);
static int removeHeapElement(int *heap, int heapLength); static int removeHeapElement(int *heap, int heapLength);
static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs); static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs);
static bool findLoop(DumpableObject *obj, static int findLoop(DumpableObject *obj,
DumpId startPoint, DumpId startPoint,
bool *processed,
DumpableObject **workspace, DumpableObject **workspace,
int depth, int depth);
int *newDepth);
static void repairDependencyLoop(DumpableObject **loop, static void repairDependencyLoop(DumpableObject **loop,
int nLoop); int nLoop);
static void describeDumpableObject(DumpableObject *obj, static void describeDumpableObject(DumpableObject *obj,
...@@ -506,56 +506,50 @@ static void ...@@ -506,56 +506,50 @@ static void
findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs) findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
{ {
/* /*
* We use a workspace array, the initial part of which stores objects * We use two data structures here. One is a bool array processed[],
* already processed, and the rest of which is used as temporary space to * which is indexed by dump ID and marks the objects already processed
* try to build a loop in. This is convenient because we do not care * during this invocation of findDependencyLoops(). The other is a
* about loops involving already-processed objects (see notes above); we * workspace[] array of DumpableObject pointers, in which we try to build
* can easily reject such loops in findLoop() because of this * lists of objects constituting loops. We make workspace[] large enough
* representation. After we identify and process a loop, we can add it to * to hold all the objects, which is huge overkill in most cases but could
* the initial part of the workspace just by moving the boundary pointer. * theoretically be necessary if there is a single dependency chain
* * linking all the objects.
* When we determine that an object is not part of any interesting loop,
* we also add it to the initial part of the workspace. This is not
* necessary for correctness, but saves later invocations of findLoop()
* from uselessly chasing references to such an object.
*
* We make the workspace large enough to hold all objects in the original
* universe. This is probably overkill, but it's provably enough space...
*/ */
bool *processed;
DumpableObject **workspace; DumpableObject **workspace;
int initiallen;
bool fixedloop; bool fixedloop;
int i; int i;
processed = (bool *) pg_calloc(getMaxDumpId() + 1, sizeof(bool));
workspace = (DumpableObject **) pg_malloc(totObjs * sizeof(DumpableObject *)); workspace = (DumpableObject **) pg_malloc(totObjs * sizeof(DumpableObject *));
initiallen = 0;
fixedloop = false; fixedloop = false;
for (i = 0; i < nObjs; i++) for (i = 0; i < nObjs; i++)
{ {
DumpableObject *obj = objs[i]; DumpableObject *obj = objs[i];
int newlen; int looplen;
int j;
workspace[initiallen] = NULL; /* see test below */ looplen = findLoop(obj, obj->dumpId, processed, workspace, 0);
if (findLoop(obj, obj->dumpId, workspace, initiallen, &newlen)) if (looplen > 0)
{ {
/* Found a loop of length newlen - initiallen */ /* Found a loop, repair it */
repairDependencyLoop(&workspace[initiallen], newlen - initiallen); repairDependencyLoop(workspace, looplen);
/* Add loop members to workspace */
initiallen = newlen;
fixedloop = true; fixedloop = true;
/* Mark loop members as processed */
for (j = 0; j < looplen; j++)
processed[workspace[j]->dumpId] = true;
} }
else else
{ {
/* /*
* Didn't find a loop, but add this object to workspace anyway, * There's no loop starting at this object, but mark it processed
* unless it's already present. We piggyback on the test that * anyway. This is not necessary for correctness, but saves later
* findLoop() already did: it won't have tentatively added obj to * invocations of findLoop() from uselessly chasing references to
* workspace if it's already present. * such an object.
*/ */
if (workspace[initiallen] == obj) processed[obj->dumpId] = true;
initiallen++;
} }
} }
...@@ -564,45 +558,51 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs) ...@@ -564,45 +558,51 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
exit_horribly(modulename, "could not identify dependency loop\n"); exit_horribly(modulename, "could not identify dependency loop\n");
free(workspace); free(workspace);
free(processed);
} }
/* /*
* Recursively search for a circular dependency loop that doesn't include * Recursively search for a circular dependency loop that doesn't include
* any existing workspace members. * any already-processed objects.
* *
* obj: object we are examining now * obj: object we are examining now
* startPoint: dumpId of starting object for the hoped-for circular loop * startPoint: dumpId of starting object for the hoped-for circular loop
* workspace[]: work array for previously processed and current objects * processed[]: flag array marking already-processed objects
* workspace[]: work array in which we are building list of loop members
* depth: number of valid entries in workspace[] at call * depth: number of valid entries in workspace[] at call
* newDepth: if successful, set to new number of workspace[] entries
* *
* On success, *newDepth is set and workspace[] entries depth..*newDepth-1 * On success, the length of the loop is returned, and workspace[] is filled
* are filled with pointers to the members of the loop. * with pointers to the members of the loop. On failure, we return 0.
* *
* Note: it is possible that the given starting object is a member of more * Note: it is possible that the given starting object is a member of more
* than one cycle; if so, we will find an arbitrary one of the cycles. * than one cycle; if so, we will find an arbitrary one of the cycles.
*/ */
static bool static int
findLoop(DumpableObject *obj, findLoop(DumpableObject *obj,
DumpId startPoint, DumpId startPoint,
bool *processed,
DumpableObject **workspace, DumpableObject **workspace,
int depth, int depth)
int *newDepth)
{ {
int i; int i;
/* /*
* Reject if obj is already present in workspace. This test serves three * Reject if obj is already processed. This test prevents us from finding
* purposes: it prevents us from finding loops that overlap * loops that overlap previously-processed loops.
* previously-processed loops, it prevents us from going into infinite */
* recursion if we are given a startPoint object that links to a cycle if (processed[obj->dumpId])
* it's not a member of, and it guarantees that we can't overflow the return 0;
* allocated size of workspace[].
/*
* Reject if obj is already present in workspace. This test prevents us
* from going into infinite recursion if we are given a startPoint object
* that links to a cycle it's not a member of, and it guarantees that we
* can't overflow the allocated size of workspace[].
*/ */
for (i = 0; i < depth; i++) for (i = 0; i < depth; i++)
{ {
if (workspace[i] == obj) if (workspace[i] == obj)
return false; return 0;
} }
/* /*
...@@ -616,10 +616,7 @@ findLoop(DumpableObject *obj, ...@@ -616,10 +616,7 @@ findLoop(DumpableObject *obj,
for (i = 0; i < obj->nDeps; i++) for (i = 0; i < obj->nDeps; i++)
{ {
if (obj->dependencies[i] == startPoint) if (obj->dependencies[i] == startPoint)
{ return depth;
*newDepth = depth;
return true;
}
} }
/* /*
...@@ -628,18 +625,20 @@ findLoop(DumpableObject *obj, ...@@ -628,18 +625,20 @@ findLoop(DumpableObject *obj,
for (i = 0; i < obj->nDeps; i++) for (i = 0; i < obj->nDeps; i++)
{ {
DumpableObject *nextobj = findObjectByDumpId(obj->dependencies[i]); DumpableObject *nextobj = findObjectByDumpId(obj->dependencies[i]);
int newDepth;
if (!nextobj) if (!nextobj)
continue; /* ignore dependencies on undumped objects */ continue; /* ignore dependencies on undumped objects */
if (findLoop(nextobj, newDepth = findLoop(nextobj,
startPoint, startPoint,
workspace, processed,
depth, workspace,
newDepth)) depth);
return true; if (newDepth > 0)
return newDepth;
} }
return false; return 0;
} }
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment