Further fix pg_trgm's extraction of trigrams from regular expressions.

Commit 9e43e8714 turns out to have been insufficient: not only is it
necessary to track tentative parent links while considering a set of
arc removals, but it's necessary to track tentative flag additions
as well.  This is because we always merge arc target states into
arc source states; therefore, when considering a merge of the final
state with some other, it is the other state that will acquire a new
TSTATE_FIN bit.  If there's another arc for the same color trigram
that would cause merging of that state with the initial state, we
failed to recognize the problem.  The test cases for the prior commit
evidently only exercised situations where a tentative merge with the
initial state occurs before one with the final state.  If it goes the
other way around, we'll happily merge the initial and final states,
either producing a broken final graph that would never match anything,
or triggering the Assert added by the prior commit.

It's tempting to consider switching the merge direction when the merge
involves the final state, but I lack the time to analyze that idea in
detail.  Instead just keep track of the flag changes that would result
from proposed merges, in the same way that the prior commit tracked
proposed parent links.

Along the way, add some more debugging support, because I'm not entirely
confident that this is the last bug here.  And tweak matters so that
the transformed.dot file uses small integers rather than pointer values
to identify states; that makes it more readable if you're just eyeballing
it rather than fooling with Graphviz.  And rename a couple of identically
named struct fields to reduce confusion.

Per report from Corey Csuhta.  Add a test case based on his example.
(Note: this case does not trigger the bug under 9.3, apparently because
its different measurement of costs causes it to stop merging states before
it hits the failure.  I spent some time trying to find a variant that would
fail in 9.3, without success; but I'm sure such cases exist.)

Like the previous patch, back-patch to 9.3 where this code was added.

Report: https://postgr.es/m/E2B01A4B-4530-406B-8D17-2F67CF9A16BA@csuhta.com
This commit is contained in:
Tom Lane 2017-04-14 14:52:03 -04:00
parent a70b18b894
commit d51279433c
3 changed files with 136 additions and 42 deletions

View File

@ -3489,6 +3489,7 @@ create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
insert into test2 values ('/123/-45/');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
@ -3590,7 +3591,8 @@ select * from test2 where t ~ '(abc)*$';
abcdef
quark
z foo bar
(3 rows)
/123/-45/
(4 rows)
select * from test2 where t ~* 'DEF';
t
@ -3682,6 +3684,12 @@ select * from test2 where t ~ 'qua(?!foo)';
quark
(1 row)
select * from test2 where t ~ '/\d+/-\d';
t
-----------
/123/-45/
(1 row)
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
@ -3776,7 +3784,8 @@ select * from test2 where t ~ '(abc)*$';
abcdef
quark
z foo bar
(3 rows)
/123/-45/
(4 rows)
select * from test2 where t ~* 'DEF';
t
@ -3868,6 +3877,12 @@ select * from test2 where t ~ 'qua(?!foo)';
quark
(1 row)
select * from test2 where t ~ '/\d+/-\d';
t
-----------
/123/-45/
(1 row)
-- Check similarity threshold (bug #14202)
CREATE TEMP TABLE restaurants (city text);
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);

View File

@ -47,6 +47,7 @@ create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
insert into test2 values ('/123/-45/');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
@ -82,6 +83,7 @@ select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
select * from test2 where t ~ 'qua(?!foo)';
select * from test2 where t ~ '/\d+/-\d';
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
@ -119,6 +121,7 @@ select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
select * from test2 where t ~ 'qua(?!foo)';
select * from test2 where t ~ '/\d+/-\d';
-- Check similarity threshold (bug #14202)

View File

@ -200,9 +200,10 @@
/*
* Uncomment to print intermediate stages, for exploring and debugging the
* algorithm implementation. This produces three graph files in /tmp,
* in Graphviz .dot format.
* Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
* for exploring and debugging the algorithm implementation.
* This produces three graph files in /tmp, in Graphviz .dot format.
* Some progress information is also printed to postmaster stderr.
*/
/* #define TRGM_REGEXP_DEBUG */
@ -226,7 +227,7 @@
* Penalty multipliers for trigram counts depending on whitespace contents.
* Numbers based on analysis of real-life texts.
*/
const float4 penalties[8] = {
static const float4 penalties[8] = {
1.0f, /* "aaa" */
3.5f, /* "aa " */
0.0f, /* "a a" (impossible) */
@ -319,9 +320,12 @@ typedef struct
* enterKeys - enter keys reachable from this state without reading any
* predictable trigram (List of TrgmStateKey)
* flags - flag bits
* snumber - number of this state (initially assigned as -1, -2, etc,
* for debugging purposes only; then at the packaging stage,
* surviving states are renumbered with positive numbers)
* parent - parent state, if this state has been merged into another
* tentFlags - flags this state would acquire via planned merges
* tentParent - planned parent state, if considering a merge
* number - number of this state (used at the packaging stage)
*/
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
@ -332,9 +336,10 @@ typedef struct TrgmState
List *arcs;
List *enterKeys;
int flags;
int snumber;
struct TrgmState *parent;
int tentFlags;
struct TrgmState *tentParent;
int number;
} TrgmState;
/*
@ -361,7 +366,7 @@ typedef struct
* Information about color trigram (used in stage 3)
*
* ctrgm - trigram itself
* number - number of this trigram (used in the packaging stage)
* cnumber - number of this trigram (used in the packaging stage)
* count - number of simple trigrams created from this color trigram
* expanded - indicates this color trigram is expanded into simple trigrams
* arcs - list of all arcs labeled with this color trigram.
@ -369,7 +374,7 @@ typedef struct
typedef struct
{
ColorTrgm ctrgm;
int number;
int cnumber;
int count;
float4 penalty;
bool expanded;
@ -403,6 +408,7 @@ typedef struct
/* Expanded graph (stage 2) */
HTAB *states;
TrgmState *initState;
int nstates;
/* Workspace for stage 2 */
List *queue;
@ -918,6 +924,7 @@ transformGraph(TrgmNFA *trgmNFA)
1024,
&hashCtl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
trgmNFA->nstates = 0;
/* Create initial state: ambiguous prefix, NFA's initial state */
MemSet(&initkey, 0, sizeof(initkey));
@ -1387,9 +1394,11 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
state->arcs = NIL;
state->enterKeys = NIL;
state->flags = 0;
/* states are initially given negative numbers */
state->snumber = -(++trgmNFA->nstates);
state->parent = NULL;
state->tentFlags = 0;
state->tentParent = NULL;
state->number = -1;
trgmNFA->queue = lappend(trgmNFA->queue, state);
}
@ -1454,10 +1463,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
ColorTrgmInfo *colorTrgms;
int64 totalTrgmCount;
float4 totalTrgmPenalty;
int number;
int cnumber;
/* Collect color trigrams from all arcs */
colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount);
colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
trgmNFA->colorTrgms = colorTrgms;
i = 0;
@ -1470,12 +1479,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
arcInfo->source = state;
arcInfo->target = arc->target;
colorTrgms[i].arcs = list_make1(arcInfo);
colorTrgms[i].expanded = true;
colorTrgms[i].ctrgm = arc->ctrgm;
trgmInfo->ctrgm = arc->ctrgm;
trgmInfo->cnumber = -1;
/* count and penalty will be set below */
trgmInfo->expanded = true;
trgmInfo->arcs = list_make1(arcInfo);
i++;
}
}
@ -1573,6 +1585,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
break;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
trgmInfo->ctrgm.colors[0],
trgmInfo->ctrgm.colors[1],
trgmInfo->ctrgm.colors[2],
trgmInfo->penalty,
list_length(trgmInfo->arcs));
#endif
/*
* Does any arc of this color trigram connect initial and final
* states? If so we can't remove it.
@ -1585,26 +1606,44 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
int source_flags,
target_flags;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
-target->snumber, target->flags,
-source->snumber, source->flags);
#endif
/* examine parent states, if any merging has already happened */
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
-target->snumber, target->flags,
-source->snumber, source->flags);
#endif
/* we must also consider merges we are planning right now */
source_flags = source->flags;
source_flags = source->flags | source->tentFlags;
while (source->tentParent)
{
source = source->tentParent;
source_flags |= source->flags;
source_flags |= source->flags | source->tentFlags;
}
target_flags = target->flags;
target_flags = target->flags | target->tentFlags;
while (target->tentParent)
{
target = target->tentParent;
target_flags |= target->flags;
target_flags |= target->flags | target->tentFlags;
}
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
-target->snumber, target_flags,
-source->snumber, source_flags);
#endif
/* would fully-merged state have both INIT and FIN set? */
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
(TSTATE_INIT | TSTATE_FIN))
@ -1615,29 +1654,62 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
/* ok so far, so remember planned merge */
if (source != target)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... tentatively merging s%d into s%d\n",
-target->snumber, -source->snumber);
#endif
target->tentParent = source;
source->tentFlags |= target_flags;
}
}
/* We must clear all the tentParent fields before continuing */
/*
* We must reset all the tentFlags/tentParent fields before
* continuing. tentFlags could only have become set in states that
* are the source or parent or tentative parent of one of the current
* arcs; likewise tentParent could only have become set in states that
* are the target or parent or tentative parent of one of the current
* arcs. There might be some overlap between those sets, but if we
* clear tentFlags in target states as well as source states, we
* should be okay even if we visit a state as target before visiting
* it as a source.
*/
foreach(cell, trgmInfo->arcs)
{
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
TrgmState *target = arcInfo->target;
TrgmState *source = arcInfo->source,
*target = arcInfo->target;
TrgmState *ttarget;
/* no need to touch previously-merged states */
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
while (source)
{
source->tentFlags = 0;
source = source->tentParent;
}
while ((ttarget = target->tentParent) != NULL)
{
target->tentParent = NULL;
target->tentFlags = 0; /* in case it was also a source */
target = ttarget;
}
}
/* Now, move on if we can't drop this trigram */
if (!canRemove)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... not ok to merge\n");
#endif
continue;
}
/* OK, merge states linked by each arc labeled by the trigram */
foreach(cell, trgmInfo->arcs)
@ -1652,6 +1724,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
target = target->parent;
if (source != target)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "merging s%d into s%d\n",
-target->snumber, -source->snumber);
#endif
mergeStates(source, target);
/* Assert we didn't merge initial and final states */
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
@ -1675,15 +1751,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
* Sort color trigrams by colors (will be useful for bsearch in packGraph)
* and enumerate the color trigrams that are expanded.
*/
number = 0;
cnumber = 0;
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
colorTrgmInfoCmp);
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
if (colorTrgms[i].expanded)
{
colorTrgms[i].number = number;
number++;
colorTrgms[i].cnumber = cnumber;
cnumber++;
}
}
@ -1854,7 +1930,7 @@ colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
static TrgmPackedGraph *
packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
{
int number = 2,
int snumber = 2,
arcIndex,
arcsCount;
HASH_SEQ_STATUS scan_status;
@ -1874,16 +1950,16 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
while (state->parent)
state = state->parent;
if (state->number < 0)
if (state->snumber < 0)
{
if (state->flags & TSTATE_INIT)
state->number = 0;
state->snumber = 0;
else if (state->flags & TSTATE_FIN)
state->number = 1;
state->snumber = 1;
else
{
state->number = number;
number++;
state->snumber = snumber;
snumber++;
}
}
}
@ -1909,7 +1985,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
while (target->parent)
target = target->parent;
if (source->number != target->number)
if (source->snumber != target->snumber)
{
ColorTrgmInfo *ctrgm;
@ -1921,9 +1997,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Assert(ctrgm != NULL);
Assert(ctrgm->expanded);
arcs[arcIndex].sourceState = source->number;
arcs[arcIndex].targetState = target->number;
arcs[arcIndex].colorTrgm = ctrgm->number;
arcs[arcIndex].sourceState = source->snumber;
arcs[arcIndex].targetState = target->snumber;
arcs[arcIndex].colorTrgm = ctrgm->cnumber;
arcIndex++;
}
}
@ -1969,13 +2045,13 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
}
/* Pack states and arcs information */
result->statesCount = number;
result->statesCount = snumber;
result->states = (TrgmPackedState *)
MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState));
MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
packedArcs = (TrgmPackedArc *)
MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
j = 0;
for (i = 0; i < number; i++)
for (i = 0; i < snumber; i++)
{
int cnt = 0;
@ -2141,7 +2217,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
{
ListCell *cell;
appendStringInfo(&buf, "s%p", (void *) state);
appendStringInfo(&buf, "s%d", -state->snumber);
if (state->flags & TSTATE_FIN)
appendStringInfoString(&buf, " [shape = doublecircle]");
if (state->flags & TSTATE_INIT)
@ -2153,8 +2229,8 @@ printTrgmNFA(TrgmNFA *trgmNFA)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
appendStringInfo(&buf, " s%p -> s%p [label = \"",
(void *) state, (void *) arc->target);
appendStringInfo(&buf, " s%d -> s%d [label = \"",
-state->snumber, -arc->target->snumber);
printTrgmColor(&buf, arc->ctrgm.colors[0]);
appendStringInfoChar(&buf, ' ');
printTrgmColor(&buf, arc->ctrgm.colors[1]);
@ -2167,7 +2243,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
if (initstate)
{
appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate);
appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
}
appendStringInfoString(&buf, "}\n");