diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index e6b5793fdd..4e37a7ee10 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3489,6 +3489,7 @@ create table test2(t text COLLATE "C"); insert into test2 values ('abcdef'); insert into test2 values ('quark'); insert into test2 values (' z foo bar'); +insert into test2 values ('/123/-45/'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -3590,7 +3591,8 @@ select * from test2 where t ~ '(abc)*$'; abcdef quark z foo bar -(3 rows) + /123/-45/ +(4 rows) select * from test2 where t ~* 'DEF'; t @@ -3682,6 +3684,12 @@ select * from test2 where t ~ 'qua(?!foo)'; quark (1 row) +select * from test2 where t ~ '/\d+/-\d'; + t +----------- + /123/-45/ +(1 row) + drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; @@ -3776,7 +3784,8 @@ select * from test2 where t ~ '(abc)*$'; abcdef quark z foo bar -(3 rows) + /123/-45/ +(4 rows) select * from test2 where t ~* 'DEF'; t @@ -3868,6 +3877,12 @@ select * from test2 where t ~ 'qua(?!foo)'; quark (1 row) +select * from test2 where t ~ '/\d+/-\d'; + t +----------- + /123/-45/ +(1 row) + -- Check similarity threshold (bug #14202) CREATE TEMP TABLE restaurants (city text); INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000); diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index eccd921d82..389114a63e 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -47,6 +47,7 @@ create table test2(t text COLLATE "C"); insert into test2 values ('abcdef'); insert into test2 values ('quark'); insert into test2 values (' z foo bar'); +insert into test2 values ('/123/-45/'); create index test2_idx_gin on test2 using gin (t gin_trgm_ops); set enable_seqscan=off; explain (costs off) @@ -82,6 +83,7 @@ select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; select * from test2 where t ~ 'qua(?!foo)'; +select * from test2 where t ~ '/\d+/-\d'; drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); @@ -119,6 +121,7 @@ select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo bar'; select * from test2 where t ~ ' z foo'; select * from test2 where t ~ 'qua(?!foo)'; +select * from test2 where t ~ '/\d+/-\d'; -- Check similarity threshold (bug #14202) diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index a2e4a61630..1ef91b3519 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -200,9 +200,10 @@ /* - * Uncomment to print intermediate stages, for exploring and debugging the - * algorithm implementation. This produces three graph files in /tmp, - * in Graphviz .dot format. + * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info, + * for exploring and debugging the algorithm implementation. + * This produces three graph files in /tmp, in Graphviz .dot format. + * Some progress information is also printed to postmaster stderr. */ /* #define TRGM_REGEXP_DEBUG */ @@ -226,7 +227,7 @@ * Penalty multipliers for trigram counts depending on whitespace contents. * Numbers based on analysis of real-life texts. */ -const float4 penalties[8] = { +static const float4 penalties[8] = { 1.0f, /* "aaa" */ 3.5f, /* "aa " */ 0.0f, /* "a a" (impossible) */ @@ -319,9 +320,12 @@ typedef struct * enterKeys - enter keys reachable from this state without reading any * predictable trigram (List of TrgmStateKey) * flags - flag bits + * snumber - number of this state (initially assigned as -1, -2, etc, + * for debugging purposes only; then at the packaging stage, + * surviving states are renumbered with positive numbers) * parent - parent state, if this state has been merged into another + * tentFlags - flags this state would acquire via planned merges * tentParent - planned parent state, if considering a merge - * number - number of this state (used at the packaging stage) */ #define TSTATE_INIT 0x01 /* flag indicating this state is initial */ #define TSTATE_FIN 0x02 /* flag indicating this state is final */ @@ -332,9 +336,10 @@ typedef struct TrgmState List *arcs; List *enterKeys; int flags; + int snumber; struct TrgmState *parent; + int tentFlags; struct TrgmState *tentParent; - int number; } TrgmState; /* @@ -361,7 +366,7 @@ typedef struct * Information about color trigram (used in stage 3) * * ctrgm - trigram itself - * number - number of this trigram (used in the packaging stage) + * cnumber - number of this trigram (used in the packaging stage) * count - number of simple trigrams created from this color trigram * expanded - indicates this color trigram is expanded into simple trigrams * arcs - list of all arcs labeled with this color trigram. @@ -369,7 +374,7 @@ typedef struct typedef struct { ColorTrgm ctrgm; - int number; + int cnumber; int count; float4 penalty; bool expanded; @@ -403,6 +408,7 @@ typedef struct /* Expanded graph (stage 2) */ HTAB *states; TrgmState *initState; + int nstates; /* Workspace for stage 2 */ List *queue; @@ -918,6 +924,7 @@ transformGraph(TrgmNFA *trgmNFA) 1024, &hashCtl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + trgmNFA->nstates = 0; /* Create initial state: ambiguous prefix, NFA's initial state */ MemSet(&initkey, 0, sizeof(initkey)); @@ -1387,9 +1394,11 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key) state->arcs = NIL; state->enterKeys = NIL; state->flags = 0; + /* states are initially given negative numbers */ + state->snumber = -(++trgmNFA->nstates); state->parent = NULL; + state->tentFlags = 0; state->tentParent = NULL; - state->number = -1; trgmNFA->queue = lappend(trgmNFA->queue, state); } @@ -1454,10 +1463,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) ColorTrgmInfo *colorTrgms; int64 totalTrgmCount; float4 totalTrgmPenalty; - int number; + int cnumber; /* Collect color trigrams from all arcs */ - colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount); + colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount); trgmNFA->colorTrgms = colorTrgms; i = 0; @@ -1470,12 +1479,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) { TrgmArc *arc = (TrgmArc *) lfirst(cell); TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); + ColorTrgmInfo *trgmInfo = &colorTrgms[i]; arcInfo->source = state; arcInfo->target = arc->target; - colorTrgms[i].arcs = list_make1(arcInfo); - colorTrgms[i].expanded = true; - colorTrgms[i].ctrgm = arc->ctrgm; + trgmInfo->ctrgm = arc->ctrgm; + trgmInfo->cnumber = -1; + /* count and penalty will be set below */ + trgmInfo->expanded = true; + trgmInfo->arcs = list_make1(arcInfo); i++; } } @@ -1573,6 +1585,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) if (totalTrgmPenalty <= WISH_TRGM_PENALTY) break; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n", + trgmInfo->ctrgm.colors[0], + trgmInfo->ctrgm.colors[1], + trgmInfo->ctrgm.colors[2], + trgmInfo->penalty, + list_length(trgmInfo->arcs)); +#endif + /* * Does any arc of this color trigram connect initial and final * states? If so we can't remove it. @@ -1585,26 +1606,44 @@ selectColorTrigrams(TrgmNFA *trgmNFA) int source_flags, target_flags; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n", + -target->snumber, target->flags, + -source->snumber, source->flags); +#endif + /* examine parent states, if any merging has already happened */ while (source->parent) source = source->parent; while (target->parent) target = target->parent; +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n", + -target->snumber, target->flags, + -source->snumber, source->flags); +#endif + /* we must also consider merges we are planning right now */ - source_flags = source->flags; + source_flags = source->flags | source->tentFlags; while (source->tentParent) { source = source->tentParent; - source_flags |= source->flags; + source_flags |= source->flags | source->tentFlags; } - target_flags = target->flags; + target_flags = target->flags | target->tentFlags; while (target->tentParent) { target = target->tentParent; - target_flags |= target->flags; + target_flags |= target->flags | target->tentFlags; } +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n", + -target->snumber, target_flags, + -source->snumber, source_flags); +#endif + /* would fully-merged state have both INIT and FIN set? */ if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) == (TSTATE_INIT | TSTATE_FIN)) @@ -1615,29 +1654,62 @@ selectColorTrigrams(TrgmNFA *trgmNFA) /* ok so far, so remember planned merge */ if (source != target) + { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... tentatively merging s%d into s%d\n", + -target->snumber, -source->snumber); +#endif target->tentParent = source; + source->tentFlags |= target_flags; + } } - /* We must clear all the tentParent fields before continuing */ + /* + * We must reset all the tentFlags/tentParent fields before + * continuing. tentFlags could only have become set in states that + * are the source or parent or tentative parent of one of the current + * arcs; likewise tentParent could only have become set in states that + * are the target or parent or tentative parent of one of the current + * arcs. There might be some overlap between those sets, but if we + * clear tentFlags in target states as well as source states, we + * should be okay even if we visit a state as target before visiting + * it as a source. + */ foreach(cell, trgmInfo->arcs) { TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); - TrgmState *target = arcInfo->target; + TrgmState *source = arcInfo->source, + *target = arcInfo->target; TrgmState *ttarget; + /* no need to touch previously-merged states */ + while (source->parent) + source = source->parent; while (target->parent) target = target->parent; + while (source) + { + source->tentFlags = 0; + source = source->tentParent; + } + while ((ttarget = target->tentParent) != NULL) { target->tentParent = NULL; + target->tentFlags = 0; /* in case it was also a source */ target = ttarget; } } /* Now, move on if we can't drop this trigram */ if (!canRemove) + { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, " ... not ok to merge\n"); +#endif continue; + } /* OK, merge states linked by each arc labeled by the trigram */ foreach(cell, trgmInfo->arcs) @@ -1652,6 +1724,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) target = target->parent; if (source != target) { +#ifdef TRGM_REGEXP_DEBUG + fprintf(stderr, "merging s%d into s%d\n", + -target->snumber, -source->snumber); +#endif mergeStates(source, target); /* Assert we didn't merge initial and final states */ Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) != @@ -1675,15 +1751,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA) * Sort color trigrams by colors (will be useful for bsearch in packGraph) * and enumerate the color trigrams that are expanded. */ - number = 0; + cnumber = 0; qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { if (colorTrgms[i].expanded) { - colorTrgms[i].number = number; - number++; + colorTrgms[i].cnumber = cnumber; + cnumber++; } } @@ -1854,7 +1930,7 @@ colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) static TrgmPackedGraph * packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) { - int number = 2, + int snumber = 2, arcIndex, arcsCount; HASH_SEQ_STATUS scan_status; @@ -1874,16 +1950,16 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) while (state->parent) state = state->parent; - if (state->number < 0) + if (state->snumber < 0) { if (state->flags & TSTATE_INIT) - state->number = 0; + state->snumber = 0; else if (state->flags & TSTATE_FIN) - state->number = 1; + state->snumber = 1; else { - state->number = number; - number++; + state->snumber = snumber; + snumber++; } } } @@ -1909,7 +1985,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) while (target->parent) target = target->parent; - if (source->number != target->number) + if (source->snumber != target->snumber) { ColorTrgmInfo *ctrgm; @@ -1921,9 +1997,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) Assert(ctrgm != NULL); Assert(ctrgm->expanded); - arcs[arcIndex].sourceState = source->number; - arcs[arcIndex].targetState = target->number; - arcs[arcIndex].colorTrgm = ctrgm->number; + arcs[arcIndex].sourceState = source->snumber; + arcs[arcIndex].targetState = target->snumber; + arcs[arcIndex].colorTrgm = ctrgm->cnumber; arcIndex++; } } @@ -1969,13 +2045,13 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext) } /* Pack states and arcs information */ - result->statesCount = number; + result->statesCount = snumber; result->states = (TrgmPackedState *) - MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState)); + MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState)); packedArcs = (TrgmPackedArc *) MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc)); j = 0; - for (i = 0; i < number; i++) + for (i = 0; i < snumber; i++) { int cnt = 0; @@ -2141,7 +2217,7 @@ printTrgmNFA(TrgmNFA *trgmNFA) { ListCell *cell; - appendStringInfo(&buf, "s%p", (void *) state); + appendStringInfo(&buf, "s%d", -state->snumber); if (state->flags & TSTATE_FIN) appendStringInfoString(&buf, " [shape = doublecircle]"); if (state->flags & TSTATE_INIT) @@ -2153,8 +2229,8 @@ printTrgmNFA(TrgmNFA *trgmNFA) { TrgmArc *arc = (TrgmArc *) lfirst(cell); - appendStringInfo(&buf, " s%p -> s%p [label = \"", - (void *) state, (void *) arc->target); + appendStringInfo(&buf, " s%d -> s%d [label = \"", + -state->snumber, -arc->target->snumber); printTrgmColor(&buf, arc->ctrgm.colors[0]); appendStringInfoChar(&buf, ' '); printTrgmColor(&buf, arc->ctrgm.colors[1]); @@ -2167,7 +2243,7 @@ printTrgmNFA(TrgmNFA *trgmNFA) if (initstate) { appendStringInfoString(&buf, " node [shape = point ]; initial;\n"); - appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate); + appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber); } appendStringInfoString(&buf, "}\n");