diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out index 0ba44fa6a0..13b1fde1b8 100644 --- a/contrib/pg_trgm/expected/pg_trgm.out +++ b/contrib/pg_trgm/expected/pg_trgm.out @@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%'; quark (1 row) +select * from test2 where t like '%z foo bar%'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t like ' z foo%'; + t +------------- + z foo bar +(1 row) + +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~ '[abc]{3}'::text) +(2 rows) + +explain (costs off) + select * from test2 where t ~* 'DEF'; + QUERY PLAN +------------------------------------------ + Index Scan using test2_idx_gist on test2 + Index Cond: (t ~* 'DEF'::text) +(2 rows) + +select * from test2 where t ~ '[abc]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'a[bc]+d'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ '(abc)*$'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* 'DEF'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~ 'dEf'; + t +--- +(0 rows) + +select * from test2 where t ~* '^q'; + t +------- + quark +(1 row) + +select * from test2 where t ~* '[abc]{3}[def]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* 'ab[a-z]{3}'; + t +-------- + abcdef +(1 row) + +select * from test2 where t ~* '(^| )qua'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q.*rk$'; + t +------- + quark +(1 row) + +select * from test2 where t ~ 'q'; + t +------- + quark +(1 row) + +select * from test2 where t ~ '[a-z]{3}'; + t +------------- + abcdef + quark + z foo bar +(3 rows) + +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; + t +--- +(0 rows) + +select * from test2 where t ~ 'z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo bar'; + t +------------- + z foo bar +(1 row) + +select * from test2 where t ~ ' z foo'; + t +------------- + z foo bar +(1 row) + diff --git a/contrib/pg_trgm/pg_trgm--1.0--1.1.sql b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql index 449f840084..65bbb1cfde 100644 --- a/contrib/pg_trgm/pg_trgm--1.0--1.1.sql +++ b/contrib/pg_trgm/pg_trgm--1.0--1.1.sql @@ -3,6 +3,10 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD OPERATOR 5 pg_catalog.~ (text, text), OPERATOR 6 pg_catalog.~* (text, text); diff --git a/contrib/pg_trgm/pg_trgm--1.1.sql b/contrib/pg_trgm/pg_trgm--1.1.sql index 5d28339738..1fff7af2c4 100644 --- a/contrib/pg_trgm/pg_trgm--1.1.sql +++ b/contrib/pg_trgm/pg_trgm--1.1.sql @@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD OPERATOR 4 pg_catalog.~~* (text, text), FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid); +-- Add operators that are new in 9.3. + +ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD + OPERATOR 5 pg_catalog.~ (text, text), + OPERATOR 6 pg_catalog.~* (text, text); + -- support functions for gin CREATE FUNCTION gin_extract_value_trgm(text, internal) RETURNS internal diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql index 37a4c24705..7b02d98818 100644 --- a/contrib/pg_trgm/sql/pg_trgm.sql +++ b/contrib/pg_trgm/sql/pg_trgm.sql @@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%'; select * from test2 where t like E'%\\bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; +select * from test2 where t like '%z foo bar%'; +select * from test2 where t like ' z foo%'; +explain (costs off) + select * from test2 where t ~ '[abc]{3}'; +explain (costs off) + select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ '[abc]{3}'; +select * from test2 where t ~ 'a[bc]+d'; +select * from test2 where t ~ '(abc)*$'; +select * from test2 where t ~* 'DEF'; +select * from test2 where t ~ 'dEf'; +select * from test2 where t ~* '^q'; +select * from test2 where t ~* '[abc]{3}[def]{3}'; +select * from test2 where t ~* 'ab[a-z]{3}'; +select * from test2 where t ~* '(^| )qua'; +select * from test2 where t ~ 'q.*rk$'; +select * from test2 where t ~ 'q'; +select * from test2 where t ~ '[a-z]{3}'; +select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}'; +select * from test2 where t ~ 'z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo bar'; +select * from test2 where t ~ ' z foo'; diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 15e7bebb00..ed649b8dcc 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen); extern TRGM *generate_wildcard_trgm(const char *str, int slen); extern float4 cnt_sml(TRGM *trg1, TRGM *trg2); extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2); -extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph, - Oid collation); +extern bool *trgm_presence_map(TRGM *query, TRGM *key); +extern TRGM *createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext); extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check); #endif /* __TRGM_H__ */ diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c index e828571563..1fbbd9ca35 100644 --- a/contrib/pg_trgm/trgm_gin.c +++ b/contrib/pg_trgm/trgm_gin.c @@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) #endif /* FALL THRU */ case RegExpStrategyNumber: - trg = createTrgmNFA(val, &graph, PG_GET_COLLATION()); + trg = createTrgmNFA(val, PG_GET_COLLATION(), + &graph, CurrentMemoryContext); if (trg && ARRNELEM(trg) > 0) { /* diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 605d7ea356..178f073755 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -8,6 +8,25 @@ #include "access/skey.h" +typedef struct +{ + /* most recent inputs to gtrgm_consistent */ + StrategyNumber strategy; + text *query; + /* extracted trigrams for query */ + TRGM *trigrams; + /* if a regex operator, the extracted graph */ + TrgmPackedGraph *graph; + + /* + * The "query" and "trigrams" are stored in the same palloc block as this + * cache struct, at MAXALIGN'ed offsets. The graph however isn't. + */ +} gtrgm_consistent_cache; + +#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) + + PG_FUNCTION_INFO_V1(gtrgm_in); Datum gtrgm_in(PG_FUNCTION_ARGS); @@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(gtrgm_picksplit); Datum gtrgm_picksplit(PG_FUNCTION_ARGS); -#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key)) - /* Number of one-bits in an unsigned byte */ static const uint8 number_of_ones[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, @@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *qtrg; bool res; Size querysize = VARSIZE(query); - char *cache = (char *) fcinfo->flinfo->fn_extra, - *cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber)); + gtrgm_consistent_cache *cache; /* - * Store both the strategy number and extracted trigrams in cache, because - * trigram extraction is relatively CPU-expensive. We must include - * strategy number because trigram extraction depends on strategy. + * We keep the extracted trigrams in cache, because trigram extraction is + * relatively CPU-expensive. When trying to reuse a cached value, check + * strategy number not just query itself, because trigram extraction + * depends on strategy. * - * The cached structure contains the strategy number, then the input query - * (starting at a MAXALIGN boundary), then the TRGM value (also starting - * at a MAXALIGN boundary). + * The cached structure is a single palloc chunk containing the + * gtrgm_consistent_cache header, then the input query (starting at a + * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN + * boundary). However we don't try to include the regex graph (if any) in + * that struct. (XXX currently, this approach can leak regex graphs + * across index rescans. Not clear if that's worth fixing.) */ + cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra; if (cache == NULL || - strategy != *((StrategyNumber *) cache) || - VARSIZE(cachedQuery) != querysize || - memcmp(cachedQuery, query, querysize) != 0) + cache->strategy != strategy || + VARSIZE(cache->query) != querysize || + memcmp((char *) cache->query, (char *) query, querysize) != 0) { - char *newcache; + gtrgm_consistent_cache *newcache; + TrgmPackedGraph *graph = NULL; + Size qtrgsize; switch (strategy) { @@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS) qtrg = generate_wildcard_trgm(VARDATA(query), querysize - VARHDRSZ); break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + qtrg = createTrgmNFA(query, PG_GET_COLLATION(), + &graph, fcinfo->flinfo->fn_mcxt); + /* just in case an empty array is returned ... */ + if (qtrg && ARRNELEM(qtrg) <= 0) + { + pfree(qtrg); + qtrg = NULL; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); qtrg = NULL; /* keep compiler quiet */ break; } - newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, - MAXALIGN(sizeof(StrategyNumber)) + - MAXALIGN(querysize) + - VARSIZE(qtrg)); - cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber)); + qtrgsize = qtrg ? VARSIZE(qtrg) : 0; - *((StrategyNumber *) newcache) = strategy; - memcpy(cachedQuery, query, querysize); - memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg)); + newcache = (gtrgm_consistent_cache *) + MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, + MAXALIGN(sizeof(gtrgm_consistent_cache)) + + MAXALIGN(querysize) + + qtrgsize); + + newcache->strategy = strategy; + newcache->query = (text *) + ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache))); + memcpy((char *) newcache->query, (char *) query, querysize); + if (qtrg) + { + newcache->trigrams = (TRGM *) + ((char *) newcache->query + MAXALIGN(querysize)); + memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize); + /* release qtrg in case it was made in fn_mcxt */ + pfree(qtrg); + } + else + newcache->trigrams = NULL; + newcache->graph = graph; if (cache) pfree(cache); - fcinfo->flinfo->fn_extra = newcache; + fcinfo->flinfo->fn_extra = (void *) newcache; + cache = newcache; } - qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize)); + qtrg = cache->trigrams; switch (strategy) { @@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS) } } break; + case RegExpICaseStrategyNumber: +#ifndef IGNORECASE + elog(ERROR, "cannot handle ~* with case-sensitive trigrams"); +#endif + /* FALL THRU */ + case RegExpStrategyNumber: + /* Regexp search is inexact */ + *recheck = true; + + /* Check regex match as much as we can with available info */ + if (qtrg) + { + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + bool *check; + + check = trgm_presence_map(qtrg, key); + res = trigramsMatchGraph(cache->graph, check); + pfree(check); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int32 k, + tmp = 0, + len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + + /* descend only if at least one trigram is present */ + res = false; + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + if (GETBIT(sign, HASHVAL(tmp))) + { + res = true; + break; + } + } + } + } + else + { + /* trigram-free query must be rechecked everywhere */ + res = true; + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 49e94f57a8..76e470c778 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2) return true; } +/* + * Return a palloc'd boolean array showing, for each trigram in "query", + * whether it is present in the trigram array "key". + * This relies on the "key" array being sorted, but "query" need not be. + */ +bool * +trgm_presence_map(TRGM *query, TRGM *key) +{ + bool *result; + trgm *ptrq = GETARR(query), + *ptrk = GETARR(key); + int lenq = ARRNELEM(query), + lenk = ARRNELEM(key), + i; + + result = (bool *) palloc0(lenq * sizeof(bool)); + + /* for each query trigram, do a binary search in the key array */ + for (i = 0; i < lenq; i++) + { + int lo = 0; + int hi = lenk; + + while (lo < hi) + { + int mid = (lo + hi) / 2; + int res = CMPTRGM(ptrq, ptrk + mid); + + if (res < 0) + hi = mid; + else if (res > 0) + lo = mid + 1; + else + { + result[i] = true; + break; + } + } + ptrq++; + } + + return result; +} + Datum similarity(PG_FUNCTION_ARGS) { diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 72e9b208fe..772fc44b3c 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams); * * Returns an array of trigrams required by the regular expression, or NULL if * the regular expression was too complex to analyze. In addition, a packed - * graph representation of the regex is returned into *graph. + * graph representation of the regex is returned into *graph. The results + * must be allocated in rcontext (which might or might not be the current + * context). */ TRGM * -createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) +createTrgmNFA(text *text_re, Oid collation, + TrgmPackedGraph **graph, MemoryContext rcontext) { TRGM *trg; regex_t regex; @@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) /* * This processing generates a great deal of cruft, which we'd like to - * clean up before returning (since this function is normally called in a + * clean up before returning (since this function may be called in a * query-lifespan memory context). Make a temp context we can work in so - * that cleanup is easy. Note that the returned data structures must be - * allocated in caller's context, however. + * that cleanup is easy. */ tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "createTrgmNFA temporary context", @@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) */ PG_TRY(); { - trg = createTrgmNFAInternal(®ex, graph, oldcontext); + trg = createTrgmNFAInternal(®ex, graph, rcontext); } PG_CATCH(); { diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 4572750f4d..9039f03e8b 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar'; - Beginning in PostgreSQL 9.3, pg_trgm - GIN indexes also support index searches for regular-expression matches + Beginning in PostgreSQL 9.3, these index types also support + index searches for regular-expression matches (~ and ~* operators), for example SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';