Make contrib/pg_trgm also support regex searches with GiST indexes.

This wasn't addressed in the original patch, but it doesn't take very
much additional code to cover the case, so let's get it done.

Since pg_trgm 1.1 hasn't been released yet, I just changed the definition
of what's in it, rather than inventing a 1.2.
This commit is contained in:
Tom Lane 2013-04-10 13:30:14 -04:00
parent e543631f3c
commit 6f5b8beb64
10 changed files with 352 additions and 35 deletions

View File

@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%';
quark
(1 row)
select * from test2 where t like '%z foo bar%';
t
-------------
z foo bar
(1 row)
select * from test2 where t like ' z foo%';
t
-------------
z foo bar
(1 row)
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~ '[abc]{3}'::text)
(2 rows)
explain (costs off)
select * from test2 where t ~* 'DEF';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~* 'DEF'::text)
(2 rows)
select * from test2 where t ~ '[abc]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'a[bc]+d';
t
--------
abcdef
(1 row)
select * from test2 where t ~ '(abc)*$';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* 'DEF';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'dEf';
t
---
(0 rows)
select * from test2 where t ~* '^q';
t
-------
quark
(1 row)
select * from test2 where t ~* '[abc]{3}[def]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* 'ab[a-z]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* '(^| )qua';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q.*rk$';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q';
t
-------
quark
(1 row)
select * from test2 where t ~ '[a-z]{3}';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
t
---
(0 rows)
select * from test2 where t ~ 'z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo';
t
-------------
z foo bar
(1 row)

View File

@ -3,6 +3,10 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);

View File

@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid);
-- Add operators that are new in 9.3.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
-- support functions for gin
CREATE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal

View File

@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%';
select * from test2 where t like E'%\\bcd%';
select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%';
select * from test2 where t like '%z foo bar%';
select * from test2 where t like ' z foo%';
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
explain (costs off)
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ '[abc]{3}';
select * from test2 where t ~ 'a[bc]+d';
select * from test2 where t ~ '(abc)*$';
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ 'dEf';
select * from test2 where t ~* '^q';
select * from test2 where t ~* '[abc]{3}[def]{3}';
select * from test2 where t ~* 'ab[a-z]{3}';
select * from test2 where t ~* '(^| )qua';
select * from test2 where t ~ 'q.*rk$';
select * from test2 where t ~ 'q';
select * from test2 where t ~ '[a-z]{3}';
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
select * from test2 where t ~ 'z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';

View File

@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
Oid collation);
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
extern TRGM *createTrgmNFA(text *text_re, Oid collation,
TrgmPackedGraph **graph, MemoryContext rcontext);
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
#endif /* __TRGM_H__ */

View File

@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
#endif
/* FALL THRU */
case RegExpStrategyNumber:
trg = createTrgmNFA(val, &graph, PG_GET_COLLATION());
trg = createTrgmNFA(val, PG_GET_COLLATION(),
&graph, CurrentMemoryContext);
if (trg && ARRNELEM(trg) > 0)
{
/*

View File

@ -8,6 +8,25 @@
#include "access/skey.h"
typedef struct
{
/* most recent inputs to gtrgm_consistent */
StrategyNumber strategy;
text *query;
/* extracted trigrams for query */
TRGM *trigrams;
/* if a regex operator, the extracted graph */
TrgmPackedGraph *graph;
/*
* The "query" and "trigrams" are stored in the same palloc block as this
* cache struct, at MAXALIGN'ed offsets. The graph however isn't.
*/
} gtrgm_consistent_cache;
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
PG_FUNCTION_INFO_V1(gtrgm_in);
Datum gtrgm_in(PG_FUNCTION_ARGS);
@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_picksplit);
Datum gtrgm_picksplit(PG_FUNCTION_ARGS);
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
/* Number of one-bits in an unsigned byte */
static const uint8 number_of_ones[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *qtrg;
bool res;
Size querysize = VARSIZE(query);
char *cache = (char *) fcinfo->flinfo->fn_extra,
*cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber));
gtrgm_consistent_cache *cache;
/*
* Store both the strategy number and extracted trigrams in cache, because
* trigram extraction is relatively CPU-expensive. We must include
* strategy number because trigram extraction depends on strategy.
* We keep the extracted trigrams in cache, because trigram extraction is
* relatively CPU-expensive. When trying to reuse a cached value, check
* strategy number not just query itself, because trigram extraction
* depends on strategy.
*
* The cached structure contains the strategy number, then the input query
* (starting at a MAXALIGN boundary), then the TRGM value (also starting
* at a MAXALIGN boundary).
* The cached structure is a single palloc chunk containing the
* gtrgm_consistent_cache header, then the input query (starting at a
* MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN
* boundary). However we don't try to include the regex graph (if any) in
* that struct. (XXX currently, this approach can leak regex graphs
* across index rescans. Not clear if that's worth fixing.)
*/
cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
if (cache == NULL ||
strategy != *((StrategyNumber *) cache) ||
VARSIZE(cachedQuery) != querysize ||
memcmp(cachedQuery, query, querysize) != 0)
cache->strategy != strategy ||
VARSIZE(cache->query) != querysize ||
memcmp((char *) cache->query, (char *) query, querysize) != 0)
{
char *newcache;
gtrgm_consistent_cache *newcache;
TrgmPackedGraph *graph = NULL;
Size qtrgsize;
switch (strategy)
{
@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
qtrg = generate_wildcard_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
&graph, fcinfo->flinfo->fn_mcxt);
/* just in case an empty array is returned ... */
if (qtrg && ARRNELEM(qtrg) <= 0)
{
pfree(qtrg);
qtrg = NULL;
}
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
qtrg = NULL; /* keep compiler quiet */
break;
}
newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(sizeof(StrategyNumber)) +
MAXALIGN(querysize) +
VARSIZE(qtrg));
cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber));
qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
*((StrategyNumber *) newcache) = strategy;
memcpy(cachedQuery, query, querysize);
memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
newcache = (gtrgm_consistent_cache *)
MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(sizeof(gtrgm_consistent_cache)) +
MAXALIGN(querysize) +
qtrgsize);
newcache->strategy = strategy;
newcache->query = (text *)
((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
memcpy((char *) newcache->query, (char *) query, querysize);
if (qtrg)
{
newcache->trigrams = (TRGM *)
((char *) newcache->query + MAXALIGN(querysize));
memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
/* release qtrg in case it was made in fn_mcxt */
pfree(qtrg);
}
else
newcache->trigrams = NULL;
newcache->graph = graph;
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = newcache;
fcinfo->flinfo->fn_extra = (void *) newcache;
cache = newcache;
}
qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize));
qtrg = cache->trigrams;
switch (strategy)
{
@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
}
}
break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
/* Regexp search is inexact */
*recheck = true;
/* Check regex match as much as we can with available info */
if (qtrg)
{
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
bool *check;
check = trgm_presence_map(qtrg, key);
res = trigramsMatchGraph(cache->graph, check);
pfree(check);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
res = true;
}
else
{ /* non-leaf contains signature */
int32 k,
tmp = 0,
len = ARRNELEM(qtrg);
trgm *ptr = GETARR(qtrg);
BITVECP sign = GETSIGN(key);
/* descend only if at least one trigram is present */
res = false;
for (k = 0; k < len; k++)
{
CPTRGM(((char *) &tmp), ptr + k);
if (GETBIT(sign, HASHVAL(tmp)))
{
res = true;
break;
}
}
}
}
else
{
/* trigram-free query must be rechecked everywhere */
res = true;
}
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */

View File

@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2)
return true;
}
/*
* Return a palloc'd boolean array showing, for each trigram in "query",
* whether it is present in the trigram array "key".
* This relies on the "key" array being sorted, but "query" need not be.
*/
bool *
trgm_presence_map(TRGM *query, TRGM *key)
{
bool *result;
trgm *ptrq = GETARR(query),
*ptrk = GETARR(key);
int lenq = ARRNELEM(query),
lenk = ARRNELEM(key),
i;
result = (bool *) palloc0(lenq * sizeof(bool));
/* for each query trigram, do a binary search in the key array */
for (i = 0; i < lenq; i++)
{
int lo = 0;
int hi = lenk;
while (lo < hi)
{
int mid = (lo + hi) / 2;
int res = CMPTRGM(ptrq, ptrk + mid);
if (res < 0)
hi = mid;
else if (res > 0)
lo = mid + 1;
else
{
result[i] = true;
break;
}
}
ptrq++;
}
return result;
}
Datum
similarity(PG_FUNCTION_ARGS)
{

View File

@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
*
* Returns an array of trigrams required by the regular expression, or NULL if
* the regular expression was too complex to analyze. In addition, a packed
* graph representation of the regex is returned into *graph.
* graph representation of the regex is returned into *graph. The results
* must be allocated in rcontext (which might or might not be the current
* context).
*/
TRGM *
createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
createTrgmNFA(text *text_re, Oid collation,
TrgmPackedGraph **graph, MemoryContext rcontext)
{
TRGM *trg;
regex_t regex;
@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
/*
* This processing generates a great deal of cruft, which we'd like to
* clean up before returning (since this function is normally called in a
* clean up before returning (since this function may be called in a
* query-lifespan memory context). Make a temp context we can work in so
* that cleanup is easy. Note that the returned data structures must be
* allocated in caller's context, however.
* that cleanup is easy.
*/
tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
"createTrgmNFA temporary context",
@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
*/
PG_TRY();
{
trg = createTrgmNFAInternal(&regex, graph, oldcontext);
trg = createTrgmNFAInternal(&regex, graph, rcontext);
}
PG_CATCH();
{

View File

@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
</para>
<para>
Beginning in <productname>PostgreSQL</> 9.3, <filename>pg_trgm</filename>
GIN indexes also support index searches for regular-expression matches
Beginning in <productname>PostgreSQL</> 9.3, these index types also support
index searches for regular-expression matches
(<literal>~</> and <literal>~*</> operators), for example
<programlisting>
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';