trgm - Trigram matching for PostgreSQL

--------------------------------------

	The pg_trgm contrib module provides functions and index classes
	for determining the similarity of text based on trigram
	matching.
This commit is contained in:
Teodor Sigaev 2004-05-31 17:18:12 +00:00
parent 553bc41633
commit cbfa4092bb
9 changed files with 4529 additions and 0 deletions

17
contrib/pg_trgm/Makefile Normal file
View File

@ -0,0 +1,17 @@
# Build description for the pg_trgm contrib module.
# Location of this directory and of the top of the build tree.
subdir = contrib/pg_trgm
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# Put the module's own directory first on the preprocessor include path.
override CPPFLAGS := -I. $(CPPFLAGS)
# Module name and the object files it is linked from.
MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o
# Installation script and documentation shipped with the module.
DATA_built = pg_trgm.sql
DOCS = README.pg_trgm
# Name of the regression test.
REGRESS = pg_trgm
# Shared contrib build rules; included last so they see the variables above.
include $(top_srcdir)/contrib/contrib-global.mk
# DO NOT DELETE

View File

@ -0,0 +1,138 @@
trgm - Trigram matching for PostgreSQL
--------------------------------------
Introduction
This module is sponsored by Delta-Soft Ltd., Moscow, Russia.
The pg_trgm contrib module provides functions and index classes
for determining the similarity of text based on trigram
matching.
Definitions
Trigram (or Trigraph)
A trigram is a set of three consecutive characters taken
from a string. A string is considered to have two spaces
prefixed and one space suffixed when determining the set
of trigrams that comprise the string.
eg. The set of trigrams in the word "cat" is "  c", " ca",
"cat" and "at ".
Public Functions
real similarity(text, text)
Returns a number that indicates how closely the two
arguments match. A zero result indicates that the two words
are completely dissimilar, and a result of one indicates that
the two words are identical.
real show_limit()
Returns the current similarity threshold used by the '%'
operator. This in effect sets the minimum similarity between
two words in order that they be considered similar enough to
be misspellings of each other, for example.
real set_limit(real)
Sets the current similarity threshold that is used by the '%'
operator, and is returned by the show_limit() function.
text[] show_trgm(text)
Returns an array of all the trigrams of the supplied text
parameter.
Public Operators
text % text (returns boolean)
The '%' operator returns TRUE if its two arguments have a similarity
that is greater than or equal to the similarity threshold set by
set_limit(). It will return FALSE if the similarity is less than the
current threshold.
Public Index Operator Classes
gist_trgm_ops
The pg_trgm module comes with an index operator class that allows a
developer to create an index over a text column for the purpose
of very fast similarity searches.
To use this index, the '%' operator must be used and an appropriate
similarity threshold for the application must be set.
eg.
CREATE TABLE test_trgm (t text);
CREATE INDEX trgm_idx ON test_trgm USING gist (t gist_trgm_ops);
At this point, you will have an index on the t text column that you
can use for similarity searching.
eg.
SELECT
t,
similarity(t, 'word') AS sml
FROM
test_trgm
WHERE
t % 'word'
ORDER BY
sml DESC, t;
This will return all values in the text column that are sufficiently
similar to 'word', sorted from best match to worst. The index will
be used to make this a fast operation over very large data sets.
Tsearch2 Integration
Trigram matching is a very useful tool when used in conjunction
with a text index created by the Tsearch2 contrib module. (See
contrib/tsearch2)
The first step is to generate an auxiliary table containing all
the unique words in the Tsearch2 index:
CREATE TABLE words AS
SELECT word FROM stat('SELECT vector FROM documents');
Where 'documents' is the table that contains the Tsearch2 index
column 'vector', of type 'tsvector'.
Next, create a trigram index on the word column:
CREATE INDEX words_idx ON words USING gist(word gist_trgm_ops);
Now, a SELECT query similar to the example above can be used to
suggest spellings for misspelled words in user search terms. A
useful extra clause is to ensure that the similar words are also
of similar length to the misspelled word.
Note: Since the 'words' table has been generated as a separate,
static table, it will need to be periodically regenerated so that
it remains up to date with the word list in the Tsearch2 index.
Authors
Oleg Bartunov <oleg@sai.msu.su>, Moscow, Moscow University, Russia
Teodor Sigaev <teodor@sigaev.ru>, Moscow, Delta-Soft Ltd., Russia
Contributors
Christopher Kings-Lynne wrote this README file
References
Tsearch2 Development Site
http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/
GiST Development Site
http://www.sai.msu.su/~megera/postgres/gist/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,107 @@
-- Create all objects of this script in the public schema.
SET search_path = public;
-- The whole installation runs as one transaction, closed by the COMMIT
-- at the end of the script.
BEGIN;
-- set_limit(float4): sets the similarity threshold used by the % operator
-- and returns the new value.
-- The function changes session state, so it must not be declared cachable
-- (immutable): a cachable call with constant arguments may be evaluated
-- once while planning instead of every time the statement runs.
CREATE FUNCTION set_limit(float4)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE 'C' VOLATILE STRICT;
-- show_limit(): returns the current similarity threshold.
-- The result depends on a session variable changed by set_limit(), so the
-- function is STABLE rather than cachable (immutable); otherwise a cached
-- value could outlive a later set_limit() call.
CREATE FUNCTION show_limit()
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE 'C' STABLE STRICT;
-- show_trgm(text): returns the trigrams extracted from the argument
-- as a text array.
CREATE FUNCTION show_trgm(text)
    RETURNS text[]
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' IMMUTABLE STRICT;
-- similarity(text, text): returns the trigram similarity of the two
-- arguments as a float4.
CREATE FUNCTION similarity(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' IMMUTABLE STRICT;
-- similarity_op(text, text): implementation of the % operator; true when
-- the similarity of the arguments reaches the current threshold.
-- STABLE, not cachable (immutable): the result depends on the session
-- threshold set by set_limit(), so it must not be folded into a constant
-- or treated as fixed for given arguments.
CREATE FUNCTION similarity_op(text, text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE 'C' STABLE STRICT;
-- text % text: similarity test backed by similarity_op(); the operator
-- is its own commutator.
CREATE OPERATOR % (
    PROCEDURE  = similarity_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '%',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
-- GiST key type.
-- gtrgm is the storage type of the index keys; its text input and output
-- functions exist only because CREATE TYPE requires them (the C versions
-- raise "Not implemented").
CREATE FUNCTION gtrgm_in(cstring)
    RETURNS gtrgm
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' STRICT;

CREATE FUNCTION gtrgm_out(gtrgm)
    RETURNS cstring
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' STRICT;

-- Variable-length type (INTERNALLENGTH = -1).
CREATE TYPE gtrgm (
    INPUT          = gtrgm_in,
    OUTPUT         = gtrgm_out,
    INTERNALLENGTH = -1
);
-- GiST support functions; they are bound to support numbers 1-7 by the
-- gist_trgm_ops operator class.
-- NOTE(review): the declared argument/result types of gtrgm_union and
-- gtrgm_same do not describe what the C code actually passes -- presumably
-- nominal because only the index machinery calls them; confirm.

-- consistent: does an index entry possibly match the query?
CREATE FUNCTION gtrgm_consistent(gtrgm, internal, int4)
    RETURNS bool
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- compress: build the stored key from an entry
CREATE FUNCTION gtrgm_compress(internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- decompress: stored keys are used as they are
CREATE FUNCTION gtrgm_decompress(internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- penalty: cost of adding a new entry under an existing key
CREATE FUNCTION gtrgm_penalty(internal, internal, internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' STRICT;

-- picksplit: distribute the entries of an overfull page over two pages
CREATE FUNCTION gtrgm_picksplit(internal, internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- union: key covering a set of entries
CREATE FUNCTION gtrgm_union(bytea, internal)
    RETURNS _int4
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';

-- same: equality of two keys
CREATE FUNCTION gtrgm_same(gtrgm, gtrgm, internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C';
-- Operator class that lets a GiST index on a text column serve the
-- % operator; index keys are stored as gtrgm.
CREATE OPERATOR CLASS gist_trgm_ops
    FOR TYPE text USING gist
AS
    OPERATOR 1 % (text, text),
    FUNCTION 1 gtrgm_consistent (gtrgm, internal, int4),
    FUNCTION 2 gtrgm_union (bytea, internal),
    FUNCTION 3 gtrgm_compress (internal),
    FUNCTION 4 gtrgm_decompress (internal),
    FUNCTION 5 gtrgm_penalty (internal, internal, internal),
    FUNCTION 6 gtrgm_picksplit (internal, internal),
    FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal),
    STORAGE gtrgm;

-- End of the installation transaction.
COMMIT;

View File

@ -0,0 +1,30 @@
\set ECHO none
-- Regression script for pg_trgm: loads the module script, checks
-- show_trgm() and similarity() on literal strings, then runs the same
-- three similarity searches first without and then with a GiST index
-- (sequential scans disabled for the second run).
-- NOTE(review): the copy meta-command further down opens a quote before
-- data/trgm.data and never closes it -- confirm whether that is intended.
-- Comments are kept inside this ECHO-none stretch on the assumption that
-- the expected-output file mirrors every echoed input line -- TODO confirm.
\i pg_trgm.sql
\set ECHO all
select show_trgm('');
select show_trgm('(*&^$@%@');
select show_trgm('a b c');
select show_trgm(' a b c ');
select show_trgm('aA bB cC');
select show_trgm(' aA bB cC ');
select show_trgm('a b C0*%^');
select similarity('wow','WOWa ');
select similarity('wow',' WOW ');
CREATE TABLE test_trgm(t text);
\copy test_trgm from 'data/trgm.data
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;

88
contrib/pg_trgm/trgm.h Normal file
View File

@ -0,0 +1,88 @@
#ifndef __TRGM_H__
#define __TRGM_H__
#include "postgres.h"
#include "access/gist.h"
#include "access/itup.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
/* options */
/* Number of blanks placed in front of every word before trigrams are cut. */
#define LPADDING 2
/* Number of blanks counted after every word when trigrams are cut. */
#define RPADDING 1
/* When defined, words consist of isalnum() characters only; otherwise of non-space characters. */
#define KEEPONLYALNUM
/* When defined, words are lower-cased before trigrams are cut. */
#define IGNORECASE
/* When defined, similarity divides by the size of the union of both trigram sets; otherwise by the larger set. */
#define DIVUNION
/* A trigram: three raw bytes, not NUL-terminated. */
typedef char trgm[3];

/* Three-way comparison of two chars: -1, 0 or 1. */
#define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )
/* Three-way comparison of byte i of the objects a and b point to. */
#define CMPPCHAR(a,b,i)  CMPCHAR( *(((char*)(a))+(i)), *(((char*)(b))+(i)) )
/* Three-way comparison of two trigrams, byte by byte from the first. */
#define CMPTRGM(a,b) ( CMPPCHAR(a,b,0) ? CMPPCHAR(a,b,0) : ( CMPPCHAR(a,b,1) ? CMPPCHAR(a,b,1) : CMPPCHAR(a,b,2) ) )

/*
 * Copy the three bytes of trigram b to a.
 *
 * The macro expands to a single statement WITHOUT a trailing semicolon;
 * the caller supplies it.  A semicolon inside the macro would turn
 * "if (x) CPTRGM(a,b); else ..." into a syntax error.
 */
#define CPTRGM(a,b) do {				\
	*(((char*)(a))+0) = *(((char*)(b))+0);	\
	*(((char*)(a))+1) = *(((char*)(b))+1);	\
	*(((char*)(a))+2) = *(((char*)(b))+2);	\
} while(0)
/*
 * Key/value container used both for trigram arrays and for GiST
 * signature keys.  The payload starts TRGMHRDSIZE bytes after the start
 * of the structure (see GETARR / GETSIGN).
 */
typedef struct {
int4 len; /* total size in bytes, header included (see CALCGTSIZE) */
uint8 flag; /* combination of ARRKEY, SIGNKEY, ALLISTRUE */
char data[1]; /* first byte of the payload */
} TRGM;
/* Size of the header (len + flag) in front of the payload. */
#define TRGMHRDSIZE (sizeof(int4)+sizeof(uint8))
/* gist */

/* Bits per signature byte. */
#define BITBYTE 8
/* Signature length in ints. */
#define SIGLENINT  3	/* >122 => key will toast, so very slow!!! */
/* Signature length in bytes. */
#define SIGLEN	( sizeof(int)*SIGLENINT )
/* Number of hashable bits; the last bit of the signature is reserved. */
#define SIGLENBIT (SIGLEN*BITBYTE - 1)	/* see makesign */

typedef char BITVEC[SIGLEN];
typedef char *BITVECP;

/* Run statement a for every byte index; needs an integer i in scope. */
#define LOOPBYTE(a) \
	for(i=0;i<SIGLEN;i++) {\
		a;\
	}

/* Run statement a for every hashable bit index; needs an integer i in scope. */
#define LOOPBIT(a) \
	for(i=0;i<SIGLENBIT;i++) {\
		a;\
	}

/* Byte of signature x that holds bit i. */
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
/* Bit i (0..7) of the single byte x; arguments fully parenthesized. */
#define GETBITBYTE(x,i) ( ( ((char)(x)) >> (i) ) & 0x01 )
#define CLRBIT(x,i)   GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
#define SETBIT(x,i)   GETBYTE(x,i) |=  ( 0x01 << ( (i) % BITBYTE ) )
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )

/* Map a value to a bit position and set that bit in a signature. */
#define HASHVAL(val) (((unsigned int)(val)) % SIGLENBIT)
#define HASH(sign, val) SETBIT((sign), HASHVAL(val))
/* Flag bits of TRGM.flag. */
#define ARRKEY		0x01	/* payload is a sorted array of trigrams */
#define SIGNKEY		0x02	/* payload is a bit signature */
#define ALLISTRUE	0x04	/* signature with every bit set; no payload stored */

/* Flag tests; the argument is parenthesized so any pointer expression is safe. */
#define ISARRKEY(x) ( ((TRGM*)(x))->flag & ARRKEY )
#define ISSIGNKEY(x)	( ((TRGM*)(x))->flag & SIGNKEY )
#define ISALLTRUE(x)	( ((TRGM*)(x))->flag & ALLISTRUE )

/* Total size in bytes of a key with the given flags holding len trigrams. */
#define CALCGTSIZE(flag, len) ( TRGMHRDSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(trgm)) : (((flag) & ALLISTRUE) ? 0 : SIGLEN) ) )
/* Payload viewed as a signature / as a trigram array. */
#define GETSIGN(x)	( (BITVECP)( (char*)(x)+TRGMHRDSIZE ) )
#define GETARR(x)		( (trgm*)( (char*)(x)+TRGMHRDSIZE ) )
/* Number of trigrams in an array key, derived from its total size. */
#define ARRNELEM(x) ( ( ((TRGM*)(x))->len - TRGMHRDSIZE )/sizeof(trgm) )
/* Current similarity threshold; defined in trgm_op.c and changed by set_limit(). */
extern float4 trgm_limit;
/* Build the sorted, duplicate-free trigram array of the slen bytes at str; result is palloc'ed. */
TRGM* generate_trgm(char *str, int slen);
/* Similarity of two trigram arrays produced by generate_trgm(). */
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
#endif

566
contrib/pg_trgm/trgm_gist.c Normal file
View File

@ -0,0 +1,566 @@
#include "trgm.h"
#include "access/gist.h"
#include "access/itup.h"
#include "access/rtree.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
#include "access/tuptoaster.h"
/* Function-manager registration and prototypes of the SQL-callable functions in this file. */
/* I/O functions of the gtrgm key type */
PG_FUNCTION_INFO_V1(gtrgm_in);
Datum gtrgm_in(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_out);
Datum gtrgm_out(PG_FUNCTION_ARGS);
/* GiST support functions */
PG_FUNCTION_INFO_V1(gtrgm_compress);
Datum gtrgm_compress(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_decompress);
Datum gtrgm_decompress(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_consistent);
Datum gtrgm_consistent(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_union);
Datum gtrgm_union(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_same);
Datum gtrgm_same(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_penalty);
Datum gtrgm_penalty(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_picksplit);
Datum gtrgm_picksplit(PG_FUNCTION_ARGS);
/* Key of entry number pos of a GistEntryVector, as a TRGM pointer. */
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
/* Number of set bits in the byte val (sum of its eight bits). */
#define SUMBIT(val) ( \
GETBITBYTE(val,0) + \
GETBITBYTE(val,1) + \
GETBITBYTE(val,2) + \
GETBITBYTE(val,3) + \
GETBITBYTE(val,4) + \
GETBITBYTE(val,5) + \
GETBITBYTE(val,6) + \
GETBITBYTE(val,7) \
)
/*
 * Text input of the gtrgm key type: unsupported, always raises an error.
 */
Datum
gtrgm_in(PG_FUNCTION_ARGS)
{
	elog(ERROR, "Not implemented");

	/* only reached if elog() returns; keeps the function well-formed */
	PG_RETURN_DATUM(0);
}
/*
 * Text output of the gtrgm key type: unsupported, always raises an error.
 */
Datum
gtrgm_out(PG_FUNCTION_ARGS)
{
	elog(ERROR, "Not implemented");

	/* only reached if elog() returns; keeps the function well-formed */
	PG_RETURN_DATUM(0);
}
/*
 * Build the bit signature of a trigram array.
 *
 * sign: output buffer of sizeof(BITVEC) bytes, overwritten completely.
 * a:    array key (ARRKEY) whose trigrams are hashed into the signature.
 */
static void
makesign(BITVECP sign, TRGM * a)
{
	trgm	   *cur = GETARR(a);
	int4		remaining = ARRNELEM(a);
	int4		hashval = 0;	/* only its first three bytes are ever rewritten */

	MemSet((void *) sign, 0, sizeof(BITVEC));
	SETBIT(sign, SIGLENBIT);	/* set last unused bit */

	while (remaining-- > 0)
	{
		CPTRGM(((char *) &hashval), cur);
		HASH(sign, hashval);
		cur++;
	}
}
Datum
gtrgm_compress(PG_FUNCTION_ARGS)
{
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
GISTENTRY *retval = entry;
if (entry->leafkey)
{ /* trgm */
TRGM *res;
text *toastedval = (text *) DatumGetPointer(entry->key);
text *val = (text *) DatumGetPointer(PG_DETOAST_DATUM(entry->key));
res = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
if (val != toastedval)
pfree(val);
retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
gistentryinit(*retval, PointerGetDatum(res),
entry->rel, entry->page,
entry->offset, res->len, FALSE);
}
else if (ISSIGNKEY(DatumGetPointer(entry->key)) &&
!ISALLTRUE(DatumGetPointer(entry->key)))
{
int4 i,
len;
TRGM *res;
BITVECP sign = GETSIGN(DatumGetPointer(entry->key));
LOOPBYTE(
if ((sign[i] & 0xff) != 0xff)
PG_RETURN_POINTER(retval);
);
len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0);
res = (TRGM *) palloc(len);
res->len = len;
res->flag = SIGNKEY | ALLISTRUE;
retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
gistentryinit(*retval, PointerGetDatum(res),
entry->rel, entry->page,
entry->offset, res->len, FALSE);
}
PG_RETURN_POINTER(retval);
}
/*
 * GiST decompress method: keys are used in their stored form, so the
 * argument is handed back untouched.
 */
Datum
gtrgm_decompress(PG_FUNCTION_ARGS)
{
	Datum		entry = PG_GETARG_DATUM(0);

	PG_RETURN_DATUM(entry);
}
Datum
gtrgm_consistent(PG_FUNCTION_ARGS)
{
text *query = (text *) PG_GETARG_TEXT_P(1);
TRGM *key = (TRGM *) DatumGetPointer( ((GISTENTRY *) PG_GETARG_POINTER(0))->key );
TRGM *qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
int res=false;
if ( GIST_LEAF( (GISTENTRY *) PG_GETARG_POINTER(0) ) ) { /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key,qtrg);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = ( *(int*)&tmpsml==*(int*)&trgm_limit || tmpsml > trgm_limit ) ? true : false;
} else if ( ISALLTRUE(key) ) { /* non-leaf contains signature */
res = true;
} else { /* non-leaf contains signature */
int4 count=0;
int4 k, len = ARRNELEM(qtrg);
trgm *ptr = GETARR(qtrg);
BITVECP sign = GETSIGN(key);
int4 tmp=0;
for (k = 0; k < len; k++) {
CPTRGM( ((char*)&tmp), ptr+k );
count += GETBIT(sign, HASHVAL(tmp));
}
#ifdef DIVUNION
res = ( len==count ) ? true : ( ( ( ( ((float4)count) / ((float4)(len-count)) ) ) >= trgm_limit ) ? true : false );
#else
res = (len==0) ? false : ( ( ( ( ((float4)count) / ((float4)len) ) ) >= trgm_limit ) ? true : false );
#endif
}
PG_FREE_IF_COPY(query,1);
pfree(qtrg);
PG_RETURN_BOOL(res);
}
/*
 * Merge key "add" into the signature sbase.
 *
 * Returns 1 when add is an ALLISTRUE key (sbase is then left untouched
 * and the caller must treat the union as saturated), otherwise 0.
 */
static int4
unionkey(BITVECP sbase, TRGM * add)
{
	int4		i;

	if (!ISSIGNKEY(add))
	{
		/* trigram array: hash every trigram into the signature */
		trgm	   *ptr = GETARR(add);
		int4		nelem = ARRNELEM(add);
		int4		hashval = 0;

		for (i = 0; i < nelem; i++)
		{
			CPTRGM(((char *) &hashval), ptr + i);
			HASH(sbase, hashval);
		}
		return 0;
	}

	if (ISALLTRUE(add))
		return 1;

	/* plain signature: OR it in byte by byte */
	{
		BITVECP		sadd = GETSIGN(add);

		for (i = 0; i < SIGLEN; i++)
			sbase[i] |= sadd[i];
	}
	return 0;
}
/*
 * GiST union method: builds a signature key covering all entries of the
 * vector passed as argument 0 and stores its size through argument 1.
 * The result is an ALLISTRUE key as soon as one entry is ALLISTRUE.
 */
Datum
gtrgm_union(PG_FUNCTION_ARGS)
{
	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
	int		   *size = (int *) PG_GETARG_POINTER(1);
	int4		nentries = entryvec->n;
	int4		flag = SIGNKEY;
	int4		keysize;
	int4		i;
	BITVEC		base;
	TRGM	   *result;

	MemSet((void *) base, 0, sizeof(BITVEC));

	for (i = 0; i < nentries; i++)
	{
		if (unionkey(base, GETENTRY(entryvec, i)))
		{
			/* saturated: no need to look at the remaining entries */
			flag |= ALLISTRUE;
			break;
		}
	}

	keysize = CALCGTSIZE(flag, 0);
	result = (TRGM *) palloc(keysize);
	result->len = keysize;
	result->flag = flag;
	*size = keysize;

	/* an ALLISTRUE key carries no signature bytes */
	if (!ISALLTRUE(result))
		memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC));

	PG_RETURN_POINTER(result);
}
Datum
gtrgm_same(PG_FUNCTION_ARGS)
{
TRGM *a = (TRGM *) PG_GETARG_POINTER(0);
TRGM *b = (TRGM *) PG_GETARG_POINTER(1);
bool *result = (bool *) PG_GETARG_POINTER(2);
if (ISSIGNKEY(a))
{ /* then b also ISSIGNKEY */
if (ISALLTRUE(a) && ISALLTRUE(b))
*result = true;
else if (ISALLTRUE(a))
*result = false;
else if (ISALLTRUE(b))
*result = false;
else
{
int4 i;
BITVECP sa = GETSIGN(a),
sb = GETSIGN(b);
*result = true;
LOOPBYTE(
if (sa[i] != sb[i])
{
*result = false;
break;
}
);
}
}
else
{ /* a and b ISARRKEY */
int4 lena = ARRNELEM(a),
lenb = ARRNELEM(b);
if (lena != lenb)
*result = false;
else
{
trgm *ptra = GETARR(a),
*ptrb = GETARR(b);
int4 i;
*result = true;
for (i = 0; i < lena; i++)
if (CMPTRGM(ptra+i, ptrb+i))
{
*result = false;
break;
}
}
}
PG_RETURN_POINTER(result);
}
/*
 * Number of set bits in a signature of SIGLEN bytes.
 */
static int4
sizebitvec(BITVECP sign)
{
	int4		nbits = 0;
	int4		i;

	for (i = 0; i < SIGLEN; i++)
		nbits += SUMBIT(sign[i]);

	return nbits;
}
/*
 * Hamming distance of two signatures: the number of positions among the
 * first SIGLENBIT bits in which a and b differ.
 */
static int
hemdistsign(BITVECP a, BITVECP b)
{
	int			dist = 0;
	int			i;

	for (i = 0; i < SIGLENBIT; i++)
	{
		if (GETBIT(a, i) != GETBIT(b, i))
			dist++;
	}

	return dist;
}
/*
 * Distance between two signature keys.  An ALLISTRUE key has no stored
 * signature, so its distance to a plain key is the number of bits the
 * plain key leaves unset.
 */
static int
hemdist(TRGM *a, TRGM *b)
{
	if (ISALLTRUE(a) && ISALLTRUE(b))
		return 0;

	if (ISALLTRUE(a))
		return SIGLENBIT - sizebitvec(GETSIGN(b));

	if (ISALLTRUE(b))
		return SIGLENBIT - sizebitvec(GETSIGN(a));

	return hemdistsign(GETSIGN(a), GETSIGN(b));
}
/*
 * GiST penalty method: writes to *penalty (argument 2) the cost of
 * putting the new entry (argument 1) under the existing key
 * (argument 0), measured as a distance between signatures.
 */
Datum
gtrgm_penalty(PG_FUNCTION_ARGS)
{
	GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);	/* always ISSIGNKEY */
	GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
	float	   *penalty = (float *) PG_GETARG_POINTER(2);
	TRGM	   *origval = (TRGM *) DatumGetPointer(origentry->key);
	TRGM	   *newval = (TRGM *) DatumGetPointer(newentry->key);

	if (!ISARRKEY(newval))
	{
		/* both keys are signatures */
		*penalty = hemdist(origval, newval);
	}
	else
	{
		/* new entry is a trigram array: compare through its signature */
		BITVEC		sign;

		makesign(sign, newval);

		if (ISALLTRUE(origval))
			*penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1);
		else
			*penalty = hemdistsign(sign, GETSIGN(origval));
	}

	PG_RETURN_POINTER(penalty);
}
/*
 * Per-entry signature cache used while splitting a page.
 */
typedef struct
{
	bool		allistrue;		/* entry is an ALLISTRUE key; sign is not filled */
	BITVEC		sign;			/* signature of the entry otherwise */
} CACHESIGN;

/*
 * Fill a cache slot from a key of any kind: array keys are hashed into a
 * signature, plain signature keys are copied, ALLISTRUE keys only set
 * the flag.
 */
static void
fillcache(CACHESIGN * item, TRGM * key)
{
	if (ISARRKEY(key))
	{
		item->allistrue = false;
		makesign(item->sign, key);
		return;
	}

	if (ISALLTRUE(key))
	{
		item->allistrue = true;
		return;
	}

	item->allistrue = false;
	memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC));
}
#define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) )
typedef struct
{
OffsetNumber pos;
int4 cost;
} SPLITCOST;
static int
comparecost(const void *a, const void *b)
{
if (((SPLITCOST *) a)->cost == ((SPLITCOST *) b)->cost)
return 0;
else
return (((SPLITCOST *) a)->cost > ((SPLITCOST *) b)->cost) ? 1 : -1;
}
/*
 * Distance between two cached signatures; same rules as for keys: a
 * saturated entry is at distance "number of unset bits" from a plain one.
 */
static int
hemdistcache(CACHESIGN *a, CACHESIGN *b)
{
	if (a->allistrue && b->allistrue)
		return 0;

	if (a->allistrue)
		return SIGLENBIT - sizebitvec(b->sign);

	if (b->allistrue)
		return SIGLENBIT - sizebitvec(a->sign);

	return hemdistsign(a->sign, b->sign);
}
/*
 * GiST picksplit method.
 *
 * Argument 0 is the vector of entries of the page being split, argument
 * 1 the GIST_SPLITVEC to fill.  The two entries with the largest mutual
 * distance become the seeds of the left and right page.  The entries are
 * then visited in ascending order of |distance to seed 1 - distance to
 * seed 2| and each one is added to the side whose current union key is
 * closer, with WISH_F() biasing the choice by the current page sizes.
 *
 * On return spl_left/spl_right hold the offsets of both groups and
 * spl_ldatum/spl_rdatum the union keys (signature keys) of both pages.
 */
Datum
gtrgm_picksplit(PG_FUNCTION_ARGS)
{
	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
	OffsetNumber maxoff = entryvec->n - 2;
	GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
	OffsetNumber k,
				j;
	TRGM	   *datum_l,
			   *datum_r;
	BITVECP		union_l,
				union_r;
	int4		size_alpha,
				size_beta;
	int4		size_waste,
				waste = -1;
	int4		nbytes;
	OffsetNumber seed_1 = 0,
				seed_2 = 0;
	OffsetNumber *left,
			   *right;
	BITVECP		ptr;
	int			i;
	CACHESIGN  *cache;
	SPLITCOST  *costvector;

	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
	v->spl_left = (OffsetNumber *) palloc(nbytes);
	v->spl_right = (OffsetNumber *) palloc(nbytes);

	/*
	 * Find the pair of entries that are farthest apart.  The cache of
	 * signatures is filled on the fly during the first outer iteration.
	 */
	cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2));
	fillcache(&cache[FirstOffsetNumber], GETENTRY(entryvec, FirstOffsetNumber));

	for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k))
	{
		for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j))
		{
			if (k == FirstOffsetNumber)
				fillcache(&cache[j], GETENTRY(entryvec, j));

			size_waste = hemdistcache(&(cache[j]), &(cache[k]));
			if (size_waste > waste)
			{
				waste = size_waste;
				seed_1 = k;
				seed_2 = j;
			}
		}
	}

	left = v->spl_left;
	v->spl_nleft = 0;
	right = v->spl_right;
	v->spl_nright = 0;

	/* no pair was examined: fall back to the first two entries */
	if (seed_1 == 0 || seed_2 == 0)
	{
		seed_1 = 1;
		seed_2 = 2;
	}

	/* form initial .. (union keys start as copies of the seeds) */
	if (cache[seed_1].allistrue)
	{
		datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
		datum_l->len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0);
		datum_l->flag = SIGNKEY | ALLISTRUE;
	}
	else
	{
		datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
		datum_l->len = CALCGTSIZE(SIGNKEY, 0);
		datum_l->flag = SIGNKEY;
		memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC));
	}
	if (cache[seed_2].allistrue)
	{
		datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
		datum_r->len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0);
		datum_r->flag = SIGNKEY | ALLISTRUE;
	}
	else
	{
		datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
		datum_r->len = CALCGTSIZE(SIGNKEY, 0);
		datum_r->flag = SIGNKEY;
		memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC));
	}

	/* only dereferenced below when the corresponding key is not ALLISTRUE */
	union_l = GETSIGN(datum_l);
	union_r = GETSIGN(datum_r);

	/* from here on the last entry of the vector takes part as well */
	maxoff = OffsetNumberNext(maxoff);
	fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff));

	/* sort before ... (entries with a clear preference come last) */
	costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
	for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j))
	{
		costvector[j - 1].pos = j;
		size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j]));
		size_beta = hemdistcache(&(cache[seed_2]), &(cache[j]));
		costvector[j - 1].cost = abs(size_alpha - size_beta);
	}
	qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);

	for (k = 0; k < maxoff; k++)
	{
		j = costvector[k].pos;

		/* the seeds go to their own side unconditionally */
		if (j == seed_1)
		{
			*left++ = j;
			v->spl_nleft++;
			continue;
		}
		else if (j == seed_2)
		{
			*right++ = j;
			v->spl_nright++;
			continue;
		}

		/*
		 * Distance of entry j to the current left union.  cache[j].sign
		 * already is a bare signature, so it is passed as it is; GETSIGN()
		 * is only for TRGM keys and would skip TRGMHRDSIZE bytes into the
		 * signature and read past its end.
		 */
		if (ISALLTRUE(datum_l) || cache[j].allistrue)
		{
			if (ISALLTRUE(datum_l) && cache[j].allistrue)
				size_alpha = 0;
			else
				size_alpha = SIGLENBIT - sizebitvec(
					(cache[j].allistrue) ? GETSIGN(datum_l) : (BITVECP) cache[j].sign
					);
		}
		else
		{
			size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l));
		}

		/* same for the right union */
		if (ISALLTRUE(datum_r) || cache[j].allistrue)
		{
			if (ISALLTRUE(datum_r) && cache[j].allistrue)
				size_beta = 0;
			else
				size_beta = SIGLENBIT - sizebitvec(
					(cache[j].allistrue) ? GETSIGN(datum_r) : (BITVECP) cache[j].sign
					);
		}
		else
		{
			size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r));
		}

		if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1))
		{
			/* add to the left page and widen its union key */
			if (ISALLTRUE(datum_l) || cache[j].allistrue)
			{
				/* saturated entry joins a plain union: set every bit */
				if (!ISALLTRUE(datum_l))
					MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC));
			}
			else
			{
				ptr = cache[j].sign;
				LOOPBYTE(
						 union_l[i] |= ptr[i];
				);
			}
			*left++ = j;
			v->spl_nleft++;
		}
		else
		{
			/* add to the right page and widen its union key */
			if (ISALLTRUE(datum_r) || cache[j].allistrue)
			{
				if (!ISALLTRUE(datum_r))
					MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC));
			}
			else
			{
				ptr = cache[j].sign;
				LOOPBYTE(
						 union_r[i] |= ptr[i];
				);
			}
			*right++ = j;
			v->spl_nright++;
		}
	}

	/* terminate both offset lists */
	*right = *left = FirstOffsetNumber;

	pfree(costvector);
	pfree(cache);

	v->spl_ldatum = PointerGetDatum(datum_l);
	v->spl_rdatum = PointerGetDatum(datum_r);

	PG_RETURN_POINTER(v);
}

269
contrib/pg_trgm/trgm_op.c Normal file
View File

@ -0,0 +1,269 @@
#include "trgm.h"
#include <ctype.h>
#include "utils/array.h"
#include "catalog/pg_type.h"
/* Similarity threshold used by the % operator and the index; changed by set_limit(), initially 0.3. */
float4 trgm_limit = 0.3;
PG_FUNCTION_INFO_V1(set_limit);
Datum set_limit(PG_FUNCTION_ARGS);
Datum
set_limit(PG_FUNCTION_ARGS) {
float4 nlimit = PG_GETARG_FLOAT4(0);
if ( nlimit < 0 || nlimit > 1.0 )
elog(ERROR,"Wrong limit, should be between 0 and 1");
trgm_limit = nlimit;
PG_RETURN_FLOAT4(trgm_limit);
}
PG_FUNCTION_INFO_V1(show_limit);
Datum show_limit(PG_FUNCTION_ARGS);
Datum
show_limit(PG_FUNCTION_ARGS) {
PG_RETURN_FLOAT4(trgm_limit);
}
/* Scanner states of generate_trgm(). */
#define WORDWAIT	0			/* between words */
#define INWORD		1			/* collecting the characters of a word */

/*
 * qsort comparator ordering trigrams byte by byte.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			cmp = CMPTRGM(a, b);

	return cmp;
}
/*
 * Remove adjacent duplicates from a sorted trigram array in place.
 *
 * a:   array of len trigrams, sorted so that equal trigrams are adjacent.
 * Returns the number of trigrams kept at the front of the array.
 */
static int
unique_array(trgm *a, int len)
{
	trgm	   *last = a;		/* last trigram kept so far */
	int			idx;

	for (idx = 0; idx < len; idx++)
	{
		trgm	   *cand = a + idx;

		if (CMPTRGM(cand, last))
		{
			/* differs from the last kept one: append it */
			last++;
			CPTRGM(last, cand);
		}
	}

	return last + 1 - a;
}
/*
 * Build the trigram array of a string.
 *
 * str/slen: the bytes to scan (no terminator is required).
 *
 * The string is cut into words (runs of isalnum() characters when
 * KEEPONLYALNUM is defined, runs of non-space characters otherwise).
 * Every word is copied into a buffer behind LPADDING blanks, followed by
 * blanks, lower-cased when IGNORECASE is defined, and every three-byte
 * window of it is stored.  The collected trigrams are sorted and
 * duplicates are removed.
 *
 * Returns a palloc'ed ARRKEY value; it holds no trigrams when the input
 * is empty or contains no word characters.
 */
TRGM*
generate_trgm(char *str, int slen) {
TRGM* trg;
char *buf,*sptr,*bufptr;
trgm *tptr;
int state=WORDWAIT;
int wl,len;
/* room for (slen/2 + 1) * 3 trigrams behind the header */
trg = (TRGM*) palloc(TRGMHRDSIZE+sizeof(trgm) * (slen/2 + 1) * 3);
trg->flag = ARRKEY;
trg->len = TRGMHRDSIZE;
/* nothing to extract: return the empty array */
if ( slen+LPADDING+RPADDING<3 || slen == 0 )
return trg;
tptr = GETARR(trg);
/* word buffer: padding in front, the word, two blanks behind it */
buf = palloc(sizeof(char) * (slen+4));
sptr = str;
if ( LPADDING > 0 ) {
*buf = ' ';
if ( LPADDING > 1 )
*(buf+1) = ' ';
}
bufptr = buf+LPADDING;
while( sptr-str < slen ) {
if ( state == WORDWAIT ) {
if (
#ifdef KEEPONLYALNUM
isalnum((unsigned char)*sptr)
#else
!isspace( (unsigned char)*sptr )
#endif
) {
*bufptr = *sptr; /* start put word in buffer */
bufptr++;
state = INWORD;
/* a word that starts on the last character is complete right away */
if ( sptr-str == slen-1 /* last char */ )
goto gettrg;
}
} else {
if (
#ifdef KEEPONLYALNUM
!isalnum((unsigned char)*sptr)
#else
isspace( (unsigned char)*sptr )
#endif
) {
/* also entered by goto when the input ends inside a word */
gettrg:
/* word in buffer, so count trigrams */
*bufptr = ' ';
*(bufptr+1) = ' ';
/* number of three-byte windows over padding + word */
wl = bufptr - (buf+LPADDING) - 2 + LPADDING + RPADDING;
if ( wl<=0 ) {
bufptr = buf+LPADDING;
state = WORDWAIT;
sptr++;
continue;
}
#ifdef IGNORECASE
do { /* lower word */
int wwl = bufptr-buf;
bufptr = buf+LPADDING;
while( bufptr-buf < wwl ) {
*bufptr = tolower( (unsigned char) *bufptr );
bufptr++;
}
} while(0);
#endif
bufptr = buf;
/* set trigrams */
while( bufptr-buf < wl ) {
CPTRGM(tptr, bufptr);
bufptr++;
tptr++;
}
/* reset the buffer for the next word */
bufptr = buf+LPADDING;
state = WORDWAIT;
} else {
*bufptr = *sptr; /* put in buffer */
bufptr++;
/* input ends inside the word: flush it */
if ( sptr-str == slen-1 )
goto gettrg;
}
}
sptr++;
}
pfree(buf);
/* no word found: len still describes an empty array */
if ( (len=tptr-GETARR(trg)) == 0 )
return trg;
if ( len>0 ) {
/* sort, then drop duplicates */
qsort( (void*)GETARR(trg), len, sizeof(trgm), comp_trgm );
len = unique_array( GETARR(trg), len );
}
trg->len = CALCGTSIZE(ARRKEY, len);
return trg;
}
PG_FUNCTION_INFO_V1(show_trgm);
Datum		show_trgm(PG_FUNCTION_ARGS);

/*
 * show_trgm(text): return the trigrams of the argument as a text array,
 * one three-byte element per trigram, in the order generate_trgm()
 * produced them.
 */
Datum
show_trgm(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_P(0);
	TRGM	   *trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
	trgm	   *items = GETARR(trg);
	int			nitems = ARRNELEM(trg);
	Datum	   *d;
	ArrayType  *a;
	int			idx;

	/* one text value per trigram */
	d = (Datum *) palloc(sizeof(Datum) * (1 + nitems));
	for (idx = 0; idx < nitems; idx++)
	{
		text	   *item = (text *) palloc(VARHDRSZ + 3);

		VARATT_SIZEP(item) = VARHDRSZ + 3;
		CPTRGM(VARDATA(item), items + idx);
		d[idx] = PointerGetDatum(item);
	}

	a = construct_array(
						d,
						nitems,
						TEXTOID,
						-1,
						false,
						'i'
		);

	/* the elements are released once the array has been built from them */
	for (idx = 0; idx < nitems; idx++)
		pfree(DatumGetPointer(d[idx]));

	pfree(d);
	pfree(trg);
	PG_FREE_IF_COPY(in, 0);

	PG_RETURN_POINTER(a);
}
/*
 * Similarity of two trigram arrays.
 *
 * trg1, trg2: sorted, duplicate-free arrays as built by generate_trgm().
 *
 * The arrays are merged to count the trigrams they share.  With DIVUNION
 * the result is shared / (len1 + len2 - shared), otherwise
 * shared / max(len1, len2).  If either array is empty the result is 0.
 */
float4
cnt_sml(TRGM *trg1, TRGM *trg2)
{
	trgm	   *ptr1,
			   *ptr2;
	int			count = 0;
	int			len1,
				len2;

	ptr1 = GETARR(trg1);
	ptr2 = GETARR(trg2);

	len1 = ARRNELEM(trg1);
	len2 = ARRNELEM(trg2);

	/*
	 * Explicit test: nothing can be shared with an empty set, and when both
	 * sets are empty the division below would be 0/0 and yield NaN.
	 */
	if (len1 <= 0 || len2 <= 0)
		return (float4) 0.0;

	/* merge step over the two sorted arrays */
	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
	{
		int			res = CMPTRGM(ptr1, ptr2);

		if (res < 0)
			ptr1++;
		else if (res > 0)
			ptr2++;
		else
		{
			ptr1++;
			ptr2++;
			count++;
		}
	}

#ifdef DIVUNION
	return ((((float4) count) / ((float4) (len1 + len2 - count))));
#else
	return (((float) count) / ((float) ((len1 > len2) ? len1 : len2)));
#endif
}
PG_FUNCTION_INFO_V1(similarity);
Datum		similarity(PG_FUNCTION_ARGS);

/*
 * similarity(text, text): trigram similarity of the two arguments, as
 * computed by cnt_sml() on their trigram arrays.
 */
Datum
similarity(PG_FUNCTION_ARGS)
{
	text	   *in1 = PG_GETARG_TEXT_P(0);
	text	   *in2 = PG_GETARG_TEXT_P(1);
	TRGM	   *trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
	TRGM	   *trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
	float4		res = cnt_sml(trg1, trg2);

	/* release the work data before returning */
	pfree(trg1);
	pfree(trg2);
	PG_FREE_IF_COPY(in1, 0);
	PG_FREE_IF_COPY(in2, 1);

	PG_RETURN_FLOAT4(res);
}
PG_FUNCTION_INFO_V1(similarity_op);
Datum		similarity_op(PG_FUNCTION_ARGS);

/*
 * Implementation of the % operator: true when similarity() of the two
 * arguments is at least the current threshold.
 */
Datum
similarity_op(PG_FUNCTION_ARGS)
{
	Datum		sml = DirectFunctionCall2(similarity,
										  PG_GETARG_DATUM(0),
										  PG_GETARG_DATUM(1));
	float4		res = DatumGetFloat4(sml);

	PG_RETURN_BOOL(res >= trgm_limit);
}