postgresql/contrib/pg_trgm/trgm_op.c
Tom Lane 234a02b2a8 Replace direct assignments to VARATT_SIZEP(x) with SET_VARSIZE(x, len).
Get rid of VARATT_SIZE and VARATT_DATA, which were simply redundant with
VARSIZE and VARDATA, and as a consequence almost no code was using the
longer names.  Rename the length fields of struct varlena and various
derived structures to catch anyplace that was accessing them directly;
and clean up various places so caught.  In itself this patch doesn't
change any behavior at all, but it is necessary infrastructure if we hope
to play any games with the representation of varlena headers.
Greg Stark and Tom Lane
2007-02-27 23:48:10 +00:00

311 lines
5.2 KiB
C

#include "trgm.h"
#include <ctype.h>
#include "utils/array.h"
#include "catalog/pg_type.h"
/* Magic block identifying this file as a PostgreSQL loadable module. */
PG_MODULE_MAGIC;
/*
 * Similarity threshold consulted by similarity_op().  It starts at 0.3,
 * is replaced by set_limit() and is reported by show_limit().
 */
float4 trgm_limit = 0.3f;
PG_FUNCTION_INFO_V1(set_limit);
Datum		set_limit(PG_FUNCTION_ARGS);

/*
 * set_limit(float4) returns float4
 *
 * Store a new similarity threshold in trgm_limit and return the value that
 * is now in effect.  Any argument that is not inside the closed range
 * [0, 1] raises an ERROR and leaves trgm_limit untouched.
 */
Datum
set_limit(PG_FUNCTION_ARGS)
{
	float4		nlimit = PG_GETARG_FLOAT4(0);

	/*
	 * Phrased as a negated "is in range" test on purpose: every ordered
	 * comparison involving NaN is false, so the more obvious
	 * "nlimit < 0 || nlimit > 1.0" would let NaN through and store it as
	 * the threshold.
	 */
	if (!(nlimit >= 0 && nlimit <= 1.0))
		elog(ERROR, "wrong limit, should be between 0 and 1");

	trgm_limit = nlimit;
	PG_RETURN_FLOAT4(trgm_limit);
}
PG_FUNCTION_INFO_V1(show_limit);
Datum show_limit(PG_FUNCTION_ARGS);
Datum
show_limit(PG_FUNCTION_ARGS)
{
PG_RETURN_FLOAT4(trgm_limit);
}
/*
 * Scanner states used by generate_trgm() while walking the input string.
 */
#define WORDWAIT 0		/* between words: looking for the first word character */
#define INWORD 1		/* inside a word: copying characters into the buffer */
/*
 * qsort() comparison callback for an array of trgm elements.  The ordering
 * is entirely the one defined by the CMPTRGM macro.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			order = CMPTRGM(a, b);

	return order;
}
/*
 * Collapse runs of equal trigrams in place.
 *
 * a	- array of trigrams; equal elements must be adjacent (the caller in
 *		  this file sorts the array first)
 * len	- number of elements in a
 *
 * Returns the number of elements kept at the front of a.  An empty (or
 * negative-length) array yields 0; without the explicit check the final
 * "curend + 1 - a" would report one element even though none exist.
 */
static int
unique_array(trgm * a, int len)
{
	trgm	   *curend,
			   *tmp;

	if (len <= 0)
		return 0;

	/* curend tracks the last element kept, tmp the element being examined */
	curend = tmp = a;
	while (tmp - a < len)
	{
		if (CMPTRGM(tmp, curend))
		{
			/* differs from the last kept trigram: append it */
			curend++;
			CPTRGM(curend, tmp);
		}
		tmp++;
	}

	return curend + 1 - a;
}
/*
 * generate_trgm
 *
 * Build the sorted, duplicate-free set of trigrams for a string.
 *
 * str	- input bytes (not required to be NUL-terminated; only slen bytes
 *		  are read)
 * slen - number of bytes in str
 *
 * The string is split into words.  With KEEPONLYALNUM defined a word is a
 * run of isalnum() characters, otherwise a run of non-isspace() characters.
 * Each word is copied into a scratch buffer behind LPADDING blanks and
 * followed by blanks, lower-cased when IGNORECASE is defined, and every
 * 3-byte window of the padded word is emitted as one trigram.
 *
 * Returns a palloc'd TRGM with flag ARRKEY.  When no trigram is produced
 * the result has size TRGMHRDSIZE, i.e. an empty array.
 *
 * The size header is always written with SET_VARSIZE rather than by
 * touching a struct field, matching how show_trgm() builds its text items.
 */
TRGM *
generate_trgm(char *str, int slen)
{
	TRGM	   *trg;
	char	   *buf,
			   *sptr,
			   *bufptr;
	trgm	   *tptr;
	int			state = WORDWAIT;
	int			wl,
				len;

	trg = (TRGM *) palloc(TRGMHRDSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
	trg->flag = ARRKEY;
	SET_VARSIZE(trg, TRGMHRDSIZE);

	/* too short to form even one trigram: hand back the empty set */
	if (slen + LPADDING + RPADDING < 3 || slen == 0)
		return trg;

	tptr = GETARR(trg);

	/*
	 * Scratch buffer for one padded word: up to two leading blanks, at most
	 * slen word bytes, and the two trailing blanks written at gettrg.
	 * NOTE(review): this sizing assumes LPADDING <= 2 -- confirm in trgm.h.
	 */
	buf = palloc(sizeof(char) * (slen + 4));
	sptr = str;

	if (LPADDING > 0)
	{
		*buf = ' ';
		if (LPADDING > 1)
			*(buf + 1) = ' ';
	}

	bufptr = buf + LPADDING;
	while (sptr - str < slen)
	{
		if (state == WORDWAIT)
		{
			if (
#ifdef KEEPONLYALNUM
				isalnum((unsigned char) *sptr)
#else
				!isspace((unsigned char) *sptr)
#endif
				)
			{
				*bufptr = *sptr;	/* start put word in buffer */
				bufptr++;
				state = INWORD;
				if (sptr - str == slen - 1 /* last char */ )
					goto gettrg;
			}
		}
		else
		{
			if (
#ifdef KEEPONLYALNUM
				!isalnum((unsigned char) *sptr)
#else
				isspace((unsigned char) *sptr)
#endif
				)
			{
		gettrg:
				/* word in buffer, so count trigrams */
				*bufptr = ' ';
				*(bufptr + 1) = ' ';
				wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
				if (wl <= 0)
				{
					/* padded word is shorter than one trigram: drop it */
					bufptr = buf + LPADDING;
					state = WORDWAIT;
					sptr++;
					continue;
				}

#ifdef IGNORECASE
				do
				{				/* lower word */
					int			wwl = bufptr - buf;

					bufptr = buf + LPADDING;
					while (bufptr - buf < wwl)
					{
						*bufptr = tolower((unsigned char) *bufptr);
						bufptr++;
					}
				} while (0);
#endif
				bufptr = buf;
				/* set trigrams */
				while (bufptr - buf < wl)
				{
					CPTRGM(tptr, bufptr);
					bufptr++;
					tptr++;
				}
				/* rewind to just after the leading padding for the next word */
				bufptr = buf + LPADDING;
				state = WORDWAIT;
			}
			else
			{
				*bufptr = *sptr;	/* put in buffer */
				bufptr++;
				if (sptr - str == slen - 1)
					goto gettrg;
			}
		}
		sptr++;
	}

	pfree(buf);

	/* tptr only ever advances, so the count cannot be negative */
	len = tptr - GETARR(trg);
	if (len == 0)
		return trg;

	/* sort, then squeeze out duplicates so the result is a proper set */
	qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
	len = unique_array(GETARR(trg), len);

	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));

	return trg;
}
PG_FUNCTION_INFO_V1(show_trgm);
Datum		show_trgm(PG_FUNCTION_ARGS);

/*
 * show_trgm(text) returns text[]
 *
 * Run generate_trgm() over the argument and return its trigrams as an
 * array of 3-byte text values, one array element per trigram.
 */
Datum
show_trgm(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_P(0);
	TRGM	   *trg;
	trgm	   *cur;
	Datum	   *d;
	ArrayType  *a;
	int			nelems;
	int			i;

	trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
	nelems = ARRNELEM(trg);

	/* one spare slot so the request is never zero-sized */
	d = (Datum *) palloc(sizeof(Datum) * (1 + nelems));

	/* wrap every trigram in its own text datum */
	cur = GETARR(trg);
	for (i = 0; i < nelems; i++)
	{
		text	   *item = (text *) palloc(VARHDRSZ + 3);

		SET_VARSIZE(item, VARHDRSZ + 3);
		CPTRGM(VARDATA(item), cur);
		d[i] = PointerGetDatum(item);
		cur++;
	}

	a = construct_array(d, nelems, TEXTOID, -1, false, 'i');

	/* the per-element text values are released once the array is built */
	for (i = 0; i < nelems; i++)
	{
		pfree(DatumGetPointer(d[i]));
	}

	pfree(d);
	pfree(trg);
	PG_FREE_IF_COPY(in, 0);

	PG_RETURN_POINTER(a);
}
/*
 * cnt_sml
 *
 * Compute the similarity of two trigram arrays.
 *
 * trg1, trg2 - trigram arrays; both are walked in step like a merge, so
 *				they must be sorted in CMPTRGM order (generate_trgm() sorts
 *				and de-duplicates its result).
 *
 * Returns count / (len1 + len2 - count) when DIVUNION is defined, otherwise
 * count / max(len1, len2), where count is the number of trigrams present
 * in both arrays.  If either array is empty the result is 0.
 */
float4
cnt_sml(TRGM * trg1, TRGM * trg2)
{
	trgm	   *ptr1,
			   *ptr2;
	int			count = 0;
	int			len1,
				len2;

	ptr1 = GETARR(trg1);
	ptr2 = GETARR(trg2);

	len1 = ARRNELEM(trg1);
	len2 = ARRNELEM(trg2);

	/*
	 * Explicit test is needed to avoid a 0/0 division (NaN) when both
	 * arrays are empty.  With only one side empty the formulas below would
	 * give 0 anyway, so this changes nothing for that case.
	 */
	if (len1 <= 0 || len2 <= 0)
		return (float4) 0.0;

	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
	{
		int			res = CMPTRGM(ptr1, ptr2);

		if (res < 0)
			ptr1++;
		else if (res > 0)
			ptr2++;
		else
		{
			/* trigram present on both sides */
			ptr1++;
			ptr2++;
			count++;
		}
	}

#ifdef DIVUNION
	return ((((float4) count) / ((float4) (len1 + len2 - count))));
#else
	return (((float) count) / ((float) ((len1 > len2) ? len1 : len2)));
#endif
}
PG_FUNCTION_INFO_V1(similarity);
Datum		similarity(PG_FUNCTION_ARGS);

/*
 * similarity(text, text) returns float4
 *
 * Extract the trigram set of each argument with generate_trgm() and return
 * the score cnt_sml() computes for the pair.
 */
Datum
similarity(PG_FUNCTION_ARGS)
{
	text	   *in1 = PG_GETARG_TEXT_P(0);
	text	   *in2 = PG_GETARG_TEXT_P(1);
	int			slen1 = VARSIZE(in1) - VARHDRSZ;
	int			slen2 = VARSIZE(in2) - VARHDRSZ;
	TRGM	   *trg1;
	TRGM	   *trg2;
	float4		score;

	trg1 = generate_trgm(VARDATA(in1), slen1);
	trg2 = generate_trgm(VARDATA(in2), slen2);

	score = cnt_sml(trg1, trg2);

	/* release the trigram sets and any detoasted argument copies */
	pfree(trg1);
	pfree(trg2);
	PG_FREE_IF_COPY(in1, 0);
	PG_FREE_IF_COPY(in2, 1);

	PG_RETURN_FLOAT4(score);
}
PG_FUNCTION_INFO_V1(similarity_op);
Datum		similarity_op(PG_FUNCTION_ARGS);

/*
 * similarity_op(text, text) returns bool
 *
 * True when similarity() of the two arguments is at least trgm_limit.
 */
Datum
similarity_op(PG_FUNCTION_ARGS)
{
	Datum		lhs = PG_GETARG_DATUM(0);
	Datum		rhs = PG_GETARG_DATUM(1);
	float4		score;

	/* pass the datums straight through; similarity() does the detoasting */
	score = DatumGetFloat4(DirectFunctionCall2(similarity, lhs, rhs));

	PG_RETURN_BOOL(score >= trgm_limit);
}