mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-10-11 03:26:59 +02:00
8ff2bccee3
suffix, to distinguish them from doubles. Make some function declarations and definitions use the "const" qualifier for arguments consistently. Ignore warning 4102 ("unreferenced label"), because such warnings are always emitted by bison-generated code. Patch from Magnus Hagander.
311 lines
5.2 KiB
C
311 lines
5.2 KiB
C
#include "trgm.h"
|
|
#include <ctype.h>
|
|
#include "utils/array.h"
|
|
#include "catalog/pg_type.h"
|
|
|
|
/* NOTE(review): presumably the PostgreSQL loadable-module marker; the macro is defined outside this file -- confirm. */
PG_MODULE_MAGIC;
|
|
|
|
/*
 * Current similarity threshold.  Starts at 0.3, is overwritten by
 * set_limit(), reported by show_limit(), and is the value similarity_op()
 * compares a similarity score against.
 */
float4 trgm_limit = 0.3f;
|
|
|
|
PG_FUNCTION_INFO_V1(set_limit);
|
|
Datum set_limit(PG_FUNCTION_ARGS);
|
|
Datum
|
|
set_limit(PG_FUNCTION_ARGS)
|
|
{
|
|
float4 nlimit = PG_GETARG_FLOAT4(0);
|
|
|
|
if (nlimit < 0 || nlimit > 1.0)
|
|
elog(ERROR, "wrong limit, should be between 0 and 1");
|
|
trgm_limit = nlimit;
|
|
PG_RETURN_FLOAT4(trgm_limit);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(show_limit);
|
|
Datum show_limit(PG_FUNCTION_ARGS);
|
|
Datum
|
|
show_limit(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_FLOAT4(trgm_limit);
|
|
}
|
|
|
|
/* generate_trgm() scanner state: not inside a word, waiting for one to start */
#define WORDWAIT 0
|
|
/* generate_trgm() scanner state: currently collecting the characters of a word */
#define INWORD 1
|
|
|
|
/*
 * comp_trgm
 *		qsort() comparison callback: orders two trigrams by delegating to
 *		the CMPTRGM macro and returning its result unchanged.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			order = CMPTRGM(a, b);

	return order;
}
|
|
|
|
static int
|
|
unique_array(trgm * a, int len)
|
|
{
|
|
trgm *curend,
|
|
*tmp;
|
|
|
|
curend = tmp = a;
|
|
while (tmp - a < len)
|
|
if (CMPTRGM(tmp, curend))
|
|
{
|
|
curend++;
|
|
CPTRGM(curend, tmp);
|
|
tmp++;
|
|
}
|
|
else
|
|
tmp++;
|
|
|
|
return curend + 1 - a;
|
|
}
|
|
|
|
|
|
TRGM *
|
|
generate_trgm(char *str, int slen)
|
|
{
|
|
TRGM *trg;
|
|
char *buf,
|
|
*sptr,
|
|
*bufptr;
|
|
trgm *tptr;
|
|
int state = WORDWAIT;
|
|
int wl,
|
|
len;
|
|
|
|
trg = (TRGM *) palloc(TRGMHRDSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
|
|
trg->flag = ARRKEY;
|
|
trg->len = TRGMHRDSIZE;
|
|
|
|
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
|
return trg;
|
|
|
|
tptr = GETARR(trg);
|
|
|
|
buf = palloc(sizeof(char) * (slen + 4));
|
|
sptr = str;
|
|
|
|
if (LPADDING > 0)
|
|
{
|
|
*buf = ' ';
|
|
if (LPADDING > 1)
|
|
*(buf + 1) = ' ';
|
|
}
|
|
|
|
bufptr = buf + LPADDING;
|
|
while (sptr - str < slen)
|
|
{
|
|
if (state == WORDWAIT)
|
|
{
|
|
if (
|
|
#ifdef KEEPONLYALNUM
|
|
isalnum((unsigned char) *sptr)
|
|
#else
|
|
!isspace((unsigned char) *sptr)
|
|
#endif
|
|
)
|
|
{
|
|
*bufptr = *sptr; /* start put word in buffer */
|
|
bufptr++;
|
|
state = INWORD;
|
|
if (sptr - str == slen - 1 /* last char */ )
|
|
goto gettrg;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (
|
|
#ifdef KEEPONLYALNUM
|
|
!isalnum((unsigned char) *sptr)
|
|
#else
|
|
isspace((unsigned char) *sptr)
|
|
#endif
|
|
)
|
|
{
|
|
gettrg:
|
|
/* word in buffer, so count trigrams */
|
|
*bufptr = ' ';
|
|
*(bufptr + 1) = ' ';
|
|
wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
|
|
if (wl <= 0)
|
|
{
|
|
bufptr = buf + LPADDING;
|
|
state = WORDWAIT;
|
|
sptr++;
|
|
continue;
|
|
}
|
|
|
|
#ifdef IGNORECASE
|
|
do
|
|
{ /* lower word */
|
|
int wwl = bufptr - buf;
|
|
|
|
bufptr = buf + LPADDING;
|
|
while (bufptr - buf < wwl)
|
|
{
|
|
*bufptr = tolower((unsigned char) *bufptr);
|
|
bufptr++;
|
|
}
|
|
} while (0);
|
|
#endif
|
|
bufptr = buf;
|
|
/* set trigrams */
|
|
while (bufptr - buf < wl)
|
|
{
|
|
CPTRGM(tptr, bufptr);
|
|
bufptr++;
|
|
tptr++;
|
|
}
|
|
bufptr = buf + LPADDING;
|
|
state = WORDWAIT;
|
|
}
|
|
else
|
|
{
|
|
*bufptr = *sptr; /* put in buffer */
|
|
bufptr++;
|
|
if (sptr - str == slen - 1)
|
|
goto gettrg;
|
|
}
|
|
}
|
|
sptr++;
|
|
}
|
|
|
|
pfree(buf);
|
|
|
|
if ((len = tptr - GETARR(trg)) == 0)
|
|
return trg;
|
|
|
|
if (len > 0)
|
|
{
|
|
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
len = unique_array(GETARR(trg), len);
|
|
}
|
|
|
|
trg->len = CALCGTSIZE(ARRKEY, len);
|
|
|
|
return trg;
|
|
}
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(show_trgm);
|
|
Datum show_trgm(PG_FUNCTION_ARGS);
|
|
Datum
|
|
show_trgm(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in = PG_GETARG_TEXT_P(0);
|
|
TRGM *trg;
|
|
Datum *d;
|
|
ArrayType *a;
|
|
trgm *ptr;
|
|
|
|
trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
|
d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
|
|
|
|
ptr = GETARR(trg);
|
|
while (ptr - GETARR(trg) < ARRNELEM(trg))
|
|
{
|
|
text *item = (text *) palloc(VARHDRSZ + 3);
|
|
|
|
VARATT_SIZEP(item) = VARHDRSZ + 3;
|
|
CPTRGM(VARDATA(item), ptr);
|
|
d[ptr - GETARR(trg)] = PointerGetDatum(item);
|
|
ptr++;
|
|
}
|
|
|
|
a = construct_array(
|
|
d,
|
|
ARRNELEM(trg),
|
|
TEXTOID,
|
|
-1,
|
|
false,
|
|
'i'
|
|
);
|
|
|
|
ptr = GETARR(trg);
|
|
while (ptr - GETARR(trg) < ARRNELEM(trg))
|
|
{
|
|
pfree(DatumGetPointer(d[ptr - GETARR(trg)]));
|
|
ptr++;
|
|
}
|
|
|
|
pfree(d);
|
|
pfree(trg);
|
|
PG_FREE_IF_COPY(in, 0);
|
|
|
|
PG_RETURN_POINTER(a);
|
|
}
|
|
|
|
float4
|
|
cnt_sml(TRGM * trg1, TRGM * trg2)
|
|
{
|
|
trgm *ptr1,
|
|
*ptr2;
|
|
int count = 0;
|
|
int len1,
|
|
len2;
|
|
|
|
ptr1 = GETARR(trg1);
|
|
ptr2 = GETARR(trg2);
|
|
|
|
len1 = ARRNELEM(trg1);
|
|
len2 = ARRNELEM(trg2);
|
|
|
|
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
|
{
|
|
int res = CMPTRGM(ptr1, ptr2);
|
|
|
|
if (res < 0)
|
|
ptr1++;
|
|
else if (res > 0)
|
|
ptr2++;
|
|
else
|
|
{
|
|
ptr1++;
|
|
ptr2++;
|
|
count++;
|
|
}
|
|
}
|
|
|
|
#ifdef DIVUNION
|
|
return ((((float4) count) / ((float4) (len1 + len2 - count))));
|
|
#else
|
|
return (((float) count) / ((float) ((len1 > len2) ? len1 : len2)));
|
|
#endif
|
|
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(similarity);
|
|
Datum similarity(PG_FUNCTION_ARGS);
|
|
Datum
|
|
similarity(PG_FUNCTION_ARGS)
|
|
{
|
|
text *in1 = PG_GETARG_TEXT_P(0);
|
|
text *in2 = PG_GETARG_TEXT_P(1);
|
|
TRGM *trg1,
|
|
*trg2;
|
|
float4 res;
|
|
|
|
trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
|
|
trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
|
|
|
|
res = cnt_sml(trg1, trg2);
|
|
|
|
pfree(trg1);
|
|
pfree(trg2);
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
|
|
PG_RETURN_FLOAT4(res);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(similarity_op);
|
|
Datum similarity_op(PG_FUNCTION_ARGS);
|
|
Datum
|
|
similarity_op(PG_FUNCTION_ARGS)
|
|
{
|
|
float4 res = DatumGetFloat4(DirectFunctionCall2(
|
|
similarity,
|
|
PG_GETARG_DATUM(0),
|
|
PG_GETARG_DATUM(1)
|
|
));
|
|
|
|
PG_RETURN_BOOL(res >= trgm_limit);
|
|
}
|