#include "trgm.h"
#include <ctype.h>
#include "utils/array.h"
#include "catalog/pg_type.h"

/*
 * Threshold that similarity_op() compares a similarity score against.
 * Changed with set_limit(), read back with show_limit().
 */
float4		trgm_limit = 0.3;

PG_FUNCTION_INFO_V1(set_limit);
Datum		set_limit(PG_FUNCTION_ARGS);

/*
 * set_limit(float4) returns float4
 *
 * Store a new similarity threshold in trgm_limit and return it.
 * Raises an ERROR when the argument is not inside [0, 1].
 */
Datum
set_limit(PG_FUNCTION_ARGS)
{
	float4		nlimit = PG_GETARG_FLOAT4(0);

	/*
	 * Phrased as "not inside the range" rather than "below or above" so
	 * that NaN, for which every ordered comparison is false, is rejected
	 * instead of being stored as the threshold.
	 */
	if (!(nlimit >= 0 && nlimit <= 1.0))
		elog(ERROR, "Wrong limit, should be between 0 and 1");

	trgm_limit = nlimit;
	PG_RETURN_FLOAT4(trgm_limit);
}

PG_FUNCTION_INFO_V1(show_limit);
Datum		show_limit(PG_FUNCTION_ARGS);

/*
 * show_limit() returns float4
 *
 * Return the current value of trgm_limit.
 */
Datum
show_limit(PG_FUNCTION_ARGS)
{
	PG_RETURN_FLOAT4(trgm_limit);
}

/* Scanner states used by generate_trgm() */
#define WORDWAIT	0			/* between words, looking for a word start */
#define INWORD		1			/* currently copying a word into the buffer */

/*
 * qsort() comparator for trigrams; ordering is whatever CMPTRGM defines.
 */
static int
comp_trgm(const void *a, const void *b)
{
	return CMPTRGM(a, b);
}

/*
 * Collapse runs of equal adjacent trigrams in a[0..len-1], in place.
 *
 * Only neighbouring entries are compared, so the caller must have sorted
 * the array first.  Returns the number of entries kept.  The result is 1
 * even for len == 0, so callers must not pass an empty array.
 */
static int
unique_array(trgm * a, int len)
{
	trgm	   *curend,
			   *tmp;

	curend = tmp = a;
	while (tmp - a < len)
	{
		if (CMPTRGM(tmp, curend))
		{
			/* new value: append it behind the last kept entry */
			curend++;
			CPTRGM(curend, tmp);
		}
		tmp++;
	}

	return curend + 1 - a;
}

/*
 * Build the set of trigrams for the first slen bytes of str.
 *
 * The input is split into words: runs of alphanumeric characters when
 * KEEPONLYALNUM is defined, otherwise runs of non-whitespace characters.
 * Each word is copied into a scratch buffer, surrounded by LPADDING
 * leading and RPADDING trailing blanks, lower-cased when IGNORECASE is
 * defined, and every 3-byte window of the padded word is emitted.
 *
 * The returned TRGM is palloc'd, has flag ARRKEY, and holds the trigrams
 * sorted with comp_trgm() and with duplicates removed.  When no trigram
 * can be produced the result contains only the header (len ==
 * TRGMHRDSIZE).  The caller is responsible for pfree'ing it.
 */
TRGM *
generate_trgm(char *str, int slen)
{
	TRGM	   *trg;
	char	   *buf,
			   *sptr,
			   *bufptr;
	trgm	   *tptr;
	int			state = WORDWAIT;
	int			wl,
				len;

	trg = (TRGM *) palloc(TRGMHRDSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
	trg->flag = ARRKEY;
	trg->len = TRGMHRDSIZE;

	/* nothing to do for empty input or input too short for one trigram */
	if (slen + LPADDING + RPADDING < 3 || slen == 0)
		return trg;

	tptr = GETARR(trg);

	/* room for the longest possible word plus the blank padding */
	buf = palloc(sizeof(char) * (slen + 4));
	sptr = str;

	/* the left padding is written once and reused for every word */
	if (LPADDING > 0)
	{
		*buf = ' ';
		if (LPADDING > 1)
			*(buf + 1) = ' ';
	}

	bufptr = buf + LPADDING;
	while (sptr - str < slen)
	{
		if (state == WORDWAIT)
		{
			if (
#ifdef KEEPONLYALNUM
				isalnum((unsigned char) *sptr)
#else
				!isspace((unsigned char) *sptr)
#endif
				)
			{
				*bufptr = *sptr;	/* first character of a new word */
				bufptr++;
				state = INWORD;
				if (sptr - str == slen - 1 /* last char */ )
					goto gettrg;
			}
		}
		else
		{
			if (
#ifdef KEEPONLYALNUM
				!isalnum((unsigned char) *sptr)
#else
				isspace((unsigned char) *sptr)
#endif
				)
			{
		gettrg:

				/*
				 * A complete word is in the buffer.  This point is reached
				 * either on the separator that follows the word or, via
				 * goto, when the word ends exactly at the end of the input.
				 */
				*bufptr = ' ';
				*(bufptr + 1) = ' ';

				/* number of trigrams in the padded word */
				wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
				if (wl <= 0)
				{
					/* word too short to yield a trigram: drop it */
					bufptr = buf + LPADDING;
					state = WORDWAIT;
					sptr++;
					continue;
				}

#ifdef IGNORECASE
				do
				{				/* lower-case the word, padding excluded */
					int			wwl = bufptr - buf;

					bufptr = buf + LPADDING;
					while (bufptr - buf < wwl)
					{
						*bufptr = tolower((unsigned char) *bufptr);
						bufptr++;
					}
				} while (0);
#endif

				/* emit one trigram per starting offset in the padded word */
				bufptr = buf;
				while (bufptr - buf < wl)
				{
					CPTRGM(tptr, bufptr);
					bufptr++;
					tptr++;
				}

				/* rewind the buffer for the next word */
				bufptr = buf + LPADDING;
				state = WORDWAIT;
			}
			else
			{
				*bufptr = *sptr;	/* next character of the current word */
				bufptr++;
				if (sptr - str == slen - 1)
					goto gettrg;
			}
		}
		sptr++;
	}

	pfree(buf);

	len = tptr - GETARR(trg);
	if (len <= 0)
		return trg;				/* no trigrams: header-only result */

	/* sort, then squeeze out duplicates (unique_array needs sorted input) */
	qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
	len = unique_array(GETARR(trg), len);

	trg->len = CALCGTSIZE(ARRKEY, len);

	return trg;
}

PG_FUNCTION_INFO_V1(show_trgm);
Datum		show_trgm(PG_FUNCTION_ARGS);

/*
 * show_trgm(text) returns text[]
 *
 * Return the trigrams of the argument, as produced by generate_trgm(),
 * as an array with one 3-byte text element per trigram.
 */
Datum
show_trgm(PG_FUNCTION_ARGS)
{
	text	   *in = PG_GETARG_TEXT_P(0);
	TRGM	   *trg;
	Datum	   *d;
	ArrayType  *a;
	trgm	   *ptr;

	trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);

	/* one extra slot so the allocation request is never zero bytes */
	d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));

	/* wrap every trigram in its own text value */
	ptr = GETARR(trg);
	while (ptr - GETARR(trg) < ARRNELEM(trg))
	{
		text	   *item = (text *) palloc(VARHDRSZ + 3);

		VARATT_SIZEP(item) = VARHDRSZ + 3;
		CPTRGM(VARDATA(item), ptr);
		d[ptr - GETARR(trg)] = PointerGetDatum(item);
		ptr++;
	}

	a = construct_array(
						d,
						ARRNELEM(trg),
						TEXTOID,
						-1,
						false,
						'i'
		);

	/* the temporary text items are released once the array is built */
	ptr = GETARR(trg);
	while (ptr - GETARR(trg) < ARRNELEM(trg))
	{
		pfree(DatumGetPointer(d[ptr - GETARR(trg)]));
		ptr++;
	}

	pfree(d);
	pfree(trg);
	PG_FREE_IF_COPY(in, 0);

	PG_RETURN_POINTER(a);
}

/*
 * Compute the similarity of two trigram sets.
 *
 * Both arrays are walked in step, advancing whichever side holds the
 * smaller trigram, so both must be sorted in CMPTRGM order (as the output
 * of generate_trgm() is).  With DIVUNION defined the result is
 * common / (len1 + len2 - common); otherwise it is
 * common / max(len1, len2).
 *
 * Returns 0 when either set is empty.
 */
float4
cnt_sml(TRGM * trg1, TRGM * trg2)
{
	trgm	   *ptr1,
			   *ptr2;
	int			count = 0;
	int			len1,
				len2;

	ptr1 = GETARR(trg1);
	ptr2 = GETARR(trg2);

	len1 = ARRNELEM(trg1);
	len2 = ARRNELEM(trg2);

	/*
	 * Explicit test: with both sets empty the formulas below would compute
	 * 0/0 and return NaN.  With only one set empty they already yield 0,
	 * so this changes nothing for that case.
	 */
	if (len1 <= 0 || len2 <= 0)
		return (float4) 0.0;

	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
	{
		int			res = CMPTRGM(ptr1, ptr2);

		if (res < 0)
			ptr1++;
		else if (res > 0)
			ptr2++;
		else
		{
			/* trigram present in both sets */
			ptr1++;
			ptr2++;
			count++;
		}
	}

#ifdef DIVUNION
	return ((((float4) count) / ((float4) (len1 + len2 - count))));
#else
	return (((float) count) / ((float) ((len1 > len2) ? len1 : len2)));
#endif
}

PG_FUNCTION_INFO_V1(similarity);
Datum		similarity(PG_FUNCTION_ARGS);

/*
 * similarity(text, text) returns float4
 *
 * Generate the trigram sets of both arguments and return their
 * similarity as computed by cnt_sml().
 */
Datum
similarity(PG_FUNCTION_ARGS)
{
	text	   *in1 = PG_GETARG_TEXT_P(0);
	text	   *in2 = PG_GETARG_TEXT_P(1);
	TRGM	   *trg1,
			   *trg2;
	float4		res;

	trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
	trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);

	res = cnt_sml(trg1, trg2);

	pfree(trg1);
	pfree(trg2);
	PG_FREE_IF_COPY(in1, 0);
	PG_FREE_IF_COPY(in2, 1);

	PG_RETURN_FLOAT4(res);
}

PG_FUNCTION_INFO_V1(similarity_op);
Datum		similarity_op(PG_FUNCTION_ARGS);

/*
 * similarity_op(text, text) returns bool
 *
 * True when similarity() of the two arguments is at least trgm_limit.
 */
Datum
similarity_op(PG_FUNCTION_ARGS)
{
	float4		res = DatumGetFloat4(DirectFunctionCall2(
														 similarity,
														 PG_GETARG_DATUM(0),
														 PG_GETARG_DATUM(1)
														 ));

	PG_RETURN_BOOL(res >= trgm_limit);
}