2008-05-17 03:28:26 +02:00
|
|
|
/*
|
2010-09-20 22:08:53 +02:00
|
|
|
* contrib/pg_trgm/trgm_op.c
|
2008-05-17 03:28:26 +02:00
|
|
|
*/
|
2010-12-04 06:16:21 +01:00
|
|
|
#include "postgres.h"
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
#include <ctype.h>
|
2010-12-04 06:16:21 +01:00
|
|
|
|
|
|
|
#include "trgm.h"
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
#include "catalog/pg_type.h"
|
2008-11-12 14:43:54 +01:00
|
|
|
#include "tsearch/ts_locale.h"
|
2014-01-13 19:07:10 +01:00
|
|
|
#include "utils/memutils.h"
|
2010-12-04 06:16:21 +01:00
|
|
|
|
2006-05-31 00:12:16 +02:00
|
|
|
PG_MODULE_MAGIC;
|
|
|
|
|
2007-01-26 18:45:42 +01:00
|
|
|
float4 trgm_limit = 0.3f;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(set_limit);
|
2011-02-01 03:33:55 +01:00
|
|
|
PG_FUNCTION_INFO_V1(show_limit);
|
|
|
|
PG_FUNCTION_INFO_V1(show_trgm);
|
|
|
|
PG_FUNCTION_INFO_V1(similarity);
|
|
|
|
PG_FUNCTION_INFO_V1(similarity_dist);
|
|
|
|
PG_FUNCTION_INFO_V1(similarity_op);
|
|
|
|
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
Datum
|
2004-08-29 07:07:03 +02:00
|
|
|
set_limit(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
float4 nlimit = PG_GETARG_FLOAT4(0);
|
|
|
|
|
|
|
|
if (nlimit < 0 || nlimit > 1.0)
|
2006-03-01 07:30:32 +01:00
|
|
|
elog(ERROR, "wrong limit, should be between 0 and 1");
|
2004-05-31 19:18:12 +02:00
|
|
|
trgm_limit = nlimit;
|
|
|
|
PG_RETURN_FLOAT4(trgm_limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
2004-08-29 07:07:03 +02:00
|
|
|
show_limit(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2004-05-31 19:18:12 +02:00
|
|
|
PG_RETURN_FLOAT4(trgm_limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * qsort() comparator: orders two trgm values as CMPTRGM does.
 */
static int
comp_trgm(const void *a, const void *b)
{
    int         order = CMPTRGM(a, b);

    return order;
}
|
|
|
|
|
|
|
|
static int
|
2009-06-11 16:49:15 +02:00
|
|
|
unique_array(trgm *a, int len)
|
2004-08-29 07:07:03 +02:00
|
|
|
{
|
|
|
|
trgm *curend,
|
|
|
|
*tmp;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
curend = tmp = a;
|
|
|
|
while (tmp - a < len)
|
2004-08-29 07:07:03 +02:00
|
|
|
if (CMPTRGM(tmp, curend))
|
|
|
|
{
|
2004-05-31 19:18:12 +02:00
|
|
|
curend++;
|
2004-08-29 07:07:03 +02:00
|
|
|
CPTRGM(curend, tmp);
|
2004-05-31 19:18:12 +02:00
|
|
|
tmp++;
|
2004-08-29 07:07:03 +02:00
|
|
|
}
|
|
|
|
else
|
2004-05-31 19:18:12 +02:00
|
|
|
tmp++;
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
return curend + 1 - a;
|
|
|
|
}
|
|
|
|
|
2008-11-12 14:43:54 +01:00
|
|
|
/*
|
|
|
|
* Finds first word in string, returns pointer to the word,
|
|
|
|
* endword points to the character after word
|
|
|
|
*/
|
2009-06-11 16:49:15 +02:00
|
|
|
static char *
|
|
|
|
find_word(char *str, int lenstr, char **endword, int *charlen)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
char *beginword = str;
|
2008-11-12 14:43:54 +01:00
|
|
|
|
2013-04-09 07:05:55 +02:00
|
|
|
while (beginword - str < lenstr && !ISWORDCHR(beginword))
|
2008-11-12 14:43:54 +01:00
|
|
|
beginword += pg_mblen(beginword);
|
|
|
|
|
|
|
|
if (beginword - str >= lenstr)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
*endword = beginword;
|
|
|
|
*charlen = 0;
|
2013-04-09 07:05:55 +02:00
|
|
|
while (*endword - str < lenstr && ISWORDCHR(*endword))
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
|
|
|
*endword += pg_mblen(*endword);
|
|
|
|
(*charlen)++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return beginword;
|
|
|
|
}
|
|
|
|
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
/*
|
|
|
|
* Reduce a trigram (three possibly multi-byte characters) to a trgm,
|
|
|
|
* which is always exactly three bytes. If we have three single-byte
|
|
|
|
* characters, we just use them as-is; otherwise we form a hash value.
|
|
|
|
*/
|
2013-04-09 07:05:55 +02:00
|
|
|
void
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
compact_trigram(trgm *tptr, char *str, int bytelen)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
if (bytelen == 3)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
CPTRGM(tptr, str);
|
2008-11-12 14:43:54 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
pg_crc32 crc;
|
|
|
|
|
Switch to CRC-32C in WAL and other places.
The old algorithm was found to not be the usual CRC-32 algorithm, used by
Ethernet et al. We were using a non-reflected lookup table with code meant
for a reflected lookup table. That's a strange combination that AFAICS does
not correspond to any bit-wise CRC calculation, which makes it difficult to
reason about its properties. Although it has worked well in practice, seems
safer to use a well-known algorithm.
Since we're changing the algorithm anyway, we might as well choose a
different polynomial. The Castagnoli polynomial has better error-correcting
properties than the traditional CRC-32 polynomial, even if we had
implemented it correctly. Another reason for picking that is that some new
CPUs have hardware support for calculating CRC-32C, but not CRC-32, let
alone our strange variant of it. This patch doesn't add any support for such
hardware, but a future patch could now do that.
The old algorithm is kept around for tsquery and pg_trgm, which use the
values in indexes that need to remain compatible so that pg_upgrade works.
While we're at it, share the old lookup table for CRC-32 calculation
between hstore, ltree and core. They all use the same table, so might as
well.
2014-11-04 10:35:15 +01:00
|
|
|
INIT_LEGACY_CRC32(crc);
|
|
|
|
COMP_LEGACY_CRC32(crc, str, bytelen);
|
|
|
|
FIN_LEGACY_CRC32(crc);
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* use only 3 upper bytes from crc, hope, it's good enough hashing
|
2008-11-12 14:43:54 +01:00
|
|
|
*/
|
|
|
|
CPTRGM(tptr, &crc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-02-01 03:33:55 +01:00
|
|
|
* Adds trigrams from words (already padded).
|
2008-11-12 14:43:54 +01:00
|
|
|
*/
|
2009-06-11 16:49:15 +02:00
|
|
|
static trgm *
|
|
|
|
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
char *ptr = str;
|
2008-11-12 14:43:54 +01:00
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
if (charlen < 3)
|
2008-11-12 14:43:54 +01:00
|
|
|
return tptr;
|
|
|
|
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
if (bytelen > charlen)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
/* Find multibyte character boundaries and apply compact_trigram */
|
2009-06-11 16:49:15 +02:00
|
|
|
int lenfirst = pg_mblen(str),
|
|
|
|
lenmiddle = pg_mblen(str + lenfirst),
|
|
|
|
lenlast = pg_mblen(str + lenfirst + lenmiddle);
|
2008-11-12 14:43:54 +01:00
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
ptr += lenfirst;
|
|
|
|
tptr++;
|
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
lenfirst = lenmiddle;
|
|
|
|
lenmiddle = lenlast;
|
|
|
|
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
|
2008-11-12 14:43:54 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
Get rid of USE_WIDE_UPPER_LOWER dependency in trigram construction.
contrib/pg_trgm's make_trigrams() was coded to ignore multibyte character
boundaries and just make trigrams from bytes if USE_WIDE_UPPER_LOWER wasn't
defined. This is a bit odd, since there's no obvious reason why trigram
compaction rules should depend on the presence of towlower() and friends.
What's more, there was an Assert() that would fail if that code path was
fed any multibyte characters.
We need to do something about this since the pending regex-indexing patch
has an assumption that you get just one "trgm" from any three characters.
The best solution seems to be to remove the USE_WIDE_UPPER_LOWER
dependency, which shouldn't really have been there in the first place.
The second loop in make_trigrams() is now just a fast path and not a
potentially incompatible algorithm.
If there is anybody still using Postgres on machines without wcstombs() or
towlower(), and they have non-ASCII data indexed by pg_trgm, they'll need
to REINDEX those indexes after pg_upgrade to 9.3, else searches may fail
incorrectly. It seems likely that there are no such installations, though.
In passing, rename cnt_trigram to compact_trigram, which seems to better
describe its functionality, and improve make_trigrams' test for whether it
has to use the slow path or not (per a suggestion from Alexander Korotkov).
2013-04-07 20:45:33 +02:00
|
|
|
/* Fast path when there are no multibyte characters */
|
2009-06-11 16:49:15 +02:00
|
|
|
Assert(bytelen == charlen);
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
|
|
|
|
{
|
|
|
|
CPTRGM(tptr, ptr);
|
|
|
|
ptr++;
|
|
|
|
tptr++;
|
|
|
|
}
|
|
|
|
}
|
2009-06-11 16:49:15 +02:00
|
|
|
|
2008-11-12 14:43:54 +01:00
|
|
|
return tptr;
|
|
|
|
}
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
TRGM *
|
|
|
|
generate_trgm(char *str, int slen)
|
|
|
|
{
|
|
|
|
TRGM *trg;
|
2008-11-12 14:43:54 +01:00
|
|
|
char *buf;
|
2004-08-29 07:07:03 +02:00
|
|
|
trgm *tptr;
|
2008-11-12 14:43:54 +01:00
|
|
|
int len,
|
|
|
|
charlen,
|
|
|
|
bytelen;
|
2009-06-11 16:49:15 +02:00
|
|
|
char *bword,
|
|
|
|
*eword;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2014-01-13 19:07:10 +01:00
|
|
|
/*
|
|
|
|
* Guard against possible overflow in the palloc requests below. (We
|
|
|
|
* don't worry about the additive constants, since palloc can detect
|
|
|
|
* requests that are a little above MaxAllocSize --- we just need to
|
|
|
|
* prevent integer overflow in the multiplications.)
|
|
|
|
*/
|
|
|
|
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
|
|
|
|
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
errmsg("out of memory")));
|
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
|
2004-05-31 19:18:12 +02:00
|
|
|
trg->flag = ARRKEY;
|
2007-02-28 23:44:38 +01:00
|
|
|
SET_VARSIZE(trg, TRGMHDRSIZE);
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
2004-05-31 19:18:12 +02:00
|
|
|
return trg;
|
|
|
|
|
|
|
|
tptr = GETARR(trg);
|
|
|
|
|
2014-01-13 19:07:10 +01:00
|
|
|
/* Allocate a buffer for case-folded, blank-padded words */
|
|
|
|
buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
if (LPADDING > 0)
|
|
|
|
{
|
2004-05-31 19:18:12 +02:00
|
|
|
*buf = ' ';
|
2004-08-29 07:07:03 +02:00
|
|
|
if (LPADDING > 1)
|
|
|
|
*(buf + 1) = ' ';
|
2004-05-31 19:18:12 +02:00
|
|
|
}
|
|
|
|
|
2008-11-12 14:43:54 +01:00
|
|
|
eword = str;
|
2009-06-11 16:49:15 +02:00
|
|
|
while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
|
2004-08-29 07:07:03 +02:00
|
|
|
{
|
2008-11-12 14:43:54 +01:00
|
|
|
#ifdef IGNORECASE
|
|
|
|
bword = lowerstr_with_len(bword, eword - bword);
|
|
|
|
bytelen = strlen(bword);
|
2004-05-31 19:18:12 +02:00
|
|
|
#else
|
2008-11-12 14:43:54 +01:00
|
|
|
bytelen = eword - bword;
|
2004-05-31 19:18:12 +02:00
|
|
|
#endif
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
memcpy(buf + LPADDING, bword, bytelen);
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
#ifdef IGNORECASE
|
2008-11-12 14:43:54 +01:00
|
|
|
pfree(bword);
|
2004-05-31 19:18:12 +02:00
|
|
|
#endif
|
2014-01-13 19:07:10 +01:00
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
buf[LPADDING + bytelen] = ' ';
|
|
|
|
buf[LPADDING + bytelen + 1] = ' ';
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* count trigrams
|
|
|
|
*/
|
2009-06-11 16:49:15 +02:00
|
|
|
tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
|
|
|
|
charlen + LPADDING + RPADDING);
|
2004-05-31 19:18:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pfree(buf);
|
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
if ((len = tptr - GETARR(trg)) == 0)
|
2004-05-31 19:18:12 +02:00
|
|
|
return trg;
|
|
|
|
|
2014-01-13 19:07:10 +01:00
|
|
|
/*
|
|
|
|
* Make trigrams unique.
|
|
|
|
*/
|
|
|
|
if (len > 1)
|
2004-08-29 07:07:03 +02:00
|
|
|
{
|
|
|
|
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
|
|
len = unique_array(GETARR(trg), len);
|
|
|
|
}
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2007-02-28 23:44:38 +01:00
|
|
|
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
return trg;
|
|
|
|
}
|
|
|
|
|
2011-02-01 03:33:55 +01:00
|
|
|
/*
|
|
|
|
* Extract the next non-wildcard part of a search string, ie, a word bounded
|
|
|
|
* by '_' or '%' meta-characters, non-word characters or string end.
|
|
|
|
*
|
|
|
|
* str: source string, of length lenstr bytes (need not be null-terminated)
|
|
|
|
* buf: where to return the substring (must be long enough)
|
|
|
|
* *bytelen: receives byte length of the found substring
|
|
|
|
* *charlen: receives character length of the found substring
|
|
|
|
*
|
|
|
|
* Returns pointer to end+1 of the found substring in the source string.
|
|
|
|
* Returns NULL if no word found (in which case buf, bytelen, charlen not set)
|
|
|
|
*
|
|
|
|
* If the found word is bounded by non-word characters or string boundaries
|
|
|
|
* then this function will include corresponding padding spaces into buf.
|
|
|
|
*/
|
|
|
|
static const char *
|
|
|
|
get_wildcard_part(const char *str, int lenstr,
|
|
|
|
char *buf, int *bytelen, int *charlen)
|
|
|
|
{
|
|
|
|
const char *beginword = str;
|
|
|
|
const char *endword;
|
|
|
|
char *s = buf;
|
2012-08-20 19:24:52 +02:00
|
|
|
bool in_leading_wildcard_meta = false;
|
|
|
|
bool in_trailing_wildcard_meta = false;
|
2011-04-10 17:42:00 +02:00
|
|
|
bool in_escape = false;
|
|
|
|
int clen;
|
2011-02-01 03:33:55 +01:00
|
|
|
|
|
|
|
/*
|
2012-08-20 19:24:52 +02:00
|
|
|
* Find the first word character, remembering whether preceding character
|
|
|
|
* was wildcard meta-character. Note that the in_escape state persists
|
|
|
|
* from this loop to the next one, since we may exit at a word character
|
|
|
|
* that is in_escape.
|
2011-02-01 03:33:55 +01:00
|
|
|
*/
|
|
|
|
while (beginword - str < lenstr)
|
|
|
|
{
|
|
|
|
if (in_escape)
|
|
|
|
{
|
2013-04-09 07:05:55 +02:00
|
|
|
if (ISWORDCHR(beginword))
|
2011-02-01 03:33:55 +01:00
|
|
|
break;
|
2012-08-20 19:24:52 +02:00
|
|
|
in_escape = false;
|
|
|
|
in_leading_wildcard_meta = false;
|
2011-02-01 03:33:55 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (ISESCAPECHAR(beginword))
|
|
|
|
in_escape = true;
|
|
|
|
else if (ISWILDCARDCHAR(beginword))
|
2012-08-20 19:24:52 +02:00
|
|
|
in_leading_wildcard_meta = true;
|
2013-04-09 07:05:55 +02:00
|
|
|
else if (ISWORDCHR(beginword))
|
2011-02-01 03:33:55 +01:00
|
|
|
break;
|
|
|
|
else
|
2012-08-20 19:24:52 +02:00
|
|
|
in_leading_wildcard_meta = false;
|
2011-02-01 03:33:55 +01:00
|
|
|
}
|
|
|
|
beginword += pg_mblen(beginword);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Handle string end.
|
|
|
|
*/
|
|
|
|
if (beginword - str >= lenstr)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
2012-08-20 19:24:52 +02:00
|
|
|
* Add left padding spaces if preceding character wasn't wildcard
|
2011-02-01 03:33:55 +01:00
|
|
|
* meta-character.
|
|
|
|
*/
|
|
|
|
*charlen = 0;
|
2012-08-20 19:24:52 +02:00
|
|
|
if (!in_leading_wildcard_meta)
|
2011-02-01 03:33:55 +01:00
|
|
|
{
|
|
|
|
if (LPADDING > 0)
|
|
|
|
{
|
|
|
|
*s++ = ' ';
|
|
|
|
(*charlen)++;
|
|
|
|
if (LPADDING > 1)
|
|
|
|
{
|
|
|
|
*s++ = ' ';
|
|
|
|
(*charlen)++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy data into buf until wildcard meta-character, non-word character or
|
|
|
|
* string boundary. Strip escapes during copy.
|
|
|
|
*/
|
|
|
|
endword = beginword;
|
|
|
|
while (endword - str < lenstr)
|
|
|
|
{
|
|
|
|
clen = pg_mblen(endword);
|
|
|
|
if (in_escape)
|
|
|
|
{
|
2013-04-09 07:05:55 +02:00
|
|
|
if (ISWORDCHR(endword))
|
2011-02-01 03:33:55 +01:00
|
|
|
{
|
|
|
|
memcpy(s, endword, clen);
|
|
|
|
(*charlen)++;
|
|
|
|
s += clen;
|
|
|
|
}
|
|
|
|
else
|
2012-08-20 19:24:52 +02:00
|
|
|
{
|
|
|
|
/*
|
2013-05-29 22:58:43 +02:00
|
|
|
* Back up endword to the escape character when stopping at an
|
|
|
|
* escaped char, so that subsequent get_wildcard_part will
|
2012-08-20 19:24:52 +02:00
|
|
|
* restart from the escape character. We assume here that
|
|
|
|
* escape chars are single-byte.
|
|
|
|
*/
|
|
|
|
endword--;
|
2011-02-01 03:33:55 +01:00
|
|
|
break;
|
2012-08-20 19:24:52 +02:00
|
|
|
}
|
|
|
|
in_escape = false;
|
2011-02-01 03:33:55 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (ISESCAPECHAR(endword))
|
|
|
|
in_escape = true;
|
|
|
|
else if (ISWILDCARDCHAR(endword))
|
|
|
|
{
|
2012-08-20 19:24:52 +02:00
|
|
|
in_trailing_wildcard_meta = true;
|
2011-02-01 03:33:55 +01:00
|
|
|
break;
|
|
|
|
}
|
2013-04-09 07:05:55 +02:00
|
|
|
else if (ISWORDCHR(endword))
|
2011-02-01 03:33:55 +01:00
|
|
|
{
|
|
|
|
memcpy(s, endword, clen);
|
|
|
|
(*charlen)++;
|
|
|
|
s += clen;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
endword += clen;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2012-08-20 19:24:52 +02:00
|
|
|
* Add right padding spaces if next character isn't wildcard
|
2011-02-01 03:33:55 +01:00
|
|
|
* meta-character.
|
|
|
|
*/
|
2012-08-20 19:24:52 +02:00
|
|
|
if (!in_trailing_wildcard_meta)
|
2011-02-01 03:33:55 +01:00
|
|
|
{
|
|
|
|
if (RPADDING > 0)
|
|
|
|
{
|
|
|
|
*s++ = ' ';
|
|
|
|
(*charlen)++;
|
|
|
|
if (RPADDING > 1)
|
|
|
|
{
|
|
|
|
*s++ = ' ';
|
|
|
|
(*charlen)++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*bytelen = s - buf;
|
|
|
|
return endword;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generates trigrams for wildcard search string.
|
|
|
|
*
|
|
|
|
* Returns array of trigrams that must occur in any string that matches the
|
|
|
|
* wildcard string. For example, given pattern "a%bcd%" the trigrams
|
|
|
|
* " a", "bcd" would be extracted.
|
|
|
|
*/
|
|
|
|
TRGM *
|
|
|
|
generate_wildcard_trgm(const char *str, int slen)
|
|
|
|
{
|
|
|
|
TRGM *trg;
|
|
|
|
char *buf,
|
2011-04-10 17:42:00 +02:00
|
|
|
*buf2;
|
2011-02-01 03:33:55 +01:00
|
|
|
trgm *tptr;
|
|
|
|
int len,
|
|
|
|
charlen,
|
|
|
|
bytelen;
|
|
|
|
const char *eword;
|
|
|
|
|
2014-01-13 19:07:10 +01:00
|
|
|
/*
|
|
|
|
* Guard against possible overflow in the palloc requests below. (We
|
|
|
|
* don't worry about the additive constants, since palloc can detect
|
|
|
|
* requests that are a little above MaxAllocSize --- we just need to
|
|
|
|
* prevent integer overflow in the multiplications.)
|
|
|
|
*/
|
|
|
|
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
|
|
|
|
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
|
|
errmsg("out of memory")));
|
|
|
|
|
2011-04-10 17:42:00 +02:00
|
|
|
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
|
2011-02-01 03:33:55 +01:00
|
|
|
trg->flag = ARRKEY;
|
|
|
|
SET_VARSIZE(trg, TRGMHDRSIZE);
|
|
|
|
|
|
|
|
if (slen + LPADDING + RPADDING < 3 || slen == 0)
|
|
|
|
return trg;
|
|
|
|
|
|
|
|
tptr = GETARR(trg);
|
|
|
|
|
2014-01-13 19:07:10 +01:00
|
|
|
/* Allocate a buffer for blank-padded, but not yet case-folded, words */
|
2011-02-01 03:33:55 +01:00
|
|
|
buf = palloc(sizeof(char) * (slen + 4));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract trigrams from each substring extracted by get_wildcard_part.
|
|
|
|
*/
|
|
|
|
eword = str;
|
|
|
|
while ((eword = get_wildcard_part(eword, slen - (eword - str),
|
|
|
|
buf, &bytelen, &charlen)) != NULL)
|
|
|
|
{
|
|
|
|
#ifdef IGNORECASE
|
|
|
|
buf2 = lowerstr_with_len(buf, bytelen);
|
|
|
|
bytelen = strlen(buf2);
|
|
|
|
#else
|
|
|
|
buf2 = buf;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* count trigrams
|
|
|
|
*/
|
|
|
|
tptr = make_trigrams(tptr, buf2, bytelen, charlen);
|
2014-01-13 19:07:10 +01:00
|
|
|
|
2011-02-01 03:33:55 +01:00
|
|
|
#ifdef IGNORECASE
|
|
|
|
pfree(buf2);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
pfree(buf);
|
|
|
|
|
|
|
|
if ((len = tptr - GETARR(trg)) == 0)
|
|
|
|
return trg;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make trigrams unique.
|
|
|
|
*/
|
2014-01-13 19:07:10 +01:00
|
|
|
if (len > 1)
|
2011-02-01 03:33:55 +01:00
|
|
|
{
|
|
|
|
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
|
|
|
|
len = unique_array(GETARR(trg), len);
|
|
|
|
}
|
|
|
|
|
|
|
|
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
|
|
|
|
|
|
|
|
return trg;
|
|
|
|
}
|
|
|
|
|
2008-11-12 14:43:54 +01:00
|
|
|
uint32
|
|
|
|
trgm2int(trgm *ptr)
|
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
uint32 val = 0;
|
2008-11-12 14:43:54 +01:00
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
val |= *(((unsigned char *) ptr));
|
2008-11-12 14:43:54 +01:00
|
|
|
val <<= 8;
|
2009-06-11 16:49:15 +02:00
|
|
|
val |= *(((unsigned char *) ptr) + 1);
|
2008-11-12 14:43:54 +01:00
|
|
|
val <<= 8;
|
2009-06-11 16:49:15 +02:00
|
|
|
val |= *(((unsigned char *) ptr) + 2);
|
2008-11-12 14:43:54 +01:00
|
|
|
|
|
|
|
return val;
|
|
|
|
}
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
Datum
|
2004-08-29 07:07:03 +02:00
|
|
|
show_trgm(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
text *in = PG_GETARG_TEXT_P(0);
|
|
|
|
TRGM *trg;
|
|
|
|
Datum *d;
|
|
|
|
ArrayType *a;
|
|
|
|
trgm *ptr;
|
2007-07-13 01:10:57 +02:00
|
|
|
int i;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
2004-08-29 07:07:03 +02:00
|
|
|
d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2007-07-13 01:10:57 +02:00
|
|
|
for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
|
2004-08-29 07:07:03 +02:00
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2009-06-11 16:49:15 +02:00
|
|
|
if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
|
2008-11-12 14:43:54 +01:00
|
|
|
{
|
|
|
|
snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
|
|
|
|
SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
SET_VARSIZE(item, VARHDRSZ + 3);
|
|
|
|
CPTRGM(VARDATA(item), ptr);
|
|
|
|
}
|
2007-07-13 01:10:57 +02:00
|
|
|
d[i] = PointerGetDatum(item);
|
2004-05-31 19:18:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
a = construct_array(
|
2004-08-29 07:07:03 +02:00
|
|
|
d,
|
|
|
|
ARRNELEM(trg),
|
|
|
|
TEXTOID,
|
|
|
|
-1,
|
|
|
|
false,
|
|
|
|
'i'
|
|
|
|
);
|
2004-05-31 19:18:12 +02:00
|
|
|
|
2007-07-13 01:10:57 +02:00
|
|
|
for (i = 0; i < ARRNELEM(trg); i++)
|
|
|
|
pfree(DatumGetPointer(d[i]));
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
pfree(d);
|
|
|
|
pfree(trg);
|
2004-08-29 07:07:03 +02:00
|
|
|
PG_FREE_IF_COPY(in, 0);
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
PG_RETURN_POINTER(a);
|
|
|
|
}
|
|
|
|
|
|
|
|
float4
|
2009-06-11 16:49:15 +02:00
|
|
|
cnt_sml(TRGM *trg1, TRGM *trg2)
|
2004-08-29 07:07:03 +02:00
|
|
|
{
|
|
|
|
trgm *ptr1,
|
|
|
|
*ptr2;
|
|
|
|
int count = 0;
|
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
ptr1 = GETARR(trg1);
|
|
|
|
ptr2 = GETARR(trg2);
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
len1 = ARRNELEM(trg1);
|
|
|
|
len2 = ARRNELEM(trg2);
|
|
|
|
|
2013-02-13 20:07:06 +01:00
|
|
|
/* explicit test is needed to avoid 0/0 division when both lengths are 0 */
|
|
|
|
if (len1 <= 0 || len2 <= 0)
|
|
|
|
return (float4) 0.0;
|
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
|
|
|
{
|
|
|
|
int res = CMPTRGM(ptr1, ptr2);
|
|
|
|
|
|
|
|
if (res < 0)
|
2004-05-31 19:18:12 +02:00
|
|
|
ptr1++;
|
2004-08-29 07:07:03 +02:00
|
|
|
else if (res > 0)
|
2004-05-31 19:18:12 +02:00
|
|
|
ptr2++;
|
2004-08-29 07:07:03 +02:00
|
|
|
else
|
|
|
|
{
|
2004-05-31 19:18:12 +02:00
|
|
|
ptr1++;
|
|
|
|
ptr2++;
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef DIVUNION
|
2013-02-13 20:07:06 +01:00
|
|
|
return ((float4) count) / ((float4) (len1 + len2 - count));
|
2004-05-31 19:18:12 +02:00
|
|
|
#else
|
2013-02-13 20:07:06 +01:00
|
|
|
return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
|
2004-05-31 19:18:12 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2011-02-01 03:33:55 +01:00
|
|
|
/*
|
|
|
|
* Returns whether trg2 contains all trigrams in trg1.
|
|
|
|
* This relies on the trigram arrays being sorted.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
trgm_contained_by(TRGM *trg1, TRGM *trg2)
|
|
|
|
{
|
|
|
|
trgm *ptr1,
|
|
|
|
*ptr2;
|
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
|
|
|
|
ptr1 = GETARR(trg1);
|
|
|
|
ptr2 = GETARR(trg2);
|
|
|
|
|
|
|
|
len1 = ARRNELEM(trg1);
|
|
|
|
len2 = ARRNELEM(trg2);
|
|
|
|
|
|
|
|
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
|
|
|
|
{
|
|
|
|
int res = CMPTRGM(ptr1, ptr2);
|
|
|
|
|
|
|
|
if (res < 0)
|
|
|
|
return false;
|
|
|
|
else if (res > 0)
|
|
|
|
ptr2++;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ptr1++;
|
|
|
|
ptr2++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ptr1 - GETARR(trg1) < len1)
|
|
|
|
return false;
|
|
|
|
else
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-04-10 19:30:14 +02:00
|
|
|
/*
|
|
|
|
* Return a palloc'd boolean array showing, for each trigram in "query",
|
|
|
|
* whether it is present in the trigram array "key".
|
|
|
|
* This relies on the "key" array being sorted, but "query" need not be.
|
|
|
|
*/
|
|
|
|
bool *
|
|
|
|
trgm_presence_map(TRGM *query, TRGM *key)
|
|
|
|
{
|
|
|
|
bool *result;
|
|
|
|
trgm *ptrq = GETARR(query),
|
|
|
|
*ptrk = GETARR(key);
|
|
|
|
int lenq = ARRNELEM(query),
|
|
|
|
lenk = ARRNELEM(key),
|
|
|
|
i;
|
|
|
|
|
|
|
|
result = (bool *) palloc0(lenq * sizeof(bool));
|
|
|
|
|
|
|
|
/* for each query trigram, do a binary search in the key array */
|
|
|
|
for (i = 0; i < lenq; i++)
|
|
|
|
{
|
|
|
|
int lo = 0;
|
|
|
|
int hi = lenk;
|
|
|
|
|
|
|
|
while (lo < hi)
|
|
|
|
{
|
|
|
|
int mid = (lo + hi) / 2;
|
|
|
|
int res = CMPTRGM(ptrq, ptrk + mid);
|
|
|
|
|
|
|
|
if (res < 0)
|
|
|
|
hi = mid;
|
|
|
|
else if (res > 0)
|
|
|
|
lo = mid + 1;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
result[i] = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ptrq++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
Datum
|
2004-08-29 07:07:03 +02:00
|
|
|
similarity(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
text *in1 = PG_GETARG_TEXT_P(0);
|
|
|
|
text *in2 = PG_GETARG_TEXT_P(1);
|
|
|
|
TRGM *trg1,
|
|
|
|
*trg2;
|
|
|
|
float4 res;
|
2004-05-31 19:18:12 +02:00
|
|
|
|
|
|
|
trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
|
|
|
|
trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
|
|
|
|
|
2004-08-29 07:07:03 +02:00
|
|
|
res = cnt_sml(trg1, trg2);
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
pfree(trg1);
|
|
|
|
pfree(trg2);
|
2004-08-29 07:07:03 +02:00
|
|
|
PG_FREE_IF_COPY(in1, 0);
|
|
|
|
PG_FREE_IF_COPY(in2, 1);
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
PG_RETURN_FLOAT4(res);
|
|
|
|
}
|
|
|
|
|
2010-12-04 06:16:21 +01:00
|
|
|
Datum
|
|
|
|
similarity_dist(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
|
|
|
|
PG_GETARG_DATUM(0),
|
|
|
|
PG_GETARG_DATUM(1)));
|
2011-04-10 17:42:00 +02:00
|
|
|
|
2010-12-04 06:16:21 +01:00
|
|
|
PG_RETURN_FLOAT4(1.0 - res);
|
|
|
|
}
|
|
|
|
|
2004-05-31 19:18:12 +02:00
|
|
|
Datum
|
2004-08-29 07:07:03 +02:00
|
|
|
similarity_op(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2010-12-04 06:16:21 +01:00
|
|
|
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
|
2005-10-15 04:49:52 +02:00
|
|
|
PG_GETARG_DATUM(0),
|
2010-12-04 06:16:21 +01:00
|
|
|
PG_GETARG_DATUM(1)));
|
2004-08-29 07:07:03 +02:00
|
|
|
|
|
|
|
PG_RETURN_BOOL(res >= trgm_limit);
|
2004-05-31 19:18:12 +02:00
|
|
|
}
|