/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009-2014, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/unaccent/unaccent.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "lib/stringinfo.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;
/*
 * The unaccent dictionary uses a trie to find the replacement for a
 * character.  Each node of the trie is an array of 256 TrieChar structs;
 * the n-th element of the array corresponds to byte value n.
 */
typedef struct TrieChar
{
	struct TrieChar *nextChar;	/* node for the following byte, or NULL */
	char	   *replaceTo;		/* replacement bytes (stored without a
								 * trailing NUL), or NULL if no rule ends
								 * at this byte */
	int			replacelen;		/* number of bytes in replaceTo */
} TrieChar;
/*
 * placeChar - put str into the trie's structure, byte by byte.
 *
 * node is the trie (or sub-trie) to insert into; if it is NULL a new
 * zero-filled 256-entry node is allocated.  str/lenstr give the source
 * bytes of the rule and replaceTo/replacelen the replacement bytes, which
 * are copied into freshly palloc'd storage (no terminating NUL is stored).
 * lenstr must be at least 1.
 *
 * If a replacement is already stored for the same source bytes, a WARNING
 * is emitted and the existing replacement is kept.
 *
 * Returns the node passed in, or the newly allocated node if it was NULL.
 */
static TrieChar *
placeChar(TrieChar *node, const unsigned char *str, int lenstr,
		  const char *replaceTo, int replacelen)
{
	TrieChar   *root = node;
	TrieChar  **link = &root;	/* where the current level's node pointer
								 * lives */

	Assert(lenstr > 0);			/* else str[0] doesn't exist */

	for (;;)
	{
		TrieChar   *slot;

		/* create this level on demand; palloc0 leaves every entry empty */
		if (*link == NULL)
			*link = (TrieChar *) palloc0(sizeof(TrieChar) * 256);

		slot = *link + *str;

		/*
		 * "<= 1" rather than "== 1" so that a bogus non-positive length
		 * cannot make us walk past the end of str forever.
		 */
		if (lenstr <= 1)
		{
			if (slot->replaceTo)
				elog(WARNING, "duplicate TO argument, use first one");
			else
			{
				slot->replacelen = replacelen;
				slot->replaceTo = palloc(replacelen);
				memcpy(slot->replaceTo, replaceTo, replacelen);
			}
			break;
		}

		/* descend to the level for the next source byte */
		link = &slot->nextChar;
		str++;
		lenstr--;
	}

	return root;
}
/*
* initTrie - create trie from file.
*
* Function converts UTF8-encoded file into current encoding.
2009-08-18 12:34:39 +02:00
*/
static TrieChar *
initTrie(char *filename)
2009-08-18 12:34:39 +02:00
{
TrieChar *volatile rootTrie = NULL;
2009-08-18 12:34:39 +02:00
MemoryContext ccxt = CurrentMemoryContext;
2010-02-26 03:01:40 +01:00
tsearch_readline_state trst;
volatile bool skip;
2009-08-18 12:34:39 +02:00
filename = get_tsearch_config_filename(filename, "rules");
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not open unaccent file \"%s\": %m",
filename)));
2010-02-26 03:01:40 +01:00
do
2009-08-18 12:34:39 +02:00
{
/*
* pg_do_encoding_conversion() (called by tsearch_readline()) will
* emit exception if it finds untranslatable characters in current
* locale. We just skip such lines, continuing with the next.
*/
2009-08-18 12:34:39 +02:00
skip = true;
PG_TRY();
{
char *line;
2009-08-18 12:34:39 +02:00
while ((line = tsearch_readline(&trst)) != NULL)
{
/*----------
* The format of each line must be "src" or "src trg", where
* src and trg are sequences of one or more non-whitespace
* characters, separated by whitespace. Whitespace at start
* or end of line is ignored. If trg is omitted, an empty
* string is used as the replacement.
*
* We use a simple state machine, with states
* 0 initial (before src)
* 1 in src
* 2 in whitespace after src
* 3 in trg
* 4 in whitespace after trg
* -1 syntax error detected (line will be ignored)
*----------
*/
int state;
char *ptr;
char *src = NULL;
char *trg = NULL;
int ptrlen;
int srclen = 0;
int trglen = 0;
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
ptrlen = pg_mblen(ptr);
/* ignore whitespace, but end src or trg */
if (t_isspace(ptr))
{
if (state == 1)
state = 2;
else if (state == 3)
state = 4;
continue;
}
switch (state)
{
case 0:
/* start of src */
src = ptr;
srclen = ptrlen;
state = 1;
break;
case 1:
/* continue src */
srclen += ptrlen;
break;
case 2:
/* start of trg */
trg = ptr;
trglen = ptrlen;
state = 3;
break;
case 3:
/* continue trg */
trglen += ptrlen;
break;
default:
/* bogus line format */
state = -1;
break;
}
}
if (state == 1 || state == 2)
{
/* trg was omitted, so use "" */
trg = "";
trglen = 0;
}
if (state > 0)
rootTrie = placeChar(rootTrie,
(unsigned char *) src, srclen,
trg, trglen);
2009-08-18 12:34:39 +02:00
pfree(line);
}
skip = false;
2009-08-18 12:34:39 +02:00
}
PG_CATCH();
{
ErrorData *errdata;
MemoryContext ecxt;
ecxt = MemoryContextSwitchTo(ccxt);
errdata = CopyErrorData();
if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
{
FlushErrorState();
}
else
{
MemoryContextSwitchTo(ecxt);
PG_RE_THROW();
}
}
PG_END_TRY();
}
2010-02-26 03:01:40 +01:00
while (skip);
2009-08-18 12:34:39 +02:00
tsearch_readline_end(&trst);
return rootTrie;
2009-08-18 12:34:39 +02:00
}
/*
 * findReplaceTo - find multibyte character in trie
 *
 * Follows the srclen bytes starting at src down from node.  Returns the
 * trie entry reached by the last byte (its replaceTo may be NULL), or NULL
 * if a trie level is missing before the last byte is reached.
 */
static TrieChar *
findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
{
	TrieChar   *level;

	for (level = node; level != NULL; srclen--)
	{
		/* entry for the current byte within this level */
		TrieChar   *slot = &level[*src++];

		if (srclen == 1)
			return slot;

		level = slot->nextChar;
	}

	return NULL;
}
PG_FUNCTION_INFO_V1(unaccent_init);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
2010-02-26 03:01:40 +01:00
List *dictoptions = (List *) PG_GETARG_POINTER(0);
TrieChar *rootTrie = NULL;
2010-02-26 03:01:40 +01:00
bool fileloaded = false;
2009-08-18 12:34:39 +02:00
ListCell *l;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
if (pg_strcasecmp("Rules", defel->defname) == 0)
{
if (fileloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple Rules parameters")));
rootTrie = initTrie(defGetString(defel));
2010-02-26 03:01:40 +01:00
fileloaded = true;
2009-08-18 12:34:39 +02:00
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized Unaccent parameter: \"%s\"",
defel->defname)));
}
}
if (!fileloaded)
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing Rules parameter")));
}
PG_RETURN_POINTER(rootTrie);
2009-08-18 12:34:39 +02:00
}
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
2010-02-26 03:01:40 +01:00
char *srcchar = (char *) PG_GETARG_POINTER(1);
2009-08-18 12:34:39 +02:00
int32 len = PG_GETARG_INT32(2);
2010-02-26 03:01:40 +01:00
char *srcstart,
*trgchar = NULL;
2009-08-18 12:34:39 +02:00
int charlen;
TSLexeme *res = NULL;
TrieChar *node;
2009-08-18 12:34:39 +02:00
srcstart = srcchar;
2010-02-26 03:01:40 +01:00
while (srcchar - srcstart < len)
2009-08-18 12:34:39 +02:00
{
charlen = pg_mblen(srcchar);
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
2010-02-26 03:01:40 +01:00
if (node && node->replaceTo)
2009-08-18 12:34:39 +02:00
{
2010-02-26 03:01:40 +01:00
if (!res)
2009-08-18 12:34:39 +02:00
{
2012-04-22 18:23:47 +02:00
/* allocate res only if it's needed */
2009-08-18 12:34:39 +02:00
res = palloc0(sizeof(TSLexeme) * 2);
2010-02-26 03:01:40 +01:00
res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
2009-08-18 12:34:39 +02:00
res->flags = TSL_FILTER;
2010-02-26 03:01:40 +01:00
if (srcchar != srcstart)
2009-08-18 12:34:39 +02:00
{
memcpy(trgchar, srcstart, srcchar - srcstart);
trgchar += (srcchar - srcstart);
}
}
2010-02-26 03:01:40 +01:00
memcpy(trgchar, node->replaceTo, node->replacelen);
trgchar += node->replacelen;
2009-08-18 12:34:39 +02:00
}
2010-02-26 03:01:40 +01:00
else if (res)
2009-08-18 12:34:39 +02:00
{
2010-02-26 03:01:40 +01:00
memcpy(trgchar, srcchar, charlen);
2009-08-18 12:34:39 +02:00
trgchar += charlen;
}
srcchar += charlen;
}
2010-02-26 03:01:40 +01:00
if (res)
2009-08-18 12:34:39 +02:00
*trgchar = '\0';
PG_RETURN_POINTER(res);
}
/*
* Function-like wrapper for dictionary
*/
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
2010-02-26 03:01:40 +01:00
text *str;
int strArg;
Oid dictOid;
TSDictionaryCacheEntry *dict;
TSLexeme *res;
2009-08-18 12:34:39 +02:00
if (PG_NARGS() == 1)
{
dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
2009-08-18 12:34:39 +02:00
strArg = 0;
}
else
{
dictOid = PG_GETARG_OID(0);
strArg = 1;
}
str = PG_GETARG_TEXT_P(strArg);
dict = lookup_ts_dictionary_cache(dictOid);
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
2010-02-26 03:01:40 +01:00
PointerGetDatum(dict->dictData),
PointerGetDatum(VARDATA(str)),
Int32GetDatum(VARSIZE(str) - VARHDRSZ),
2009-08-18 12:34:39 +02:00
PointerGetDatum(NULL)));
PG_FREE_IF_COPY(str, strArg);
2010-02-26 03:01:40 +01:00
if (res == NULL)
2009-08-18 12:34:39 +02:00
{
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
}
2010-02-26 03:01:40 +01:00
else if (res->lexeme == NULL)
2009-08-18 12:34:39 +02:00
{
pfree(res);
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
}
else
{
2010-02-26 03:01:40 +01:00
text *txt = cstring_to_text(res->lexeme);
2009-08-18 12:34:39 +02:00
pfree(res->lexeme);
pfree(res);
PG_RETURN_TEXT_P(txt);
}
}