postgresql/contrib/unaccent/unaccent.c
2010-01-02 16:58:17 +00:00

319 lines
6.7 KiB
C

/*-------------------------------------------------------------------------
*
* unaccent.c
* Text search unaccent dictionary
*
* Copyright (c) 2009-2010, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.4 2010/01/02 16:57:33 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
PG_MODULE_MAGIC;
/*
 * The unaccent dictionary finds the character to replace by walking an
 * uncompressed tree, one byte of the source character per level.
 * Each node of the tree is an array of 256 SuffixChar structs; the n-th
 * element of the array corresponds to byte value n.
 */
typedef struct SuffixChar
{
	/* node for the next byte of a longer source sequence, or NULL */
	struct SuffixChar *nextChar;
	/* replacement bytes (stored without a trailing NUL), or NULL if no rule ends here */
	char	   *replaceTo;
	/* number of bytes stored in replaceTo */
	int			replacelen;
} SuffixChar;
/*
 * placeChar - store one replacement rule in the tree, descending one tree
 * level per byte of the source string.
 *
 * node is the root of the (sub)tree to insert into, or NULL to have a fresh
 * 256-entry node allocated.  str/lenstr are the source bytes and
 * replaceTo/replacelen the replacement bytes, which are copied.
 *
 * Returns the root of the (sub)tree, i.e. node itself or the node allocated
 * in its place.  If the source string already has a replacement, a WARNING
 * is emitted and the existing replacement is kept.
 */
static SuffixChar*
placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
{
	SuffixChar *root = node;
	SuffixChar **link = &root;	/* where the current level's node pointer lives */
	SuffixChar *slot;

	for (;;)
	{
		/* create the node for this level the first time it is needed */
		if ( *link == NULL )
		{
			*link = palloc(sizeof(SuffixChar) * 256);
			memset(*link, 0, sizeof(SuffixChar) * 256);
		}

		slot = *link + *str;

		/* the last source byte selects the slot that carries the rule */
		if ( lenstr == 1 )
			break;

		link = &slot->nextChar;
		str++;
		lenstr--;
	}

	if ( slot->replaceTo == NULL )
	{
		slot->replacelen = replacelen;
		slot->replaceTo = palloc( replacelen );
		memcpy(slot->replaceTo, replaceTo, replacelen);
	}
	else
		elog(WARNING, "duplicate TO argument, use first one");

	return root;
}
/*
 * initSuffixTree - build the replacement tree from a rules file.
 *
 * filename is resolved with get_tsearch_config_filename() using the "rules"
 * extension.  The file is read line by line through tsearch_readline(),
 * which converts the UTF8-encoded file into the current encoding.  A line
 * made of two whitespace-separated tokens defines one rule (source string,
 * replacement string); any other line is ignored.
 *
 * Returns the root of the tree, or NULL if the file defined no rules.
 * Raises ERROR if the file cannot be opened.
 */
static SuffixChar*
initSuffixTree(char *filename)
{
	SuffixChar * volatile rootSuffixTree = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
		 * emit an exception if it finds characters that are untranslatable
		 * in the current encoding.  Such lines are skipped and reading
		 * resumes with the next one, so assume a retry is needed until the
		 * read loop has run to end of file.
		 */
		skip = true;

		PG_TRY();
		{
			char	src[4096];
			char	trg[4096];
			int		srclen;
			int		trglen;
			char   *line;

			while ((line = tsearch_readline(&trst)) != NULL)
			{
				/*
				 * The field widths keep sscanf() within the 4096-byte
				 * buffers (4095 bytes plus the terminating NUL).
				 */
				if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
				{
					srclen = strlen(src);
					trglen = strlen(trg);

					rootSuffixTree = placeChar(rootSuffixTree,
											   (unsigned char*)src, srclen,
											   trg, trglen);
				}

				/* release the line whether or not it held a rule */
				pfree(line);
			}

			/*
			 * End of file reached without an error.  This must be set here
			 * rather than per rule: otherwise a file without any rule would
			 * be re-read forever, and an untranslatable line following a
			 * good one would silently end the load.
			 */
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData	   *errdata;
			MemoryContext	ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				/* drop the error and go around again for the next line */
				FlushErrorState();
			}
			else
			{
				/* anything else is propagated unchanged */
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while(skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}
/*
 * findReplaceTo - look up a multibyte character in the tree.
 *
 * Follows the srclen bytes at src down from node, one level per byte.
 * Returns the slot selected by the last byte (the caller must still check
 * its replaceTo field), or NULL if the tree has no node at some level
 * along the way.
 */
static SuffixChar *
findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
{
	SuffixChar *level;
	int			remaining;

	for (level = node, remaining = srclen; level != NULL; src++, remaining--)
	{
		SuffixChar *slot = level + *src;

		if ( remaining == 1 )
			return slot;

		level = slot->nextChar;
	}

	return NULL;
}
/*
 * unaccent_init - build the dictionary data from its option list.
 *
 * Argument 0 is a List of DefElem options.  Exactly one option named
 * "Rules" (compared case-insensitively) is required; its string value is
 * handed to initSuffixTree().  Raises ERROR for an unrecognized option, a
 * repeated Rules option, or a missing Rules option.
 *
 * Returns the root of the replacement tree as a pointer Datum.
 */
PG_FUNCTION_INFO_V1(unaccent_init);
Datum unaccent_init(PG_FUNCTION_ARGS);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
	List	   *options = (List *) PG_GETARG_POINTER(0);
	SuffixChar *tree = NULL;
	bool		haveRules = false;
	ListCell   *cell;

	foreach(cell, options)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		/* "Rules" is the only option this dictionary understands */
		if (pg_strcasecmp("Rules", opt->defname) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Unaccent parameter: \"%s\"",
							opt->defname)));

		if (haveRules)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("multiple Rules parameters")));

		tree = initSuffixTree(defGetString(opt));
		haveRules = true;
	}

	if (!haveRules)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Rules parameter")));

	PG_RETURN_POINTER(tree);
}
/*
 * unaccent_lexize - replace characters of the input according to the tree.
 *
 * Argument 0 is the tree root produced by unaccent_init, argument 1 points
 * to the input bytes and argument 2 is their length in bytes.
 *
 * Returns NULL when no character of the input has a replacement.
 * Otherwise returns a palloc'd array of two TSLexeme (the second left
 * zeroed) whose first entry has flags = TSL_FILTER and a NUL-terminated
 * lexeme holding the input with every matching character replaced.
 *
 * A replacement may be longer than the character it replaces, by any
 * amount the rules file allows, so the output size cannot be bounded from
 * the input length alone.  The input is therefore scanned twice: once to
 * compute the exact output size, once to fill the buffer.
 */
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum unaccent_lexize(PG_FUNCTION_ARGS);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
	SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
	char	   *srcstart = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *srcchar;
	char	   *trgchar;
	int			charlen;
	int			nreplaced = 0;	/* number of characters that have a rule */
	size_t		outlen = 0;		/* exact size of the result, without the NUL */
	TSLexeme   *res = NULL;
	SuffixChar *node;

	/* Pass 1: find out whether anything matches and how long the result is. */
	srcchar = srcstart;
	while( srcchar - srcstart < len )
	{
		charlen = pg_mblen(srcchar);

		node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
		if ( node && node->replaceTo )
		{
			nreplaced++;
			outlen += node->replacelen;
		}
		else
			outlen += charlen;

		srcchar += charlen;
	}

	/* Pass 2: allocate the result only if it's needed, then fill it. */
	if ( nreplaced > 0 )
	{
		res = palloc0(sizeof(TSLexeme) * 2);
		res->lexeme = trgchar = palloc( outlen + 1 /* \0 */ );
		res->flags = TSL_FILTER;

		srcchar = srcstart;
		while( srcchar - srcstart < len )
		{
			charlen = pg_mblen(srcchar);

			node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
			if ( node && node->replaceTo )
			{
				memcpy( trgchar, node->replaceTo, node->replacelen );
				trgchar += node->replacelen;
			}
			else
			{
				/* no rule for this character: copy it through unchanged */
				memcpy( trgchar, srcchar, charlen );
				trgchar += charlen;
			}

			srcchar += charlen;
		}

		*trgchar = '\0';
	}

	PG_RETURN_POINTER(res);
}
/*
 * unaccent_dict - function-like wrapper around the dictionary.
 *
 * Called with one argument, the text is processed by the dictionary named
 * "unaccent"; called with more, argument 0 is the dictionary OID and
 * argument 1 the text.  The dictionary's lexize method is invoked on the
 * text bytes.
 *
 * Returns the first lexeme produced by the dictionary as text, or a copy
 * of the input text when the dictionary returns nothing.
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum unaccent_dict(PG_FUNCTION_ARGS);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	int			textArg = (PG_NARGS() == 1) ? 0 : 1;
	Oid			dictId;
	text	   *input;
	TSDictionaryCacheEntry *entry;
	TSLexeme   *lexemes;

	if (textArg == 0)
		dictId = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
	else
		dictId = PG_GETARG_OID(0);

	input = PG_GETARG_TEXT_P(textArg);
	entry = lookup_ts_dictionary_cache(dictId);

	lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
														 PointerGetDatum(entry->dictData),
														 PointerGetDatum(VARDATA(input)),
														 Int32GetDatum(VARSIZE(input) - VARHDRSZ),
														 PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(input, textArg);

	if ( lexemes != NULL && lexemes->lexeme != NULL )
	{
		text	   *result = cstring_to_text(lexemes->lexeme);

		pfree(lexemes->lexeme);
		pfree(lexemes);
		PG_RETURN_TEXT_P(result);
	}

	/* the dictionary produced nothing: hand back the input unchanged */
	if ( lexemes != NULL )
		pfree(lexemes);

	PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(textArg));
}