postgresql/contrib/unaccent/unaccent.c

/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009-2010, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.6 2010/08/05 15:25:35 rhaas Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;

/*
 * The unaccent dictionary uses an uncompressed suffix tree to find the
 * character to replace. Each node of the tree is an array of 256
 * SuffixChar structs; the n-th element of the array corresponds to
 * byte value n.
 */
typedef struct SuffixChar
{
	struct SuffixChar *nextChar;	/* node for the following byte, or NULL */
	char	   *replaceTo;	/* replacement bytes (no trailing NUL), or NULL */
	int			replacelen;	/* number of bytes stored in replaceTo */
} SuffixChar;

/*
 * placeChar - store a rule in the tree, descending one level per key byte.
 *
 * node is the root of the tree to insert into; when it is NULL a new
 * zero-filled 256-entry node is allocated and becomes the root.  str/lenstr
 * give the key bytes, replaceTo/replacelen the replacement, which is copied.
 * If the key already has a replacement, a WARNING is emitted and the
 * existing replacement is kept.
 *
 * Returns the (possibly newly allocated) root.
 */
static SuffixChar *
placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
{
	SuffixChar *root = node;
	SuffixChar **level = &root;
	SuffixChar *entry;

	for (;;)
	{
		/* create this level on demand, with every entry zeroed */
		if (*level == NULL)
		{
			*level = palloc(sizeof(SuffixChar) * 256);
			memset(*level, 0, sizeof(SuffixChar) * 256);
		}

		entry = *level + *str;

		/* the last key byte selects the entry that carries the rule */
		if (lenstr == 1)
			break;

		level = &entry->nextChar;
		str++;
		lenstr--;
	}

	if (entry->replaceTo)
	{
		elog(WARNING, "duplicate TO argument, use first one");
		return root;
	}

	entry->replacelen = replacelen;
	entry->replaceTo = palloc(replacelen);
	memcpy(entry->replaceTo, replaceTo, replacelen);

	return root;
}

/*
 * initSuffixTree - create the suffix tree from a rules file.
 *
 * filename is resolved with get_tsearch_config_filename() using the "rules"
 * extension, and an ERROR is raised if the file cannot be opened.  Each line
 * that yields two whitespace-separated tokens adds a rule replacing the
 * first token with the second; any other line is ignored.
 *
 * Lines for which tsearch_readline() raises
 * ERRCODE_UNTRANSLATABLE_CHARACTER are skipped and reading continues with
 * the next line; every other error is re-thrown.
 *
 * Returns the root of the tree, or NULL if no rule was loaded.
 */
static SuffixChar *
initSuffixTree(char *filename)
{
	SuffixChar *volatile rootSuffixTree = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * Assume we will have to come back for more lines; this is cleared
		 * only once the reader has reached end of file without an error.
		 */
		skip = true;

		PG_TRY();
		{
			char		src[4096];
			char		trg[4096];
			char	   *line;

			/*
			 * pg_do_encoding_conversion() (called by tsearch_readline()) will
			 * emit exception if it finds untranslatable characters in current
			 * locale. We just skip such lines, continuing with the next.
			 */
			while ((line = tsearch_readline(&trst)) != NULL)
			{
				/*
				 * The field widths must stay one less than the sizes of
				 * src[] and trg[], so an overlong token cannot overrun them.
				 */
				if (sscanf(line, "%4095s\t%4095s\n", src, trg) == 2)
					rootSuffixTree = placeChar(rootSuffixTree,
											   (unsigned char *) src,
											   (int) strlen(src),
											   trg,
											   (int) strlen(trg));

				/* release the line whether or not it contained a rule */
				pfree(line);
			}

			/*
			 * Reached end of file.  This must not be set inside the loop:
			 * otherwise an untranslatable line following a good one would
			 * end loading early, and a file without any valid rule would
			 * never terminate the outer loop.
			 */
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				/* drop the error and resume with the next line */
				FlushErrorState();
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}

/*
 * findReplaceTo - look up a multibyte character in the tree.
 *
 * Follows the srclen bytes at src, one tree level per byte.  Returns the
 * entry selected by the last byte (its replaceTo may still be NULL), or
 * NULL if the tree has no level for one of the bytes.
 */
static SuffixChar *
findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
{
	SuffixChar *level = node;
	int			pos;

	for (pos = 0; level != NULL; pos++)
	{
		SuffixChar *entry = &level[src[pos]];

		/* exactly one byte left: this entry is the answer */
		if (srclen - pos == 1)
			return entry;

		level = entry->nextChar;
	}

	return NULL;
}

PG_FUNCTION_INFO_V1(unaccent_init);
Datum		unaccent_init(PG_FUNCTION_ARGS);

/*
 * unaccent_init - dictionary init method.
 *
 * Argument 0 is the list of dictionary options (DefElem nodes).  "Rules",
 * matched case-insensitively, is the only accepted option and must be given
 * exactly once; its value is passed to initSuffixTree().  Any other option,
 * a repeated Rules option, or a missing Rules option raises an ERROR.
 *
 * Returns the root of the loaded tree as a pointer Datum.
 */
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
	List	   *options = (List *) PG_GETARG_POINTER(0);
	SuffixChar *tree = NULL;
	bool		haveRules = false;
	ListCell   *cell;

	foreach(cell, options)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		/* ereport(ERROR) does not return, so these act as guards */
		if (pg_strcasecmp("Rules", opt->defname) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Unaccent parameter: \"%s\"",
							opt->defname)));

		if (haveRules)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("multiple Rules parameters")));

		tree = initSuffixTree(defGetString(opt));
		haveRules = true;
	}

	if (!haveRules)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Rules parameter")));

	PG_RETURN_POINTER(tree);
}

PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum		unaccent_lexize(PG_FUNCTION_ARGS);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
	SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *srcstart,
			   *trgchar = NULL;
	int			charlen;
	TSLexeme   *res = NULL;
	SuffixChar *node;

	srcstart = srcchar;
	while (srcchar - srcstart < len)
	{
		charlen = pg_mblen(srcchar);

		node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			if (!res)
			{
				/* allocate res only it it's needed */
				res = palloc0(sizeof(TSLexeme) * 2);
				res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
				res->flags = TSL_FILTER;
				if (srcchar != srcstart)
				{
					memcpy(trgchar, srcstart, srcchar - srcstart);
					trgchar += (srcchar - srcstart);
				}
			}
			memcpy(trgchar, node->replaceTo, node->replacelen);
			trgchar += node->replacelen;
		}
		else if (res)
		{
			memcpy(trgchar, srcchar, charlen);
			trgchar += charlen;
		}

		srcchar += charlen;
	}

	if (res)
		*trgchar = '\0';

	PG_RETURN_POINTER(res);
}

/*
 * Function-like wrapper for dictionary
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum		unaccent_dict(PG_FUNCTION_ARGS);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	text	   *str;
	int			strArg;
	Oid			dictOid;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (PG_NARGS() == 1)
	{
		dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
		strArg = 0;
	}
	else
	{
		dictOid = PG_GETARG_OID(0);
		strArg = 1;
	}
	str = PG_GETARG_TEXT_P(strArg);

	dict = lookup_ts_dictionary_cache(dictOid);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(VARDATA(str)),
									  Int32GetDatum(VARSIZE(str) - VARHDRSZ),
													 PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(str, strArg);

	if (res == NULL)
	{
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else if (res->lexeme == NULL)
	{
		pfree(res);
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else
	{
		text	   *txt = cstring_to_text(res->lexeme);

		pfree(res->lexeme);
		pfree(res);

		PG_RETURN_TEXT_P(txt);
	}
}