postgresql/contrib/unaccent/unaccent.c

/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009-2014, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/unaccent/unaccent.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;

/*
 * Unaccent dictionary uses a trie to find a character to replace. Each node of
 * the trie is an array of 256 TrieChar structs (n-th element of array
 * corresponds to byte)
 */
typedef struct TrieChar
{
	/* child node (another 256-entry array) for the next byte, or NULL */
	struct TrieChar *nextChar;
	/*
	 * replacement bytes for the string ending at this entry, or NULL if no
	 * rule ends here; copied with memcpy and NOT NUL-terminated
	 */
	char	   *replaceTo;
	/* length of replaceTo in bytes; 0 when the rule has an empty target */
	int			replacelen;
} TrieChar;

/*
 * placeChar - insert the byte string str (lenstr bytes) into the trie and
 * attach a private copy of replaceTo (replacelen bytes) to its last byte.
 *
 * node is the root of the (sub)trie to insert into, or NULL to start a new
 * one.  Each trie level is a palloc'd, zero-filled array of 256 TrieChar
 * entries, created the first time it is needed.  The return value is the
 * root: node itself if it was non-NULL, otherwise the newly created array.
 *
 * lenstr must be at least 1; the descent only stops when exactly one byte
 * remains.  If the string already has a replacement, a WARNING is emitted
 * and the existing replacement is kept.
 */
static TrieChar *
placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
{
	TrieChar   *root = node;
	TrieChar  **level = &root;	/* link that points at the current level */
	TrieChar   *slot;

	/* walk down one level per byte, creating missing levels on the way */
	for (;;)
	{
		if (*level == NULL)
		{
			*level = palloc(sizeof(TrieChar) * 256);
			memset(*level, 0, sizeof(TrieChar) * 256);
		}

		slot = *level + *str;

		if (lenstr == 1)
			break;				/* slot is the entry for the final byte */

		level = &slot->nextChar;
		str++;
		lenstr--;
	}

	if (slot->replaceTo)
		elog(WARNING, "duplicate TO argument, use first one");
	else
	{
		/* keep our own copy; the caller's buffer is not retained */
		slot->replacelen = replacelen;
		slot->replaceTo = palloc(replacelen);
		memcpy(slot->replaceTo, replaceTo, replacelen);
	}

	return root;
}

/*
 * parseRuleLine - split one line of a rules file into its src and trg parts.
 *
 * The format of each line must be "src" or "src trg", where src and trg are
 * sequences of one or more non-whitespace characters, separated by
 * whitespace.  Whitespace at start or end of line is ignored.  If trg is
 * omitted, an empty string is used as the replacement.
 *
 * On success, returns true with *src / *srclen and *trg / *trglen set.  *src
 * always points into line; *trg points into line, or at a constant empty
 * string when trg was omitted, so the results are only valid while line is.
 * Returns false for lines that contain no src at all (empty or all
 * whitespace) and for lines with more than two fields; such lines are to be
 * ignored by the caller.
 */
static bool
parseRuleLine(char *line, char **src, int *srclen, char **trg, int *trglen)
{
	/*----------
	 * We use a simple state machine, with states
	 *	0	initial (before src)
	 *	1	in src
	 *	2	in whitespace after src
	 *	3	in trg
	 *	4	in whitespace after trg
	 *	-1	syntax error detected (line will be ignored)
	 *----------
	 */
	int			state = 0;
	char	   *ptr;
	int			ptrlen;

	*src = NULL;
	*trg = NULL;
	*srclen = 0;
	*trglen = 0;

	/* step through the line one (possibly multibyte) character at a time */
	for (ptr = line; *ptr; ptr += ptrlen)
	{
		ptrlen = pg_mblen(ptr);
		/* ignore whitespace, but end src or trg */
		if (t_isspace(ptr))
		{
			if (state == 1)
				state = 2;
			else if (state == 3)
				state = 4;
			continue;
		}
		switch (state)
		{
			case 0:
				/* start of src */
				*src = ptr;
				*srclen = ptrlen;
				state = 1;
				break;
			case 1:
				/* continue src */
				*srclen += ptrlen;
				break;
			case 2:
				/* start of trg */
				*trg = ptr;
				*trglen = ptrlen;
				state = 3;
				break;
			case 3:
				/* continue trg */
				*trglen += ptrlen;
				break;
			default:
				/* bogus line format: a third field, or already in error */
				state = -1;
				break;
		}
	}

	if (state == 1 || state == 2)
	{
		/* trg was omitted, so use "" */
		*trg = "";
		*trglen = 0;
	}

	return state > 0;
}

/*
 * initTrie  - create trie from file.
 *
 * filename is passed through get_tsearch_config_filename() with "rules" as
 * its second argument to obtain the path that is actually opened; failure to
 * open it is reported as an ERROR.
 *
 * Function converts UTF8-encoded file into current encoding.
 *
 * Returns the root of the trie built from all usable lines, or NULL if the
 * file contributed no rules.
 */
static TrieChar *
initTrie(char *filename)
{
	/*
	 * rootTrie and skip are modified inside PG_TRY and read after an error
	 * has been caught, hence volatile.
	 */
	TrieChar   *volatile rootTrie = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
		 * emit exception if it finds untranslatable characters in current
		 * locale. We just skip such lines, continuing with the next.
		 *
		 * skip stays true unless the read loop below runs to end of file, so
		 * a caught error makes the outer loop re-enter PG_TRY and resume
		 * reading.
		 */
		skip = true;

		PG_TRY();
		{
			char	   *line;

			while ((line = tsearch_readline(&trst)) != NULL)
			{
				char	   *src;
				char	   *trg;
				int			srclen;
				int			trglen;

				/* malformed and blank lines are silently ignored */
				if (parseRuleLine(line, &src, &srclen, &trg, &trglen))
					rootTrie = placeChar(rootTrie,
										 (unsigned char *) src, srclen,
										 trg, trglen);

				/* placeChar copied what it needs; src/trg die with line */
				pfree(line);
			}
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			/* return to the caller's context before copying the error */
			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				/*
				 * Swallow the error and go around again.
				 *
				 * NOTE(review): the ErrorData copy made above is not freed
				 * on this path, so each skipped line leaves one copy behind
				 * in ccxt.
				 */
				FlushErrorState();
			}
			else
			{
				/* anything else is not ours to handle */
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootTrie;
}

/*
 * findReplaceTo - look up one multibyte character in the trie.
 *
 * Follows the srclen bytes at src down from node, one trie level per byte.
 * Returns the trie entry reached by the last byte, or NULL if the trie has
 * no level for one of the bytes (including when node itself is NULL).  The
 * returned entry is not checked for a replacement: callers must test its
 * replaceTo field themselves.
 */
static TrieChar *
findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
{
	TrieChar   *level;

	for (level = node; level != NULL; srclen--)
	{
		/* entry for the current byte within this level's array */
		TrieChar   *slot = &level[*src++];

		if (srclen == 1)
			return slot;		/* that was the final byte */

		level = slot->nextChar;
	}

	/* ran out of trie before running out of bytes */
	return NULL;
}

PG_FUNCTION_INFO_V1(unaccent_init);
/*
 * unaccent_init - build the dictionary's private data from its options.
 *
 * Argument 0 is a List of DefElem options.  The only option accepted is
 * "Rules" (matched case-insensitively), which must appear exactly once; its
 * value is handed to initTrie().  Any other option, a repeated Rules option,
 * or a missing Rules option raises an ERROR.
 *
 * Returns the trie root produced by initTrie() as a pointer Datum.
 */
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
	List	   *options = (List *) PG_GETARG_POINTER(0);
	TrieChar   *trie = NULL;
	bool		haveRules = false;
	ListCell   *cell;

	foreach(cell, options)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		/* reject anything that is not the Rules option (ereport won't return) */
		if (pg_strcasecmp("Rules", opt->defname) != 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Unaccent parameter: \"%s\"",
							opt->defname)));

		/* Rules may be given only once */
		if (haveRules)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("multiple Rules parameters")));

		trie = initTrie(defGetString(opt));
		haveRules = true;
	}

	if (!haveRules)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Rules parameter")));

	PG_RETURN_POINTER(trie);
}

PG_FUNCTION_INFO_V1(unaccent_lexize);
/*
 * unaccent_lexize - apply the replacement rules to one input string.
 *
 * Arguments: 0 = trie root (as returned by unaccent_init), 1 = pointer to
 * the input bytes, 2 = input length in bytes.  The input is scanned one
 * character (per pg_mblen) at a time; every character that has a rule in the
 * trie is replaced by that rule's target, all others are copied unchanged.
 *
 * Returns NULL if no character was replaced.  Otherwise returns a palloc'd,
 * zero-initialized array of two TSLexeme, of which only the first is filled
 * in: lexeme is the NUL-terminated result string and flags is TSL_FILTER.
 *
 * The result buffer is sized from the lengths of the replacements actually
 * used.  Replacement strings come from the rules file and can be arbitrarily
 * long, so no bound derived from the input length and the encoding's maximum
 * character width is safe.
 */
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
	TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
	char	   *srcstart = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *srcchar;
	char	   *trgchar;
	int			charlen;
	size_t		outlen = 0;
	bool		replaced = false;
	TSLexeme   *res;
	TrieChar   *node;

	/*
	 * Pass 1: find out whether anything gets replaced and, if so, exactly
	 * how many bytes the result needs.
	 */
	for (srcchar = srcstart; srcchar - srcstart < len; srcchar += charlen)
	{
		charlen = pg_mblen(srcchar);

		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			outlen += node->replacelen;
			replaced = true;
		}
		else
			outlen += charlen;
	}

	/* allocate a result only if it's needed */
	if (!replaced)
		PG_RETURN_POINTER(NULL);

	res = palloc0(sizeof(TSLexeme) * 2);
	res->lexeme = trgchar = palloc(outlen + 1 /* \0 */ );
	res->flags = TSL_FILTER;

	/*
	 * Pass 2: build the result.  The trie is not modified between the
	 * passes, so every lookup gives the same answer as in pass 1 and exactly
	 * outlen bytes are written.
	 */
	for (srcchar = srcstart; srcchar - srcstart < len; srcchar += charlen)
	{
		charlen = pg_mblen(srcchar);

		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			memcpy(trgchar, node->replaceTo, node->replacelen);
			trgchar += node->replacelen;
		}
		else
		{
			memcpy(trgchar, srcchar, charlen);
			trgchar += charlen;
		}
	}

	*trgchar = '\0';

	PG_RETURN_POINTER(res);
}

/*
 * Function-like wrapper for dictionary
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	text	   *str;
	int			strArg;
	Oid			dictOid;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (PG_NARGS() == 1)
	{
		dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
		strArg = 0;
	}
	else
	{
		dictOid = PG_GETARG_OID(0);
		strArg = 1;
	}
	str = PG_GETARG_TEXT_P(strArg);

	dict = lookup_ts_dictionary_cache(dictOid);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(VARDATA(str)),
									  Int32GetDatum(VARSIZE(str) - VARHDRSZ),
													 PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(str, strArg);

	if (res == NULL)
	{
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else if (res->lexeme == NULL)
	{
		pfree(res);
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else
	{
		text	   *txt = cstring_to_text(res->lexeme);

		pfree(res->lexeme);
		pfree(res);

		PG_RETURN_TEXT_P(txt);
	}
}
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* unaccent.c`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`* Text search unaccent dictionary`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*`
Update copyright for 2014 Update all files in head, and files COPYRIGHT and legal.sgml in all back branches. 2014-01-07 22:05:30 +01:00			`* Copyright (c) 2009-2014, PostgreSQL Global Development Group`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*`
			`* IDENTIFICATION`
Remove cvs keywords from all files. 2010-09-20 22:08:53 +02:00			`* contrib/unaccent/unaccent.c`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*`
			`*-------------------------------------------------------------------------`
			`*/`

			`#include "postgres.h"`

			`#include "catalog/namespace.h"`
			`#include "commands/defrem.h"`
			`#include "tsearch/ts_cache.h"`
			`#include "tsearch/ts_locale.h"`
			`#include "tsearch/ts_public.h"`
			`#include "utils/builtins.h"`

			`PG_MODULE_MAGIC;`

			`/*`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`* Unaccent dictionary uses a trie to find a character to replace. Each node of`
			`* the trie is an array of 256 TrieChar structs (n-th element of array`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`* corresponds to byte)`
			`*/`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`typedef struct TrieChar`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`{`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`struct TrieChar *nextChar;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`char *replaceTo;`
			`int replacelen;`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`} TrieChar;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`/*`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`* placeChar - put str into trie's structure, byte by byte.`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*/`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`static TrieChar *`
			`placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for release 9.3 This is the first run of the Perl-based pgindent script. Also update pgindent instructions. 2013-05-29 22:58:43 +02:00			`TrieChar *curnode;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (!node)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`node = palloc(sizeof(TrieChar) * 256);`
			`memset(node, 0, sizeof(TrieChar) * 256);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`

			`curnode = node + *str;`

pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (lenstr == 1)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (curnode->replaceTo)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`elog(WARNING, "duplicate TO argument, use first one");`
			`else`
			`{`
			`curnode->replacelen = replacelen;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`curnode->replaceTo = palloc(replacelen);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`memcpy(curnode->replaceTo, replaceTo, replacelen);`
			`}`
			`}`
			`else`
			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`

			`return node;`
			`}`

			`/*`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`* initTrie - create trie from file.`
			`*`
			`* Function converts UTF8-encoded file into current encoding.`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*/`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`static TrieChar *`
			`initTrie(char *filename)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for release 9.3 This is the first run of the Perl-based pgindent script. Also update pgindent instructions. 2013-05-29 22:58:43 +02:00			`TrieChar *volatile rootTrie = NULL;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`MemoryContext ccxt = CurrentMemoryContext;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`tsearch_readline_state trst;`
			`volatile bool skip;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`filename = get_tsearch_config_filename(filename, "rules");`
			`if (!tsearch_readline_begin(&trst, filename))`
			`ereport(ERROR,`
			`(errcode(ERRCODE_CONFIG_FILE_ERROR),`
			`errmsg("could not open unaccent file \"%s\": %m",`
			`filename)));`

pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`do`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
Fix assorted bugs in contrib/unaccent's configuration file parsing. Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches. 2011-11-07 17:48:53 +01:00			`/*`
			`* pg_do_encoding_conversion() (called by tsearch_readline()) will`
			`* emit exception if it finds untranslatable characters in current`
			`* locale. We just skip such lines, continuing with the next.`
			`*/`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`skip = true;`

			`PG_TRY();`
			`{`
Fix assorted bugs in contrib/unaccent's configuration file parsing. Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches. 2011-11-07 17:48:53 +01:00			`char *line;`

Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`while ((line = tsearch_readline(&trst)) != NULL)`
			`{`
Allow empty replacement strings in contrib/unaccent. This is useful in languages where diacritic signs are represented as separate characters; it's also one step towards letting unaccent be used for arbitrary substring substitutions. In passing, improve the user documentation for unaccent, which was sadly vague about some important details. Mohammad Alhashash, reviewed by Abhijit Menon-Sen 2014-07-01 02:51:26 +02:00			`/*----------`
			`* The format of each line must be "src" or "src trg", where`
			`* src and trg are sequences of one or more non-whitespace`
			`* characters, separated by whitespace. Whitespace at start`
			`* or end of line is ignored. If trg is omitted, an empty`
			`* string is used as the replacement.`
			`*`
			`* We use a simple state machine, with states`
			`* 0 initial (before src)`
			`* 1 in src`
			`* 2 in whitespace after src`
			`* 3 in trg`
			`* 4 in whitespace after trg`
			`* -1 syntax error detected (line will be ignored)`
			`*----------`
Fix assorted bugs in contrib/unaccent's configuration file parsing. Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches. 2011-11-07 17:48:53 +01:00			`*/`
			`int state;`
			`char *ptr;`
			`char *src = NULL;`
			`char *trg = NULL;`
			`int ptrlen;`
			`int srclen = 0;`
			`int trglen = 0;`

			`state = 0;`
			`for (ptr = line; *ptr; ptr += ptrlen)`
			`{`
			`ptrlen = pg_mblen(ptr);`
			`/* ignore whitespace, but end src or trg */`
			`if (t_isspace(ptr))`
			`{`
			`if (state == 1)`
			`state = 2;`
			`else if (state == 3)`
			`state = 4;`
			`continue;`
			`}`
			`switch (state)`
			`{`
			`case 0:`
			`/* start of src */`
			`src = ptr;`
			`srclen = ptrlen;`
			`state = 1;`
			`break;`
			`case 1:`
			`/* continue src */`
			`srclen += ptrlen;`
			`break;`
			`case 2:`
			`/* start of trg */`
			`trg = ptr;`
			`trglen = ptrlen;`
			`state = 3;`
			`break;`
			`case 3:`
			`/* continue trg */`
			`trglen += ptrlen;`
			`break;`
			`default:`
			`/* bogus line format */`
			`state = -1;`
			`break;`
			`}`
			`}`

Allow empty replacement strings in contrib/unaccent. This is useful in languages where diacritic signs are represented as separate characters; it's also one step towards letting unaccent be used for arbitrary substring substitutions. In passing, improve the user documentation for unaccent, which was sadly vague about some important details. Mohammad Alhashash, reviewed by Abhijit Menon-Sen 2014-07-01 02:51:26 +02:00			`if (state == 1 || state == 2)`
			`{`
			`/* trg was omitted, so use "" */`
			`trg = "";`
			`trglen = 0;`
			`}`

			`if (state > 0)`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`rootTrie = placeChar(rootTrie,`
pgindent run for release 9.3 This is the first run of the Perl-based pgindent script. Also update pgindent instructions. 2013-05-29 22:58:43 +02:00			`(unsigned char *) src, srclen,`
			`trg, trglen);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`pfree(line);`
			`}`
Fix assorted bugs in contrib/unaccent's configuration file parsing. Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches. 2011-11-07 17:48:53 +01:00			`skip = false;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`
			`PG_CATCH();`
			`{`
			`ErrorData *errdata;`
			`MemoryContext ecxt;`

			`ecxt = MemoryContextSwitchTo(ccxt);`
			`errdata = CopyErrorData();`
			`if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)`
			`{`
			`FlushErrorState();`
			`}`
			`else`
			`{`
			`MemoryContextSwitchTo(ecxt);`
			`PG_RE_THROW();`
			`}`
			`}`
			`PG_END_TRY();`
			`}`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`while (skip);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`tsearch_readline_end(&trst);`

The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`return rootTrie;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`

			`/*`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`* findReplaceTo - find multibyte character in trie`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*/`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`static TrieChar *`
			`findReplaceTo(TrieChar *node, unsigned char *src, int srclen)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`while (node)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
			`node = node + *src;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (srclen == 1)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`return node;`

			`src++;`
			`srclen--;`
			`node = node->nextChar;`
			`}`

			`return NULL;`
			`}`

			`PG_FUNCTION_INFO_V1(unaccent_init);`
			`Datum`
			`unaccent_init(PG_FUNCTION_ARGS)`
			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`List *dictoptions = (List *) PG_GETARG_POINTER(0);`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`TrieChar *rootTrie = NULL;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`bool fileloaded = false;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`ListCell *l;`

			`foreach(l, dictoptions)`
			`{`
			`DefElem *defel = (DefElem *) lfirst(l);`

			`if (pg_strcasecmp("Rules", defel->defname) == 0)`
			`{`
			`if (fileloaded)`
			`ereport(ERROR,`
			`(errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
			`errmsg("multiple Rules parameters")));`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`rootTrie = initTrie(defGetString(defel));`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`fileloaded = true;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`
			`else`
			`{`
			`ereport(ERROR,`
			`(errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
			`errmsg("unrecognized Unaccent parameter: \"%s\"",`
			`defel->defname)));`
			`}`
			`}`

			`if (!fileloaded)`
			`{`
			`ereport(ERROR,`
			`(errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
			`errmsg("missing Rules parameter")));`
			`}`

The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`PG_RETURN_POINTER(rootTrie);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`

			`PG_FUNCTION_INFO_V1(unaccent_lexize);`
			`Datum`
			`unaccent_lexize(PG_FUNCTION_ARGS)`
			`{`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`char *srcchar = (char *) PG_GETARG_POINTER(1);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`int32 len = PG_GETARG_INT32(2);`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`char *srcstart,`
			`*trgchar = NULL;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`int charlen;`
			`TSLexeme *res = NULL;`
The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`TrieChar *node;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`srcstart = srcchar;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`while (srcchar - srcstart < len)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
			`charlen = pg_mblen(srcchar);`

The data structure used in unaccent is a trie, not suffix tree. Fix the term used in variable and struct names, and comments. Alexander Korotkov 2013-05-08 19:57:42 +02:00			`node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (node && node->replaceTo)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (!res)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
Fix some typos Josh Kupershmidt 2012-04-22 18:23:47 +02:00			`/* allocate res only if it's needed */`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`res = palloc0(sizeof(TSLexeme) * 2);`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`res->flags = TSL_FILTER;`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (srcchar != srcstart)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
			`memcpy(trgchar, srcstart, srcchar - srcstart);`
			`trgchar += (srcchar - srcstart);`
			`}`
			`}`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`memcpy(trgchar, node->replaceTo, node->replacelen);`
			`trgchar += node->replacelen;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`}`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`else if (res)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`memcpy(trgchar, srcchar, charlen);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`trgchar += charlen;`
			`}`

			`srcchar += charlen;`
			`}`

pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (res)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`*trgchar = '\0';`

			`PG_RETURN_POINTER(res);`
			`}`

			`/*`
			`* Function-like wrapper for dictionary`
			`*/`
			`PG_FUNCTION_INFO_V1(unaccent_dict);`
			`Datum`
			`unaccent_dict(PG_FUNCTION_ARGS)`
			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`text *str;`
			`int strArg;`
			`Oid dictOid;`
			`TSDictionaryCacheEntry *dict;`
			`TSLexeme *res;`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`if (PG_NARGS() == 1)`
			`{`
Standardize get_whatever_oid functions for other object types. - Rename TSParserGetPrsid to get_ts_parser_oid. - Rename TSDictionaryGetDictid to get_ts_dict_oid. - Rename TSTemplateGetTmplid to get_ts_template_oid. - Rename TSConfigGetCfgid to get_ts_config_oid. - Rename FindConversionByName to get_conversion_oid. - Rename GetConstraintName to get_constraint_oid. - Add new functions get_opclass_oid, get_opfamily_oid, get_rewrite_oid, get_rewrite_oid_without_relid, get_trigger_oid, and get_cast_oid. The name of each function matches the corresponding catalog. Thanks to KaiGai Kohei for the review. 2010-08-05 17:25:36 +02:00			`dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`strArg = 0;`
			`}`
			`else`
			`{`
			`dictOid = PG_GETARG_OID(0);`
			`strArg = 1;`
			`}`
			`str = PG_GETARG_TEXT_P(strArg);`

			`dict = lookup_ts_dictionary_cache(dictOid);`

			`res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`PointerGetDatum(dict->dictData),`
			`PointerGetDatum(VARDATA(str)),`
			`Int32GetDatum(VARSIZE(str) - VARHDRSZ),`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`PointerGetDatum(NULL)));`

			`PG_FREE_IF_COPY(str, strArg);`

pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`if (res == NULL)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
			`PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));`
			`}`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`else if (res->lexeme == NULL)`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00			`{`
			`pfree(res);`
			`PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));`
			`}`
			`else`
			`{`
pgindent run for 9.0 2010-02-26 03:01:40 +01:00			`text *txt = cstring_to_text(res->lexeme);`
Unaccent dictionary. 2009-08-18 12:34:39 +02:00
			`pfree(res->lexeme);`
			`pfree(res);`

			`PG_RETURN_TEXT_P(txt);`
			`}`
			`}`