/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.3 2009/08/18 15:57:26 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <ctype.h>

#include "fmgr.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;

/*
 * Unaccent dictionary uses uncompressed suffix tree to find a
 * character to replace. Each node of tree is an array of
 * SuffixChar struct with length = 256 (n-th element of array
 * corresponds to byte)
 */
typedef struct SuffixChar
{
	struct SuffixChar *nextChar;	/* child node for the next byte, or NULL */
	char	   *replaceTo;		/* replacement bytes (not NUL-terminated),
								 * or NULL if no rule ends here */
	int			replacelen;		/* number of bytes in replaceTo */
} SuffixChar;

/* Number of slots in one tree node: one per possible byte value. */
#define SUFFIX_NODE_SIZE 256

/*
 * placeChar - put str into tree's structure, byte by byte.
 *
 * node			root of the (sub)tree to insert into; NULL means "allocate one"
 * str, lenstr	bytes of the source character still to be consumed
 * replaceTo, replacelen
 *				replacement bytes to attach to the final byte of str
 *
 * Returns the (possibly freshly allocated) node, so the caller can store it
 * back into the parent link.  If a replacement already exists for str, a
 * WARNING is raised and the first definition is kept.
 */
static SuffixChar *
placeChar(SuffixChar *node, unsigned char *str, int lenstr,
		  char *replaceTo, int replacelen)
{
	SuffixChar *curnode;

	/* Nothing to insert; also prevents endless recursion on bad lengths. */
	if (lenstr <= 0)
		return node;

	if (!node)
		node = palloc0(sizeof(SuffixChar) * SUFFIX_NODE_SIZE);

	curnode = node + *str;

	if (lenstr == 1)
	{
		if (curnode->replaceTo)
			elog(WARNING, "duplicate TO argument, use first one");
		else
		{
			curnode->replacelen = replacelen;
			curnode->replaceTo = palloc(replacelen);
			memcpy(curnode->replaceTo, replaceTo, replacelen);
		}
	}
	else
	{
		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
									  replaceTo, replacelen);
	}

	return node;
}

/*
 * parseRuleLine - locate the first two whitespace-delimited tokens of line.
 *
 * This accepts the same lines that sscanf(line, "%s\t%s\n", ...) would
 * (tokens are delimited by isspace() characters and anything after the
 * second token is ignored), but it points into the line itself instead of
 * copying into fixed-size buffers, so tokens of any length are safe.
 *
 * Returns true and fills the output arguments if both tokens are present.
 */
static bool
parseRuleLine(char *line, char **src, int *srclen, char **trg, int *trglen)
{
	char	   *p = line;

	while (*p && isspace((unsigned char) *p))
		p++;
	*src = p;
	while (*p && !isspace((unsigned char) *p))
		p++;
	*srclen = p - *src;

	while (*p && isspace((unsigned char) *p))
		p++;
	*trg = p;
	while (*p && !isspace((unsigned char) *p))
		p++;
	*trglen = p - *trg;

	return *srclen > 0 && *trglen > 0;
}

/*
 * initSuffixTree - create suffix tree from file. Function converts
 * UTF8-encoded file into current encoding.
 *
 * filename is the dictionary's Rules option; it is resolved through
 * get_tsearch_config_filename() with the "rules" extension.  Raises an
 * ERROR if the file cannot be opened.  Returns the root of the tree, or
 * NULL if the file contained no usable rule.
 */
static SuffixChar *
initSuffixTree(char *filename)
{
	SuffixChar *volatile rootSuffixTree = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * Assume we will have to come back for more lines; this is cleared
		 * only once the reading loop below has run to end of file.
		 */
		skip = true;

		PG_TRY();
		{
			char	   *line;

			/*
			 * pg_do_encoding_conversion() (called by tsearch_readline())
			 * will emit exception if it finds untranslatable characters in
			 * current locale. We just skip such lines and carry on with the
			 * next one.
			 */
			while ((line = tsearch_readline(&trst)) != NULL)
			{
				char	   *src;
				char	   *trg;
				int			srclen;
				int			trglen;

				if (parseRuleLine(line, &src, &srclen, &trg, &trglen))
					rootSuffixTree = placeChar(rootSuffixTree,
											   (unsigned char *) src, srclen,
											   trg, trglen);

				/* placeChar() copied what it needs; release every line. */
				pfree(line);
			}

			/* Reached end of file without error: we are done. */
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				/* Swallow the error; the outer loop resumes reading. */
				FlushErrorState();
				FreeErrorData(errdata);
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}

/*
 * findReplaceTo - find multibyte character in tree
 *
 * Walks the tree along the srclen bytes at src.  Returns the slot for the
 * last byte (whose replaceTo may still be NULL), or NULL if the path does
 * not exist in the tree.
 */
static SuffixChar *
findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
{
	while (node)
	{
		node = node + *src;
		if (srclen == 1)
			return node;

		src++;
		srclen--;
		node = node->nextChar;
	}

	return NULL;
}

/*
 * appendBytes - append datalen bytes to a palloc'd output buffer.
 *
 * The buffer is enlarged with repalloc() when needed, always keeping one
 * spare byte for a trailing '\0'.  *buflen and *used are updated; the
 * (possibly moved) buffer is returned.
 */
static char *
appendBytes(char *buf, Size *buflen, Size *used,
			const char *data, Size datalen)
{
	Size		needed = *used + datalen + 1;

	if (needed > *buflen)
	{
		Size		newlen = *buflen * 2;

		if (newlen < needed)
			newlen = needed;
		buf = repalloc(buf, newlen);
		*buflen = newlen;
	}

	memcpy(buf + *used, data, datalen);
	*used += datalen;

	return buf;
}

/*
 * unaccent_init - dictionary init method.
 *
 * Argument 0 is the list of dictionary options (DefElem nodes).  Exactly
 * one "Rules" option is required; any other option, a repeated Rules
 * option, or a missing Rules option raises an ERROR.  Returns the root of
 * the tree built from the rules file.
 */
PG_FUNCTION_INFO_V1(unaccent_init);
Datum		unaccent_init(PG_FUNCTION_ARGS);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	SuffixChar *rootSuffixTree = NULL;
	bool		fileloaded = false;
	ListCell   *l;

	foreach(l, dictoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);

		if (pg_strcasecmp("Rules", defel->defname) == 0)
		{
			if (fileloaded)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple Rules parameters")));
			rootSuffixTree = initSuffixTree(defGetString(defel));
			fileloaded = true;
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Unaccent parameter: \"%s\"",
							defel->defname)));
		}
	}

	if (!fileloaded)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Rules parameter")));
	}

	PG_RETURN_POINTER(rootSuffixTree);
}

/*
 * unaccent_lexize - dictionary lexize method.
 *
 * Argument 0 is the tree returned by unaccent_init, argument 1 the input
 * bytes and argument 2 their length.  Returns NULL if no character of the
 * input has a replacement; otherwise returns a two-element TSLexeme array
 * (the second element is zeroed) whose first entry carries the rewritten,
 * NUL-terminated string and the TSL_FILTER flag.
 */
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum		unaccent_lexize(PG_FUNCTION_ARGS);
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
	SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
	char	   *srcstart = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *srcchar = srcstart;
	TSLexeme   *res = NULL;
	char	   *buf = NULL;
	Size		buflen = 0;
	Size		used = 0;

	while (srcchar - srcstart < len)
	{
		int			remaining = len - (int) (srcchar - srcstart);
		int			charlen = pg_mblen(srcchar);
		SuffixChar *node;

		/*
		 * pg_mblen() is not told where the input ends, so never let a
		 * truncated trailing character carry us past the buffer.
		 */
		if (charlen > remaining)
			charlen = remaining;

		node = findReplaceTo(rootSuffixTree,
							 (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			if (!res)
			{
				/* allocate res only if it's needed */
				res = palloc0(sizeof(TSLexeme) * 2);
				res->flags = TSL_FILTER;

				/*
				 * This is only an initial estimate: a replacement may be
				 * longer than any single character of the encoding, so
				 * appendBytes() enlarges the buffer on demand.
				 */
				buflen = (Size) len * pg_database_encoding_max_length()
					+ 1 /* \0 */ ;
				buf = palloc(buflen);

				/* copy the unchanged prefix seen so far */
				if (srcchar != srcstart)
					buf = appendBytes(buf, &buflen, &used,
									  srcstart, srcchar - srcstart);
			}

			buf = appendBytes(buf, &buflen, &used,
							  node->replaceTo, node->replacelen);
		}
		else if (res)
		{
			buf = appendBytes(buf, &buflen, &used, srcchar, charlen);
		}

		srcchar += charlen;
	}

	if (res)
	{
		/* appendBytes() always leaves room for the terminator */
		buf[used] = '\0';
		res->lexeme = buf;
	}

	PG_RETURN_POINTER(res);
}

/*
 * Function-like wrapper for dictionary
 *
 * Called with one argument (the text), the dictionary named "unaccent" is
 * looked up; called with two, argument 0 is the dictionary OID and
 * argument 1 the text.  Returns the rewritten text, or a copy of the input
 * if the dictionary's lexize method produced no result.
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum		unaccent_dict(PG_FUNCTION_ARGS);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	text	   *str;
	int			strArg;
	Oid			dictOid;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (PG_NARGS() == 1)
	{
		dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
		strArg = 0;
	}
	else
	{
		dictOid = PG_GETARG_OID(0);
		strArg = 1;
	}
	str = PG_GETARG_TEXT_P(strArg);

	dict = lookup_ts_dictionary_cache(dictOid);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(VARDATA(str)),
									   Int32GetDatum(VARSIZE(str) - VARHDRSZ),
													 PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(str, strArg);

	if (res == NULL)
	{
		/* nothing was replaced: hand back a copy of the input */
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else if (res->lexeme == NULL)
	{
		pfree(res);
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
	}
	else
	{
		text	   *txt = cstring_to_text(res->lexeme);

		pfree(res->lexeme);
		pfree(res);

		PG_RETURN_TEXT_P(txt);
	}
}