Cleanup for some problems in tsearch patch:

- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, with database
  encoding, but it's supposed to be UTF-8; similar bug for synonym files
- bunch of comments added, typos fixed, and other cleanup

Introduced consistent encoding checking/conversion of data read from tsearch
configuration files, by doing this in a single t_readline() subroutine
(replacing direct usages of fgets).  Cleaned up API for readstoplist too.

Heikki Linnakangas
This commit is contained in:
Tom Lane 2007-08-25 00:03:59 +00:00
parent b918bf86c6
commit 7351b5fa17
14 changed files with 344 additions and 331 deletions

View File

@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ * $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -192,7 +192,6 @@ dsnowball_init(PG_FUNCTION_ARGS)
ListCell *l; ListCell *l;
d = (DictSnowball *) palloc0(sizeof(DictSnowball)); d = (DictSnowball *) palloc0(sizeof(DictSnowball));
d->stoplist.wordop = recode_and_lowerstr;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
@ -204,8 +203,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters"))); errmsg("multiple StopWords parameters")));
readstoplist(defGetString(defel), &d->stoplist); readstoplist(defGetString(defel), &d->stoplist, lowerstr);
sortstoplist(&d->stoplist);
stoploaded = true; stoploaded = true;
} }
else if (pg_strcasecmp("Language", defel->defname) == 0) else if (pg_strcasecmp("Language", defel->defname) == 0)

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -39,7 +39,6 @@ dispell_init(PG_FUNCTION_ARGS)
ListCell *l; ListCell *l;
d = (DictISpell *) palloc0(sizeof(DictISpell)); d = (DictISpell *) palloc0(sizeof(DictISpell));
d->stoplist.wordop = recode_and_lowerstr;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
@ -73,8 +72,7 @@ dispell_init(PG_FUNCTION_ARGS)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters"))); errmsg("multiple StopWords parameters")));
readstoplist(defGetString(defel), &(d->stoplist)); readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
sortstoplist(&(d->stoplist));
stoploaded = true; stoploaded = true;
} }
else else

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -23,19 +23,17 @@
typedef struct typedef struct
{ {
StopList stoplist; StopList stoplist;
} DictExample; } DictSimple;
Datum Datum
dsimple_init(PG_FUNCTION_ARGS) dsimple_init(PG_FUNCTION_ARGS)
{ {
List *dictoptions = (List *) PG_GETARG_POINTER(0); List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictExample *d = (DictExample *) palloc0(sizeof(DictExample)); DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
bool stoploaded = false; bool stoploaded = false;
ListCell *l; ListCell *l;
d->stoplist.wordop = recode_and_lowerstr;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
DefElem *defel = (DefElem *) lfirst(l); DefElem *defel = (DefElem *) lfirst(l);
@ -46,8 +44,7 @@ dsimple_init(PG_FUNCTION_ARGS)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters"))); errmsg("multiple StopWords parameters")));
readstoplist(defGetString(defel), &d->stoplist); readstoplist(defGetString(defel), &d->stoplist, lowerstr);
sortstoplist(&d->stoplist);
stoploaded = true; stoploaded = true;
} }
else else
@ -65,16 +62,16 @@ dsimple_init(PG_FUNCTION_ARGS)
Datum Datum
dsimple_lexize(PG_FUNCTION_ARGS) dsimple_lexize(PG_FUNCTION_ARGS)
{ {
DictExample *d = (DictExample *) PG_GETARG_POINTER(0); DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1); char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2); int32 len = PG_GETARG_INT32(2);
char *txt = lowerstr_with_len(in, len); char *txt;
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
txt = lowerstr_with_len(in, len);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
pfree(txt); pfree(txt);
}
else else
res[0].lexeme = txt; res[0].lexeme = txt;

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.2 2007/08/22 04:13:15 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -20,9 +20,6 @@
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#define SYNBUFLEN 4096
typedef struct typedef struct
{ {
char *in; char *in;
@ -31,23 +28,34 @@ typedef struct
typedef struct typedef struct
{ {
int len; int len; /* length of syn array */
Syn *syn; Syn *syn;
} DictSyn; } DictSyn;
/*
* Finds the next whitespace-delimited word within the 'in' string.
* Returns a pointer to the first character of the word, and a pointer
* to the next byte after the last character in the word (in *end).
*/
static char * static char *
findwrd(char *in, char **end) findwrd(char *in, char **end)
{ {
char *start; char *start;
*end = NULL; /* Skip leading spaces */
while (*in && t_isspace(in)) while (*in && t_isspace(in))
in += pg_mblen(in); in += pg_mblen(in);
/* Return NULL on empty lines */
if (*in == '\0') if (*in == '\0')
{
*end = NULL;
return NULL; return NULL;
}
start = in; start = in;
/* Find end of word */
while (*in && !t_isspace(in)) while (*in && !t_isspace(in))
in += pg_mblen(in); in += pg_mblen(in);
@ -70,12 +78,11 @@ dsynonym_init(PG_FUNCTION_ARGS)
ListCell *l; ListCell *l;
char *filename = NULL; char *filename = NULL;
FILE *fin; FILE *fin;
char buf[SYNBUFLEN];
char *starti, char *starti,
*starto, *starto,
*end = NULL; *end = NULL;
int cur = 0; int cur = 0;
int slen; char *line = NULL;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
@ -105,10 +112,33 @@ dsynonym_init(PG_FUNCTION_ARGS)
d = (DictSyn *) palloc0(sizeof(DictSyn)); d = (DictSyn *) palloc0(sizeof(DictSyn));
while (fgets(buf, SYNBUFLEN, fin)) while ((line = t_readline(fin)) != NULL)
{ {
slen = strlen(buf); starti = findwrd(line, &end);
pg_verifymbstr(buf, slen, false); if (!starti)
{
/* Empty line */
goto skipline;
}
*end = '\0';
if (end >= line + strlen(line))
{
/* A line with only one word. Ignore silently. */
goto skipline;
}
starto = findwrd(end + 1, &end);
if (!starto)
{
/* A line with only one word. Ignore silently. */
goto skipline;
}
*end = '\0';
/* starti now points to the first word, and starto to the second
* word on the line, with a \0 terminator at the end of both words.
*/
if (cur == d->len) if (cur == d->len)
{ {
if (d->len == 0) if (d->len == 0)
@ -123,35 +153,18 @@ dsynonym_init(PG_FUNCTION_ARGS)
} }
} }
starti = findwrd(buf, &end); d->syn[cur].in = lowerstr(starti);
if (!starti) d->syn[cur].out = lowerstr(starto);
continue;
*end = '\0';
if (end >= buf + slen)
continue;
starto = findwrd(end + 1, &end);
if (!starto)
continue;
*end = '\0';
d->syn[cur].in = recode_and_lowerstr(starti);
d->syn[cur].out = recode_and_lowerstr(starto);
if (!(d->syn[cur].in && d->syn[cur].out))
{
FreeFile(fin);
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
cur++; cur++;
skipline:
pfree(line);
} }
FreeFile(fin); FreeFile(fin);
d->len = cur; d->len = cur;
if (cur > 1)
qsort(d->syn, d->len, sizeof(Syn), compareSyn); qsort(d->syn, d->len, sizeof(Syn), compareSyn);
PG_RETURN_POINTER(d); PG_RETURN_POINTER(d);
@ -179,8 +192,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
if (!found) if (!found)
PG_RETURN_POINTER(NULL); PG_RETURN_POINTER(NULL);
res = palloc(sizeof(TSLexeme) * 2); res = palloc0(sizeof(TSLexeme) * 2);
memset(res, 0, sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out); res[0].lexeme = pstrdup(found->out);
PG_RETURN_POINTER(res); PG_RETURN_POINTER(res);

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -170,10 +170,10 @@ static void
thesaurusRead(char *filename, DictThesaurus * d) thesaurusRead(char *filename, DictThesaurus * d)
{ {
FILE *fh; FILE *fh;
char str[BUFSIZ];
int lineno = 0; int lineno = 0;
uint16 idsubst = 0; uint16 idsubst = 0;
bool useasis = false; bool useasis = false;
char *line;
filename = get_tsearch_config_filename(filename, "ths"); filename = get_tsearch_config_filename(filename, "ths");
fh = AllocateFile(filename, "r"); fh = AllocateFile(filename, "r");
@ -183,27 +183,28 @@ thesaurusRead(char *filename, DictThesaurus * d)
errmsg("could not open thesaurus file \"%s\": %m", errmsg("could not open thesaurus file \"%s\": %m",
filename))); filename)));
while (fgets(str, sizeof(str), fh)) while ((line = t_readline(fh)) != NULL)
{ {
char *ptr, char *ptr;
*recoded;
int state = TR_WAITLEX; int state = TR_WAITLEX;
char *beginwrd = NULL; char *beginwrd = NULL;
uint16 posinsubst = 0; uint16 posinsubst = 0;
uint16 nwrd = 0; uint16 nwrd = 0;
ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
GetDatabaseEncoding(), PG_UTF8);
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
lineno++; lineno++;
/* is it comment ? */ ptr = line;
while (t_isspace(ptr))
/* is it a comment? */
while (*ptr && t_isspace(ptr))
ptr += pg_mblen(ptr); ptr += pg_mblen(ptr);
if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
if (t_iseq(ptr, '#') || *ptr == '\0' ||
t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
{
pfree(line);
continue; continue;
}
while (*ptr) while (*ptr)
{ {
@ -301,8 +302,7 @@ thesaurusRead(char *filename, DictThesaurus * d)
lineno, filename))); lineno, filename)));
} }
if (recoded != str) pfree(line);
pfree(recoded);
} }
d->nsubst = idsubst; d->nsubst = idsubst;

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -21,8 +21,11 @@
/* /*
* during initialization dictionary requires a lot * Initialization requires a lot of memory that's not needed
* of memory, so it will use temporary context * after the initialization is done. In init function,
* CurrentMemoryContext is a long lived memory context associated
* with the dictionary cache entry, so we use a temporary context
* for the short-lived stuff.
*/ */
static MemoryContext tmpCtx = NULL; static MemoryContext tmpCtx = NULL;
@ -32,6 +35,9 @@ static MemoryContext tmpCtx = NULL;
static void static void
checkTmpCtx(void) checkTmpCtx(void)
{ {
/* XXX: This assumes that CurrentMemoryContext doesn't have
* any children other than the one we create here.
*/
if (CurrentMemoryContext->firstchild == NULL) if (CurrentMemoryContext->firstchild == NULL)
{ {
tmpCtx = AllocSetContextCreate(CurrentMemoryContext, tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
@ -74,17 +80,7 @@ cmpspell(const void *s1, const void *s2)
static int static int
cmpspellaffix(const void *s1, const void *s2) cmpspellaffix(const void *s1, const void *s2)
{ {
return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag)); return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
}
static char *
strnduplicate(char *s, int len)
{
char *d = (char *) palloc(len + 1);
memcpy(d, s, len);
d[len] = '\0';
return d;
} }
static char * static char *
@ -185,7 +181,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
} }
Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
strcpy(Conf->Spell[Conf->nspell]->word, word); strcpy(Conf->Spell[Conf->nspell]->word, word);
strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16); strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
Conf->nspell++; Conf->nspell++;
} }
@ -197,9 +193,8 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
void void
NIImportDictionary(IspellDict * Conf, const char *filename) NIImportDictionary(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ],
*pstr;
FILE *dict; FILE *dict;
char *line;
checkTmpCtx(); checkTmpCtx();
@ -209,19 +204,14 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
errmsg("could not open dictionary file \"%s\": %m", errmsg("could not open dictionary file \"%s\": %m",
filename))); filename)));
while (fgets(str, sizeof(str), dict)) while ((line = t_readline(dict)) != NULL)
{ {
char *s, char *s, *pstr;
*recoded;
const char *flag; const char *flag;
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), /* Extract flag from the line */
PG_UTF8, GetDatabaseEncoding());
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
flag = NULL; flag = NULL;
if ((s = findchar(recoded, '/'))) if ((s = findchar(line, '/')))
{ {
*s++ = '\0'; *s++ = '\0';
flag = s; flag = s;
@ -240,8 +230,8 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
else else
flag = ""; flag = "";
/* Remove trailing spaces */
s = recoded; s = line;
while (*s) while (*s)
{ {
if (t_isspace(s)) if (t_isspace(s))
@ -251,13 +241,12 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
} }
s += pg_mblen(s); s += pg_mblen(s);
} }
pstr = lowerstr_ctx(recoded); pstr = lowerstr_ctx(line);
NIAddSpell(Conf, pstr, flag); NIAddSpell(Conf, pstr, flag);
pfree(pstr); pfree(pstr);
if (recoded != str) pfree(line);
pfree(recoded);
} }
FreeFile(dict); FreeFile(dict);
} }
@ -402,7 +391,7 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
static bool static bool
parse_affentry(char *str, char *mask, char *find, char *repl, parse_affentry(char *str, char *mask, char *find, char *repl,
const char *filename, int line) const char *filename, int lineno)
{ {
int state = PAE_WAIT_MASK; int state = PAE_WAIT_MASK;
char *pmask = mask, char *pmask = mask,
@ -453,7 +442,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"", errmsg("syntax error at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
} }
else if (state == PAE_INFIND) else if (state == PAE_INFIND)
{ {
@ -471,7 +460,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"", errmsg("syntax error at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
} }
else if (state == PAE_WAIT_REPL) else if (state == PAE_WAIT_REPL)
{ {
@ -489,7 +478,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"", errmsg("syntax error at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
} }
else if (state == PAE_INREPL) else if (state == PAE_INREPL)
{ {
@ -507,7 +496,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"", errmsg("syntax error at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
} }
else else
elog(ERROR, "unknown state in parse_affentry: %d", state); elog(ERROR, "unknown state in parse_affentry: %d", state);
@ -522,7 +511,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl,
static void static void
addFlagValue(IspellDict * Conf, char *s, uint32 val, addFlagValue(IspellDict * Conf, char *s, uint32 val,
const char *filename, int line) const char *filename, int lineno)
{ {
while (*s && t_isspace(s)) while (*s && t_isspace(s))
s++; s++;
@ -531,13 +520,13 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val,
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"", errmsg("syntax error at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
if (pg_mblen(s) != 1) if (pg_mblen(s) != 1)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
Conf->flagval[(unsigned int) *s] = (unsigned char) val; Conf->flagval[(unsigned int) *s] = (unsigned char) val;
Conf->usecompound = true; Conf->usecompound = true;
@ -546,7 +535,6 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val,
static void static void
NIImportOOAffixes(IspellDict * Conf, const char *filename) NIImportOOAffixes(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ];
char type[BUFSIZ], char type[BUFSIZ],
*ptype = NULL; *ptype = NULL;
char sflag[BUFSIZ]; char sflag[BUFSIZ];
@ -560,9 +548,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
int flag = 0; int flag = 0;
char flagflags = 0; char flagflags = 0;
FILE *affix; FILE *affix;
int line = 0; int lineno = 0;
int scanread = 0; int scanread = 0;
char scanbuf[BUFSIZ]; char scanbuf[BUFSIZ];
char *recoded;
checkTmpCtx(); checkTmpCtx();
@ -576,45 +565,41 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
errmsg("could not open affix file \"%s\": %m", errmsg("could not open affix file \"%s\": %m",
filename))); filename)));
while (fgets(str, sizeof(str), affix)) while ((recoded = t_readline(affix)) != NULL)
{ {
char *recoded; lineno++;
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
PG_UTF8, GetDatabaseEncoding());
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
line++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
{
pfree(recoded);
continue; continue;
}
if (STRNCMP(recoded, "COMPOUNDFLAG") == 0) if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"), addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
FF_COMPOUNDFLAG, filename, line); FF_COMPOUNDFLAG, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0) else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"), addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
FF_COMPOUNDBEGIN, filename, line); FF_COMPOUNDBEGIN, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDLAST") == 0) else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"), addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
FF_COMPOUNDLAST, filename, line); FF_COMPOUNDLAST, filename, lineno);
/* COMPOUNDLAST and COMPOUNDEND are synonyms */ /* COMPOUNDLAST and COMPOUNDEND are synonyms */
else if (STRNCMP(recoded, "COMPOUNDEND") == 0) else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDEND"), addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
FF_COMPOUNDLAST, filename, line); FF_COMPOUNDLAST, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0) else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"), addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
FF_COMPOUNDMIDDLE, filename, line); FF_COMPOUNDMIDDLE, filename, lineno);
else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0) else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"), addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
FF_COMPOUNDONLY, filename, line); FF_COMPOUNDONLY, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0) else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"), addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
FF_COMPOUNDPERMITFLAG, filename, line); FF_COMPOUNDPERMITFLAG, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0) else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"), addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
FF_COMPOUNDFORBIDFLAG, filename, line); FF_COMPOUNDFORBIDFLAG, filename, lineno);
else if (STRNCMP(recoded, "FLAG") == 0) else if (STRNCMP(recoded, "FLAG") == 0)
{ {
char *s = recoded + strlen("FLAG"); char *s = recoded + strlen("FLAG");
@ -626,14 +611,13 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"", errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
} }
if (recoded != str)
pfree(recoded); pfree(recoded);
} }
FreeFile(affix); FreeFile(affix);
line = 0; lineno = 0;
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
@ -643,18 +627,11 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
errmsg("could not open affix file \"%s\": %m", errmsg("could not open affix file \"%s\": %m",
filename))); filename)));
while (fgets(str, sizeof(str), affix)) while ((recoded = t_readline(affix)) != NULL)
{ {
char *recoded; lineno++;
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
PG_UTF8, GetDatabaseEncoding());
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
line++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
continue; goto nextline;
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
@ -662,12 +639,12 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
pfree(ptype); pfree(ptype);
ptype = lowerstr_ctx(type); ptype = lowerstr_ctx(type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
continue; goto nextline;
if (scanread == 4) if (scanread == 4)
{ {
if (strlen(sflag) != 1) if (strlen(sflag) != 1)
continue; goto nextline;
flag = *sflag; flag = *sflag;
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
pfind = lowerstr_ctx(find); pfind = lowerstr_ctx(find);
@ -683,7 +660,7 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
int aflg = 0; int aflg = 0;
if (strlen(sflag) != 1 || flag != *sflag || flag == 0) if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
continue; goto nextline;
prepl = lowerstr_ctx(repl); prepl = lowerstr_ctx(repl);
/* affix flag */ /* affix flag */
if ((ptr = strchr(prepl, '/')) != NULL) if ((ptr = strchr(prepl, '/')) != NULL)
@ -710,7 +687,7 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
pfree(pmask); pfree(pmask);
} }
if (recoded != str) nextline:
pfree(recoded); pfree(recoded);
} }
@ -733,13 +710,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
char find[BUFSIZ]; char find[BUFSIZ];
char repl[BUFSIZ]; char repl[BUFSIZ];
char *s; char *s;
int suffixes = 0; bool suffixes = false;
int prefixes = 0; bool prefixes = false;
int flag = 0; int flag = 0;
char flagflags = 0; char flagflags = 0;
FILE *affix; FILE *affix;
int line = 0; int lineno = 0;
int oldformat = 0; bool oldformat = false;
char *recoded = NULL;
checkTmpCtx(); checkTmpCtx();
@ -752,16 +730,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
memset(Conf->flagval, 0, sizeof(Conf->flagval)); memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false; Conf->usecompound = false;
while (fgets(str, sizeof(str), affix)) while ((recoded = t_readline(affix)) != NULL)
{ {
if (pstr) pstr = lowerstr(recoded);
pfree(pstr); pfree(recoded);
pstr = recode_and_lowerstr(str); lineno++;
line++; /* Skip comments and empty lines */
if (*pstr == '#' || *pstr == '\n') if (*pstr == '#' || *pstr == '\n')
continue; goto nextline;
if (STRNCMP(pstr, "compoundwords") == 0) if (STRNCMP(pstr, "compoundwords") == 0)
{ {
@ -777,23 +755,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG; Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
Conf->usecompound = true; Conf->usecompound = true;
} }
oldformat++; oldformat = true;
continue; goto nextline;
} }
} }
if (STRNCMP(pstr, "suffixes") == 0) if (STRNCMP(pstr, "suffixes") == 0)
{ {
suffixes = 1; suffixes = true;
prefixes = 0; prefixes = false;
oldformat++; oldformat = true;
continue; goto nextline;
} }
if (STRNCMP(pstr, "prefixes") == 0) if (STRNCMP(pstr, "prefixes") == 0)
{ {
suffixes = 0; suffixes = false;
prefixes = 1; prefixes = true;
oldformat++; oldformat = true;
continue; goto nextline;
} }
if (STRNCMP(pstr, "flag") == 0) if (STRNCMP(pstr, "flag") == 0)
{ {
@ -802,14 +780,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
while (*s && t_isspace(s)) while (*s && t_isspace(s))
s++; s++;
oldformat++; oldformat = true;
/* allow only single-encoded flags */ /* allow only single-encoded flags */
if (pg_mblen(s) != 1) if (pg_mblen(s) != 1)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
if (*s == '*') if (*s == '*')
{ {
@ -830,10 +808,10 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
flag = (unsigned char) *s; flag = (unsigned char) *s;
continue; goto nextline;
} }
if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 || if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0) STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
@ -842,23 +820,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR), (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("wrong affix file format for flag at line %d of affix file \"%s\"", errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
line, filename))); lineno, filename)));
FreeFile(affix); FreeFile(affix);
NIImportOOAffixes(Conf, filename); NIImportOOAffixes(Conf, filename);
return; return;
} }
if ((!suffixes) && (!prefixes)) if ((!suffixes) && (!prefixes))
continue; goto nextline;
if (!parse_affentry(pstr, mask, find, repl, filename, line)) if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
continue; goto nextline;
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
nextline:
pfree(pstr);
} }
FreeFile(affix); FreeFile(affix);
if (pstr)
pfree(pstr);
} }
static int static int
@ -975,38 +953,55 @@ mkSPNode(IspellDict * Conf, int low, int high, int level)
return rs; return rs;
} }
/*
* Builds the Conf->Dictionary tree and AffixData from the imported dictionary
* and affixes.
*/
void void
NISortDictionary(IspellDict * Conf) NISortDictionary(IspellDict * Conf)
{ {
size_t i; int i;
int naffix = 3; int naffix = 0;
int curaffix;
checkTmpCtx(); checkTmpCtx();
/* compress affixes */ /* compress affixes */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
for (i = 1; i < Conf->nspell; i++)
if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
naffix++;
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); /* Count the number of different flags used in the dictionary */
naffix = 1;
Conf->AffixData[0] = pstrdup(""); qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
Conf->Spell[0]->p.d.affix = 1; naffix = 0;
Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word); for (i = 0; i < Conf->nspell; i++)
for (i = 1; i < Conf->nspell; i++)
{
if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
{ {
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
naffix++; naffix++;
Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
} }
Conf->Spell[i]->p.d.affix = naffix;
/*
* Fill in Conf->AffixData with the affixes that were used
* in the dictionary. Replace textual flag-field of Conf->Spell
* entries with indexes into Conf->AffixData array.
*/
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
curaffix = -1;
for (i = 0; i < Conf->nspell; i++)
{
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
{
curaffix++;
Assert(curaffix < naffix);
Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
}
Conf->Spell[i]->p.d.affix = curaffix;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
} }
Conf->lenAffixData = Conf->nAffixData = naffix; Conf->lenAffixData = Conf->nAffixData = naffix;
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
@ -1085,7 +1080,7 @@ mkANode(IspellDict * Conf, int low, int high, int level, int type)
} }
static void static void
mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
{ {
int i, int i,
cnt = 0; cnt = 0;
@ -1145,7 +1140,7 @@ NISortAffixes(IspellDict * Conf)
AFFIX *Affix; AFFIX *Affix;
size_t i; size_t i;
CMPDAffix *ptr; CMPDAffix *ptr;
int firstsuffix = -1; int firstsuffix = Conf->naffixes;
checkTmpCtx(); checkTmpCtx();
@ -1160,7 +1155,7 @@ NISortAffixes(IspellDict * Conf)
for (i = 0; i < Conf->naffixes; i++) for (i = 0; i < Conf->naffixes; i++)
{ {
Affix = &(((AFFIX *) Conf->Affix)[i]); Affix = &(((AFFIX *) Conf->Affix)[i]);
if (Affix->type == FF_SUFFIX && firstsuffix < 0) if (Affix->type == FF_SUFFIX && i < firstsuffix)
firstsuffix = i; firstsuffix = i;
if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
@ -1185,12 +1180,12 @@ NISortAffixes(IspellDict * Conf)
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
mkVoidAffix(Conf, 1, firstsuffix); mkVoidAffix(Conf, true, firstsuffix);
mkVoidAffix(Conf, 0, firstsuffix); mkVoidAffix(Conf, false, firstsuffix);
} }
static AffixNodeData * static AffixNodeData *
FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type) FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
{ {
AffixNodeData *StopLow, AffixNodeData *StopLow,
*StopHigh, *StopHigh,
@ -1374,7 +1369,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
plevel = 0; plevel = 0;
while (pnode) while (pnode)
{ {
prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
if (!prefix) if (!prefix)
break; break;
for (j = 0; j < prefix->naff; j++) for (j = 0; j < prefix->naff; j++)
@ -1398,7 +1393,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
int baselen = 0; int baselen = 0;
/* find possible suffix */ /* find possible suffix */
suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
if (!suffix) if (!suffix)
break; break;
/* foreach suffix check affix */ /* foreach suffix check affix */
@ -1416,7 +1411,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
swrdlen = strlen(newword); swrdlen = strlen(newword);
while (pnode) while (pnode)
{ {
prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
if (!prefix) if (!prefix)
break; break;
for (j = 0; j < prefix->naff; j++) for (j = 0; j < prefix->naff; j++)
@ -1626,7 +1621,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
if (wordlen == level + 1) if (wordlen == level + 1)
{ {
/* well, it was last word */ /* well, it was last word */
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
var->nstem++; var->nstem++;
pfree(notprobed); pfree(notprobed);
return var; return var;
@ -1641,7 +1636,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */ /* we can find next word */
level++; level++;
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos); var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
var->nstem++; var->nstem++;
node = Conf->Dictionary; node = Conf->Dictionary;
startpos = level; startpos = level;
@ -1656,7 +1651,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
level++; level++;
} }
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
var->nstem++; var->nstem++;
pfree(notprobed); pfree(notprobed);
return var; return var;

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -125,28 +125,47 @@ _t_isprint(const char *ptr)
} }
#endif /* TS_USE_WIDE */ #endif /* TS_USE_WIDE */
/* /*
* Convert C-string from UTF8 to server encoding and * Read the next line from a tsearch data file (expected to be in UTF-8), and
* lower it * convert it to database encoding if needed. The returned string is palloc'd.
* NULL return means EOF.
*/ */
char * char *
recode_and_lowerstr(char *str) t_readline(FILE *fp)
{ {
int len;
char *recoded; char *recoded;
char *ret; char buf[4096]; /* lines must not be longer than this */
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), if (fgets(buf, sizeof(buf), fp) == NULL)
PG_UTF8, GetDatabaseEncoding()); return NULL;
if (recoded == NULL) len = strlen(buf);
/* Make sure the input is valid UTF-8 */
(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
/* And convert */
recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
len,
PG_UTF8,
GetDatabaseEncoding());
if (recoded == NULL) /* should not happen */
elog(ERROR, "encoding conversion failed"); elog(ERROR, "encoding conversion failed");
ret = lowerstr(recoded); if (recoded == buf)
{
/*
* conversion didn't pstrdup, so we must.
* We can use the length of the original string, because
* no conversion was done.
*/
recoded = pnstrdup(recoded, len);
}
if (recoded != str) return recoded;
pfree(recoded);
return ret;
} }
char * char *
@ -155,6 +174,9 @@ lowerstr(char *str)
return lowerstr_with_len(str, strlen(str)); return lowerstr_with_len(str, strlen(str));
} }
/*
* Returned string is palloc'd
*/
char * char *
lowerstr_with_len(char *str, int len) lowerstr_with_len(char *str, int len)
{ {

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -308,7 +308,7 @@ LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
{ {
/* /*
* Dictionary normalizes lexemes, so we remove from stack all * Dictionary normalizes lexemes, so we remove from stack all
* used lexemes , return to basic mode and redo end of stack * used lexemes, return to basic mode and redo end of stack
* (if it exists) * (if it exists)
*/ */
if (res) if (res)
@ -427,14 +427,14 @@ parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
* Headline framework * Headline framework
*/ */
static void static void
hladdword(HeadlineText * prs, char *buf, int4 buflen, int type) hladdword(HeadlineParsedText * prs, char *buf, int4 buflen, int type)
{ {
while (prs->curwords >= prs->lenwords) while (prs->curwords >= prs->lenwords)
{ {
prs->lenwords *= 2; prs->lenwords *= 2;
prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord)); prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
} }
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord)); memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
prs->words[prs->curwords].type = (uint8) type; prs->words[prs->curwords].type = (uint8) type;
prs->words[prs->curwords].len = buflen; prs->words[prs->curwords].len = buflen;
prs->words[prs->curwords].word = palloc(buflen); prs->words[prs->curwords].word = palloc(buflen);
@ -443,16 +443,16 @@ hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
} }
static void static void
hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen) hlfinditem(HeadlineParsedText * prs, TSQuery query, char *buf, int buflen)
{ {
int i; int i;
QueryItem *item = GETQUERY(query); QueryItem *item = GETQUERY(query);
HeadlineWord *word; HeadlineWordEntry *word;
while (prs->curwords + query->size >= prs->lenwords) while (prs->curwords + query->size >= prs->lenwords)
{ {
prs->lenwords *= 2; prs->lenwords *= 2;
prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord)); prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
} }
word = &(prs->words[prs->curwords - 1]); word = &(prs->words[prs->curwords - 1]);
@ -462,7 +462,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
{ {
if (word->item) if (word->item)
{ {
memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord)); memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
prs->words[prs->curwords].item = item; prs->words[prs->curwords].item = item;
prs->words[prs->curwords].repeated = 1; prs->words[prs->curwords].repeated = 1;
prs->curwords++; prs->curwords++;
@ -475,7 +475,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
} }
static void static void
addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms) addHLParsedLex(HeadlineParsedText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
{ {
ParsedLex *tmplexs; ParsedLex *tmplexs;
TSLexeme *ptr; TSLexeme *ptr;
@ -511,7 +511,7 @@ addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * n
} }
void void
hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen) hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen)
{ {
int type, int type,
lenlemm; lenlemm;
@ -571,12 +571,12 @@ hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen
} }
text * text *
generatHeadline(HeadlineText * prs) generateHeadline(HeadlineParsedText * prs)
{ {
text *out; text *out;
int len = 128; int len = 128;
char *ptr; char *ptr;
HeadlineWord *wrd = prs->words; HeadlineWordEntry *wrd = prs->words;
out = (text *) palloc(len); out = (text *) palloc(len);
ptr = ((char *) out) + VARHDRSZ; ptr = ((char *) out) + VARHDRSZ;

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -63,21 +63,29 @@ get_tsearch_config_filename(const char *basename,
return result; return result;
} }
#define STOPBUFLEN 4096 static int
comparestr(const void *a, const void *b)
{
return strcmp(*(char **) a, *(char **) b);
}
/*
* Reads a stopword file. Each word is run through 'wordop'
* function, if given. wordop may either modify the input in-place,
* or palloc a new version.
*/
void void
readstoplist(char *in, StopList * s) readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
{ {
char **stop = NULL; char **stop = NULL;
s->len = 0; s->len = 0;
if (in && *in) if (fname && *fname)
{ {
char *filename = get_tsearch_config_filename(in, "stop"); char *filename = get_tsearch_config_filename(fname, "stop");
FILE *hin; FILE *hin;
char buf[STOPBUFLEN]; char *line;
int reallen = 0; int reallen = 0;
int line = 0;
if ((hin = AllocateFile(filename, "r")) == NULL) if ((hin = AllocateFile(filename, "r")) == NULL)
ereport(ERROR, ereport(ERROR,
@ -85,65 +93,56 @@ readstoplist(char *in, StopList * s)
errmsg("could not open stopword file \"%s\": %m", errmsg("could not open stopword file \"%s\": %m",
filename))); filename)));
while (fgets(buf, STOPBUFLEN, hin)) while ((line = t_readline(hin)) != NULL)
{ {
char *pbuf = buf; char *pbuf = line;
line++; /* Trim trailing space */
while (*pbuf && !isspace(*pbuf)) while (*pbuf && !t_isspace(pbuf))
pbuf++; pbuf++;
*pbuf = '\0'; *pbuf = '\0';
if (*buf == '\0') /* Skip empty lines */
continue; if (*line == '\0')
if (!pg_verifymbstr(buf, strlen(buf), true))
{ {
FreeFile(hin); pfree(line);
ereport(ERROR, continue;
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte encoding at line %d in file \"%s\"",
line, filename)));
} }
if (s->len >= reallen) if (s->len >= reallen)
{ {
if (reallen == 0) if (reallen == 0)
{ {
reallen = 16; reallen = 64;
stop = (char **) palloc(sizeof(char *) * reallen); stop = (char **) palloc(sizeof(char *) * reallen);
} }
else else
{ {
reallen *= 2; reallen *= 2;
stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen); stop = (char **) repalloc((void *) stop,
sizeof(char *) * reallen);
} }
} }
if (wordop)
if (s->wordop) {
stop[s->len] = s->wordop(buf); stop[s->len] = wordop(line);
if (stop[s->len] != line)
pfree(line);
}
else else
stop[s->len] = pstrdup(buf); stop[s->len] = line;
(s->len)++; (s->len)++;
} }
FreeFile(hin); FreeFile(hin);
pfree(filename); pfree(filename);
} }
s->stop = stop; s->stop = stop;
}
static int /* Sort to allow binary searching */
comparestr(const void *a, const void *b)
{
return strcmp(*(char **) a, *(char **) b);
}
void
sortstoplist(StopList * s)
{
if (s->stop && s->len > 0) if (s->stop && s->len > 0)
qsort(s->stop, s->len, sizeof(char *), comparestr); qsort(s->stop, s->len, sizeof(char *), comparestr);
} }

View File

@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.2 2007/08/22 01:39:45 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -300,7 +300,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
text *in = PG_GETARG_TEXT_P(1); text *in = PG_GETARG_TEXT_P(1);
TSQuery query = PG_GETARG_TSQUERY(2); TSQuery query = PG_GETARG_TSQUERY(2);
text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL; text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
HeadlineText prs; HeadlineParsedText prs;
List *prsoptions; List *prsoptions;
text *out; text *out;
TSConfigCacheEntry *cfg; TSConfigCacheEntry *cfg;
@ -309,9 +309,9 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
cfg = lookup_ts_config_cache(PG_GETARG_OID(0)); cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
prsobj = lookup_ts_parser_cache(cfg->prsId); prsobj = lookup_ts_parser_cache(cfg->prsId);
memset(&prs, 0, sizeof(HeadlineText)); memset(&prs, 0, sizeof(HeadlineParsedText));
prs.lenwords = 32; prs.lenwords = 32;
prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords); prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ); hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);
@ -325,7 +325,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS)
PointerGetDatum(prsoptions), PointerGetDatum(prsoptions),
PointerGetDatum(query)); PointerGetDatum(query));
out = generatHeadline(&prs); out = generateHeadline(&prs);
PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(in, 1);
PG_FREE_IF_COPY(query, 2); PG_FREE_IF_COPY(query, 2);

View File

@ -6,7 +6,7 @@
* *
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -18,12 +18,17 @@
#include "tsearch/dicts/regis.h" #include "tsearch/dicts/regis.h"
#include "tsearch/ts_public.h" #include "tsearch/ts_public.h"
/*
* Max length of a flag name. Names longer than this will be truncated
* to the maximum.
*/
#define MAXFLAGLEN 16
struct SPNode; struct SPNode;
typedef struct typedef struct
{ {
uint32 uint32 val:8,
val:8,
isword:1, isword:1,
compoundflag:4, compoundflag:4,
affix:19; affix:19;
@ -54,22 +59,25 @@ typedef struct spell_struct
{ {
union union
{ {
char flag[16]; /*
* flag is filled in by NIImportDictionary. After NISortDictionary,
* d is valid and flag is invalid.
*/
char flag[MAXFLAGLEN];
struct struct
{ {
int affix; int affix;
int len; int len;
} d; } d;
} p; } p;
char word[1]; char word[1]; /* variable length, null-terminated */
} SPELL; } SPELL;
#define SPELLHDRSZ (offsetof(SPELL, word)) #define SPELLHDRSZ (offsetof(SPELL, word))
typedef struct aff_struct typedef struct aff_struct
{ {
uint32 uint32 flag:8,
flag:8,
type:1, type:1,
flagflags:7, flagflags:7,
issimple:1, issimple:1,
@ -85,11 +93,16 @@ typedef struct aff_struct
} AFFIX; } AFFIX;
/* /*
* affixes use deictinary flags too * affixes use dictionary flags too
*/ */
#define FF_COMPOUNDPERMITFLAG 0x10 #define FF_COMPOUNDPERMITFLAG 0x10
#define FF_COMPOUNDFORBIDFLAG 0x20 #define FF_COMPOUNDFORBIDFLAG 0x20
#define FF_CROSSPRODUCT 0x40 #define FF_CROSSPRODUCT 0x40
/*
* Don't change the order of these. Initialization sorts by these,
* and expects prefixes to come first after sorting.
*/
#define FF_SUFFIX 1 #define FF_SUFFIX 1
#define FF_PREFIX 0 #define FF_PREFIX 0
@ -97,8 +110,7 @@ struct AffixNode;
typedef struct typedef struct
{ {
uint32 uint32 val:8,
val:8,
naff:24; naff:24;
AFFIX **aff; AFFIX **aff;
struct AffixNode *node; struct AffixNode *node;
@ -126,9 +138,13 @@ typedef struct
int naffixes; int naffixes;
AFFIX *Affix; AFFIX *Affix;
int nspell; /*
int mspell; * Temporary array of all words in the dict file. Only used during
* initialization
*/
SPELL **Spell; SPELL **Spell;
int nspell; /* number of valid entries in Spell array */
int mspell; /* allocated length of Spell array */
AffixNode *Suffix; AffixNode *Suffix;
AffixNode *Prefix; AffixNode *Prefix;

View File

@ -5,7 +5,7 @@
* *
* Copyright (c) 1998-2007, PostgreSQL Global Development Group * Copyright (c) 1998-2007, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -38,11 +38,11 @@
#ifdef TS_USE_WIDE #ifdef TS_USE_WIDE
size_t char2wchar(wchar_t *to, const char *from, size_t len); extern size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32 #ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len); extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
#else /* WIN32 */ #else /* WIN32 */
/* correct wcstombs */ /* correct wcstombs */
@ -81,8 +81,8 @@ extern int _t_isprint(const char *ptr);
#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s) #define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s)
#endif #endif
char *lowerstr(char *str); extern char *lowerstr(char *str);
char *lowerstr_with_len(char *str, int len); extern char *lowerstr_with_len(char *str, int len);
char *recode_and_lowerstr(char *str); extern char *t_readline(FILE *fp);
#endif /* __TSLOCALE_H__ */ #endif /* __TSLOCALE_H__ */

View File

@ -6,7 +6,7 @@
* *
* Copyright (c) 1998-2007, PostgreSQL Global Development Group * Copyright (c) 1998-2007, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.2 2007/08/22 01:39:46 tgl Exp $ * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.3 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -71,12 +71,11 @@ typedef struct
{ {
int len; int len;
char **stop; char **stop;
char *(*wordop) (char *);
} StopList; } StopList;
extern void sortstoplist(StopList * s); extern void readstoplist(const char *fname, StopList *s,
extern void readstoplist(char *in, StopList * s); char *(*wordop) (char *));
extern bool searchstoplist(StopList * s, char *key); extern bool searchstoplist(StopList *s, char *key);
/* /*
* Interface with dictionaries * Interface with dictionaries
@ -102,9 +101,8 @@ typedef struct
#define TSL_ADDPOS 0x01 #define TSL_ADDPOS 0x01
/* /*
* Struct for supporting complex dictionaries like * Struct for supporting complex dictionaries like thesaurus.
* thesaurus, pointer to is an 4-th argument for * 4th argument for dictlexize method is a pointer to this
* dictlexize method
*/ */
typedef struct typedef struct
{ {

View File

@ -5,7 +5,7 @@
* *
* Copyright (c) 1998-2007, PostgreSQL Global Development Group * Copyright (c) 1998-2007, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -13,6 +13,7 @@
#define _PG_TS_UTILS_H_ #define _PG_TS_UTILS_H_
#include "tsearch/ts_type.h" #include "tsearch/ts_type.h"
#include "tsearch/ts_public.h"
/* /*
* Common parse definitions for tsvector and tsquery * Common parse definitions for tsvector and tsquery
@ -38,7 +39,8 @@ typedef struct
extern bool gettoken_tsvector(TSVectorParseState *state); extern bool gettoken_tsvector(TSVectorParseState *state);
struct ParseQueryNode; struct ParseQueryNode; /* private in backend/utils/adt/tsquery.c */
typedef struct typedef struct
{ {
char *buffer; /* entire string we are scanning */ char *buffer; /* entire string we are scanning */
@ -46,7 +48,7 @@ typedef struct
int4 state; int4 state;
int4 count; int4 count;
/* reverse polish notation in list (for temprorary usage) */ /* reverse polish notation in list (for temporary usage) */
struct ParseQueryNode *str; struct ParseQueryNode *str;
/* number in str */ /* number in str */
@ -102,36 +104,12 @@ extern void parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen);
* headline framework, flow in common to generate: * headline framework, flow in common to generate:
* 1 parse text with hlparsetext * 1 parse text with hlparsetext
* 2 parser-specific function to find part * 2 parser-specific function to find part
* 3 generatHeadline to generate result text * 3 generateHeadline to generate result text
*/ */
typedef struct extern void hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query,
{
uint32 selected:1,
in:1,
replace:1,
repeated:1,
unused:4,
type:8,
len:16;
char *word;
QueryItem *item;
} HeadlineWord;
typedef struct
{
HeadlineWord *words;
int4 lenwords;
int4 curwords;
char *startsel;
char *stopsel;
int2 startsellen;
int2 stopsellen;
} HeadlineText;
extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
char *buf, int4 buflen); char *buf, int4 buflen);
extern text *generatHeadline(HeadlineText * prs); extern text *generateHeadline(HeadlineParsedText * prs);
/* /*
* token/node types for parsing * token/node types for parsing