postgresql/src/backend/tsearch/dict_synonym.c

242 lines
4.8 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* dict_synonym.c
* Synonym dictionary: replace word by its synonym
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/tsearch/dict_synonym.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/*
 * One synonym mapping, built by dsynonym_init from a single line of the
 * synonym file.
 */
typedef struct
{
char *in;			/* word to be replaced; sort and lookup key (see compareSyn) */
char *out;			/* replacement word handed back by dsynonym_lexize */
int outlen;			/* byte count used when copying 'out' in dsynonym_lexize */
uint16 flags;		/* TSL_PREFIX or 0, copied into the resulting TSLexeme */
} Syn;
typedef struct
{
2007-11-15 22:14:46 +01:00
int len; /* length of syn array */
Syn *syn;
bool case_sensitive;
} DictSyn;
/*
 * findwrd
 *		Find the next whitespace-delimited word within the 'in' string.
 *
 * in:		NUL-terminated string to scan; it is not modified here.
 * end:		output.  Set to the byte just past the last character of the
 *			word, or to NULL when no word is found.
 * flags:	optional output.  When not NULL, a '*' that is the final
 *			character of the word is not treated as part of the word: *end
 *			is set to point at the '*' and *flags becomes TSL_PREFIX.
 *			Otherwise *flags is set to 0.  When NULL, a trailing '*' is kept
 *			as an ordinary word character.
 *
 * Returns a pointer to the first character of the word, or NULL if only
 * whitespace (or nothing) remains in the string.
 */
static char *
findwrd(char *in, char **end, uint16 *flags)
{
	char	   *start;
	char	   *lastchar;

	/* Skip leading spaces */
	while (*in && t_isspace(in))
		in += pg_mblen(in);

	/* Return NULL on empty lines */
	if (*in == '\0')
	{
		*end = NULL;
		return NULL;
	}

	lastchar = start = in;

	/* Find end of word, remembering where its last character begins */
	while (*in && !t_isspace(in))
	{
		lastchar = in;
		in += pg_mblen(in);
	}

	/* A one-byte '*' as the last character marks a prefix synonym */
	if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
	{
		*flags = TSL_PREFIX;
		*end = lastchar;
	}
	else
	{
		if (flags)
			*flags = 0;
		*end = in;
	}

	return start;
}
static int
compareSyn(const void *a, const void *b)
{
return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in);
}
Datum
dsynonym_init(PG_FUNCTION_ARGS)
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictSyn *d;
ListCell *l;
char *filename = NULL;
bool case_sensitive = false;
tsearch_readline_state trst;
char *starti,
*starto,
*end = NULL;
int cur = 0;
char *line = NULL;
uint16 flags = 0;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers. We have a lot of code in which option names, which from the user's viewpoint are logically keywords, are passed through the grammar as plain identifiers, and then matched to string literals during command execution. This approach avoids making words into lexer keywords unnecessarily. Some places matched these strings using plain strcmp, some using pg_strcasecmp. But the latter should be unnecessary since identifiers would have been downcased on their way through the parser. Aside from any efficiency concerns (probably not a big factor), the lack of consistency in this area creates a hazard of subtle bugs due to different places coming to different conclusions about whether two option names are the same or different. Hence, standardize on using strcmp() to match any option names that are expected to have been fed through the parser. This does create a user-visible behavioral change, which is that while formerly all of these would work: alter table foo set (fillfactor = 50); alter table foo set (FillFactor = 50); alter table foo set ("fillfactor" = 50); alter table foo set ("FillFactor" = 50); now the last case will fail because that double-quoted identifier is different from the others. However, none of our documentation says that you can use a quoted identifier in such contexts at all, and we should discourage doing so since it would break if we ever decide to parse such constructs as true lexer keywords rather than poor man's substitutes. So this shouldn't create a significant compatibility issue for users. Daniel Gustafsson, reviewed by Michael Paquier, small changes by me Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 00:25:02 +01:00
if (strcmp(defel->defname, "synonyms") == 0)
filename = defGetString(defel);
Avoid unnecessary use of pg_strcasecmp for already-downcased identifiers. We have a lot of code in which option names, which from the user's viewpoint are logically keywords, are passed through the grammar as plain identifiers, and then matched to string literals during command execution. This approach avoids making words into lexer keywords unnecessarily. Some places matched these strings using plain strcmp, some using pg_strcasecmp. But the latter should be unnecessary since identifiers would have been downcased on their way through the parser. Aside from any efficiency concerns (probably not a big factor), the lack of consistency in this area creates a hazard of subtle bugs due to different places coming to different conclusions about whether two option names are the same or different. Hence, standardize on using strcmp() to match any option names that are expected to have been fed through the parser. This does create a user-visible behavioral change, which is that while formerly all of these would work: alter table foo set (fillfactor = 50); alter table foo set (FillFactor = 50); alter table foo set ("fillfactor" = 50); alter table foo set ("FillFactor" = 50); now the last case will fail because that double-quoted identifier is different from the others. However, none of our documentation says that you can use a quoted identifier in such contexts at all, and we should discourage doing so since it would break if we ever decide to parse such constructs as true lexer keywords rather than poor man's substitutes. So this shouldn't create a significant compatibility issue for users. Daniel Gustafsson, reviewed by Michael Paquier, small changes by me Discussion: https://postgr.es/m/29405B24-564E-476B-98C0-677A29805B84@yesql.se
2018-01-27 00:25:02 +01:00
else if (strcmp(defel->defname, "casesensitive") == 0)
case_sensitive = defGetBoolean(defel);
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized synonym parameter: \"%s\"",
defel->defname)));
}
if (!filename)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing Synonyms parameter")));
filename = get_tsearch_config_filename(filename, "syn");
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not open synonym file \"%s\": %m",
filename)));
d = (DictSyn *) palloc0(sizeof(DictSyn));
while ((line = tsearch_readline(&trst)) != NULL)
{
starti = findwrd(line, &end, NULL);
if (!starti)
{
/* Empty line */
goto skipline;
}
if (*end == '\0')
{
/* A line with only one word. Ignore silently. */
goto skipline;
}
*end = '\0';
starto = findwrd(end + 1, &end, &flags);
if (!starto)
{
/* A line with only one word (+whitespace). Ignore silently. */
goto skipline;
}
*end = '\0';
/*
2007-11-15 22:14:46 +01:00
* starti now points to the first word, and starto to the second word
* on the line, with a \0 terminator at the end of both words.
*/
if (cur >= d->len)
{
if (d->len == 0)
{
d->len = 64;
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
}
else
{
d->len *= 2;
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
}
}
if (case_sensitive)
{
d->syn[cur].in = pstrdup(starti);
d->syn[cur].out = pstrdup(starto);
}
else
{
d->syn[cur].in = lowerstr(starti);
d->syn[cur].out = lowerstr(starto);
}
d->syn[cur].outlen = strlen(starto);
2010-02-26 03:01:40 +01:00
d->syn[cur].flags = flags;
cur++;
2007-11-15 22:14:46 +01:00
skipline:
pfree(line);
}
tsearch_readline_end(&trst);
d->len = cur;
qsort(d->syn, d->len, sizeof(Syn), compareSyn);
d->case_sensitive = case_sensitive;
PG_RETURN_POINTER(d);
}
Datum
dsynonym_lexize(PG_FUNCTION_ARGS)
{
DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
2007-11-15 22:14:46 +01:00
int32 len = PG_GETARG_INT32(2);
Syn key,
*found;
TSLexeme *res;
/* note: d->len test protects against Solaris bsearch-of-no-items bug */
if (len <= 0 || d->len <= 0)
PG_RETURN_POINTER(NULL);
if (d->case_sensitive)
key.in = pnstrdup(in, len);
else
key.in = lowerstr_with_len(in, len);
key.out = NULL;
found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
pfree(key.in);
if (!found)
PG_RETURN_POINTER(NULL);
res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = pnstrdup(found->out, found->outlen);
res[0].flags = found->flags;
PG_RETURN_POINTER(res);
}