postgresql/contrib/tsearch2/ts_cfg.c
Teodor Sigaev 324300bc7c improve support of agglutinative languages (query with compound words).
regression=# select to_tsquery( '\'fotballklubber\'');
                   to_tsquery
------------------------------------------------
 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb'
(1 row)

So, changed interface to dictionaries, lexize method of dictionary shoud return
pointer to aray of TSLexeme structs instead of char**. Last element should
have TSLexeme->lexeme == NULL.

typedef struct {
        /* number of variant of split word , for example
                Word 'fotballklubber' (norwegian) has two varian to split:
                ( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary
                should return:
                nvariant        lexeme
                1               fotball
                1               klubb
                2               fot
                2               ball
                2               klubb

        */
        uint16  nvariant;

        /* currently unused */
        uint16  flags;

        /* C-string */
        char    *lexeme;
} TSLexeme;
2005-01-25 15:24:38 +00:00

638 lines
14 KiB
C

/*
* interface functions to tscfg
* Teodor Sigaev <teodor@sigaev.ru>
*/
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <locale.h>
#include "postgres.h"
#include "fmgr.h"
#include "utils/array.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"
#include "ts_cfg.h"
#include "dict.h"
#include "wparser.h"
#include "snmap.h"
#include "common.h"
#include "tsvector.h"
#define IGNORE_LONGLEXEME 1
/*********top interface**********/
static Oid current_cfg_id = 0;
void
init_cfg(Oid id, TSCfgInfo * cfg)
{
Oid arg[2];
bool isnull;
Datum pars[2];
int stat,
i,
j;
text *ptr;
text *prsname = NULL;
char *nsp = get_namespace(TSNSP_FunctionOid);
char buf[1024];
MemoryContext oldcontext;
void *plan;
arg[0] = OIDOID;
arg[1] = OIDOID;
pars[0] = ObjectIdGetDatum(id);
pars[1] = ObjectIdGetDatum(id);
memset(cfg, 0, sizeof(TSCfgInfo));
SPI_connect();
sprintf(buf, "select prs_name from %s.pg_ts_cfg where oid = $1", nsp);
plan = SPI_prepare(buf, 1, arg);
if (!plan)
ts_error(ERROR, "SPI_prepare() failed");
stat = SPI_execp(plan, pars, " ", 1);
if (stat < 0)
ts_error(ERROR, "SPI_execp return %d", stat);
if (SPI_processed > 0)
{
prsname = (text *) DatumGetPointer(
SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull)
);
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
prsname = ptextdup(prsname);
MemoryContextSwitchTo(oldcontext);
cfg->id = id;
}
else
ts_error(ERROR, "No tsearch cfg with id %d", id);
SPI_freeplan(plan);
arg[0] = TEXTOID;
sprintf(buf, "select lt.tokid, map.dict_name from %s.pg_ts_cfgmap as map, %s.pg_ts_cfg as cfg, %s.token_type( $1 ) as lt where lt.alias = map.tok_alias and map.ts_name = cfg.ts_name and cfg.oid= $2 order by lt.tokid desc;", nsp, nsp, nsp);
plan = SPI_prepare(buf, 2, arg);
if (!plan)
ts_error(ERROR, "SPI_prepare() failed");
pars[0] = PointerGetDatum(prsname);
stat = SPI_execp(plan, pars, " ", 0);
if (stat < 0)
ts_error(ERROR, "SPI_execp return %d", stat);
if (SPI_processed <= 0)
ts_error(ERROR, "No parser with id %d", id);
for (i = 0; i < SPI_processed; i++)
{
int lexid = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull));
ArrayType *toasted_a = (ArrayType *) PointerGetDatum(SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 2, &isnull));
ArrayType *a;
if (!cfg->map)
{
cfg->len = lexid + 1;
cfg->map = (ListDictionary *) malloc(sizeof(ListDictionary) * cfg->len);
if (!cfg->map)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
memset(cfg->map, 0, sizeof(ListDictionary) * cfg->len);
}
if (isnull)
continue;
a = (ArrayType *) PointerGetDatum(PG_DETOAST_DATUM(DatumGetPointer(toasted_a)));
if (ARR_NDIM(a) != 1)
ts_error(ERROR, "Wrong dimension");
if (ARRNELEMS(a) < 1)
continue;
cfg->map[lexid].len = ARRNELEMS(a);
cfg->map[lexid].dict_id = (Datum *) malloc(sizeof(Datum) * cfg->map[lexid].len);
if (!cfg->map[lexid].dict_id)
ts_error(ERROR, "No memory");
memset(cfg->map[lexid].dict_id, 0, sizeof(Datum) * cfg->map[lexid].len);
ptr = (text *) ARR_DATA_PTR(a);
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
for (j = 0; j < cfg->map[lexid].len; j++)
{
cfg->map[lexid].dict_id[j] = PointerGetDatum(ptextdup(ptr));
ptr = NEXTVAL(ptr);
}
MemoryContextSwitchTo(oldcontext);
if (a != toasted_a)
pfree(a);
}
SPI_freeplan(plan);
SPI_finish();
cfg->prs_id = name2id_prs(prsname);
pfree(prsname);
pfree(nsp);
for (i = 0; i < cfg->len; i++)
{
for (j = 0; j < cfg->map[i].len; j++)
{
ptr = (text *) DatumGetPointer(cfg->map[i].dict_id[j]);
cfg->map[i].dict_id[j] = ObjectIdGetDatum(name2id_dict(ptr));
pfree(ptr);
}
}
}
typedef struct
{
TSCfgInfo *last_cfg;
int len;
int reallen;
TSCfgInfo *list;
SNMap name2id_map;
} CFGList;
static CFGList CList = {NULL, 0, 0, NULL, {0, 0, NULL}};
void
reset_cfg(void)
{
freeSNMap(&(CList.name2id_map));
if (CList.list)
{
int i,
j;
for (i = 0; i < CList.len; i++)
if (CList.list[i].map)
{
for (j = 0; j < CList.list[i].len; j++)
if (CList.list[i].map[j].dict_id)
free(CList.list[i].map[j].dict_id);
free(CList.list[i].map);
}
free(CList.list);
}
memset(&CList, 0, sizeof(CFGList));
}
static int
comparecfg(const void *a, const void *b)
{
return ((TSCfgInfo *) a)->id - ((TSCfgInfo *) b)->id;
}
TSCfgInfo *
findcfg(Oid id)
{
/* last used cfg */
if (CList.last_cfg && CList.last_cfg->id == id)
return CList.last_cfg;
/* already used cfg */
if (CList.len != 0)
{
TSCfgInfo key;
key.id = id;
CList.last_cfg = bsearch(&key, CList.list, CList.len, sizeof(TSCfgInfo), comparecfg);
if (CList.last_cfg != NULL)
return CList.last_cfg;
}
/* last chance */
if (CList.len == CList.reallen)
{
TSCfgInfo *tmp;
int reallen = (CList.reallen) ? 2 * CList.reallen : 16;
tmp = (TSCfgInfo *) realloc(CList.list, sizeof(TSCfgInfo) * reallen);
if (!tmp)
ts_error(ERROR, "No memory");
CList.reallen = reallen;
CList.list = tmp;
}
CList.last_cfg = &(CList.list[CList.len]);
init_cfg(id, CList.last_cfg);
CList.len++;
qsort(CList.list, CList.len, sizeof(TSCfgInfo), comparecfg);
return findcfg(id); /* qsort changed order!! */ ;
}
Oid
name2id_cfg(text *name)
{
Oid arg[1];
bool isnull;
Datum pars[1];
int stat;
Oid id = findSNMap_t(&(CList.name2id_map), name);
void *plan;
char *nsp;
char buf[1024];
arg[0] = TEXTOID;
pars[0] = PointerGetDatum(name);
if (id)
return id;
nsp = get_namespace(TSNSP_FunctionOid);
SPI_connect();
sprintf(buf, "select oid from %s.pg_ts_cfg where ts_name = $1", nsp);
plan = SPI_prepare(buf, 1, arg);
if (!plan)
/* internal error */
elog(ERROR, "SPI_prepare() failed");
stat = SPI_execp(plan, pars, " ", 1);
if (stat < 0)
/* internal error */
elog(ERROR, "SPI_execp return %d", stat);
if (SPI_processed > 0)
{
id = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
if (isnull)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("null id for tsearch config")));
}
else
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("no tsearch config")));
SPI_freeplan(plan);
SPI_finish();
addSNMap_t(&(CList.name2id_map), name, id);
return id;
}
void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
int type,
lenlemm,
i;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
&(prsobj->start_info),
PointerGetDatum(buf),
Int32GetDatum(buflen)
)
);
while ((type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)))) != 0)
{
if (lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long")));
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long")));
#endif
}
if (type >= cfg->len) /* skip this type of lexem */
continue;
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
TSLexeme *norms,
*ptr;
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(lemm),
PointerGetDatum(lenlemm)
)
);
if (!norms) /* dictionary doesn't know this lexem */
continue;
prs->pos++; /* set pos */
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
}
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->words[prs->curwords].alen = 0;
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
ptr++;
prs->curwords++;
}
pfree(norms);
break; /* lexem already normalized or is stop
* word */
}
}
FunctionCall1(
&(prsobj->end_info),
PointerGetDatum(prsobj->prs)
);
}
static void
hladdword(HLPRSTEXT * prs, char *buf, int4 buflen, int type)
{
while (prs->curwords >= prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (HLWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(HLWORD));
}
memset(&(prs->words[prs->curwords]), 0, sizeof(HLWORD));
prs->words[prs->curwords].type = (uint8) type;
prs->words[prs->curwords].len = buflen;
prs->words[prs->curwords].word = palloc(buflen);
memcpy(prs->words[prs->curwords].word, buf, buflen);
prs->curwords++;
}
static void
hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen)
{
int i;
ITEM *item = GETQUERY(query);
HLWORD *word;
while (prs->curwords + query->size >= prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (HLWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(HLWORD));
}
word = &(prs->words[prs->curwords - 1]);
for (i = 0; i < query->size; i++)
{
if (item->type == VAL && item->length == buflen && strncmp(GETOPERAND(query) + item->distance, buf, buflen) == 0)
{
if (word->item)
{
memcpy(&(prs->words[prs->curwords]), word, sizeof(HLWORD));
prs->words[prs->curwords].item = item;
prs->words[prs->curwords].repeated = 1;
prs->curwords++;
}
else
word->item = item;
}
item++;
}
}
void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
int type,
lenlemm,
i;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
&(prsobj->start_info),
PointerGetDatum(buf),
Int32GetDatum(buflen)
)
);
while ((type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)))) != 0)
{
if (lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long")));
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long")));
#endif
}
hladdword(prs, lemm, lenlemm, type);
if (type >= cfg->len)
continue;
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
TSLexeme *norms,
*ptr;
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(lemm),
PointerGetDatum(lenlemm)
)
);
if (!norms) /* dictionary doesn't know this lexem */
continue;
while (ptr->lexeme)
{
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
pfree(ptr->lexeme);
ptr++;
}
pfree(norms);
break; /* lexem already normalized or is stop
* word */
}
}
FunctionCall1(
&(prsobj->end_info),
PointerGetDatum(prsobj->prs)
);
}
text *
genhl(HLPRSTEXT * prs)
{
text *out;
int len = 128;
char *ptr;
HLWORD *wrd = prs->words;
out = (text *) palloc(len);
ptr = ((char *) out) + VARHDRSZ;
while (wrd - prs->words < prs->curwords)
{
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);
len *= 2;
out = (text *) repalloc(out, len);
ptr = ((char *) out) + dist;
}
if (wrd->in && !wrd->repeated)
{
if (wrd->replace)
{
*ptr = ' ';
ptr++;
}
else
{
if (wrd->selected)
{
memcpy(ptr, prs->startsel, prs->startsellen);
ptr += prs->startsellen;
}
memcpy(ptr, wrd->word, wrd->len);
ptr += wrd->len;
if (wrd->selected)
{
memcpy(ptr, prs->stopsel, prs->stopsellen);
ptr += prs->stopsellen;
}
}
}
else if (!wrd->repeated)
pfree(wrd->word);
wrd++;
}
VARATT_SIZEP(out) = ptr - ((char *) out);
return out;
}
int
get_currcfg(void)
{
Oid arg[1] = {TEXTOID};
const char *curlocale;
Datum pars[1];
bool isnull;
int stat;
char buf[1024];
char *nsp;
void *plan;
if (current_cfg_id > 0)
return current_cfg_id;
nsp = get_namespace(TSNSP_FunctionOid);
SPI_connect();
sprintf(buf, "select oid from %s.pg_ts_cfg where locale = $1 ", nsp);
pfree(nsp);
plan = SPI_prepare(buf, 1, arg);
if (!plan)
/* internal error */
elog(ERROR, "SPI_prepare() failed");
curlocale = setlocale(LC_CTYPE, NULL);
pars[0] = PointerGetDatum(char2text((char *) curlocale));
stat = SPI_execp(plan, pars, " ", 1);
if (stat < 0)
/* internal error */
elog(ERROR, "SPI_execp return %d", stat);
if (SPI_processed > 0)
current_cfg_id = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
else
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not find tsearch config by locale")));
pfree(DatumGetPointer(pars[0]));
SPI_freeplan(plan);
SPI_finish();
return current_cfg_id;
}
PG_FUNCTION_INFO_V1(set_curcfg);
Datum set_curcfg(PG_FUNCTION_ARGS);
Datum
set_curcfg(PG_FUNCTION_ARGS)
{
SET_FUNCOID();
findcfg(PG_GETARG_OID(0));
current_cfg_id = PG_GETARG_OID(0);
PG_RETURN_VOID();
}
PG_FUNCTION_INFO_V1(set_curcfg_byname);
Datum set_curcfg_byname(PG_FUNCTION_ARGS);
Datum
set_curcfg_byname(PG_FUNCTION_ARGS)
{
text *name = PG_GETARG_TEXT_P(0);
SET_FUNCOID();
DirectFunctionCall1(
set_curcfg,
ObjectIdGetDatum(name2id_cfg(name))
);
PG_FREE_IF_COPY(name, 0);
PG_RETURN_VOID();
}
PG_FUNCTION_INFO_V1(show_curcfg);
Datum show_curcfg(PG_FUNCTION_ARGS);
Datum
show_curcfg(PG_FUNCTION_ARGS)
{
SET_FUNCOID();
PG_RETURN_OID(get_currcfg());
}
PG_FUNCTION_INFO_V1(reset_tsearch);
Datum reset_tsearch(PG_FUNCTION_ARGS);
Datum
reset_tsearch(PG_FUNCTION_ARGS)
{
SET_FUNCOID();
ts_error(NOTICE, "TSearch cache cleaned");
PG_RETURN_VOID();
}