mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-10-01 10:41:18 +02:00
664 lines
14 KiB
C
664 lines
14 KiB
C
/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *		main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "tsearch/ts_cache.h"
|
|
#include "tsearch/ts_public.h"
|
|
#include "tsearch/ts_utils.h"
|
|
|
|
#define IGNORE_LONGLEXEME 1
|
|
|
|
/*
 * Lexize subsystem
 */
|
|
|
|
/*
 * One token emitted by the text parser, held in a singly-linked list
 * while it waits to be processed by the configuration's dictionaries.
 */
typedef struct ParsedLex
{
    int type;                   /* token type number assigned by the parser */
    char *lemm;                 /* token text; length is lenlemm (termination
                                 * not guaranteed -- always use lenlemm) */
    int lenlemm;                /* length of lemm in bytes */
    struct ParsedLex *next;     /* next token in list, or NULL at the tail */
} ParsedLex;
|
|
|
|
/*
 * Head/tail pointers for a list of ParsedLex nodes; supports O(1)
 * append at the tail (LPLAddTail) and removal at the head (LPLRemoveHead).
 */
typedef struct ListParsedLex
{
    ParsedLex *head;            /* first node, or NULL if list is empty */
    ParsedLex *tail;            /* last node, or NULL if list is empty */
} ListParsedLex;
|
|
|
|
/*
 * State carried across LexizeExec calls while feeding parser tokens
 * through the configuration's dictionary lists.
 */
typedef struct
{
    TSConfigCacheEntry *cfg;    /* text search configuration in use */
    Oid curDictId;              /* OID of multiword dictionary currently in
                                 * progress, or InvalidOid for normal
                                 * one-word-at-a-time mode */
    int posDict;                /* index of next dictionary to try for the
                                 * current token */
    DictSubState dictState;     /* per-call state handed to a dictionary's
                                 * lexize function */
    ParsedLex *curSub;          /* next token to feed to a multiword
                                 * dictionary */
    ListParsedLex towork;       /* current list to work */
    ListParsedLex waste;        /* list of lexemes that already lexized */

    /*
     * fields to store last variant to lexize (basically, thesaurus or similar
     * dictionary, which wants several lexemes)
     */

    ParsedLex *lastRes;         /* token at which tmpRes was produced */
    TSLexeme *tmpRes;           /* saved partial result from that dictionary */
} LexizeData;
|
|
|
|
static void
|
|
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
|
|
{
|
|
ld->cfg = cfg;
|
|
ld->curDictId = InvalidOid;
|
|
ld->posDict = 0;
|
|
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
ld->lastRes = NULL;
|
|
ld->tmpRes = NULL;
|
|
}
|
|
|
|
static void
|
|
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
|
|
{
|
|
if (list->tail)
|
|
{
|
|
list->tail->next = newpl;
|
|
list->tail = newpl;
|
|
}
|
|
else
|
|
list->head = list->tail = newpl;
|
|
newpl->next = NULL;
|
|
}
|
|
|
|
static ParsedLex *
|
|
LPLRemoveHead(ListParsedLex *list)
|
|
{
|
|
ParsedLex *res = list->head;
|
|
|
|
if (list->head)
|
|
list->head = list->head->next;
|
|
|
|
if (list->head == NULL)
|
|
list->tail = NULL;
|
|
|
|
return res;
|
|
}
|
|
|
|
static void
|
|
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
|
|
{
|
|
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
|
|
|
newpl->type = type;
|
|
newpl->lemm = lemm;
|
|
newpl->lenlemm = lenlemm;
|
|
LPLAddTail(&ld->towork, newpl);
|
|
ld->curSub = ld->towork.tail;
|
|
}
|
|
|
|
static void
|
|
RemoveHead(LexizeData *ld)
|
|
{
|
|
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
|
|
|
|
ld->posDict = 0;
|
|
}
|
|
|
|
static void
|
|
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
|
|
{
|
|
if (correspondLexem)
|
|
{
|
|
*correspondLexem = ld->waste.head;
|
|
}
|
|
else
|
|
{
|
|
ParsedLex *tmp,
|
|
*ptr = ld->waste.head;
|
|
|
|
while (ptr)
|
|
{
|
|
tmp = ptr->next;
|
|
pfree(ptr);
|
|
ptr = tmp;
|
|
}
|
|
}
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
}
|
|
|
|
static void
|
|
moveToWaste(LexizeData *ld, ParsedLex *stop)
|
|
{
|
|
bool go = true;
|
|
|
|
while (ld->towork.head && go)
|
|
{
|
|
if (ld->towork.head == stop)
|
|
{
|
|
ld->curSub = stop->next;
|
|
go = false;
|
|
}
|
|
RemoveHead(ld);
|
|
}
|
|
}
|
|
|
|
static void
|
|
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
|
|
{
|
|
if (ld->tmpRes)
|
|
{
|
|
TSLexeme *ptr;
|
|
|
|
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
|
|
pfree(ptr->lexeme);
|
|
pfree(ld->tmpRes);
|
|
}
|
|
ld->tmpRes = res;
|
|
ld->lastRes = lex;
|
|
}
|
|
|
|
/*
 * Run the dictionary pipeline over the queued tokens and return the next
 * array of normalized lexemes (palloc'd, terminated by a NULL lexeme), or
 * NULL when nothing more can be produced from the tokens queued so far.
 *
 * Two modes, selected by ld->curDictId:
 *   - InvalidOid: normal mode; each head token is offered to the
 *     dictionaries mapped to its type, in map order.
 *   - valid OID: a multiword (e.g. thesaurus-like) dictionary has asked
 *     for more input via dictState.getnext; keep feeding it tokens.
 *
 * If correspondLexem is non-NULL, the consumed tokens are returned to the
 * caller through it (see setCorrLex) instead of being freed.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
    int i;
    ListDictionary *map;
    TSDictionaryCacheEntry *dict;
    TSLexeme *res;

    if (ld->curDictId == InvalidOid)
    {
        /*
         * usual mode: dictionary wants only one word, but we should keep in
         * mind that we should go through all stack
         */

        while (ld->towork.head)
        {
            ParsedLex *curVal = ld->towork.head;
            char *curValLemm = curVal->lemm;
            int curValLenLemm = curVal->lenlemm;

            /*
             * NOTE(review): map is computed before the bounds check below;
             * it is only dereferenced after the short-circuited type checks,
             * so an out-of-range type never reads through it.
             */
            map = ld->cfg->map + curVal->type;

            if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
            {
                /* skip this type of lexeme */
                RemoveHead(ld);
                continue;
            }

            /* resume at posDict, which a getnext round may have advanced */
            for (i = ld->posDict; i < map->len; i++)
            {
                dict = lookup_ts_dictionary_cache(map->dictIds[i]);

                ld->dictState.isend = ld->dictState.getnext = false;
                ld->dictState.private_state = NULL;
                res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                   &(dict->lexize),
                                                   PointerGetDatum(dict->dictData),
                                                   PointerGetDatum(curValLemm),
                                                   Int32GetDatum(curValLenLemm),
                                                   PointerGetDatum(&ld->dictState)
                                                   ));

                if (ld->dictState.getnext)
                {
                    /*
                     * dictionary wants next word, so setup and store current
                     * position and go to multiword mode
                     */

                    ld->curDictId = DatumGetObjectId(map->dictIds[i]);
                    ld->posDict = i + 1;
                    ld->curSub = curVal->next;
                    if (res)
                        setNewTmpRes(ld, curVal, res);
                    return LexizeExec(ld, correspondLexem);
                }

                if (!res)       /* dictionary doesn't know this lexeme */
                    continue;

                if (res->flags & TSL_FILTER)
                {
                    /* filtering dictionary: replace the token text and let
                     * the remaining dictionaries see the replacement */
                    curValLemm = res->lexeme;
                    curValLenLemm = strlen(res->lexeme);
                    continue;
                }

                /* dictionary produced a result: consume token, return it */
                RemoveHead(ld);
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /* no dictionary recognized this token; discard it */
            RemoveHead(ld);
        }
    }
    else
    {                           /* curDictId is valid */
        dict = lookup_ts_dictionary_cache(ld->curDictId);

        /*
         * Dictionary ld->curDictId asks us about following words
         */

        while (ld->curSub)
        {
            ParsedLex *curVal = ld->curSub;

            map = ld->cfg->map + curVal->type;

            if (curVal->type != 0)
            {
                bool dictExists = false;

                if (curVal->type >= ld->cfg->lenmap || map->len == 0)
                {
                    /* skip this type of lexeme */
                    ld->curSub = curVal->next;
                    continue;
                }

                /*
                 * We should be sure that current type of lexeme is recognized
                 * by our dictionary: we just check whether it exists in the
                 * list of dictionaries for this token type
                 */
                for (i = 0; i < map->len && !dictExists; i++)
                    if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
                        dictExists = true;

                if (!dictExists)
                {
                    /*
                     * Dictionary can't work with current type of lexeme,
                     * return to basic mode and redo all stored lexemes
                     */
                    ld->curDictId = InvalidOid;
                    return LexizeExec(ld, correspondLexem);
                }
            }

            /* type == 0 is the end-of-input marker for the dictionary */
            ld->dictState.isend = (curVal->type == 0) ? true : false;
            ld->dictState.getnext = false;

            res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                               &(dict->lexize),
                                               PointerGetDatum(dict->dictData),
                                               PointerGetDatum(curVal->lemm),
                                               Int32GetDatum(curVal->lenlemm),
                                               PointerGetDatum(&ld->dictState)
                                               ));

            if (ld->dictState.getnext)
            {
                /* Dictionary wants one more */
                ld->curSub = curVal->next;
                if (res)
                    setNewTmpRes(ld, curVal, res);
                continue;
            }

            if (res || ld->tmpRes)
            {
                /*
                 * Dictionary normalizes lexemes, so we remove from stack all
                 * used lexemes, return to basic mode and redo end of stack
                 * (if it exists)
                 */
                if (res)
                {
                    /* final answer covers everything up to curSub */
                    moveToWaste(ld, ld->curSub);
                }
                else
                {
                    /* fall back to the partial result saved at lastRes */
                    res = ld->tmpRes;
                    moveToWaste(ld, ld->lastRes);
                }

                /* reset to initial state */
                ld->curDictId = InvalidOid;
                ld->posDict = 0;
                ld->lastRes = NULL;
                ld->tmpRes = NULL;
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /*
             * Dict doesn't want the next lexeme and didn't recognize
             * anything, redo from ld->towork.head
             */
            ld->curDictId = InvalidOid;
            return LexizeExec(ld, correspondLexem);
        }
    }

    /* queue exhausted with no output */
    setCorrLex(ld, correspondLexem);
    return NULL;
}
|
|
|
|
/*
|
|
* Parse string and lexize words.
|
|
*
|
|
* prs will be filled in.
|
|
*/
|
|
/*
 * Parse string and lexize words.
 *
 * prs will be filled in: one ParsedWord per normalized lexeme, with
 * positions assigned per input token (plus TSL_ADDPOS adjustments).
 * prs->words/lenwords/curwords must be initialized by the caller; the
 * words array is grown by doubling as needed.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
    int type,
        lenlemm;
    char *lemm = NULL;
    LexizeData ldata;
    TSLexeme *norms;
    TSConfigCacheEntry *cfg;
    TSParserCacheEntry *prsobj;
    void *prsdata;

    cfg = lookup_ts_config_cache(cfgId);
    prsobj = lookup_ts_parser_cache(cfg->prsId);

    /* let the parser set up its scan state over buf */
    prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        /* fetch the next token; type <= 0 means end of input */
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            /* over-long words are skipped with a notice, not an error */
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        /* drain all lexeme arrays that are now producible */
        while ((norms = LexizeExec(&ldata, NULL)) != NULL)
        {
            TSLexeme *ptr = norms;

            prs->pos++;         /* set pos */

            while (ptr->lexeme)
            {
                /* grow the output array by doubling when full */
                if (prs->curwords == prs->lenwords)
                {
                    prs->lenwords *= 2;
                    prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
                }

                if (ptr->flags & TSL_ADDPOS)
                    prs->pos++;
                prs->words[prs->curwords].len = strlen(ptr->lexeme);
                /* ownership of the lexeme string passes to prs->words */
                prs->words[prs->curwords].word = ptr->lexeme;
                prs->words[prs->curwords].nvariant = ptr->nvariant;
                prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
                prs->words[prs->curwords].alen = 0;
                prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
                ptr++;
                prs->curwords++;
            }
            pfree(norms);
        }
    } while (type > 0);

    /* let the parser release its scan state */
    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
|
|
|
|
/*
|
|
* Headline framework
|
|
*/
|
|
static void
|
|
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
|
|
{
|
|
while (prs->curwords >= prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
|
|
}
|
|
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
|
|
prs->words[prs->curwords].type = (uint8) type;
|
|
prs->words[prs->curwords].len = buflen;
|
|
prs->words[prs->curwords].word = palloc(buflen);
|
|
memcpy(prs->words[prs->curwords].word, buf, buflen);
|
|
prs->curwords++;
|
|
}
|
|
|
|
/*
 * Mark the most recently added headline word if it matches any value
 * item of the query.  On the first match the existing entry is tagged;
 * each additional match appends a copy flagged as repeated, so every
 * matching query operand gets its own entry.
 */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
    int i;
    QueryItem *item = GETQUERY(query);
    HeadlineWordEntry *word;

    /*
     * Reserve room up front for the worst case (one copy per query item)
     * so the word pointer taken below stays valid through the loop.
     */
    while (prs->curwords + query->size >= prs->lenwords)
    {
        prs->lenwords *= 2;
        prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
    }

    /* the word just added by hladdword */
    word = &(prs->words[prs->curwords - 1]);
    for (i = 0; i < query->size; i++)
    {
        if (item->type == QI_VAL &&
            tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
                            buf, buflen, item->qoperand.prefix) == 0)
        {
            if (word->item)
            {
                /* already matched once: append a repeated copy */
                memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
                prs->words[prs->curwords].item = &item->qoperand;
                prs->words[prs->curwords].repeated = 1;
                prs->curwords++;
            }
            else
                word->item = &item->qoperand;
        }
        item++;
    }
}
|
|
|
|
static void
|
|
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
|
|
{
|
|
ParsedLex *tmplexs;
|
|
TSLexeme *ptr;
|
|
|
|
while (lexs)
|
|
{
|
|
|
|
if (lexs->type > 0)
|
|
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
|
|
|
|
ptr = norms;
|
|
while (ptr && ptr->lexeme)
|
|
{
|
|
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
|
|
ptr++;
|
|
}
|
|
|
|
tmplexs = lexs->next;
|
|
pfree(lexs);
|
|
lexs = tmplexs;
|
|
}
|
|
|
|
if (norms)
|
|
{
|
|
ptr = norms;
|
|
while (ptr->lexeme)
|
|
{
|
|
pfree(ptr->lexeme);
|
|
ptr++;
|
|
}
|
|
pfree(norms);
|
|
}
|
|
}
|
|
|
|
/*
 * Parse and lexize text for headline generation.  Like parsetext, but
 * every raw token (not just recognized ones) is retained in prs so the
 * original text can be reproduced, and each word is tagged with the
 * query operands it matches.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
    int type,
        lenlemm;
    char *lemm = NULL;
    LexizeData ldata;
    TSLexeme *norms;
    ParsedLex *lexs;
    TSConfigCacheEntry *cfg;
    TSParserCacheEntry *prsobj;
    void *prsdata;

    cfg = lookup_ts_config_cache(cfgId);
    prsobj = lookup_ts_parser_cache(cfg->prsId);

    /* let the parser set up its scan state over buf */
    prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        /* fetch the next token; type <= 0 means end of input */
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            /* over-long words are skipped with a notice, not an error */
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        /*
         * Consume everything producible now; note the final iteration
         * (norms == NULL) still flushes the consumed tokens in lexs.
         */
        do
        {
            if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
                addHLParsedLex(prs, query, lexs, norms);
            else
                addHLParsedLex(prs, query, lexs, NULL);
        } while (norms);

    } while (type > 0);

    /* let the parser release its scan state */
    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
|
|
|
|
/*
 * Assemble the final headline text from the marked-up word array in prs.
 * Words flagged "in" are emitted (wrapped in startsel/stopsel when
 * selected), fragments are separated by fragdelim, and words outside any
 * fragment are freed.  Returns a palloc'd text datum.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
    text *out;
    char *ptr;
    int len = 128;              /* current allocation size; doubled on demand */
    int numfragments = 0;       /* fragments emitted so far */
    int2 infrag = 0;            /* currently inside a fragment? */

    HeadlineWordEntry *wrd = prs->words;

    out = (text *) palloc(len);
    ptr = ((char *) out) + VARHDRSZ;

    while (wrd - prs->words < prs->curwords)
    {
        /*
         * Ensure room for the worst case this iteration can append:
         * the word plus selection markers plus a fragment delimiter.
         */
        while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
        {
            int dist = ptr - ((char *) out);

            len *= 2;
            out = (text *) repalloc(out, len);
            ptr = ((char *) out) + dist;
        }

        if (wrd->in && !wrd->repeated)
        {
            if (!infrag)
            {

                /* start of a new fragment */
                infrag = 1;
                numfragments++;
                /* add a fragment delimiter if this is after the first one */
                if (numfragments > 1)
                {
                    memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
                    ptr += prs->fragdelimlen;
                }

            }
            if (wrd->replace)
            {
                /* replaced word collapses to a single space */
                *ptr = ' ';
                ptr++;
            }
            else if (!wrd->skip)
            {
                if (wrd->selected)
                {
                    memcpy(ptr, prs->startsel, prs->startsellen);
                    ptr += prs->startsellen;
                }
                memcpy(ptr, wrd->word, wrd->len);
                ptr += wrd->len;
                if (wrd->selected)
                {
                    memcpy(ptr, prs->stopsel, prs->stopsellen);
                    ptr += prs->stopsellen;
                }
            }
        }
        else if (!wrd->repeated)
        {
            /* word not emitted: close any open fragment and free it */
            if (infrag)
                infrag = 0;
            pfree(wrd->word);
        }

        wrd++;
    }

    SET_VARSIZE(out, ptr - ((char *) out));
    return out;
}
|