/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *		main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *		src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#define IGNORE_LONGLEXEME 1

/*
 * Lexize subsystem
 */
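
/*
 * A single token reported by the parser: its parser-assigned type, a pointer
 * to its start (not null-terminated), and its length.  Tokens are chained
 * into singly-linked lists.
 */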
typedef struct ParsedLex
{
    int         type;
    char       *lemm;
    int         lenlemm;
    struct ParsedLex *next;
} ParsedLex;
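
/*
 * FIFO list of ParsedLex tokens, tracked by head and tail pointers.
 */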
typedef struct ListParsedLex
{
    ParsedLex  *head;
    ParsedLex  *tail;
} ListParsedLex;
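
/*
 * State of the lexize machine.  "towork" holds tokens still to be processed;
 * "waste" collects tokens that have already been handed to the dictionaries.
 * While a multi-token dictionary is active (curDictId != InvalidOid), curSub
 * points at the next token to feed it and posDict remembers where to resume
 * in the dictionary map afterwards.
 */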
typedef struct
{
    TSConfigCacheEntry *cfg;
    Oid         curDictId;
    int         posDict;
    DictSubState dictState;
    ParsedLex  *curSub;
    ListParsedLex towork;       /* current list of lexemes to process */
    ListParsedLex waste;        /* lexemes that have already been processed */

    /*
     * fields to store the last variant to lexize (used by thesaurus-like
     * dictionaries, which want to see several lexemes)
     */
    ParsedLex  *lastRes;
    TSLexeme   *tmpRes;
} LexizeData;
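
/*
 * Initialize a LexizeData for the given text search configuration.
 */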
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
    ld->cfg = cfg;
    ld->curDictId = InvalidOid;
    ld->posDict = 0;
    ld->towork.head = ld->towork.tail = ld->curSub = NULL;
    ld->waste.head = ld->waste.tail = NULL;
    ld->lastRes = NULL;
    ld->tmpRes = NULL;
}
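
/*
 * Append a token to the tail of a ListParsedLex.
 */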
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
    if (list->tail)
    {
        list->tail->next = newpl;
        list->tail = newpl;
    }
    else
        list->head = list->tail = newpl;
    newpl->next = NULL;
}
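
/*
 * Detach and return the head token of a ListParsedLex (NULL if empty).
 */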
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
    ParsedLex  *res = list->head;

    if (list->head)
        list->head = list->head->next;

    if (list->head == NULL)
        list->tail = NULL;

    return res;
}
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
    ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));

    newpl->type = type;
    newpl->lemm = lemm;
    newpl->lenlemm = lenlemm;
    LPLAddTail(&ld->towork, newpl);
    ld->curSub = ld->towork.tail;
}
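
/*
 * Move the head token of the work list onto the waste list and reset posDict
 * so the dictionary scan starts over for the new head.
 */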
static void
RemoveHead(LexizeData *ld)
{
    LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

    ld->posDict = 0;
}
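
/*
 * Hand the accumulated waste list to the caller (used for headline
 * generation) or free it, then reset the list.
 */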
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
    if (correspondLexem)
    {
        *correspondLexem = ld->waste.head;
    }
    else
    {
        ParsedLex  *tmp,
                   *ptr = ld->waste.head;

        while (ptr)
        {
            tmp = ptr->next;
            pfree(ptr);
            ptr = tmp;
        }
    }
    ld->waste.head = ld->waste.tail = NULL;
}
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
    bool        go = true;

    while (ld->towork.head && go)
    {
        if (ld->towork.head == stop)
        {
            ld->curSub = stop->next;
            go = false;
        }
        RemoveHead(ld);
    }
}
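
/*
 * Remember the latest successful result of a multi-token dictionary,
 * releasing any previously stored one.
 */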
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
    if (ld->tmpRes)
    {
        TSLexeme   *ptr;

        for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
            pfree(ptr->lexeme);
        pfree(ld->tmpRes);
    }
    ld->tmpRes = res;
    ld->lastRes = lex;
}
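
/*
 * Run the pending tokens through the configuration's dictionaries and return
 * the next array of normalized lexemes (terminated by an entry with a NULL
 * lexeme), or NULL when no more output is available for the tokens seen so
 * far.  If correspondLexem is not NULL, the tokens consumed to produce the
 * result are returned there instead of being freed.
 */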
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
    int         i;
    ListDictionary *map;
    TSDictionaryCacheEntry *dict;
    TSLexeme   *res;

    if (ld->curDictId == InvalidOid)
    {
        /*
         * usual mode: the dictionary wants only one word, but we must still
         * work through the whole stack
         */
        while (ld->towork.head)
        {
            ParsedLex  *curVal = ld->towork.head;
            char       *curValLemm = curVal->lemm;
            int         curValLenLemm = curVal->lenlemm;

            map = ld->cfg->map + curVal->type;

            if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
            {
                /* skip this type of lexeme */
                RemoveHead(ld);
                continue;
            }

            for (i = ld->posDict; i < map->len; i++)
            {
                dict = lookup_ts_dictionary_cache(map->dictIds[i]);

                ld->dictState.isend = ld->dictState.getnext = false;
                ld->dictState.private_state = NULL;
                res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                                 &(dict->lexize),
                                                                 PointerGetDatum(dict->dictData),
                                                                 PointerGetDatum(curValLemm),
                                                                 Int32GetDatum(curValLenLemm),
                                                                 PointerGetDatum(&ld->dictState)
                                                                 ));

                if (ld->dictState.getnext)
                {
                    /*
                     * the dictionary wants the next word, so set up and store
                     * the current position and switch to multiword mode
                     */
                    ld->curDictId = DatumGetObjectId(map->dictIds[i]);
                    ld->posDict = i + 1;
                    ld->curSub = curVal->next;
                    if (res)
                        setNewTmpRes(ld, curVal, res);
                    return LexizeExec(ld, correspondLexem);
                }

                if (!res)       /* dictionary doesn't know this lexeme */
                    continue;

                if (res->flags & TSL_FILTER)
                {
                    curValLemm = res->lexeme;
                    curValLenLemm = strlen(res->lexeme);
                    continue;
                }

                RemoveHead(ld);
                setCorrLex(ld, correspondLexem);
                return res;
            }

            RemoveHead(ld);
        }
    }
    else
    {                           /* curDictId is valid */
        dict = lookup_ts_dictionary_cache(ld->curDictId);

        /*
         * Dictionary ld->curDictId asks us about the following words
         */
        while (ld->curSub)
        {
            ParsedLex  *curVal = ld->curSub;

            map = ld->cfg->map + curVal->type;

            if (curVal->type != 0)
            {
                bool        dictExists = false;

                if (curVal->type >= ld->cfg->lenmap || map->len == 0)
                {
                    /* skip this type of lexeme */
                    ld->curSub = curVal->next;
                    continue;
                }

                /*
                 * We must be sure that the current type of lexeme is
                 * recognized by our dictionary: just check whether it exists
                 * in this type's list of dictionaries.
                 */
                for (i = 0; i < map->len && !dictExists; i++)
                    if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
                        dictExists = true;

                if (!dictExists)
                {
                    /*
                     * The dictionary can't work with the current type of
                     * lexeme, so return to basic mode and redo all stored
                     * lexemes
                     */
                    ld->curDictId = InvalidOid;
                    return LexizeExec(ld, correspondLexem);
                }
            }

            ld->dictState.isend = (curVal->type == 0) ? true : false;
            ld->dictState.getnext = false;

            res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                             &(dict->lexize),
                                                             PointerGetDatum(dict->dictData),
                                                             PointerGetDatum(curVal->lemm),
                                                             Int32GetDatum(curVal->lenlemm),
                                                             PointerGetDatum(&ld->dictState)
                                                             ));

            if (ld->dictState.getnext)
            {
                /* Dictionary wants one more */
                ld->curSub = curVal->next;
                if (res)
                    setNewTmpRes(ld, curVal, res);
                continue;
            }

            if (res || ld->tmpRes)
            {
                /*
                 * The dictionary normalizes lexemes, so we remove all used
                 * lexemes from the stack, return to basic mode and redo the
                 * rest of the stack (if any)
                 */
                if (res)
                {
                    moveToWaste(ld, ld->curSub);
                }
                else
                {
                    res = ld->tmpRes;
                    moveToWaste(ld, ld->lastRes);
                }

                /* reset to initial state */
                ld->curDictId = InvalidOid;
                ld->posDict = 0;
                ld->lastRes = NULL;
                ld->tmpRes = NULL;
                setCorrLex(ld, correspondLexem);
                return res;
            }

            /*
             * The dictionary doesn't want the next lexeme and didn't
             * recognize anything, so redo from ld->towork.head
             */
            ld->curDictId = InvalidOid;
            return LexizeExec(ld, correspondLexem);
        }
    }

    setCorrLex(ld, correspondLexem);
    return NULL;
}

/*
 * Parse string and lexize words.
 *
 * prs will be filled in.
 */
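/*
 * Note: the caller is expected to have set up prs (in particular prs->words
 * and prs->lenwords); the words array is enlarged here as needed.  This is
 * the code path used, for example, by to_tsvector().
 */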
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
    int         type,
                lenlemm;
    char       *lemm = NULL;
    LexizeData  ldata;
    TSLexeme   *norms;
    TSConfigCacheEntry *cfg;
    TSParserCacheEntry *prsobj;
    void       *prsdata;

    cfg = lookup_ts_config_cache(cfgId);
    prsobj = lookup_ts_parser_cache(cfg->prsId);

    prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        while ((norms = LexizeExec(&ldata, NULL)) != NULL)
        {
            TSLexeme   *ptr = norms;

            prs->pos++;         /* set pos */

            while (ptr->lexeme)
            {
                if (prs->curwords == prs->lenwords)
                {
                    prs->lenwords *= 2;
                    prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
                }

                if (ptr->flags & TSL_ADDPOS)
                    prs->pos++;
                prs->words[prs->curwords].len = strlen(ptr->lexeme);
                prs->words[prs->curwords].word = ptr->lexeme;
                prs->words[prs->curwords].nvariant = ptr->nvariant;
                prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
                prs->words[prs->curwords].alen = 0;
                prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
                ptr++;
                prs->curwords++;
            }
            pfree(norms);
        }
    } while (type > 0);

    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Headline framework
 */
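
/*
 * Append one raw token to the headline word array, growing it as needed.
 */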
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
    while (prs->curwords >= prs->lenwords)
    {
        prs->lenwords *= 2;
        prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
    }
    memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
    prs->words[prs->curwords].type = (uint8) type;
    prs->words[prs->curwords].len = buflen;
    prs->words[prs->curwords].word = palloc(buflen);
    memcpy(prs->words[prs->curwords].word, buf, buflen);
    prs->curwords++;
}
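
/*
 * Check whether a normalized lexeme matches any QI_VAL item of the query; if
 * so, link the most recently added headline word to that query operand
 * (duplicating the word entry when it matches more than one operand).
 */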
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
    int         i;
    QueryItem  *item = GETQUERY(query);
    HeadlineWordEntry *word;

    while (prs->curwords + query->size >= prs->lenwords)
    {
        prs->lenwords *= 2;
        prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
    }

    word = &(prs->words[prs->curwords - 1]);
    word->pos = LIMITPOS(pos);
    for (i = 0; i < query->size; i++)
    {
        if (item->type == QI_VAL &&
            tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
                            buf, buflen, item->qoperand.prefix) == 0)
        {
            if (word->item)
            {
                memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
                prs->words[prs->curwords].item = &item->qoperand;
                prs->words[prs->curwords].repeated = 1;
                prs->curwords++;
            }
            else
                word->item = &item->qoperand;
        }
        item++;
    }
}
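
/*
 * Add a chain of raw tokens and their normalized lexemes to the headline
 * structure: each non-zero-type token is appended verbatim and the
 * normalized lexemes are matched against the query; both the token list and
 * the lexeme array are freed afterwards.
 */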
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
    ParsedLex  *tmplexs;
    TSLexeme   *ptr;
    int32       savedpos;

    while (lexs)
    {
        if (lexs->type > 0)
            hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

        ptr = norms;
        savedpos = prs->vectorpos;
        while (ptr && ptr->lexeme)
        {
            if (ptr->flags & TSL_ADDPOS)
                savedpos++;
            hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
            ptr++;
        }

        tmplexs = lexs->next;
        pfree(lexs);
        lexs = tmplexs;
    }

    if (norms)
    {
        ptr = norms;
        while (ptr->lexeme)
        {
            if (ptr->flags & TSL_ADDPOS)
                prs->vectorpos++;
            pfree(ptr->lexeme);
            ptr++;
        }
        pfree(norms);
    }
}
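
/*
 * Parse and lexize text for headline generation: like parsetext(), but every
 * raw token is kept (together with its query matches) so that the original
 * text can be reproduced around the matches by generateHeadline().
 */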
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
    int         type,
                lenlemm;
    char       *lemm = NULL;
    LexizeData  ldata;
    TSLexeme   *norms;
    ParsedLex  *lexs;
    TSConfigCacheEntry *cfg;
    TSParserCacheEntry *prsobj;
    void       *prsdata;

    cfg = lookup_ts_config_cache(cfgId);
    prsobj = lookup_ts_parser_cache(cfg->prsId);

    prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
                                                     PointerGetDatum(buf),
                                                     Int32GetDatum(buflen)));

    LexizeInit(&ldata, cfg);

    do
    {
        type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                           PointerGetDatum(prsdata),
                                           PointerGetDatum(&lemm),
                                           PointerGetDatum(&lenlemm)));

        if (type > 0 && lenlemm >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long to be indexed"),
                     errdetail("Words longer than %d characters are ignored.",
                               MAXSTRLEN)));
#endif
        }

        LexizeAddLemm(&ldata, type, lemm, lenlemm);

        do
        {
            if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
            {
                prs->vectorpos++;
                addHLParsedLex(prs, query, lexs, norms);
            }
            else
                addHLParsedLex(prs, query, lexs, NULL);
        } while (norms);
    } while (type > 0);

    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
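
/*
 * Assemble the final headline text from the marked-up word array: selected
 * words are wrapped in startsel/stopsel, fragments are joined with
 * fragdelim, and words outside any fragment are dropped.
 */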
text *
generateHeadline(HeadlineParsedText *prs)
{
    text       *out;
    char       *ptr;
    int         len = 128;
    int         numfragments = 0;
    int16       infrag = 0;
    HeadlineWordEntry *wrd = prs->words;

    out = (text *) palloc(len);
    ptr = ((char *) out) + VARHDRSZ;

    while (wrd - prs->words < prs->curwords)
    {
        while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
        {
            int         dist = ptr - ((char *) out);

            len *= 2;
            out = (text *) repalloc(out, len);
            ptr = ((char *) out) + dist;
        }

        if (wrd->in && !wrd->repeated)
        {
            if (!infrag)
            {
                /* start of a new fragment */
                infrag = 1;
                numfragments++;
                /* add a fragment delimiter if this is after the first one */
                if (numfragments > 1)
                {
                    memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
                    ptr += prs->fragdelimlen;
                }
            }
            if (wrd->replace)
            {
                *ptr = ' ';
                ptr++;
            }
            else if (!wrd->skip)
            {
                if (wrd->selected)
                {
                    memcpy(ptr, prs->startsel, prs->startsellen);
                    ptr += prs->startsellen;
                }
                memcpy(ptr, wrd->word, wrd->len);
                ptr += wrd->len;
                if (wrd->selected)
                {
                    memcpy(ptr, prs->stopsel, prs->stopsellen);
                    ptr += prs->stopsellen;
                }
            }
        }
        else if (!wrd->repeated)
        {
            if (infrag)
                infrag = 0;
            pfree(wrd->word);
        }

        wrd++;
    }

    SET_VARSIZE(out, ptr - ((char *) out));
    return out;
}