mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-09-19 03:00:09 +02:00
382ceffdf7
Don't move parenthesized lines to the left, even if that means they flow past the right margin. By default, BSD indent lines up statement continuation lines that are within parentheses so that they start just to the right of the preceding left parenthesis. However, traditionally, if that resulted in the continuation line extending to the right of the desired right margin, then indent would push it left just far enough to not overrun the margin, if it could do so without making the continuation line start to the left of the current statement indent. That makes for a weird mix of indentations unless one has been completely rigid about never violating the 80-column limit. This behavior has been pretty universally panned by Postgres developers. Hence, disable it with indent's new -lpl switch, so that parenthesized lines are always lined up with the preceding left paren. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
672 lines
14 KiB
C
672 lines
14 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* ts_parse.c
|
|
* main parse functions for tsearch
|
|
*
|
|
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/tsearch/ts_parse.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "tsearch/ts_cache.h"
|
|
#include "tsearch/ts_utils.h"
|
|
|
|
#define IGNORE_LONGLEXEME 1
|
|
|
|
/*
|
|
* Lexize subsystem
|
|
*/
|
|
|
|
/*
 * One token handed to the lexize machinery, kept as a node of a singly
 * linked list (see ListParsedLex).
 */
typedef struct ParsedLex
{
	int			type;			/* token type number; 0 is treated specially
								 * by LexizeExec */
	char	   *lemm;			/* token text; its length is in lenlemm */
	int			lenlemm;		/* length of lemm */
	struct ParsedLex *next;		/* following node, or NULL at list end */
} ParsedLex;
|
|
|
|
typedef struct ListParsedLex
|
|
{
|
|
ParsedLex *head;
|
|
ParsedLex *tail;
|
|
} ListParsedLex;
|
|
|
|
typedef struct
|
|
{
|
|
TSConfigCacheEntry *cfg;
|
|
Oid curDictId;
|
|
int posDict;
|
|
DictSubState dictState;
|
|
ParsedLex *curSub;
|
|
ListParsedLex towork; /* current list to work */
|
|
ListParsedLex waste; /* list of lexemes that already lexized */
|
|
|
|
/*
|
|
* fields to store last variant to lexize (basically, thesaurus or similar
|
|
* to, which wants several lexemes
|
|
*/
|
|
|
|
ParsedLex *lastRes;
|
|
TSLexeme *tmpRes;
|
|
} LexizeData;
|
|
|
|
static void
|
|
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
|
|
{
|
|
ld->cfg = cfg;
|
|
ld->curDictId = InvalidOid;
|
|
ld->posDict = 0;
|
|
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
ld->lastRes = NULL;
|
|
ld->tmpRes = NULL;
|
|
}
|
|
|
|
static void
|
|
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
|
|
{
|
|
if (list->tail)
|
|
{
|
|
list->tail->next = newpl;
|
|
list->tail = newpl;
|
|
}
|
|
else
|
|
list->head = list->tail = newpl;
|
|
newpl->next = NULL;
|
|
}
|
|
|
|
static ParsedLex *
|
|
LPLRemoveHead(ListParsedLex *list)
|
|
{
|
|
ParsedLex *res = list->head;
|
|
|
|
if (list->head)
|
|
list->head = list->head->next;
|
|
|
|
if (list->head == NULL)
|
|
list->tail = NULL;
|
|
|
|
return res;
|
|
}
|
|
|
|
static void
|
|
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
|
|
{
|
|
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
|
|
|
newpl->type = type;
|
|
newpl->lemm = lemm;
|
|
newpl->lenlemm = lenlemm;
|
|
LPLAddTail(&ld->towork, newpl);
|
|
ld->curSub = ld->towork.tail;
|
|
}
|
|
|
|
static void
|
|
RemoveHead(LexizeData *ld)
|
|
{
|
|
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
|
|
|
|
ld->posDict = 0;
|
|
}
|
|
|
|
static void
|
|
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
|
|
{
|
|
if (correspondLexem)
|
|
{
|
|
*correspondLexem = ld->waste.head;
|
|
}
|
|
else
|
|
{
|
|
ParsedLex *tmp,
|
|
*ptr = ld->waste.head;
|
|
|
|
while (ptr)
|
|
{
|
|
tmp = ptr->next;
|
|
pfree(ptr);
|
|
ptr = tmp;
|
|
}
|
|
}
|
|
ld->waste.head = ld->waste.tail = NULL;
|
|
}
|
|
|
|
static void
|
|
moveToWaste(LexizeData *ld, ParsedLex *stop)
|
|
{
|
|
bool go = true;
|
|
|
|
while (ld->towork.head && go)
|
|
{
|
|
if (ld->towork.head == stop)
|
|
{
|
|
ld->curSub = stop->next;
|
|
go = false;
|
|
}
|
|
RemoveHead(ld);
|
|
}
|
|
}
|
|
|
|
static void
|
|
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
|
|
{
|
|
if (ld->tmpRes)
|
|
{
|
|
TSLexeme *ptr;
|
|
|
|
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
|
|
pfree(ptr->lexeme);
|
|
pfree(ld->tmpRes);
|
|
}
|
|
ld->tmpRes = res;
|
|
ld->lastRes = lex;
|
|
}
|
|
|
|
static TSLexeme *
|
|
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
|
|
{
|
|
int i;
|
|
ListDictionary *map;
|
|
TSDictionaryCacheEntry *dict;
|
|
TSLexeme *res;
|
|
|
|
if (ld->curDictId == InvalidOid)
|
|
{
|
|
/*
|
|
* usual mode: dictionary wants only one word, but we should keep in
|
|
* mind that we should go through all stack
|
|
*/
|
|
|
|
while (ld->towork.head)
|
|
{
|
|
ParsedLex *curVal = ld->towork.head;
|
|
char *curValLemm = curVal->lemm;
|
|
int curValLenLemm = curVal->lenlemm;
|
|
|
|
map = ld->cfg->map + curVal->type;
|
|
|
|
if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
|
|
{
|
|
/* skip this type of lexeme */
|
|
RemoveHead(ld);
|
|
continue;
|
|
}
|
|
|
|
for (i = ld->posDict; i < map->len; i++)
|
|
{
|
|
dict = lookup_ts_dictionary_cache(map->dictIds[i]);
|
|
|
|
ld->dictState.isend = ld->dictState.getnext = false;
|
|
ld->dictState.private_state = NULL;
|
|
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
|
&(dict->lexize),
|
|
PointerGetDatum(dict->dictData),
|
|
PointerGetDatum(curValLemm),
|
|
Int32GetDatum(curValLenLemm),
|
|
PointerGetDatum(&ld->dictState)
|
|
));
|
|
|
|
if (ld->dictState.getnext)
|
|
{
|
|
/*
|
|
* dictionary wants next word, so setup and store current
|
|
* position and go to multiword mode
|
|
*/
|
|
|
|
ld->curDictId = DatumGetObjectId(map->dictIds[i]);
|
|
ld->posDict = i + 1;
|
|
ld->curSub = curVal->next;
|
|
if (res)
|
|
setNewTmpRes(ld, curVal, res);
|
|
return LexizeExec(ld, correspondLexem);
|
|
}
|
|
|
|
if (!res) /* dictionary doesn't know this lexeme */
|
|
continue;
|
|
|
|
if (res->flags & TSL_FILTER)
|
|
{
|
|
curValLemm = res->lexeme;
|
|
curValLenLemm = strlen(res->lexeme);
|
|
continue;
|
|
}
|
|
|
|
RemoveHead(ld);
|
|
setCorrLex(ld, correspondLexem);
|
|
return res;
|
|
}
|
|
|
|
RemoveHead(ld);
|
|
}
|
|
}
|
|
else
|
|
{ /* curDictId is valid */
|
|
dict = lookup_ts_dictionary_cache(ld->curDictId);
|
|
|
|
/*
|
|
* Dictionary ld->curDictId asks us about following words
|
|
*/
|
|
|
|
while (ld->curSub)
|
|
{
|
|
ParsedLex *curVal = ld->curSub;
|
|
|
|
map = ld->cfg->map + curVal->type;
|
|
|
|
if (curVal->type != 0)
|
|
{
|
|
bool dictExists = false;
|
|
|
|
if (curVal->type >= ld->cfg->lenmap || map->len == 0)
|
|
{
|
|
/* skip this type of lexeme */
|
|
ld->curSub = curVal->next;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* We should be sure that current type of lexeme is recognized
|
|
* by our dictionary: we just check is it exist in list of
|
|
* dictionaries ?
|
|
*/
|
|
for (i = 0; i < map->len && !dictExists; i++)
|
|
if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
|
|
dictExists = true;
|
|
|
|
if (!dictExists)
|
|
{
|
|
/*
|
|
* Dictionary can't work with current tpe of lexeme,
|
|
* return to basic mode and redo all stored lexemes
|
|
*/
|
|
ld->curDictId = InvalidOid;
|
|
return LexizeExec(ld, correspondLexem);
|
|
}
|
|
}
|
|
|
|
ld->dictState.isend = (curVal->type == 0) ? true : false;
|
|
ld->dictState.getnext = false;
|
|
|
|
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
|
&(dict->lexize),
|
|
PointerGetDatum(dict->dictData),
|
|
PointerGetDatum(curVal->lemm),
|
|
Int32GetDatum(curVal->lenlemm),
|
|
PointerGetDatum(&ld->dictState)
|
|
));
|
|
|
|
if (ld->dictState.getnext)
|
|
{
|
|
/* Dictionary wants one more */
|
|
ld->curSub = curVal->next;
|
|
if (res)
|
|
setNewTmpRes(ld, curVal, res);
|
|
continue;
|
|
}
|
|
|
|
if (res || ld->tmpRes)
|
|
{
|
|
/*
|
|
* Dictionary normalizes lexemes, so we remove from stack all
|
|
* used lexemes, return to basic mode and redo end of stack
|
|
* (if it exists)
|
|
*/
|
|
if (res)
|
|
{
|
|
moveToWaste(ld, ld->curSub);
|
|
}
|
|
else
|
|
{
|
|
res = ld->tmpRes;
|
|
moveToWaste(ld, ld->lastRes);
|
|
}
|
|
|
|
/* reset to initial state */
|
|
ld->curDictId = InvalidOid;
|
|
ld->posDict = 0;
|
|
ld->lastRes = NULL;
|
|
ld->tmpRes = NULL;
|
|
setCorrLex(ld, correspondLexem);
|
|
return res;
|
|
}
|
|
|
|
/*
|
|
* Dict don't want next lexem and didn't recognize anything, redo
|
|
* from ld->towork.head
|
|
*/
|
|
ld->curDictId = InvalidOid;
|
|
return LexizeExec(ld, correspondLexem);
|
|
}
|
|
}
|
|
|
|
setCorrLex(ld, correspondLexem);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Parse string and lexize words.
|
|
*
|
|
* prs will be filled in.
|
|
*/
|
|
void
|
|
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
|
|
{
|
|
int type,
|
|
lenlemm;
|
|
char *lemm = NULL;
|
|
LexizeData ldata;
|
|
TSLexeme *norms;
|
|
TSConfigCacheEntry *cfg;
|
|
TSParserCacheEntry *prsobj;
|
|
void *prsdata;
|
|
|
|
cfg = lookup_ts_config_cache(cfgId);
|
|
prsobj = lookup_ts_parser_cache(cfg->prsId);
|
|
|
|
prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
|
|
PointerGetDatum(buf),
|
|
Int32GetDatum(buflen)));
|
|
|
|
LexizeInit(&ldata, cfg);
|
|
|
|
do
|
|
{
|
|
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
|
|
PointerGetDatum(prsdata),
|
|
PointerGetDatum(&lemm),
|
|
PointerGetDatum(&lenlemm)));
|
|
|
|
if (type > 0 && lenlemm >= MAXSTRLEN)
|
|
{
|
|
#ifdef IGNORE_LONGLEXEME
|
|
ereport(NOTICE,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("word is too long to be indexed"),
|
|
errdetail("Words longer than %d characters are ignored.",
|
|
MAXSTRLEN)));
|
|
continue;
|
|
#else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("word is too long to be indexed"),
|
|
errdetail("Words longer than %d characters are ignored.",
|
|
MAXSTRLEN)));
|
|
#endif
|
|
}
|
|
|
|
LexizeAddLemm(&ldata, type, lemm, lenlemm);
|
|
|
|
while ((norms = LexizeExec(&ldata, NULL)) != NULL)
|
|
{
|
|
TSLexeme *ptr = norms;
|
|
|
|
prs->pos++; /* set pos */
|
|
|
|
while (ptr->lexeme)
|
|
{
|
|
if (prs->curwords == prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
|
|
}
|
|
|
|
if (ptr->flags & TSL_ADDPOS)
|
|
prs->pos++;
|
|
prs->words[prs->curwords].len = strlen(ptr->lexeme);
|
|
prs->words[prs->curwords].word = ptr->lexeme;
|
|
prs->words[prs->curwords].nvariant = ptr->nvariant;
|
|
prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
|
|
prs->words[prs->curwords].alen = 0;
|
|
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
|
|
ptr++;
|
|
prs->curwords++;
|
|
}
|
|
pfree(norms);
|
|
}
|
|
} while (type > 0);
|
|
|
|
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
|
|
}
|
|
|
|
/*
|
|
* Headline framework
|
|
*/
|
|
static void
|
|
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
|
|
{
|
|
while (prs->curwords >= prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
|
|
}
|
|
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
|
|
prs->words[prs->curwords].type = (uint8) type;
|
|
prs->words[prs->curwords].len = buflen;
|
|
prs->words[prs->curwords].word = palloc(buflen);
|
|
memcpy(prs->words[prs->curwords].word, buf, buflen);
|
|
prs->curwords++;
|
|
}
|
|
|
|
static void
|
|
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
|
|
{
|
|
int i;
|
|
QueryItem *item = GETQUERY(query);
|
|
HeadlineWordEntry *word;
|
|
|
|
while (prs->curwords + query->size >= prs->lenwords)
|
|
{
|
|
prs->lenwords *= 2;
|
|
prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
|
|
}
|
|
|
|
word = &(prs->words[prs->curwords - 1]);
|
|
word->pos = LIMITPOS(pos);
|
|
for (i = 0; i < query->size; i++)
|
|
{
|
|
if (item->type == QI_VAL &&
|
|
tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
|
|
buf, buflen, item->qoperand.prefix) == 0)
|
|
{
|
|
if (word->item)
|
|
{
|
|
memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
|
|
prs->words[prs->curwords].item = &item->qoperand;
|
|
prs->words[prs->curwords].repeated = 1;
|
|
prs->curwords++;
|
|
}
|
|
else
|
|
word->item = &item->qoperand;
|
|
}
|
|
item++;
|
|
}
|
|
}
|
|
|
|
static void
|
|
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
|
|
{
|
|
ParsedLex *tmplexs;
|
|
TSLexeme *ptr;
|
|
int32 savedpos;
|
|
|
|
while (lexs)
|
|
{
|
|
if (lexs->type > 0)
|
|
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
|
|
|
|
ptr = norms;
|
|
savedpos = prs->vectorpos;
|
|
while (ptr && ptr->lexeme)
|
|
{
|
|
if (ptr->flags & TSL_ADDPOS)
|
|
savedpos++;
|
|
hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
|
|
ptr++;
|
|
}
|
|
|
|
tmplexs = lexs->next;
|
|
pfree(lexs);
|
|
lexs = tmplexs;
|
|
}
|
|
|
|
if (norms)
|
|
{
|
|
ptr = norms;
|
|
while (ptr->lexeme)
|
|
{
|
|
if (ptr->flags & TSL_ADDPOS)
|
|
prs->vectorpos++;
|
|
pfree(ptr->lexeme);
|
|
ptr++;
|
|
}
|
|
pfree(norms);
|
|
}
|
|
}
|
|
|
|
void
|
|
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
|
|
{
|
|
int type,
|
|
lenlemm;
|
|
char *lemm = NULL;
|
|
LexizeData ldata;
|
|
TSLexeme *norms;
|
|
ParsedLex *lexs;
|
|
TSConfigCacheEntry *cfg;
|
|
TSParserCacheEntry *prsobj;
|
|
void *prsdata;
|
|
|
|
cfg = lookup_ts_config_cache(cfgId);
|
|
prsobj = lookup_ts_parser_cache(cfg->prsId);
|
|
|
|
prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
|
|
PointerGetDatum(buf),
|
|
Int32GetDatum(buflen)));
|
|
|
|
LexizeInit(&ldata, cfg);
|
|
|
|
do
|
|
{
|
|
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
|
|
PointerGetDatum(prsdata),
|
|
PointerGetDatum(&lemm),
|
|
PointerGetDatum(&lenlemm)));
|
|
|
|
if (type > 0 && lenlemm >= MAXSTRLEN)
|
|
{
|
|
#ifdef IGNORE_LONGLEXEME
|
|
ereport(NOTICE,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("word is too long to be indexed"),
|
|
errdetail("Words longer than %d characters are ignored.",
|
|
MAXSTRLEN)));
|
|
continue;
|
|
#else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("word is too long to be indexed"),
|
|
errdetail("Words longer than %d characters are ignored.",
|
|
MAXSTRLEN)));
|
|
#endif
|
|
}
|
|
|
|
LexizeAddLemm(&ldata, type, lemm, lenlemm);
|
|
|
|
do
|
|
{
|
|
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
|
|
{
|
|
prs->vectorpos++;
|
|
addHLParsedLex(prs, query, lexs, norms);
|
|
}
|
|
else
|
|
addHLParsedLex(prs, query, lexs, NULL);
|
|
} while (norms);
|
|
|
|
} while (type > 0);
|
|
|
|
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
|
|
}
|
|
|
|
text *
|
|
generateHeadline(HeadlineParsedText *prs)
|
|
{
|
|
text *out;
|
|
char *ptr;
|
|
int len = 128;
|
|
int numfragments = 0;
|
|
int16 infrag = 0;
|
|
|
|
HeadlineWordEntry *wrd = prs->words;
|
|
|
|
out = (text *) palloc(len);
|
|
ptr = ((char *) out) + VARHDRSZ;
|
|
|
|
while (wrd - prs->words < prs->curwords)
|
|
{
|
|
while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
|
|
{
|
|
int dist = ptr - ((char *) out);
|
|
|
|
len *= 2;
|
|
out = (text *) repalloc(out, len);
|
|
ptr = ((char *) out) + dist;
|
|
}
|
|
|
|
if (wrd->in && !wrd->repeated)
|
|
{
|
|
if (!infrag)
|
|
{
|
|
|
|
/* start of a new fragment */
|
|
infrag = 1;
|
|
numfragments++;
|
|
/* add a fragment delimiter if this is after the first one */
|
|
if (numfragments > 1)
|
|
{
|
|
memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
|
|
ptr += prs->fragdelimlen;
|
|
}
|
|
|
|
}
|
|
if (wrd->replace)
|
|
{
|
|
*ptr = ' ';
|
|
ptr++;
|
|
}
|
|
else if (!wrd->skip)
|
|
{
|
|
if (wrd->selected)
|
|
{
|
|
memcpy(ptr, prs->startsel, prs->startsellen);
|
|
ptr += prs->startsellen;
|
|
}
|
|
memcpy(ptr, wrd->word, wrd->len);
|
|
ptr += wrd->len;
|
|
if (wrd->selected)
|
|
{
|
|
memcpy(ptr, prs->stopsel, prs->stopsellen);
|
|
ptr += prs->stopsellen;
|
|
}
|
|
}
|
|
}
|
|
else if (!wrd->repeated)
|
|
{
|
|
if (infrag)
|
|
infrag = 0;
|
|
pfree(wrd->word);
|
|
}
|
|
|
|
wrd++;
|
|
}
|
|
|
|
SET_VARSIZE(out, ptr - ((char *) out));
|
|
return out;
|
|
}
|