postgresql/src/backend/utils/adt/tsquery.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1350 lines
29 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* tsquery.c
* I/O functions for tsquery
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsquery.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/pg_crc.h"
/*
 * FTS operator priorities, see ts_type.h.
 *
 * One entry per operator, in the order shown by the per-line comments.
 * A larger number means the operator binds more tightly: OR (1) is the
 * loosest and NOT (4) the tightest.
 * NOTE(review): presumably read through the OP_PRIORITY()/QO_PRIORITY()
 * macros used elsewhere in this file -- confirm the indexing in ts_type.h.
 */
const int tsearch_op_priority[OP_COUNT] =
{
4, /* OP_NOT */
2, /* OP_AND */
1, /* OP_OR */
3 /* OP_PHRASE */
};
/*
 * States of the tsquery tokenizers.  Numbering starts at 1 and continues
 * sequentially.
 */
typedef enum
{
    WAITOPERAND = 1,            /* an operand is expected next */
    WAITOPERATOR,               /* 2: an operator is expected next */
    WAITFIRSTOPERAND            /* 3: initial state, nothing read yet */
} ts_parserstate;
/*
 * Kinds of token a tokenizer can hand back to the parser.  Values are
 * sequential, starting at 0.
 */
typedef enum
{
    PT_END = 0,                 /* end of input */
    PT_ERR,                     /* 1: syntax error */
    PT_VAL,                     /* 2: operand (value) */
    PT_OPR,                     /* 3: operator */
    PT_OPEN,                    /* 4: opening parenthesis */
    PT_CLOSE                    /* 5: closing parenthesis */
} ts_tokentype;
/*
 * Signature of a function that gets the next token from the query string.
 *
 * *operator is filled in with OP_* when the return value is PT_OPR; for the
 * phrase operator, *weight is additionally used to carry the distance value.
 * *strval, *lenval and *weight are filled in when the return value is PT_VAL.
 *
 * The tokenizers defined in this file also reset *weight to 0 and *prefix to
 * false at the start of every call.
 */
typedef ts_tokentype (*ts_tokenizer) (TSQueryParserState state, int8 *operator,
int *lenval, char **strval,
int16 *weight, bool *prefix);
/*
 * Working state of one parse_tsquery() run: the tokenizer in use, the scan
 * position within the input string, and the output collected so far (the
 * list of query items plus the buffer holding the operand strings).
 */
struct TSQueryParserStateData
{
/* Tokenizer used for parsing tsquery */
ts_tokenizer gettoken;
/* State of tokenizer function */
char *buffer; /* entire string we are scanning */
char *buf; /* current scan point */
int count; /* nesting count, incremented by (,
* decremented by ) */
ts_parserstate state;
/* polish (prefix) notation in list, filled in by push* functions */
List *polstr;
/*
* Strings from operands are collected in op. curop is a pointer to the
* end of used space of op.
*/
char *op;
char *curop;
int lenop; /* allocated size of op */
int sumlen; /* used size of op */
/* state for value's parser */
TSVectorParseState valstate;
};
/*
 * Read the optional modifier suffix of an operand, e.g. the ':AB*' in
 * 'foo:AB*'.
 *
 * *weight and *prefix are always reset first.  If buf does not start with
 * ':' it is returned unchanged.  Otherwise every following single-byte
 * character out of a/A, b/B, c/C, d/D (weight bits 3, 2, 1, 0) and '*'
 * (prefix flag) is consumed; scanning stops at the first other character.
 *
 * Returns a pointer to the first character that was not consumed.
 */
static char *
get_modifiers(char *buf, int16 *weight, bool *prefix)
{
    char       *scan;

    *weight = 0;
    *prefix = false;

    if (!t_iseq(buf, ':'))
        return buf;

    for (scan = buf + 1; *scan != '\0' && pg_mblen(scan) == 1; scan++)
    {
        char        flag = *scan;

        if (flag == 'a' || flag == 'A')
            *weight |= 1 << 3;
        else if (flag == 'b' || flag == 'B')
            *weight |= 1 << 2;
        else if (flag == 'c' || flag == 'C')
            *weight |= 1 << 1;
        else if (flag == 'd' || flag == 'D')
            *weight |= 1;
        else if (flag == '*')
            *prefix = true;
        else
            break;              /* not a modifier: leave it for the caller */
    }

    return scan;
}
/*
 * Try to read a phrase operator at pstate->buf.
 *
 * Accepted spellings:
 *
 *      a <N> b     (distance is exactly N lexemes)
 *      a <-> b     (default distance = 1)
 *
 * On success, *distance receives the distance, pstate->buf is moved just
 * past the closing '>' and true is returned.  On failure false is returned
 * and neither *distance nor pstate->buf is modified.  An operator that is
 * directly followed by the end of the string is not accepted.
 *
 * A numeric distance outside 0..MAXENTRYPOS raises an error.
 */
static bool
parse_phrase_operator(TSQueryParserState pstate, int16 *distance)
{
    char       *scan = pstate->buf;
    long        dist = 1;       /* default distance */

    /* opening '<' */
    if (*scan == '\0' || !t_iseq(scan, '<'))
        return false;
    scan++;

    /* either '-' or a decimal number */
    if (*scan == '\0')
        return false;

    if (t_iseq(scan, '-'))
        scan++;
    else
    {
        char       *numend;

        if (!t_isdigit(scan))
            return false;

        errno = 0;
        dist = strtol(scan, &numend, 10);
        if (scan == numend)
            return false;
        if (errno == ERANGE || dist < 0 || dist > MAXENTRYPOS)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("distance in phrase operator must be an integer value between zero and %d inclusive",
                            MAXENTRYPOS)));
        scan = numend;
    }

    /* closing '>' */
    if (*scan == '\0' || !t_iseq(scan, '>'))
        return false;
    scan++;

    /* the operator must be followed by something */
    if (*scan == '\0')
        return false;

    *distance = (int16) dist;
    pstate->buf = scan;
    return true;
}
/*
 * Decide whether the text at pstate->buf is an "OR" operator, as used by
 * websearch_to_tsquery().
 *
 * The literal "or" (any case) counts as an operator only if it is not the
 * start of a longer word and some non-space text follows it later in the
 * string.  On success pstate->buf is advanced past the two letters and true
 * is returned; otherwise pstate->buf is left alone and false is returned.
 */
static bool
parse_or_operator(TSQueryParserState pstate)
{
    char       *scan = pstate->buf;

    /* must start with the literal itself */
    if (pg_strncasecmp(scan, "or", 2) != 0)
        return false;
    scan += 2;

    /* nothing after it: there can be no operand */
    if (*scan == '\0')
        return false;

    /* it must not continue as a word */
    if (t_iseq(scan, '-') || t_iseq(scan, '_') ||
        t_isalpha(scan) || t_isdigit(scan))
        return false;

    /*
     * Step over the delimiter and any following whitespace.  Whatever
     * non-space character comes next is taken to be the operand, even if it
     * later turns out not to be a valid one; in that case "OR" is still
     * treated as an operator rather than searched for as a lexeme.
     */
    do
    {
        scan += pg_mblen(scan);
        if (*scan == '\0')      /* end of string without operand */
            return false;
    } while (t_isspace(scan));

    pstate->buf += 2;
    return true;
}
/*
 * Tokenizer for the standard tsquery syntax (the one parse_tsquery selects
 * when neither P_TSQ_PLAIN nor P_TSQ_WEB is set).
 *
 * While an operand is expected it recognizes '!' (OP_NOT), '(' and operand
 * values; a ':' in that position is a syntax error.  While an operator is
 * expected it recognizes '&', '|', the phrase operator, ')' and the end of
 * the string.  Whitespace between tokens is skipped.
 *
 * Returns the token type; see the ts_tokenizer typedef for how the output
 * parameters are filled in.  PT_ERR is returned for unbalanced parentheses
 * or an unexpected character in operator position.
 */
static ts_tokentype
gettoken_query_standard(TSQueryParserState state, int8 *operator,
int *lenval, char **strval,
int16 *weight, bool *prefix)
{
*weight = 0;
*prefix = false;
while (true)
{
switch (state->state)
{
case WAITFIRSTOPERAND:
case WAITOPERAND:
if (t_iseq(state->buf, '!'))
{
state->buf++;
state->state = WAITOPERAND;
*operator = OP_NOT;
return PT_OPR;
}
else if (t_iseq(state->buf, '('))
{
state->buf++;
state->state = WAITOPERAND;
state->count++;
return PT_OPEN;
}
else if (t_iseq(state->buf, ':'))
{
/* modifiers without a preceding operand are not allowed */
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in tsquery: \"%s\"",
state->buffer)));
}
else if (!t_isspace(state->buf))
{
/*
* We rely on the tsvector parser to parse the value for
* us
*/
reset_tsvector_parser(state->valstate, state->buf);
if (gettoken_tsvector(state->valstate, strval, lenval,
NULL, NULL, &state->buf))
{
/* pick up an optional ':AB*'-style suffix */
state->buf = get_modifiers(state->buf, weight, prefix);
state->state = WAITOPERATOR;
return PT_VAL;
}
else if (state->state == WAITFIRSTOPERAND)
{
/* no value found and nothing parsed so far: query is empty */
return PT_END;
}
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("no operand in tsquery: \"%s\"",
state->buffer)));
}
break;
case WAITOPERATOR:
if (t_iseq(state->buf, '&'))
{
state->buf++;
state->state = WAITOPERAND;
*operator = OP_AND;
return PT_OPR;
}
else if (t_iseq(state->buf, '|'))
{
state->buf++;
state->state = WAITOPERAND;
*operator = OP_OR;
return PT_OPR;
}
else if (parse_phrase_operator(state, weight))
{
/* weight var is used as storage for distance */
state->state = WAITOPERAND;
*operator = OP_PHRASE;
return PT_OPR;
}
else if (t_iseq(state->buf, ')'))
{
state->buf++;
state->count--;
/* more ')' than '(' so far is an error */
return (state->count < 0) ? PT_ERR : PT_CLOSE;
}
else if (*state->buf == '\0')
{
/* end of input with '(' still open is an error */
return (state->count) ? PT_ERR : PT_END;
}
else if (!t_isspace(state->buf))
{
return PT_ERR;
}
break;
}
/* only whitespace gets here: step over one (possibly multibyte) character */
state->buf += pg_mblen(state->buf);
}
}
/*
 * Tokenizer for the websearch syntax (selected by parse_tsquery when
 * P_TSQ_WEB is set).
 *
 * While an operand is expected: '-' yields OP_NOT, a double-quoted run is
 * returned as one PT_VAL token, characters matching ISOPERATOR() are
 * silently skipped, and anything else non-blank is handed to the tsvector
 * parser.  While an operator is expected: "OR" (see parse_or_operator)
 * yields OP_OR, and any other non-blank text yields an implicit OP_AND
 * without consuming input.
 *
 * This tokenizer never returns PT_ERR, PT_OPEN or PT_CLOSE.
 */
static ts_tokentype
gettoken_query_websearch(TSQueryParserState state, int8 *operator,
int *lenval, char **strval,
int16 *weight, bool *prefix)
{
*weight = 0;
*prefix = false;
while (true)
{
switch (state->state)
{
case WAITFIRSTOPERAND:
case WAITOPERAND:
if (t_iseq(state->buf, '-'))
{
state->buf++;
state->state = WAITOPERAND;
*operator = OP_NOT;
return PT_OPR;
}
else if (t_iseq(state->buf, '"'))
{
/* Everything in quotes is processed as a single token */
/* skip opening quote */
state->buf++;
*strval = state->buf;
/* iterate to the closing quote or end of the string */
while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
state->buf++;
*lenval = state->buf - *strval;
/* skip closing quote if not end of the string */
if (*state->buf != '\0')
state->buf++;
state->state = WAITOPERATOR;
state->count++;
return PT_VAL;
}
else if (ISOPERATOR(state->buf))
{
/* or else gettoken_tsvector() will raise an error */
state->buf++;
state->state = WAITOPERAND;
/* buf was already advanced, so bypass the advance at the loop bottom */
continue;
}
else if (!t_isspace(state->buf))
{
/*
* We rely on the tsvector parser to parse the value for
* us
*/
reset_tsvector_parser(state->valstate, state->buf);
if (gettoken_tsvector(state->valstate, strval, lenval,
NULL, NULL, &state->buf))
{
state->state = WAITOPERATOR;
return PT_VAL;
}
else if (state->state == WAITFIRSTOPERAND)
{
/* no value found and nothing parsed so far: query is empty */
return PT_END;
}
else
{
/* finally, we have to provide an operand */
pushStop(state);
return PT_END;
}
}
break;
case WAITOPERATOR:
if (t_iseq(state->buf, '"'))
{
/*
* put implicit AND after an operand and handle this quote
* in WAITOPERAND
*/
state->state = WAITOPERAND;
*operator = OP_AND;
return PT_OPR;
}
else if (parse_or_operator(state))
{
state->state = WAITOPERAND;
*operator = OP_OR;
return PT_OPR;
}
else if (*state->buf == '\0')
{
return PT_END;
}
else if (!t_isspace(state->buf))
{
/* put implicit AND after an operand */
*operator = OP_AND;
state->state = WAITOPERAND;
return PT_OPR;
}
break;
}
/* only whitespace gets here: step over one (possibly multibyte) character */
state->buf += pg_mblen(state->buf);
}
}
/*
 * Tokenizer for "plain" queries (selected by parse_tsquery when P_TSQ_PLAIN
 * is set): the whole remaining input is returned as one PT_VAL token, and
 * the next call reports PT_END.  *operator is never touched.
 */
static ts_tokentype
gettoken_query_plain(TSQueryParserState state, int8 *operator,
                     int *lenval, char **strval,
                     int16 *weight, bool *prefix)
{
    char       *rest = state->buf;

    *weight = 0;
    *prefix = false;

    if (rest[0] == '\0')
        return PT_END;

    /* hand out everything that is left and move to the terminator */
    *strval = rest;
    *lenval = strlen(rest);
    state->buf = rest + *lenval;
    state->count++;

    return PT_VAL;
}
/*
 * Push an operator to state->polstr.
 *
 * oper must be one of OP_NOT, OP_AND, OP_OR or OP_PHRASE.  distance is
 * recorded only for OP_PHRASE; for every other operator 0 is stored.
 * The new item is put at the head of the list.
 */
void
pushOperator(TSQueryParserState state, int8 oper, int16 distance)
{
    QueryOperator *tmp;

    Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);

    tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
    tmp->type = QI_OPR;
    tmp->oper = oper;
    tmp->distance = (oper == OP_PHRASE) ? distance : 0;
    /* left is filled in later with findoprnd */

    state->polstr = lcons(tmp, state->polstr);
}
/*
 * Build a QI_VAL item and put it at the head of state->polstr.
 *
 * valcrc is the CRC of the operand text, distance the offset of that text
 * within the operand string buffer and lenval its length.  An error is
 * raised if the offset reaches MAXSTRPOS or the length reaches MAXSTRLEN.
 */
static void
pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight, bool prefix)
{
    QueryOperand *operand;

    if (distance >= MAXSTRPOS)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("value is too big in tsquery: \"%s\"",
                        state->buffer)));

    if (lenval >= MAXSTRLEN)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("operand is too long in tsquery: \"%s\"",
                        state->buffer)));

    operand = (QueryOperand *) palloc0(sizeof(QueryOperand));
    operand->type = QI_VAL;
    operand->distance = distance;
    operand->length = lenval;
    operand->valcrc = (int32) valcrc;
    operand->weight = weight;
    operand->prefix = prefix;

    state->polstr = lcons(operand, state->polstr);
}
/*
 * Push an operand to state->polstr.
 *
 * strval/lenval give the operand text, which need not be NUL-terminated.
 * The text is appended, followed by a NUL, to the operand string buffer
 * state->op; the pushed item records the offset at which the copy starts.
 * An error is raised if lenval reaches MAXSTRLEN.
 */
void
pushValue(TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
{
    pg_crc32    crc;

    if (lenval >= MAXSTRLEN)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("word is too long in tsquery: \"%s\"",
                        state->buffer)));

    INIT_LEGACY_CRC32(crc);
    COMP_LEGACY_CRC32(crc, strval, lenval);
    FIN_LEGACY_CRC32(crc);

    /* the item refers to the spot where the text is about to be stored */
    pushValue_internal(state, crc, state->curop - state->op, lenval, weight, prefix);

    /* double the operand buffer until text plus terminator fit */
    while ((state->curop - state->op) + lenval + 1 >= state->lenop)
    {
        int         offset = state->curop - state->op;

        state->lenop *= 2;
        state->op = (char *) repalloc(state->op, state->lenop);
        state->curop = state->op + offset;
    }

    /* store the text with a terminating NUL and account for both */
    memcpy(state->curop, strval, lenval);
    state->curop[lenval] = '\0';
    state->curop += lenval + 1;
    state->sumlen += lenval + 1;
}
/*
 * Push a stopword placeholder (a zeroed item of type QI_VALSTOP) to
 * state->polstr.
 */
void
pushStop(TSQueryParserState state)
{
    QueryOperand *placeholder = (QueryOperand *) palloc0(sizeof(QueryOperand));

    placeholder->type = QI_VALSTOP;
    state->polstr = lcons(placeholder, state->polstr);
}
/* Capacity of the operator stack that makepol keeps per nesting level */
#define STACKDEPTH 32
/* One pending operator on makepol's operator stack */
typedef struct OperatorElement
{
int8 op; /* OP_* code */
int16 distance; /* value supplied with the operator; used for OP_PHRASE */
} OperatorElement;
/*
 * Put an operator on top of the operator stack.  *lenstack is the current
 * number of entries and is incremented.  A full stack is reported as an
 * internal error.
 */
static void
pushOpStack(OperatorElement *stack, int *lenstack, int8 op, int16 distance)
{
    OperatorElement *slot;

    if (*lenstack == STACKDEPTH)    /* internal error */
        elog(ERROR, "tsquery stack too small");

    slot = &stack[*lenstack];
    slot->op = op;
    slot->distance = distance;

    *lenstack += 1;
}
/*
 * Pop operators off the stack and emit them with pushOperator for as long
 * as the operator on top must be applied before the incoming operator "op".
 *
 * Popping stops once the incoming operator has a higher priority than the
 * one on top.  For OP_NOT, which is right associative unlike the others,
 * equal priority stops the popping as well.
 */
static void
cleanOpStack(TSQueryParserState state,
             OperatorElement *stack, int *lenstack, int8 op)
{
    int         incoming = OP_PRIORITY(op);

    while (*lenstack != 0)
    {
        int         top = OP_PRIORITY(stack[*lenstack - 1].op);
        bool        keep;

        if (op == OP_NOT)
            keep = (incoming >= top);
        else
            keep = (incoming > top);

        if (keep)
            break;

        (*lenstack)--;
        pushOperator(state, stack[*lenstack].op,
                     stack[*lenstack].distance);
    }
}
/*
 * Make polish (prefix) notation of query.
 *
 * Tokens are pulled from state->gettoken until the input ends or the
 * parenthesis that opened this nesting level is closed.  Operands go to
 * pushval, operators pass through a local priority stack, and an opening
 * parenthesis is handled by a recursive call.
 *
 * See parse_tsquery for explanation of pushval.
 */
static void
makepol(TSQueryParserState state,
        PushFunction pushval,
        Datum opaque)
{
    OperatorElement opstack[STACKDEPTH];
    int         lenstack = 0;
    int8        operator = 0;
    int         lenval = 0;
    char       *strval = NULL;
    int16       weight = 0;
    bool        prefix;

    /* since this function recurses, it could be driven to stack overflow */
    check_stack_depth();

    for (;;)
    {
        ts_tokentype type = state->gettoken(state, &operator,
                                            &lenval, &strval,
                                            &weight, &prefix);

        if (type == PT_END)
            break;

        if (type == PT_VAL)
        {
            pushval(opaque, state, strval, lenval, weight, prefix);
        }
        else if (type == PT_OPR)
        {
            /* for a phrase operator, weight carries the distance */
            cleanOpStack(state, opstack, &lenstack, operator);
            pushOpStack(opstack, &lenstack, operator, weight);
        }
        else if (type == PT_OPEN)
        {
            makepol(state, pushval, opaque);
        }
        else if (type == PT_CLOSE)
        {
            cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
            return;
        }
        else
        {
            /* PT_ERR or anything unexpected */
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("syntax error in tsquery: \"%s\"",
                            state->buffer)));
        }
    }

    /* input exhausted: flush whatever operators are still pending */
    cleanOpStack(state, opstack, &lenstack, OP_OR /* lowest */ );
}
/*
 * Recursive worker for findoprnd.
 *
 * Walks the subtree that starts at ptr[*pos] and leaves *pos just past it.
 * For each operator the "left" field is set: always 1 for OP_NOT, and for
 * binary operators the offset from the operator to its left argument, which
 * is stored after the whole right argument.  *needcleanup is set to true
 * when a QI_VALSTOP node is met.
 *
 * Raises an error if the array ends where an operand is still expected.
 */
static void
findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
{
    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    if (*pos >= nnodes)
        elog(ERROR, "malformed tsquery: operand not found");

    if (ptr[*pos].type == QI_VAL)
    {
        (*pos)++;
    }
    else if (ptr[*pos].type == QI_VALSTOP)
    {
        *needcleanup = true;    /* we'll have to remove stop words */
        (*pos)++;
    }
    else
    {
        Assert(ptr[*pos].type == QI_OPR);

        if (ptr[*pos].qoperator.oper == OP_NOT)
        {
            ptr[*pos].qoperator.left = 1;   /* fixed offset */
            (*pos)++;

            /* process the only argument */
            findoprnd_recurse(ptr, pos, nnodes, needcleanup);
        }
        else
        {
            QueryOperator *curitem = &ptr[*pos].qoperator;
            int         tmp = *pos; /* save current position */

            Assert(curitem->oper == OP_AND ||
                   curitem->oper == OP_OR ||
                   curitem->oper == OP_PHRASE);

            (*pos)++;

            /* process RIGHT argument */
            findoprnd_recurse(ptr, pos, nnodes, needcleanup);

            curitem->left = *pos - tmp; /* set LEFT arg's offset */

            /* process LEFT argument */
            findoprnd_recurse(ptr, pos, nnodes, needcleanup);
        }
    }
}
/*
 * Fill in the left-fields previously left unfilled.
 * The input QueryItems must be in polish (prefix) notation.
 * Also, set *needcleanup to true if there are any QI_VALSTOP nodes.
 *
 * Raises an error if the walk does not consume exactly "size" items.
 */
static void
findoprnd(QueryItem *ptr, int size, bool *needcleanup)
{
    uint32      pos = 0;

    *needcleanup = false;
    findoprnd_recurse(ptr, &pos, size, needcleanup);

    if (pos == size)
        return;

    elog(ERROR, "malformed tsquery: extra nodes");
}
/*
 * Parse the query string buf into a TSQuery.
 *
 * Each value (operand) in the query is passed to pushval. pushval can
 * transform the simple value to an arbitrarily complex expression using
 * pushValue and pushOperator. It must push a single value with pushValue,
 * a complete expression with all operands, or a stopword placeholder
 * with pushStop, otherwise the prefix notation representation will be broken,
 * having an operator with no operand.
 *
 * opaque is passed on to pushval as is, pushval can use it to store its
 * private state.
 *
 * flags selects the tokenizer: P_TSQ_PLAIN, P_TSQ_WEB, or neither for the
 * standard syntax.  The two flags must not be combined.
 *
 * If nothing was pushed, a NOTICE is issued and an empty query (size 0) is
 * returned.  An error is raised if the result would be too large.
 */
TSQuery
parse_tsquery(char *buf,
              PushFunction pushval,
              Datum opaque,
              int flags)
{
    struct TSQueryParserStateData state;
    int         i;
    TSQuery     query;
    int         commonlen;
    QueryItem  *ptr;
    ListCell   *cell;
    bool        needcleanup;
    int         tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY;

    /* plain should not be used with web */
    Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB));

    /* select suitable tokenizer */
    if (flags & P_TSQ_PLAIN)
        state.gettoken = gettoken_query_plain;
    else if (flags & P_TSQ_WEB)
    {
        state.gettoken = gettoken_query_websearch;
        tsv_flags |= P_TSV_IS_WEB;
    }
    else
        state.gettoken = gettoken_query_standard;

    /* init state */
    state.buffer = buf;
    state.buf = buf;
    state.count = 0;
    state.state = WAITFIRSTOPERAND;
    state.polstr = NIL;

    /* init value parser's state */
    state.valstate = init_tsvector_parser(state.buffer, tsv_flags);

    /* init list of operand */
    state.sumlen = 0;
    state.lenop = 64;
    state.curop = state.op = (char *) palloc(state.lenop);
    *(state.curop) = '\0';

    /* parse query & make polish notation (postfix, but in reverse order) */
    makepol(&state, pushval, opaque);

    close_tsvector_parser(state.valstate);

    if (list_length(state.polstr) == 0)
    {
        ereport(NOTICE,
                (errmsg("text-search query doesn't contain lexemes: \"%s\"",
                        state.buffer)));
        query = (TSQuery) palloc(HDRSIZETQ);
        SET_VARSIZE(query, HDRSIZETQ);
        query->size = 0;
        return query;
    }

    if (TSQUERY_TOO_BIG(list_length(state.polstr), state.sumlen))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("tsquery is too large")));
    commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen);

    /* Pack the QueryItems in the final TSQuery struct to return to caller */
    query = (TSQuery) palloc0(commonlen);
    SET_VARSIZE(query, commonlen);
    query->size = list_length(state.polstr);
    ptr = GETQUERY(query);

    /* Copy QueryItems to TSQuery */
    i = 0;
    foreach(cell, state.polstr)
    {
        QueryItem  *item = (QueryItem *) lfirst(cell);

        switch (item->type)
        {
            case QI_VAL:
                memcpy(&ptr[i], item, sizeof(QueryOperand));
                break;
            case QI_VALSTOP:
                ptr[i].type = QI_VALSTOP;
                break;
            case QI_OPR:
                memcpy(&ptr[i], item, sizeof(QueryOperator));
                break;
            default:
                elog(ERROR, "unrecognized QueryItem type: %d", item->type);
        }
        i++;
    }

    /* Copy all the operand strings to TSQuery */
    memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen);
    pfree(state.op);

    /*
     * Set left operand pointers for every operator.  While we're at it,
     * detect whether there are any QI_VALSTOP nodes.
     */
    findoprnd(ptr, query->size, &needcleanup);

    /*
     * If there are QI_VALSTOP nodes, delete them and simplify the tree.
     */
    if (needcleanup)
        query = cleanup_tsquery_stopwords(query);

    return query;
}
/*
 * pushval callback that pushes each operand unchanged with pushValue.
 * opaque is not used.
 */
static void
pushval_asis(Datum opaque, TSQueryParserState state, char *strval, int lenval,
int16 weight, bool prefix)
{
pushValue(state, strval, lenval, weight, prefix);
}
/*
* in without morphology
*/
Datum
tsqueryin(PG_FUNCTION_ARGS)
{
char *in = PG_GETARG_CSTRING(0);
PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0));
}
/*
 * out function
 *
 * INFIX is the working state used while printing a query in infix form.
 */
typedef struct
{
QueryItem *curpol; /* next query item to print */
char *buf; /* start of the output buffer */
char *cur; /* current write position within buf */
char *op; /* base of the operand strings; items index it by offset */
int buflen; /* allocated size of buf */
} INFIX;
/*
 * Makes sure inf->buf is large enough for adding 'addsize' bytes (plus a
 * terminating NUL).  The buffer is doubled until it fits, and inf->cur is
 * re-pointed into the reallocated buffer.
 *
 * Note that 'addsize' is evaluated each time the loop condition is tested,
 * and that the expansion is a bare while statement, not a do { } while (0)
 * wrapper.
 */
#define RESIZEBUF(inf, addsize) \
while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
{ \
int len = (inf)->cur - (inf)->buf; \
(inf)->buflen *= 2; \
(inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \
(inf)->cur = (inf)->buf + len; \
}
/*
 * recursively traverse the tree and
 * print it in infix (human-readable) form
 *
 * in             - printing state; the subtree starting at in->curpol is
 *                  appended at in->cur and in->curpol is advanced past it
 * parentPriority - priority of the enclosing operator; a subtree whose own
 *                  operator has lower priority is wrapped in "( ... )"
 * rightPhraseOp  - true when this subtree is the right operand of a phrase
 *                  operator; a phrase operator in that position is always
 *                  parenthesized
 *
 * For binary operators the right operand is stored first, so it is rendered
 * into a temporary buffer, then the left operand is printed, and finally the
 * operator and the saved right operand are appended.
 */
static void
infix(INFIX *in, int parentPriority, bool rightPhraseOp)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
if (in->curpol->type == QI_VAL)
{
QueryOperand *curpol = &in->curpol->qoperand;
char *op = in->op + curpol->distance;
int clen;
/*
 * Reserve room for the value plus two quotes (2) and a possible
 * ":*ABCD" suffix (6).
 * NOTE(review): the "+ 1" per character presumably covers the escape
 * byte added below for quotes and backslashes -- confirm.
 */
RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 6);
*(in->cur) = '\'';
in->cur++;
while (*op)
{
/* a quote is doubled and a backslash is doubled */
if (t_iseq(op, '\''))
{
*(in->cur) = '\'';
in->cur++;
}
else if (t_iseq(op, '\\'))
{
*(in->cur) = '\\';
in->cur++;
}
COPYCHAR(in->cur, op);
clen = pg_mblen(op);
op += clen;
in->cur += clen;
}
*(in->cur) = '\'';
in->cur++;
/* modifiers are printed as ':' then '*' (prefix) then weight letters */
if (curpol->weight || curpol->prefix)
{
*(in->cur) = ':';
in->cur++;
if (curpol->prefix)
{
*(in->cur) = '*';
in->cur++;
}
if (curpol->weight & (1 << 3))
{
*(in->cur) = 'A';
in->cur++;
}
if (curpol->weight & (1 << 2))
{
*(in->cur) = 'B';
in->cur++;
}
if (curpol->weight & (1 << 1))
{
*(in->cur) = 'C';
in->cur++;
}
if (curpol->weight & 1)
{
*(in->cur) = 'D';
in->cur++;
}
}
*(in->cur) = '\0';
in->curpol++;
}
else if (in->curpol->qoperator.oper == OP_NOT)
{
int priority = QO_PRIORITY(in->curpol);
if (priority < parentPriority)
{
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
}
RESIZEBUF(in, 1);
*(in->cur) = '!';
in->cur++;
*(in->cur) = '\0';
in->curpol++;
/* print the single operand of NOT */
infix(in, priority, false);
if (priority < parentPriority)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
in->cur = strchr(in->cur, '\0');
}
}
else
{
int8 op = in->curpol->qoperator.oper;
int priority = QO_PRIORITY(in->curpol);
int16 distance = in->curpol->qoperator.distance;
INFIX nrm;
bool needParenthesis = false;
in->curpol++;
if (priority < parentPriority ||
/* phrase operator depends on order */
(op == OP_PHRASE && rightPhraseOp))
{
needParenthesis = true;
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
}
/* temporary buffer that receives the text of the right operand */
nrm.curpol = in->curpol;
nrm.op = in->op;
nrm.buflen = 16;
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
/* get right operand */
infix(&nrm, priority, (op == OP_PHRASE));
/* get & print left operand */
in->curpol = nrm.curpol;
infix(in, priority, false);
/* print operator & right operand */
RESIZEBUF(in, 3 + (2 + 10 /* distance */ ) + (nrm.cur - nrm.buf));
switch (op)
{
case OP_OR:
sprintf(in->cur, " | %s", nrm.buf);
break;
case OP_AND:
sprintf(in->cur, " & %s", nrm.buf);
break;
case OP_PHRASE:
/* distance 1 is printed in its short form "<->" */
if (distance != 1)
sprintf(in->cur, " <%d> %s", distance, nrm.buf);
else
sprintf(in->cur, " <-> %s", nrm.buf);
break;
default:
/* OP_NOT is handled in above if-branch */
elog(ERROR, "unrecognized operator type: %d", op);
}
in->cur = strchr(in->cur, '\0');
pfree(nrm.buf);
if (needParenthesis)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
in->cur = strchr(in->cur, '\0');
}
}
}
/*
 * tsqueryout - text output function for tsquery
 *
 * Returns a palloc'd C string holding the query in infix form.  A query
 * with no items is printed as the empty string.
 */
Datum
tsqueryout(PG_FUNCTION_ARGS)
{
	TSQuery		query = PG_GETARG_TSQUERY(0);
	INFIX		out;

	if (query->size == 0)
	{
		char	   *empty = palloc(1);

		empty[0] = '\0';
		PG_RETURN_POINTER(empty);
	}

	/* set up an empty, NUL-terminated output buffer */
	out.op = GETOPERAND(query);
	out.curpol = GETQUERY(query);
	out.buflen = 32;
	out.buf = (char *) palloc(sizeof(char) * out.buflen);
	out.cur = out.buf;
	out.cur[0] = '\0';

	infix(&out, -1 /* lowest priority */ , false);

	PG_FREE_IF_COPY(query, 0);
	PG_RETURN_CSTRING(out.buf);
}
/*
 * Binary Input / Output functions. The binary format is as follows:
 *
 * uint32	 number of operators/operands in the query
 *
 * Followed by the operators and operands, in prefix notation. For each
 * operand:
 *
 * uint8	type, QI_VAL
 * uint8	weight
 * uint8	prefix
 *			operand text in client encoding, null-terminated
 *
 * For each operator:
 *
 * uint8	type, QI_OPR
 * uint8	operator, one of OP_AND, OP_PHRASE, OP_OR, OP_NOT.
 * uint16	distance (only for OP_PHRASE)
 */
Datum
tsquerysend(PG_FUNCTION_ARGS)
{
TSQuery query = PG_GETARG_TSQUERY(0);
StringInfoData buf;
int i;
QueryItem *item = GETQUERY(query);
pq_begintypsend(&buf);
pq_sendint32(&buf, query->size);
for (i = 0; i < query->size; i++)
{
pq_sendint8(&buf, item->type);
switch (item->type)
{
case QI_VAL:
pq_sendint8(&buf, item->qoperand.weight);
pq_sendint8(&buf, item->qoperand.prefix);
pq_sendstring(&buf, GETOPERAND(query) + item->qoperand.distance);
break;
case QI_OPR:
pq_sendint8(&buf, item->qoperator.oper);
if (item->qoperator.oper == OP_PHRASE)
pq_sendint16(&buf, item->qoperator.distance);
break;
default:
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
}
item++;
}
PG_FREE_IF_COPY(query, 0);
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
Datum
tsqueryrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSQuery query;
int i,
len;
QueryItem *item;
int datalen;
char *ptr;
uint32 size;
const char **operands;
bool needcleanup;
size = pq_getmsgint(buf, sizeof(uint32));
if (size > (MaxAllocSize / sizeof(QueryItem)))
elog(ERROR, "invalid size of tsquery");
/* Allocate space to temporarily hold operand strings */
operands = palloc(size * sizeof(char *));
/* Allocate space for all the QueryItems. */
len = HDRSIZETQ + sizeof(QueryItem) * size;
query = (TSQuery) palloc0(len);
query->size = size;
item = GETQUERY(query);
datalen = 0;
for (i = 0; i < size; i++)
{
item->type = (int8) pq_getmsgint(buf, sizeof(int8));
if (item->type == QI_VAL)
{
size_t val_len; /* length after recoding to server
* encoding */
uint8 weight;
uint8 prefix;
const char *val;
pg_crc32 valcrc;
weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
prefix = (uint8) pq_getmsgint(buf, sizeof(uint8));
val = pq_getmsgstring(buf);
val_len = strlen(val);
/* Sanity checks */
if (weight > 0xF)
elog(ERROR, "invalid tsquery: invalid weight bitmap");
if (val_len > MAXSTRLEN)
elog(ERROR, "invalid tsquery: operand too long");
2007-11-15 22:14:46 +01:00
if (datalen > MAXSTRPOS)
elog(ERROR, "invalid tsquery: total operand length exceeded");
/* Looks valid. */
INIT_LEGACY_CRC32(valcrc);
COMP_LEGACY_CRC32(valcrc, val, val_len);
FIN_LEGACY_CRC32(valcrc);
item->qoperand.weight = weight;
item->qoperand.prefix = (prefix) ? true : false;
item->qoperand.valcrc = (int32) valcrc;
item->qoperand.length = val_len;
item->qoperand.distance = datalen;
/*
* Operand strings are copied to the final struct after this loop;
* here we just collect them to an array
*/
operands[i] = val;
datalen += val_len + 1; /* + 1 for the '\0' terminator */
}
else if (item->type == QI_OPR)
{
int8 oper;
2007-11-15 22:14:46 +01:00
oper = (int8) pq_getmsgint(buf, sizeof(int8));
if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
elog(ERROR, "invalid tsquery: unrecognized operator type %d",
(int) oper);
if (i == size - 1)
elog(ERROR, "invalid pointer to right operand");
item->qoperator.oper = oper;
if (oper == OP_PHRASE)
item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
}
else
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
item++;
}
/* Enlarge buffer to make room for the operand values. */
query = (TSQuery) repalloc(query, len + datalen);
item = GETQUERY(query);
ptr = GETOPERAND(query);
/*
* Fill in the left-pointers. Checks that the tree is well-formed as a
* side-effect.
*/
findoprnd(item, size, &needcleanup);
Fix strange behavior (and possible crashes) in full text phrase search. In an attempt to simplify the tsquery matching engine, the original phrase search patch invented rewrite rules that would rearrange a tsquery so that no AND/OR/NOT operator appeared below a PHRASE operator. But this approach had numerous problems. The rearrangement step was missed by ts_rewrite (and perhaps other places), allowing tsqueries to be created that would cause Assert failures or perhaps crashes at execution, as reported by Andreas Seltenreich. The rewrite rules effectively defined semantics for operators underneath PHRASE that were buggy, or at least unintuitive. And because rewriting was done in tsqueryin() rather than at execution, the rearrangement was user-visible, which is not very desirable --- for example, it might cause unexpected matches or failures to match in ts_rewrite. As a somewhat independent problem, the behavior of nested PHRASE operators was only sane for left-deep trees; queries like "x <-> (y <-> z)" did not behave intuitively at all. To fix, get rid of the rewrite logic altogether, and instead teach the tsquery execution engine to manage AND/OR/NOT below a PHRASE operator by explicitly computing the match location(s) and match widths for these operators. This requires introducing some additional fields into the publicly visible ExecPhraseData struct; but since there's no way for third-party code to pass such a struct to TS_phrase_execute, it shouldn't create an ABI problem as long as we don't move the offsets of the existing fields. Another related problem was that index searches supposed that "!x <-> y" could be lossily approximated as "!x & y", which isn't correct because the latter will reject, say, "x q y" which the query itself accepts. This required some tweaking in TS_execute_ternary along with the main tsquery engine. Back-patch to 9.6 where phrase operators were introduced. 
While this could be argued to change behavior more than we'd like in a stable branch, we have to do something about the crash hazards and index-vs-seqscan inconsistency, and it doesn't seem desirable to let the unintuitive behaviors induced by the rewriting implementation stand as precedent. Discussion: https://postgr.es/m/28215.1481999808@sss.pgh.pa.us Discussion: https://postgr.es/m/26706.1482087250@sss.pgh.pa.us
2016-12-21 21:18:25 +01:00
/* Can't have found any QI_VALSTOP nodes */
Assert(!needcleanup);
/* Copy operands to output struct */
for (i = 0; i < size; i++)
{
if (item->type == QI_VAL)
{
memcpy(ptr, operands[i], item->qoperand.length + 1);
ptr += item->qoperand.length + 1;
}
item++;
}
pfree(operands);
Assert(ptr - GETOPERAND(query) == datalen);
SET_VARSIZE(query, len + datalen);
PG_RETURN_TSQUERY(query);
}
/*
 * tsquerytree - debug function
 *
 * Shows, as text, the query that will be executed in non-leaf pages of an
 * index: the query is first passed through clean_NOT() and what remains is
 * printed in infix form.  If clean_NOT() leaves nothing, the result is "T";
 * a query with no items yields an empty text value.
 */
Datum
tsquerytree(PG_FUNCTION_ARGS)
{
	TSQuery		query = PG_GETARG_TSQUERY(0);
	QueryItem  *cleaned;
	text	   *result;
	int			ncleaned;

	if (query->size == 0)
	{
		result = (text *) palloc(VARHDRSZ);
		SET_VARSIZE(result, VARHDRSZ);
		PG_RETURN_POINTER(result);
	}

	cleaned = clean_NOT(GETQUERY(query), &ncleaned);

	if (cleaned)
	{
		INFIX		out;

		/* operand strings still live in the original query */
		out.op = GETOPERAND(query);
		out.curpol = cleaned;
		out.buflen = 32;
		out.buf = (char *) palloc(sizeof(char) * out.buflen);
		out.cur = out.buf;
		out.cur[0] = '\0';

		infix(&out, -1, false);
		result = cstring_to_text_with_len(out.buf, out.cur - out.buf);
		pfree(cleaned);
	}
	else
		result = cstring_to_text("T");

	PG_FREE_IF_COPY(query, 0);

	PG_RETURN_TEXT_P(result);
}