postgresql/src/backend/utils/adt/tsquery_cleanup.c

447 lines
11 KiB
C

/*-------------------------------------------------------------------------
*
* tsquery_cleanup.c
* Cleanup query from NOT values and/or stopword
* Utility functions to correct work.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/tsquery_cleanup.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "tsearch/ts_utils.h"
#include "varatt.h"
typedef struct NODE
{
struct NODE *left;
struct NODE *right;
QueryItem *valnode;
} NODE;
/*
* make query tree from plain view of query
*/
static NODE *
maketree(QueryItem *in)
{
NODE *node = (NODE *) palloc(sizeof(NODE));
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
node->valnode = in;
node->right = node->left = NULL;
if (in->type == QI_OPR)
{
node->right = maketree(in + 1);
if (in->qoperator.oper != OP_NOT)
node->left = maketree(in + in->qoperator.left);
}
return node;
}
/*
* Internal state for plaintree and plainnode
*/
typedef struct
{
QueryItem *ptr;
int len; /* allocated size of ptr */
int cur; /* number of elements in ptr */
} PLAINTREE;
static void
plainnode(PLAINTREE *state, NODE *node)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
if (state->cur == state->len)
{
state->len *= 2;
state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
}
memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
if (node->valnode->type == QI_VAL)
state->cur++;
else if (node->valnode->qoperator.oper == OP_NOT)
{
state->ptr[state->cur].qoperator.left = 1;
state->cur++;
plainnode(state, node->right);
}
else
{
int cur = state->cur;
state->cur++;
plainnode(state, node->right);
state->ptr[cur].qoperator.left = state->cur - cur;
plainnode(state, node->left);
}
pfree(node);
}
/*
* make plain view of tree from a NODE-tree representation
*/
static QueryItem *
plaintree(NODE *root, int *len)
{
PLAINTREE pl;
pl.cur = 0;
pl.len = 16;
if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
{
pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
plainnode(&pl, root);
}
else
pl.ptr = NULL;
*len = pl.cur;
return pl.ptr;
}
static void
freetree(NODE *node)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
if (!node)
return;
if (node->left)
freetree(node->left);
if (node->right)
freetree(node->right);
pfree(node);
}
/*
* clean tree for ! operator.
* It's useful for debug, but in
* other case, such view is used with search in index.
* Operator ! always return TRUE
*/
static NODE *
clean_NOT_intree(NODE *node)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
if (node->valnode->type == QI_VAL)
return node;
if (node->valnode->qoperator.oper == OP_NOT)
{
freetree(node);
return NULL;
}
/* operator & or | */
if (node->valnode->qoperator.oper == OP_OR)
{
if ((node->left = clean_NOT_intree(node->left)) == NULL ||
(node->right = clean_NOT_intree(node->right)) == NULL)
{
freetree(node);
return NULL;
}
}
else
{
NODE *res = node;
Assert(node->valnode->qoperator.oper == OP_AND ||
node->valnode->qoperator.oper == OP_PHRASE);
node->left = clean_NOT_intree(node->left);
node->right = clean_NOT_intree(node->right);
if (node->left == NULL && node->right == NULL)
{
pfree(node);
res = NULL;
}
else if (node->left == NULL)
{
res = node->right;
pfree(node);
}
else if (node->right == NULL)
{
res = node->left;
pfree(node);
}
return res;
}
return node;
}
QueryItem *
clean_NOT(QueryItem *ptr, int *len)
{
NODE *root = maketree(ptr);
return plaintree(clean_NOT_intree(root), len);
}
/*
* Remove QI_VALSTOP (stopword) nodes from query tree.
*
* Returns NULL if the query degenerates to nothing. Input must not be NULL.
*
* When we remove a phrase operator due to removing one or both of its
* arguments, we might need to adjust the distance of a parent phrase
* operator. For example, 'a' is a stopword, so:
* (b <-> a) <-> c should become b <2> c
* b <-> (a <-> c) should become b <2> c
* (b <-> (a <-> a)) <-> c should become b <3> c
* b <-> ((a <-> a) <-> c) should become b <3> c
* To handle that, we define two output parameters:
* ladd: amount to add to a phrase distance to the left of this node
* radd: amount to add to a phrase distance to the right of this node
* We need two outputs because we could need to bubble up adjustments to two
* different parent phrase operators. Consider
* w <-> (((a <-> x) <2> (y <3> a)) <-> z)
* After we've removed the two a's and are considering the <2> node (which is
* now just x <2> y), we have an ladd distance of 1 that needs to propagate
* up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
* propagate to the rightmost <->, so that we'll end up with
* w <2> ((x <2> y) <4> z)
* Near the bottom of the tree, we may have subtrees consisting only of
* stopwords. The distances of any phrase operators within such a subtree are
* summed and propagated to both ladd and radd, since we don't know which side
* of the lowest surviving phrase operator we are in. The rule is that any
* subtree that degenerates to NULL must return equal values of ladd and radd,
* and the parent node dealing with it should incorporate only one of those.
*
* Currently, we only implement this adjustment for adjacent phrase operators.
* Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
* isn't ideal, but there is no way to represent the really desired semantics
* without some redesign of the tsquery structure. Certainly it would not be
* any better to convert that to 'x <2> (y | z)'. Since this is such a weird
* corner case, let it go for now. But we can fix it in cases where the
* intervening non-phrase operator also gets removed, for example
* '((x <-> a) | a) <-> y' will become 'x <2> y'.
*/
static NODE *
clean_stopword_intree(NODE *node, int *ladd, int *radd)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
/* default output parameters indicate no change in parent distance */
*ladd = *radd = 0;
if (node->valnode->type == QI_VAL)
return node;
else if (node->valnode->type == QI_VALSTOP)
{
pfree(node);
return NULL;
}
Assert(node->valnode->type == QI_OPR);
if (node->valnode->qoperator.oper == OP_NOT)
{
/* NOT doesn't change pattern width, so just report child distances */
node->right = clean_stopword_intree(node->right, ladd, radd);
if (!node->right)
{
freetree(node);
return NULL;
}
}
else
{
NODE *res = node;
bool isphrase;
int ndistance,
lladd,
lradd,
rladd,
rradd;
/* First, recurse */
node->left = clean_stopword_intree(node->left, &lladd, &lradd);
node->right = clean_stopword_intree(node->right, &rladd, &rradd);
/* Check if current node is OP_PHRASE, get its distance */
isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
ndistance = isphrase ? node->valnode->qoperator.distance : 0;
if (node->left == NULL && node->right == NULL)
{
/*
* When we collapse out a phrase node entirely, propagate its own
* distance into both *ladd and *radd; it is the responsibility of
* the parent node to count it only once. Also, for a phrase
* node, distances coming from children are summed and propagated
* up to parent (we assume lladd == lradd and rladd == rradd, else
* rule was broken at a lower level). But if this isn't a phrase
* node, take the larger of the two child distances; that
* corresponds to what TS_execute will do in non-stopword cases.
*/
if (isphrase)
*ladd = *radd = lladd + ndistance + rladd;
else
*ladd = *radd = Max(lladd, rladd);
freetree(node);
return NULL;
}
else if (node->left == NULL)
{
/* Removing this operator and left subnode */
/* lladd and lradd are equal/redundant, don't count both */
if (isphrase)
{
/* operator's own distance must propagate to left */
*ladd = lladd + ndistance + rladd;
*radd = rradd;
}
else
{
/* at non-phrase op, just forget the left subnode entirely */
*ladd = rladd;
*radd = rradd;
}
res = node->right;
pfree(node);
}
else if (node->right == NULL)
{
/* Removing this operator and right subnode */
/* rladd and rradd are equal/redundant, don't count both */
if (isphrase)
{
/* operator's own distance must propagate to right */
*ladd = lladd;
*radd = lradd + ndistance + rradd;
}
else
{
/* at non-phrase op, just forget the right subnode entirely */
*ladd = lladd;
*radd = lradd;
}
res = node->left;
pfree(node);
}
else if (isphrase)
{
/* Absorb appropriate corrections at this level */
node->valnode->qoperator.distance += lradd + rladd;
/* Propagate up any unaccounted-for corrections */
*ladd = lladd;
*radd = rradd;
}
else
{
/* We're keeping a non-phrase operator, so ladd/radd remain 0 */
}
return res;
}
return node;
}
/*
* Number of elements in query tree
*/
static int32
calcstrlen(NODE *node)
{
int32 size = 0;
if (node->valnode->type == QI_VAL)
{
size = node->valnode->qoperand.length + 1;
}
else
{
Assert(node->valnode->type == QI_OPR);
size = calcstrlen(node->right);
if (node->valnode->qoperator.oper != OP_NOT)
size += calcstrlen(node->left);
}
return size;
}
/*
* Remove QI_VALSTOP (stopword) nodes from TSQuery.
*/
TSQuery
cleanup_tsquery_stopwords(TSQuery in, bool noisy)
{
int32 len,
lenstr,
commonlen,
i;
NODE *root;
int ladd,
radd;
TSQuery out;
QueryItem *items;
char *operands;
if (in->size == 0)
return in;
/* eliminate stop words */
root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
if (root == NULL)
{
if (noisy)
ereport(NOTICE,
(errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
out = palloc(HDRSIZETQ);
out->size = 0;
SET_VARSIZE(out, HDRSIZETQ);
return out;
}
/*
* Build TSQuery from plain view
*/
lenstr = calcstrlen(root);
items = plaintree(root, &len);
commonlen = COMPUTESIZE(len, lenstr);
out = palloc(commonlen);
SET_VARSIZE(out, commonlen);
out->size = len;
memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
items = GETQUERY(out);
operands = GETOPERAND(out);
for (i = 0; i < out->size; i++)
{
QueryOperand *op = (QueryOperand *) &items[i];
if (op->type != QI_VAL)
continue;
memcpy(operands, GETOPERAND(in) + op->distance, op->length);
operands[op->length] = '\0';
op->distance = operands - GETOPERAND(out);
operands += op->length + 1;
}
return out;
}