diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c index d9ec8ed431..330664da63 100644 --- a/src/backend/utils/adt/tsquery_cleanup.c +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -207,45 +207,59 @@ clean_NOT(QueryItem *ptr, int *len) } -#ifdef V_UNKNOWN /* exists in Windows headers */ -#undef V_UNKNOWN -#endif -#ifdef V_FALSE /* exists in Solaris headers */ -#undef V_FALSE -#endif - /* - * output values for result output parameter of clean_fakeval_intree - */ -#define V_UNKNOWN 0 /* the expression can't be evaluated - * statically */ -#define V_TRUE 1 /* the expression is always true (not - * implemented) */ -#define V_FALSE 2 /* the expression is always false (not - * implemented) */ -#define V_STOP 3 /* the expression is a stop word */ - -/* - * Remove QI_VALSTOP (stopword nodes) from query tree. + * Remove QI_VALSTOP (stopword) nodes from query tree. + * + * Returns NULL if the query degenerates to nothing. Input must not be NULL. + * + * When we remove a phrase operator due to removing one or both of its + * arguments, we might need to adjust the distance of a parent phrase + * operator. For example, 'a' is a stopword, so: + * (b <-> a) <-> c should become b <2> c + * b <-> (a <-> c) should become b <2> c + * (b <-> (a <-> a)) <-> c should become b <3> c + * b <-> ((a <-> a) <-> c) should become b <3> c + * To handle that, we define two output parameters: + * ladd: amount to add to a phrase distance to the left of this node + * radd: amount to add to a phrase distance to the right of this node + * We need two outputs because we could need to bubble up adjustments to two + * different parent phrase operators. Consider + * w <-> (((a <-> x) <2> (y <3> a)) <-> z) + * After we've removed the two a's and are considering the <2> node (which is + * now just x <2> y), we have an ladd distance of 1 that needs to propagate + * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to + * propagate to the rightmost <->, so that we'll end up with + * w <2> ((x <2> y) <4> z) + * Near the bottom of the tree, we may have subtrees consisting only of + * stopwords. The distances of any phrase operators within such a subtree are + * summed and propagated to both ladd and radd, since we don't know which side + * of the lowest surviving phrase operator we are in. The rule is that any + * subtree that degenerates to NULL must return equal values of ladd and radd, + * and the parent node dealing with it should incorporate only one of those. + * + * Currently, we only implement this adjustment for adjacent phrase operators. + * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which + * isn't ideal, but there is no way to represent the really desired semantics + * without some redesign of the tsquery structure. Certainly it would not be + * any better to convert that to 'x <2> (y | z)'. Since this is such a weird + * corner case, let it go for now. But we can fix it in cases where the + * intervening non-phrase operator also gets removed, for example + * '((x <-> a) | a) <-> y' will become 'x <2> y'. */ static NODE * -clean_fakeval_intree(NODE *node, char *result, int *adddistance) +clean_stopword_intree(NODE *node, int *ladd, int *radd) { - char lresult = V_UNKNOWN, - rresult = V_UNKNOWN; - /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); - if (adddistance) - *adddistance = 0; + /* default output parameters indicate no change in parent distance */ + *ladd = *radd = 0; if (node->valnode->type == QI_VAL) return node; else if (node->valnode->type == QI_VALSTOP) { pfree(node); - *result = V_STOP; return NULL; } @@ -253,10 +267,10 @@ clean_fakeval_intree(NODE *node, char *result, int *adddistance) if (node->valnode->qoperator.oper == OP_NOT) { - node->right = clean_fakeval_intree(node->right, &rresult, NULL); + /* NOT doesn't change pattern width, so just report child distances */ + node->right = clean_stopword_intree(node->right, ladd, radd); if (!node->right) { - *result = V_STOP; freetree(node); return NULL; } @@ -264,67 +278,89 @@ clean_fakeval_intree(NODE *node, char *result, int *adddistance) else { NODE *res = node; + bool isphrase; int ndistance, - ldistance = 0, - rdistance = 0; + lladd, + lradd, + rladd, + rradd; - ndistance = (node->valnode->qoperator.oper == OP_PHRASE) ? - node->valnode->qoperator.distance : - 0; + /* First, recurse */ + node->left = clean_stopword_intree(node->left, &lladd, &lradd); + node->right = clean_stopword_intree(node->right, &rladd, &rradd); - node->left = clean_fakeval_intree(node->left, - &lresult, - ndistance ? &ldistance : NULL); + /* Check if current node is OP_PHRASE, get its distance */ + isphrase = (node->valnode->qoperator.oper == OP_PHRASE); + ndistance = isphrase ? node->valnode->qoperator.distance : 0; - node->right = clean_fakeval_intree(node->right, - &rresult, - ndistance ? &rdistance : NULL); - - /* - * ndistance, ldistance and rdistance are greater than zero if their - * corresponding nodes are OP_PHRASE - */ - - if (lresult == V_STOP && rresult == V_STOP) + if (node->left == NULL && node->right == NULL) { - if (adddistance && ndistance) - *adddistance = ldistance + ndistance + rdistance; + /* + * When we collapse out a phrase node entirely, propagate its own + * distance into both *ladd and *radd; it is the responsibility of + * the parent node to count it only once. Also, for a phrase + * node, distances coming from children are summed and propagated + * up to parent (we assume lladd == lradd and rladd == rradd, else + * rule was broken at a lower level). But if this isn't a phrase + * node, take the larger of the two child distances; that + * corresponds to what TS_execute will do in non-stopword cases. + */ + if (isphrase) + *ladd = *radd = lladd + ndistance + rladd; + else + *ladd = *radd = Max(lladd, rladd); freetree(node); - *result = V_STOP; return NULL; } - else if (lresult == V_STOP) + else if (node->left == NULL) { + /* Removing this operator and left subnode */ + /* lladd and lradd are equal/redundant, don't count both */ + if (isphrase) + { + /* operator's own distance must propagate to left */ + *ladd = lladd + ndistance + rladd; + *radd = rradd; + } + else + { + /* at non-phrase op, just forget the left subnode entirely */ + *ladd = rladd; + *radd = rradd; + } res = node->right; - - /* - * propagate distance from current node to the right upper - * subtree. - */ - if (adddistance && ndistance) - *adddistance = rdistance; pfree(node); } - else if (rresult == V_STOP) + else if (node->right == NULL) { + /* Removing this operator and right subnode */ + /* rladd and rradd are equal/redundant, don't count both */ + if (isphrase) + { + /* operator's own distance must propagate to right */ + *ladd = lladd; + *radd = lradd + ndistance + rradd; + } + else + { + /* at non-phrase op, just forget the right subnode entirely */ + *ladd = lladd; + *radd = lradd; + } res = node->left; - - /* - * propagate distance from current node to the upper tree. - */ - if (adddistance && ndistance) - *adddistance = ndistance + ldistance; pfree(node); } - else if (ndistance) + else if (isphrase) { - node->valnode->qoperator.distance += ldistance; - if (adddistance) - *adddistance = 0; + /* Absorb appropriate corrections at this level */ + node->valnode->qoperator.distance += lradd + rladd; + /* Propagate up any unaccounted-for corrections */ + *ladd = lladd; + *radd = rradd; } - else if (adddistance) + else { - *adddistance = 0; + /* We're keeping a non-phrase operator, so ladd/radd remain 0 */ } return res; @@ -585,7 +621,8 @@ cleanup_fakeval_and_phrase(TSQuery in) commonlen, i; NODE *root; - char result = V_UNKNOWN; + int ladd, + radd; TSQuery out; QueryItem *items; char *operands; @@ -594,8 +631,8 @@ cleanup_fakeval_and_phrase(TSQuery in) return in; /* eliminate stop words */ - root = clean_fakeval_intree(maketree(GETQUERY(in)), &result, NULL); - if (result != V_UNKNOWN) + root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd); + if (root == NULL) { ereport(NOTICE, (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored"))); diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index be4bdf21f1..cf3beb35a9 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -594,7 +594,7 @@ SELECT to_tsquery('english', 'a <-> (1 <-> 2)'); SELECT to_tsquery('english', '1 <-> (a <-> 2)'); to_tsquery ------------- - '1' <-> '2' + '1' <2> '2' (1 row) SELECT to_tsquery('english', '1 <-> (2 <-> a)'); @@ -630,7 +630,7 @@ SELECT to_tsquery('english', 'a <3> (1 <-> 2)'); SELECT to_tsquery('english', '1 <3> (a <-> 2)'); to_tsquery ------------- - '1' <3> '2' + '1' <4> '2' (1 row) SELECT to_tsquery('english', '1 <3> (2 <-> a)'); @@ -666,7 +666,7 @@ SELECT to_tsquery('english', 'a <-> (1 <3> 2)'); SELECT to_tsquery('english', '1 <-> (a <3> 2)'); to_tsquery ------------- - '1' <-> '2' + '1' <4> '2' (1 row) SELECT to_tsquery('english', '1 <-> (2 <3> a)'); @@ -684,7 +684,7 @@ SELECT to_tsquery('english', '((a <-> 1) <-> 2) <-> s'); SELECT to_tsquery('english', '(2 <-> (a <-> 1)) <-> s'); to_tsquery ------------- - '2' <-> '1' + '2' <2> '1' (1 row) SELECT to_tsquery('english', '((1 <-> a) <-> 2) <-> s'); @@ -708,7 +708,7 @@ SELECT to_tsquery('english', 's <-> ((a <-> 1) <-> 2)'); SELECT to_tsquery('english', 's <-> (2 <-> (a <-> 1))'); to_tsquery ------------- - '2' <-> '1' + '2' <2> '1' (1 row) SELECT to_tsquery('english', 's <-> ((1 <-> a) <-> 2)'); @@ -750,13 +750,13 @@ SELECT to_tsquery('english', '(s <-> (1 <-> a)) <-> 2'); SELECT to_tsquery('english', '2 <-> ((a <-> 1) <-> s)'); to_tsquery ------------- - '2' <-> '1' + '2' <2> '1' (1 row) SELECT to_tsquery('english', '2 <-> (s <-> (a <-> 1))'); to_tsquery ------------- - '2' <-> '1' + '2' <3> '1' (1 row) SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)'); @@ -768,13 +768,13 @@ SELECT to_tsquery('english', '2 <-> ((1 <-> a) <-> s)'); SELECT to_tsquery('english', '2 <-> (s <-> (1 <-> a))'); to_tsquery ------------- - '2' <-> '1' + '2' <2> '1' (1 row) SELECT to_tsquery('english', 'foo <-> (a <-> (the <-> bar))'); to_tsquery ----------------- - 'foo' <-> 'bar' + 'foo' <3> 'bar' (1 row) SELECT to_tsquery('english', '((foo <-> a) <-> the) <-> bar');