Improve GiST range-contained-by searches by adding a flag for empty ranges.

In the original implementation, a range-contained-by search had to scan
the entire index because an empty range could be lurking anywhere.
Improve that by adding a flag to upper GiST entries that says whether the
represented subtree contains any empty ranges.

Also, make a simple mod to the penalty function to discourage empty ranges
from getting pushed into subtrees without any.  This needs more work, and
the picksplit function should be taught about it too, but that code can be
improved without causing an on-disk compatibility break; so we'll leave it
for another day.

Since we're breaking on-disk compatibility of range values anyway, I took
the opportunity to reorganize the range flags bits; the unused
RANGE_xB_NULL bits are now adjacent, which might open the door for using
them in some other way later.

In passing, remove the GiST range opclass entry for <>, which doesn't seem
like it can really be indexed usefully.

Alexander Korotkov, with some editorializing by Tom
This commit is contained in:
Tom Lane 2011-11-27 16:50:37 -05:00
parent 08da2d282f
commit c66e4f138b
6 changed files with 118 additions and 56 deletions

View File

@ -1622,6 +1622,24 @@ range_get_flags(RangeType *range)
return *((char *) range + VARSIZE(range) - 1);
}
/*
* range_set_contain_empty: set the RANGE_CONTAIN_EMPTY bit in the value.
*
* This is only needed in GiST operations, so we don't include a provision
* for setting it in range_serialize; rather, this function must be applied
* afterwards.
*/
void
range_set_contain_empty(RangeType *range)
{
char *flagsp;
/* flag byte is datum's last byte */
flagsp = (char *) range + VARSIZE(range) - 1;
*flagsp |= RANGE_CONTAIN_EMPTY;
}
/*
* This both serializes and canonicalizes (if applicable) the range.
* This should be used by most callers.

View File

@ -17,6 +17,7 @@
#include "access/gist.h"
#include "access/skey.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/rangetypes.h"
@ -32,7 +33,11 @@
#define RANGESTRAT_CONTAINED_BY 8
#define RANGESTRAT_CONTAINS_ELEM 16
#define RANGESTRAT_EQ 18
#define RANGESTRAT_NE 19
/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */
#define rangeCopy(r) \
((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
false, -1)))
/*
* Auxiliary structure for picksplit method.
@ -145,6 +150,16 @@ range_gist_penalty(PG_FUNCTION_ARGS)
subtype_diff = &typcache->rng_subdiff_finfo;
/*
* If new is or contains empty, and orig doesn't, apply infinite penalty.
* We really don't want to pollute an empty-free subtree with empties.
*/
if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
{
*penalty = get_float4_infinity();
PG_RETURN_POINTER(penalty);
}
/*
* We want to compare the size of "orig" to size of "orig union new".
* The penalty will be the sum of the reduction in the lower bound plus
@ -163,30 +178,9 @@ range_gist_penalty(PG_FUNCTION_ARGS)
}
else if (empty1)
{
if (lower2.infinite || upper2.infinite)
{
/* from empty to infinite */
*penalty = get_float4_infinity();
PG_RETURN_POINTER(penalty);
}
else if (OidIsValid(subtype_diff->fn_oid))
{
/* from empty to upper2-lower2 */
*penalty = DatumGetFloat8(FunctionCall2Coll(subtype_diff,
typcache->rng_collation,
upper2.val,
lower2.val));
/* upper2 must be >= lower2 */
if (*penalty < 0)
*penalty = 0; /* subtype_diff is broken */
PG_RETURN_POINTER(penalty);
}
else
{
/* wild guess */
*penalty = 1.0;
PG_RETURN_POINTER(penalty);
}
/* infinite penalty for pushing non-empty into all-empty subtree */
*penalty = get_float4_infinity();
PG_RETURN_POINTER(penalty);
}
/* if orig isn't empty, s_union can't be either */
@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS)
Datum
range_gist_same(PG_FUNCTION_ARGS)
{
/* Datum r1 = PG_GETARG_DATUM(0); */
/* Datum r2 = PG_GETARG_DATUM(1); */
RangeType *r1 = PG_GETARG_RANGE(0);
RangeType *r2 = PG_GETARG_RANGE(1);
bool *result = (bool *) PG_GETARG_POINTER(2);
/*
* We can safely call range_eq using our fcinfo directly; it won't notice
* the third argument. This allows it to use fn_extra for caching.
* range_eq will ignore the RANGE_CONTAIN_EMPTY flag, so we have to
* check that for ourselves. More generally, if the entries have been
* properly normalized, then unequal flags bytes must mean unequal ranges
* ... so let's just test all the flag bits at once.
*/
*result = DatumGetBool(range_eq(fcinfo));
if (range_get_flags(r1) != range_get_flags(r2))
*result = false;
else
{
/*
* We can safely call range_eq using our fcinfo directly; it won't
* notice the third argument. This allows it to use fn_extra for
* caching.
*/
*result = DatumGetBool(range_eq(fcinfo));
}
PG_RETURN_POINTER(result);
}
@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS)
/*
* Return the smallest range that contains r1 and r2
*
* XXX would it be better to redefine range_union as working this way?
* This differs from regular range_union in two critical ways:
* 1. It won't throw an error for non-adjacent r1 and r2, but just absorb
* the intervening values into the result range.
* 2. We track whether any empty range has been union'd into the result,
* so that contained_by searches can be indexed. Note that this means
* that *all* unions formed within the GiST index must go through here.
*/
static RangeType *
range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
{
RangeType *result;
RangeBound lower1,
lower2;
RangeBound upper1,
upper2;
bool empty1,
empty2;
char flags1,
flags2;
RangeBound *result_lower;
RangeBound *result_upper;
range_deserialize(typcache, r1, &lower1, &upper1, &empty1);
range_deserialize(typcache, r2, &lower2, &upper2, &empty2);
flags1 = range_get_flags(r1);
flags2 = range_get_flags(r2);
if (empty1)
{
/* We can return r2 as-is if it already is or contains empty */
if (flags2 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
return r2;
/* Else we'd better copy it (modify-in-place isn't safe) */
r2 = rangeCopy(r2);
range_set_contain_empty(r2);
return r2;
}
if (empty2)
{
/* We can return r1 as-is if it already is or contains empty */
if (flags1 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
return r1;
/* Else we'd better copy it (modify-in-place isn't safe) */
r1 = rangeCopy(r1);
range_set_contain_empty(r1);
return r1;
}
if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0)
result_lower = &lower1;
@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
result_upper = &upper2;
/* optimization to avoid constructing a new range */
if (result_lower == &lower1 && result_upper == &upper1)
if (result_lower == &lower1 && result_upper == &upper1 &&
((flags1 & RANGE_CONTAIN_EMPTY) || !(flags2 & RANGE_CONTAIN_EMPTY)))
return r1;
if (result_lower == &lower2 && result_upper == &upper2)
if (result_lower == &lower2 && result_upper == &upper2 &&
((flags2 & RANGE_CONTAIN_EMPTY) || !(flags1 & RANGE_CONTAIN_EMPTY)))
return r2;
return make_range(typcache, result_lower, result_upper, false);
result = make_range(typcache, result_lower, result_upper, false);
if ((flags1 & RANGE_CONTAIN_EMPTY) || (flags2 & RANGE_CONTAIN_EMPTY))
range_set_contain_empty(result);
return result;
}
/*
@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy,
break;
case RANGESTRAT_CONTAINED_BY:
/*
* Ideally we'd apply range_overlaps here, but at present it
* might fail to find empty ranges in the index, which should
* be reported as being contained by anything. This needs work.
* Empty ranges are contained by anything, so if key is or
* contains any empty ranges, we must descend into it. Otherwise,
* descend only if key overlaps the query.
*/
return true;
if (RangeIsOrContainsEmpty(key))
return true;
proc = range_overlaps;
break;
case RANGESTRAT_CONTAINS_ELEM:
proc = range_contains_elem;
break;
case RANGESTRAT_EQ:
/*
* If query is empty, descend only if the key is or contains any
* empty ranges. Otherwise, descend if key contains query.
*/
if (RangeIsEmpty(DatumGetRangeType(query)))
return RangeIsOrContainsEmpty(key);
proc = range_contains;
break;
case RANGESTRAT_NE:
return true;
break;
default:
elog(ERROR, "unrecognized range strategy: %d", strategy);
proc = NULL; /* keep compiler quiet */
@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy,
case RANGESTRAT_EQ:
proc = range_eq;
break;
case RANGESTRAT_NE:
proc = range_ne;
break;
default:
elog(ERROR, "unrecognized range strategy: %d", strategy);
proc = NULL; /* keep compiler quiet */

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201111241
#define CATALOG_VERSION_NO 201111271
#endif

View File

@ -736,6 +736,5 @@ DATA(insert ( 3919 3831 3831 7 s 3890 783 0 ));
DATA(insert ( 3919 3831 3831 8 s 3892 783 0 ));
DATA(insert ( 3919 3831 2283 16 s 3889 783 0 ));
DATA(insert ( 3919 3831 3831 18 s 3882 783 0 ));
DATA(insert ( 3919 3831 3831 19 s 3883 783 0 ));
#endif /* PG_AMOP_H */

View File

@ -33,13 +33,15 @@ typedef struct
#define RangeTypeGetOid(r) ((r)->rangetypid)
/* A range's flags byte contains these bits: */
#define RANGE_EMPTY 0x01 /* range is empty */
#define RANGE_LB_INC 0x02 /* lower bound is inclusive (vs exclusive) */
#define RANGE_LB_NULL 0x04 /* lower bound is null (NOT CURRENTLY USED) */
#define RANGE_LB_INF 0x08 /* lower bound is +/- infinity */
#define RANGE_UB_INC 0x10 /* upper bound is inclusive (vs exclusive) */
#define RANGE_UB_NULL 0x20 /* upper bound is null (NOT CURRENTLY USED) */
#define RANGE_UB_INF 0x40 /* upper bound is +/- infinity */
#define RANGE_EMPTY 0x01 /* range is empty */
#define RANGE_LB_INC 0x02 /* lower bound is inclusive */
#define RANGE_UB_INC 0x04 /* upper bound is inclusive */
#define RANGE_LB_INF 0x08 /* lower bound is -infinity */
#define RANGE_UB_INF 0x10 /* upper bound is +infinity */
#define RANGE_LB_NULL 0x20 /* lower bound is null (NOT USED) */
#define RANGE_UB_NULL 0x40 /* upper bound is null (NOT USED) */
#define RANGE_CONTAIN_EMPTY 0x80 /* marks a GiST internal-page entry whose
* subtree contains some empty ranges */
#define RANGE_HAS_LBOUND(flags) (!((flags) & (RANGE_EMPTY | \
RANGE_LB_NULL | \
@ -49,7 +51,9 @@ typedef struct
RANGE_UB_NULL | \
RANGE_UB_INF)))
#define RangeIsEmpty(r) (range_get_flags(r) & RANGE_EMPTY)
#define RangeIsEmpty(r) ((range_get_flags(r) & RANGE_EMPTY) != 0)
#define RangeIsOrContainsEmpty(r) \
((range_get_flags(r) & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY)) != 0)
/* Internal representation of either bound of a range (not what's on disk) */
@ -152,6 +156,7 @@ extern void range_deserialize(TypeCacheEntry *typcache, RangeType *range,
RangeBound *lower, RangeBound *upper,
bool *empty);
extern char range_get_flags(RangeType *range);
extern void range_set_contain_empty(RangeType *range);
extern RangeType *make_range(TypeCacheEntry *typcache, RangeBound *lower,
RangeBound *upper, bool empty);
extern int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1,

View File

@ -1041,7 +1041,6 @@ ORDER BY 1, 2, 3;
783 | 15 | <->
783 | 16 | @>
783 | 18 | =
783 | 19 | <>
783 | 27 | @>
783 | 28 | <@
783 | 47 | @>
@ -1054,7 +1053,7 @@ ORDER BY 1, 2, 3;
2742 | 2 | @@@
2742 | 3 | <@
2742 | 4 | =
(44 rows)
(43 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing