From c6fbe6d6fb828f50b9d67627588eb5ab8bd25e47 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 21 Jul 2015 20:54:18 +0300 Subject: [PATCH] Add selectivity estimation functions for intarray operators. Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup by me. --- contrib/intarray/Makefile | 4 +- contrib/intarray/_int_selfuncs.c | 341 ++++++++++++++++++ contrib/intarray/expected/_int.out | 1 + contrib/intarray/intarray--1.0--1.1.sql | 49 +++ .../{intarray--1.0.sql => intarray--1.1.sql} | 61 +++- contrib/intarray/intarray.control | 2 +- contrib/intarray/sql/_int.sql | 2 +- 7 files changed, 443 insertions(+), 17 deletions(-) create mode 100644 contrib/intarray/_int_selfuncs.c create mode 100644 contrib/intarray/intarray--1.0--1.1.sql rename contrib/intarray/{intarray--1.0.sql => intarray--1.1.sql} (88%) diff --git a/contrib/intarray/Makefile b/contrib/intarray/Makefile index 920c5b1ba0..5ea7f2aedf 100644 --- a/contrib/intarray/Makefile +++ b/contrib/intarray/Makefile @@ -2,10 +2,10 @@ MODULE_big = _int OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \ - _intbig_gist.o _int_gin.o $(WIN32RES) + _intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES) EXTENSION = intarray -DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql +DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql PGFILEDESC = "intarray - functions and operators for arrays of integers" REGRESS = _int diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c new file mode 100644 index 0000000000..2af1c9b65d --- /dev/null +++ b/contrib/intarray/_int_selfuncs.c @@ -0,0 +1,341 @@ +/*------------------------------------------------------------------------- + * + * _int_selfuncs.c + * Functions for selectivity estimation of intarray operators + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/intarray/_int_selfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "_int.h" + +#include "access/htup_details.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_type.h" +#include "utils/selfuncs.h" +#include "utils/syscache.h" +#include "utils/lsyscache.h" +#include "miscadmin.h" + +PG_FUNCTION_INFO_V1(_int_overlap_sel); +PG_FUNCTION_INFO_V1(_int_contains_sel); +PG_FUNCTION_INFO_V1(_int_contained_sel); +PG_FUNCTION_INFO_V1(_int_overlap_joinsel); +PG_FUNCTION_INFO_V1(_int_contains_joinsel); +PG_FUNCTION_INFO_V1(_int_contained_joinsel); +PG_FUNCTION_INFO_V1(_int_matchsel); + +Datum _int_overlap_sel(PG_FUNCTION_ARGS); +Datum _int_contains_sel(PG_FUNCTION_ARGS); +Datum _int_contained_sel(PG_FUNCTION_ARGS); +Datum _int_overlap_joinsel(PG_FUNCTION_ARGS); +Datum _int_contains_joinsel(PG_FUNCTION_ARGS); +Datum _int_contained_joinsel(PG_FUNCTION_ARGS); +Datum _int_matchsel(PG_FUNCTION_ARGS); + + +static Selectivity int_query_opr_selec(ITEM *item, Datum *values, float4 *freqs, + int nmncelems, float4 minfreq); +static int compare_val_int4(const void *a, const void *b); + +/* + * Wrappers around the default array selectivity estimation functions. + * + * The default array selectivity operators for the @>, && and @< operators + * work fine for integer arrays. However, if we tried to just use arraycontsel + * and arracontjoinsel directly as the cost estimator functions for our + * operators, they would not work as intended, because they look at the + * operator's OID. Our operators behave exactly like the built-in anyarray + * versions, but we must tell the cost estimator functions which built-in + * operators they correspond to. These wrappers just replace the operator + * OID with the corresponding built-in operator's OID, and call the built-in + * function. + */ + +Datum +_int_overlap_sel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3))); +} + +Datum +_int_contains_sel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3))); +} + +Datum +_int_contained_sel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3))); +} + +Datum +_int_overlap_joinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3), + PG_GETARG_DATUM(4))); +} + +Datum +_int_contains_joinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3), + PG_GETARG_DATUM(4))); +} + +Datum +_int_contained_joinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel, + PG_GETARG_DATUM(0), + ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP), + PG_GETARG_DATUM(2), + PG_GETARG_DATUM(3), + PG_GETARG_DATUM(4))); +} + + +/* + * _int_matchsel -- restriction selectivity function for intarray @@ query_int + */ +Datum +_int_matchsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + QUERYTYPE *query; + Datum *mcelems = NULL; + float4 *mcefreqs = NULL; + int nmcelems = 0; + float4 minfreq = 0.0; + float4 nullfrac = 0.0; + Form_pg_statistic stats; + Datum *values = NULL; + int nvalues = 0; + float4 *numbers = NULL; + int nnumbers = 0; + + /* + * If expression is not "variable @@ something" or "something @@ variable" + * then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + + /* + * Variable should be int[]. We don't support cases where variable is + * query_int. + */ + if (vardata.vartype != INT4ARRAYOID) + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + } + + /* + * The "@@" operator is strict, so we can cope with NULL right away. + */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(0.0); + } + + /* The caller made sure the const is a query, so get it now */ + query = DatumGetQueryTypeP(((Const *) other)->constvalue); + + /* Empty query matches nothing */ + if (query->size == 0) + { + ReleaseVariableStats(vardata); + return (Selectivity) 0.0; + } + + /* + * Get the statistics for the intarray column. + * + * We're interested in the Most-Common-Elements list, and the NULL + * fraction. + */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + nullfrac = stats->stanullfrac; + + /* + * For an int4 array, the default array type analyze function will + * collect a Most Common Elements list, which is an array of int4s. + */ + if (get_attstatsslot(vardata.statsTuple, + INT4OID, -1, + STATISTIC_KIND_MCELEM, InvalidOid, + NULL, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* + * There should be three more Numbers than Values, because the + * last three (for intarray) cells are taken for minimal, maximal + * and nulls frequency. Punt if not. + */ + if (nnumbers == nvalues + 3) + { + /* Grab the lowest frequency. */ + minfreq = numbers[nnumbers - (nnumbers - nvalues)]; + + mcelems = values; + mcefreqs = numbers; + nmcelems = nvalues; + } + } + } + + /* Process the logical expression in the query, using the stats */ + selec = int_query_opr_selec(GETQUERY(query) + query->size - 1, + mcelems, mcefreqs, nmcelems, minfreq); + + /* MCE stats count only non-null rows, so adjust for null rows. */ + selec *= (1.0 - nullfrac); + + free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers); + ReleaseVariableStats(vardata); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * Estimate selectivity of single intquery operator + */ +static Selectivity +int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs, + int nmcelems, float4 minfreq) +{ + Selectivity selec; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (item->type == VAL) + { + Datum *searchres; + + if (mcelems == NULL) + return (Selectivity) DEFAULT_EQ_SEL; + + searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems, + sizeof(Datum), compare_val_int4); + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or at + * least as precise as ANALYZE could find out). + */ + selec = mcefreqs[searchres - mcelems]; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = Min(DEFAULT_EQ_SEL, minfreq / 2); + } + } + else if (item->type == OPR) + { + /* Current query node is an operator */ + Selectivity s1, + s2; + + s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems, + minfreq); + switch (item->val) + { + case (int32) '!': + selec = 1.0 - s1; + break; + + case (int32) '&': + s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs, + nmcelems, minfreq); + selec = s1 * s2; + break; + + case (int32) '|': + s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs, + nmcelems, minfreq); + selec = s1 + s2 - s1 * s2; + break; + + default: + elog(ERROR, "unrecognized operator: %d", item->val); + selec = 0; /* keep compiler quiet */ + break; + } + } + else + { + elog(ERROR, "unrecognized int query item type: %u", item->type); + selec = 0; /* keep compiler quiet */ + } + + /* Clamp intermediate results to stay sane despite roundoff error */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * Comparison function for binary search in mcelem array. + */ +static int +compare_val_int4(const void *a, const void *b) +{ + int32 key = *(int32 *) a; + const Datum *t = (const Datum *) b; + + return key - DatumGetInt32(*t); +} diff --git a/contrib/intarray/expected/_int.out b/contrib/intarray/expected/_int.out index 4080b9633f..962e5c6a4b 100644 --- a/contrib/intarray/expected/_int.out +++ b/contrib/intarray/expected/_int.out @@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int; CREATE TABLE test__int( a int[] ); \copy test__int from 'data/test__int.data' +ANALYZE test__int; SELECT count(*) from test__int WHERE a && '{23,50}'; count ------- diff --git a/contrib/intarray/intarray--1.0--1.1.sql b/contrib/intarray/intarray--1.0--1.1.sql new file mode 100644 index 0000000000..fecebddfca --- /dev/null +++ b/contrib/intarray/intarray--1.0--1.1.sql @@ -0,0 +1,49 @@ +/* contrib/intarray/intarray--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION intarray UPDATE TO '1.1'" to load this file. \quit + +CREATE FUNCTION _int_matchsel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +ALTER OPERATOR @@ (_int4, query_int) SET (RESTRICT = _int_matchsel); +ALTER OPERATOR ~~ (query_int, _int4) SET (RESTRICT = _int_matchsel); + +CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +ALTER OPERATOR && (_int4, _int4) SET (RESTRICT = _int_overlap_sel, JOIN = _int_overlap_joinsel); +ALTER OPERATOR @> (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel); +ALTER OPERATOR <@ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel); + +ALTER OPERATOR @ (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel); +ALTER OPERATOR ~ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel); diff --git a/contrib/intarray/intarray--1.0.sql b/contrib/intarray/intarray--1.1.sql similarity index 88% rename from contrib/intarray/intarray--1.0.sql rename to contrib/intarray/intarray--1.1.sql index 0b89e0f55e..817625e54a 100644 --- a/contrib/intarray/intarray--1.0.sql +++ b/contrib/intarray/intarray--1.1.sql @@ -1,4 +1,4 @@ -/* contrib/intarray/intarray--1.0.sql */ +/* contrib/intarray/intarray--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION intarray" to load this file. \quit @@ -45,12 +45,17 @@ LANGUAGE C STRICT IMMUTABLE; COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array'; +CREATE FUNCTION _int_matchsel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + CREATE OPERATOR @@ ( LEFTARG = _int4, RIGHTARG = query_int, PROCEDURE = boolop, COMMUTATOR = '~~', - RESTRICT = contsel, + RESTRICT = _int_matchsel, JOIN = contjoinsel ); @@ -59,7 +64,7 @@ CREATE OPERATOR ~~ ( RIGHTARG = _int4, PROCEDURE = rboolop, COMMUTATOR = '@@', - RESTRICT = contsel, + RESTRICT = _int_matchsel, JOIN = contjoinsel ); @@ -117,6 +122,36 @@ RETURNS _int4 AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE; +CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + +CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal) +RETURNS float8 +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT STABLE; + -- -- OPERATORS -- @@ -126,8 +161,8 @@ CREATE OPERATOR && ( RIGHTARG = _int4, PROCEDURE = _int_overlap, COMMUTATOR = '&&', - RESTRICT = contsel, - JOIN = contjoinsel + RESTRICT = _int_overlap_sel, + JOIN = _int_overlap_joinsel ); --CREATE OPERATOR = ( @@ -157,8 +192,8 @@ CREATE OPERATOR @> ( RIGHTARG = _int4, PROCEDURE = _int_contains, COMMUTATOR = '<@', - RESTRICT = contsel, - JOIN = contjoinsel + RESTRICT = _int_contains_sel, + JOIN = _int_contains_joinsel ); CREATE OPERATOR <@ ( @@ -166,8 +201,8 @@ CREATE OPERATOR <@ ( RIGHTARG = _int4, PROCEDURE = _int_contained, COMMUTATOR = '@>', - RESTRICT = contsel, - JOIN = contjoinsel + RESTRICT = _int_contained_sel, + JOIN = _int_contained_joinsel ); -- obsolete: @@ -176,8 +211,8 @@ CREATE OPERATOR @ ( RIGHTARG = _int4, PROCEDURE = _int_contains, COMMUTATOR = '~', - RESTRICT = contsel, - JOIN = contjoinsel + RESTRICT = _int_contains_sel, + JOIN = _int_contains_joinsel ); CREATE OPERATOR ~ ( @@ -185,8 +220,8 @@ CREATE OPERATOR ~ ( RIGHTARG = _int4, PROCEDURE = _int_contained, COMMUTATOR = '@', - RESTRICT = contsel, - JOIN = contjoinsel + RESTRICT = _int_contained_sel, + JOIN = _int_contained_joinsel ); -------------- diff --git a/contrib/intarray/intarray.control b/contrib/intarray/intarray.control index 7b3d4f78f0..8c23e8d5e2 100644 --- a/contrib/intarray/intarray.control +++ b/contrib/intarray/intarray.control @@ -1,5 +1,5 @@ # intarray extension comment = 'functions, operators, and index support for 1-D arrays of integers' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/_int' relocatable = true diff --git a/contrib/intarray/sql/_int.sql b/contrib/intarray/sql/_int.sql index 216c5c58d6..f6fe2de55c 100644 --- a/contrib/intarray/sql/_int.sql +++ b/contrib/intarray/sql/_int.sql @@ -68,8 +68,8 @@ SELECT '1&(2&(4&(5|!6)))'::query_int; CREATE TABLE test__int( a int[] ); - \copy test__int from 'data/test__int.data' +ANALYZE test__int; SELECT count(*) from test__int WHERE a && '{23,50}'; SELECT count(*) from test__int WHERE a @@ '23|50';