Add selectivity estimation functions for intarray operators.

Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup
by me.
This commit is contained in:
Heikki Linnakangas 2015-07-21 20:54:18 +03:00
parent 434873806a
commit c6fbe6d6fb
7 changed files with 443 additions and 17 deletions

View File

@ -2,10 +2,10 @@
MODULE_big = _int MODULE_big = _int
OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \ OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \
_intbig_gist.o _int_gin.o $(WIN32RES) _intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES)
EXTENSION = intarray EXTENSION = intarray
DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql
PGFILEDESC = "intarray - functions and operators for arrays of integers" PGFILEDESC = "intarray - functions and operators for arrays of integers"
REGRESS = _int REGRESS = _int

View File

@ -0,0 +1,341 @@
/*-------------------------------------------------------------------------
*
* _int_selfuncs.c
* Functions for selectivity estimation of intarray operators
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* contrib/intarray/_int_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "_int.h"
#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"
#include "utils/lsyscache.h"
#include "miscadmin.h"
/*
 * SQL-callable entry points of this file.  Each name below is the C symbol
 * of a function defined further down; the three *_sel and three *_joinsel
 * functions are wrappers around the built-in array estimators, and
 * _int_matchsel is the estimator for matching int[] against query_int.
 */
PG_FUNCTION_INFO_V1(_int_overlap_sel);
PG_FUNCTION_INFO_V1(_int_contains_sel);
PG_FUNCTION_INFO_V1(_int_contained_sel);
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
PG_FUNCTION_INFO_V1(_int_matchsel);
/* Explicit prototypes for the same entry points. */
Datum _int_overlap_sel(PG_FUNCTION_ARGS);
Datum _int_contains_sel(PG_FUNCTION_ARGS);
Datum _int_contained_sel(PG_FUNCTION_ARGS);
Datum _int_overlap_joinsel(PG_FUNCTION_ARGS);
Datum _int_contains_joinsel(PG_FUNCTION_ARGS);
Datum _int_contained_joinsel(PG_FUNCTION_ARGS);
Datum _int_matchsel(PG_FUNCTION_ARGS);
/*
 * Forward declarations of the file-local helpers.  Parameter names are kept
 * identical to the ones used in the definitions below.
 */
static Selectivity int_query_opr_selec(ITEM *item, Datum *mcelems,
		float4 *mcefreqs, int nmcelems, float4 minfreq);
static int	compare_val_int4(const void *a, const void *b);
/*
* Wrappers around the default array selectivity estimation functions.
*
* The default array selectivity functions for the @>, && and <@ operators
* work fine for integer arrays. However, if we tried to just use arraycontsel
* and arraycontjoinsel directly as the cost estimator functions for our
* operators, they would not work as intended, because they look at the
* operator's OID. Our operators behave exactly like the built-in anyarray
* versions, but we must tell the cost estimator functions which built-in
* operators they correspond to. These wrappers just replace the operator
* OID with the corresponding built-in operator's OID, and call the built-in
* function.
*/
/*
 * _int_overlap_sel -- restriction selectivity wrapper for intarray's &&.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array overlap operator (OID_ARRAY_OVERLAP_OP)
 * for the operator OID in argument 1.
 */
Datum
_int_overlap_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
	Datum		result;

	result = DirectFunctionCall4(arraycontsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(result);
}
/*
 * _int_contains_sel -- restriction selectivity wrapper for intarray's
 * "contains" operators.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array contains operator (OID_ARRAY_CONTAINS_OP)
 * for the operator OID in argument 1.
 */
Datum
_int_contains_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
	Datum		result;

	result = DirectFunctionCall4(arraycontsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(result);
}
/*
 * _int_contained_sel -- restriction selectivity wrapper for intarray's
 * "is contained by" operators.
 *
 * Passes arguments 0, 2 and 3 through to arraycontsel untouched, but
 * substitutes the built-in array contained operator
 * (OID_ARRAY_CONTAINED_OP) for the operator OID in argument 1.
 */
Datum
_int_contained_sel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
	Datum		result;

	result = DirectFunctionCall4(arraycontsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3));

	PG_RETURN_DATUM(result);
}
/*
 * _int_overlap_joinsel -- join selectivity wrapper for intarray's &&.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array overlap operator (OID_ARRAY_OVERLAP_OP)
 * for the operator OID in argument 1.
 */
Datum
_int_overlap_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP);
	Datum		result;

	result = DirectFunctionCall5(arraycontjoinsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3),
			PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(result);
}
/*
 * _int_contains_joinsel -- join selectivity wrapper for intarray's
 * "contains" operators.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array contains operator (OID_ARRAY_CONTAINS_OP)
 * for the operator OID in argument 1.
 */
Datum
_int_contains_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP);
	Datum		result;

	result = DirectFunctionCall5(arraycontjoinsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3),
			PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(result);
}
/*
 * _int_contained_joinsel -- join selectivity wrapper for intarray's
 * "is contained by" operators.
 *
 * Passes arguments 0, 2, 3 and 4 through to arraycontjoinsel untouched, but
 * substitutes the built-in array contained operator
 * (OID_ARRAY_CONTAINED_OP) for the operator OID in argument 1.
 */
Datum
_int_contained_joinsel(PG_FUNCTION_ARGS)
{
	Datum		builtin_opr = ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP);
	Datum		result;

	result = DirectFunctionCall5(arraycontjoinsel,
			PG_GETARG_DATUM(0),
			builtin_opr,
			PG_GETARG_DATUM(2),
			PG_GETARG_DATUM(3),
			PG_GETARG_DATUM(4));

	PG_RETURN_DATUM(result);
}
/*
 * _int_matchsel -- restriction selectivity function for intarray @@ query_int
 *
 * Estimates the fraction of rows whose int[] column satisfies a constant
 * query_int, using the column's Most-Common-Elements statistics and NULL
 * fraction when they are available.
 *
 * Arguments read here: 0 = PlannerInfo *root, 2 = the operator's argument
 * list, 3 = varRelid.  Argument 1 is not consulted.
 *
 * Returns a float8 Datum:
 *	- DEFAULT_EQ_SEL if the clause is not "variable OP something" (in either
 *	  order), the variable is not of type int[], or the other side is not a
 *	  constant;
 *	- 0.0 if the constant is NULL or the query is empty;
 *	- otherwise the estimate computed by int_query_opr_selec, scaled by the
 *	  non-null fraction and clamped to a valid probability.
 */
Datum
_int_matchsel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;
	QUERYTYPE  *query;
	Datum	   *mcelems = NULL;
	float4	   *mcefreqs = NULL;
	int			nmcelems = 0;
	float4		minfreq = 0.0;
	float4		nullfrac = 0.0;
	Datum	   *values = NULL;
	int			nvalues = 0;
	float4	   *numbers = NULL;
	int			nnumbers = 0;

	/*
	 * If expression is not "variable @@ something" or "something @@ variable"
	 * then punt and return a default estimate.
	 */
	if (!get_restriction_variable(root, args, varRelid,
			&vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);

	/*
	 * get_restriction_variable succeeded, so vardata is populated: every
	 * exit path from here on must call ReleaseVariableStats().
	 */

	/*
	 * Variable should be int[]. We don't support cases where variable is
	 * query_int.
	 */
	if (vardata.vartype != INT4ARRAYOID)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
	}

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
	}

	/*
	 * The "@@" operator is strict, so we can cope with NULL right away.
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/* The caller made sure the const is a query, so get it now */
	query = DatumGetQueryTypeP(((Const *) other)->constvalue);

	/*
	 * Empty query matches nothing.  Return through PG_RETURN_FLOAT8 so the
	 * result is a properly formed float8 Datum, as on every other exit path.
	 */
	if (query->size == 0)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * Get the statistics for the intarray column.
	 *
	 * We're interested in the Most-Common-Elements list, and the NULL
	 * fraction.
	 */
	if (HeapTupleIsValid(vardata.statsTuple))
	{
		Form_pg_statistic stats;

		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
		nullfrac = stats->stanullfrac;

		/*
		 * For an int4 array, the default array type analyze function will
		 * collect a Most Common Elements list, which is an array of int4s.
		 */
		if (get_attstatsslot(vardata.statsTuple,
				INT4OID, -1,
				STATISTIC_KIND_MCELEM, InvalidOid,
				NULL,
				&values, &nvalues,
				&numbers, &nnumbers))
		{
			/*
			 * There should be three more Numbers than Values, because the
			 * last three (for intarray) cells are taken for minimal, maximal
			 * and nulls frequency. Punt if not.
			 */
			if (nnumbers == nvalues + 3)
			{
				/*
				 * The first nvalues entries are the per-element frequencies;
				 * the minimal frequency is stored right after them.
				 */
				minfreq = numbers[nvalues];
				mcelems = values;
				mcefreqs = numbers;
				nmcelems = nvalues;
			}
		}
	}

	/*
	 * Process the logical expression in the query, using the stats.  The
	 * walk starts at the last item of the query (index size - 1).
	 */
	selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
			mcelems, mcefreqs, nmcelems, minfreq);

	/* MCE stats count only non-null rows, so adjust for null rows. */
	selec *= (1.0 - nullfrac);

	/* values/numbers are still NULL here if no slot was fetched above */
	free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers);
	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}
/*
 * Estimate the selectivity of one node of an int query, recursing into the
 * operands of operator nodes.
 *
 * item		query node to evaluate (a VAL or an OPR item)
 * mcelems	array of Datums that is binary-searched with compare_val_int4,
 *			or NULL when no element statistics are available
 * mcefreqs	frequencies, indexed in parallel with mcelems
 * nmcelems	number of entries in mcelems
 * minfreq	frequency used to cap the estimate for values not in mcelems
 *
 * Returns the estimated selectivity.  Raises an error for an unrecognized
 * item type or operator.
 */
static Selectivity
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
		int nmcelems, float4 minfreq)
{
	Selectivity result;

	/* Recursion depth follows the query, so guard against stack overflow */
	check_stack_depth();

	if (item->type == VAL)
	{
		Datum	   *hit;

		/* No element statistics at all: fall back to the default guess */
		if (mcelems == NULL)
			return (Selectivity) DEFAULT_EQ_SEL;

		hit = (Datum *) bsearch(&item->val, mcelems, nmcelems,
				sizeof(Datum), compare_val_int4);

		if (hit == NULL)
		{
			/*
			 * Value is absent from MCELEM.  Use the default estimate, but
			 * never more than half of minfreq.
			 */
			result = Min(DEFAULT_EQ_SEL, minfreq / 2);
		}
		else
		{
			/*
			 * Value is listed in MCELEM: use the frequency recorded at the
			 * same position, which is as precise as ANALYZE could make it.
			 */
			result = mcefreqs[hit - mcelems];
		}
	}
	else if (item->type == OPR)
	{
		/*
		 * Operator node.  The operand stored immediately before the
		 * operator is always evaluated first; binary operators then
		 * evaluate the operand located through item->left.
		 */
		Selectivity sel_prev;
		Selectivity sel_left;

		sel_prev = int_query_opr_selec(item - 1, mcelems, mcefreqs,
				nmcelems, minfreq);

		if (item->val == (int32) '!')
		{
			/* negation */
			result = 1.0 - sel_prev;
		}
		else if (item->val == (int32) '&')
		{
			/* conjunction: product of the two estimates */
			sel_left = int_query_opr_selec(item + item->left, mcelems,
					mcefreqs, nmcelems, minfreq);
			result = sel_prev * sel_left;
		}
		else if (item->val == (int32) '|')
		{
			/* disjunction: sum of the estimates minus their product */
			sel_left = int_query_opr_selec(item + item->left, mcelems,
					mcefreqs, nmcelems, minfreq);
			result = sel_prev + sel_left - sel_prev * sel_left;
		}
		else
		{
			elog(ERROR, "unrecognized operator: %d", item->val);
			result = 0;			/* keep compiler quiet */
		}
	}
	else
	{
		elog(ERROR, "unrecognized int query item type: %u", item->type);
		result = 0;				/* keep compiler quiet */
	}

	/* Clamp intermediate results to stay sane despite roundoff error */
	CLAMP_PROBABILITY(result);

	return result;
}
/*
* Comparison function for binary search in mcelem array.
*/
static int
compare_val_int4(const void *a, const void *b)
{
int32 key = *(int32 *) a;
const Datum *t = (const Datum *) b;
return key - DatumGetInt32(*t);
}

View File

@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
CREATE TABLE test__int( a int[] ); CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data' \copy test__int from 'data/test__int.data'
ANALYZE test__int;
SELECT count(*) from test__int WHERE a && '{23,50}'; SELECT count(*) from test__int WHERE a && '{23,50}';
count count
------- -------

View File

@ -0,0 +1,49 @@
/* contrib/intarray/intarray--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION intarray UPDATE TO '1.1'" to load this file. \quit
-- Restriction selectivity estimator for matching int[] against query_int.
CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
-- Attach the estimator to both argument orders of the match operator.
ALTER OPERATOR @@ (_int4, query_int) SET (RESTRICT = _int_matchsel);
ALTER OPERATOR ~~ (query_int, _int4) SET (RESTRICT = _int_matchsel);
-- Restriction selectivity estimators for the int[]-vs-int[] operators.
CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
-- Join selectivity estimators for the same operators (five arguments).
CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
-- Point the existing operators at the new estimators.  The @ and ~
-- operators get the same estimators as @> and <@ respectively.
ALTER OPERATOR && (_int4, _int4) SET (RESTRICT = _int_overlap_sel, JOIN = _int_overlap_joinsel);
ALTER OPERATOR @> (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
ALTER OPERATOR <@ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);
ALTER OPERATOR @ (_int4, _int4) SET (RESTRICT = _int_contains_sel, JOIN = _int_contains_joinsel);
ALTER OPERATOR ~ (_int4, _int4) SET (RESTRICT = _int_contained_sel, JOIN = _int_contained_joinsel);

View File

@ -1,4 +1,4 @@
/* contrib/intarray/intarray--1.0.sql */ /* contrib/intarray/intarray--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION -- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION intarray" to load this file. \quit \echo Use "CREATE EXTENSION intarray" to load this file. \quit
@ -45,12 +45,17 @@ LANGUAGE C STRICT IMMUTABLE;
COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array'; COMMENT ON FUNCTION rboolop(query_int, _int4) IS 'boolean operation with array';
CREATE FUNCTION _int_matchsel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE OPERATOR @@ ( CREATE OPERATOR @@ (
LEFTARG = _int4, LEFTARG = _int4,
RIGHTARG = query_int, RIGHTARG = query_int,
PROCEDURE = boolop, PROCEDURE = boolop,
COMMUTATOR = '~~', COMMUTATOR = '~~',
RESTRICT = contsel, RESTRICT = _int_matchsel,
JOIN = contjoinsel JOIN = contjoinsel
); );
@ -59,7 +64,7 @@ CREATE OPERATOR ~~ (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = rboolop, PROCEDURE = rboolop,
COMMUTATOR = '@@', COMMUTATOR = '@@',
RESTRICT = contsel, RESTRICT = _int_matchsel,
JOIN = contjoinsel JOIN = contjoinsel
); );
@ -117,6 +122,36 @@ RETURNS _int4
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE; LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION _int_overlap_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_sel(internal, oid, internal, integer)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_overlap_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contains_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
CREATE FUNCTION _int_contained_joinsel(internal, oid, internal, smallint, internal)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
-- --
-- OPERATORS -- OPERATORS
-- --
@ -126,8 +161,8 @@ CREATE OPERATOR && (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = _int_overlap, PROCEDURE = _int_overlap,
COMMUTATOR = '&&', COMMUTATOR = '&&',
RESTRICT = contsel, RESTRICT = _int_overlap_sel,
JOIN = contjoinsel JOIN = _int_overlap_joinsel
); );
--CREATE OPERATOR = ( --CREATE OPERATOR = (
@ -157,8 +192,8 @@ CREATE OPERATOR @> (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = _int_contains, PROCEDURE = _int_contains,
COMMUTATOR = '<@', COMMUTATOR = '<@',
RESTRICT = contsel, RESTRICT = _int_contains_sel,
JOIN = contjoinsel JOIN = _int_contains_joinsel
); );
CREATE OPERATOR <@ ( CREATE OPERATOR <@ (
@ -166,8 +201,8 @@ CREATE OPERATOR <@ (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = _int_contained, PROCEDURE = _int_contained,
COMMUTATOR = '@>', COMMUTATOR = '@>',
RESTRICT = contsel, RESTRICT = _int_contained_sel,
JOIN = contjoinsel JOIN = _int_contained_joinsel
); );
-- obsolete: -- obsolete:
@ -176,8 +211,8 @@ CREATE OPERATOR @ (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = _int_contains, PROCEDURE = _int_contains,
COMMUTATOR = '~', COMMUTATOR = '~',
RESTRICT = contsel, RESTRICT = _int_contains_sel,
JOIN = contjoinsel JOIN = _int_contains_joinsel
); );
CREATE OPERATOR ~ ( CREATE OPERATOR ~ (
@ -185,8 +220,8 @@ CREATE OPERATOR ~ (
RIGHTARG = _int4, RIGHTARG = _int4,
PROCEDURE = _int_contained, PROCEDURE = _int_contained,
COMMUTATOR = '@', COMMUTATOR = '@',
RESTRICT = contsel, RESTRICT = _int_contained_sel,
JOIN = contjoinsel JOIN = _int_contained_joinsel
); );
-------------- --------------

View File

@ -1,5 +1,5 @@
# intarray extension # intarray extension
comment = 'functions, operators, and index support for 1-D arrays of integers' comment = 'functions, operators, and index support for 1-D arrays of integers'
default_version = '1.0' default_version = '1.1'
module_pathname = '$libdir/_int' module_pathname = '$libdir/_int'
relocatable = true relocatable = true

View File

@ -68,8 +68,8 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
CREATE TABLE test__int( a int[] ); CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data' \copy test__int from 'data/test__int.data'
ANALYZE test__int;
SELECT count(*) from test__int WHERE a && '{23,50}'; SELECT count(*) from test__int WHERE a && '{23,50}';
SELECT count(*) from test__int WHERE a @@ '23|50'; SELECT count(*) from test__int WHERE a @@ '23|50';