commit Oleg and Teodor's RD-tree implementation ... this provides the

regression tests for the GiST changes ... this should be integrated into
the regular regression tests similar to Vadim's SPI contrib stuff ...
This commit is contained in:
Marc G. Fournier 2001-01-12 00:16:26 +00:00
parent 0ad7db4be4
commit 1db943b3ca
10 changed files with 6478 additions and 0 deletions

69
contrib/intarray/Makefile Normal file
View File

@ -0,0 +1,69 @@
# Location of this directory relative to the top of the build tree.
subdir = contrib/intarray
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# override libdir to install shlib in contrib not main directory
libdir := $(libdir)/contrib
# shared library parameters
NAME= _int
SO_MAJOR_VERSION= 1
SO_MINOR_VERSION= 0
# -DPGSQL71 makes _int.c use ArrayGetNItems() instead of getNitems().
# "override" keeps these flags even if CPPFLAGS is given on the command line.
override CPPFLAGS += -I$(srcdir) -DPGSQL71
OBJS= _int.o
# Default goal: the shared library plus the generated SQL script.
# (all-lib and $(shlib) are presumably provided by Makefile.shlib -- confirm.)
all: all-lib $(NAME).sql
# Shared library stuff
include $(top_srcdir)/src/Makefile.shlib
# Generate the SQL script by substituting the installed library path for
# every MODULE_PATHNAME placeholder in the template.
$(NAME).sql: $(NAME).sql.in
sed -e 's:MODULE_PATHNAME:$(libdir)/$(shlib):g' < $< > $@
.PHONY: submake
# Build the pg_regress driver in the main regression-test directory.
submake:
	$(MAKE) -C $(top_builddir)/src/test/regress pg_regress

# against installed postmaster
# This is the only definition of installcheck: giving one target two rules
# with recipes makes GNU make warn and ignore all but the last one.
installcheck: submake
	$(top_builddir)/src/test/regress/pg_regress _int

# in-tree test doesn't work yet (no way to install my shared library)
#check: all submake
#	$(top_builddir)/src/test/regress/pg_regress --temp-install \
#	  --top-builddir=$(top_builddir) _int
check:
	@echo "'make check' is not supported."
	@echo "Do 'make install', then 'make installcheck' instead."
# Install the shared library (install-lib) and the generated SQL script.
# Installing the README is currently disabled.
install: all installdirs install-lib
#$(INSTALL_DATA) $(srcdir)/README.$(NAME) $(docdir)/contrib
$(INSTALL_DATA) $(NAME).sql $(datadir)/contrib
# Create every directory that install writes into.
installdirs:
$(mkinstalldirs) $(docdir)/contrib $(datadir)/contrib $(libdir)
# Remove the installed files; the README is removed too even though install
# does not currently copy it (rm -f makes that harmless).
uninstall: uninstall-lib
rm -f $(docdir)/contrib/README.$(NAME) $(datadir)/contrib/$(NAME).sql
# All three cleaning levels do the same thing here.
clean distclean maintainer-clean: clean-lib
rm -f *.so y.tab.c y.tab.h $(OBJS) $(NAME).sql
# things created by various check targets
rm -rf results tmp_check log
rm -f regression.diffs regression.out regress.out run_check.out
ifeq ($(PORTNAME), win)
rm -f regress.def
endif
# Regenerate header dependencies into the file "depend".
depend dep:
$(CC) -MM $(CFLAGS) *.c >depend
# Pull in the generated dependencies only if the file exists.
ifeq (depend,$(wildcard depend))
include depend
endif

View File

@ -0,0 +1,64 @@
#-------------------------------------------------------------------------
#
# Makefile --
#
# Makefile for the _int module (GiST index support for int4 arrays)
#
#-------------------------------------------------------------------------
# Top of the PostgreSQL tree and its src directory, relative to here.
PGDIR = ../..
SRCDIR = $(PGDIR)/src
include $(SRCDIR)/Makefile.global
# Header search path: this directory plus the backend source/include trees.
INCLUDE_OPT = -I ./ \
-I $(SRCDIR)/ \
-I $(SRCDIR)/include \
-I $(SRCDIR)/port/$(PORTNAME)
CFLAGS += $(INCLUDE_OPT) $(CFLAGS_SL)
# Names derived from the module name: object file, SQL script, loadable module.
MODNAME = _int
OBJFILES = $(MODNAME).o
SQLDEFS = $(MODNAME).sql
MODULE = $(MODNAME)$(DLSUFFIX)
# Installation directories for the loadable module and the SQL script.
MODDIR = $(LIBDIR)/modules
SQLDIR = $(LIBDIR)/sql
# These targets are commands, not files: declare them phony so that a stray
# file called e.g. "clean" or "sql" cannot make them look up to date.
# "depend" is deliberately left out because it is a real file that is
# included below.
.PHONY: all module sql install dep clean

all: module sql

module: $(MODULE)

sql: $(SQLDEFS)

# Link the loadable module from its object file(s).
$(MODULE): $(OBJFILES)
	$(CC) $(CFLAGS) -shared -o $@ $(OBJFILES)

# Copy the module and SQL script into place, creating the target
# directories first if they do not exist.
install: $(MODULE) $(SQLDEFS) $(MODDIR) $(SQLDIR)
	cp -p $(MODULE) $(MODDIR)/
	strip $(MODDIR)/$(MODULE)
	cp -p $(SQLDEFS) $(SQLDIR)/

$(MODDIR):
	mkdir -p $@

$(SQLDIR):
	mkdir -p $@

# Generate the SQL script from its template by filling in the installed
# path of the loadable module.
%.sql: %.sql.in
	sed "s|MODULE_PATHNAME|$(MODDIR)/$(MODULE)|" < $< > $@

# Regenerate header dependencies into the file "depend".
depend dep:
	$(CC) -MM $(INCLUDE_OPT) *.c >depend

clean:
	rm -f $(MODULE) $(SQLDEFS) *$(DLSUFFIX)
	rm -f *~ *# *.b *.o *.output *.tab.h $(MODNAME)parse.h $(MODNAME)parse.c $(MODNAME)scan.c

# Pull in the generated dependencies only if the file exists.
ifeq (depend,$(wildcard depend))
include depend
endif

View File

@ -0,0 +1,81 @@
This is an implementation of the RD-tree data structure using the GiST
interface of PostgreSQL. It has built-in lossy compression, which must be
declared at index creation time by adding "with ( islossy )". The current
implementation provides index support for one-dimensional arrays of int4's.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su). See http://www.sai.msu.su/~megera/postgres/gist
for additional information.
INSTALLATION:
gmake
gmake install
-- load functions
psql <database> < _int.sql
REGRESSION TEST:
gmake installcheck
EXAMPLE USAGE:
create table message (mid int not null,sections int[]);
create table message_section_map (mid int not null,sid int not null);
-- create indices
CREATE unique index message_key on message ( mid );
CREATE unique index message_section_map_key2 on message_section_map (sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
-- select some messages with section in 1 OR 2 - OVERLAP operator
select message.mid from message where message.sections && '{1,2}';
-- select messages that are in both sections 1 AND 2 - CONTAINS operator
select message.mid from message where message.sections @ '{1,2}';
-- the same, CONTAINED operator
select message.mid from message where '{1,2}' ~ message.sections;
BENCHMARK:
subdirectory bench contains benchmark suite.
cd ./bench
1. createdb TEST
2. psql TEST < ../_int.sql
3. ./create_test.pl | psql TEST
4. ./bench.pl - perl script to benchmark queries, supports OR, AND queries
with/without RD-Tree. Run script without arguments to
see available options.
a)test without RD-Tree (OR)
./bench.pl -d TEST -s 1,2 -v
b)test with RD-Tree
./bench.pl -d TEST -s 1,2 -v -r
BENCHMARK RESULTS:
Size of table <message>: 200000
Size of table <message_section_map>: 268538
Distribution of messages by sections:
section 0: 73899 messages
section 1: 16298 messages
section 50: 1241 messages
section 99: 705 messages
old - without RD-Tree support,
new - with RD-Tree
+----------+---------------+----------------+
|Search set|OR, time in sec|AND, time in sec|
| +-------+-------+--------+-------+
| | old | new | old | new |
+----------+-------+-------+--------+-------+
| 1| 1.427| 0.215| -| -|
+----------+-------+-------+--------+-------+
| 99| 1.029| 0.018| -| -|
+----------+-------+-------+--------+-------+
| 1,2| 1.829| 0.334| 5.654| 0.042|
+----------+-------+-------+--------+-------+
| 1,2,50,60| 2.057| 0.359| 5.044| 0.007|
+----------+-------+-------+--------+-------+

842
contrib/intarray/_int.c Normal file
View File

@ -0,0 +1,842 @@
/******************************************************************************
This file contains routines that can be bound to a Postgres backend and
called by the backend in the process of processing queries. The calling
format for these routines is dictated by Postgres architecture.
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <string.h>
#include "postgres.h"
#include "access/gist.h"
#include "access/itup.h"
#include "access/rtree.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
/* Maximum number of [lo,hi] ranges kept in a compressed index key. */
#define MAXNUMRANGE 100

/*
 * Small arithmetic helpers.  Every argument use is parenthesised so that
 * expressions such as abs(x - y) expand correctly.  Arguments are evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define abs(a) ((a) < (0) ? (-(a)) : (a))

/* Pointer to the int4 payload of an array. */
#define ARRPTR(x) ( (int4 *) ARR_DATA_PTR(x) )

/* Number of elements in an array; the helper name depends on the server version. */
#ifdef PGSQL71
#define ARRSIZE(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
#else
#define ARRSIZE(x) getNitems( ARR_NDIM(x), ARR_DIMS(x))
#endif

/* This module only handles one-dimensional arrays. */
#define NDIM 1

/* True for a NULL pointer, an array that is not NDIM-dimensional, or an empty array. */
#define ARRISNULL(x) ( (x) ? ( ( ARR_NDIM(x) == NDIM ) ? ( ( ARRSIZE( x ) ) ? 0 : 1 ) : 1 ) : 1 )

/*
 * Sort an array in place.  Wrapped in do { } while (0) so the macro acts as
 * a single statement and cannot capture a following "else".
 */
#define SORT(x) \
	do { \
		if ( ARRSIZE( x ) > 1 ) \
			isort( (void*)ARRPTR( x ), ARRSIZE( x ) ); \
	} while (0)

/*
 * Sort an array in place and, if isort() reports duplicates, remove them.
 * May reassign x, because _int_unique() can reallocate the array.
 */
#define PREPAREARR(x) \
	do { \
		if ( ARRSIZE( x ) > 1 ) { \
			if ( isort( (void*)ARRPTR( x ), ARRSIZE( x ) ) ) \
				x = _int_unique( x ); \
		} \
	} while (0)
/*
#define GIST_DEBUG
#define GIST_QUERY_DEBUG
*/
#ifdef GIST_DEBUG
/*
 * printarr -- debugging aid: emit the first "num" elements of "a" (or all
 * of them, if there are fewer) as a single NOTICE line.
 *
 * Output is truncated rather than overflowing the fixed-size buffer.
 */
static void
printarr(ArrayType *a, int num)
{
	char		bbb[16384];
	char	   *cur = bbb;
	char	   *end = bbb + sizeof(bbb);
	int		   *d = ARRPTR(a);
	int			n = min(num, ARRSIZE(a));
	int			l;

	*bbb = '\0';
	for (l = 0; l < n; l++)
	{
		/* "-2147483648 " plus the terminating NUL needs 13 bytes */
		if (end - cur < 13)
			break;
		sprintf(cur, "%d ", d[l]);
		cur = strchr(cur, '\0');
	}
	elog(NOTICE, "\t\t%s", bbb);
}
#endif
/*
** useful functions: sorting, allocation, copying and de-duplication helpers
*/
bool isort( int *a, const int len );
ArrayType * new_intArrayType( int num );
ArrayType * copy_intArrayType( ArrayType * a );
ArrayType * resize_intArrayType( ArrayType * a, int num );
int internal_size( int *a, int len );
ArrayType * _int_unique( ArrayType * a );
/*
** GiST support methods
*/
bool g_int_consistent(GISTENTRY *entry, ArrayType *query, StrategyNumber strategy);
GISTENTRY * g_int_compress(GISTENTRY *entry);
GISTENTRY * g_int_decompress(GISTENTRY *entry);
float * g_int_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result);
GIST_SPLITVEC * g_int_picksplit(bytea *entryvec, GIST_SPLITVEC *v);
bool g_int_internal_consistent(ArrayType *key, ArrayType *query, StrategyNumber strategy);
ArrayType * g_int_union(bytea *entryvec, int *sizep);
bool * g_int_same(ArrayType *b1, ArrayType *b2, bool *result);
/*
** R-tree support functions.
** The inner_* variants work on their arguments as given; the _int_*
** variants sort (or copy and sort) their arguments first.
*/
bool inner_int_contains(ArrayType *a, ArrayType *b);
bool inner_int_overlap(ArrayType *a, ArrayType *b);
ArrayType * inner_int_union(ArrayType *a, ArrayType *b);
ArrayType * inner_int_inter(ArrayType *a, ArrayType *b);
bool _int_different(ArrayType *a, ArrayType *b);
bool _int_same(ArrayType *a, ArrayType *b);
bool _int_contains(ArrayType *a, ArrayType *b);
bool _int_contained(ArrayType *a, ArrayType *b);
bool _int_overlap(ArrayType *a, ArrayType *b);
ArrayType * _int_union(ArrayType *a, ArrayType *b);
ArrayType * _int_inter(ArrayType *a, ArrayType *b);
void rt__int_size(ArrayType *a, float* sz);
/*****************************************************************************
* GiST functions
*****************************************************************************/
/*
 * GiST "consistent" method for int4 arrays.
 *
 * Returns FALSE when the query array is NULL, empty or not one-dimensional.
 * Otherwise the query is sorted and de-duplicated in place and tested
 * against the entry's key with g_int_internal_consistent(); the same test
 * is used for leaf and non-leaf entries.
 */
bool
g_int_consistent(GISTENTRY *entry,
				 ArrayType *query,
				 StrategyNumber strategy)
{
	ArrayType  *key;

	if (ARRISNULL(query))
		return FALSE;

	/* the stored key is already sorted; bring the query into the same form */
	PREPAREARR(query);

	key = (ArrayType *) (entry->pred);
	return g_int_internal_consistent(key, query, strategy);
}
/*
 * GiST "union" method for int4 arrays.
 *
 * Returns a newly allocated array holding the union of all keys in
 * entryvec and stores its byte size in *sizep.  The union is built up one
 * entry at a time starting from nothing, so a vector with a single entry
 * yields a copy of that entry instead of dereferencing an unset result.
 * An entry vector with no entries yields NULL with *sizep set to 0.
 */
ArrayType *
g_int_union(bytea *entryvec, int *sizep)
{
	GISTENTRY  *ent = (GISTENTRY *) VARDATA(entryvec);
	int			numranges = (VARSIZE(entryvec) - VARHDRSZ) / sizeof(GISTENTRY);
	ArrayType  *out = (ArrayType *) NULL;
	ArrayType  *tmp;
	int			i;

#ifdef GIST_DEBUG
	elog(NOTICE, "union %d", numranges);
#endif

	for (i = 0; i < numranges; i++)
	{
		/* inner_int_union() treats a NULL argument as the empty set */
		tmp = inner_int_union(out, (ArrayType *) ent[i].pred);
		if (out)
			pfree(out);
		out = tmp;
	}

	if (out == NULL)
	{
		*sizep = 0;
		return NULL;
	}

	/* VARSIZE includes the array header, so it is never 0 for a real array */
	*sizep = VARSIZE(out);
#ifdef GIST_DEBUG
	elog(NOTICE, "\t ENDunion %d %d", *sizep, ARRSIZE(out));
#endif
	return out;
}
/*
** GiST Compress and Decompress methods
*/
/*
** g_int_compress builds the stored form of a key in a newly allocated
** GISTENTRY.  Arrays shorter than 2*MAXNUMRANGE elements are stored as a
** sorted copy; longer ones are converted into exactly MAXNUMRANGE
** [lo,hi] ranges (2*MAXNUMRANGE ints), which is lossy.
*/
GISTENTRY *
g_int_compress(GISTENTRY *entry)
{
GISTENTRY *retval;
ArrayType * r;
int len;
int *dr;
int i,min,cand;
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for compression");
/* NULL, empty or non-one-dimensional input is stored as a NULL key of 0 bytes */
if ( ARRISNULL( (ArrayType *) entry->pred ) ) {
#ifdef GIST_DEBUG
elog(NOTICE,"COMP IN: NULL");
#endif
gistentryinit(*retval, (char *)NULL, entry->rel, entry->page, entry->offset,
0, FALSE);
return( retval );
}
/* work on a private copy; only leaf keys are sorted and de-duplicated here */
r = copy_intArrayType( (ArrayType *) entry->pred );
if ( entry->leafkey ) PREPAREARR( r );
len = ARRSIZE( r );
#ifdef GIST_DEBUG
elog(NOTICE, "COMP IN: %d leaf; %d rel; %d page; %d offset; %d bytes; %d elems", entry->leafkey, (int)entry->rel, (int)entry->page, (int)entry->offset, (int)entry->bytes, len);
//printarr( r, len );
#endif
if ( len >= 2*MAXNUMRANGE ) { /*compress*/
/* double the array and turn every value v into the degenerate range [v,v];
   walking backwards keeps unread source slots from being overwritten */
r = resize_intArrayType( r, 2*( len ) );
dr = ARRPTR( r );
for(i=len-1; i>=0;i--)
dr[2*i] = dr[2*i+1] = dr[i];
len *= 2;
cand = 1;
/* repeatedly merge the two neighbouring ranges separated by the smallest
   gap until only MAXNUMRANGE ranges are left */
while( len > MAXNUMRANGE * 2 ) {
min = 0x7fffffff;
/* dr[i] is the lower bound of a range, dr[i-1] the upper bound of the one before it */
for( i=2; i<len;i+=2 )
if ( min > (dr[i] - dr[i-1]) ) {
min = (dr[i] - dr[i-1]);
cand = i;
}
/* drop dr[cand-1] and dr[cand], joining the two ranges into one */
memmove( (void*)&dr[cand-1], (void*)&dr[cand+1], (len - cand - 1)*sizeof(int) );
len -= 2;
}
r = resize_intArrayType(r, len );
}
gistentryinit(*retval, (char *)r, entry->rel, entry->page, entry->offset, VARSIZE( r ), FALSE);
return(retval);
}
/*
** g_int_decompress turns a stored key back into a plain sorted array.
** Keys with fewer than 2*MAXNUMRANGE ints were stored uncompressed and are
** returned in the entry that was passed in; longer keys are lists of
** [lo,hi] pairs and are expanded into a new array in a new GISTENTRY.
*/
GISTENTRY *
g_int_decompress(GISTENTRY *entry)
{
GISTENTRY *retval;
ArrayType * r;
int *dr, lenr;
ArrayType * in;
int lenin;
int *din;
int i,j;
/* too few bytes to hold a one-dimensional array header, or a NULL/empty key:
   hand back a NULL key */
if ( entry->bytes < ARR_OVERHEAD( NDIM ) || ARRISNULL( (ArrayType *) entry->pred ) ) {
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for decompression");
gistentryinit(*retval, (char *)NULL, entry->rel, entry->page, entry->offset, 0, FALSE);
#ifdef GIST_DEBUG
elog(NOTICE,"DECOMP IN: NULL");
#endif
return( retval );
}
in = (ArrayType *) entry->pred;
lenin = ARRSIZE(in);
din = ARRPTR(in);
if ( lenin < 2*MAXNUMRANGE ) { /*not comressed value*/
/* sometimes strange bytesize */
/* re-initialise the caller's entry in place so that bytes matches VARSIZE(in) */
gistentryinit(*entry, (char *)in, entry->rel, entry->page, entry->offset, VARSIZE( in ), FALSE);
return (entry);
}
#ifdef GIST_DEBUG
elog(NOTICE, "DECOMP IN: %d leaf; %d rel; %d page; %d offset; %d bytes; %d elems", entry->leafkey, (int)entry->rel, (int)entry->page, (int)entry->offset, (int)entry->bytes, lenin);
//printarr( in, lenin );
#endif
/* size the output from the ranges, then enumerate every value of each
   [din[i], din[i+1]] pair */
lenr = internal_size(din, lenin);
r = new_intArrayType( lenr );
dr = ARRPTR( r );
for(i=0;i<lenin;i+=2)
for(j=din[i]; j<=din[i+1]; j++)
/* after the first range, skip a value equal to the one written last */
if ( (!i) || *(dr-1) != j )
*dr++ = j;
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for decompression");
gistentryinit(*retval, (char *)r, entry->rel, entry->page, entry->offset, VARSIZE( r ), FALSE);
return(retval);
}
/*
 * GiST "penalty" method for int4 arrays.
 *
 * The penalty of adding newentry under origentry is the number of elements
 * by which the original key would grow: size(orig UNION new) - size(orig).
 * The value is stored in *result, and result is returned.
 */
float *
g_int_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
{
	ArrayType  *orig = (ArrayType *) (origentry->pred);
	ArrayType  *merged;
	float		merged_size;
	float		orig_size;

#ifdef GIST_DEBUG
	elog(NOTICE, "penalty");
#endif

	merged = inner_int_union(orig, (ArrayType *) (newentry->pred));
	rt__int_size(merged, &merged_size);
	rt__int_size(orig, &orig_size);
	pfree(merged);

	*result = merged_size - orig_size;

#ifdef GIST_DEBUG
	elog(NOTICE, "--penalty\t%g", *result);
#endif

	return result;
}
/*
** The GiST PickSplit method for int4 arrays
** We use Guttman's poly time split algorithm
**
** The entries of entryvec are distributed over v->spl_left / v->spl_right,
** and the union of each side is returned in v->spl_ldatum / v->spl_rdatum.
*/
GIST_SPLITVEC *
g_int_picksplit(bytea *entryvec,
GIST_SPLITVEC *v)
{
OffsetNumber i, j;
ArrayType *datum_alpha, *datum_beta;
ArrayType *datum_l, *datum_r;
ArrayType *union_d, *union_dl, *union_dr;
ArrayType *inter_d;
bool firsttime;
float size_alpha, size_beta, size_union, size_inter;
float size_waste, waste;
float size_l, size_r;
int nbytes;
OffsetNumber seed_1 = 0, seed_2 = 0;
OffsetNumber *left, *right;
OffsetNumber maxoff;
#ifdef GIST_DEBUG
elog(NOTICE, "--------picksplit %d",(VARSIZE(entryvec) - VARHDRSZ)/sizeof(GISTENTRY));
#endif
/* maxoff is two less than the number of GISTENTRY slots in entryvec; the
   seed search below looks at offsets FirstOffsetNumber..maxoff only */
maxoff = ((VARSIZE(entryvec) - VARHDRSZ)/sizeof(GISTENTRY)) - 2;
nbytes = (maxoff + 2) * sizeof(OffsetNumber);
v->spl_left = (OffsetNumber *) palloc(nbytes);
v->spl_right = (OffsetNumber *) palloc(nbytes);
firsttime = true;
waste = 0.0;
/* seed selection: examine every pair (i, j) and keep the pair whose union
   exceeds its intersection by the largest number of elements */
for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) {
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[i].pred);
for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) {
datum_beta = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[j].pred);
/* compute the wasted space by unioning these guys */
/* size_waste = size_union - size_inter; */
union_d = (ArrayType *)inner_int_union(datum_alpha, datum_beta);
rt__int_size(union_d, &size_union);
inter_d = (ArrayType *)inner_int_inter(datum_alpha, datum_beta);
rt__int_size(inter_d, &size_inter);
size_waste = size_union - size_inter;
pfree(union_d);
/* inner_int_inter() returns NULL for an empty intersection */
if (inter_d != (ArrayType *) NULL)
pfree(inter_d);
/*
* are these a more promising split that what we've
* already seen?
*/
if (size_waste > waste || firsttime) {
waste = size_waste;
seed_1 = i;
seed_2 = j;
firsttime = false;
}
}
}
left = v->spl_left;
v->spl_nleft = 0;
right = v->spl_right;
v->spl_nright = 0;
/* each side's running union starts out as a private copy of its seed */
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[seed_1].pred);
datum_l = copy_intArrayType( datum_alpha );
rt__int_size((ArrayType *)datum_l, &size_l);
datum_beta = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[seed_2].pred);
datum_r = copy_intArrayType( datum_beta );
rt__int_size((ArrayType *)datum_r, &size_r);
/*
* Now split up the regions between the two seeds. An important
* property of this split algorithm is that the split vector v
* has the indices of items to be split in order in its left and
* right vectors. We exploit this property by doing a merge in
* the code that actually splits the page.
*
* For efficiency, we also place the new index tuple in this loop.
* This is handled at the very end, when we have placed all the
* existing tuples and i == maxoff + 1.
*/
maxoff = OffsetNumberNext(maxoff);
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
/*
* If we've already decided where to place this item, just
* put it on the right list. Otherwise, we need to figure
* out which page needs the least enlargement in order to
* store the item.
*/
if (i == seed_1) {
*left++ = i;
v->spl_nleft++;
continue;
} else if (i == seed_2) {
*right++ = i;
v->spl_nright++;
continue;
}
/* okay, which page needs least enlargement? */
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[i].pred);
union_dl = (ArrayType *)inner_int_union(datum_l, datum_alpha);
union_dr = (ArrayType *)inner_int_union(datum_r, datum_alpha);
rt__int_size((ArrayType *)union_dl, &size_alpha);
rt__int_size((ArrayType *)union_dr, &size_beta);
/* pick which page to add it to */
/* the chosen side adopts the enlarged union; the old union of that side
   and the rejected candidate union are freed */
if (size_alpha - size_l < size_beta - size_r) {
if ( datum_l ) pfree(datum_l);
if ( union_dr ) pfree(union_dr);
datum_l = union_dl;
size_l = size_alpha;
*left++ = i;
v->spl_nleft++;
} else {
if ( datum_r ) pfree(datum_r);
if ( union_dl ) pfree(union_dl);
datum_r = union_dr;
size_r = size_beta;
*right++ = i;
v->spl_nright++;
}
}
/**left = *right = FirstOffsetNumber;*/ /* sentinel value, see dosplit() */
/* NOTE(review): the list that received the highest offset has its last
   slot overwritten with InvalidOffsetNumber and the other list gets a
   FirstOffsetNumber sentinel appended; spl_nleft/spl_nright are not
   adjusted.  Presumably this takes the not-yet-stored new tuple out of the
   lists -- confirm against the GiST split code. */
if ( *(left-1) > *(right-1) ) {
*right = FirstOffsetNumber;
*(left-1) = InvalidOffsetNumber;
} else {
*left = FirstOffsetNumber;
*(right-1) = InvalidOffsetNumber;
}
v->spl_ldatum = (char *)datum_l;
v->spl_rdatum = (char *)datum_r;
#ifdef GIST_DEBUG
elog(NOTICE, "--------ENDpicksplit %d %d",v->spl_nleft, v->spl_nright);
#endif
return v;
}
/*
 * GiST "same" method: stores in *result whether the two keys hold the same
 * elements (as decided by _int_same) and returns result.
 */
bool *
g_int_same(ArrayType *b1, ArrayType *b2, bool *result)
{
	*result = _int_same(b1, b2) ? TRUE : FALSE;
	return result;
}
/*
 * Decide whether an index key can satisfy "key <op> query" for the given
 * strategy.  "Same" and "contains" require the key to contain the query;
 * "overlap" and "contained by" only require a common element.  Any other
 * strategy number yields FALSE.
 */
bool
g_int_internal_consistent(ArrayType *key,
						  ArrayType *query,
						  StrategyNumber strategy)
{
#ifdef GIST_QUERY_DEBUG
	elog(NOTICE, "internal_consistent, %d", strategy);
#endif

	switch (strategy)
	{
		case RTSameStrategyNumber:
		case RTContainsStrategyNumber:
			return (bool) inner_int_contains(key, query);

		case RTOverlapStrategyNumber:
		case RTContainedByStrategyNumber:
			return (bool) inner_int_overlap(key, query);

		default:
			break;
	}
	return FALSE;
}
/* Is a contained in b?  Simply "contains" with the arguments swapped. */
bool
_int_contained(ArrayType *a, ArrayType *b)
{
	bool		res = _int_contains(b, a);

	return res;
}
/*
 * Does a contain every element of b?
 *
 * Returns FALSE if either argument is NULL, empty or not one-dimensional.
 * The comparison runs on sorted, de-duplicated private copies, so the
 * caller's arrays are left untouched.
 */
bool
_int_contains(ArrayType *a, ArrayType *b)
{
	ArrayType  *a_copy;
	ArrayType  *b_copy;
	bool		result;

	if (ARRISNULL(a))
		return FALSE;
	if (ARRISNULL(b))
		return FALSE;

	a_copy = copy_intArrayType(a);
	b_copy = copy_intArrayType(b);
	PREPAREARR(a_copy);
	PREPAREARR(b_copy);

	result = inner_int_contains(a_copy, b_copy);

	pfree(a_copy);
	pfree(b_copy);
	return result;
}
/*
 * Does a contain every element of b?  Both arrays are walked in step as
 * sorted sequences, counting positions where they agree; the answer is
 * TRUE when that count equals the length of b.  Returns FALSE if either
 * argument is NULL, empty or not one-dimensional.
 */
bool
inner_int_contains(ArrayType *a, ArrayType *b)
{
	int			na,
				nb;
	int		   *pa,
			   *pb;
	int		   *enda,
			   *endb;
	int			matched = 0;

	if (ARRISNULL(a) || ARRISNULL(b))
		return FALSE;

	na = ARRSIZE(a);
	nb = ARRSIZE(b);
	pa = ARRPTR(a);
	pb = ARRPTR(b);
	enda = pa + na;
	endb = pb + nb;

#ifdef GIST_DEBUG
	elog(NOTICE, "contains %d %d", na, nb);
#endif

	while (pa < enda && pb < endb)
	{
		if (*pa < *pb)
			pa++;
		else if (*pa > *pb)
			pb++;
		else
		{
			matched++;
			pa++;
			pb++;
		}
	}

	return (matched == nb) ? TRUE : FALSE;
}
/*****************************************************************************
* Operator class for R-tree indexing
*****************************************************************************/
/* Negation of _int_same(). */
bool
_int_different(ArrayType *a, ArrayType *b)
{
	if (_int_same(a, b))
		return (!1);
	return (!0);
}
/*
 * Do a and b hold the same elements?
 *
 * Two arrays that are both NULL/empty/not one-dimensional count as equal;
 * if only one of them is, they differ.  Otherwise both arrays are sorted
 * IN PLACE and compared element by element.
 */
bool
_int_same(ArrayType *a, ArrayType *b)
{
	bool		a_void = ARRISNULL(a);
	bool		b_void = ARRISNULL(b);
	int			len;
	int			k;
	int		   *pa;
	int		   *pb;

	if (a_void || b_void)
		return (a_void && b_void) ? TRUE : FALSE;

	SORT(a);
	SORT(b);

	len = ARRSIZE(a);
	if (len != ARRSIZE(b))
		return FALSE;

	pa = ARRPTR(a);
	pb = ARRPTR(b);
	k = 0;
	while (k < len)
	{
		if (pa[k] != pb[k])
			return FALSE;
		k++;
	}
	return TRUE;
}
/*
 * _int_overlap -- do a and b share at least one element?
 *
 * Returns FALSE if either argument is NULL, empty or not one-dimensional.
 * Both arrays are sorted in place before being compared.
 */
bool
_int_overlap(ArrayType *a, ArrayType *b)
{
	bool		shared;

	if (ARRISNULL(a) || ARRISNULL(b))
		return FALSE;

	SORT(a);
	SORT(b);

	shared = inner_int_overlap(a, b);
	return shared;
}
/*
 * Do a and b share at least one element?  Both arrays are walked in step
 * as sorted sequences, stopping at the first position where they agree.
 * Returns FALSE if either argument is NULL, empty or not one-dimensional.
 */
bool
inner_int_overlap(ArrayType *a, ArrayType *b)
{
	int		   *pa,
			   *pb;
	int		   *enda,
			   *endb;

	if (ARRISNULL(a) || ARRISNULL(b))
		return FALSE;

	pa = ARRPTR(a);
	pb = ARRPTR(b);
	enda = pa + ARRSIZE(a);
	endb = pb + ARRSIZE(b);

#ifdef GIST_DEBUG
	elog(NOTICE, "g_int_overlap");
#endif

	while (pa < enda && pb < endb)
	{
		if (*pa == *pb)
			return TRUE;
		if (*pa < *pb)
			pa++;
		else
			pb++;
	}
	return FALSE;
}
/*
 * Union of a and b.  Each usable argument is sorted in place first; the
 * merge itself is done by inner_int_union().
 */
ArrayType *
_int_union(ArrayType *a, ArrayType *b)
{
	if (!ARRISNULL(a))
	{
		SORT(a);
	}
	if (!ARRISNULL(b))
	{
		SORT(b);
	}
	return inner_int_union(a, b);
}
/*
 * Union of a and b, returned as a newly allocated array without repeated
 * values.  An argument that is NULL, empty or not one-dimensional is
 * treated as the empty set; if both are, an empty array is returned.
 * Otherwise the two arrays are merged as sorted sequences.
 */
ArrayType *
inner_int_union(ArrayType *a, ArrayType *b)
{
	ArrayType  *r;
	bool		a_void = ARRISNULL(a);
	bool		b_void = ARRISNULL(b);

	if (a_void && b_void)
		return new_intArrayType(0);

	if (a_void)
		r = copy_intArrayType(b);
	else if (b_void)
		r = copy_intArrayType(a);
	else
	{
		int			na = ARRSIZE(a);
		int			nb = ARRSIZE(b);
		int		   *da = ARRPTR(a);
		int		   *db = ARRPTR(b);
		int		   *dr;
		int			i = 0,
					j = 0,
					k = 0;

		r = new_intArrayType(na + nb);
		dr = ARRPTR(r);

		/* ordinary two-way merge; equal values are both copied for now */
		while (i < na && j < nb)
		{
			if (da[i] < db[j])
				dr[k++] = da[i++];
			else
				dr[k++] = db[j++];
		}
		for (; i < na; i++)
			dr[k++] = da[i];
		for (; j < nb; j++)
			dr[k++] = db[j];
	}

	/* squeeze out repeated values */
	if (ARRSIZE(r) > 1)
		r = _int_unique(r);

	return r;
}
/*
 * Intersection of a and b.
 *
 * Returns NULL if either argument is NULL, empty or not one-dimensional,
 * or if the two arrays have no element in common.  Both arrays are sorted
 * in place before being intersected.
 */
ArrayType *
_int_inter(ArrayType *a, ArrayType *b)
{
	/* this function returns a pointer, so "nothing" is NULL, not FALSE */
	if (ARRISNULL(a) || ARRISNULL(b))
		return NULL;
	SORT(a);
	SORT(b);
	return inner_int_inter(a, b);
}
/*
 * Intersection of a and b, walked in step as sorted sequences.
 *
 * Returns a newly allocated array holding each common value once, or NULL
 * if either argument is NULL, empty or not one-dimensional, or if there is
 * no common value.
 */
ArrayType *
inner_int_inter(ArrayType *a, ArrayType *b)
{
	ArrayType  *r;
	int			na,
				nb;
	int		   *da,
			   *db,
			   *dr;
	int			i,
				j;

	if (ARRISNULL(a) || ARRISNULL(b))
		return NULL;

	na = ARRSIZE(a);
	nb = ARRSIZE(b);
	da = ARRPTR(a);
	db = ARRPTR(b);

	/* the intersection cannot be larger than the smaller input */
	r = new_intArrayType(min(na, nb));
	dr = ARRPTR(r);

	i = j = 0;
	while (i < na && j < nb)
	{
		if (da[i] < db[j])
			i++;
		else if (da[i] == db[j])
		{
			/*
			 * Only look at the previously stored value when one exists.
			 * Testing "i + j == 0" instead is not enough: the first match
			 * can occur after some elements were skipped, and *(dr - 1)
			 * would then read the word in front of the data area.
			 */
			if (dr == ARRPTR(r) || *(dr - 1) != db[j])
				*dr++ = db[j];
			i++;
			j++;
		}
		else
			j++;
	}

	if (dr == ARRPTR(r))
	{
		pfree(r);
		return NULL;
	}
	return resize_intArrayType(r, dr - ARRPTR(r));
}
/*
 * Store the "size" of a in *size: its number of elements, or 0.0 if a is
 * NULL, empty or not one-dimensional.
 */
void
rt__int_size(ArrayType *a, float *size)
{
	*size = ARRISNULL(a) ? 0.0 : (float) ARRSIZE(a);
}
/*****************************************************************************
* Miscellaneous operators and functions
*****************************************************************************/
/*
 * Three-way comparison of two ints for qsort().  Written with comparisons
 * rather than a subtraction so it cannot overflow.
 */
static int
isort_cmp(const void *x, const void *y)
{
	int			l = *(const int *) x;
	int			r = *(const int *) y;

	return (l > r) - (l < r);
}

/*
 * isort -- sort the len ints at a into ascending order, in place.
 *
 * Returns true if the array contains at least one repeated value, false
 * otherwise.  Arrays with fewer than two elements are left alone and
 * reported as duplicate-free.
 */
bool
isort(int *a, int len)
{
	int			i;

	if (len > 1)
		qsort(a, len, sizeof(int), isort_cmp);

	/* after sorting, equal values are adjacent */
	for (i = 1; i < len; i++)
		if (a[i - 1] == a[i])
			break;

	return (bool) (i < len);
}
ArrayType * new_intArrayType( int num ) {
ArrayType * r;
int nbytes = ARR_OVERHEAD( NDIM ) + sizeof(int)*num;
r = (ArrayType *) palloc( nbytes );
if ( ! r )
elog(ERROR, "Can't allocate memory for new array");
MemSet(r, 0, nbytes);
r->size = nbytes;
r->ndim = NDIM;
#ifndef PGSQL71
SET_LO_FLAG(false, r);
#endif
*( (int*)ARR_DIMS(r) ) = num;
*( (int*)ARR_LBOUND(r) ) = 1;
return r;
}
/*
 * Change the capacity of a to num elements and update its header.
 * Returns a itself when it already has num elements; otherwise returns the
 * (possibly moved) reallocated array, so callers must use the result.
 */
ArrayType *
resize_intArrayType(ArrayType *a, int num)
{
	int			nbytes;

	if (ARRSIZE(a) == num)
		return a;

	nbytes = ARR_OVERHEAD(NDIM) + sizeof(int) * num;
	a = (ArrayType *) repalloc(a, nbytes);
	if (a == NULL)
		elog(ERROR, "Can't reallocate memory for new array");

	*((int *) ARR_DIMS(a)) = num;
	a->size = nbytes;
	return a;
}
/*
 * Return a newly allocated byte-for-byte copy of a, or NULL if a is NULL.
 *
 * The copy is sized from VARSIZE(a), the same length that is copied, so
 * the destination is large enough whatever the header layout of a is.
 */
ArrayType *
copy_intArrayType(ArrayType *a)
{
	ArrayType  *r;

	if (!a)
		return NULL;

	r = (ArrayType *) palloc(VARSIZE(a));
	if (!r)
		elog(ERROR, "Can't allocate memory for new array");
	memcpy(r, a, VARSIZE(a));
	return r;
}
/*
 * Number of values described by a compressed key: a[] holds len ints read
 * as consecutive [lo, hi] pairs, and each pair contributes hi - lo + 1.
 * A pair whose lo equals the previous pair's hi is not counted at all.
 */
int
internal_size(int *a, int len)
{
	int			total = 0;
	int			lo;

	for (lo = 0; lo < len; lo += 2)
	{
		/* do not count repeated range */
		if (lo > 0 && a[lo] == a[lo - 1])
			continue;
		total += a[lo + 1] - a[lo] + 1;
	}
	return total;
}
/*
 * Remove repeated values from r in place and shrink it to fit.
 * r must be sorted and is expected to hold more than one element.
 * Returns the resized array, which may have been moved.
 */
ArrayType *
_int_unique(ArrayType *r)
{
	int		   *v = ARRPTR(r);
	int			n = ARRSIZE(r);
	int			last = 0;		/* index of the last value kept */
	int			k;

	for (k = 0; k < n; k++)
	{
		if (v[k] != v[last])
			v[++last] = v[k];
	}
	return resize_intArrayType(r, last + 1);
}

View File

@ -0,0 +1,211 @@
-- Create the functions, operators and GiST operator class that add
-- index support for one-dimensional int4 arrays (_int4)
--
-- Everything below runs in one transaction.
BEGIN TRANSACTION;
--
-- External C-functions for R-tree methods
--
-- Comparison methods
-- Each function is registered and then given a description in pg_description.
CREATE FUNCTION _int_contains(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contains'::text
FROM pg_proc
WHERE proname = '_int_contains'::name;
CREATE FUNCTION _int_contained(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contained in'::text
FROM pg_proc
WHERE proname = '_int_contained'::name;
CREATE FUNCTION _int_overlap(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'overlaps'::text
FROM pg_proc
WHERE proname = '_int_overlap'::name;
CREATE FUNCTION _int_same(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'same as'::text
FROM pg_proc
WHERE proname = '_int_same'::name;
CREATE FUNCTION _int_different(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'different'::text
FROM pg_proc
WHERE proname = '_int_different'::name;
-- support routines for indexing
CREATE FUNCTION _int_union(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION _int_inter(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
--
-- OPERATORS
--
-- && : the two arrays overlap; it is its own commutator
CREATE OPERATOR && (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_overlap,
COMMUTATOR = '&&',
RESTRICT = contsel, JOIN = contjoinsel
);
-- The "=" operator based on _int_same is not created (left commented out).
--CREATE OPERATOR = (
-- LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_same,
-- COMMUTATOR = '=', NEGATOR = '<>',
-- RESTRICT = eqsel, JOIN = eqjoinsel,
-- SORT1 = '<', SORT2 = '<'
--);
-- <> : the two arrays differ
CREATE OPERATOR <> (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_different,
COMMUTATOR = '<>', NEGATOR = '=',
RESTRICT = neqsel, JOIN = neqjoinsel
);
-- @ : left array contains right array; commutator of ~
CREATE OPERATOR @ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contains,
COMMUTATOR = '~', RESTRICT = contsel, JOIN = contjoinsel
);
-- ~ : left array is contained in right array; commutator of @
CREATE OPERATOR ~ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contained,
COMMUTATOR = '@', RESTRICT = contsel, JOIN = contjoinsel
);
-- define the GiST support methods
-- (registered further down as pg_amproc support procedures 1 to 7)
CREATE FUNCTION g_int_consistent(opaque,_int4,int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_compress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_decompress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_penalty(opaque,opaque,opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_picksplit(opaque, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_union(bytea, opaque) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_same(_int4, _int4, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
-- register the default opclass for indexing
INSERT INTO pg_opclass (opcname, opcdeftype)
SELECT 'gist__int_ops', oid
FROM pg_type
WHERE typname = '_int4';
-- get the comparators for _int4 arrays and store them in a tmp table
-- (every operator whose left and right operand types are both _int4)
SELECT o.oid AS opoid, o.oprname
INTO TABLE _int_ops_tmp
FROM pg_operator o, pg_type t
WHERE o.oprleft = t.oid and o.oprright = t.oid
and t.typname = '_int4';
-- make sure we have the right operators
-- SELECT * from _int_ops_tmp;
-- using the tmp table, generate the amop entries
-- (the last selected column is the strategy number; 3, 6, 7 and 8 are the
-- numbers used below)
-- _int_overlap
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 3
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '&&';
-- _int_same
-- NOTE(review): this script does not create "=" for _int4, so this row
-- relies on such an operator already existing -- confirm.
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 6
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '=';
-- _int_contains
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 7
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '@';
-- _int_contained
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 8
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '~';
-- the helper table is no longer needed
DROP TABLE _int_ops_tmp;
-- add the entries to amproc for the support methods
-- note the amprocnum numbers associated with each are specific!
-- 1 = consistent
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 1
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_consistent';
-- 2 = union
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 2
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_union';
-- 3 = compress
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 3
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_compress';
-- 4 = decompress
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 4
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_decompress';
-- 5 = penalty
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 5
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_penalty';
-- 6 = picksplit
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 6
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_picksplit';
-- 7 = same
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 7
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_same';
END TRANSACTION;

104
contrib/intarray/bench/bench.pl Executable file
View File

@ -0,0 +1,104 @@
#!/usr/bin/perl
# Benchmark "which messages belong to these sections" queries, either through
# the message_section_map join table or through the int[] column of message
# (-r).  Prints the total and per-run time and the number of rows found.
use strict;
# make sure we are in a sane environment.
use DBI();
use DBD::Pg();
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
use Getopt::Std;
my %opt;
getopts('d:b:s:veorauc', \%opt);
# -s is mandatory; without it just show the usage text
if ( !( scalar %opt && defined $opt{s} ) ) {
print <<EOT;
Usage:
$0 -d DATABASE -s SECTIONS [-b NUMBER] [-v] [-e] [-o] [-r] [-a] [-u] [-c]
-d DATABASE -DATABASE
-b NUMBER -number of repeats
-s SECTIONS -sections, format sid1[,sid2[,sid3[...]]]]
-v -verbose (show SQL)
-e -show explain
-r -use RD-tree index
-a -AND section
-o -show output
-u -unique
-c -count
EOT
exit;
}
$opt{d} ||= '_int4';
# stop right away with the driver's message if the database is unreachable
my $dbi=DBI->connect('DBI:Pg:dbname='.$opt{d})
    || die "Can't connect to database '$opt{d}': $DBI::errstr\n";
my %table;
my @where;
$table{message}=1;
# build the WHERE clause for the chosen mode
if ( $opt{a} ) {
    # AND: the message must be in every listed section
    if ( $opt{r} ) {
        push @where, "message.sections @ '{$opt{s}}'";
    } else {
        foreach my $sid ( split(/[,\s]+/, $opt{s} )) {
            push @where, "EXISTS ( select message_section_map.mid from message_section_map where message.mid=message_section_map.mid and message_section_map.sid = $sid )";
        }
    }
} else {
    # OR: the message must be in at least one listed section
    if ( $opt{r} ) {
        push @where, "message.sections && '{$opt{s}}'";
    } else {
        $table{message_section_map} = 1;
        push @where, "message.mid = message_section_map.mid";
        push @where, "message_section_map.sid in ($opt{s})";
    }
}
# select list: plain or distinct mids, or a count of them
my $outf;
if ( $opt{c} ) {
    $outf = ( $opt{u} ) ? 'count( distinct message.mid )' : 'count( message.mid )';
} else {
    $outf = ( $opt{u} ) ? 'distinct( message.mid )' : 'message.mid';
}
my $sql = "select $outf from ".join(', ', keys %table)." where ".join(' AND ', @where).';';
if ( $opt{v} ) {
    print "$sql\n";
}
if ( $opt{e} ) {
    $dbi->do("explain $sql");
}
# run the query -b times (default 1) and time the whole batch
my $t0 = [gettimeofday];
my $count=0;
my $b=$opt{b};
$b||=1;
my @a;
foreach ( 1..$b ) {
    @a=exec_sql($dbi,$sql);
    # index of the last row, i.e. row count minus one; 1 is added back below
    $count=$#a;
}
my $elapsed = tv_interval ( $t0, [gettimeofday]);
if ( $opt{o} ) {
    foreach ( @a ) {
        print "$_->{mid}\t$_->{sections}\n";
    }
}
print sprintf("total: %.02f sec; number: %d; for one: %.03f sec; found %d docs\n", $elapsed, $b, $elapsed/$b, $count+1 );
$dbi -> disconnect;
# Prepare and execute $query on $dbh with the optional bind values and
# return every result row as a hash reference.  Dies if prepare or execute
# fails.
sub exec_sql {
    my ($dbh, $query, @binds) = @_;
    my $sth = $dbh->prepare($query) || die;
    $sth->execute( @binds ) || die;
    my @rows;
    while ( defined( my $rec = $sth->fetchrow_hashref ) ) {
        push @rows, $rec;
    }
    $sth->finish;
    return @rows;
}

View File

@ -0,0 +1,73 @@
#!/usr/bin/perl
# Print an SQL script on stdout that creates and fills the benchmark tables
# "message" and "message_section_map", then indexes them.
use strict;
# table definitions
print <<EOT;
create table message (
mid int not null,
sections int[]
);
create table message_section_map (
mid int not null,
sid int not null
);
EOT
# the rows are first written to two temporary files in the current directory
open(MSG,">message.tmp") || die;
open(MAP,">message_section_map.tmp") || die;
# fixed seed so that every run makes the same sequence of rand() calls
srand( 1 );
#foreach my $i ( 1..1778 ) {
#foreach my $i ( 1..3443 ) {
#foreach my $i ( 1..5000 ) {
#foreach my $i ( 1..29362 ) {
#foreach my $i ( 1..33331 ) {
#foreach my $i ( 1..83268 ) {
foreach my $i ( 1..200000 ) {
my @sect;
# 70% of the messages get one section, the rest get 1 to 5 draws with
# repeats removed; rand()**4 skews section numbers (0..99) towards 0
if ( rand() < 0.7 ) {
$sect[0] = int( (rand()**4)*100 );
} else {
my %hash;
@sect = grep { $hash{$_}++; $hash{$_} <= 1 } map { int( (rand()**4)*100) } 0..( int(rand()*5) );
}
# about one message in ten gets a NULL sections column and no map rows
if ( $#sect < 0 || rand() < 0.1 ) {
print MSG "$i\t\\N\n";
} else {
print MSG "$i\t{".join(',',@sect)."}\n";
map { print MAP "$i\t$_\n" } @sect;
}
}
close MAP;
close MSG;
# emit both files as COPY ... from stdin blocks
copytable('message');
copytable('message_section_map');
# indexes, statistics and a final row count for both tables
print <<EOT;
CREATE unique index message_key on message ( mid );
--CREATE unique index message_section_map_key1 on message_section_map ( mid, sid );
CREATE unique index message_section_map_key2 on message_section_map ( sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
VACUUM ANALYZE;
select count(*) from message;
select count(*) from message_section_map;
EOT
# remove the temporary files
unlink 'message.tmp', 'message_section_map.tmp';
# Print a "COPY <table> from stdin;" block on stdout whose data is the
# contents of the file <table>.tmp, followed by the end-of-data marker.
sub copytable {
    my ($table) = @_;
    print "COPY $table from stdin;\n";
    open( FFF, "$table.tmp") || die;
    print while <FFF>;
    close FFF;
    print "\\.\n";
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
count
-------
345
(1 row)
SELECT count(*) from test__int WHERE a @ '{23,50}';
count
-------
12
(1 row)

View File

@ -0,0 +1,15 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
\i _int.sql
\set ECHO all
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
SELECT count(*) from test__int WHERE a @ '{23,50}';