/* * contrib/intarray/_int_gist.c */ #include "postgres.h" #include #include #include "_int.h" #include "access/gist.h" #include "access/reloptions.h" #include "access/stratnum.h" #define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key)) /* * Control the maximum sparseness of compressed keys. * * The upper safe bound for this limit is half the maximum allocatable array * size. A lower bound would give more guarantees that pathological data * wouldn't eat excessive CPU and memory, but at the expense of breaking * possibly working (after a fashion) indexes. */ #define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2) /* or: #define MAXNUMELTS 1000000 */ /* ** GiST support methods */ PG_FUNCTION_INFO_V1(g_int_consistent); PG_FUNCTION_INFO_V1(g_int_compress); PG_FUNCTION_INFO_V1(g_int_decompress); PG_FUNCTION_INFO_V1(g_int_penalty); PG_FUNCTION_INFO_V1(g_int_picksplit); PG_FUNCTION_INFO_V1(g_int_union); PG_FUNCTION_INFO_V1(g_int_same); PG_FUNCTION_INFO_V1(g_int_options); /* ** The GiST Consistent method for _intments ** Should return false if for all data items x below entry, ** the predicate x op query == false, where op is the oper ** corresponding to strategy in the pg_amop table. */ Datum g_int_consistent(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); ArrayType *query = PG_GETARG_ARRAYTYPE_P_COPY(1); StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); /* Oid subtype = PG_GETARG_OID(3); */ bool *recheck = (bool *) PG_GETARG_POINTER(4); bool retval = false; /* silence compiler warning */ /* this is exact except for RTSameStrategyNumber */ *recheck = (strategy == RTSameStrategyNumber); if (strategy == BooleanSearchStrategy) { retval = execconsistent((QUERYTYPE *) query, (ArrayType *) DatumGetPointer(entry->key), GIST_LEAF(entry)); pfree(query); PG_RETURN_BOOL(retval); } /* sort query for fast search, key is already sorted */ CHECKARRVALID(query); PREPAREARR(query); switch (strategy) { case RTOverlapStrategyNumber: retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key), query); break; case RTSameStrategyNumber: if (GIST_LEAF(entry)) DirectFunctionCall3(g_int_same, entry->key, PointerGetDatum(query), PointerGetDatum(&retval)); else retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key), query); break; case RTContainsStrategyNumber: case RTOldContainsStrategyNumber: retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key), query); break; case RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: /* * This code is unreachable as of intarray 1.4, because the <@ * operator has been removed from the opclass. We keep it for now * to support older versions of the SQL definitions. */ if (GIST_LEAF(entry)) retval = inner_int_contains(query, (ArrayType *) DatumGetPointer(entry->key)); else { /* * Unfortunately, because empty arrays could be anywhere in * the index, we must search the whole tree. */ retval = true; } break; default: retval = false; } pfree(query); PG_RETURN_BOOL(retval); } Datum g_int_union(PG_FUNCTION_ARGS) { GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); int *size = (int *) PG_GETARG_POINTER(1); int32 i, *ptr; ArrayType *res; int totlen = 0; for (i = 0; i < entryvec->n; i++) { ArrayType *ent = GETENTRY(entryvec, i); CHECKARRVALID(ent); totlen += ARRNELEMS(ent); } res = new_intArrayType(totlen); ptr = ARRPTR(res); for (i = 0; i < entryvec->n; i++) { ArrayType *ent = GETENTRY(entryvec, i); int nel; nel = ARRNELEMS(ent); memcpy(ptr, ARRPTR(ent), nel * sizeof(int32)); ptr += nel; } QSORT(res, 1); res = _int_unique(res); *size = VARSIZE(res); PG_RETURN_POINTER(res); } /* ** GiST Compress and Decompress methods */ Datum g_int_compress(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); GISTENTRY *retval; ArrayType *r; int num_ranges = G_INT_GET_NUMRANGES(); int len, lenr; int *dr; int i, j, cand; int64 min; if (entry->leafkey) { r = DatumGetArrayTypePCopy(entry->key); CHECKARRVALID(r); PREPAREARR(r); if (ARRNELEMS(r) >= 2 * num_ranges) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead", 2 * num_ranges - 1, ARRNELEMS(r)))); retval = palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); } /* * leaf entries never compress one more time, only when entry->leafkey * ==true, so now we work only with internal keys */ r = DatumGetArrayTypeP(entry->key); CHECKARRVALID(r); if (ARRISEMPTY(r)) { if (r != (ArrayType *) DatumGetPointer(entry->key)) pfree(r); PG_RETURN_POINTER(entry); } if ((len = ARRNELEMS(r)) >= 2 * num_ranges) { /* compress */ if (r == (ArrayType *) DatumGetPointer(entry->key)) r = DatumGetArrayTypePCopy(entry->key); r = resize_intArrayType(r, 2 * (len)); dr = ARRPTR(r); /* * "len" at this point is the number of ranges we will construct. * "lenr" is the number of ranges we must eventually remove by * merging, we must be careful to remove no more than this number. */ lenr = len - num_ranges; /* * Initially assume we can merge consecutive ints into a range. but we * must count every value removed and stop when lenr runs out */ for (j = i = len - 1; i > 0 && lenr > 0; i--, j--) { int r_end = dr[i]; int r_start = r_end; while (i > 0 && lenr > 0 && dr[i - 1] == r_start - 1) --r_start, --i, --lenr; dr[2 * j] = r_start; dr[2 * j + 1] = r_end; } /* just copy the rest, if any, as trivial ranges */ for (; i >= 0; i--, j--) dr[2 * j] = dr[2 * j + 1] = dr[i]; if (++j) { /* * shunt everything down to start at the right place */ memmove(&dr[0], &dr[2 * j], 2 * (len - j) * sizeof(int32)); } /* * make "len" be number of array elements, not ranges */ len = 2 * (len - j); cand = 1; while (len > num_ranges * 2) { min = PG_INT64_MAX; for (i = 2; i < len; i += 2) if (min > ((int64) dr[i] - (int64) dr[i - 1])) { min = ((int64) dr[i] - (int64) dr[i - 1]); cand = i; } memmove(&dr[cand - 1], &dr[cand + 1], (len - cand - 1) * sizeof(int32)); len -= 2; } /* * check sparseness of result */ lenr = internal_size(dr, len); if (lenr < 0 || lenr > MAXNUMELTS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead"))); r = resize_intArrayType(r, len); retval = palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); } else PG_RETURN_POINTER(entry); } Datum g_int_decompress(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); GISTENTRY *retval; ArrayType *r; int num_ranges = G_INT_GET_NUMRANGES(); int *dr, lenr; ArrayType *in; int lenin; int *din; int i; in = DatumGetArrayTypeP(entry->key); CHECKARRVALID(in); if (ARRISEMPTY(in)) { if (in != (ArrayType *) DatumGetPointer(entry->key)) { retval = palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(in), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); } PG_RETURN_POINTER(entry); } lenin = ARRNELEMS(in); if (lenin < 2 * num_ranges) { /* not compressed value */ if (in != (ArrayType *) DatumGetPointer(entry->key)) { retval = palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(in), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); } PG_RETURN_POINTER(entry); } din = ARRPTR(in); lenr = internal_size(din, lenin); if (lenr < 0 || lenr > MAXNUMELTS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("compressed array is too big, recreate index using gist__intbig_ops opclass instead"))); r = new_intArrayType(lenr); dr = ARRPTR(r); for (i = 0; i < lenin; i += 2) { /* use int64 for j in case din[i + 1] is INT_MAX */ for (int64 j = din[i]; j <= din[i + 1]; j++) if ((!i) || *(dr - 1) != j) *dr++ = (int) j; } if (in != (ArrayType *) DatumGetPointer(entry->key)) pfree(in); retval = palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(r), entry->rel, entry->page, entry->offset, false); PG_RETURN_POINTER(retval); } /* ** The GiST Penalty method for _intments */ Datum g_int_penalty(PG_FUNCTION_ARGS) { GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); float *result = (float *) PG_GETARG_POINTER(2); ArrayType *ud; float tmp1, tmp2; ud = inner_int_union((ArrayType *) DatumGetPointer(origentry->key), (ArrayType *) DatumGetPointer(newentry->key)); rt__int_size(ud, &tmp1); rt__int_size((ArrayType *) DatumGetPointer(origentry->key), &tmp2); *result = tmp1 - tmp2; pfree(ud); PG_RETURN_POINTER(result); } Datum g_int_same(PG_FUNCTION_ARGS) { ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); bool *result = (bool *) PG_GETARG_POINTER(2); int32 n = ARRNELEMS(a); int32 *da, *db; CHECKARRVALID(a); CHECKARRVALID(b); if (n != ARRNELEMS(b)) { *result = false; PG_RETURN_POINTER(result); } *result = true; da = ARRPTR(a); db = ARRPTR(b); while (n--) { if (*da++ != *db++) { *result = false; break; } } PG_RETURN_POINTER(result); } /***************************************************************** ** Common GiST Method *****************************************************************/ typedef struct { OffsetNumber pos; float cost; } SPLITCOST; static int comparecost(const void *a, const void *b) { if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost) return 0; else return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1; } /* ** The GiST PickSplit method for _intments ** We use Guttman's poly time split algorithm */ Datum g_int_picksplit(PG_FUNCTION_ARGS) { GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); OffsetNumber i, j; ArrayType *datum_alpha, *datum_beta; ArrayType *datum_l, *datum_r; ArrayType *union_d, *union_dl, *union_dr; ArrayType *inter_d; bool firsttime; float size_alpha, size_beta, size_union, size_inter; float size_waste, waste; float size_l, size_r; int nbytes; OffsetNumber seed_1 = 0, seed_2 = 0; OffsetNumber *left, *right; OffsetNumber maxoff; SPLITCOST *costvector; #ifdef GIST_DEBUG elog(DEBUG3, "--------picksplit %d", entryvec->n); #endif maxoff = entryvec->n - 2; nbytes = (maxoff + 2) * sizeof(OffsetNumber); v->spl_left = (OffsetNumber *) palloc(nbytes); v->spl_right = (OffsetNumber *) palloc(nbytes); firsttime = true; waste = 0.0; for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) { datum_alpha = GETENTRY(entryvec, i); for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) { datum_beta = GETENTRY(entryvec, j); /* compute the wasted space by unioning these guys */ /* size_waste = size_union - size_inter; */ union_d = inner_int_union(datum_alpha, datum_beta); rt__int_size(union_d, &size_union); inter_d = inner_int_inter(datum_alpha, datum_beta); rt__int_size(inter_d, &size_inter); size_waste = size_union - size_inter; pfree(union_d); pfree(inter_d); /* * are these a more promising split that what we've already seen? */ if (size_waste > waste || firsttime) { waste = size_waste; seed_1 = i; seed_2 = j; firsttime = false; } } } left = v->spl_left; v->spl_nleft = 0; right = v->spl_right; v->spl_nright = 0; if (seed_1 == 0 || seed_2 == 0) { seed_1 = 1; seed_2 = 2; } datum_alpha = GETENTRY(entryvec, seed_1); datum_l = copy_intArrayType(datum_alpha); rt__int_size(datum_l, &size_l); datum_beta = GETENTRY(entryvec, seed_2); datum_r = copy_intArrayType(datum_beta); rt__int_size(datum_r, &size_r); maxoff = OffsetNumberNext(maxoff); /* * sort entries */ costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { costvector[i - 1].pos = i; datum_alpha = GETENTRY(entryvec, i); union_d = inner_int_union(datum_l, datum_alpha); rt__int_size(union_d, &size_alpha); pfree(union_d); union_d = inner_int_union(datum_r, datum_alpha); rt__int_size(union_d, &size_beta); pfree(union_d); costvector[i - 1].cost = fabsf((size_alpha - size_l) - (size_beta - size_r)); } qsort(costvector, maxoff, sizeof(SPLITCOST), comparecost); /* * Now split up the regions between the two seeds. An important property * of this split algorithm is that the split vector v has the indices of * items to be split in order in its left and right vectors. We exploit * this property by doing a merge in the code that actually splits the * page. * * For efficiency, we also place the new index tuple in this loop. This is * handled at the very end, when we have placed all the existing tuples * and i == maxoff + 1. */ for (j = 0; j < maxoff; j++) { i = costvector[j].pos; /* * If we've already decided where to place this item, just put it on * the right list. Otherwise, we need to figure out which page needs * the least enlargement in order to store the item. */ if (i == seed_1) { *left++ = i; v->spl_nleft++; continue; } else if (i == seed_2) { *right++ = i; v->spl_nright++; continue; } /* okay, which page needs least enlargement? */ datum_alpha = GETENTRY(entryvec, i); union_dl = inner_int_union(datum_l, datum_alpha); union_dr = inner_int_union(datum_r, datum_alpha); rt__int_size(union_dl, &size_alpha); rt__int_size(union_dr, &size_beta); /* pick which page to add it to */ if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01)) { pfree(datum_l); pfree(union_dr); datum_l = union_dl; size_l = size_alpha; *left++ = i; v->spl_nleft++; } else { pfree(datum_r); pfree(union_dl); datum_r = union_dr; size_r = size_beta; *right++ = i; v->spl_nright++; } } pfree(costvector); *right = *left = FirstOffsetNumber; v->spl_ldatum = PointerGetDatum(datum_l); v->spl_rdatum = PointerGetDatum(datum_r); PG_RETURN_POINTER(v); } Datum g_int_options(PG_FUNCTION_ARGS) { local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); init_local_reloptions(relopts, sizeof(GISTIntArrayOptions)); add_local_int_reloption(relopts, "numranges", "number of ranges for compression", G_INT_NUMRANGES_DEFAULT, 1, G_INT_NUMRANGES_MAX, offsetof(GISTIntArrayOptions, num_ranges)); PG_RETURN_VOID(); }