From 9ee014fc899a28a198492b074e32b60ed8915ea9 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Fri, 1 Apr 2016 16:42:24 +0300 Subject: [PATCH] Bloom index contrib module Module provides new access method. It is actually a simple Bloom filter implemented as pgsql's index. It could give some benefits on search with large number of columns. Module is a single way to test generic WAL interface committed earlier. Author: Teodor Sigaev, Alexander Korotkov Reviewers: Aleksander Alekseev, Michael Paquier, Jim Nasby --- contrib/Makefile | 1 + contrib/bloom/.gitignore | 4 + contrib/bloom/Makefile | 24 ++ contrib/bloom/blcost.c | 48 ++++ contrib/bloom/blinsert.c | 313 +++++++++++++++++++++ contrib/bloom/bloom--1.0.sql | 19 ++ contrib/bloom/bloom.control | 5 + contrib/bloom/bloom.h | 178 ++++++++++++ contrib/bloom/blscan.c | 175 ++++++++++++ contrib/bloom/blutils.c | 463 +++++++++++++++++++++++++++++++ contrib/bloom/blvacuum.c | 212 ++++++++++++++ contrib/bloom/blvalidate.c | 220 +++++++++++++++ contrib/bloom/expected/bloom.out | 122 ++++++++ contrib/bloom/sql/bloom.sql | 47 ++++ contrib/bloom/t/001_wal.pl | 75 +++++ doc/src/sgml/bloom.sgml | 218 +++++++++++++++ doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + 18 files changed, 2126 insertions(+) create mode 100644 contrib/bloom/.gitignore create mode 100644 contrib/bloom/Makefile create mode 100644 contrib/bloom/blcost.c create mode 100644 contrib/bloom/blinsert.c create mode 100644 contrib/bloom/bloom--1.0.sql create mode 100644 contrib/bloom/bloom.control create mode 100644 contrib/bloom/bloom.h create mode 100644 contrib/bloom/blscan.c create mode 100644 contrib/bloom/blutils.c create mode 100644 contrib/bloom/blvacuum.c create mode 100644 contrib/bloom/blvalidate.c create mode 100644 contrib/bloom/expected/bloom.out create mode 100644 contrib/bloom/sql/bloom.sql create mode 100644 contrib/bloom/t/001_wal.pl create mode 100644 doc/src/sgml/bloom.sgml diff --git a/contrib/Makefile b/contrib/Makefile index 
d12dd6379b..25263c0be9 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -8,6 +8,7 @@ SUBDIRS = \ adminpack \ auth_delay \ auto_explain \ + bloom \ btree_gin \ btree_gist \ chkpass \ diff --git a/contrib/bloom/.gitignore b/contrib/bloom/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/contrib/bloom/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/contrib/bloom/Makefile b/contrib/bloom/Makefile new file mode 100644 index 0000000000..13bd397b70 --- /dev/null +++ b/contrib/bloom/Makefile @@ -0,0 +1,24 @@ +# contrib/bloom/Makefile + +MODULE_big = bloom +OBJS = blcost.o blinsert.o blscan.o blutils.o blvacuum.o blvalidate.o $(WIN32RES) + +EXTENSION = bloom +DATA = bloom--1.0.sql +PGFILEDESC = "bloom access method - signature file based index" + +REGRESS = bloom + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/bloom +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +wal-check: temp-install + $(prove_check) diff --git a/contrib/bloom/blcost.c b/contrib/bloom/blcost.c new file mode 100644 index 0000000000..989789850e --- /dev/null +++ b/contrib/bloom/blcost.c @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * blcost.c + * Cost estimate function for bloom indexes. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/bloom/blcost.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "fmgr.h" +#include "optimizer/cost.h" +#include "utils/selfuncs.h" + +#include "bloom.h" + +/* + * Estimate cost of bloom index scan. 
+ */ +void +blcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation) +{ + IndexOptInfo *index = path->indexinfo; + List *qinfos; + GenericCosts costs; + + /* Do preliminary analysis of indexquals */ + qinfos = deconstruct_indexquals(path); + + MemSet(&costs, 0, sizeof(costs)); + + /* We have to visit all index tuples anyway */ + costs.numIndexTuples = index->tuples; + + /* Use generic estimate */ + genericcostestimate(root, path, loop_count, qinfos, &costs); + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; +} diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c new file mode 100644 index 0000000000..9e6678087c --- /dev/null +++ b/contrib/bloom/blinsert.c @@ -0,0 +1,313 @@ +/*------------------------------------------------------------------------- + * + * blinsert.c + * Bloom index build and insert functions. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/bloom/blinsert.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/generic_xlog.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include "bloom.h" + +PG_MODULE_MAGIC; + +/* + * State of bloom index build. We accumulate one page data here before + * flushing it to buffer manager. 
+ */ +typedef struct +{ + BloomState blstate; /* bloom index state */ + MemoryContext tmpCtx; /* temporary memory context reset after + * each tuple */ + char data[BLCKSZ]; /* cached page */ + int64 count; /* number of tuples in cached page */ +} BloomBuildState; + +/* + * Flush page cached in BloomBuildState. + */ +static void +flushCachedPage(Relation index, BloomBuildState *buildstate) +{ + Page page; + Buffer buffer = BloomNewBuffer(index); + GenericXLogState *state; + + state = GenericXLogStart(index); + page = GenericXLogRegister(state, buffer, true); + memcpy(page, buildstate->data, BLCKSZ); + GenericXLogFinish(state); + UnlockReleaseBuffer(buffer); +} + +/* + * (Re)initialize cached page in BloomBuildState. + */ +static void +initCachedPage(BloomBuildState *buildstate) +{ + memset(buildstate->data, 0, BLCKSZ); + BloomInitPage(buildstate->data, 0); + buildstate->count = 0; +} + +/* + * Per-tuple callback from IndexBuildHeapScan. + */ +static void +bloomBuildCallback(Relation index, HeapTuple htup, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + BloomBuildState *buildstate = (BloomBuildState *) state; + MemoryContext oldCtx; + BloomTuple *itup; + + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + itup = BloomFormTuple(&buildstate->blstate, &htup->t_self, values, isnull); + + /* Try to add next item to cached page */ + if (BloomPageAddItem(&buildstate->blstate, buildstate->data, itup)) + { + /* Next item was added successfully */ + buildstate->count++; + } + else + { + /* Cached page is full, flush it out and make a new one */ + flushCachedPage(index, buildstate); + + CHECK_FOR_INTERRUPTS(); + + initCachedPage(buildstate); + + if (BloomPageAddItem(&buildstate->blstate, buildstate->data, itup) == false) + { + /* We shouldn't be here since we're inserting to the empty page */ + elog(ERROR, "can not add new tuple"); + } + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); +} + +/* + * Build a new bloom index. 
+ */ +IndexBuildResult * +blbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + BloomBuildState buildstate; + + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* Initialize the meta page */ + BloomInitMetapage(index); + + /* Initialize the bloom build state */ + memset(&buildstate, 0, sizeof(buildstate)); + initBloomState(&buildstate.blstate, index); + buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "Bloom build temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + initCachedPage(&buildstate); + + /* Do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, + bloomBuildCallback, (void *) &buildstate); + + /* + * There are could be some items in cached page. Flush this page + * if needed. + */ + if (buildstate.count > 0) + flushCachedPage(index, &buildstate); + + MemoryContextDelete(buildstate.tmpCtx); + + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result->heap_tuples = result->index_tuples = reltuples; + + return result; +} + +/* + * Build an empty bloom index in the initialization fork. + */ +void +blbuildempty(Relation index) +{ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* Initialize the meta page */ + BloomInitMetapage(index); +} + +/* + * Insert new tuple to the bloom index. 
+ */ +bool +blinsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique) +{ + BloomState blstate; + BloomTuple *itup; + MemoryContext oldCtx; + MemoryContext insertCtx; + BloomMetaPageData *metaData; + Buffer buffer, + metaBuffer; + Page page, + metaPage; + BlockNumber blkno = InvalidBlockNumber; + OffsetNumber nStart; + GenericXLogState *state; + + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "Bloom insert temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + oldCtx = MemoryContextSwitchTo(insertCtx); + + initBloomState(&blstate, index); + itup = BloomFormTuple(&blstate, ht_ctid, values, isnull); + + /* + * At first, try to insert new tuple to the first page in notFullPage + * array. If success we don't need to modify the meta page. + */ + metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); + LockBuffer(metaBuffer, BUFFER_LOCK_SHARE); + metaData = BloomPageGetMeta(BufferGetPage(metaBuffer)); + + if (metaData->nEnd > metaData->nStart) + { + Page page; + + blkno = metaData->notFullPage[metaData->nStart]; + + Assert(blkno != InvalidBlockNumber); + LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); + + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + state = GenericXLogStart(index); + page = GenericXLogRegister(state, buffer, false); + + if (BloomPageAddItem(&blstate, page, itup)) + { + GenericXLogFinish(state); + UnlockReleaseBuffer(buffer); + ReleaseBuffer(metaBuffer); + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + return false; + } + else + { + GenericXLogAbort(state); + UnlockReleaseBuffer(buffer); + } + } + else + { + /* First page in notFullPage isn't suitable */ + LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); + } + + /* + * Try other pages in notFullPage array. We will have to change nStart in + * metapage. Thus, grab exclusive lock on metapage. 
+ */ + LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE); + + state = GenericXLogStart(index); + metaPage = GenericXLogRegister(state, metaBuffer, false); + metaData = BloomPageGetMeta(metaPage); + + /* + * Iterate over notFullPage array. Skip page we already tried first. + */ + nStart = metaData->nStart; + if (metaData->nEnd > nStart && + blkno == metaData->notFullPage[nStart]) + nStart++; + + while (metaData->nEnd > nStart) + { + blkno = metaData->notFullPage[nStart]; + Assert(blkno != InvalidBlockNumber); + + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = GenericXLogRegister(state, buffer, false); + + if (BloomPageAddItem(&blstate, page, itup)) + { + metaData->nStart = nStart; + GenericXLogFinish(state); + UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(metaBuffer); + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + return false; + } + else + { + GenericXLogUnregister(state, buffer); + UnlockReleaseBuffer(buffer); + } + nStart++; + } + + GenericXLogAbort(state); + + /* + * Didn't find place to insert in notFullPage array. Allocate new page. 
+ */ + buffer = BloomNewBuffer(index); + + state = GenericXLogStart(index); + metaPage = GenericXLogRegister(state, metaBuffer, false); + metaData = BloomPageGetMeta(metaPage); + page = GenericXLogRegister(state, buffer, true); + BloomInitPage(page, 0); + BloomPageAddItem(&blstate, page, itup); + + metaData->nStart = 0; + metaData->nEnd = 1; + metaData->notFullPage[0] = BufferGetBlockNumber(buffer); + + GenericXLogFinish(state); + + UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(metaBuffer); + + return false; +} diff --git a/contrib/bloom/bloom--1.0.sql b/contrib/bloom/bloom--1.0.sql new file mode 100644 index 0000000000..7fa751361a --- /dev/null +++ b/contrib/bloom/bloom--1.0.sql @@ -0,0 +1,19 @@ +CREATE OR REPLACE FUNCTION blhandler(internal) +RETURNS index_am_handler +AS 'MODULE_PATHNAME' +LANGUAGE C; + +-- Access method +CREATE ACCESS METHOD bloom TYPE INDEX HANDLER blhandler; + +-- Opclasses + +CREATE OPERATOR CLASS int4_ops +DEFAULT FOR TYPE int4 USING bloom AS + OPERATOR 1 =(int4, int4), + FUNCTION 1 hashint4(int4); + +CREATE OPERATOR CLASS text_ops +DEFAULT FOR TYPE text USING bloom AS + OPERATOR 1 =(text, text), + FUNCTION 1 hashtext(text); diff --git a/contrib/bloom/bloom.control b/contrib/bloom/bloom.control new file mode 100644 index 0000000000..4d4124b3b0 --- /dev/null +++ b/contrib/bloom/bloom.control @@ -0,0 +1,5 @@ +# bloom extension +comment = 'bloom access method - signature file based index' +default_version = '1.0' +module_pathname = '$libdir/bloom' +relocatable = true diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h new file mode 100644 index 0000000000..50bf99bf03 --- /dev/null +++ b/contrib/bloom/bloom.h @@ -0,0 +1,178 @@ +/*------------------------------------------------------------------------- + * + * bloom.h + * Header for bloom index. 
/*
 * bloom.h -- shared declarations for the bloom index access method:
 * page and metapage layout, reloptions, in-memory state, tuple format and
 * the prototypes of the functions implemented in the bl*.c files.
 */
#ifndef _BLOOM_H_
#define _BLOOM_H_

#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/itup.h"
#include "access/xlog.h"
#include "nodes/relation.h"
#include "fmgr.h"

/* Support procedure numbers (one hash function per opclass) */
#define BLOOM_HASH_PROC			1
#define BLOOM_NPROC				1

/* Scan strategies (equality only) */
#define BLOOM_EQUAL_STRATEGY	1
#define BLOOM_NSTRATEGIES		1

/* Opaque data stored in the special space of every bloom page */
typedef struct BloomPageOpaqueData
{
	OffsetNumber maxoff;		/* number of tuples stored on the page;
								 * incremented by BloomPageAddItem() */
	uint16		flags;			/* BLOOM_* page flags, see below */
} BloomPageOpaqueData;

typedef BloomPageOpaqueData *BloomPageOpaque;

/* Bloom page flags (BLOOM_DELETED evaluates to 2, i.e. the second bit) */
#define BLOOM_META		(1<<0)
#define BLOOM_DELETED	(2<<0)

/*
 * Macros for accessing bloom page structures.
 *
 * Tuples are fixed-size ((state)->sizeOfBloomTuple bytes) and stored back
 * to back starting at PageGetContents(page); tuple offsets are 1-based.
 */
#define BloomPageGetOpaque(page) ((BloomPageOpaque) PageGetSpecialPointer(page))
#define BloomPageGetMaxOffset(page) (BloomPageGetOpaque(page)->maxoff)
#define BloomPageIsMeta(page) (BloomPageGetOpaque(page)->flags & BLOOM_META)
#define BloomPageIsDeleted(page) (BloomPageGetOpaque(page)->flags & BLOOM_DELETED)
#define BloomPageSetDeleted(page) (BloomPageGetOpaque(page)->flags |= BLOOM_DELETED)
#define BloomPageSetNonDeleted(page) (BloomPageGetOpaque(page)->flags &= ~BLOOM_DELETED)
#define BloomPageGetData(page)		((BloomTuple *)PageGetContents(page))
#define BloomPageGetTuple(state, page, offset) \
	((BloomTuple *)(PageGetContents(page) \
		+ (state)->sizeOfBloomTuple * ((offset) - 1)))
#define BloomPageGetNextTuple(state, tuple) \
	((BloomTuple *)((Pointer)(tuple) + (state)->sizeOfBloomTuple))

/* Preserved page numbers */
#define BLOOM_METAPAGE_BLKNO	(0)
#define BLOOM_HEAD_BLKNO		(1)		/* first data page */

/* Bloom index options */
typedef struct BloomOptions
{
	int32		vl_len_;		/* varlena header (do not touch directly!) */
	int			bloomLength;	/* length of signature in uint16 */
	int			bitSize[INDEX_MAX_KEYS];		/* signature bits per index
												 * key */
} BloomOptions;

/*
 * FreeBlockNumberArray - array of block numbers sized so that the metadata
 * fills all the space in the metapage: what is left of a block after the
 * page header, the opaque area and the fixed BloomMetaPageData fields,
 * divided by the size of a block number.
 */
typedef BlockNumber FreeBlockNumberArray[
										 MAXALIGN_DOWN(
		BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData))
	   - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions))
													   ) / sizeof(BlockNumber)
];

/* Metadata of bloom index, stored in the metapage contents */
typedef struct BloomMetaPageData
{
	uint32		magickNumber;	/* must equal BLOOM_MAGICK_NUMBER */
	uint16		nStart;			/* first live entry of notFullPage */
	uint16		nEnd;			/* one past the last live entry of
								 * notFullPage */
	BloomOptions opts;			/* options the index was created with */
	FreeBlockNumberArray notFullPage;	/* pages blinsert() tries before
										 * allocating a new one */
} BloomMetaPageData;

/* Magic number to distinguish bloom metapages from other pages */
#define BLOOM_MAGICK_NUMBER (0xDBAC0DED)

/* Number of block numbers that fit in BloomMetaPageData */
#define BloomMetaBlockN		(sizeof(FreeBlockNumberArray) / sizeof(BlockNumber))

#define BloomPageGetMeta(page)	((BloomMetaPageData *) PageGetContents(page))

/* Per-index working state, filled in by initBloomState() */
typedef struct BloomState
{
	FmgrInfo	hashFn[INDEX_MAX_KEYS];	/* hash support function per column */
	BloomOptions *opts;			/* stored in rd_amcache and defined at
								 * creation time */
	int32		nColumns;		/* number of index columns */

	/*
	 * sizeOfBloomTuple is index-specific, and it depends on reloptions, so
	 * precompute it
	 */
	int32		sizeOfBloomTuple;
} BloomState;

/* Bytes still available for tuples on a data page */
#define BloomPageGetFreeSpace(state, page) \
	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
		- BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple \
		- MAXALIGN(sizeof(BloomPageOpaqueData)))

/*
 * Tuples are very different from all other relations: a heap pointer
 * followed by the signature words.
 */
typedef uint16 SignType;

typedef struct BloomTuple
{
	ItemPointerData heapPtr;	/* heap tuple this entry points to */
	SignType	sign[1];		/* signature; the real length is
								 * opts->bloomLength words, see
								 * sizeOfBloomTuple */
} BloomTuple;

/* Size of the fixed part of a bloom tuple (everything before the signature) */
#define BLOOMTUPLEHDRSZ offsetof(BloomTuple, sign)

/* Opaque data structure for bloom index scan */
typedef struct BloomScanOpaqueData
{
	SignType   *sign;			/* Scan signature; NULL until computed */
	BloomState	state;
} BloomScanOpaqueData;

typedef BloomScanOpaqueData *BloomScanOpaque;

/* blutils.c */
extern void _PG_init(void);
extern Datum blhandler(PG_FUNCTION_ARGS);
extern void initBloomState(BloomState * state, Relation index);
extern void BloomInitMetapage(Relation index);
extern void BloomInitPage(Page page, uint16 flags);
extern Buffer BloomNewBuffer(Relation index);
extern void signValue(BloomState * state, SignType * sign, Datum value, int attno);
extern BloomTuple *BloomFormTuple(BloomState * state, ItemPointer iptr, Datum *values, bool *isnull);
extern bool BloomPageAddItem(BloomState * state, Page page, BloomTuple * tuple);

/* blvalidate.c */
extern bool blvalidate(Oid opclassoid);

/* index access method interface functions */
extern bool blinsert(Relation index, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique);
extern IndexScanDesc blbeginscan(Relation r, int nkeys, int norderbys);
extern int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys);
extern void blendscan(IndexScanDesc scan);
extern IndexBuildResult *blbuild(Relation heap, Relation index,
		struct IndexInfo *indexInfo);
extern void blbuildempty(Relation index);
extern IndexBulkDeleteResult *blbulkdelete(IndexVacuumInfo *info,
			 IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback,
			 void *callback_state);
extern IndexBulkDeleteResult *blvacuumcleanup(IndexVacuumInfo *info,
				IndexBulkDeleteResult *stats);
extern bytea *bloptions(Datum reloptions, bool validate);
extern void blcostestimate(PlannerInfo *root, IndexPath *path,
			   double loop_count, Cost *indexStartupCost,
			   Cost *indexTotalCost, Selectivity *indexSelectivity,
			   double *indexCorrelation);

#endif
+/*------------------------------------------------------------------------- + * + * blscan.c + * Bloom index scan functions. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/bloom/blscan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "pgstat.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include "bloom.h" + +/* + * Begin scan of bloom index. + */ +IndexScanDesc +blbeginscan(Relation r, int nkeys, int norderbys) +{ + IndexScanDesc scan; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + return scan; +} + +/* + * Rescan a bloom index. + */ +void +blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + BloomScanOpaque so; + + so = (BloomScanOpaque) scan->opaque; + + if (so == NULL) + { + /* if called from blbeginscan */ + so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData)); + initBloomState(&so->state, scan->indexRelation); + scan->opaque = so; + + } + else + { + if (so->sign) + pfree(so->sign); + } + so->sign = NULL; + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + +/* + * End scan of bloom index. + */ +void +blendscan(IndexScanDesc scan) +{ + BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + + if (so->sign) + pfree(so->sign); + so->sign = NULL; +} + +/* + * Insert all matching tuples into to a bitmap. 
+ */ +int64 +blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + int64 ntids = 0; + BlockNumber blkno = BLOOM_HEAD_BLKNO, + npages; + int i; + BufferAccessStrategy bas; + BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + + if (so->sign == NULL && scan->numberOfKeys > 0) + { + /* New search: have to calculate search signature */ + ScanKey skey = scan->keyData; + + so->sign = palloc0(sizeof(SignType) * so->state.opts->bloomLength); + + for (i = 0; i < scan->numberOfKeys; i++) + { + /* + * Assume bloom-indexable operators to be strict, so nothing could + * be found for NULL key. + */ + if (skey->sk_flags & SK_ISNULL) + { + pfree(so->sign); + so->sign = NULL; + return 0; + } + + /* Add next value to the signature */ + signValue(&so->state, so->sign, skey->sk_argument, + skey->sk_attno - 1); + + skey++; + } + } + + /* + * We're going to read the whole index. This is why we use appropriate + * buffer access strategy. + */ + bas = GetAccessStrategy(BAS_BULKREAD); + npages = RelationGetNumberOfBlocks(scan->indexRelation); + + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + + buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, + blkno, RBM_NORMAL, bas); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + + if (!BloomPageIsDeleted(page)) + { + OffsetNumber offset, + maxOffset = BloomPageGetMaxOffset(page); + + for (offset = 1; offset <= maxOffset; offset++) + { + BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset); + bool res = true; + + /* Check index signature with scan signature */ + for (i = 0; res && i < so->state.opts->bloomLength; i++) + { + if ((itup->sign[i] & so->sign[i]) != so->sign[i]) + res = false; + } + + /* Add matching tuples to bitmap */ + if (res) + { + tbm_add_tuples(tbm, &itup->heapPtr, 1, true); + ntids++; + } + } + } + + UnlockReleaseBuffer(buffer); + CHECK_FOR_INTERRUPTS(); + } + FreeAccessStrategy(bas); + + return ntids; +} diff --git 
/*
 * Signature dealing macros.
 *
 * A signature is an array of SignType words; bit number i lives in word
 * i / BITSIGNTYPE at position i % BITSIGNTYPE.  CLRBIT and SETBIT expand
 * to assignment expressions and evaluate their arguments more than once,
 * so pass side-effect-free arguments.
 */
#define BITSIGNTYPE (BITS_PER_BYTE * sizeof(SignType))	/* bits per word */
#define GETWORD(x,i) ( *( (SignType*)(x) + (int)( (i) / BITSIGNTYPE ) ) )
#define CLRBIT(x,i)   GETWORD(x,i) &= ~( 0x01 << ( (i) % BITSIGNTYPE ) )
#define SETBIT(x,i)   GETWORD(x,i) |=  ( 0x01 << ( (i) % BITSIGNTYPE ) )
#define GETBIT(x,i) ( (GETWORD(x,i) >> ( (i) % BITSIGNTYPE )) & 0x01 )

PG_FUNCTION_INFO_V1(blhandler);

/* Kind of relation options for bloom index, registered in _PG_init() */
static relopt_kind bl_relopt_kind;

/* Private random number generator, defined further down in this file */
static int32 myRand();
static void mySrand(uint32 seed);
+ */ +void +_PG_init(void) +{ + int i; + char buf[16]; + + bl_relopt_kind = add_reloption_kind(); + + add_int_reloption(bl_relopt_kind, "length", + "Length of signature in uint16 type", 5, 1, 256); + + for (i = 0; i < INDEX_MAX_KEYS; i++) + { + snprintf(buf, 16, "col%d", i + 1); + add_int_reloption(bl_relopt_kind, buf, + "Number of bits for corresponding column", 2, 1, 2048); + } +} + +/* + * Bloom handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +Datum +blhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 1; + amroutine->amsupport = 1; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amkeytype = 0; + + amroutine->aminsert = blinsert; + amroutine->ambeginscan = blbeginscan; + amroutine->amgettuple = NULL; + amroutine->amgetbitmap = blgetbitmap; + amroutine->amrescan = blrescan; + amroutine->amendscan = blendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->ambuild = blbuild; + amroutine->ambuildempty = blbuildempty; + amroutine->ambulkdelete = blbulkdelete; + amroutine->amvacuumcleanup = blvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = blcostestimate; + amroutine->amoptions = bloptions; + amroutine->amvalidate = blvalidate; + + PG_RETURN_POINTER(amroutine); +} + +/* + * Fill BloomState structure for particular index. 
+ */ +void +initBloomState(BloomState *state, Relation index) +{ + int i; + + state->nColumns = index->rd_att->natts; + + /* Initialize hash function for each attribute */ + for (i = 0; i < index->rd_att->natts; i++) + { + fmgr_info_copy(&(state->hashFn[i]), + index_getprocinfo(index, i + 1, BLOOM_HASH_PROC), + CurrentMemoryContext); + } + + /* Initialize amcache if needed with options from metapage */ + if (!index->rd_amcache) + { + Buffer buffer; + Page page; + BloomMetaPageData *meta; + BloomOptions *opts; + + opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions)); + + buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buffer); + + if (!BloomPageIsMeta(page)) + elog(ERROR, "Relation is not a bloom index"); + meta = BloomPageGetMeta(BufferGetPage(buffer)); + + if (meta->magickNumber != BLOOM_MAGICK_NUMBER) + elog(ERROR, "Relation is not a bloom index"); + + *opts = meta->opts; + + UnlockReleaseBuffer(buffer); + + index->rd_amcache = (void *) opts; + } + + state->opts = (BloomOptions *) index->rd_amcache; + state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ + + sizeof(SignType) * state->opts->bloomLength; +} + +/* + * Random generator copied from FreeBSD. Using own random generator here for + * two reasons: + * + * 1) In this case random numbers are used for on-disk storage. Usage of + * PostgreSQL number generator would obstruct it from all possible changes. + * 2) Changing seed of PostgreSQL random generator would be undesirable side + * effect. + */ +static int32 next; + +static int32 +myRand() +{ + /* + * Compute x = (7^5 * x) mod (2^31 - 1) + * without overflowing 31 bits: + * (2^31 - 1) = 127773 * (7^5) + 2836 + * From "Random number generators: good ones are hard to find", + * Park and Miller, Communications of the ACM, vol. 31, no. 10, + * October 1988, p. 1195. + */ + int32 hi, lo, x; + + /* Must be in [1, 0x7ffffffe] range at this point. 
*/ + hi = next / 127773; + lo = next % 127773; + x = 16807 * lo - 2836 * hi; + if (x < 0) + x += 0x7fffffff; + next = x; + /* Transform to [0, 0x7ffffffd] range. */ + return (x - 1); +} + +void +mySrand(uint32 seed) +{ + next = seed; + /* Transform to [1, 0x7ffffffe] range. */ + next = (next % 0x7ffffffe) + 1; +} + +/* + * Add bits of given value to the signature. + */ +void +signValue(BloomState *state, SignType *sign, Datum value, int attno) +{ + uint32 hashVal; + int nBit, + j; + + /* + * init generator with "column's" number to get "hashed" seed for new + * value. We don't want to map the same numbers from different columns + * into the same bits! + */ + mySrand(attno); + + /* + * Init hash sequence to map our value into bits. the same values in + * different columns will be mapped into different bits because of step + * above + */ + hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value)); + mySrand(hashVal ^ myRand()); + + for (j = 0; j < state->opts->bitSize[attno]; j++) + { + /* prevent mutiple evaluation */ + nBit = myRand() % (state->opts->bloomLength * BITSIGNTYPE); + SETBIT(sign, nBit); + } +} + +/* + * Make bloom tuple from values. + */ +BloomTuple * +BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull) +{ + int i; + BloomTuple *res = (BloomTuple *) palloc0(state->sizeOfBloomTuple); + + res->heapPtr = *iptr; + + /* Blooming each column */ + for (i = 0; i < state->nColumns; i++) + { + /* skip nulls */ + if (isnull[i]) + continue; + + signValue(state, res->sign, values[i], i); + } + + return res; +} + +/* + * Add new bloom tuple to the page. Returns true if new tuple was successfully + * added to the page. Returns false if it doesn't git the page. 
+ */ +bool +BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple) +{ + BloomTuple *itup; + BloomPageOpaque opaque; + Pointer ptr; + + /* Does new tuple fit the page */ + if (BloomPageGetFreeSpace(state, page) < state->sizeOfBloomTuple) + return false; + + /* Copy new tuple to the end of page */ + opaque = BloomPageGetOpaque(page); + itup = BloomPageGetTuple(state, page, opaque->maxoff + 1); + memcpy((Pointer) itup, (Pointer) tuple, state->sizeOfBloomTuple); + + /* Adjust maxoff and pd_lower */ + opaque->maxoff++; + ptr = (Pointer) BloomPageGetTuple(state, page, opaque->maxoff + 1); + ((PageHeader) page)->pd_lower = ptr - page; + + return true; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * The returned buffer is already pinned and exclusive-locked + * Caller is responsible for initializing the page by calling BloomInitBuffer + */ +Buffer +BloomNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. 
+ */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + if (PageIsNew(page)) + return buffer; /* OK to use, if never initialized */ + + if (BloomPageIsDeleted(page)) + return buffer; /* OK to use */ + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +/* + * Initialize bloom page. + */ +void +BloomInitPage(Page page, uint16 flags) +{ + BloomPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData)); + + opaque = BloomPageGetOpaque(page); + memset(opaque, 0, sizeof(BloomPageOpaqueData)); + opaque->flags = flags; +} + +/* + * Adjust options of bloom index. + */ +static void +adjustBloomOptions(BloomOptions *opts) +{ + int i; + + /* Default length of bloom filter is 5 of 16-bit integers */ + if (opts->bloomLength <= 0) + opts->bloomLength = 5; + else + opts->bloomLength = opts->bloomLength; + + /* Check singnature length */ + for (i = 0; i < INDEX_MAX_KEYS; i++) + { + /* + * Zero and negative number of bits is meaningless. Also setting + * more bits than signature have seems useless. Replace both cases + * with 2 bits default. + */ + if (opts->bitSize[i] <= 0 + || opts->bitSize[i] >= opts->bloomLength * sizeof(SignType)) + opts->bitSize[i] = 2; + } +} + +/* + * Initialize metapage for bloom index. + */ +void +BloomInitMetapage(Relation index) +{ + Page metaPage; + Buffer metaBuffer; + BloomMetaPageData *metadata; + GenericXLogState *state; + + /* + * Make a new buffer, since it first buffer it should be associated with + * block number 0 (BLOOM_METAPAGE_BLKNO). 
+ */ + metaBuffer = BloomNewBuffer(index); + Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO); + + /* Initialize bloom index options */ + if (!index->rd_options) + index->rd_options = palloc0(sizeof(BloomOptions)); + adjustBloomOptions((BloomOptions *) index->rd_options); + + /* Initialize contents of meta page */ + state = GenericXLogStart(index); + metaPage = GenericXLogRegister(state, metaBuffer, true); + + BloomInitPage(metaPage, BLOOM_META); + metadata = BloomPageGetMeta(metaPage); + memset(metadata, 0, sizeof(BloomMetaPageData)); + metadata->magickNumber = BLOOM_MAGICK_NUMBER; + metadata->opts = *((BloomOptions *) index->rd_options); + ((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData); + + GenericXLogFinish(state); + UnlockReleaseBuffer(metaBuffer); +} + +/* + * Initialize options for bloom index. + */ +bytea * +bloptions(Datum reloptions, bool validate) +{ + relopt_value *options; + int numoptions; + BloomOptions *rdopts; + relopt_parse_elt tab[INDEX_MAX_KEYS + 1]; + int i; + char buf[16]; + + /* Option for length of signature */ + tab[0].optname = "length"; + tab[0].opttype = RELOPT_TYPE_INT; + tab[0].offset = offsetof(BloomOptions, bloomLength); + + /* Number of bits for each of possible columns: col1, col2, ... 
*/ + for (i = 0; i < INDEX_MAX_KEYS; i++) + { + snprintf(buf, sizeof(buf), "col%d", i + 1); + tab[i + 1].optname = pstrdup(buf); + tab[i + 1].opttype = RELOPT_TYPE_INT; + tab[i + 1].offset = offsetof(BloomOptions, bitSize[i]); + } + + options = parseRelOptions(reloptions, validate, bl_relopt_kind, &numoptions); + rdopts = allocateReloptStruct(sizeof(BloomOptions), options, numoptions); + fillRelOptions((void *) rdopts, sizeof(BloomOptions), options, numoptions, + validate, tab, INDEX_MAX_KEYS + 1); + + adjustBloomOptions(rdopts); + + return (bytea *) rdopts; +} diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c new file mode 100644 index 0000000000..fb8d9b8a5f --- /dev/null +++ b/contrib/bloom/blvacuum.c @@ -0,0 +1,212 @@ +/*------------------------------------------------------------------------- + * + * blvacuum.c + * Bloom VACUUM functions. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/bloom/blvacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "catalog/storage.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" + +#include "bloom.h" + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. 
+ */ +IndexBulkDeleteResult * +blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + Relation index = info->index; + BlockNumber blkno, + npages; + FreeBlockNumberArray notFullPage; + int countPage = 0; + BloomState state; + Buffer buffer; + Page page; + GenericXLogState *gxlogState; + + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + initBloomState(&state, index); + + /* + * Interate over the pages. We don't care about concurrently added pages, + * they can't contain tuples to delete. + */ + npages = RelationGetNumberOfBlocks(index); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + BloomTuple *itup, + *itupPtr, + *itupEnd; + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + gxlogState = GenericXLogStart(index); + page = GenericXLogRegister(gxlogState, buffer, false); + + if (BloomPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + CHECK_FOR_INTERRUPTS(); + continue; + } + + /* Iterate over the tuples */ + itup = BloomPageGetTuple(&state, page, 1); + itupPtr = BloomPageGetTuple(&state, page, 1); + itupEnd = BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1); + while (itup < itupEnd) + { + /* Do we have to delete this tuple? */ + if (callback(&itup->heapPtr, callback_state)) + { + stats->tuples_removed += 1; + BloomPageGetOpaque(page)->maxoff--; + } + else + { + if (itupPtr != itup) + { + /* + * If we already delete something before, we have to move + * this tuple backward. 
+ */ + memmove((Pointer) itupPtr, (Pointer) itup, + state.sizeOfBloomTuple); + } + stats->num_index_tuples++; + itupPtr = BloomPageGetNextTuple(&state, itupPtr); + } + + itup = BloomPageGetNextTuple(&state, itup); + } + + Assert(itupPtr == BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1)); + + if (!BloomPageIsDeleted(page) && + BloomPageGetFreeSpace(&state, page) > state.sizeOfBloomTuple && + countPage < BloomMetaBlockN) + notFullPage[countPage++] = blkno; + + /* Did we delete something? */ + if (itupPtr != itup) + { + /* Is it empty page now? */ + if (itupPtr == BloomPageGetData(page)) + BloomPageSetDeleted(page); + /* Adjust pg_lower */ + ((PageHeader) page)->pd_lower = (Pointer) itupPtr - page; + /* Finish WAL-logging */ + GenericXLogFinish(gxlogState); + } + else + { + /* Didn't change anything: abort WAL-logging */ + GenericXLogAbort(gxlogState); + } + UnlockReleaseBuffer(buffer); + CHECK_FOR_INTERRUPTS(); + } + + if (countPage > 0) + { + BloomMetaPageData *metaData; + + buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + gxlogState = GenericXLogStart(index); + page = GenericXLogRegister(gxlogState, buffer, false); + + metaData = BloomPageGetMeta(page); + memcpy(metaData->notFullPage, notFullPage, sizeof(FreeBlockNumberArray)); + metaData->nStart = 0; + metaData->nEnd = countPage; + + GenericXLogFinish(gxlogState); + UnlockReleaseBuffer(buffer); + } + + return stats; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. 
+ */ +IndexBulkDeleteResult * +blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation index = info->index; + BlockNumber npages, + blkno; + BlockNumber totFreePages; + + if (info->analyze_only) + return stats; + + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* + * Iterate over the pages: insert deleted pages into FSM and collect + * statistics. + */ + npages = RelationGetNumberOfBlocks(index); + totFreePages = 0; + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = (Page) BufferGetPage(buffer); + + if (BloomPageIsDeleted(page)) + { + RecordFreeIndexPage(index, blkno); + totFreePages++; + } + else + { + stats->num_index_tuples += BloomPageGetMaxOffset(page); + stats->estimated_count += BloomPageGetMaxOffset(page); + } + + UnlockReleaseBuffer(buffer); + } + + IndexFreeSpaceMapVacuum(info->index); + stats->pages_free = totFreePages; + stats->num_pages = RelationGetNumberOfBlocks(index); + + return stats; +} diff --git a/contrib/bloom/blvalidate.c b/contrib/bloom/blvalidate.c new file mode 100644 index 0000000000..12e7c7dbda --- /dev/null +++ b/contrib/bloom/blvalidate.c @@ -0,0 +1,220 @@ +/*------------------------------------------------------------------------- + * + * blvalidate.c + * Opclass validator for bloom. 
+ * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/bloom/blvalidate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +#include "bloom.h" + +/* + * Validator for a bloom opclass. + */ +bool +blvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + Oid opckeytype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opckeytype = classform->opckeytype; + if (!OidIsValid(opckeytype)) + opckeytype = opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, 
ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* + * All bloom support functions should be registered with matching + * left/right types + */ + if (procform->amproclefttype != procform->amprocrighttype) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opfamily %s contains support procedure %s with cross-type registration", + opfamilyname, + format_procedure(procform->amproc)))); + result = false; + } + + /* + * We can't check signatures except within the specific opclass, since + * we need to know the associated opckeytype in many cases. + */ + if (procform->amproclefttype != opcintype) + continue; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case BLOOM_HASH_PROC: + ok = check_amproc_signature(procform->amproc, INT4OID, false, + 1, 1, opckeytype); + break; + default: + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opfamily %s contains function %s with invalid support number %d", + opfamilyname, + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* don't want additional message */ + } + + if (!ok) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("gist opfamily %s contains function %s with wrong signature for support number %d", + opfamilyname, + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check it's allowed strategy for bloom */ + if (oprform->amopstrategy < 1 || + oprform->amopstrategy > BLOOM_NSTRATEGIES) + { + ereport(INFO, + 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opfamily %s contains operator %s with invalid strategy number %d", + opfamilyname, + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* bloom doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opfamily %s contains invalid ORDER BY specification for operator %s", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all bloom strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opfamily %s contains operator %s with wrong signature", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * There is not a lot we can do to check the operator sets, since each + * bloom opclass is more or less a law unto itself, and some contain + * only operators that are binary-compatible with the opclass datatype + * (meaning that empty operator sets can be OK). That case also means + * that we shouldn't insist on nonempty function sets except for the + * opclass's own group. 
+ */ + } + + /* Check that the originally-named opclass is complete */ + for (i = 1; i <= BLOOM_NPROC; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((uint64) 1) << i)) != 0) + continue; /* got it */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("bloom opclass %s is missing support function %d", + opclassname, i))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} diff --git a/contrib/bloom/expected/bloom.out b/contrib/bloom/expected/bloom.out new file mode 100644 index 0000000000..5e8269faf3 --- /dev/null +++ b/contrib/bloom/expected/bloom.out @@ -0,0 +1,122 @@ +CREATE EXTENSION bloom; +CREATE TABLE tst ( + i int4, + t text +); +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; +CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); +SET enable_seqscan=on; +SET enable_bitmapscan=off; +SET enable_indexscan=off; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 10000 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 6264 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 588 +(1 row) + +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; + QUERY PLAN +------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: (i = 7) + -> Bitmap Index Scan on bloomidx + Index Cond: (i = 7) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; + QUERY PLAN +------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: (t = '5'::text) + -> Bitmap Index Scan on bloomidx + Index Cond: (t = '5'::text) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + QUERY PLAN 
+--------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: ((i = 7) AND (t = '5'::text)) + -> Bitmap Index Scan on bloomidx + Index Cond: ((i = 7) AND (t = '5'::text)) +(5 rows) + +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 10000 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 6264 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 588 +(1 row) + +DELETE FROM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; +VACUUM ANALYZE tst; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 10000 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 6264 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 588 +(1 row) + +VACUUM FULL tst; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 10000 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 6264 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 588 +(1 row) + +RESET enable_seqscan; +RESET enable_bitmapscan; +RESET enable_indexscan; diff --git a/contrib/bloom/sql/bloom.sql b/contrib/bloom/sql/bloom.sql new file mode 100644 index 0000000000..f9d0ad45d9 --- /dev/null +++ b/contrib/bloom/sql/bloom.sql @@ -0,0 +1,47 @@ +CREATE EXTENSION bloom; + +CREATE TABLE tst ( + i int4, + t text +); + +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; +CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); + +SET enable_seqscan=on; +SET enable_bitmapscan=off; +SET enable_indexscan=off; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; 
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +DELETE FROM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; +VACUUM ANALYZE tst; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +VACUUM FULL tst; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +RESET enable_seqscan; +RESET enable_bitmapscan; +RESET enable_indexscan; diff --git a/contrib/bloom/t/001_wal.pl b/contrib/bloom/t/001_wal.pl new file mode 100644 index 0000000000..dbb6a905b6 --- /dev/null +++ b/contrib/bloom/t/001_wal.pl @@ -0,0 +1,75 @@ +# Test generic xlog record work for bloom index replication. +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 31; + +my $node_master; +my $node_standby; + +# Run few queries on both master and standby and check their results match. 
# test_index_replay($test_name)
#
# Waits until the standby has caught up with the master, then runs the same
# batch of bloom-index queries on both nodes and checks (as one Test::More
# test named after $test_name) that the two psql results are equal.
# Uses the file-level $node_master / $node_standby handles.
sub test_index_replay
{
	my ($test_name) = @_;

	# Wait for standby to catch up
	# NOTE(review): the check is made against write_location; confirm that
	# this is sufficient for the standby to have replayed the changes too.
	my $applname = $node_standby->name;
	my $caughtup_query =
		"SELECT pg_current_xlog_location() <= write_location FROM pg_stat_replication WHERE application_name = '$applname';";
	$node_master->poll_query_until('postgres', $caughtup_query)
	  or die "Timed out while waiting for standby 1 to catch up";

	# Sequential scans are disabled so that the planner is steered towards
	# the bloom index for the single-column and combined predicates below.
	my $queries = qq(SET enable_seqscan=off;
SET enable_bitmapscan=on;
SET enable_indexscan=on;
SELECT * FROM tst WHERE i = 0;
SELECT * FROM tst WHERE i = 3;
SELECT * FROM tst WHERE t = 'b';
SELECT * FROM tst WHERE t = 'f';
SELECT * FROM tst WHERE i = 3 AND t = 'c';
SELECT * FROM tst WHERE i = 7 AND t = 'e';
);

	# Run test queries and compare their result
	my $master_result = $node_master->psql("postgres", $queries);
	my $standby_result = $node_standby->psql("postgres", $queries);

	is($master_result, $standby_result, "$test_name: query result matches");
}

# Initialize master node
$node_master = get_new_node('master');
$node_master->init(allows_streaming => 1);
$node_master->start;
my $backup_name = 'my_backup';

# Take backup
$node_master->backup($backup_name);

# Create streaming standby linking to master
$node_standby = get_new_node('standby');
$node_standby->init_from_backup($node_master, $backup_name,
	has_streaming => 1);
$node_standby->start;

# Create some bloom index on master
# (the table and index are created after the backup was taken, so they reach
# the standby only through streamed WAL)
$node_master->psql("postgres", "CREATE EXTENSION bloom;");
$node_master->psql("postgres", "CREATE TABLE tst (i int4, t text);");
$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;");
$node_master->psql("postgres", "CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);");

# Test that queries give same result
test_index_replay('initial');

# Run 10 cycles of table modification. Run test queries after each modification.
+for my $i (1..10) +{ + $node_master->psql("postgres", "DELETE FROM tst WHERE i = $i;"); + test_index_replay("delete $i"); + $node_master->psql("postgres", "VACUUM tst;"); + test_index_replay("vacuum $i"); + my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000); + $node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series($start,$end) i;"); + test_index_replay("insert $i"); +} diff --git a/doc/src/sgml/bloom.sgml b/doc/src/sgml/bloom.sgml new file mode 100644 index 0000000000..c207e6dd68 --- /dev/null +++ b/doc/src/sgml/bloom.sgml @@ -0,0 +1,218 @@ + + + + bloom + + + bloom + + + + bloom is a contrib module which implements an index access method. It comes + as an example of custom access methods and generic WAL record usage, but it + is also useful in its own right. + + + + Introduction + + + The implementation of a + Bloom filter + allows fast exclusion of non-candidate tuples. + Since a signature is a lossy representation of all indexed attributes, + search results must be rechecked using heap information. + The user can specify the signature length (in uint16, default is 5) and the number of + bits that can be set per attribute (1 < colN < 2048). + + + + This index is useful if a table has many attributes and queries can include + arbitrary combinations of them. A traditional btree index is faster + than a bloom index, but it would require too many indexes to support all possible + queries, while only one bloom index is needed. A bloom index supports only + equality comparison. Since it's a signature file, not a tree, it always + has to be read fully, but sequentially, so index search performance is + constant and doesn't depend on the query. + + + + + Parameters + + + bloom indexes accept the following parameters in the WITH + clause. + + + + + length + + + Length of signature in uint16 type values + + + + + + + col1 — col16 + + + Number of bits for corresponding column + + + + + + + + Examples + + + An example of an index definition is given below. 
+ + + +CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3) + WITH (length=5, col1=2, col2=2, col3=4); + + + + Here, we create a bloom index with a signature length of 80 bits, with attributes + i1 and i2 mapped to 2 bits each and attribute i3 mapped to 4 bits. + + + + An example of index definition and usage is given below. + + + +CREATE TABLE tbloom AS +SELECT + random()::int as i1, + random()::int as i2, + random()::int as i3, + random()::int as i4, + random()::int as i5, + random()::int as i6, + random()::int as i7, + random()::int as i8, + random()::int as i9, + random()::int as i10, + random()::int as i11, + random()::int as i12, + random()::int as i13 +FROM + generate_series(1,1000); +CREATE INDEX bloomidx ON tbloom USING + bloom (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12); +SELECT pg_relation_size('bloomidx'); +CREATE index btree_idx ON tbloom(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12); +SELECT pg_relation_size('btree_idx'); + + + +=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tbloom (cost=1.50..5.52 rows=1 width=52) (actual time=0.057..0.057 rows=0 loops=1) + Recheck Cond: ((i2 = 20) AND (i10 = 15)) + -> Bitmap Index Scan on bloomidx (cost=0.00..1.50 rows=1 width=0) (actual time=0.041..0.041 rows=9 loops=1) + Index Cond: ((i2 = 20) AND (i10 = 15)) + Total runtime: 0.081 ms +(5 rows) + + + + Seqscan is slow. + + + +=# SET enable_bitmapscan = off; +=# SET enable_indexscan = off; +=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.162..0.162 rows=0 loops=1) + Filter: ((i2 = 20) AND (i10 = 15)) + Total runtime: 0.181 ms +(3 rows) + + + + A btree index will not be used for this query. 
+ + + +=# DROP INDEX bloomidx; +=# CREATE INDEX btree_idx ON tbloom(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12); +=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.210..0.210 rows=0 loops=1) + Filter: ((i2 = 20) AND (i10 = 15)) + Total runtime: 0.250 ms +(3 rows) + + + + + Opclass interface + + + The bloom opclass interface is simple. It requires 1 supporting function: + a hash function for the indexed datatype. And it provides 1 search operator: + the equality operator. The example below shows an opclass definition + for the text datatype. + + + +CREATE OPERATOR CLASS text_ops +DEFAULT FOR TYPE text USING bloom AS + OPERATOR 1 =(text, text), + FUNCTION 1 hashtext(text); + + + + + Limitation + + + + + + For now, only opclasses for int4 and text come + with contrib. However, users may define more of them. + + + + + + Only the = operator is supported for search now. But it's + possible to add support for arrays with contains and intersection + operations in the future. + + + + + + + + Authors + + + Teodor Sigaev teodor@postgrespro.ru, Postgres Professional, Moscow, Russia + + + + Alexander Korotkov a.korotkov@postgrespro.ru, Postgres Professional, Moscow, Russia + + + + Oleg Bartunov obartunov@postgrespro.ru, Postgres Professional, Moscow, Russia + + + + diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 4e3f337125..c8708ecf8b 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -105,6 +105,7 @@ CREATE EXTENSION module_name FROM unpackaged; &adminpack; &auth-delay; &auto-explain; + &bloom; &btree-gin; &btree-gist; &chkpass; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 9046f50628..6c0ad3ffaa 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -107,6 +107,7 @@ +