2016-04-01 15:42:24 +02:00
|
|
|
<!-- doc/src/sgml/bloom.sgml -->
|
|
|
|
|
|
|
|
<sect1 id="bloom" xreflabel="bloom">
|
|
|
|
<title>bloom</title>
|
|
|
|
|
|
|
|
<indexterm zone="bloom">
|
|
|
|
<primary>bloom</primary>
|
|
|
|
</indexterm>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
<literal>bloom</> is a module which implements an index access method. It comes
|
|
|
|
as an example of custom access methods and generic WAL records usage. But it
|
|
|
|
is also useful in itself.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Introduction</title>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
The implementation of a
|
2016-04-01 15:42:24 +02:00
|
|
|
<ulink url="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</ulink>
|
2016-04-01 17:47:44 +02:00
|
|
|
allows fast exclusion of non-candidate tuples via signatures.
|
2016-05-31 19:56:25 +02:00
|
|
|
Since a signature is a lossy representation of all indexed attributes,
|
|
|
|
search results must be rechecked using heap information.
|
2016-04-01 17:47:44 +02:00
|
|
|
The user can specify signature length (in uint16, default is 5) and the
|
|
|
|
number of bits, which can be set per attribute (1 < colN < 2048).
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
This index is useful if a table has many attributes and queries include
|
|
|
|
arbitrary combinations of them. A traditional <literal>btree</> index is
|
|
|
|
faster than a bloom index, but it can require many indexes to support all
|
|
|
|
possible queries where one needs only a single bloom index. A Bloom index
|
|
|
|
supports only equality comparison. Since it's a signature file, and not a
|
|
|
|
tree, it always must be read fully, but sequentially, so that index search
|
2016-05-31 19:56:25 +02:00
|
|
|
performance is constant and doesn't depend on a query.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Parameters</title>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
<literal>bloom</> indexes accept the following parameters in the
|
|
|
|
<literal>WITH</>
|
2016-04-01 15:42:24 +02:00
|
|
|
clause.
|
|
|
|
</para>
|
|
|
|
|
|
|
|
<variablelist>
|
|
|
|
<varlistentry>
|
|
|
|
<term><literal>length</></term>
|
|
|
|
<listitem>
|
|
|
|
<para>
|
|
|
|
Length of signature in uint16 type values
|
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</varlistentry>
|
|
|
|
</variablelist>
|
|
|
|
<variablelist>
|
|
|
|
<varlistentry>
|
|
|
|
<term><literal>col1 — col16</></term>
|
|
|
|
<listitem>
|
|
|
|
<para>
|
|
|
|
Number of bits for corresponding column
|
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</varlistentry>
|
|
|
|
</variablelist>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Examples</title>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
An example of an index definition is given below.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<programlisting>
|
2016-05-24 14:27:48 +02:00
|
|
|
CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3)
|
2016-04-01 15:42:24 +02:00
|
|
|
WITH (length=5, col1=2, col2=2, col3=4);
|
|
|
|
</programlisting>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Here, we created a bloom index with a signature length of 80 bits,
|
|
|
|
and attributes i1 and i2 mapped to 2 bits, and attribute i3 to 4 bits.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Here is a fuller example of index definition and usage:
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<programlisting>
|
|
|
|
CREATE TABLE tbloom AS
|
|
|
|
SELECT
|
|
|
|
random()::int as i1,
|
|
|
|
random()::int as i2,
|
|
|
|
random()::int as i3,
|
|
|
|
random()::int as i4,
|
|
|
|
random()::int as i5,
|
|
|
|
random()::int as i6,
|
|
|
|
random()::int as i7,
|
|
|
|
random()::int as i8,
|
|
|
|
random()::int as i9,
|
|
|
|
random()::int as i10,
|
|
|
|
random()::int as i11,
|
|
|
|
random()::int as i12,
|
|
|
|
random()::int as i13
|
|
|
|
FROM
|
|
|
|
generate_series(1,1000);
|
|
|
|
CREATE INDEX bloomidx ON tbloom USING
|
|
|
|
bloom (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
|
|
|
|
SELECT pg_relation_size('bloomidx');
|
|
|
|
CREATE index btree_idx ON tbloom(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12);
|
|
|
|
SELECT pg_relation_size('btree_idx');
|
|
|
|
</programlisting>
|
|
|
|
|
|
|
|
<programlisting>
|
|
|
|
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
|
|
|
|
QUERY PLAN
|
|
|
|
-----------------------------------------------------------------------------------------------------------------
|
|
|
|
Bitmap Heap Scan on tbloom (cost=1.50..5.52 rows=1 width=52) (actual time=0.057..0.057 rows=0 loops=1)
|
|
|
|
Recheck Cond: ((i2 = 20) AND (i10 = 15))
|
|
|
|
-> Bitmap Index Scan on bloomidx (cost=0.00..1.50 rows=1 width=0) (actual time=0.041..0.041 rows=9 loops=1)
|
|
|
|
Index Cond: ((i2 = 20) AND (i10 = 15))
|
|
|
|
Total runtime: 0.081 ms
|
|
|
|
(5 rows)
|
|
|
|
</programlisting>
|
|
|
|
|
|
|
|
<para>
|
|
|
|
Seqscan is slow.
|
|
|
|
</para>
|
|
|
|
|
|
|
|
<programlisting>
|
|
|
|
=# SET enable_bitmapscan = off;
|
|
|
|
=# SET enable_indexscan = off;
|
|
|
|
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
|
|
|
|
QUERY PLAN
|
|
|
|
--------------------------------------------------------------------------------------------------
|
|
|
|
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.162..0.162 rows=0 loops=1)
|
|
|
|
Filter: ((i2 = 20) AND (i10 = 15))
|
|
|
|
Total runtime: 0.181 ms
|
|
|
|
(3 rows)
|
|
|
|
</programlisting>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
A btree index will be not used for this query.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<programlisting>
|
|
|
|
=# DROP INDEX bloomidx;
|
|
|
|
=# CREATE INDEX btree_idx ON tbloom(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
|
|
|
|
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
|
|
|
|
QUERY PLAN
|
|
|
|
--------------------------------------------------------------------------------------------------
|
|
|
|
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.210..0.210 rows=0 loops=1)
|
|
|
|
Filter: ((i2 = 20) AND (i10 = 15))
|
|
|
|
Total runtime: 0.250 ms
|
|
|
|
(3 rows)
|
|
|
|
</programlisting>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Opclass interface</title>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
The Bloom opclass interface is simple. It requires 1 supporting function:
|
|
|
|
a hash function for the indexing datatype. It provides 1 search operator:
|
|
|
|
the equality operator. The example below shows <literal>opclass</>
|
|
|
|
definition for <literal>text</> datatype.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<programlisting>
|
|
|
|
CREATE OPERATOR CLASS text_ops
|
|
|
|
DEFAULT FOR TYPE text USING bloom AS
|
|
|
|
OPERATOR 1 =(text, text),
|
|
|
|
FUNCTION 1 hashtext(text);
|
|
|
|
</programlisting>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Limitation</title>
|
|
|
|
<para>
|
|
|
|
|
|
|
|
<itemizedlist>
|
|
|
|
<listitem>
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
For now, only opclasses for <literal>int4</>, <literal>text</> come
|
|
|
|
with the module. However, users may define more of them.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
|
|
|
|
<listitem>
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Only the <literal>=</literal> operator is supported for search at the
|
|
|
|
moment. But it's possible to add support for arrays with contains and
|
|
|
|
intersection operations in the future.
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</itemizedlist>
|
|
|
|
</para>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
<sect2>
|
|
|
|
<title>Authors</title>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Teodor Sigaev <email>teodor@postgrespro.ru</email>,
|
|
|
|
Postgres Professional, Moscow, Russia
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>,
|
|
|
|
Postgres Professional, Moscow, Russia
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2016-04-01 17:47:44 +02:00
|
|
|
Oleg Bartunov <email>obartunov@postgrespro.ru</email>,
|
|
|
|
Postgres Professional, Moscow, Russia
|
2016-04-01 15:42:24 +02:00
|
|
|
</para>
|
|
|
|
</sect2>
|
|
|
|
|
|
|
|
</sect1>
|