From 620b49a16d2a16ce6f9edf072a88111f981919d0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 4 Aug 2017 15:29:26 -0400 Subject: [PATCH] hash: Increase the number of possible overflow bitmaps by 8x. Per a report from AP, it's not that hard to exhaust the supply of bitmap pages if you create a table with a hash index and then insert a few billion rows - and then you start getting errors when you try to insert additional rows. In the particular case reported by AP, there's another fix that we can make to improve recycling of overflow pages, which is another way to avoid the error, but there may be other cases where this problem happens and that fix won't help. So let's buy ourselves as much headroom as we can without rearchitecting anything. The comments claim that the old limit was 64GB, but it was really only 32GB, because we didn't use all the bits in the page for bitmap bits - only the largest power of 2 that could fit after deducting space for the page header and so forth. Thus, we have 4kB per page for bitmap bits, not 8kB. The new limit is thus actually 8 times the old *real* limit but only 4 times the old *purported* limit. Since this breaks on-disk compatibility, bump HASH_VERSION. We've already done this earlier in this release cycle, so this doesn't cause any incremental inconvenience for people using pg_upgrade from releases prior to v10. However, users who use pg_upgrade to reach 10beta3 or later from 10beta2 or earlier will need to REINDEX any hash indexes again. Amit Kapila and Robert Haas Discussion: http://postgr.es/m/20170704105728.mwb72jebfmok2nm2@zip.com.au --- contrib/pageinspect/expected/hash.out | 6 +++--- contrib/pgstattuple/expected/pgstattuple.out | 4 ++-- doc/src/sgml/pageinspect.sgml | 13 +++++++++---- doc/src/sgml/pgstattuple.sgml | 2 +- src/include/access/hash.h | 9 ++++----- 5 files changed, 19 insertions(+), 15 deletions(-) diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out index 39374158b1..75d7bcfad5 100644 --- a/contrib/pageinspect/expected/hash.out +++ b/contrib/pageinspect/expected/hash.out @@ -43,9 +43,9 @@ ERROR: invalid overflow block number 5 SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM hash_metapage_info(get_raw_page('test_hash_a_idx', 0)); --[ RECORD 1 ]---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +-[ RECORD 1 ]-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- magic | 105121344 -version | 3 +version | 4 ntuples | 1 bsize | 8152 bmsize | 4096 @@ -58,7 +58,7 @@ firstfree | 0 nmaps | 1 procid | 450 spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} -mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index c7c1732827..20b5585d03 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -134,7 +134,7 @@ create index test_hashidx on test using hash (b); select * from pgstathashindex('test_hashidx'); version | bucket_pages | overflow_pages | bitmap_pages | unused_pages | live_items | dead_items | free_percent ---------+--------------+----------------+--------------+--------------+------------+------------+-------------- - 3 | 4 | 0 | 1 | 0 | 0 | 0 | 100 + 4 | 4 | 0 | 1 | 0 | 0 | 0 | 100 (1 row) -- these should error with the wrong type @@ -235,7 +235,7 @@ select pgstatindex('test_partition_idx'); select pgstathashindex('test_partition_hash_idx'); pgstathashindex --------------------- - (3,8,0,1,0,0,0,100) + (4,8,0,1,0,0,0,100) (1 row) drop table test_partitioned; diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index ccdaf3e0ac..e46f5ca6bc 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -687,8 +687,13 @@ test=# SELECT * FROM hash_bitmap_info('con_hash_index', 2052); hash_metapage_info returns information stored in meta page of a HASH index. For example: -test=# SELECT * FROM hash_metapage_info(get_raw_page('con_hash_index', 0)); --[ RECORD 1 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +test=# SELECT magic, version, ntuples, ffactor, bsize, bmsize, bmshift, +test-# maxbucket, highmask, lowmask, ovflpoint, firstfree, nmaps, procid, +test-# regexp_replace(spares::text, '(,0)*}', '}') as spares, +test-# regexp_replace(mapp::text, '(,0)*}', '}') as mapp +test-# FROM hash_metapage_info(get_raw_page('con_hash_index', 0)); +-[ RECORD 1 ]------------------------------------------------------------------------------- +spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204} magic | 105121344 version | 3 ntuples | 500500 @@ -703,8 +708,8 @@ ovflpoint | 28 firstfree | 1204 nmaps | 1 procid | 450 -spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} -mapp | {65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +spares | {0,0,0,0,0,0,1,1,1,1,1,1,1,1,3,4,4,4,45,55,58,59,508,567,628,704,1193,1202,1204} +mapp | {65} diff --git a/doc/src/sgml/pgstattuple.sgml b/doc/src/sgml/pgstattuple.sgml index e98e04fa2f..a7c67ae645 100644 --- a/doc/src/sgml/pgstattuple.sgml +++ b/doc/src/sgml/pgstattuple.sgml @@ -368,7 +368,7 @@ pending_tuples | 0 test=> select * from pgstathashindex('con_hash_index'); -[ RECORD 1 ]--+----------------- -version | 2 +version | 4 bucket_pages | 33081 overflow_pages | 0 bitmap_pages | 1 diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 7fa868b556..72fce3038c 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -158,8 +158,7 @@ typedef HashScanOpaqueData *HashScanOpaque; #define HASH_METAPAGE 0 /* metapage is always block 0 */ #define HASH_MAGIC 0x6440640 -#define HASH_VERSION 3 /* 3 signifies multi-phased bucket allocation - * to reduce doubling */ +#define HASH_VERSION 4 /* * spares[] holds the number of overflow pages currently allocated at or @@ -182,10 +181,10 @@ typedef HashScanOpaqueData *HashScanOpaque; * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE). * * There is no particular upper limit on the size of mapp[], other than - * needing to fit into the metapage. (With 8K block size, 128 bitmaps - * limit us to 64 GB of overflow space...) + * needing to fit into the metapage. (With 8K block size, 1024 bitmaps + * limit us to 256 GB of overflow space...) */ -#define HASH_MAX_BITMAPS 128 +#define HASH_MAX_BITMAPS 1024 #define HASH_SPLITPOINT_PHASE_BITS 2 #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)