diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 2fe9867353..e85abcfd72 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -23,6 +23,7 @@ #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "storage/smgr.h" #include "utils/tqual.h" @@ -85,7 +86,6 @@ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); - /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * @@ -111,32 +111,121 @@ _bt_doinsert(Relation rel, IndexTuple itup, bool is_unique = false; int natts = rel->rd_rel->relnatts; ScanKey itup_scankey; - BTStack stack; + BTStack stack = NULL; Buffer buf; OffsetNumber offset; + bool fastpath; /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, itup); -top: - /* find the first page containing this key */ - stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL); - - offset = InvalidOffsetNumber; - - /* trade in our read lock for a write lock */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBuffer(buf, BT_WRITE); - /* - * If the page was split between the time that we surrendered our read - * lock and acquired our write lock, then this page may no longer be the - * right place for the key we want to insert. In this case, we need to - * move right in the tree. See Lehman and Yao for an excruciatingly - * precise description. + * It's very common to have an index on an auto-incremented or + * monotonically increasing value. In such cases, every insertion happens + * towards the end of the index. We try to optimise that case by caching + * the right-most leaf of the index. If our cached block is still the + * rightmost leaf, has enough free space to accommodate a new entry and + * the insertion key is strictly greater than the first key in this page, + * then we can safely conclude that the new key will be inserted in the + * cached block. So we simply search within the cached block and insert the + * key at the appropriate location. We call it a fastpath. + * + * Testing has revealed, though, that the fastpath can result in increased + * contention on the exclusive-lock on the rightmost leaf page. So we + * conditionally check if the lock is available. If it's not available then + * we simply abandon the fastpath and take the regular path. This makes + * sense because unavailability of the lock also signals that some other + * backend might be concurrently inserting into the page, thus reducing our + * chances to finding an insertion place in this page. */ - buf = _bt_moveright(rel, buf, natts, itup_scankey, false, - true, stack, BT_WRITE, NULL); +top: + fastpath = false; + offset = InvalidOffsetNumber; + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + Size itemsz; + Page page; + BTPageOpaque lpageop; + + /* + * Conditionally acquire exclusive lock on the buffer before doing any + * checks. If we don't get the lock, we simply follow slowpath. If we + * do get the lock, this ensures that the index state cannot change, as + * far as the rightmost part of the index is concerned. + */ + buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + + if (ConditionalLockBuffer(buf)) + { + _bt_checkpage(rel, buf); + + page = BufferGetPage(buf); + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this + * but we need to be consistent */ + + /* + * Check if the page is still the rightmost leaf page, has enough + * free space to accommodate the new tuple, no split is in progress + * and the scankey is greater than or equal to the first key on the + * page. + */ + if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && + !P_INCOMPLETE_SPLIT(lpageop) && + !P_IGNORE(lpageop) && + (PageGetFreeSpace(page) > itemsz) && + PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && + _bt_compare(rel, natts, itup_scankey, page, + P_FIRSTDATAKEY(lpageop)) > 0) + { + fastpath = true; + } + else + { + _bt_relbuf(rel, buf); + + /* + * Something did not workout. Just forget about the cached + * block and follow the normal path. It might be set again if + * the conditions are favourble. + */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + else + { + ReleaseBuffer(buf); + + /* + * If someone's holding a lock, it's likely to change anyway, + * so don't try again until we get an updated rightmost leaf. + */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + + if (!fastpath) + { + /* find the first page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, + NULL); + + /* trade in our read lock for a write lock */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our read + * lock and acquired our write lock, then this page may no longer be + * the right place for the key we want to insert. In this case, we + * need to move right in the tree. See Lehman and Yao for an + * excruciatingly precise description. + */ + buf = _bt_moveright(rel, buf, natts, itup_scankey, false, + true, stack, BT_WRITE, NULL); + } /* * If we're not allowing duplicates, make sure the key isn't already in @@ -184,7 +273,8 @@ top: XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); /* start over... */ - _bt_freestack(stack); + if (stack) + _bt_freestack(stack); goto top; } } @@ -211,7 +301,8 @@ top: } /* be tidy */ - _bt_freestack(stack); + if (stack) + _bt_freestack(stack); _bt_freeskey(itup_scankey); return is_unique; @@ -879,7 +970,16 @@ _bt_insertonpg(Relation rel, XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); if (P_ISLEAF(lpageop)) + { xlinfo = XLOG_BTREE_INSERT_LEAF; + + /* + * Cache the block information if we just inserted into the + * rightmost leaf page of the index. + */ + if (P_RIGHTMOST(lpageop)) + RelationSetTargetBlock(rel, BufferGetBlockNumber(buf)); + } else { /* diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index fb32ffffdc..ecd2b110ff 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -1067,6 +1067,255 @@ select tableoid::regclass, * from idxpart order by a; (8 rows) drop table idxpart; +-- test fastpath mechanism for index insertion +create table fastpath (a int, b text, c numeric); +create unique index fpindex1 on fastpath(a); +insert into fastpath values (1, 'b1', 100.00); +insert into fastpath values (1, 'b1', 100.00); -- unique key check +ERROR: duplicate key value violates unique constraint "fpindex1" +DETAIL: Key (a)=(1) already exists. +truncate fastpath; +insert into fastpath select generate_series(1,10000), 'b', 100; +-- vacuum the table so as to improve chances of index-only scans. we can't +-- guarantee if index-only scans will be picked up in all cases or not, but +-- that fuzziness actually helps the test. +vacuum fastpath; +set enable_seqscan to false; +set enable_bitmapscan to false; +explain select sum(a) from fastpath where a = 6456; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate (cost=4.31..4.32 rows=1 width=8) + -> Index Only Scan using fpindex1 on fastpath (cost=0.29..4.30 rows=1 width=4) + Index Cond: (a = 6456) +(3 rows) + +explain select sum(a) from fastpath where a >= 5000 and a < 5700; + QUERY PLAN +------------------------------------------------------------------------------------- + Aggregate (cost=5.41..5.42 rows=1 width=8) + -> Index Only Scan using fpindex1 on fastpath (cost=0.29..5.29 rows=50 width=4) + Index Cond: ((a >= 5000) AND (a < 5700)) +(3 rows) + +select sum(a) from fastpath where a = 6456; + sum +------ + 6456 +(1 row) + +select sum(a) from fastpath where a >= 5000 and a < 5700; + sum +--------- + 3744650 +(1 row) + +-- drop the only index on the table and compute hashes for +-- a few queries which orders the results in various different ways. +drop index fpindex1; +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +-- now create a multi-column index with both column asc +create index fpindex2 on fastpath(a, b); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +-- again, vacuum here either forces index-only scans or creates fuzziness +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +-- same queries with a different kind of index now. the final result must not +-- change irrespective of what kind of index we have. +drop index fpindex2; +create index fpindex3 on fastpath(a desc, b asc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +-- repeat again +drop index fpindex3; +create index fpindex4 on fastpath(a asc, b desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +-- and again, this time indexing by (b, a). Note that column "b" has non-unique +-- values. +drop index fpindex4; +create index fpindex5 on fastpath(b asc, a desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +-- one last time +drop index fpindex5; +create index fpindex6 on fastpath(b desc, a desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 6167a852b3e0679886b84a5405b5b53d +(1 row) + +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + dfcf2bd5e5fea8397d47b2fd14618d31 +(1 row) + +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + md5 +---------------------------------- + 2ca216010a558a52d7df12f76dfc77ab +(1 row) + +drop table fastpath; -- intentionally leave some objects around create table idxpart (a int) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (100); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index c4ab89fc48..80f4adc6aa 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -574,6 +574,122 @@ insert into idxpart values (857142, 'six'); select tableoid::regclass, * from idxpart order by a; drop table idxpart; +-- test fastpath mechanism for index insertion +create table fastpath (a int, b text, c numeric); +create unique index fpindex1 on fastpath(a); + +insert into fastpath values (1, 'b1', 100.00); +insert into fastpath values (1, 'b1', 100.00); -- unique key check + +truncate fastpath; +insert into fastpath select generate_series(1,10000), 'b', 100; + +-- vacuum the table so as to improve chances of index-only scans. we can't +-- guarantee if index-only scans will be picked up in all cases or not, but +-- that fuzziness actually helps the test. +vacuum fastpath; + +set enable_seqscan to false; +set enable_bitmapscan to false; + +explain select sum(a) from fastpath where a = 6456; +explain select sum(a) from fastpath where a >= 5000 and a < 5700; +select sum(a) from fastpath where a = 6456; +select sum(a) from fastpath where a >= 5000 and a < 5700; + +-- drop the only index on the table and compute hashes for +-- a few queries which orders the results in various different ways. +drop index fpindex1; +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +-- now create a multi-column index with both column asc +create index fpindex2 on fastpath(a, b); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +-- again, vacuum here either forces index-only scans or creates fuzziness +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +-- same queries with a different kind of index now. the final result must not +-- change irrespective of what kind of index we have. +drop index fpindex2; +create index fpindex3 on fastpath(a desc, b asc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +-- repeat again +drop index fpindex3; +create index fpindex4 on fastpath(a asc, b desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +-- and again, this time indexing by (b, a). Note that column "b" has non-unique +-- values. +drop index fpindex4; +create index fpindex5 on fastpath(b asc, a desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +-- one last time +drop index fpindex5; +create index fpindex6 on fastpath(b desc, a desc); +truncate fastpath; +insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y; +vacuum fastpath; +select md5(string_agg(a::text, b order by a, b asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a desc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; +select md5(string_agg(a::text, b order by b, a asc)) from fastpath + where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3'; + +drop table fastpath; + -- intentionally leave some objects around create table idxpart (a int) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (100);