Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
-- When there is a LIMIT clause, incremental sort is beneficial because
|
|
|
|
-- it only has to sort some of the groups, and not the entire table.
|
|
|
|
explain (costs off)
|
|
|
|
select * from (select * from tenk1 order by four) t order by four, ten
|
|
|
|
limit 1;
|
|
|
|
QUERY PLAN
|
|
|
|
-----------------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: tenk1.four, tenk1.ten
|
|
|
|
Presorted Key: tenk1.four
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.four
|
|
|
|
-> Seq Scan on tenk1
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
-- When work_mem is not enough to sort the entire table, incremental sort
|
|
|
|
-- may be faster if individual groups still fit into work_mem.
|
|
|
|
set work_mem to '2MB';
|
|
|
|
explain (costs off)
|
|
|
|
select * from (select * from tenk1 order by four) t order by four, ten;
|
|
|
|
QUERY PLAN
|
|
|
|
-----------------------------------
|
|
|
|
Incremental Sort
|
|
|
|
Sort Key: tenk1.four, tenk1.ten
|
|
|
|
Presorted Key: tenk1.four
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.four
|
|
|
|
-> Seq Scan on tenk1
|
|
|
|
(6 rows)
|
|
|
|
|
|
|
|
reset work_mem;
|
|
|
|
create table t(a integer, b integer);
|
|
|
|
-- Runs EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) on the supplied
-- query text and returns the plan one line per row, with every "<digits>kB"
-- figure rewritten to "NNkB".
--
-- query: text of the statement to explain; it is appended verbatim to the
--        EXPLAIN prefix, so it must be trusted input.
create or replace function explain_analyze_without_memory(query text)
returns table (out_line text)
as
$$
declare
  plan_line text;
begin
  for plan_line in
    execute 'explain (analyze, costs off, summary off, timing off) ' || query
  loop
    -- Mask the reported sizes so the emitted text does not carry the
    -- actual kB numbers.
    out_line := regexp_replace(plan_line, '\d+kB', 'NNkB', 'g');
    return next;
  end loop;
end;
$$ language plpgsql;
|
|
|
|
-- Runs EXPLAIN (ANALYZE, FORMAT JSON) on the supplied query text and returns
-- a jsonb array holding every plan node whose "Node Type" is
-- 'Incremental Sort'. The "Plans" (children) key is removed from each
-- node before it is examined, so returned nodes carry no subplans.
--
-- query: text of the statement to explain; appended verbatim to the EXPLAIN
--        prefix, so it must be trusted input.
create or replace function explain_analyze_inc_sort_nodes(query text)
returns jsonb
as
$$
declare
  worklist jsonb;                        -- JSON values still to be visited
  item jsonb;                            -- value currently being visited
  inc_sort_nodes jsonb := '[]'::jsonb;   -- accumulated matches
begin
  execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query
    into strict worklist;

  -- Walk the plan tree iteratively: pop the head of the worklist, append
  -- whatever it contains to the tail, and record matching nodes.
  while jsonb_array_length(worklist) > 0 loop
    item := worklist->0;
    worklist := worklist - 0;

    case jsonb_typeof(item)
      when 'array' then
        -- An array's members are queued individually.
        if jsonb_array_length(item) > 0 then
          worklist := worklist || item;
        end if;
      when 'object' then
        -- An object with a "Plan" key is only a wrapper: queue the plan it
        -- holds and move on.
        if item ? 'Plan' then
          worklist := worklist || jsonb_build_array(item->'Plan');
          continue;
        end if;

        -- Otherwise it is a plan node: queue its children (as one array
        -- entry) and strip them from the node itself.
        if item ? 'Plans' then
          worklist := worklist || jsonb_build_array(item->'Plans');
          item := item - 'Plans';
        end if;

        if (item->>'Node Type')::text = 'Incremental Sort' then
          inc_sort_nodes := inc_sort_nodes || item;
        end if;
    end case;
  end loop;

  return inc_sort_nodes;
end;
$$ language plpgsql;
|
|
|
|
-- Returns the Incremental Sort nodes found by explain_analyze_inc_sort_nodes()
-- for the supplied query, with the "Average Sort Space Used" and
-- "Peak Sort Space Used" values under each group/space key replaced by the
-- string "NN". Paths that do not exist in a node are left alone
-- (jsonb_set is called with create_missing = false).
--
-- query: text of the statement to explain, passed through unchanged.
create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
returns jsonb
as
$$
declare
  masked_nodes jsonb := '[]'::jsonb;
  sort_node jsonb;
  grp text;
  spc text;
begin
  for sort_node in
    select t.value
    from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) as t(value)
  loop
    foreach grp in array array['Full-sort Groups', 'Pre-sorted Groups']::text[] loop
      foreach spc in array array['Sort Space Memory', 'Sort Space Disk']::text[] loop
        -- Mask the average first, then the peak, for this group/space pair.
        sort_node := jsonb_set(
          jsonb_set(sort_node, array[grp, spc, 'Average Sort Space Used'], '"NN"', false),
          array[grp, spc, 'Peak Sort Space Used'], '"NN"', false);
      end loop;
    end loop;

    masked_nodes := masked_nodes || sort_node;
  end loop;

  return masked_nodes;
end;
$$ language plpgsql;
|
|
|
|
-- Checks the space statistics of every Incremental Sort node found by
-- explain_analyze_inc_sort_nodes() for the supplied query.
--
-- For each group key ('Full-sort Groups', 'Pre-sorted Groups') and each
-- space key ('Sort Space Memory', 'Sort Space Disk') the reported
-- "Peak Sort Space Used" must not be smaller than the reported
-- "Average Sort Space Used".
--
-- query:   text of the statement to explain, passed through unchanged.
-- returns: true when no violation was found.
-- raises:  an exception naming the offending group key when peak < average.
create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
returns bool language plpgsql
as
$$
declare
  node jsonb;
  group_stats jsonb;
  group_key text;
  space_key text;
begin
  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
    for group_key in select unnest(array['Full-sort Groups', 'Pre-sorted Groups']::text[]) t loop
      group_stats := node->group_key;
      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
        -- Compare peak against average. The previous code compared the peak
        -- with itself, which can never be true, so the invariant was never
        -- actually checked. When a key is absent the comparison yields NULL
        -- and the check is skipped.
        if (group_stats->space_key->'Peak Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
          raise exception '% has invalid max space < average space', group_key;
        end if;
      end loop;
    end loop;
  end loop;
  return true;
end;
$$;
|
|
|
|
-- A single large group tested around each mode transition point.
|
2020-04-08 18:30:11 +02:00
|
|
|
insert into t(a, b) select i/100 + 1, i + 1 from generate_series(0, 999) n(i);
|
|
|
|
analyze t;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
(31 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
1 | 32
|
|
|
|
(32 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
1 | 32
|
|
|
|
1 | 33
|
|
|
|
(33 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
1 | 32
|
|
|
|
1 | 33
|
|
|
|
1 | 34
|
|
|
|
1 | 35
|
|
|
|
1 | 36
|
|
|
|
1 | 37
|
|
|
|
1 | 38
|
|
|
|
1 | 39
|
|
|
|
1 | 40
|
|
|
|
1 | 41
|
|
|
|
1 | 42
|
|
|
|
1 | 43
|
|
|
|
1 | 44
|
|
|
|
1 | 45
|
|
|
|
1 | 46
|
|
|
|
1 | 47
|
|
|
|
1 | 48
|
|
|
|
1 | 49
|
|
|
|
1 | 50
|
|
|
|
1 | 51
|
|
|
|
1 | 52
|
|
|
|
1 | 53
|
|
|
|
1 | 54
|
|
|
|
1 | 55
|
|
|
|
1 | 56
|
|
|
|
1 | 57
|
|
|
|
1 | 58
|
|
|
|
1 | 59
|
|
|
|
1 | 60
|
|
|
|
1 | 61
|
|
|
|
1 | 62
|
|
|
|
1 | 63
|
|
|
|
1 | 64
|
|
|
|
1 | 65
|
|
|
|
(65 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
1 | 32
|
|
|
|
1 | 33
|
|
|
|
1 | 34
|
|
|
|
1 | 35
|
|
|
|
1 | 36
|
|
|
|
1 | 37
|
|
|
|
1 | 38
|
|
|
|
1 | 39
|
|
|
|
1 | 40
|
|
|
|
1 | 41
|
|
|
|
1 | 42
|
|
|
|
1 | 43
|
|
|
|
1 | 44
|
|
|
|
1 | 45
|
|
|
|
1 | 46
|
|
|
|
1 | 47
|
|
|
|
1 | 48
|
|
|
|
1 | 49
|
|
|
|
1 | 50
|
|
|
|
1 | 51
|
|
|
|
1 | 52
|
|
|
|
1 | 53
|
|
|
|
1 | 54
|
|
|
|
1 | 55
|
|
|
|
1 | 56
|
|
|
|
1 | 57
|
|
|
|
1 | 58
|
|
|
|
1 | 59
|
|
|
|
1 | 60
|
|
|
|
1 | 61
|
|
|
|
1 | 62
|
|
|
|
1 | 63
|
|
|
|
1 | 64
|
|
|
|
1 | 65
|
|
|
|
1 | 66
|
|
|
|
(66 rows)
|
|
|
|
|
|
|
|
delete from t;
|
|
|
|
-- An initial large group followed by a small group.
|
2020-04-08 18:30:11 +02:00
|
|
|
insert into t(a, b) select i/50 + 1, i + 1 from generate_series(0, 999) n(i);
|
|
|
|
analyze t;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 55;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
1 | 2
|
|
|
|
1 | 3
|
|
|
|
1 | 4
|
|
|
|
1 | 5
|
|
|
|
1 | 6
|
|
|
|
1 | 7
|
|
|
|
1 | 8
|
|
|
|
1 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
1 | 20
|
|
|
|
1 | 21
|
|
|
|
1 | 22
|
|
|
|
1 | 23
|
|
|
|
1 | 24
|
|
|
|
1 | 25
|
|
|
|
1 | 26
|
|
|
|
1 | 27
|
|
|
|
1 | 28
|
|
|
|
1 | 29
|
|
|
|
1 | 30
|
|
|
|
1 | 31
|
|
|
|
1 | 32
|
|
|
|
1 | 33
|
|
|
|
1 | 34
|
|
|
|
1 | 35
|
|
|
|
1 | 36
|
|
|
|
1 | 37
|
|
|
|
1 | 38
|
|
|
|
1 | 39
|
|
|
|
1 | 40
|
|
|
|
1 | 41
|
|
|
|
1 | 42
|
|
|
|
1 | 43
|
|
|
|
1 | 44
|
|
|
|
1 | 45
|
|
|
|
1 | 46
|
|
|
|
1 | 47
|
|
|
|
1 | 48
|
|
|
|
1 | 49
|
2020-04-08 18:30:11 +02:00
|
|
|
1 | 50
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
2 | 51
|
|
|
|
2 | 52
|
|
|
|
2 | 53
|
|
|
|
2 | 54
|
|
|
|
2 | 55
|
|
|
|
(55 rows)
|
|
|
|
|
|
|
|
-- Test EXPLAIN ANALYZE with only a fullsort group.
|
|
|
|
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55');
|
2020-05-12 20:04:39 +02:00
|
|
|
explain_analyze_without_memory
|
|
|
|
---------------------------------------------------------------------------------------------------------------
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Limit (actual rows=55 loops=1)
|
|
|
|
-> Incremental Sort (actual rows=55 loops=1)
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
2020-05-12 20:04:39 +02:00
|
|
|
Full-sort Groups: 2 Sort Methods: top-N heapsort, quicksort Average Memory: NNkB Peak Memory: NNkB
|
2020-04-08 18:30:11 +02:00
|
|
|
-> Sort (actual rows=101 loops=1)
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Sort Key: t.a
|
|
|
|
Sort Method: quicksort Memory: NNkB
|
2020-04-08 18:30:11 +02:00
|
|
|
-> Seq Scan on t (actual rows=1000 loops=1)
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
(9 rows)
|
|
|
|
|
|
|
|
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55'));
|
2020-04-07 18:03:24 +02:00
|
|
|
jsonb_pretty
|
|
|
|
-------------------------------------------------
|
|
|
|
[ +
|
|
|
|
{ +
|
|
|
|
"Sort Key": [ +
|
|
|
|
"t.a", +
|
|
|
|
"t.b" +
|
|
|
|
], +
|
|
|
|
"Node Type": "Incremental Sort", +
|
|
|
|
"Actual Rows": 55, +
|
|
|
|
"Actual Loops": 1, +
|
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a
non-parallel-aware Append concurrently rather than serially to improve
performance when possible. Currently, the only node type that can be
run concurrently is a ForeignScan that is an immediate child of such an
Append. In the case where such ForeignScans access data on different
remote servers, this would run those ForeignScans concurrently, and
overlap the remote operations to be performed simultaneously, so it'll
improve the performance especially when the operations involve
time-consuming ones such as remote join and remote aggregation.
We may extend this to other node types such as joins or aggregates over
ForeignScans in the future.
This also adds the support for postgres_fdw, which is enabled by the
table-level/server-level option "async_capable". The default is false.
Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit
is mostly based on the patch proposed by Robert Haas, but also uses
stuff from the patch proposed by Kyotaro Horiguchi and from the patch
proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin
Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and
others.
Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com
Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
|
|
|
"Async Capable": false, +
|
2020-04-07 18:03:24 +02:00
|
|
|
"Presorted Key": [ +
|
|
|
|
"t.a" +
|
|
|
|
], +
|
|
|
|
"Parallel Aware": false, +
|
|
|
|
"Full-sort Groups": { +
|
|
|
|
"Group Count": 2, +
|
|
|
|
"Sort Methods Used": [ +
|
|
|
|
"top-N heapsort", +
|
|
|
|
"quicksort" +
|
|
|
|
], +
|
|
|
|
"Sort Space Memory": { +
|
|
|
|
"Peak Sort Space Used": "NN", +
|
|
|
|
"Average Sort Space Used": "NN"+
|
|
|
|
} +
|
|
|
|
}, +
|
|
|
|
"Parent Relationship": "Outer" +
|
|
|
|
} +
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
]
|
|
|
|
(1 row)
|
|
|
|
|
|
|
|
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55');
|
|
|
|
explain_analyze_inc_sort_nodes_verify_invariants
|
|
|
|
--------------------------------------------------
|
|
|
|
t
|
|
|
|
(1 row)
|
|
|
|
|
|
|
|
delete from t;
|
|
|
|
-- An initial small group followed by a large group.
|
2020-04-08 18:30:11 +02:00
|
|
|
insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 1000) n(i);
|
|
|
|
analyze t;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 70;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 70;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
9 | 5
|
|
|
|
9 | 6
|
|
|
|
9 | 7
|
|
|
|
9 | 8
|
|
|
|
9 | 9
|
|
|
|
9 | 10
|
|
|
|
9 | 11
|
|
|
|
9 | 12
|
|
|
|
9 | 13
|
|
|
|
9 | 14
|
|
|
|
9 | 15
|
|
|
|
9 | 16
|
|
|
|
9 | 17
|
|
|
|
9 | 18
|
|
|
|
9 | 19
|
|
|
|
9 | 20
|
|
|
|
9 | 21
|
|
|
|
9 | 22
|
|
|
|
9 | 23
|
|
|
|
9 | 24
|
|
|
|
9 | 25
|
|
|
|
9 | 26
|
|
|
|
9 | 27
|
|
|
|
9 | 28
|
|
|
|
9 | 29
|
|
|
|
9 | 30
|
|
|
|
9 | 31
|
|
|
|
9 | 32
|
|
|
|
9 | 33
|
|
|
|
9 | 34
|
|
|
|
9 | 35
|
|
|
|
9 | 36
|
|
|
|
9 | 37
|
|
|
|
9 | 38
|
|
|
|
9 | 39
|
|
|
|
9 | 40
|
|
|
|
9 | 41
|
|
|
|
9 | 42
|
|
|
|
9 | 43
|
|
|
|
9 | 44
|
|
|
|
9 | 45
|
|
|
|
9 | 46
|
|
|
|
9 | 47
|
|
|
|
9 | 48
|
|
|
|
9 | 49
|
|
|
|
9 | 50
|
|
|
|
9 | 51
|
|
|
|
9 | 52
|
|
|
|
9 | 53
|
|
|
|
9 | 54
|
|
|
|
9 | 55
|
|
|
|
9 | 56
|
|
|
|
9 | 57
|
|
|
|
9 | 58
|
|
|
|
9 | 59
|
|
|
|
9 | 60
|
|
|
|
9 | 61
|
|
|
|
9 | 62
|
|
|
|
9 | 63
|
|
|
|
9 | 64
|
|
|
|
9 | 65
|
|
|
|
9 | 66
|
|
|
|
9 | 67
|
|
|
|
9 | 68
|
|
|
|
9 | 69
|
|
|
|
9 | 70
|
|
|
|
(70 rows)
|
|
|
|
|
2021-02-05 01:12:09 +01:00
|
|
|
-- Checks case where we hit a group boundary at the last tuple of a batch.
|
2021-02-15 16:17:58 +01:00
|
|
|
-- Because the full sort state is bounded, we scan 64 tuples (the mode
|
|
|
|
-- transition point) but only retain 5. Thus when we transition modes, all
|
|
|
|
-- tuples in the full sort state have different prefix keys.
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 5;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
2021-02-05 01:12:09 +01:00
|
|
|
select * from (select * from t order by a) s order by a, b limit 5;
|
|
|
|
a | b
|
|
|
|
---+---
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
9 | 5
|
|
|
|
(5 rows)
|
|
|
|
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
-- Test rescan.
|
|
|
|
begin;
|
|
|
|
-- We force the planner to choose a plan with incremental sort on the right side
|
|
|
|
-- of a nested loop join node. That way we trigger the rescan code path.
|
|
|
|
set local enable_hashjoin = off;
|
|
|
|
set local enable_mergejoin = off;
|
|
|
|
set local enable_material = off;
|
|
|
|
set local enable_sort = off;
|
|
|
|
explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
|
|
|
|
QUERY PLAN
|
|
|
|
------------------------------------------------
|
|
|
|
Nested Loop Left Join
|
|
|
|
Join Filter: (t_1.a = t.a)
|
|
|
|
-> Seq Scan on t
|
|
|
|
Filter: (a = ANY ('{1,2}'::integer[]))
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t_1.a, t_1.b
|
|
|
|
Presorted Key: t_1.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t_1.a
|
|
|
|
-> Seq Scan on t t_1
|
|
|
|
(10 rows)
|
|
|
|
|
|
|
|
select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
|
|
|
|
a | b | a | b
|
|
|
|
---+---+---+---
|
|
|
|
1 | 1 | 1 | 1
|
|
|
|
2 | 2 | 2 | 2
|
|
|
|
(2 rows)
|
|
|
|
|
|
|
|
rollback;
|
|
|
|
-- Test EXPLAIN ANALYZE with both fullsort and presorted groups.
|
|
|
|
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70');
|
2020-05-12 20:04:39 +02:00
|
|
|
explain_analyze_without_memory
|
|
|
|
----------------------------------------------------------------------------------------------------------------
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Limit (actual rows=70 loops=1)
|
|
|
|
-> Incremental Sort (actual rows=70 loops=1)
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
2020-05-12 20:04:39 +02:00
|
|
|
Full-sort Groups: 1 Sort Method: quicksort Average Memory: NNkB Peak Memory: NNkB
|
|
|
|
Pre-sorted Groups: 5 Sort Methods: top-N heapsort, quicksort Average Memory: NNkB Peak Memory: NNkB
|
2020-04-08 18:30:11 +02:00
|
|
|
-> Sort (actual rows=1000 loops=1)
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Sort Key: t.a
|
|
|
|
Sort Method: quicksort Memory: NNkB
|
2020-04-08 18:30:11 +02:00
|
|
|
-> Seq Scan on t (actual rows=1000 loops=1)
|
2020-05-12 20:04:39 +02:00
|
|
|
(10 rows)
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
|
|
|
|
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70'));
|
2020-04-07 18:03:24 +02:00
|
|
|
jsonb_pretty
|
|
|
|
-------------------------------------------------
|
|
|
|
[ +
|
|
|
|
{ +
|
|
|
|
"Sort Key": [ +
|
|
|
|
"t.a", +
|
|
|
|
"t.b" +
|
|
|
|
], +
|
|
|
|
"Node Type": "Incremental Sort", +
|
|
|
|
"Actual Rows": 70, +
|
|
|
|
"Actual Loops": 1, +
|
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a
non-parallel-aware Append concurrently rather than serially to improve
performance when possible. Currently, the only node type that can be
run concurrently is a ForeignScan that is an immediate child of such an
Append. In the case where such ForeignScans access data on different
remote servers, this would run those ForeignScans concurrently, and
overlap the remote operations to be performed simultaneously, so it'll
improve the performance especially when the operations involve
time-consuming ones such as remote join and remote aggregation.
We may extend this to other node types such as joins or aggregates over
ForeignScans in the future.
This also adds the support for postgres_fdw, which is enabled by the
table-level/server-level option "async_capable". The default is false.
Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit
is mostly based on the patch proposed by Robert Haas, but also uses
stuff from the patch proposed by Kyotaro Horiguchi and from the patch
proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin
Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and
others.
Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com
Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
|
|
|
"Async Capable": false, +
|
2020-04-07 18:03:24 +02:00
|
|
|
"Presorted Key": [ +
|
|
|
|
"t.a" +
|
|
|
|
], +
|
|
|
|
"Parallel Aware": false, +
|
|
|
|
"Full-sort Groups": { +
|
|
|
|
"Group Count": 1, +
|
|
|
|
"Sort Methods Used": [ +
|
|
|
|
"quicksort" +
|
|
|
|
], +
|
|
|
|
"Sort Space Memory": { +
|
|
|
|
"Peak Sort Space Used": "NN", +
|
|
|
|
"Average Sort Space Used": "NN"+
|
|
|
|
} +
|
|
|
|
}, +
|
2020-05-12 20:04:39 +02:00
|
|
|
"Pre-sorted Groups": { +
|
2020-04-07 18:03:24 +02:00
|
|
|
"Group Count": 5, +
|
|
|
|
"Sort Methods Used": [ +
|
2020-04-08 18:30:11 +02:00
|
|
|
"top-N heapsort", +
|
2020-04-07 18:03:24 +02:00
|
|
|
"quicksort" +
|
|
|
|
], +
|
|
|
|
"Sort Space Memory": { +
|
|
|
|
"Peak Sort Space Used": "NN", +
|
|
|
|
"Average Sort Space Used": "NN"+
|
|
|
|
} +
|
|
|
|
}, +
|
|
|
|
"Parent Relationship": "Outer" +
|
|
|
|
} +
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
]
|
|
|
|
(1 row)
|
|
|
|
|
|
|
|
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70');
|
|
|
|
explain_analyze_inc_sort_nodes_verify_invariants
|
|
|
|
--------------------------------------------------
|
|
|
|
t
|
|
|
|
(1 row)
|
|
|
|
|
|
|
|
delete from t;
|
|
|
|
-- Small groups of 10 tuples each tested around each mode transition point.
|
2020-04-08 18:30:11 +02:00
|
|
|
insert into t(a, b) select i / 10, i from generate_series(1, 1000) n(i);
|
|
|
|
analyze t;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
0 | 1
|
|
|
|
0 | 2
|
|
|
|
0 | 3
|
|
|
|
0 | 4
|
|
|
|
0 | 5
|
|
|
|
0 | 6
|
|
|
|
0 | 7
|
|
|
|
0 | 8
|
|
|
|
0 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
2 | 20
|
|
|
|
2 | 21
|
|
|
|
2 | 22
|
|
|
|
2 | 23
|
|
|
|
2 | 24
|
|
|
|
2 | 25
|
|
|
|
2 | 26
|
|
|
|
2 | 27
|
|
|
|
2 | 28
|
|
|
|
2 | 29
|
|
|
|
3 | 30
|
|
|
|
3 | 31
|
|
|
|
(31 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
0 | 1
|
|
|
|
0 | 2
|
|
|
|
0 | 3
|
|
|
|
0 | 4
|
|
|
|
0 | 5
|
|
|
|
0 | 6
|
|
|
|
0 | 7
|
|
|
|
0 | 8
|
|
|
|
0 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
2 | 20
|
|
|
|
2 | 21
|
|
|
|
2 | 22
|
|
|
|
2 | 23
|
|
|
|
2 | 24
|
|
|
|
2 | 25
|
|
|
|
2 | 26
|
|
|
|
2 | 27
|
|
|
|
2 | 28
|
|
|
|
2 | 29
|
|
|
|
3 | 30
|
|
|
|
3 | 31
|
|
|
|
3 | 32
|
|
|
|
(32 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
0 | 1
|
|
|
|
0 | 2
|
|
|
|
0 | 3
|
|
|
|
0 | 4
|
|
|
|
0 | 5
|
|
|
|
0 | 6
|
|
|
|
0 | 7
|
|
|
|
0 | 8
|
|
|
|
0 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
2 | 20
|
|
|
|
2 | 21
|
|
|
|
2 | 22
|
|
|
|
2 | 23
|
|
|
|
2 | 24
|
|
|
|
2 | 25
|
|
|
|
2 | 26
|
|
|
|
2 | 27
|
|
|
|
2 | 28
|
|
|
|
2 | 29
|
|
|
|
3 | 30
|
|
|
|
3 | 31
|
|
|
|
3 | 32
|
|
|
|
3 | 33
|
|
|
|
(33 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
0 | 1
|
|
|
|
0 | 2
|
|
|
|
0 | 3
|
|
|
|
0 | 4
|
|
|
|
0 | 5
|
|
|
|
0 | 6
|
|
|
|
0 | 7
|
|
|
|
0 | 8
|
|
|
|
0 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
2 | 20
|
|
|
|
2 | 21
|
|
|
|
2 | 22
|
|
|
|
2 | 23
|
|
|
|
2 | 24
|
|
|
|
2 | 25
|
|
|
|
2 | 26
|
|
|
|
2 | 27
|
|
|
|
2 | 28
|
|
|
|
2 | 29
|
|
|
|
3 | 30
|
|
|
|
3 | 31
|
|
|
|
3 | 32
|
|
|
|
3 | 33
|
|
|
|
3 | 34
|
|
|
|
3 | 35
|
|
|
|
3 | 36
|
|
|
|
3 | 37
|
|
|
|
3 | 38
|
|
|
|
3 | 39
|
|
|
|
4 | 40
|
|
|
|
4 | 41
|
|
|
|
4 | 42
|
|
|
|
4 | 43
|
|
|
|
4 | 44
|
|
|
|
4 | 45
|
|
|
|
4 | 46
|
|
|
|
4 | 47
|
|
|
|
4 | 48
|
|
|
|
4 | 49
|
|
|
|
5 | 50
|
|
|
|
5 | 51
|
|
|
|
5 | 52
|
|
|
|
5 | 53
|
|
|
|
5 | 54
|
|
|
|
5 | 55
|
|
|
|
5 | 56
|
|
|
|
5 | 57
|
|
|
|
5 | 58
|
|
|
|
5 | 59
|
|
|
|
6 | 60
|
|
|
|
6 | 61
|
|
|
|
6 | 62
|
|
|
|
6 | 63
|
|
|
|
6 | 64
|
|
|
|
6 | 65
|
|
|
|
(65 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
a | b
|
|
|
|
---+----
|
|
|
|
0 | 1
|
|
|
|
0 | 2
|
|
|
|
0 | 3
|
|
|
|
0 | 4
|
|
|
|
0 | 5
|
|
|
|
0 | 6
|
|
|
|
0 | 7
|
|
|
|
0 | 8
|
|
|
|
0 | 9
|
|
|
|
1 | 10
|
|
|
|
1 | 11
|
|
|
|
1 | 12
|
|
|
|
1 | 13
|
|
|
|
1 | 14
|
|
|
|
1 | 15
|
|
|
|
1 | 16
|
|
|
|
1 | 17
|
|
|
|
1 | 18
|
|
|
|
1 | 19
|
|
|
|
2 | 20
|
|
|
|
2 | 21
|
|
|
|
2 | 22
|
|
|
|
2 | 23
|
|
|
|
2 | 24
|
|
|
|
2 | 25
|
|
|
|
2 | 26
|
|
|
|
2 | 27
|
|
|
|
2 | 28
|
|
|
|
2 | 29
|
|
|
|
3 | 30
|
|
|
|
3 | 31
|
|
|
|
3 | 32
|
|
|
|
3 | 33
|
|
|
|
3 | 34
|
|
|
|
3 | 35
|
|
|
|
3 | 36
|
|
|
|
3 | 37
|
|
|
|
3 | 38
|
|
|
|
3 | 39
|
|
|
|
4 | 40
|
|
|
|
4 | 41
|
|
|
|
4 | 42
|
|
|
|
4 | 43
|
|
|
|
4 | 44
|
|
|
|
4 | 45
|
|
|
|
4 | 46
|
|
|
|
4 | 47
|
|
|
|
4 | 48
|
|
|
|
4 | 49
|
|
|
|
5 | 50
|
|
|
|
5 | 51
|
|
|
|
5 | 52
|
|
|
|
5 | 53
|
|
|
|
5 | 54
|
|
|
|
5 | 55
|
|
|
|
5 | 56
|
|
|
|
5 | 57
|
|
|
|
5 | 58
|
|
|
|
5 | 59
|
|
|
|
6 | 60
|
|
|
|
6 | 61
|
|
|
|
6 | 62
|
|
|
|
6 | 63
|
|
|
|
6 | 64
|
|
|
|
6 | 65
|
|
|
|
6 | 66
|
|
|
|
(66 rows)
|
|
|
|
|
|
|
|
delete from t;
|
|
|
|
-- Small groups of only 1 tuple each tested around each mode transition point.
|
2020-04-08 18:30:11 +02:00
|
|
|
insert into t(a, b) select i, i from generate_series(1, 1000) n(i);
|
|
|
|
analyze t;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 31;
|
|
|
|
a | b
|
|
|
|
----+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
5 | 5
|
|
|
|
6 | 6
|
|
|
|
7 | 7
|
|
|
|
8 | 8
|
|
|
|
9 | 9
|
|
|
|
10 | 10
|
|
|
|
11 | 11
|
|
|
|
12 | 12
|
|
|
|
13 | 13
|
|
|
|
14 | 14
|
|
|
|
15 | 15
|
|
|
|
16 | 16
|
|
|
|
17 | 17
|
|
|
|
18 | 18
|
|
|
|
19 | 19
|
|
|
|
20 | 20
|
|
|
|
21 | 21
|
|
|
|
22 | 22
|
|
|
|
23 | 23
|
|
|
|
24 | 24
|
|
|
|
25 | 25
|
|
|
|
26 | 26
|
|
|
|
27 | 27
|
|
|
|
28 | 28
|
|
|
|
29 | 29
|
|
|
|
30 | 30
|
|
|
|
31 | 31
|
|
|
|
(31 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 32;
|
|
|
|
a | b
|
|
|
|
----+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
5 | 5
|
|
|
|
6 | 6
|
|
|
|
7 | 7
|
|
|
|
8 | 8
|
|
|
|
9 | 9
|
|
|
|
10 | 10
|
|
|
|
11 | 11
|
|
|
|
12 | 12
|
|
|
|
13 | 13
|
|
|
|
14 | 14
|
|
|
|
15 | 15
|
|
|
|
16 | 16
|
|
|
|
17 | 17
|
|
|
|
18 | 18
|
|
|
|
19 | 19
|
|
|
|
20 | 20
|
|
|
|
21 | 21
|
|
|
|
22 | 22
|
|
|
|
23 | 23
|
|
|
|
24 | 24
|
|
|
|
25 | 25
|
|
|
|
26 | 26
|
|
|
|
27 | 27
|
|
|
|
28 | 28
|
|
|
|
29 | 29
|
|
|
|
30 | 30
|
|
|
|
31 | 31
|
|
|
|
32 | 32
|
|
|
|
(32 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 33;
|
|
|
|
a | b
|
|
|
|
----+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
5 | 5
|
|
|
|
6 | 6
|
|
|
|
7 | 7
|
|
|
|
8 | 8
|
|
|
|
9 | 9
|
|
|
|
10 | 10
|
|
|
|
11 | 11
|
|
|
|
12 | 12
|
|
|
|
13 | 13
|
|
|
|
14 | 14
|
|
|
|
15 | 15
|
|
|
|
16 | 16
|
|
|
|
17 | 17
|
|
|
|
18 | 18
|
|
|
|
19 | 19
|
|
|
|
20 | 20
|
|
|
|
21 | 21
|
|
|
|
22 | 22
|
|
|
|
23 | 23
|
|
|
|
24 | 24
|
|
|
|
25 | 25
|
|
|
|
26 | 26
|
|
|
|
27 | 27
|
|
|
|
28 | 28
|
|
|
|
29 | 29
|
|
|
|
30 | 30
|
|
|
|
31 | 31
|
|
|
|
32 | 32
|
|
|
|
33 | 33
|
|
|
|
(33 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 65;
|
|
|
|
a | b
|
|
|
|
----+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
5 | 5
|
|
|
|
6 | 6
|
|
|
|
7 | 7
|
|
|
|
8 | 8
|
|
|
|
9 | 9
|
|
|
|
10 | 10
|
|
|
|
11 | 11
|
|
|
|
12 | 12
|
|
|
|
13 | 13
|
|
|
|
14 | 14
|
|
|
|
15 | 15
|
|
|
|
16 | 16
|
|
|
|
17 | 17
|
|
|
|
18 | 18
|
|
|
|
19 | 19
|
|
|
|
20 | 20
|
|
|
|
21 | 21
|
|
|
|
22 | 22
|
|
|
|
23 | 23
|
|
|
|
24 | 24
|
|
|
|
25 | 25
|
|
|
|
26 | 26
|
|
|
|
27 | 27
|
|
|
|
28 | 28
|
|
|
|
29 | 29
|
|
|
|
30 | 30
|
|
|
|
31 | 31
|
|
|
|
32 | 32
|
|
|
|
33 | 33
|
|
|
|
34 | 34
|
|
|
|
35 | 35
|
|
|
|
36 | 36
|
|
|
|
37 | 37
|
|
|
|
38 | 38
|
|
|
|
39 | 39
|
|
|
|
40 | 40
|
|
|
|
41 | 41
|
|
|
|
42 | 42
|
|
|
|
43 | 43
|
|
|
|
44 | 44
|
|
|
|
45 | 45
|
|
|
|
46 | 46
|
|
|
|
47 | 47
|
|
|
|
48 | 48
|
|
|
|
49 | 49
|
|
|
|
50 | 50
|
|
|
|
51 | 51
|
|
|
|
52 | 52
|
|
|
|
53 | 53
|
|
|
|
54 | 54
|
|
|
|
55 | 55
|
|
|
|
56 | 56
|
|
|
|
57 | 57
|
|
|
|
58 | 58
|
|
|
|
59 | 59
|
|
|
|
60 | 60
|
|
|
|
61 | 61
|
|
|
|
62 | 62
|
|
|
|
63 | 63
|
|
|
|
64 | 64
|
|
|
|
65 | 65
|
|
|
|
(65 rows)
|
|
|
|
|
|
|
|
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------
|
|
|
|
Limit
|
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: t.a, t.b
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a
|
|
|
|
-> Seq Scan on t
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
select * from (select * from t order by a) s order by a, b limit 66;
|
|
|
|
a | b
|
|
|
|
----+----
|
|
|
|
1 | 1
|
|
|
|
2 | 2
|
|
|
|
3 | 3
|
|
|
|
4 | 4
|
|
|
|
5 | 5
|
|
|
|
6 | 6
|
|
|
|
7 | 7
|
|
|
|
8 | 8
|
|
|
|
9 | 9
|
|
|
|
10 | 10
|
|
|
|
11 | 11
|
|
|
|
12 | 12
|
|
|
|
13 | 13
|
|
|
|
14 | 14
|
|
|
|
15 | 15
|
|
|
|
16 | 16
|
|
|
|
17 | 17
|
|
|
|
18 | 18
|
|
|
|
19 | 19
|
|
|
|
20 | 20
|
|
|
|
21 | 21
|
|
|
|
22 | 22
|
|
|
|
23 | 23
|
|
|
|
24 | 24
|
|
|
|
25 | 25
|
|
|
|
26 | 26
|
|
|
|
27 | 27
|
|
|
|
28 | 28
|
|
|
|
29 | 29
|
|
|
|
30 | 30
|
|
|
|
31 | 31
|
|
|
|
32 | 32
|
|
|
|
33 | 33
|
|
|
|
34 | 34
|
|
|
|
35 | 35
|
|
|
|
36 | 36
|
|
|
|
37 | 37
|
|
|
|
38 | 38
|
|
|
|
39 | 39
|
|
|
|
40 | 40
|
|
|
|
41 | 41
|
|
|
|
42 | 42
|
|
|
|
43 | 43
|
|
|
|
44 | 44
|
|
|
|
45 | 45
|
|
|
|
46 | 46
|
|
|
|
47 | 47
|
|
|
|
48 | 48
|
|
|
|
49 | 49
|
|
|
|
50 | 50
|
|
|
|
51 | 51
|
|
|
|
52 | 52
|
|
|
|
53 | 53
|
|
|
|
54 | 54
|
|
|
|
55 | 55
|
|
|
|
56 | 56
|
|
|
|
57 | 57
|
|
|
|
58 | 58
|
|
|
|
59 | 59
|
|
|
|
60 | 60
|
|
|
|
61 | 61
|
|
|
|
62 | 62
|
|
|
|
63 | 63
|
|
|
|
64 | 64
|
|
|
|
65 | 65
|
|
|
|
66 | 66
|
|
|
|
(66 rows)
|
|
|
|
|
|
|
|
delete from t;
|
|
|
|
drop table t;
|
|
|
|
-- Incremental sort vs. parallel queries
|
|
|
|
set min_parallel_table_scan_size = '1kB';
|
|
|
|
set min_parallel_index_scan_size = '1kB';
|
|
|
|
set parallel_setup_cost = 0;
|
|
|
|
set parallel_tuple_cost = 0;
|
2020-04-06 23:58:10 +02:00
|
|
|
set max_parallel_workers_per_gather = 2;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
create table t (a int, b int, c int);
|
Revert "Optimize order of GROUP BY keys".
This reverts commit db0d67db2401eb6238ccc04c6407a4fd4f985832 and
several follow-on fixes. The idea of making a cost-based choice
of the order of the sorting columns is not fundamentally unsound,
but it requires cost information and data statistics that we don't
really have. For example, relying on procost to distinguish the
relative costs of different sort comparators is pretty pointless
so long as most such comparator functions are labeled with cost 1.0.
Moreover, estimating the number of comparisons done by Quicksort
requires more than just an estimate of the number of distinct values
in the input: you also need some idea of the sizes of the larger
groups, if you want an estimate that's good to better than a factor of
three or so. That's data that's often unknown or not very reliable.
Worse, to arrive at estimates of the number of calls made to the
lower-order-column comparison functions, the code needs to make
estimates of the numbers of distinct values of multiple columns,
which are necessarily even less trustworthy than per-column stats.
Even if all the inputs are perfectly reliable, the cost algorithm
as-implemented cannot offer useful information about how to order
sorting columns beyond the point at which the average group size
is estimated to drop to 1.
Close inspection of the code added by db0d67db2 shows that there
are also multiple small bugs. These could have been fixed, but
there's not much point if we don't trust the estimates to be
accurate in-principle.
Finally, the changes in cost_sort's behavior made for very large
changes (often a factor of 2 or so) in the cost estimates for all
sorting operations, not only those for multi-column GROUP BY.
That naturally changes plan choices in many situations, and there's
precious little evidence to show that the changes are for the better.
Given the above doubts about whether the new estimates are really
trustworthy, it's hard to summon much confidence that these changes
are better on the average.
Since we're hard up against the release deadline for v15, let's
revert these changes for now. We can always try again later.
Note: in v15, I left T_PathKeyInfo in place in nodes.h even though
it's unreferenced. Removing it would be an ABI break, and it seems
a bit late in the release cycle for that.
Discussion: https://postgr.es/m/TYAPR01MB586665EB5FB2C3807E893941F5579@TYAPR01MB5866.jpnprd01.prod.outlook.com
2022-10-03 16:56:16 +02:00
|
|
|
insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
create index on t (a);
|
|
|
|
analyze t;
|
2020-07-05 11:41:52 +02:00
|
|
|
set enable_incremental_sort = off;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
|
|
|
|
QUERY PLAN
|
|
|
|
------------------------------------------------------
|
|
|
|
Limit
|
|
|
|
-> Sort
|
|
|
|
Sort Key: a, b, (sum(c))
|
|
|
|
-> Finalize HashAggregate
|
|
|
|
Group Key: a, b
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Partial HashAggregate
|
|
|
|
Group Key: a, b
|
|
|
|
-> Parallel Seq Scan on t
|
|
|
|
(10 rows)
|
|
|
|
|
2020-07-05 11:41:52 +02:00
|
|
|
set enable_incremental_sort = on;
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
|
2020-04-07 16:43:18 +02:00
|
|
|
QUERY PLAN
|
|
|
|
----------------------------------------------------------------------
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Limit
|
2020-04-07 16:43:18 +02:00
|
|
|
-> Incremental Sort
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Sort Key: a, b, (sum(c))
|
2020-04-07 16:43:18 +02:00
|
|
|
Presorted Key: a, b
|
|
|
|
-> GroupAggregate
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Group Key: a, b
|
2020-04-07 16:43:18 +02:00
|
|
|
-> Gather Merge
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
Workers Planned: 2
|
2020-04-07 16:43:18 +02:00
|
|
|
-> Incremental Sort
|
|
|
|
Sort Key: a, b
|
|
|
|
Presorted Key: a
|
|
|
|
-> Parallel Index Scan using t_a_idx on t
|
|
|
|
(12 rows)
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
|
2020-04-23 00:15:24 +02:00
|
|
|
-- Incremental sort vs. set operations with varno 0
|
|
|
|
set enable_hashagg to off;
|
|
|
|
explain (costs off) select * from t union select * from t order by 1,3;
|
|
|
|
QUERY PLAN
|
|
|
|
----------------------------------------------------------
|
|
|
|
Incremental Sort
|
|
|
|
Sort Key: t.a, t.c
|
|
|
|
Presorted Key: t.a
|
|
|
|
-> Unique
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.a, t.b, t.c
|
Fix rowcount estimate for SubqueryScan that's under a Gather.
SubqueryScan was always getting labeled with a rowcount estimate
appropriate for non-parallel cases. However, nodes that are
underneath a Gather should be treated as processing only one
worker's share of the rows, whether the particular node is explicitly
parallel-aware or not. Most non-scan-level node types get this
right automatically because they base their rowcount estimate on
that of their input sub-Path(s). But SubqueryScan didn't do that,
instead using the whole-relation rowcount estimate as if it were
a non-parallel-aware scan node. If there is a parallel-aware node
below the SubqueryScan, this is wrong, and it results in inflating
the cost estimates for nodes above the SubqueryScan, which can cause
us to not choose a parallel plan, or choose a silly one --- as indeed
is visible in the one regression test whose results change with this
patch. (Although that plan tree appears to contain no SubqueryScans,
there were some in it before setrefs.c deleted them.)
To fix, use path->subpath->rows not baserel->tuples as the number
of input tuples we'll process. This requires estimating the quals'
selectivity afresh, which is slightly annoying; but it shouldn't
really add much cost thanks to the caching done in RestrictInfo.
This is pretty clearly a bug fix, but I'll refrain from back-patching
as people might not appreciate plan choices changing in stable branches.
The fact that it took us this long to identify the bug suggests that
it's not a major problem.
Per report from bucoo, though this is not his proposed patch.
Discussion: https://postgr.es/m/202204121457159307248@sohu.com
2022-05-04 20:44:40 +02:00
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Parallel Append
|
2020-04-23 00:15:24 +02:00
|
|
|
-> Parallel Seq Scan on t
|
|
|
|
-> Parallel Seq Scan on t t_1
|
Fix rowcount estimate for SubqueryScan that's under a Gather.
SubqueryScan was always getting labeled with a rowcount estimate
appropriate for non-parallel cases. However, nodes that are
underneath a Gather should be treated as processing only one
worker's share of the rows, whether the particular node is explicitly
parallel-aware or not. Most non-scan-level node types get this
right automatically because they base their rowcount estimate on
that of their input sub-Path(s). But SubqueryScan didn't do that,
instead using the whole-relation rowcount estimate as if it were
a non-parallel-aware scan node. If there is a parallel-aware node
below the SubqueryScan, this is wrong, and it results in inflating
the cost estimates for nodes above the SubqueryScan, which can cause
us to not choose a parallel plan, or choose a silly one --- as indeed
is visible in the one regression test whose results change with this
patch. (Although that plan tree appears to contain no SubqueryScans,
there were some in it before setrefs.c deleted them.)
To fix, use path->subpath->rows not baserel->tuples as the number
of input tuples we'll process. This requires estimating the quals'
selectivity afresh, which is slightly annoying; but it shouldn't
really add much cost thanks to the caching done in RestrictInfo.
This is pretty clearly a bug fix, but I'll refrain from back-patching
as people might not appreciate plan choices changing in stable branches.
The fact that it took us this long to identify the bug suggests that
it's not a major problem.
Per report from bucoo, though this is not his proposed patch.
Discussion: https://postgr.es/m/202204121457159307248@sohu.com
2022-05-04 20:44:40 +02:00
|
|
|
(11 rows)
|
2020-04-23 00:15:24 +02:00
|
|
|
|
2020-12-21 18:09:57 +01:00
|
|
|
-- Full sort, not just incremental sort can be pushed below a gather merge path
|
|
|
|
-- by generate_useful_gather_paths.
|
|
|
|
explain (costs off) select distinct a,b from t;
|
2023-01-10 22:25:43 +01:00
|
|
|
QUERY PLAN
|
|
|
|
------------------------------------------------
|
2020-12-21 18:09:57 +01:00
|
|
|
Unique
|
|
|
|
-> Gather Merge
|
|
|
|
Workers Planned: 2
|
2023-01-10 22:25:43 +01:00
|
|
|
-> Unique
|
|
|
|
-> Sort
|
|
|
|
Sort Key: a, b
|
|
|
|
-> Parallel Seq Scan on t
|
|
|
|
(7 rows)
|
2020-12-21 18:09:57 +01:00
|
|
|
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
drop table t;
|
2020-11-03 20:07:23 +01:00
|
|
|
-- Sort pushdown can't go below where expressions are part of the rel target.
|
|
|
|
-- In particular this is interesting for volatile expressions which have to
|
|
|
|
-- go above joins since otherwise we'll incorrectly use expression evaluations
|
|
|
|
-- across multiple rows.
|
|
|
|
set enable_hashagg=off;
|
|
|
|
set enable_seqscan=off;
|
|
|
|
set enable_incremental_sort = off;
|
|
|
|
set parallel_tuple_cost=0;
|
|
|
|
set parallel_setup_cost=0;
|
|
|
|
set min_parallel_table_scan_size = 0;
|
|
|
|
set min_parallel_index_scan_size = 0;
|
|
|
|
-- Parallel sort below join.
|
|
|
|
explain (costs off) select distinct sub.unique1, stringu1
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
|
|
|
|
QUERY PLAN
|
|
|
|
--------------------------------------------------------------------------
|
|
|
|
Unique
|
|
|
|
-> Nested Loop
|
|
|
|
-> Gather Merge
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.unique1, tenk1.stringu1
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(8 rows)
|
|
|
|
|
|
|
|
explain (costs off) select sub.unique1, stringu1
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
|
|
|
|
order by 1, 2;
|
|
|
|
QUERY PLAN
|
|
|
|
--------------------------------------------------------------------
|
|
|
|
Nested Loop
|
|
|
|
-> Gather Merge
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.unique1, tenk1.stringu1
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(7 rows)
|
|
|
|
|
|
|
|
-- Parallel sort but with expression that can be safely generated at the base rel.
|
|
|
|
explain (costs off) select distinct sub.unique1, md5(stringu1)
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
|
|
|
|
QUERY PLAN
|
|
|
|
----------------------------------------------------------------------------------------
|
|
|
|
Unique
|
|
|
|
-> Nested Loop
|
|
|
|
-> Gather Merge
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.unique1, (md5((tenk1.stringu1)::text)) COLLATE "C"
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(8 rows)
|
|
|
|
|
|
|
|
explain (costs off) select sub.unique1, md5(stringu1)
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
|
|
|
|
order by 1, 2;
|
|
|
|
QUERY PLAN
|
|
|
|
----------------------------------------------------------------------------------
|
|
|
|
Nested Loop
|
|
|
|
-> Gather Merge
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.unique1, (md5((tenk1.stringu1)::text)) COLLATE "C"
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(7 rows)
|
|
|
|
|
2021-04-20 17:32:02 +02:00
|
|
|
-- Parallel sort with an aggregate that can be safely generated in parallel,
|
|
|
|
-- but we can't sort by partial aggregate values.
|
|
|
|
explain (costs off) select count(*)
|
|
|
|
from tenk1 t1
|
|
|
|
join tenk1 t2 on t1.unique1 = t2.unique2
|
|
|
|
join tenk1 t3 on t2.unique1 = t3.unique1
|
|
|
|
order by count(*);
|
|
|
|
QUERY PLAN
|
|
|
|
-----------------------------------------------------------------------------------------------
|
|
|
|
Sort
|
|
|
|
Sort Key: (count(*))
|
|
|
|
-> Finalize Aggregate
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Partial Aggregate
|
|
|
|
-> Parallel Hash Join
|
|
|
|
Hash Cond: (t2.unique1 = t3.unique1)
|
|
|
|
-> Parallel Hash Join
|
|
|
|
Hash Cond: (t1.unique1 = t2.unique2)
|
|
|
|
-> Parallel Index Only Scan using tenk1_unique1 on tenk1 t1
|
|
|
|
-> Parallel Hash
|
|
|
|
-> Parallel Index Scan using tenk1_unique2 on tenk1 t2
|
|
|
|
-> Parallel Hash
|
|
|
|
-> Parallel Index Only Scan using tenk1_unique1 on tenk1 t3
|
|
|
|
(15 rows)
|
|
|
|
|
2020-12-21 18:29:46 +01:00
|
|
|
-- Parallel sort but with expression (correlated subquery) that
|
|
|
|
-- is prohibited in parallel plans.
|
|
|
|
explain (costs off) select distinct
|
|
|
|
unique1,
|
|
|
|
(select t.unique1 from tenk1 where tenk1.unique1 = t.unique1)
|
|
|
|
from tenk1 t, generate_series(1, 1000);
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------------------------------------------------------
|
|
|
|
Unique
|
|
|
|
-> Sort
|
|
|
|
Sort Key: t.unique1, ((SubPlan 1))
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Nested Loop
|
|
|
|
-> Parallel Index Only Scan using tenk1_unique1 on tenk1 t
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
SubPlan 1
|
|
|
|
-> Index Only Scan using tenk1_unique1 on tenk1
|
|
|
|
Index Cond: (unique1 = t.unique1)
|
|
|
|
(11 rows)
|
|
|
|
|
|
|
|
explain (costs off) select
|
|
|
|
unique1,
|
|
|
|
(select t.unique1 from tenk1 where tenk1.unique1 = t.unique1)
|
|
|
|
from tenk1 t, generate_series(1, 1000)
|
|
|
|
order by 1, 2;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------------------------------------------------
|
|
|
|
Sort
|
|
|
|
Sort Key: t.unique1, ((SubPlan 1))
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Nested Loop
|
|
|
|
-> Parallel Index Only Scan using tenk1_unique1 on tenk1 t
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
SubPlan 1
|
|
|
|
-> Index Only Scan using tenk1_unique1 on tenk1
|
|
|
|
Index Cond: (unique1 = t.unique1)
|
|
|
|
(10 rows)
|
|
|
|
|
2020-11-03 20:07:23 +01:00
|
|
|
-- Parallel sort but with expression not available until the upper rel.
|
|
|
|
explain (costs off) select distinct sub.unique1, stringu1 || random()::text
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------------------------------------------------------------------
|
|
|
|
Unique
|
|
|
|
-> Sort
|
|
|
|
Sort Key: tenk1.unique1, (((tenk1.stringu1)::text || (random())::text)) COLLATE "C"
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Nested Loop
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(8 rows)
|
|
|
|
|
|
|
|
explain (costs off) select sub.unique1, stringu1 || random()::text
|
|
|
|
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
|
|
|
|
order by 1, 2;
|
|
|
|
QUERY PLAN
|
|
|
|
---------------------------------------------------------------------------------------
|
|
|
|
Sort
|
|
|
|
Sort Key: tenk1.unique1, (((tenk1.stringu1)::text || (random())::text)) COLLATE "C"
|
|
|
|
-> Gather
|
|
|
|
Workers Planned: 2
|
|
|
|
-> Nested Loop
|
|
|
|
-> Parallel Index Scan using tenk1_unique1 on tenk1
|
|
|
|
-> Function Scan on generate_series
|
|
|
|
(7 rows)
|
|
|
|
|