Track I/O timing for temporary file blocks in EXPLAIN (BUFFERS)

Previously, the output of EXPLAIN (BUFFERS) option showed only the I/O
timing spent reading and writing shared and local buffers.  This commit
adds on top of that the I/O timing for temporary buffers in the output
of EXPLAIN (for spilled external sorts, hashes, materialization. etc).
This can be helpful for users in cases where the I/O related to
temporary buffers is the bottleneck.

Like its cousin, this information is available only when track_io_timing
is enabled.  Playing the patch, this is showing an extra overhead of up
to 1% even when using gettimeofday() as implementation for interval
timings, which is slightly within the usual range noise still that's
measurable.

Author: Masahiko Sawada
Reviewed-by: Georgios Kokolatos, Melanie Plageman, Julien Rouhaud,
Ranier Vilela
Discussion: https://postgr.es/m/CAD21AoAJgotTeP83p6HiAGDhs_9Fw9pZ2J=_tYTsiO5Ob-V5GQ@mail.gmail.com
This commit is contained in:
Michael Paquier 2022-04-08 11:27:21 +09:00
parent dafae9707a
commit efb0ef909f
7 changed files with 147 additions and 68 deletions

View File

@ -174,14 +174,15 @@ ROLLBACK;
Include information on buffer usage. Specifically, include the number of
shared blocks hit, read, dirtied, and written, the number of local blocks
hit, read, dirtied, and written, the number of temp blocks read and
written, and the time spent reading and writing data file blocks
(in milliseconds) if <xref linkend="guc-track-io-timing"/> is enabled.
A <emphasis>hit</emphasis> means that a read was avoided because the block was
found already in cache when needed.
written, and the time spent reading and writing data file blocks and
temporary file blocks (in milliseconds) if
<xref linkend="guc-track-io-timing"/> is enabled. A
<emphasis>hit</emphasis> means that a read was avoided because the block
was found already in cache when needed.
Shared blocks contain data from regular tables and indexes;
local blocks contain data from temporary tables and indexes;
while temp blocks contain short-term working data used in sorts, hashes,
Materialize plan nodes, and similar cases.
while temporary blocks contain short-term working data used in sorts,
hashes, Materialize plan nodes, and similar cases.
The number of blocks <emphasis>dirtied</emphasis> indicates the number of
previously unmodified blocks that were changed by this query; while the
number of blocks <emphasis>written</emphasis> indicates the number of

View File

@ -3521,8 +3521,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
usage->temp_blks_written > 0);
bool has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
!INSTR_TIME_IS_ZERO(usage->blk_write_time));
bool has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) ||
!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time));
bool show_planning = (planning && (has_shared ||
has_local || has_temp || has_timing));
has_local || has_temp || has_timing ||
has_temp_timing));
if (show_planning)
{
@ -3587,16 +3590,33 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
}
/* As above, show only positive counter values. */
if (has_timing)
if (has_timing || has_temp_timing)
{
ExplainIndentText(es);
appendStringInfoString(es->str, "I/O Timings:");
if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
appendStringInfo(es->str, " read=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
appendStringInfo(es->str, " write=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
if (has_timing)
{
appendStringInfoString(es->str, " shared/local");
if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
appendStringInfo(es->str, " read=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
appendStringInfo(es->str, " write=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
if (has_temp_timing)
appendStringInfoChar(es->str, ',');
}
if (has_temp_timing)
{
appendStringInfoString(es->str, " temp");
if (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time))
appendStringInfo(es->str, " read=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
if (!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time))
appendStringInfo(es->str, " write=%0.3f",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
}
appendStringInfoChar(es->str, '\n');
}
@ -3633,6 +3653,12 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
ExplainPropertyFloat("I/O Write Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->blk_write_time),
3, es);
ExplainPropertyFloat("Temp I/O Read Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
3, es);
ExplainPropertyFloat("Temp I/O Write Time", "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
}
}
}

View File

@ -237,6 +237,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
dst->temp_blks_written += add->temp_blks_written;
INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
}
/* dst += add - sub */
@ -259,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst,
add->blk_read_time, sub->blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
add->blk_write_time, sub->blk_write_time);
INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
add->temp_blk_read_time, sub->temp_blk_read_time);
INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
add->temp_blk_write_time, sub->temp_blk_write_time);
}
/* helper functions for WAL usage accumulation */

View File

@ -429,6 +429,8 @@ static void
BufFileLoadBuffer(BufFile *file)
{
File thisfile;
instr_time io_start;
instr_time io_time;
/*
* Advance to next component file if necessary and possible.
@ -440,10 +442,14 @@ BufFileLoadBuffer(BufFile *file)
file->curOffset = 0L;
}
thisfile = file->files[file->curFile];
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
/*
* Read whatever we can get, up to a full bufferload.
*/
thisfile = file->files[file->curFile];
file->nbytes = FileRead(thisfile,
file->buffer.data,
sizeof(file->buffer),
@ -458,6 +464,13 @@ BufFileLoadBuffer(BufFile *file)
FilePathName(thisfile))));
}
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, io_start);
INSTR_TIME_ADD(pgBufferUsage.temp_blk_read_time, io_time);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
@ -485,6 +498,8 @@ BufFileDumpBuffer(BufFile *file)
while (wpos < file->nbytes)
{
off_t availbytes;
instr_time io_start;
instr_time io_time;
/*
* Advance to next component file if necessary and possible.
@ -507,6 +522,10 @@ BufFileDumpBuffer(BufFile *file)
bytestowrite = (int) availbytes;
thisfile = file->files[file->curFile];
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
bytestowrite = FileWrite(thisfile,
file->buffer.data + wpos,
bytestowrite,
@ -517,6 +536,14 @@ BufFileDumpBuffer(BufFile *file)
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m",
FilePathName(thisfile))));
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, io_start);
INSTR_TIME_ADD(pgBufferUsage.temp_blk_write_time, io_time);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;

View File

@ -33,8 +33,10 @@ typedef struct BufferUsage
int64 local_blks_written; /* # of local disk blocks written */
int64 temp_blks_read; /* # of temp blocks read */
int64 temp_blks_written; /* # of temp blocks written */
instr_time blk_read_time; /* time spent reading */
instr_time blk_write_time; /* time spent writing */
instr_time blk_read_time; /* time spent reading blocks */
instr_time blk_write_time; /* time spent writing blocks */
instr_time temp_blk_read_time; /* time spent reading temp blocks */
instr_time temp_blk_write_time; /* time spent writing temp blocks */
} BufferUsage;
/*

View File

@ -85,56 +85,6 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
Execution Time: N.N ms
(3 rows)
select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
explain_filter
------------------------------------
[ +
{ +
"Plan": { +
"Node Type": "Seq Scan", +
"Parallel Aware": false, +
"Async Capable": false, +
"Relation Name": "int8_tbl",+
"Alias": "i8", +
"Startup Cost": N.N, +
"Total Cost": N.N, +
"Plan Rows": N, +
"Plan Width": N, +
"Actual Startup Time": N.N, +
"Actual Total Time": N.N, +
"Actual Rows": N, +
"Actual Loops": N, +
"Shared Hit Blocks": N, +
"Shared Read Blocks": N, +
"Shared Dirtied Blocks": N, +
"Shared Written Blocks": N, +
"Local Hit Blocks": N, +
"Local Read Blocks": N, +
"Local Dirtied Blocks": N, +
"Local Written Blocks": N, +
"Temp Read Blocks": N, +
"Temp Written Blocks": N +
}, +
"Planning": { +
"Shared Hit Blocks": N, +
"Shared Read Blocks": N, +
"Shared Dirtied Blocks": N, +
"Shared Written Blocks": N, +
"Local Hit Blocks": N, +
"Local Read Blocks": N, +
"Local Dirtied Blocks": N, +
"Local Written Blocks": N, +
"Temp Read Blocks": N, +
"Temp Written Blocks": N +
}, +
"Planning Time": N.N, +
"Triggers": [ +
], +
"Execution Time": N.N +
} +
]
(1 row)
select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
explain_filter
--------------------------------------------------------
@ -276,6 +226,68 @@ select explain_filter('explain (buffers, format json) select * from int8_tbl i8'
]
(1 row)
-- Check output including I/O timings. These fields are conditional
-- but always set in JSON format, so check them only in this case.
set track_io_timing = on;
select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
explain_filter
------------------------------------
[ +
{ +
"Plan": { +
"Node Type": "Seq Scan", +
"Parallel Aware": false, +
"Async Capable": false, +
"Relation Name": "int8_tbl",+
"Alias": "i8", +
"Startup Cost": N.N, +
"Total Cost": N.N, +
"Plan Rows": N, +
"Plan Width": N, +
"Actual Startup Time": N.N, +
"Actual Total Time": N.N, +
"Actual Rows": N, +
"Actual Loops": N, +
"Shared Hit Blocks": N, +
"Shared Read Blocks": N, +
"Shared Dirtied Blocks": N, +
"Shared Written Blocks": N, +
"Local Hit Blocks": N, +
"Local Read Blocks": N, +
"Local Dirtied Blocks": N, +
"Local Written Blocks": N, +
"Temp Read Blocks": N, +
"Temp Written Blocks": N, +
"I/O Read Time": N.N, +
"I/O Write Time": N.N, +
"Temp I/O Read Time": N.N, +
"Temp I/O Write Time": N.N +
}, +
"Planning": { +
"Shared Hit Blocks": N, +
"Shared Read Blocks": N, +
"Shared Dirtied Blocks": N, +
"Shared Written Blocks": N, +
"Local Hit Blocks": N, +
"Local Read Blocks": N, +
"Local Dirtied Blocks": N, +
"Local Written Blocks": N, +
"Temp Read Blocks": N, +
"Temp Written Blocks": N, +
"I/O Read Time": N.N, +
"I/O Write Time": N.N, +
"Temp I/O Read Time": N.N, +
"Temp I/O Write Time": N.N +
}, +
"Planning Time": N.N, +
"Triggers": [ +
], +
"Execution Time": N.N +
} +
]
(1 row)
set track_io_timing = off;
-- SETTINGS option
-- We have to ignore other settings that might be imposed by the environment,
-- so printing the whole Settings field unfortunately won't do.

View File

@ -65,12 +65,17 @@ select explain_filter('explain select * from int8_tbl i8');
select explain_filter('explain (analyze) select * from int8_tbl i8');
select explain_filter('explain (analyze, verbose) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
select explain_filter('explain (analyze, buffers, format yaml) select * from int8_tbl i8');
select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
-- Check output including I/O timings. These fields are conditional
-- but always set in JSON format, so check them only in this case.
set track_io_timing = on;
select explain_filter('explain (analyze, buffers, format json) select * from int8_tbl i8');
set track_io_timing = off;
-- SETTINGS option
-- We have to ignore other settings that might be imposed by the environment,
-- so printing the whole Settings field unfortunately won't do.