Allow per-tablespace effective_io_concurrency

Per discussion, it is nowadays possible to have tablespaces with wildly
different I/O characteristics from the rest of the system.  Setting a
different effective_io_concurrency for those tablespaces has been measured
to improve performance.
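
For example (tablespace name hypothetical; values from 0, which disables
prefetching for tables in that tablespace, up to 1000 are accepted, and
RESET falls back to the server-wide GUC):

    ALTER TABLESPACE fast_ssd SET (effective_io_concurrency = 128);
    ALTER TABLESPACE fast_ssd RESET (effective_io_concurrency);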

Author: Julien Rouhaud
Reviewed by: Andres Freund
Committed by: Alvaro Herrera, 2015-09-08 12:51:42 -03:00
commit 1aba62ec63 (parent 665a00c9e2)
12 changed files with 145 additions and 63 deletions

doc/src/sgml/config.sgml

@@ -1901,7 +1901,10 @@ include_dir 'conf.d'
        </para>
        <para>
-        The default is 1 on supported systems, otherwise 0.
+        The default is 1 on supported systems, otherwise 0.  This value can
+        be overridden for tables in a particular tablespace by setting the
+        tablespace parameter of the same name (see
+        <xref linkend="sql-altertablespace">).
        </para>
       </listitem>
      </varlistentry>

doc/src/sgml/ref/create_tablespace.sgml

@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
    <listitem>
     <para>
      A tablespace parameter to be set or reset.  Currently, the only
-     available parameters are <varname>seq_page_cost</> and
-     <varname>random_page_cost</>.  Setting either value for a particular
-     tablespace will override the planner's usual estimate of the cost of
-     reading pages from tables in that tablespace, as established by
-     the configuration parameters of the same name (see
-     <xref linkend="guc-seq-page-cost">,
-     <xref linkend="guc-random-page-cost">).  This may be useful if one
-     tablespace is located on a disk which is faster or slower than the
+     available parameters are <varname>seq_page_cost</>,
+     <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+     Setting either value for a particular tablespace will override the
+     planner's usual estimate of the cost of reading pages from tables in
+     that tablespace, as established by the configuration parameters of the
+     same name (see <xref linkend="guc-seq-page-cost">,
+     <xref linkend="guc-random-page-cost">,
+     <xref linkend="guc-effective-io-concurrency">).  This may be useful if
+     one tablespace is located on a disk which is faster or slower than the
      remainder of the I/O subsystem.
     </para>
    </listitem>

src/backend/access/common/reloptions.c

@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 64, MAX_KILOBYTES
 	},
+	{
+		{
+			"effective_io_concurrency",
+			"Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+			RELOPT_KIND_TABLESPACE,
+			AccessExclusiveLock
+		},
+#ifdef USE_PREFETCH
+		-1, 0, MAX_IO_CONCURRENCY
+#else
+		0, 0, 0
+#endif
+	},
 	/* list terminator */
 	{{NULL}}

@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
 	int			numoptions;
 	static const relopt_parse_elt tab[] = {
 		{"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
-		{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+		{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+		{"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
 	};

 	options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,

src/backend/executor/nodeBitmapHeapscan.c

@@ -44,6 +44,7 @@
 #include "storage/predicate.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"

@@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	 * prefetching. node->prefetch_pages tracks exactly how many pages ahead
 	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
 	 * desired prefetch distance, which starts small and increases up to the
-	 * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
-	 * a lot of prefetching in a scan that stops after a few tuples because of
-	 * a LIMIT.
+	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
+	 * a scan that stops after a few tuples because of a LIMIT.
 	 */
 	if (tbm == NULL)
 	{

@@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		node->tbmres = tbmres = NULL;

 #ifdef USE_PREFETCH
-		if (target_prefetch_pages > 0)
+		if (node->prefetch_maximum > 0)
 		{
 			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
 			node->prefetch_pages = 0;

@@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * page/tuple, then to one after the second tuple is fetched, then
 			 * it doubles as later pages are fetched.
 			 */
-			if (node->prefetch_target >= target_prefetch_pages)
+			if (node->prefetch_target >= node->prefetch_maximum)
 				 /* don't increase any further */ ;
-			else if (node->prefetch_target >= target_prefetch_pages / 2)
-				node->prefetch_target = target_prefetch_pages;
+			else if (node->prefetch_target >= node->prefetch_maximum / 2)
+				node->prefetch_target = node->prefetch_maximum;
 			else if (node->prefetch_target > 0)
 				node->prefetch_target *= 2;
 			else

@@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * Try to prefetch at least a few pages even before we get to the
 			 * second page if we don't stop reading after the first tuple.
 			 */
-			if (node->prefetch_target < target_prefetch_pages)
+			if (node->prefetch_target < node->prefetch_maximum)
 				node->prefetch_target++;
 #endif   /* USE_PREFETCH */
 		}

@@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 {
 	BitmapHeapScanState *scanstate;
 	Relation	currentRelation;
+	int			io_concurrency;

 	/* check for unsupported flags */
 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

@@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->prefetch_iterator = NULL;
 	scanstate->prefetch_pages = 0;
 	scanstate->prefetch_target = 0;
+	/* may be updated below */
+	scanstate->prefetch_maximum = target_prefetch_pages;

 	/*
 	 * Miscellaneous initialization

@@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	 */
 	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

+	/*
+	 * Determine the maximum for prefetch_target.  If the tablespace has a
+	 * specific IO concurrency set, use that to compute the corresponding
+	 * maximum value; otherwise, we already initialized to the value computed
+	 * by the GUC machinery.
+	 */
+	io_concurrency =
+		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
+
+	if (io_concurrency != effective_io_concurrency)
+	{
+		double		maximum;
+
+		if (ComputeIoConcurrency(io_concurrency, &maximum))
+			scanstate->prefetch_maximum = rint(maximum);
+	}
+
 	scanstate->ss.ss_currentRelation = currentRelation;

 	/*
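
The ramp-up rule above starts the prefetch distance at one page and doubles
it until it reaches node->prefetch_maximum.  A minimal standalone sketch of
just that rule (ordinary C, not part of the patch; the prefetch_maximum
value of 10 is an arbitrary assumption):

#include <stdio.h>

int
main(void)
{
	int		prefetch_target = 0;
	int		prefetch_maximum = 10;	/* hypothetical per-scan maximum */
	int		page;

	/* apply the patch's growth rule once per page fetched */
	for (page = 1; page <= 8; page++)
	{
		if (prefetch_target >= prefetch_maximum)
			;					/* don't increase any further */
		else if (prefetch_target >= prefetch_maximum / 2)
			prefetch_target = prefetch_maximum;
		else if (prefetch_target > 0)
			prefetch_target *= 2;
		else
			prefetch_target = 1;
		printf("page %d: prefetch_target = %d\n", page, prefetch_target);
	}
	return 0;					/* prints 1, 2, 4, 8, 10, 10, 10, 10 */
}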

src/backend/storage/buffer/bufmgr.c

@@ -80,11 +80,14 @@ bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
 double		bgwriter_lru_multiplier = 2.0;
 bool		track_io_timing = false;
+int			effective_io_concurrency = 0;

 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
  * ReadBuffer calls by.  This is maintained by the assign hook for
- * effective_io_concurrency.  Zero means "never prefetch".
+ * effective_io_concurrency.  Zero means "never prefetch".  This value is
+ * only used for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
  */
 int			target_prefetch_pages = 0;

@@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void);
 static int	rnode_comparator(const void *p1, const void *p2);

+/*
+ * ComputeIoConcurrency -- get the number of pages to prefetch for a given
+ * number of spindles.
+ */
+bool
+ComputeIoConcurrency(int io_concurrency, double *target)
+{
+	double		new_prefetch_pages = 0.0;
+	int			i;
+
+	/*
+	 * Make sure the io_concurrency value is within valid range; it may have
+	 * been forced with a manual pg_tablespace update.
+	 */
+	io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
+
+	/*----------
+	 * The user-visible GUC parameter is the number of drives (spindles),
+	 * which we need to translate to a number-of-pages-to-prefetch target.
+	 * The target value is stashed in *extra and then assigned to the actual
+	 * variable by assign_effective_io_concurrency.
+	 *
+	 * The expected number of prefetch pages needed to keep N drives busy is:
+	 *
+	 * drives |   I/O requests
+	 * -------+----------------
+	 *		1 |   1
+	 *		2 |   2/1 + 2/2 = 3
+	 *		3 |   3/1 + 3/2 + 3/3 = 5 1/2
+	 *		4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+	 *		n |   n * H(n)
+	 *
+	 * This is called the "coupon collector problem" and H(n) is called the
+	 * harmonic series.  This could be approximated by n * ln(n), but for
+	 * reasonable numbers of drives we might as well just compute the series.
+	 *
+	 * Alternatively we could set the target to the number of pages necessary
+	 * so that the expected number of active spindles is some arbitrary
+	 * percentage of the total.  This sounds the same but is actually slightly
+	 * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+	 * that desired fraction.
+	 *
+	 * Experimental results show that both of these formulas aren't aggressive
+	 * enough, but we don't really have any better proposals.
+	 *
+	 * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+	 *----------
+	 */
+	for (i = 1; i <= io_concurrency; i++)
+		new_prefetch_pages += (double) io_concurrency / (double) i;
+
+	*target = new_prefetch_pages;
+
+	/* This range check shouldn't fail, but let's be paranoid */
+	return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
+
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
  *
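
To check the table in the comment, a standalone sketch (ordinary C, not
PostgreSQL source) that computes n * H(n) term by term, the same way
ComputeIoConcurrency() does:

#include <stdio.h>

/* expected prefetch pages needed to keep n drives busy: n * H(n) */
static double
prefetch_pages_for(int drives)
{
	double	pages = 0.0;
	int		i;

	for (i = 1; i <= drives; i++)
		pages += (double) drives / (double) i;
	return pages;
}

int
main(void)
{
	int		n;

	/* prints 1.000, 3.000, 5.500, 8.333 -- matching the table above */
	for (n = 1; n <= 4; n++)
		printf("%d drive(s): %.3f\n", n, prefetch_pages_for(n));
	return 0;
}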

src/backend/utils/cache/spccache.c

@@ -23,6 +23,7 @@
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "storage/bufmgr.h"
 #include "utils/catcache.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"

@@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid,
 			*spc_seq_page_cost = spc->opts->seq_page_cost;
 	}
 }
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+	TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+	if (!spc->opts || spc->opts->effective_io_concurrency < 0)
+		return effective_io_concurrency;
+	else
+		return spc->opts->effective_io_concurrency;
+}
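
The -1 default from reloptions.c acts as a "not set" sentinel here: a value
below zero, or a tablespace with no options at all, falls back to the
server-wide GUC.  A standalone sketch of just that rule (ordinary C, not
PostgreSQL source; the GUC stand-in value is arbitrary):

#include <stdio.h>

static int	effective_io_concurrency = 1;	/* stand-in for the GUC */

static int
io_concurrency_for(int spc_setting)
{
	/* spc_setting < 0 plays the role of "parameter not set" */
	if (spc_setting < 0)
		return effective_io_concurrency;
	return spc_setting;
}

int
main(void)
{
	printf("%d\n", io_concurrency_for(-1));	/* 1: falls back to the GUC */
	printf("%d\n", io_concurrency_for(0));	/* 0: prefetch disabled */
	printf("%d\n", io_concurrency_for(32));	/* 32: per-tablespace override */
	return 0;
}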

src/backend/utils/misc/guc.c

@@ -490,7 +490,6 @@ static int	wal_block_size;
 static bool data_checksums;
 static int	wal_segment_size;
 static bool integer_datetimes;
-static int	effective_io_concurrency;
 static bool assert_enabled;

 /* should be static, but commands/variable.c needs to get at this */

@@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] =
 		},
 		&effective_io_concurrency,
 #ifdef USE_PREFETCH
-		1, 0, 1000,
+		1, 0, MAX_IO_CONCURRENCY,
 #else
 		0, 0, 0,
 #endif

@@ -9986,47 +9985,9 @@ static bool
 check_effective_io_concurrency(int *newval, void **extra, GucSource source)
 {
 #ifdef USE_PREFETCH
-	double		new_prefetch_pages = 0.0;
-	int			i;
-
-	/*----------
-	 * The user-visible GUC parameter is the number of drives (spindles),
-	 * which we need to translate to a number-of-pages-to-prefetch target.
-	 * The target value is stashed in *extra and then assigned to the actual
-	 * variable by assign_effective_io_concurrency.
-	 *
-	 * The expected number of prefetch pages needed to keep N drives busy is:
-	 *
-	 * drives |   I/O requests
-	 * -------+----------------
-	 *		1 |   1
-	 *		2 |   2/1 + 2/2 = 3
-	 *		3 |   3/1 + 3/2 + 3/3 = 5 1/2
-	 *		4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
-	 *		n |   n * H(n)
-	 *
-	 * This is called the "coupon collector problem" and H(n) is called the
-	 * harmonic series.  This could be approximated by n * ln(n), but for
-	 * reasonable numbers of drives we might as well just compute the series.
-	 *
-	 * Alternatively we could set the target to the number of pages necessary
-	 * so that the expected number of active spindles is some arbitrary
-	 * percentage of the total.  This sounds the same but is actually slightly
-	 * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
-	 * that desired fraction.
-	 *
-	 * Experimental results show that both of these formulas aren't aggressive
-	 * enough, but we don't really have any better proposals.
-	 *
-	 * Note that if *newval = 0 (disabled), we must set target = 0.
-	 *----------
-	 */
-	for (i = 1; i <= *newval; i++)
-		new_prefetch_pages += (double) *newval / (double) i;
-
-	/* This range check shouldn't fail, but let's be paranoid */
-	if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+	double		new_prefetch_pages;
+
+	if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
 	{
 		int		   *myextra = (int *) guc_malloc(ERROR, sizeof(int));

src/bin/psql/tab-complete.c

@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
 			 pg_strcasecmp(prev_wd, "(") == 0)
 	{
 		static const char *const list_TABLESPACEOPTIONS[] =
-		{"seq_page_cost", "random_page_cost", NULL};
+		{"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};

 		COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
 	}

src/include/commands/tablespace.h

@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
 	int32		vl_len_;		/* varlena header (do not touch directly!) */
 	float8		random_page_cost;
 	float8		seq_page_cost;
+	int			effective_io_concurrency;
 } TableSpaceOpts;

 extern Oid	CreateTableSpace(CreateTableSpaceStmt *stmt);

src/include/nodes/execnodes.h

@@ -1424,7 +1424,8 @@ typedef struct BitmapIndexScanState
 *		lossy_pages		   total number of lossy pages retrieved
 *		prefetch_iterator  iterator for prefetching ahead of current page
 *		prefetch_pages	   # pages prefetch iterator is ahead of current
- *		prefetch_target    target prefetch distance
+ *		prefetch_target    current target prefetch distance
+ *		prefetch_maximum   maximum value for prefetch_target
 * ----------------
 */
 typedef struct BitmapHeapScanState

@@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState
 	TBMIterator *prefetch_iterator;
 	int			prefetch_pages;
 	int			prefetch_target;
+	int			prefetch_maximum;
 } BitmapHeapScanState;

 /* ----------------

src/include/storage/bufmgr.h

@@ -58,11 +58,17 @@ extern int	target_prefetch_pages;
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;

+/* in guc.c */
+extern int	effective_io_concurrency;
+
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
 extern PGDLLIMPORT Block *LocalBufferBlockPointers;
 extern PGDLLIMPORT int32 *LocalRefCount;

+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY	1000
+
 /* special block number for ReadBuffer() */
 #define P_NEW	InvalidBlockNumber		/* grow the file to get a new page */

@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * prototypes for functions in bufmgr.c
  */
+extern bool ComputeIoConcurrency(int io_concurrency, double *target);
 extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
 			   BlockNumber blockNum);
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);

src/include/utils/spccache.h

@@ -15,5 +15,6 @@
 void		get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
 							  float8 *spc_seq_page_cost);
+int			get_tablespace_io_concurrency(Oid spcid);

 #endif   /* SPCCACHE_H */