From 79ccd7cbd5ca44bee0191d12e9e65abf702899e7 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 21 Aug 2017 14:43:00 -0400 Subject: [PATCH] pg_prewarm: Add automatic prewarm feature. Periodically while the server is running, and at shutdown, write out a list of blocks in shared buffers. When the server reaches consistency -- unfortunately, we can't do it before that point without breaking things -- reload those blocks into any still-unused shared buffers. Mithun Cy and Robert Haas, reviewed and tested by Beena Emerson, Amit Kapila, Jim Nasby, and Rafia Sabih. Discussion: http://postgr.es/m/CAD__OugubOs1Vy7kgF6xTjmEqTR4CrGAv8w+ZbaY_+MZeitukw@mail.gmail.com --- contrib/file_fdw/data/list1.csv | 2 + contrib/file_fdw/data/list2.bad | 2 + contrib/file_fdw/data/list2.csv | 2 + contrib/pg_prewarm/Makefile | 4 +- contrib/pg_prewarm/autoprewarm.c | 924 ++++++++++++++++++++ contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql | 14 + contrib/pg_prewarm/pg_prewarm.control | 2 +- doc/src/sgml/pgprewarm.sgml | 69 +- src/backend/storage/buffer/freelist.c | 17 + src/include/storage/buf_internals.h | 1 + src/tools/pgindent/typedefs.list | 2 + 11 files changed, 1035 insertions(+), 4 deletions(-) create mode 100644 contrib/file_fdw/data/list1.csv create mode 100644 contrib/file_fdw/data/list2.bad create mode 100644 contrib/file_fdw/data/list2.csv create mode 100644 contrib/pg_prewarm/autoprewarm.c create mode 100644 contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql diff --git a/contrib/file_fdw/data/list1.csv b/contrib/file_fdw/data/list1.csv new file mode 100644 index 0000000000..203f3b2324 --- /dev/null +++ b/contrib/file_fdw/data/list1.csv @@ -0,0 +1,2 @@ +1,foo +1,bar diff --git a/contrib/file_fdw/data/list2.bad b/contrib/file_fdw/data/list2.bad new file mode 100644 index 0000000000..00af47f5ef --- /dev/null +++ b/contrib/file_fdw/data/list2.bad @@ -0,0 +1,2 @@ +2,baz +1,qux diff --git a/contrib/file_fdw/data/list2.csv b/contrib/file_fdw/data/list2.csv new file mode 100644 index 
0000000000..2fb133d004 --- /dev/null +++ b/contrib/file_fdw/data/list2.csv @@ -0,0 +1,2 @@ +2,baz +2,qux diff --git a/contrib/pg_prewarm/Makefile b/contrib/pg_prewarm/Makefile index 7ad941e72b..88580d1118 100644 --- a/contrib/pg_prewarm/Makefile +++ b/contrib/pg_prewarm/Makefile @@ -1,10 +1,10 @@ # contrib/pg_prewarm/Makefile MODULE_big = pg_prewarm -OBJS = pg_prewarm.o $(WIN32RES) +OBJS = pg_prewarm.o autoprewarm.o $(WIN32RES) EXTENSION = pg_prewarm -DATA = pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql +DATA = pg_prewarm--1.1--1.2.sql pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql PGFILEDESC = "pg_prewarm - preload relation data into system buffer cache" ifdef USE_PGXS diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c new file mode 100644 index 0000000000..cc0350e6d6 --- /dev/null +++ b/contrib/pg_prewarm/autoprewarm.c @@ -0,0 +1,924 @@ +/*------------------------------------------------------------------------- + * + * autoprewarm.c + * Periodically dump information about the blocks present in + * shared_buffers, and reload them on server restart. + * + * Due to locking considerations, we can't actually begin prewarming + * until the server reaches a consistent state. We need the catalogs + * to be consistent so that we can figure out which relation to lock, + * and we need to lock the relations so that we don't try to prewarm + * pages from a relation that is in the process of being dropped. + * + * While prewarming, autoprewarm will use two workers. There's a + * master worker that reads and sorts the list of blocks to be + * prewarmed and then launches a per-database worker for each + * relevant database in turn. The former keeps running after the + * initial prewarm is complete to update the dump file periodically. 
+ * + * Copyright (c) 2016-2017, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_prewarm/autoprewarm.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include + +#include "access/heapam.h" +#include "access/xact.h" +#include "catalog/pg_class.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "storage/buf_internals.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/acl.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relfilenodemap.h" +#include "utils/resowner.h" + +#define AUTOPREWARM_FILE "autoprewarm.blocks" + +/* Metadata for each block we dump. */ +typedef struct BlockInfoRecord +{ + Oid database; + Oid tablespace; + Oid filenode; + ForkNumber forknum; + BlockNumber blocknum; +} BlockInfoRecord; + +/* Shared state information for autoprewarm bgworker. 
*/ +typedef struct AutoPrewarmSharedState +{ + LWLock lock; /* mutual exclusion */ + pid_t bgworker_pid; /* for main bgworker */ + pid_t pid_using_dumpfile; /* for autoprewarm or block dump */ + + /* Following items are for communication with per-database worker */ + dsm_handle block_info_handle; + Oid database; + int64 prewarm_start_idx; + int64 prewarm_stop_idx; + int64 prewarmed_blocks; +} AutoPrewarmSharedState; + +void _PG_init(void); +void autoprewarm_main(Datum main_arg); +void autoprewarm_database_main(Datum main_arg); + +PG_FUNCTION_INFO_V1(autoprewarm_start_worker); +PG_FUNCTION_INFO_V1(autoprewarm_dump_now); + +static void apw_load_buffers(void); +static int64 apw_dump_now(bool is_bgworker, bool dump_unlogged); +static void apw_start_master_worker(void); +static void apw_start_database_worker(void); +static bool apw_init_shmem(void); +static void apw_detach_shmem(int code, Datum arg); +static int apw_compare_blockinfo(const void *p, const void *q); +static void apw_sigterm_handler(SIGNAL_ARGS); +static void apw_sighup_handler(SIGNAL_ARGS); + +/* Flags set by signal handlers */ +static volatile sig_atomic_t got_sigterm = false; +static volatile sig_atomic_t got_sighup = false; + +/* Pointer to shared-memory state. */ +static AutoPrewarmSharedState *apw_state = NULL; + +/* GUC variables. */ +static bool autoprewarm = true; /* start worker? */ +static int autoprewarm_interval; /* dump interval */ + +/* + * Module load callback. 
+ */ +void +_PG_init(void) +{ + DefineCustomIntVariable("pg_prewarm.autoprewarm_interval", + "Sets the interval between dumps of shared buffers", + "If set to zero, time-based dumping is disabled.", + &autoprewarm_interval, + 300, + 0, INT_MAX / 1000, + PGC_SIGHUP, + GUC_UNIT_S, + NULL, + NULL, + NULL); + + if (!process_shared_preload_libraries_in_progress) + return; + + /* can't define PGC_POSTMASTER variable after startup */ + DefineCustomBoolVariable("pg_prewarm.autoprewarm", + "Starts the autoprewarm worker.", + NULL, + &autoprewarm, + true, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + EmitWarningsOnPlaceholders("pg_prewarm"); + + RequestAddinShmemSpace(MAXALIGN(sizeof(AutoPrewarmSharedState))); + + /* Register autoprewarm worker, if enabled. */ + if (autoprewarm) + apw_start_master_worker(); +} + +/* + * Main entry point for the master autoprewarm process. Per-database workers + * have a separate entry point. + */ +void +autoprewarm_main(Datum main_arg) +{ + bool first_time = true; + TimestampTz last_dump_time = 0; + + /* Establish signal handlers; once that's done, unblock signals. */ + pqsignal(SIGTERM, apw_sigterm_handler); + pqsignal(SIGHUP, apw_sighup_handler); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + BackgroundWorkerUnblockSignals(); + + /* Create (if necessary) and attach to our shared memory area. */ + if (apw_init_shmem()) + first_time = false; + + /* Set on-detach hook so that our PID will be cleared on exit. */ + on_shmem_exit(apw_detach_shmem, 0); + + /* + * Store our PID in the shared memory area --- unless there's already + * another worker running, in which case just exit. 
+ */ + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + if (apw_state->bgworker_pid != InvalidPid) + { + LWLockRelease(&apw_state->lock); + ereport(LOG, + (errmsg("autoprewarm worker is already running under PID %d", + apw_state->bgworker_pid))); + return; + } + apw_state->bgworker_pid = MyProcPid; + LWLockRelease(&apw_state->lock); + + /* + * Preload buffers from the dump file only if we just created the shared + * memory region. Otherwise, it's either already been done or shouldn't + * be done - e.g. because the old dump file has been overwritten since the + * server was started. + * + * There's not much point in performing a dump immediately after we finish + * preloading; so, if we do end up preloading, consider the last dump time + * to be equal to the current time. + */ + if (first_time) + { + apw_load_buffers(); + last_dump_time = GetCurrentTimestamp(); + } + + /* Periodically dump buffers until terminated. */ + while (!got_sigterm) + { + int rc; + + /* In case of a SIGHUP, just reload the configuration. */ + if (got_sighup) + { + got_sighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (autoprewarm_interval <= 0) + { + /* We're only dumping at shutdown, so just wait forever. */ + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, + -1L, + PG_WAIT_EXTENSION); + } + else + { + long delay_in_ms = 0; + TimestampTz next_dump_time = 0; + long secs = 0; + int usecs = 0; + + /* Compute the next dump time and the delay until then, in ms. */ + next_dump_time = + TimestampTzPlusMilliseconds(last_dump_time, + autoprewarm_interval * 1000); + TimestampDifference(GetCurrentTimestamp(), next_dump_time, + &secs, &usecs); + delay_in_ms = secs * 1000 + (usecs / 1000); + + /* Perform a dump if it's time. */ + if (delay_in_ms <= 0) + { + last_dump_time = GetCurrentTimestamp(); + apw_dump_now(true, false); + continue; + } + + /* Sleep until the next dump time. 
*/ + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + delay_in_ms, + PG_WAIT_EXTENSION); + } + + /* Reset the latch, bail out if postmaster died, otherwise loop. */ + ResetLatch(&MyProc->procLatch); + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + } + + /* + * Dump one last time. We assume this is probably the result of a system + * shutdown, although it's possible that we've merely been terminated. + */ + apw_dump_now(true, true); +} + +/* + * Read the dump file and launch per-database workers one at a time to + * prewarm the buffers found there. + */ +static void +apw_load_buffers(void) +{ + FILE *file = NULL; + int64 num_elements, + i; + BlockInfoRecord *blkinfo; + dsm_segment *seg; + + /* + * Skip the prewarm if the dump file is in use; otherwise, prevent any + * other process from writing it while we're using it. + */ + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + if (apw_state->pid_using_dumpfile == InvalidPid) + apw_state->pid_using_dumpfile = MyProcPid; + else + { + LWLockRelease(&apw_state->lock); + ereport(LOG, + (errmsg("skipping prewarm because block dump file is being written by PID %d", + apw_state->pid_using_dumpfile))); + return; + } + LWLockRelease(&apw_state->lock); + + /* + * Open the block dump file. Exit quietly if it doesn't exist, but report + * any other error. + */ + file = AllocateFile(AUTOPREWARM_FILE, "r"); + if (!file) + { + if (errno == ENOENT) + { + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + apw_state->pid_using_dumpfile = InvalidPid; + LWLockRelease(&apw_state->lock); + return; /* No file to load. */ + } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + AUTOPREWARM_FILE))); + } + + /* First line of the file is a record count. 
*/ + if (fscanf(file, "<<" INT64_FORMAT ">>\n", &num_elements) != 1) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": %m", + AUTOPREWARM_FILE))); + + /* Allocate a dynamic shared memory segment to store the record data. */ + seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0); + blkinfo = (BlockInfoRecord *) dsm_segment_address(seg); + + /* Read records, one per line. */ + for (i = 0; i < num_elements; i++) + { + unsigned forknum; + + if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database, + &blkinfo[i].tablespace, &blkinfo[i].filenode, + &forknum, &blkinfo[i].blocknum) != 5) + ereport(ERROR, + (errmsg("autoprewarm block dump file is corrupted at line " INT64_FORMAT, + i + 1))); + blkinfo[i].forknum = forknum; + } + + FreeFile(file); + + /* Sort the blocks to be loaded. */ + pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord), + apw_compare_blockinfo); + + /* Populate shared memory state. */ + apw_state->block_info_handle = dsm_segment_handle(seg); + apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0; + apw_state->prewarmed_blocks = 0; + + /* Get the info position of the first block of the next database. */ + while (apw_state->prewarm_start_idx < num_elements) + { + uint32 i = apw_state->prewarm_start_idx; + Oid current_db = blkinfo[i].database; + + /* + * Advance the prewarm_stop_idx to the first BlockInfoRecord that does + * not belong to this database. The block-local i used here shadows + * the function-level loop counter of the same name. + */ + i++; + while (i < num_elements) + { + if (current_db != blkinfo[i].database) + { + /* + * Combine BlockInfoRecords for global objects with those of + * the database. + */ + if (current_db != InvalidOid) + break; + current_db = blkinfo[i].database; + } + + i++; + } + + /* + * If we reach this point with current_db == InvalidOid, then only + * BlockInfoRecords belonging to global objects exist. We can't + * prewarm without a database connection, so just bail out. 
+ */ + if (current_db == InvalidOid) + break; + + /* Configure stop point and database for next per-database worker. */ + apw_state->prewarm_stop_idx = i; + apw_state->database = current_db; + Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx); + + /* If we've run out of free buffers, don't launch another worker. */ + if (!have_free_buffer()) + break; + + /* + * Start a per-database worker to load blocks for this database; this + * function will return once the per-database worker exits. + */ + apw_start_database_worker(); + + /* Prepare for next database. */ + apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx; + } + + /* Clean up. */ + dsm_detach(seg); + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + apw_state->block_info_handle = DSM_HANDLE_INVALID; + apw_state->pid_using_dumpfile = InvalidPid; + LWLockRelease(&apw_state->lock); + + /* Report our success. */ + ereport(LOG, + (errmsg("autoprewarm successfully prewarmed " INT64_FORMAT + " of " INT64_FORMAT " previously-loaded blocks", + apw_state->prewarmed_blocks, num_elements))); +} + +/* + * Prewarm all blocks for one database (and possibly also global objects, if + * those got grouped with this database). + */ +void +autoprewarm_database_main(Datum main_arg) +{ + uint32 pos; + BlockInfoRecord *block_info; + Relation rel = NULL; + BlockNumber nblocks = 0; + BlockInfoRecord *old_blk = NULL; + dsm_segment *seg; + + /* Establish signal handlers; once that's done, unblock signals. */ + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* Connect to correct database and get block information. 
*/ + apw_init_shmem(); + seg = dsm_attach(apw_state->block_info_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid); + block_info = (BlockInfoRecord *) dsm_segment_address(seg); + pos = apw_state->prewarm_start_idx; + + /* + * Loop until we run out of blocks to prewarm or until we run out of free + * buffers. + */ + while (pos < apw_state->prewarm_stop_idx && have_free_buffer()) + { + BlockInfoRecord *blk = &block_info[pos++]; + Buffer buf; + + CHECK_FOR_INTERRUPTS(); + + /* + * Quit if we've reached records for another database. If previous + * blocks are of some global objects, then continue pre-warming. + */ + if (old_blk != NULL && old_blk->database != blk->database && + old_blk->database != 0) + break; + + /* + * As soon as we encounter a block of a new relation, close the old + * relation. Note that rel will be NULL if try_relation_open failed + * previously; in that case, there is nothing to close. + */ + if (old_blk != NULL && old_blk->filenode != blk->filenode && + rel != NULL) + { + relation_close(rel, AccessShareLock); + rel = NULL; + CommitTransactionCommand(); + } + + /* + * Try to open each new relation, but only once, when we first + * encounter it. If it's been dropped, skip the associated blocks. + */ + if (old_blk == NULL || old_blk->filenode != blk->filenode) + { + Oid reloid; + + Assert(rel == NULL); + StartTransactionCommand(); + reloid = RelidByRelfilenode(blk->tablespace, blk->filenode); + if (OidIsValid(reloid)) + rel = try_relation_open(reloid, AccessShareLock); + + if (!rel) + CommitTransactionCommand(); + } + if (!rel) + { + old_blk = blk; + continue; + } + + /* Once per fork, check for fork existence and size. 
*/ + if (old_blk == NULL || + old_blk->filenode != blk->filenode || + old_blk->forknum != blk->forknum) + { + RelationOpenSmgr(rel); + + /* + * smgrexists is not safe for illegal forknum, hence check whether + * the passed forknum is valid before using it in smgrexists. + */ + if (blk->forknum > InvalidForkNumber && + blk->forknum <= MAX_FORKNUM && + smgrexists(rel->rd_smgr, blk->forknum)) + nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum); + else + nblocks = 0; + } + + /* Check whether blocknum is valid and within fork file size. */ + if (blk->blocknum >= nblocks) + { + /* Move to next forknum. */ + old_blk = blk; + continue; + } + + /* Prewarm buffer. */ + buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, + NULL); + if (BufferIsValid(buf)) + { + apw_state->prewarmed_blocks++; + ReleaseBuffer(buf); + } + + old_blk = blk; + } + + dsm_detach(seg); + + /* Release lock on previous relation. */ + if (rel) + { + relation_close(rel, AccessShareLock); + CommitTransactionCommand(); + } +} + +/* + * Dump information on blocks in shared buffers. We use a text format here + * so that it's easy to understand and even change the file contents if + * necessary. 
+ */ +static int64 +apw_dump_now(bool is_bgworker, bool dump_unlogged) +{ + uint32 i; + int ret; + int64 num_blocks; + BlockInfoRecord *block_info_array; + BufferDesc *bufHdr; + FILE *file; + char transient_dump_file_path[MAXPGPATH]; + pid_t pid; + + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + pid = apw_state->pid_using_dumpfile; + if (apw_state->pid_using_dumpfile == InvalidPid) + apw_state->pid_using_dumpfile = MyProcPid; + LWLockRelease(&apw_state->lock); + + if (pid != InvalidPid) + { + if (!is_bgworker) + ereport(ERROR, + (errmsg("could not perform block dump because dump file is being used by PID %d", + apw_state->pid_using_dumpfile))); + + ereport(LOG, + (errmsg("skipping block dump because it is already being performed by PID %d", + apw_state->pid_using_dumpfile))); + return 0; + } + + block_info_array = + (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers); + + for (num_blocks = 0, i = 0; i < NBuffers; i++) + { + uint32 buf_state; + + CHECK_FOR_INTERRUPTS(); + + bufHdr = GetBufferDescriptor(i); + + /* Lock each buffer header before inspecting. */ + buf_state = LockBufHdr(bufHdr); + + /* + * Unlogged tables will be automatically truncated after a crash or + * unclean shutdown. In such cases we need not prewarm them. Dump them + * only if requested by caller. 
+ */ + if (buf_state & BM_TAG_VALID && + ((buf_state & BM_PERMANENT) || dump_unlogged)) + { + block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode; + block_info_array[num_blocks].tablespace = bufHdr->tag.rnode.spcNode; + block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode; + block_info_array[num_blocks].forknum = bufHdr->tag.forkNum; + block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum; + ++num_blocks; + } + + UnlockBufHdr(bufHdr, buf_state); + } + + snprintf(transient_dump_file_path, MAXPGPATH, "%s.tmp", AUTOPREWARM_FILE); + file = AllocateFile(transient_dump_file_path, "w"); + if (!file) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + transient_dump_file_path))); + + ret = fprintf(file, "<<" INT64_FORMAT ">>\n", num_blocks); + if (ret < 0) + { + int save_errno = errno; + + FreeFile(file); + unlink(transient_dump_file_path); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\" : %m", + transient_dump_file_path))); + } + + for (i = 0; i < num_blocks; i++) + { + CHECK_FOR_INTERRUPTS(); + + ret = fprintf(file, "%u,%u,%u,%u,%u\n", + block_info_array[i].database, + block_info_array[i].tablespace, + block_info_array[i].filenode, + (uint32) block_info_array[i].forknum, + block_info_array[i].blocknum); + if (ret < 0) + { + int save_errno = errno; + + FreeFile(file); + unlink(transient_dump_file_path); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\" : %m", + transient_dump_file_path))); + } + } + + pfree(block_info_array); + + /* + * Rename transient_dump_file_path to AUTOPREWARM_FILE to make things + * permanent. 
+ */ + ret = FreeFile(file); + if (ret != 0) + { + int save_errno = errno; + + unlink(transient_dump_file_path); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\" : %m", + transient_dump_file_path))); + } + + (void) durable_rename(transient_dump_file_path, AUTOPREWARM_FILE, ERROR); + apw_state->pid_using_dumpfile = InvalidPid; + + ereport(DEBUG1, + (errmsg("wrote block details for " INT64_FORMAT " blocks", + num_blocks))); + return num_blocks; +} + +/* + * SQL-callable function to launch autoprewarm. + */ +Datum +autoprewarm_start_worker(PG_FUNCTION_ARGS) +{ + pid_t pid; + + if (!autoprewarm) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("autoprewarm is disabled"))); + + apw_init_shmem(); + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + pid = apw_state->bgworker_pid; + LWLockRelease(&apw_state->lock); + + if (pid != InvalidPid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("autoprewarm worker is already running under PID %d", + pid))); + + apw_start_master_worker(); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable function to perform an immediate block dump. + */ +Datum +autoprewarm_dump_now(PG_FUNCTION_ARGS) +{ + int64 num_blocks; + + apw_init_shmem(); + + PG_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0); + { + num_blocks = apw_dump_now(false, true); + } + PG_END_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0); + + PG_RETURN_INT64(num_blocks); +} + +/* + * Allocate and initialize autoprewarm related shared memory, if not already + * done, and set up backend-local pointer to that state. Returns true if an + * existing shared memory segment was found. + */ +static bool +apw_init_shmem(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + apw_state = ShmemInitStruct("autoprewarm", + sizeof(AutoPrewarmSharedState), + &found); + if (!found) + { + /* First time through ... 
*/ + LWLockInitialize(&apw_state->lock, LWLockNewTrancheId()); + apw_state->bgworker_pid = InvalidPid; + apw_state->pid_using_dumpfile = InvalidPid; + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +/* + * Clear our PID from autoprewarm shared state. + */ +static void +apw_detach_shmem(int code, Datum arg) +{ + LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE); + if (apw_state->pid_using_dumpfile == MyProcPid) + apw_state->pid_using_dumpfile = InvalidPid; + if (apw_state->bgworker_pid == MyProcPid) + apw_state->bgworker_pid = InvalidPid; + LWLockRelease(&apw_state->lock); +} + +/* + * Start autoprewarm master worker process. + */ +static void +apw_start_master_worker(void) +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle; + BgwHandleStatus status; + pid_t pid; + + MemSet(&worker, 0, sizeof(BackgroundWorker)); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + strcpy(worker.bgw_library_name, "pg_prewarm"); + strcpy(worker.bgw_function_name, "autoprewarm_main"); + strcpy(worker.bgw_name, "autoprewarm"); + + if (process_shared_preload_libraries_in_progress) + { + RegisterBackgroundWorker(&worker); + return; + } + + /* must set notify PID to wait for startup */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("could not register background process"), + errhint("You may need to increase max_worker_processes."))); + + status = WaitForBackgroundWorkerStartup(handle, &pid); + if (status != BGWH_STARTED) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("could not start background process"), + errhint("More details may be available in the server log."))); +} + +/* + * Start autoprewarm per-database worker process. 
+ */ +static void +apw_start_database_worker(void) +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle; + + MemSet(&worker, 0, sizeof(BackgroundWorker)); + worker.bgw_flags = + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + strcpy(worker.bgw_library_name, "pg_prewarm"); + strcpy(worker.bgw_function_name, "autoprewarm_database_main"); + strcpy(worker.bgw_name, "autoprewarm"); + + /* must set notify PID to wait for shutdown */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("registering dynamic bgworker autoprewarm failed"), + errhint("Consider increasing configuration parameter \"max_worker_processes\"."))); + + /* + * Ignore return value; if it fails, postmaster has died, but we have + * checks for that elsewhere. + */ + WaitForBackgroundWorkerShutdown(handle); +} + +/* Compare member elements to check whether they are not equal. */ +#define cmp_member_elem(fld) \ +do { \ + if (a->fld < b->fld) \ + return -1; \ + else if (a->fld > b->fld) \ + return 1; \ +} while(0); + +/* + * apw_compare_blockinfo + * + * We depend on all records for a particular database being consecutive + * in the dump file; each per-database worker will preload blocks until + * it sees a block for some other database. Sorting by tablespace, + * filenode, forknum, and blocknum isn't critical for correctness, but + * helps us get a sequential I/O pattern. 
+ */ +static int +apw_compare_blockinfo(const void *p, const void *q) +{ + BlockInfoRecord *a = (BlockInfoRecord *) p; + BlockInfoRecord *b = (BlockInfoRecord *) q; + + cmp_member_elem(database); + cmp_member_elem(tablespace); + cmp_member_elem(filenode); + cmp_member_elem(forknum); + cmp_member_elem(blocknum); + + return 0; +} + +/* + * Signal handler for SIGTERM + */ +static void +apw_sigterm_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_sigterm = true; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} + +/* + * Signal handler for SIGHUP + */ +static void +apw_sighup_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_sighup = true; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} diff --git a/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql new file mode 100644 index 0000000000..2381c06eb9 --- /dev/null +++ b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql @@ -0,0 +1,14 @@ +/* contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_prewarm UPDATE TO '1.2'" to load this file. 
\quit + +CREATE FUNCTION autoprewarm_start_worker() +RETURNS VOID STRICT +AS 'MODULE_PATHNAME', 'autoprewarm_start_worker' +LANGUAGE C; + +CREATE FUNCTION autoprewarm_dump_now() +RETURNS pg_catalog.int8 STRICT +AS 'MODULE_PATHNAME', 'autoprewarm_dump_now' +LANGUAGE C; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index cf2fb92bed..40e3add481 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -1,5 +1,5 @@ # pg_prewarm extension comment = 'prewarm relation data' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true diff --git a/doc/src/sgml/pgprewarm.sgml b/doc/src/sgml/pgprewarm.sgml index c090401eca..c6b94a8b72 100644 --- a/doc/src/sgml/pgprewarm.sgml +++ b/doc/src/sgml/pgprewarm.sgml @@ -10,7 +10,13 @@ The pg_prewarm module provides a convenient way to load relation data into either the operating system buffer cache - or the PostgreSQL buffer cache. + or the PostgreSQL buffer cache. Prewarming + can be performed manually using the pg_prewarm function, + or can be performed automatically by including pg_prewarm in + . In the latter case, the + system will run a background worker which periodically records the contents + of shared buffers in a file called autoprewarm.blocks and + will, using 2 background workers, reload those same blocks after a restart. @@ -55,6 +61,67 @@ pg_prewarm(regclass, mode text default 'buffer', fork text default 'main', cache. For these reasons, prewarming is typically most useful at startup, when caches are largely empty. + + +autoprewarm_start_worker() RETURNS void + + + + Launch the main autoprewarm worker. This will normally happen + automatically, but is useful if automatic prewarm was not configured at + server startup time and you wish to start up the worker at a later time. + + + +autoprewarm_dump_now() RETURNS int8 + + + + Update autoprewarm.blocks immediately. 
This may be useful + if the autoprewarm worker is not running but you anticipate running it + after the next restart. The return value is the number of records written + to autoprewarm.blocks. + + + + + Configuration Parameters + + + + + pg_prewarm.autoprewarm (boolean) + + pg_prewarm.autoprewarm configuration parameter + + + + + Controls whether the server should run the autoprewarm worker. This is + on by default. This parameter can only be set at server start. + + + + + + + + + pg_prewarm.autoprewarm_interval (int) + + pg_prewarm.autoprewarm_interval configuration parameter + + + + + This is the interval between updates to autoprewarm.blocks. + The default is 300 seconds. If set to 0, the file will not be + dumped at regular intervals, but only when the server is shut down. + + + + + diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 9d8ae6ae8e..f033323cff 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -168,6 +168,23 @@ ClockSweepTick(void) return victim; } +/* + * have_free_buffer -- a lockless check to see if there is a free buffer in + * buffer pool. + * + * If the result is true that will become stale once free buffers are moved out + * by other operations, so the caller who strictly want to use a free buffer + * should not call this. 
+ */ +bool +have_free_buffer(void) +{ + if (StrategyControl->firstFreeBuffer >= 0) + return true; + else + return false; +} + /* * StrategyGetBuffer * diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index b768b6fc96..300adfcf9e 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -317,6 +317,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8166d86ca1..a4ace383fa 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -138,6 +138,7 @@ AttrDefault AttrNumber AttributeOpts AuthRequest +AutoPrewarmSharedState AutoVacOpts AutoVacuumShmemStruct AutoVacuumWorkItem @@ -218,6 +219,7 @@ BlobInfo Block BlockId BlockIdData +BlockInfoRecord BlockNumber BlockSampler BlockSamplerData