Refactor the fsync queue for wider use.

Previously, md.c and checkpointer.c were tightly integrated so that
fsync calls could be handed off and processed in the background.
Introduce a system of callbacks and file tags, so that other modules
can hand off fsync work in the same way.

For now only md.c uses the new interface, but other users are being
proposed.  Since there may be use cases that are not strictly SMGR
implementations, use a new function table for sync handlers rather
than extending the traditional SMGR one.

Instead of using a bitmapset of segment numbers for each RelFileNode
in the checkpointer's hash table, make the segment number part of the
key.  This requires sending explicit "forget" requests for every
segment individually when relations are dropped, but suits the file
layout schemes of proposed future users better (ie sparse or high
segment numbers).

Author: Shawn Debnath and Thomas Munro
Reviewed-by: Thomas Munro, Andres Freund
Discussion: https://postgr.es/m/CAEepm=2gTANm=e3ARnJT=n0h8hf88wqmaZxk0JYkxw+b21fNrw@mail.gmail.com
This commit is contained in:
Thomas Munro 2019-04-04 21:56:03 +13:00
parent 33215d113d
commit 3eb77eba5a
18 changed files with 904 additions and 890 deletions

View File

@ -98,6 +98,7 @@
#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"

View File

@ -50,6 +50,7 @@
#include "storage/fd.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"

View File

@ -66,6 +66,7 @@
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "storage/sync.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
@ -6981,7 +6982,7 @@ StartupXLOG(void)
if (ArchiveRecoveryRequested && IsUnderPostmaster)
{
PublishStartupProcessInformation();
SetForwardFsyncRequests();
EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
@ -8566,7 +8567,7 @@ CreateCheckPoint(int flags)
* the REDO pointer. Note that smgr must not do anything that'd have to
* be undone if we decide no checkpoint is needed.
*/
smgrpreckpt();
SyncPreCheckpoint();
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
@ -8856,7 +8857,7 @@ CreateCheckPoint(int flags)
/*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
smgrpostckpt();
SyncPostCheckpoint();
/*
* Update the average distance between checkpoints if the prior checkpoint

View File

@ -54,6 +54,7 @@
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/acl.h"
@ -941,11 +942,11 @@ dropdb(const char *dbname, bool missing_ok)
* worse, it will delete files that belong to a newly created database
* with the same OID.
*/
ForgetDatabaseFsyncRequests(db_id);
ForgetDatabaseSyncRequests(db_id);
/*
* Force a checkpoint to make sure the checkpointer has received the
* message sent by ForgetDatabaseFsyncRequests. On Windows, this also
* message sent by ForgetDatabaseSyncRequests. On Windows, this also
* ensures that background procs don't hold any open files, which would
* cause rmdir() to fail.
*/
@ -2150,7 +2151,7 @@ dbase_redo(XLogReaderState *record)
DropDatabaseBuffers(xlrec->db_id);
/* Also, clean out any fsync requests that might be pending in md.c */
ForgetDatabaseFsyncRequests(xlrec->db_id);
ForgetDatabaseSyncRequests(xlrec->db_id);
/* Clean out the xlog relcache too */
XLogDropDatabase(xlrec->db_id);

View File

@ -108,10 +108,8 @@
*/
typedef struct
{
RelFileNode rnode;
ForkNumber forknum;
BlockNumber segno; /* see md.c for special values */
/* might add a real request-type field later; not needed yet */
SyncRequestType type; /* request type */
FileTag ftag; /* file identifier */
} CheckpointerRequest;
typedef struct
@ -349,7 +347,7 @@ CheckpointerMain(void)
/*
* Process any requests or signals received recently.
*/
AbsorbFsyncRequests();
AbsorbSyncRequests();
if (got_SIGHUP)
{
@ -684,7 +682,7 @@ CheckpointWriteDelay(int flags, double progress)
UpdateSharedMemoryConfig();
}
AbsorbFsyncRequests();
AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
CheckArchiveTimeout();
@ -709,7 +707,7 @@ CheckpointWriteDelay(int flags, double progress)
* operations even when we don't sleep, to prevent overflow of the
* fsync request queue.
*/
AbsorbFsyncRequests();
AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
}
}
@ -1084,7 +1082,7 @@ RequestCheckpoint(int flags)
}
/*
* ForwardFsyncRequest
* ForwardSyncRequest
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
@ -1093,15 +1091,6 @@ RequestCheckpoint(int flags)
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
* This functionality is only supported for regular (not backend-local)
* relations, so the rnode argument is intentionally RelFileNode not
* RelFileNodeBackend.
*
* segno specifies which segment (not block!) of the relation needs to be
* fsync'd. (Since the valid range is much less than BlockNumber, we can
* use high values for special flags; that's all internal to md.c, which
* see for details.)
*
* To avoid holding the lock for longer than necessary, we normally write
* to the requests[] queue without checking for duplicates. The checkpointer
* will have to eliminate dups internally anyway. However, if we discover
@ -1113,7 +1102,7 @@ RequestCheckpoint(int flags)
* let the backend know by returning false.
*/
bool
ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
CheckpointerRequest *request;
bool too_full;
@ -1122,7 +1111,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
return false; /* probably shouldn't even get here */
if (AmCheckpointerProcess())
elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
@ -1151,9 +1140,8 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
/* OK, insert request */
request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
request->rnode = rnode;
request->forknum = forknum;
request->segno = segno;
request->ftag = *ftag;
request->type = type;
/* If queue is more than half full, nudge the checkpointer to empty it */
too_full = (CheckpointerShmem->num_requests >=
@ -1284,8 +1272,8 @@ CompactCheckpointerRequestQueue(void)
}
/*
* AbsorbFsyncRequests
* Retrieve queued fsync requests and pass them to local smgr.
* AbsorbSyncRequests
* Retrieve queued sync requests and pass them to sync mechanism.
*
* This is exported because it must be called during CreateCheckPoint;
* we have to be sure we have accepted all pending requests just before
@ -1293,7 +1281,7 @@ CompactCheckpointerRequestQueue(void)
* non-checkpointer processes, do nothing if not checkpointer.
*/
void
AbsorbFsyncRequests(void)
AbsorbSyncRequests(void)
{
CheckpointerRequest *requests = NULL;
CheckpointerRequest *request;
@ -1335,7 +1323,7 @@ AbsorbFsyncRequests(void)
LWLockRelease(CheckpointerCommLock);
for (request = requests; n > 0; request++, n--)
RememberFsyncRequest(request->rnode, request->forknum, request->segno);
RememberSyncRequest(&request->ftag, request->type);
END_CRIT_SECTION();

View File

@ -8,6 +8,6 @@ subdir = src/backend/storage
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
SUBDIRS = buffer file freespace ipc large_object lmgr page smgr
SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
include $(top_srcdir)/src/backend/common.mk

View File

@ -2584,7 +2584,7 @@ CheckPointBuffers(int flags)
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
smgrsync();
ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}

View File

@ -29,45 +29,17 @@
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/md.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"
/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
#define FSYNCS_PER_ABSORB 10
#define UNLINKS_PER_ABSORB 10
/*
* Special values for the segno arg to RememberFsyncRequest.
*
* Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
* fsync request from the queue if an identical, subsequent request is found.
* See comments there before making changes here.
*/
#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
/*
* On Windows, we have to interpret EACCES as possibly meaning the same as
* ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
* that's what you get. Ugh. This code is designed so that we don't
* actually believe these cases are okay without further evidence (namely,
* a pending fsync request getting canceled ... see mdsync).
*/
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
#endif
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
@ -115,49 +87,15 @@ typedef struct _MdfdVec
static MemoryContext MdCxt; /* context for all MdfdVec objects */
/*
* In some contexts (currently, standalone backends and the checkpointer)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
* table remembers the pending operations. We use a hash table mostly as
* a convenient way of merging duplicate requests.
*
* We use a similar mechanism to remember no-longer-needed files that can
* be deleted after the next checkpoint, but we use a linked list instead of
* a hash table, because we don't expect there to be any duplicate requests.
*
* These mechanisms are only used for non-temp relations; we never fsync
* temp rels, nor do we need to postpone their deletion (see comments in
* mdunlink).
*
* (Regular backends do not track pending operations locally, but forward
* them to the checkpointer.)
*/
typedef uint16 CycleCtr; /* can be any convenient integer size */
typedef struct
{
RelFileNode rnode; /* hash table key (must be first!) */
CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */
/* requests[f] has bit n set if we need to fsync segment n of fork f */
Bitmapset *requests[MAX_FORKNUM + 1];
/* canceled[f] is true if we canceled fsyncs for fork "recently" */
bool canceled[MAX_FORKNUM + 1];
} PendingOperationEntry;
typedef struct
{
RelFileNode rnode; /* the dead relation to delete */
CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;
static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
/* Populate a file tag describing an md.c segment file. */
#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rnode = (xx_rnode), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
/*** behavior for mdopen & _mdfd_getseg ***/
@ -185,7 +123,10 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
static void register_unlink(RelFileNodeBackend rnode);
static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
BlockNumber segno);
static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
BlockNumber segno);
static void _fdvec_resize(SMgrRelation reln,
ForkNumber forknum,
int nseg);
@ -208,64 +149,6 @@ mdinit(void)
MdCxt = AllocSetContextCreate(TopMemoryContext,
"MdSmgr",
ALLOCSET_DEFAULT_SIZES);
/*
* Create pending-operations hashtable if we need it. Currently, we need
* it if we are standalone (not under a postmaster) or if we are a startup
* or checkpointer auxiliary process.
*/
if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
{
HASHCTL hash_ctl;
/*
* XXX: The checkpointer needs to add entries to the pending ops table
* when absorbing fsync requests. That is done within a critical
* section, which isn't usually allowed, but we make an exception. It
* means that there's a theoretical possibility that you run out of
* memory while absorbing fsync requests, which leads to a PANIC.
* Fortunately the hash table is small so that's unlikely to happen in
* practice.
*/
pendingOpsCxt = AllocSetContextCreate(MdCxt,
"Pending ops context",
ALLOCSET_DEFAULT_SIZES);
MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
hash_ctl.keysize = sizeof(RelFileNode);
hash_ctl.entrysize = sizeof(PendingOperationEntry);
hash_ctl.hcxt = pendingOpsCxt;
pendingOpsTable = hash_create("Pending Ops Table",
100L,
&hash_ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
pendingUnlinks = NIL;
}
}
/*
* In archive recovery, we rely on checkpointer to do fsyncs, but we will have
* already created the pendingOpsTable during initialization of the startup
* process. Calling this function drops the local pendingOpsTable so that
* subsequent requests will be forwarded to checkpointer.
*/
void
SetForwardFsyncRequests(void)
{
/* Perform any pending fsyncs we may have queued up, then drop table */
if (pendingOpsTable)
{
mdsync();
hash_destroy(pendingOpsTable);
}
pendingOpsTable = NULL;
/*
* We should not have any pending unlink requests, since mdunlink doesn't
* queue unlink requests when isRedo.
*/
Assert(pendingUnlinks == NIL);
}
/*
@ -380,16 +263,6 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
/*
* We have to clean out any pending fsync requests for the doomed
* relation, else the next mdsync() will fail. There can't be any such
* requests for a temp relation, though. We can send just one request
* even when deleting multiple forks, since the fsync queuing code accepts
* the "InvalidForkNumber = all forks" convention.
*/
if (!RelFileNodeBackendIsTemp(rnode))
ForgetRelationFsyncRequests(rnode.node, forkNum);
/* Now do the per-fork work */
if (forkNum == InvalidForkNumber)
{
@ -413,6 +286,11 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
*/
if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
{
/* First, forget any pending sync requests for the first segment */
if (!RelFileNodeBackendIsTemp(rnode))
register_forget_request(rnode, forkNum, 0 /* first seg */ );
/* Next unlink the file */
ret = unlink(path);
if (ret < 0 && errno != ENOENT)
ereport(WARNING,
@ -442,7 +320,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
errmsg("could not truncate file \"%s\": %m", path)));
/* Register request to unlink first segment later */
register_unlink(rnode);
register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
}
/*
@ -459,6 +337,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
*/
for (segno = 1;; segno++)
{
/*
* Forget any pending sync requests for this segment before we try
* to unlink.
*/
if (!RelFileNodeBackendIsTemp(rnode))
register_forget_request(rnode, forkNum, segno);
sprintf(segpath, "%s.%u", path, segno);
if (unlink(segpath) < 0)
{
@ -1003,413 +888,27 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
}
}
/*
* mdsync() -- Sync previous writes to stable storage.
*/
void
mdsync(void)
{
static bool mdsync_in_progress = false;
HASH_SEQ_STATUS hstat;
PendingOperationEntry *entry;
int absorb_counter;
/* Statistics on sync times */
int processed = 0;
instr_time sync_start,
sync_end,
sync_diff;
uint64 elapsed;
uint64 longest = 0;
uint64 total_elapsed = 0;
/*
* This is only called during checkpoints, and checkpoints should only
* occur in processes that have created a pendingOpsTable.
*/
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
* If we are in the checkpointer, the sync had better include all fsync
* requests that were queued by backends up to this point. The tightest
* race condition that could occur is that a buffer that must be written
* and fsync'd for the checkpoint could have been dumped by a backend just
* before it was visited by BufferSync(). We know the backend will have
* queued an fsync request before clearing the buffer's dirtybit, so we
* are safe as long as we do an Absorb after completing BufferSync().
*/
AbsorbFsyncRequests();
/*
* To avoid excess fsync'ing (in the worst case, maybe a never-terminating
* checkpoint), we want to ignore fsync requests that are entered into the
* hashtable after this point --- they should be processed next time,
* instead. We use mdsync_cycle_ctr to tell old entries apart from new
* ones: new ones will have cycle_ctr equal to the incremented value of
* mdsync_cycle_ctr.
*
* In normal circumstances, all entries present in the table at this point
* will have cycle_ctr exactly equal to the current (about to be old)
* value of mdsync_cycle_ctr. However, if we fail partway through the
* fsync'ing loop, then older values of cycle_ctr might remain when we
* come back here to try again. Repeated checkpoint failures would
* eventually wrap the counter around to the point where an old entry
* might appear new, causing us to skip it, possibly allowing a checkpoint
* to succeed that should not have. To forestall wraparound, any time the
* previous mdsync() failed to complete, run through the table and
* forcibly set cycle_ctr = mdsync_cycle_ctr.
*
* Think not to merge this loop with the main loop, as the problem is
* exactly that that loop may fail before having visited all the entries.
* From a performance point of view it doesn't matter anyway, as this path
* will never be taken in a system that's functioning normally.
*/
if (mdsync_in_progress)
{
/* prior try failed, so update any stale cycle_ctr values */
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
entry->cycle_ctr = mdsync_cycle_ctr;
}
}
/* Advance counter so that new hashtable entries are distinguishable */
mdsync_cycle_ctr++;
/* Set flag to detect failure if we don't reach the end of the loop */
mdsync_in_progress = true;
/* Now scan the hashtable for fsync requests to process */
absorb_counter = FSYNCS_PER_ABSORB;
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
ForkNumber forknum;
/*
* If the entry is new then don't process it this time; it might
* contain multiple fsync-request bits, but they are all new. Note
* "continue" bypasses the hash-remove call at the bottom of the loop.
*/
if (entry->cycle_ctr == mdsync_cycle_ctr)
continue;
/* Else assert we haven't missed it */
Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
/*
* Scan over the forks and segments represented by the entry.
*
* The bitmap manipulations are slightly tricky, because we can call
* AbsorbFsyncRequests() inside the loop and that could result in
* bms_add_member() modifying and even re-palloc'ing the bitmapsets.
* So we detach it, but if we fail we'll merge it with any new
* requests that have arrived in the meantime.
*/
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
Bitmapset *requests = entry->requests[forknum];
int segno;
entry->requests[forknum] = NULL;
entry->canceled[forknum] = false;
segno = -1;
while ((segno = bms_next_member(requests, segno)) >= 0)
{
int failures;
/*
* If fsync is off then we don't have to bother opening the
* file at all. (We delay checking until this point so that
* changing fsync on the fly behaves sensibly.)
*/
if (!enableFsync)
continue;
/*
* If in checkpointer, we want to absorb pending requests
* every so often to prevent overflow of the fsync request
* queue. It is unspecified whether newly-added entries will
* be visited by hash_seq_search, but we don't care since we
* don't need to process them anyway.
*/
if (--absorb_counter <= 0)
{
AbsorbFsyncRequests();
absorb_counter = FSYNCS_PER_ABSORB;
}
/*
* The fsync table could contain requests to fsync segments
* that have been deleted (unlinked) by the time we get to
* them. Rather than just hoping an ENOENT (or EACCES on
* Windows) error can be ignored, what we do on error is
* absorb pending requests and then retry. Since mdunlink()
* queues a "cancel" message before actually unlinking, the
* fsync request is guaranteed to be marked canceled after the
* absorb if it really was this case. DROP DATABASE likewise
* has to tell us to forget fsync requests before it starts
* deletions.
*/
for (failures = 0;; failures++) /* loop exits at "break" */
{
SMgrRelation reln;
MdfdVec *seg;
char *path;
int save_errno;
/*
* Find or create an smgr hash entry for this relation.
* This may seem a bit unclean -- md calling smgr? But
* it's really the best solution. It ensures that the
* open file reference isn't permanently leaked if we get
* an error here. (You may say "but an unreferenced
* SMgrRelation is still a leak!" Not really, because the
* only case in which a checkpoint is done by a process
* that isn't about to shut down is in the checkpointer,
* and it will periodically do smgrcloseall(). This fact
* justifies our not closing the reln in the success path
* either, which is a good thing since in non-checkpointer
* cases we couldn't safely do that.)
*/
reln = smgropen(entry->rnode, InvalidBackendId);
/* Attempt to open and fsync the target segment */
seg = _mdfd_getseg(reln, forknum,
(BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
false,
EXTENSION_RETURN_NULL
| EXTENSION_DONT_CHECK_SIZE);
INSTR_TIME_SET_CURRENT(sync_start);
if (seg != NULL &&
FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
{
/* Success; update statistics about sync timing */
INSTR_TIME_SET_CURRENT(sync_end);
sync_diff = sync_end;
INSTR_TIME_SUBTRACT(sync_diff, sync_start);
elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
if (elapsed > longest)
longest = elapsed;
total_elapsed += elapsed;
processed++;
requests = bms_del_member(requests, segno);
if (log_checkpoints)
elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
processed,
FilePathName(seg->mdfd_vfd),
(double) elapsed / 1000);
break; /* out of retry loop */
}
/* Compute file name for use in message */
save_errno = errno;
path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
errno = save_errno;
/*
* It is possible that the relation has been dropped or
* truncated since the fsync request was entered.
* Therefore, allow ENOENT, but only if we didn't fail
* already on this file. This applies both for
* _mdfd_getseg() and for FileSync, since fd.c might have
* closed the file behind our back.
*
* XXX is there any point in allowing more than one retry?
* Don't see one at the moment, but easy to change the
* test here if so.
*/
if (!FILE_POSSIBLY_DELETED(errno) ||
failures > 0)
{
Bitmapset *new_requests;
/*
* We need to merge these unsatisfied requests with
* any others that have arrived since we started.
*/
new_requests = entry->requests[forknum];
entry->requests[forknum] =
bms_join(new_requests, requests);
errno = save_errno;
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
path)));
}
else
ereport(DEBUG1,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\" but retrying: %m",
path)));
pfree(path);
/*
* Absorb incoming requests and check to see if a cancel
* arrived for this relation fork.
*/
AbsorbFsyncRequests();
absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
if (entry->canceled[forknum])
break;
} /* end retry loop */
}
bms_free(requests);
}
/*
* We've finished everything that was requested before we started to
* scan the entry. If no new requests have been inserted meanwhile,
* remove the entry. Otherwise, update its cycle counter, as all the
* requests now in it must have arrived during this cycle.
*/
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
if (entry->requests[forknum] != NULL)
break;
}
if (forknum <= MAX_FORKNUM)
entry->cycle_ctr = mdsync_cycle_ctr;
else
{
/* Okay to remove it */
if (hash_search(pendingOpsTable, &entry->rnode,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
} /* end loop over hashtable entries */
/* Return sync performance metrics for report at checkpoint end */
CheckpointStats.ckpt_sync_rels = processed;
CheckpointStats.ckpt_longest_sync = longest;
CheckpointStats.ckpt_agg_sync_time = total_elapsed;
/* Flag successful completion of mdsync */
mdsync_in_progress = false;
}
/*
* mdpreckpt() -- Do pre-checkpoint work
*
* To distinguish unlink requests that arrived before this checkpoint
* started from those that arrived during the checkpoint, we use a cycle
* counter similar to the one we use for fsync requests. That cycle
* counter is incremented here.
*
* This must be called *before* the checkpoint REDO point is determined.
* That ensures that we won't delete files too soon.
*
* Note that we can't do anything here that depends on the assumption
* that the checkpoint will be completed.
*/
void
mdpreckpt(void)
{
/*
* Any unlink requests arriving after this point will be assigned the next
* cycle counter, and won't be unlinked until next checkpoint.
*/
mdckpt_cycle_ctr++;
}
/*
* mdpostckpt() -- Do post-checkpoint work
*
* Remove any lingering files that can now be safely removed.
*/
void
mdpostckpt(void)
{
int absorb_counter;
absorb_counter = UNLINKS_PER_ABSORB;
while (pendingUnlinks != NIL)
{
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
char *path;
/*
* New entries are appended to the end, so if the entry is new we've
* reached the end of old entries.
*
* Note: if just the right number of consecutive checkpoints fail, we
* could be fooled here by cycle_ctr wraparound. However, the only
* consequence is that we'd delay unlinking for one more checkpoint,
* which is perfectly tolerable.
*/
if (entry->cycle_ctr == mdckpt_cycle_ctr)
break;
/* Unlink the file */
path = relpathperm(entry->rnode, MAIN_FORKNUM);
if (unlink(path) < 0)
{
/*
* There's a race condition, when the database is dropped at the
* same time that we process the pending unlink requests. If the
* DROP DATABASE deletes the file before we do, we will get ENOENT
* here. rmtree() also has to ignore ENOENT errors, to deal with
* the possibility that we delete the file first.
*/
if (errno != ENOENT)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not remove file \"%s\": %m", path)));
}
pfree(path);
/* And remove the list entry */
pendingUnlinks = list_delete_first(pendingUnlinks);
pfree(entry);
/*
* As in mdsync, we don't want to stop absorbing fsync requests for a
* long time when there are many deletions to be done. We can safely
* call AbsorbFsyncRequests() at this point in the loop (note it might
* try to delete list entries).
*/
if (--absorb_counter <= 0)
{
AbsorbFsyncRequests();
absorb_counter = UNLINKS_PER_ABSORB;
}
}
}
/*
* register_dirty_segment() -- Mark a relation segment as needing fsync
*
* If there is a local pending-ops table, just make an entry in it for
* mdsync to process later. Otherwise, try to pass off the fsync request
* to the checkpointer process. If that fails, just do the fsync
* locally before returning (we hope this will not happen often enough
* to be a performance problem).
* ProcessSyncRequests to process later. Otherwise, try to pass off the
* fsync request to the checkpointer process. If that fails, just do the
* fsync locally before returning (we hope this will not happen often
* enough to be a performance problem).
*/
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
FileTag tag;
INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
/* Temp relations should never be fsync'd */
Assert(!SmgrIsTemp(reln));
if (pendingOpsTable)
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
{
/* push it into local pending-ops table */
RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
}
else
{
if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
return; /* passed it off successfully */
ereport(DEBUG1,
(errmsg("could not forward fsync request because request queue is full")));
@ -1423,254 +922,51 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
/*
* register_unlink() -- Schedule a file to be deleted after next checkpoint
*
* We don't bother passing in the fork number, because this is only used
* with main forks.
*
* As with register_dirty_segment, this could involve either a local or
* a remote pending-ops table.
*/
static void
register_unlink(RelFileNodeBackend rnode)
register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
BlockNumber segno)
{
FileTag tag;
INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
/* Should never be used with temp relations */
Assert(!RelFileNodeBackendIsTemp(rnode));
if (pendingOpsTable)
{
/* push it into local pending-ops table */
RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
UNLINK_RELATION_REQUEST);
}
else
{
/*
* Notify the checkpointer about it. If we fail to queue the request
* message, we have to sleep and try again, because we can't simply
* delete the file now. Ugly, but hopefully won't happen often.
*
* XXX should we just leave the file orphaned instead?
*/
Assert(IsUnderPostmaster);
while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
UNLINK_RELATION_REQUEST))
pg_usleep(10000L); /* 10 msec seems a good number */
}
RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
}
/*
* RememberFsyncRequest() -- callback from checkpointer side of fsync request
*
* We stuff fsync requests into the local hash table for execution
* during the checkpointer's next checkpoint. UNLINK requests go into a
* separate linked list, however, because they get processed separately.
*
* The range of possible segment numbers is way less than the range of
* BlockNumber, so we can reserve high values of segno for special purposes.
* We define three:
* - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
* either for one fork, or all forks if forknum is InvalidForkNumber
* - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
* - UNLINK_RELATION_REQUEST is a request to delete the file after the next
* checkpoint.
* Note also that we're assuming real segment numbers don't exceed INT_MAX.
*
* (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
* table has to be searched linearly, but dropping a database is a pretty
* heavyweight operation anyhow, so we'll live with it.)
* register_forget_request() -- forget any fsyncs for a relation fork's segment
*/
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
static void
register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
BlockNumber segno)
{
Assert(pendingOpsTable);
FileTag tag;
if (segno == FORGET_RELATION_FSYNC)
{
/* Remove any pending requests for the relation (one or all forks) */
PendingOperationEntry *entry;
INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
&rnode,
HASH_FIND,
NULL);
if (entry)
{
/*
* We can't just delete the entry since mdsync could have an
* active hashtable scan. Instead we delete the bitmapsets; this
* is safe because of the way mdsync is coded. We also set the
* "canceled" flags so that mdsync can tell that a cancel arrived
* for the fork(s).
*/
if (forknum == InvalidForkNumber)
{
/* remove requests for all forks */
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
bms_free(entry->requests[forknum]);
entry->requests[forknum] = NULL;
entry->canceled[forknum] = true;
}
}
else
{
/* remove requests for single fork */
bms_free(entry->requests[forknum]);
entry->requests[forknum] = NULL;
entry->canceled[forknum] = true;
}
}
}
else if (segno == FORGET_DATABASE_FSYNC)
{
/* Remove any pending requests for the entire database */
HASH_SEQ_STATUS hstat;
PendingOperationEntry *entry;
ListCell *cell,
*prev,
*next;
/* Remove fsync requests */
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
if (entry->rnode.dbNode == rnode.dbNode)
{
/* remove requests for all forks */
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
bms_free(entry->requests[forknum]);
entry->requests[forknum] = NULL;
entry->canceled[forknum] = true;
}
}
}
/* Remove unlink requests */
prev = NULL;
for (cell = list_head(pendingUnlinks); cell; cell = next)
{
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
next = lnext(cell);
if (entry->rnode.dbNode == rnode.dbNode)
{
pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
pfree(entry);
}
else
prev = cell;
}
}
else if (segno == UNLINK_RELATION_REQUEST)
{
/* Unlink request: put it in the linked list */
MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
PendingUnlinkEntry *entry;
/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
Assert(forknum == MAIN_FORKNUM);
entry = palloc(sizeof(PendingUnlinkEntry));
entry->rnode = rnode;
entry->cycle_ctr = mdckpt_cycle_ctr;
pendingUnlinks = lappend(pendingUnlinks, entry);
MemoryContextSwitchTo(oldcxt);
}
else
{
/* Normal case: enter a request to fsync this segment */
MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
PendingOperationEntry *entry;
bool found;
entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
&rnode,
HASH_ENTER,
&found);
/* if new entry, initialize it */
if (!found)
{
entry->cycle_ctr = mdsync_cycle_ctr;
MemSet(entry->requests, 0, sizeof(entry->requests));
MemSet(entry->canceled, 0, sizeof(entry->canceled));
}
/*
* NB: it's intentional that we don't change cycle_ctr if the entry
* already exists. The cycle_ctr must represent the oldest fsync
* request that could be in the entry.
*/
entry->requests[forknum] = bms_add_member(entry->requests[forknum],
(int) segno);
MemoryContextSwitchTo(oldcxt);
}
}
/*
* ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
*
* forknum == InvalidForkNumber means all forks, although this code doesn't
* actually know that, since it's just forwarding the request elsewhere.
*/
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
if (pendingOpsTable)
{
/* standalone backend or startup process: fsync state is local */
RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
}
else if (IsUnderPostmaster)
{
/*
* Notify the checkpointer about it. If we fail to queue the cancel
* message, we have to sleep and try again ... ugly, but hopefully
* won't happen often.
*
* XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
* error would leave the no-longer-used file still present on disk,
* which would be bad, so I'm inclined to assume that the checkpointer
* will always empty the queue soon.
*/
while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
pg_usleep(10000L); /* 10 msec seems a good number */
/*
* Note we don't wait for the checkpointer to actually absorb the
* cancel message; see mdsync() for the implications.
*/
}
RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
}
/*
* ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
*/
void
ForgetDatabaseFsyncRequests(Oid dbid)
ForgetDatabaseSyncRequests(Oid dbid)
{
FileTag tag;
RelFileNode rnode;
rnode.dbNode = dbid;
rnode.spcNode = 0;
rnode.relNode = 0;
if (pendingOpsTable)
{
/* standalone backend or startup process: fsync state is local */
RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
}
else if (IsUnderPostmaster)
{
/* see notes in ForgetRelationFsyncRequests */
while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
FORGET_DATABASE_FSYNC))
pg_usleep(10000L); /* 10 msec seems a good number */
}
INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
}
/*
@ -1951,3 +1247,72 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
/* note that this calculation will ignore any partial block at EOF */
return (BlockNumber) (len / BLCKSZ);
}
/*
* Sync a file to disk, given a file tag. Write the path into an output
* buffer so the caller can use it in error messages.
*
* Return 0 on success, -1 on failure, with errno set.
*/
int
mdsyncfiletag(const FileTag *ftag, char *path)
{
SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
MdfdVec *v;
char *p;
/* Provide the path for informational messages. */
p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
strlcpy(path, p, MAXPGPATH);
pfree(p);
/* Try to find open the requested segment. */
v = _mdfd_getseg(reln, ftag->forknum, ftag->segno, false,
EXTENSION_RETURN_NULL);
if (v == NULL)
{
errno = ENOENT;
return -1;
}
/* Try to fsync the file. */
return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC);
}
/*
* Unlink a file, given a file tag. Write the path into an output
* buffer so the caller can use it in error messages.
*
* Return 0 on success, -1 on failure, with errno set.
*/
int
mdunlinkfiletag(const FileTag *ftag, char *path)
{
SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
char *p;
/* Compute the path. */
p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
strlcpy(path, p, MAXPGPATH);
pfree(p);
/* Try to unlink the file. */
return unlink(path);
}
/*
* Check if a given candidate request matches a given tag, when processing
* a SYNC_FILTER_REQUEST request. This will be called for all pending
* requests to find out whether to forget them.
*/
bool
mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
{
/*
* For now we only use filter requests as a way to drop all scheduled
* callbacks relating to a given database, when dropping the database.
* We'll return true for all candidates that have the same database OID as
* the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
*/
return ftag->rnode.dbNode == candidate->rnode.dbNode;
}

View File

@ -21,6 +21,7 @@
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
@ -60,12 +61,8 @@ typedef struct f_smgr
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_pre_ckpt) (void); /* may be NULL */
void (*smgr_sync) (void); /* may be NULL */
void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
static const f_smgr smgrsw[] = {
/* magnetic disk */
{
@ -83,15 +80,11 @@ static const f_smgr smgrsw[] = {
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
.smgr_pre_ckpt = mdpreckpt,
.smgr_sync = mdsync,
.smgr_post_ckpt = mdpostckpt
}
};
static const int NSmgr = lengthof(smgrsw);
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
* In addition, "unowned" SMgrRelation objects are chained together in a list.
@ -705,52 +698,6 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
}
/*
* smgrpreckpt() -- Prepare for checkpoint.
*/
void
smgrpreckpt(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_pre_ckpt)
smgrsw[i].smgr_pre_ckpt();
}
}
/*
* smgrsync() -- Sync files to disk during checkpoint.
*/
void
smgrsync(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_sync)
smgrsw[i].smgr_sync();
}
}
/*
* smgrpostckpt() -- Post-checkpoint cleanup.
*/
void
smgrpostckpt(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_post_ckpt)
smgrsw[i].smgr_post_ckpt();
}
}
/*
* AtEOXact_SMgr
*

View File

@ -0,0 +1,17 @@
#-------------------------------------------------------------------------
#
# Makefile--
# Makefile for storage/sync
#
# IDENTIFICATION
# src/backend/storage/sync/Makefile
#
#-------------------------------------------------------------------------
subdir = src/backend/storage/sync
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = sync.o
include $(top_srcdir)/src/backend/common.mk

View File

@ -0,0 +1,598 @@
/*-------------------------------------------------------------------------
*
* sync.c
* File synchronization management code.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/storage/sync/sync.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "miscadmin.h"
#include "pgstat.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "commands/tablespace.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/inval.h"
static MemoryContext pendingOpsCxt; /* context for the pending ops state */
/*
* In some contexts (currently, standalone backends and the checkpointer)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
* table remembers the pending operations. We use a hash table mostly as
* a convenient way of merging duplicate requests.
*
* We use a similar mechanism to remember no-longer-needed files that can
* be deleted after the next checkpoint, but we use a linked list instead of
* a hash table, because we don't expect there to be any duplicate requests.
*
* These mechanisms are only used for non-temp relations; we never fsync
* temp rels, nor do we need to postpone their deletion (see comments in
* mdunlink).
*
* (Regular backends do not track pending operations locally, but forward
* them to the checkpointer.)
*/
typedef uint16 CycleCtr; /* can be any convenient integer size */
typedef struct
{
FileTag tag; /* identifies handler and file */
CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
bool canceled; /* canceled is true if we canceled "recently" */
} PendingFsyncEntry;
typedef struct
{
FileTag tag; /* identifies handler and file */
CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
} PendingUnlinkEntry;
static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */
static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;
/* Intervals for calling AbsorbFsyncRequests */
#define FSYNCS_PER_ABSORB 10
#define UNLINKS_PER_ABSORB 10
/*
* Function pointers for handling sync and unlink requests.
*/
typedef struct SyncOps
{
int (*sync_syncfiletag) (const FileTag *ftag, char *path);
int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
bool (*sync_filetagmatches) (const FileTag *ftag,
const FileTag *candidate);
} SyncOps;
static const SyncOps syncsw[] = {
/* magnetic disk */
{
.sync_syncfiletag = mdsyncfiletag,
.sync_unlinkfiletag = mdunlinkfiletag,
.sync_filetagmatches = mdfiletagmatches
}
};
/*
* Initialize data structures for the file sync tracking.
*/
void
InitSync(void)
{
/*
* Create pending-operations hashtable if we need it. Currently, we need
* it if we are standalone (not under a postmaster) or if we are a startup
* or checkpointer auxiliary process.
*/
if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
{
HASHCTL hash_ctl;
/*
* XXX: The checkpointer needs to add entries to the pending ops table
* when absorbing fsync requests. That is done within a critical
* section, which isn't usually allowed, but we make an exception. It
* means that there's a theoretical possibility that you run out of
* memory while absorbing fsync requests, which leads to a PANIC.
* Fortunately the hash table is small so that's unlikely to happen in
* practice.
*/
pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
"Pending ops context",
ALLOCSET_DEFAULT_SIZES);
MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
hash_ctl.keysize = sizeof(FileTag);
hash_ctl.entrysize = sizeof(PendingFsyncEntry);
hash_ctl.hcxt = pendingOpsCxt;
pendingOps = hash_create("Pending Ops Table",
100L,
&hash_ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
pendingUnlinks = NIL;
}
}
/*
* SyncPreCheckpoint() -- Do pre-checkpoint work
*
* To distinguish unlink requests that arrived before this checkpoint
* started from those that arrived during the checkpoint, we use a cycle
* counter similar to the one we use for fsync requests. That cycle
* counter is incremented here.
*
* This must be called *before* the checkpoint REDO point is determined.
* That ensures that we won't delete files too soon.
*
* Note that we can't do anything here that depends on the assumption
* that the checkpoint will be completed.
*/
void
SyncPreCheckpoint(void)
{
/*
* Any unlink requests arriving after this point will be assigned the next
* cycle counter, and won't be unlinked until next checkpoint.
*/
checkpoint_cycle_ctr++;
}
/*
* SyncPostCheckpoint() -- Do post-checkpoint work
*
* Remove any lingering files that can now be safely removed.
*/
void
SyncPostCheckpoint(void)
{
int absorb_counter;
absorb_counter = UNLINKS_PER_ABSORB;
while (pendingUnlinks != NIL)
{
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
char path[MAXPGPATH];
/*
* New entries are appended to the end, so if the entry is new we've
* reached the end of old entries.
*
* Note: if just the right number of consecutive checkpoints fail, we
* could be fooled here by cycle_ctr wraparound. However, the only
* consequence is that we'd delay unlinking for one more checkpoint,
* which is perfectly tolerable.
*/
if (entry->cycle_ctr == checkpoint_cycle_ctr)
break;
/* Unlink the file */
if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
path) < 0)
{
/*
* There's a race condition, when the database is dropped at the
* same time that we process the pending unlink requests. If the
* DROP DATABASE deletes the file before we do, we will get ENOENT
* here. rmtree() also has to ignore ENOENT errors, to deal with
* the possibility that we delete the file first.
*/
if (errno != ENOENT)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not remove file \"%s\": %m", path)));
}
/* And remove the list entry */
pendingUnlinks = list_delete_first(pendingUnlinks);
pfree(entry);
/*
* As in ProcessFsyncRequests, we don't want to stop absorbing fsync
* requests for along time when there are many deletions to be done.
* We can safely call AbsorbFsyncRequests() at this point in the loop
* (note it might try to delete list entries).
*/
if (--absorb_counter <= 0)
{
AbsorbSyncRequests();
absorb_counter = UNLINKS_PER_ABSORB;
}
}
}
/*
* ProcessSyncRequests() -- Process queued fsync requests.
*/
void
ProcessSyncRequests(void)
{
static bool sync_in_progress = false;
HASH_SEQ_STATUS hstat;
PendingFsyncEntry *entry;
int absorb_counter;
/* Statistics on sync times */
int processed = 0;
instr_time sync_start,
sync_end,
sync_diff;
uint64 elapsed;
uint64 longest = 0;
uint64 total_elapsed = 0;
/*
* This is only called during checkpoints, and checkpoints should only
* occur in processes that have created a pendingOps.
*/
if (!pendingOps)
elog(ERROR, "cannot sync without a pendingOps table");
/*
* If we are in the checkpointer, the sync had better include all fsync
* requests that were queued by backends up to this point. The tightest
* race condition that could occur is that a buffer that must be written
* and fsync'd for the checkpoint could have been dumped by a backend just
* before it was visited by BufferSync(). We know the backend will have
* queued an fsync request before clearing the buffer's dirtybit, so we
* are safe as long as we do an Absorb after completing BufferSync().
*/
AbsorbSyncRequests();
/*
* To avoid excess fsync'ing (in the worst case, maybe a never-terminating
* checkpoint), we want to ignore fsync requests that are entered into the
* hashtable after this point --- they should be processed next time,
* instead. We use sync_cycle_ctr to tell old entries apart from new
* ones: new ones will have cycle_ctr equal to the incremented value of
* sync_cycle_ctr.
*
* In normal circumstances, all entries present in the table at this point
* will have cycle_ctr exactly equal to the current (about to be old)
* value of sync_cycle_ctr. However, if we fail partway through the
* fsync'ing loop, then older values of cycle_ctr might remain when we
* come back here to try again. Repeated checkpoint failures would
* eventually wrap the counter around to the point where an old entry
* might appear new, causing us to skip it, possibly allowing a checkpoint
* to succeed that should not have. To forestall wraparound, any time the
* previous ProcessFsyncRequests() failed to complete, run through the
* table and forcibly set cycle_ctr = sync_cycle_ctr.
*
* Think not to merge this loop with the main loop, as the problem is
* exactly that that loop may fail before having visited all the entries.
* From a performance point of view it doesn't matter anyway, as this path
* will never be taken in a system that's functioning normally.
*/
if (sync_in_progress)
{
/* prior try failed, so update any stale cycle_ctr values */
hash_seq_init(&hstat, pendingOps);
while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
{
entry->cycle_ctr = sync_cycle_ctr;
}
}
/* Advance counter so that new hashtable entries are distinguishable */
sync_cycle_ctr++;
/* Set flag to detect failure if we don't reach the end of the loop */
sync_in_progress = true;
/* Now scan the hashtable for fsync requests to process */
absorb_counter = FSYNCS_PER_ABSORB;
hash_seq_init(&hstat, pendingOps);
while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
{
int failures;
/*
* If fsync is off then we don't have to bother opening the file at
* all. (We delay checking until this point so that changing fsync on
* the fly behaves sensibly.)
*/
if (!enableFsync)
continue;
/*
* If the entry is new then don't process it this time; it is new.
* Note "continue" bypasses the hash-remove call at the bottom of the
* loop.
*/
if (entry->cycle_ctr == sync_cycle_ctr)
continue;
/* Else assert we haven't missed it */
Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
/*
* If in checkpointer, we want to absorb pending requests every so
* often to prevent overflow of the fsync request queue. It is
* unspecified whether newly-added entries will be visited by
* hash_seq_search, but we don't care since we don't need to process
* them anyway.
*/
if (--absorb_counter <= 0)
{
AbsorbSyncRequests();
absorb_counter = FSYNCS_PER_ABSORB;
}
/*
* The fsync table could contain requests to fsync segments that have
* been deleted (unlinked) by the time we get to them. Rather than
* just hoping an ENOENT (or EACCES on Windows) error can be ignored,
* what we do on error is absorb pending requests and then retry.
* Since mdunlink() queues a "cancel" message before actually
* unlinking, the fsync request is guaranteed to be marked canceled
* after the absorb if it really was this case. DROP DATABASE likewise
* has to tell us to forget fsync requests before it starts deletions.
*/
for (failures = 0; !entry->canceled; failures++)
{
char path[MAXPGPATH];
INSTR_TIME_SET_CURRENT(sync_start);
if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
path) == 0)
{
/* Success; update statistics about sync timing */
INSTR_TIME_SET_CURRENT(sync_end);
sync_diff = sync_end;
INSTR_TIME_SUBTRACT(sync_diff, sync_start);
elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
if (elapsed > longest)
longest = elapsed;
total_elapsed += elapsed;
processed++;
if (log_checkpoints)
elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
processed,
path,
(double) elapsed / 1000);
break; /* out of retry loop */
}
/*
* It is possible that the relation has been dropped or truncated
* since the fsync request was entered. Therefore, allow ENOENT,
* but only if we didn't fail already on this file.
*/
if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
path)));
else
ereport(DEBUG1,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\" but retrying: %m",
path)));
/*
* Absorb incoming requests and check to see if a cancel arrived
* for this relation fork.
*/
AbsorbSyncRequests();
absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
} /* end retry loop */
/* We are done with this entry, remove it */
if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOps corrupted");
} /* end loop over hashtable entries */
/* Return sync performance metrics for report at checkpoint end */
CheckpointStats.ckpt_sync_rels = processed;
CheckpointStats.ckpt_longest_sync = longest;
CheckpointStats.ckpt_agg_sync_time = total_elapsed;
/* Flag successful completion of ProcessSyncRequests */
sync_in_progress = false;
}
/*
* RememberSyncRequest() -- callback from checkpointer side of sync request
*
* We stuff fsync requests into the local hash table for execution
* during the checkpointer's next checkpoint. UNLINK requests go into a
* separate linked list, however, because they get processed separately.
*
* See sync.h for more information on the types of sync requests supported.
*/
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
Assert(pendingOps);
if (type == SYNC_FORGET_REQUEST)
{
PendingFsyncEntry *entry;
/* Cancel previously entered request */
entry = (PendingFsyncEntry *) hash_search(pendingOps,
(void *) ftag,
HASH_FIND,
NULL);
if (entry != NULL)
entry->canceled = true;
}
else if (type == SYNC_FILTER_REQUEST)
{
HASH_SEQ_STATUS hstat;
PendingFsyncEntry *entry;
ListCell *cell,
*prev,
*next;
/* Cancel matching fsync requests */
hash_seq_init(&hstat, pendingOps);
while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
{
if (entry->tag.handler == ftag->handler &&
syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
entry->canceled = true;
}
/* Remove matching unlink requests */
prev = NULL;
for (cell = list_head(pendingUnlinks); cell; cell = next)
{
PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
next = lnext(cell);
if (entry->tag.handler == ftag->handler &&
syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
{
pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
pfree(entry);
}
else
prev = cell;
}
}
else if (type == SYNC_UNLINK_REQUEST)
{
/* Unlink request: put it in the linked list */
MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
PendingUnlinkEntry *entry;
entry = palloc(sizeof(PendingUnlinkEntry));
entry->tag = *ftag;
entry->cycle_ctr = checkpoint_cycle_ctr;
pendingUnlinks = lappend(pendingUnlinks, entry);
MemoryContextSwitchTo(oldcxt);
}
else
{
/* Normal case: enter a request to fsync this segment */
MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
PendingFsyncEntry *entry;
bool found;
Assert(type == SYNC_REQUEST);
entry = (PendingFsyncEntry *) hash_search(pendingOps,
(void *) ftag,
HASH_ENTER,
&found);
/* if new entry, initialize it */
if (!found)
{
entry->cycle_ctr = sync_cycle_ctr;
entry->canceled = false;
}
/*
* NB: it's intentional that we don't change cycle_ctr if the entry
* already exists. The cycle_ctr must represent the oldest fsync
* request that could be in the entry.
*/
MemoryContextSwitchTo(oldcxt);
}
}
/*
* Register the sync request locally, or forward it to the checkpointer.
*
* If retryOnError is true, we'll keep trying if there is no space in the
* queue. Return true if we succeeded, or false if there wasn't space.
*/
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
bool retryOnError)
{
bool ret;
if (pendingOps != NULL)
{
/* standalone backend or startup process: fsync state is local */
RememberSyncRequest(ftag, type);
return true;
}
for (;;)
{
/*
* Notify the checkpointer about it. If we fail to queue a message
* in retryOnError mode, we have to sleep and try again ... ugly, but
* hopefully won't happen often.
*
* XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
* error in the case of SYNC_UNLINK_REQUEST would leave the
* no-longer-used file still present on disk, which would be bad, so
* I'm inclined to assume that the checkpointer will always empty the
* queue soon.
*/
ret = ForwardSyncRequest(ftag, type);
/*
* If we are successful in queueing the request, or we failed and were
* instructed not to retry on error, break.
*/
if (ret || (!ret && !retryOnError))
break;
pg_usleep(10000L);
}
return ret;
}
/*
* In archive recovery, we rely on checkpointer to do fsyncs, but we will have
* already created the pendingOps during initialization of the startup
* process. Calling this function drops the local pendingOps so that
* subsequent requests will be forwarded to checkpointer.
*/
void
EnableSyncRequestForwarding(void)
{
/* Perform any pending fsyncs we may have queued up, then drop table */
if (pendingOps)
{
ProcessSyncRequests();
hash_destroy(pendingOps);
}
pendingOps = NULL;
/*
* We should not have any pending unlink requests, since mdunlink doesn't
* queue unlink requests when isRedo.
*/
Assert(pendingUnlinks == NIL);
}

View File

@ -51,6 +51,7 @@
#include "storage/proc.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
@ -557,6 +558,7 @@ BaseInit(void)
/* Do local initialization of file, storage and buffer managers */
InitFileAccess();
InitSync();
smgrinit();
InitBufferPoolAccess();
}

View File

@ -17,6 +17,8 @@
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/sync.h"
/* GUC options */
@ -31,9 +33,9 @@ extern void CheckpointerMain(void) pg_attribute_noreturn();
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void AbsorbFsyncRequests(void);
extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
extern void AbsorbSyncRequests(void);
extern Size CheckpointerShmemSize(void);
extern void CheckpointerShmemInit(void);

View File

@ -54,6 +54,18 @@ extern PGDLLIMPORT bool data_sync_retry;
*/
extern int max_safe_fds;
/*
* On Windows, we have to interpret EACCES as possibly meaning the same as
* ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
* that's what you get. Ugh. This code is designed so that we don't
* actually believe these cases are okay without further evidence (namely,
* a pending fsync request getting canceled ... see ProcessSyncRequests).
*/
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
#endif
/*
* prototypes for functions in fd.c

51
src/include/storage/md.h Normal file
View File

@ -0,0 +1,51 @@
/*-------------------------------------------------------------------------
*
* md.h
* magnetic disk storage manager public interface declarations.
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/md.h
*
*-------------------------------------------------------------------------
*/
#ifndef MD_H
#define MD_H
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/sync.h"
/* md storage manager functionality */
extern void mdinit(void);
extern void mdclose(SMgrRelation reln, ForkNumber forknum);
extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void ForgetDatabaseSyncRequests(Oid dbid);
extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
/* md sync callbacks */
extern int mdsyncfiletag(const FileTag *ftag, char *path);
extern int mdunlinkfiletag(const FileTag *ftag, char *path);
extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
#endif /* MD_H */

View File

@ -18,7 +18,6 @@
#include "storage/block.h"
#include "storage/relfilenode.h"
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
@ -106,43 +105,6 @@ extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void smgrpreckpt(void);
extern void smgrsync(void);
extern void smgrpostckpt(void);
extern void AtEOXact_SMgr(void);
/* internals: move me elsewhere -- ay 7/94 */
/* in md.c */
extern void mdinit(void);
extern void mdclose(SMgrRelation reln, ForkNumber forknum);
extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void mdpreckpt(void);
extern void mdsync(void);
extern void mdpostckpt(void);
extern void SetForwardFsyncRequests(void);
extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno);
extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
extern void ForgetDatabaseFsyncRequests(Oid dbid);
extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
#endif /* SMGR_H */

View File

@ -0,0 +1,62 @@
/*-------------------------------------------------------------------------
*
* sync.h
* File synchronization management code.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/sync.h
*
*-------------------------------------------------------------------------
*/
#ifndef SYNC_H
#define SYNC_H
#include "storage/relfilenode.h"
/*
* Type of sync request. These are used to manage the set of pending
* requests to call a sync handler's sync or unlink functions at the next
* checkpoint.
*/
typedef enum SyncRequestType
{
SYNC_REQUEST, /* schedule a call of sync function */
SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */
SYNC_FORGET_REQUEST, /* forget all calls for a tag */
SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */
} SyncRequestType;
/*
* Which set of functions to use to handle a given request. See the function
* table in sync.c.
*/
typedef enum SyncRequestHandler
{
SYNC_HANDLER_MD = 0 /* md smgr */
} SyncRequestHandler;
/*
* A tag identifying a file. Currently it has the members required for md.c's
* usage, but sync.c has no knowledge of the internal structure, and it is
* liable to change as required by future handlers.
*/
typedef struct FileTag
{
int16 handler; /* SyncRequstHandler value, saving space */
int16 forknum; /* ForkNumber, saving space */
RelFileNode rnode;
uint32 segno;
} FileTag;
extern void InitSync(void);
extern void SyncPreCheckpoint(void);
extern void SyncPostCheckpoint(void);
extern void ProcessSyncRequests(void);
extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type);
extern void EnableSyncRequestForwarding(void);
extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
bool retryOnError);
#endif /* SYNC_H */

View File

@ -651,6 +651,7 @@ File
FileFdwExecutionState
FileFdwPlanState
FileNameMap
FileTag
FindSplitData
FixedParallelExecutorState
FixedParallelState
@ -1700,7 +1701,7 @@ PathKeysComparison
PathTarget
Pattern_Prefix_Status
Pattern_Type
PendingOperationEntry
PendingFsyncEntry
PendingRelDelete
PendingUnlinkEntry
PendingWriteback
@ -2276,7 +2277,10 @@ Subscription
SubscriptionInfo
SubscriptionRelState
Syn
SyncOps
SyncRepConfigData
SyncRequestHandler
SyncRequestType
SysScanDesc
SyscacheCallbackFunction
SystemRowsSamplerData