diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 11992f7447..ecc01f741d 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -98,6 +98,7 @@ #include "replication/walsender.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/md.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b04fdb5d5e..bd5024ef00 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -50,6 +50,7 @@ #include "storage/fd.h" #include "storage/freespace.h" #include "storage/lmgr.h" +#include "storage/md.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e3a3110716..c00b63c751 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -66,6 +66,7 @@ #include "storage/reinit.h" #include "storage/smgr.h" #include "storage/spin.h" +#include "storage/sync.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" @@ -6981,7 +6982,7 @@ StartupXLOG(void) if (ArchiveRecoveryRequested && IsUnderPostmaster) { PublishStartupProcessInformation(); - SetForwardFsyncRequests(); + EnableSyncRequestForwarding(); SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); bgwriterLaunched = true; } @@ -8566,7 +8567,7 @@ CreateCheckPoint(int flags) * the REDO pointer. Note that smgr must not do anything that'd have to * be undone if we decide no checkpoint is needed. */ - smgrpreckpt(); + SyncPreCheckpoint(); /* Begin filling in the checkpoint WAL record */ MemSet(&checkPoint, 0, sizeof(checkPoint)); @@ -8856,7 +8857,7 @@ CreateCheckPoint(int flags) /* * Let smgr do post-checkpoint cleanup (eg, deleting old files). 
*/ - smgrpostckpt(); + SyncPostCheckpoint(); /* * Update the average distance between checkpoints if the prior checkpoint diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 35cad0b629..9707afabd9 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -54,6 +54,7 @@ #include "storage/fd.h" #include "storage/lmgr.h" #include "storage/ipc.h" +#include "storage/md.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "utils/acl.h" @@ -941,11 +942,11 @@ dropdb(const char *dbname, bool missing_ok) * worse, it will delete files that belong to a newly created database * with the same OID. */ - ForgetDatabaseFsyncRequests(db_id); + ForgetDatabaseSyncRequests(db_id); /* * Force a checkpoint to make sure the checkpointer has received the - * message sent by ForgetDatabaseFsyncRequests. On Windows, this also + * message sent by ForgetDatabaseSyncRequests. On Windows, this also * ensures that background procs don't hold any open files, which would * cause rmdir() to fail. 
*/ @@ -2150,7 +2151,7 @@ dbase_redo(XLogReaderState *record) DropDatabaseBuffers(xlrec->db_id); /* Also, clean out any fsync requests that might be pending in md.c */ - ForgetDatabaseFsyncRequests(xlrec->db_id); + ForgetDatabaseSyncRequests(xlrec->db_id); /* Clean out the xlog relcache too */ XLogDropDatabase(xlrec->db_id); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index c2411081a5..d303ce3679 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -108,10 +108,8 @@ */ typedef struct { - RelFileNode rnode; - ForkNumber forknum; - BlockNumber segno; /* see md.c for special values */ - /* might add a real request-type field later; not needed yet */ + SyncRequestType type; /* request type */ + FileTag ftag; /* file identifier */ } CheckpointerRequest; typedef struct @@ -349,7 +347,7 @@ CheckpointerMain(void) /* * Process any requests or signals received recently. */ - AbsorbFsyncRequests(); + AbsorbSyncRequests(); if (got_SIGHUP) { @@ -684,7 +682,7 @@ CheckpointWriteDelay(int flags, double progress) UpdateSharedMemoryConfig(); } - AbsorbFsyncRequests(); + AbsorbSyncRequests(); absorb_counter = WRITES_PER_ABSORB; CheckArchiveTimeout(); @@ -709,7 +707,7 @@ CheckpointWriteDelay(int flags, double progress) * operations even when we don't sleep, to prevent overflow of the * fsync request queue. */ - AbsorbFsyncRequests(); + AbsorbSyncRequests(); absorb_counter = WRITES_PER_ABSORB; } } @@ -1084,7 +1082,7 @@ RequestCheckpoint(int flags) } /* - * ForwardFsyncRequest + * ForwardSyncRequest * Forward a file-fsync request from a backend to the checkpointer * * Whenever a backend is compelled to write directly to a relation @@ -1093,15 +1091,6 @@ RequestCheckpoint(int flags) * is dirty and must be fsync'd before next checkpoint. We also use this * opportunity to count such writes for statistical purposes. 
* - * This functionality is only supported for regular (not backend-local) - * relations, so the rnode argument is intentionally RelFileNode not - * RelFileNodeBackend. - * - * segno specifies which segment (not block!) of the relation needs to be - * fsync'd. (Since the valid range is much less than BlockNumber, we can - * use high values for special flags; that's all internal to md.c, which - * see for details.) - * * To avoid holding the lock for longer than necessary, we normally write * to the requests[] queue without checking for duplicates. The checkpointer * will have to eliminate dups internally anyway. However, if we discover @@ -1113,7 +1102,7 @@ RequestCheckpoint(int flags) * let the backend know by returning false. */ bool -ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) +ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; @@ -1122,7 +1111,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) return false; /* probably shouldn't even get here */ if (AmCheckpointerProcess()) - elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer"); + elog(ERROR, "ForwardSyncRequest must not be called in checkpointer"); LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); @@ -1151,9 +1140,8 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) /* OK, insert request */ request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; - request->rnode = rnode; - request->forknum = forknum; - request->segno = segno; + request->ftag = *ftag; + request->type = type; /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= @@ -1284,8 +1272,8 @@ CompactCheckpointerRequestQueue(void) } /* - * AbsorbFsyncRequests - * Retrieve queued fsync requests and pass them to local smgr. 
+ * AbsorbSyncRequests + * Retrieve queued sync requests and pass them to sync mechanism. * * This is exported because it must be called during CreateCheckPoint; * we have to be sure we have accepted all pending requests just before @@ -1293,7 +1281,7 @@ CompactCheckpointerRequestQueue(void) * non-checkpointer processes, do nothing if not checkpointer. */ void -AbsorbFsyncRequests(void) +AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; @@ -1335,7 +1323,7 @@ AbsorbFsyncRequests(void) LWLockRelease(CheckpointerCommLock); for (request = requests; n > 0; request++, n--) - RememberFsyncRequest(request->rnode, request->forknum, request->segno); + RememberSyncRequest(&request->ftag, request->type); END_CRIT_SECTION(); diff --git a/src/backend/storage/Makefile b/src/backend/storage/Makefile index bd2d272c6e..8376cdfca2 100644 --- a/src/backend/storage/Makefile +++ b/src/backend/storage/Makefile @@ -8,6 +8,6 @@ subdir = src/backend/storage top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = buffer file freespace ipc large_object lmgr page smgr +SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 273e2f385f..887023fc8a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2584,7 +2584,7 @@ CheckPointBuffers(int flags) BufferSync(flags); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); - smgrsync(); + ProcessSyncRequests(); CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 6ed68185ed..ffb3569698 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -29,45 +29,17 @@ #include "access/xlogutils.h" #include "access/xlog.h" #include "pgstat.h" -#include "portability/instr_time.h" #include "postmaster/bgwriter.h" #include "storage/fd.h" #include "storage/bufmgr.h" +#include "storage/md.h" #include "storage/relfilenode.h" #include "storage/smgr.h" +#include "storage/sync.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "pg_trace.h" - -/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */ -#define FSYNCS_PER_ABSORB 10 -#define UNLINKS_PER_ABSORB 10 - -/* - * Special values for the segno arg to RememberFsyncRequest. - * - * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an - * fsync request from the queue if an identical, subsequent request is found. - * See comments there before making changes here. 
- */ -#define FORGET_RELATION_FSYNC (InvalidBlockNumber) -#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) -#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2) - -/* - * On Windows, we have to interpret EACCES as possibly meaning the same as - * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, - * that's what you get. Ugh. This code is designed so that we don't - * actually believe these cases are okay without further evidence (namely, - * a pending fsync request getting canceled ... see mdsync). - */ -#ifndef WIN32 -#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) -#else -#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) -#endif - /* * The magnetic disk storage manager keeps track of open file * descriptors in its own descriptor pool. This is done to make it @@ -115,49 +87,15 @@ typedef struct _MdfdVec static MemoryContext MdCxt; /* context for all MdfdVec objects */ -/* - * In some contexts (currently, standalone backends and the checkpointer) - * we keep track of pending fsync operations: we need to remember all relation - * segments that have been written since the last checkpoint, so that we can - * fsync them down to disk before completing the next checkpoint. This hash - * table remembers the pending operations. We use a hash table mostly as - * a convenient way of merging duplicate requests. - * - * We use a similar mechanism to remember no-longer-needed files that can - * be deleted after the next checkpoint, but we use a linked list instead of - * a hash table, because we don't expect there to be any duplicate requests. - * - * These mechanisms are only used for non-temp relations; we never fsync - * temp rels, nor do we need to postpone their deletion (see comments in - * mdunlink). - * - * (Regular backends do not track pending operations locally, but forward - * them to the checkpointer.) 
- */ -typedef uint16 CycleCtr; /* can be any convenient integer size */ - -typedef struct -{ - RelFileNode rnode; /* hash table key (must be first!) */ - CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */ - /* requests[f] has bit n set if we need to fsync segment n of fork f */ - Bitmapset *requests[MAX_FORKNUM + 1]; - /* canceled[f] is true if we canceled fsyncs for fork "recently" */ - bool canceled[MAX_FORKNUM + 1]; -} PendingOperationEntry; - -typedef struct -{ - RelFileNode rnode; /* the dead relation to delete */ - CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */ -} PendingUnlinkEntry; - -static HTAB *pendingOpsTable = NULL; -static List *pendingUnlinks = NIL; -static MemoryContext pendingOpsCxt; /* context for the above */ - -static CycleCtr mdsync_cycle_ctr = 0; -static CycleCtr mdckpt_cycle_ctr = 0; +/* Populate a file tag describing an md.c segment file. */ +#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_MD, \ + (a).rnode = (xx_rnode), \ + (a).forknum = (xx_forknum), \ + (a).segno = (xx_segno) \ +) /*** behavior for mdopen & _mdfd_getseg ***/ @@ -185,7 +123,10 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); -static void register_unlink(RelFileNodeBackend rnode); +static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno); +static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno); static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg); @@ -208,64 +149,6 @@ mdinit(void) MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr", ALLOCSET_DEFAULT_SIZES); - - /* - * Create pending-operations hashtable if we need it. 
Currently, we need - * it if we are standalone (not under a postmaster) or if we are a startup - * or checkpointer auxiliary process. - */ - if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) - { - HASHCTL hash_ctl; - - /* - * XXX: The checkpointer needs to add entries to the pending ops table - * when absorbing fsync requests. That is done within a critical - * section, which isn't usually allowed, but we make an exception. It - * means that there's a theoretical possibility that you run out of - * memory while absorbing fsync requests, which leads to a PANIC. - * Fortunately the hash table is small so that's unlikely to happen in - * practice. - */ - pendingOpsCxt = AllocSetContextCreate(MdCxt, - "Pending ops context", - ALLOCSET_DEFAULT_SIZES); - MemoryContextAllowInCriticalSection(pendingOpsCxt, true); - - MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = sizeof(RelFileNode); - hash_ctl.entrysize = sizeof(PendingOperationEntry); - hash_ctl.hcxt = pendingOpsCxt; - pendingOpsTable = hash_create("Pending Ops Table", - 100L, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - pendingUnlinks = NIL; - } -} - -/* - * In archive recovery, we rely on checkpointer to do fsyncs, but we will have - * already created the pendingOpsTable during initialization of the startup - * process. Calling this function drops the local pendingOpsTable so that - * subsequent requests will be forwarded to checkpointer. - */ -void -SetForwardFsyncRequests(void) -{ - /* Perform any pending fsyncs we may have queued up, then drop table */ - if (pendingOpsTable) - { - mdsync(); - hash_destroy(pendingOpsTable); - } - pendingOpsTable = NULL; - - /* - * We should not have any pending unlink requests, since mdunlink doesn't - * queue unlink requests when isRedo. 
- */ - Assert(pendingUnlinks == NIL); } /* @@ -380,16 +263,6 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) void mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { - /* - * We have to clean out any pending fsync requests for the doomed - * relation, else the next mdsync() will fail. There can't be any such - * requests for a temp relation, though. We can send just one request - * even when deleting multiple forks, since the fsync queuing code accepts - * the "InvalidForkNumber = all forks" convention. - */ - if (!RelFileNodeBackendIsTemp(rnode)) - ForgetRelationFsyncRequests(rnode.node, forkNum); - /* Now do the per-fork work */ if (forkNum == InvalidForkNumber) { @@ -413,6 +286,11 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) { + /* First, forget any pending sync requests for the first segment */ + if (!RelFileNodeBackendIsTemp(rnode)) + register_forget_request(rnode, forkNum, 0 /* first seg */ ); + + /* Next unlink the file */ ret = unlink(path); if (ret < 0 && errno != ENOENT) ereport(WARNING, @@ -442,7 +320,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) errmsg("could not truncate file \"%s\": %m", path))); /* Register request to unlink first segment later */ - register_unlink(rnode); + register_unlink_segment(rnode, forkNum, 0 /* first seg */ ); } /* @@ -459,6 +337,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ for (segno = 1;; segno++) { + /* + * Forget any pending sync requests for this segment before we try + * to unlink. + */ + if (!RelFileNodeBackendIsTemp(rnode)) + register_forget_request(rnode, forkNum, segno); + sprintf(segpath, "%s.%u", path, segno); if (unlink(segpath) < 0) { @@ -1003,413 +888,27 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) } } -/* - * mdsync() -- Sync previous writes to stable storage. 
- */ -void -mdsync(void) -{ - static bool mdsync_in_progress = false; - - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - int absorb_counter; - - /* Statistics on sync times */ - int processed = 0; - instr_time sync_start, - sync_end, - sync_diff; - uint64 elapsed; - uint64 longest = 0; - uint64 total_elapsed = 0; - - /* - * This is only called during checkpoints, and checkpoints should only - * occur in processes that have created a pendingOpsTable. - */ - if (!pendingOpsTable) - elog(ERROR, "cannot sync without a pendingOpsTable"); - - /* - * If we are in the checkpointer, the sync had better include all fsync - * requests that were queued by backends up to this point. The tightest - * race condition that could occur is that a buffer that must be written - * and fsync'd for the checkpoint could have been dumped by a backend just - * before it was visited by BufferSync(). We know the backend will have - * queued an fsync request before clearing the buffer's dirtybit, so we - * are safe as long as we do an Absorb after completing BufferSync(). - */ - AbsorbFsyncRequests(); - - /* - * To avoid excess fsync'ing (in the worst case, maybe a never-terminating - * checkpoint), we want to ignore fsync requests that are entered into the - * hashtable after this point --- they should be processed next time, - * instead. We use mdsync_cycle_ctr to tell old entries apart from new - * ones: new ones will have cycle_ctr equal to the incremented value of - * mdsync_cycle_ctr. - * - * In normal circumstances, all entries present in the table at this point - * will have cycle_ctr exactly equal to the current (about to be old) - * value of mdsync_cycle_ctr. However, if we fail partway through the - * fsync'ing loop, then older values of cycle_ctr might remain when we - * come back here to try again. 
Repeated checkpoint failures would - * eventually wrap the counter around to the point where an old entry - * might appear new, causing us to skip it, possibly allowing a checkpoint - * to succeed that should not have. To forestall wraparound, any time the - * previous mdsync() failed to complete, run through the table and - * forcibly set cycle_ctr = mdsync_cycle_ctr. - * - * Think not to merge this loop with the main loop, as the problem is - * exactly that that loop may fail before having visited all the entries. - * From a performance point of view it doesn't matter anyway, as this path - * will never be taken in a system that's functioning normally. - */ - if (mdsync_in_progress) - { - /* prior try failed, so update any stale cycle_ctr values */ - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - entry->cycle_ctr = mdsync_cycle_ctr; - } - } - - /* Advance counter so that new hashtable entries are distinguishable */ - mdsync_cycle_ctr++; - - /* Set flag to detect failure if we don't reach the end of the loop */ - mdsync_in_progress = true; - - /* Now scan the hashtable for fsync requests to process */ - absorb_counter = FSYNCS_PER_ABSORB; - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - ForkNumber forknum; - - /* - * If the entry is new then don't process it this time; it might - * contain multiple fsync-request bits, but they are all new. Note - * "continue" bypasses the hash-remove call at the bottom of the loop. - */ - if (entry->cycle_ctr == mdsync_cycle_ctr) - continue; - - /* Else assert we haven't missed it */ - Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); - - /* - * Scan over the forks and segments represented by the entry. 
- * - * The bitmap manipulations are slightly tricky, because we can call - * AbsorbFsyncRequests() inside the loop and that could result in - * bms_add_member() modifying and even re-palloc'ing the bitmapsets. - * So we detach it, but if we fail we'll merge it with any new - * requests that have arrived in the meantime. - */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - Bitmapset *requests = entry->requests[forknum]; - int segno; - - entry->requests[forknum] = NULL; - entry->canceled[forknum] = false; - - segno = -1; - while ((segno = bms_next_member(requests, segno)) >= 0) - { - int failures; - - /* - * If fsync is off then we don't have to bother opening the - * file at all. (We delay checking until this point so that - * changing fsync on the fly behaves sensibly.) - */ - if (!enableFsync) - continue; - - /* - * If in checkpointer, we want to absorb pending requests - * every so often to prevent overflow of the fsync request - * queue. It is unspecified whether newly-added entries will - * be visited by hash_seq_search, but we don't care since we - * don't need to process them anyway. - */ - if (--absorb_counter <= 0) - { - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; - } - - /* - * The fsync table could contain requests to fsync segments - * that have been deleted (unlinked) by the time we get to - * them. Rather than just hoping an ENOENT (or EACCES on - * Windows) error can be ignored, what we do on error is - * absorb pending requests and then retry. Since mdunlink() - * queues a "cancel" message before actually unlinking, the - * fsync request is guaranteed to be marked canceled after the - * absorb if it really was this case. DROP DATABASE likewise - * has to tell us to forget fsync requests before it starts - * deletions. - */ - for (failures = 0;; failures++) /* loop exits at "break" */ - { - SMgrRelation reln; - MdfdVec *seg; - char *path; - int save_errno; - - /* - * Find or create an smgr hash entry for this relation. 
- * This may seem a bit unclean -- md calling smgr? But - * it's really the best solution. It ensures that the - * open file reference isn't permanently leaked if we get - * an error here. (You may say "but an unreferenced - * SMgrRelation is still a leak!" Not really, because the - * only case in which a checkpoint is done by a process - * that isn't about to shut down is in the checkpointer, - * and it will periodically do smgrcloseall(). This fact - * justifies our not closing the reln in the success path - * either, which is a good thing since in non-checkpointer - * cases we couldn't safely do that.) - */ - reln = smgropen(entry->rnode, InvalidBackendId); - - /* Attempt to open and fsync the target segment */ - seg = _mdfd_getseg(reln, forknum, - (BlockNumber) segno * (BlockNumber) RELSEG_SIZE, - false, - EXTENSION_RETURN_NULL - | EXTENSION_DONT_CHECK_SIZE); - - INSTR_TIME_SET_CURRENT(sync_start); - - if (seg != NULL && - FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0) - { - /* Success; update statistics about sync timing */ - INSTR_TIME_SET_CURRENT(sync_end); - sync_diff = sync_end; - INSTR_TIME_SUBTRACT(sync_diff, sync_start); - elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); - if (elapsed > longest) - longest = elapsed; - total_elapsed += elapsed; - processed++; - requests = bms_del_member(requests, segno); - if (log_checkpoints) - elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", - processed, - FilePathName(seg->mdfd_vfd), - (double) elapsed / 1000); - - break; /* out of retry loop */ - } - - /* Compute file name for use in message */ - save_errno = errno; - path = _mdfd_segpath(reln, forknum, (BlockNumber) segno); - errno = save_errno; - - /* - * It is possible that the relation has been dropped or - * truncated since the fsync request was entered. - * Therefore, allow ENOENT, but only if we didn't fail - * already on this file. 
This applies both for - * _mdfd_getseg() and for FileSync, since fd.c might have - * closed the file behind our back. - * - * XXX is there any point in allowing more than one retry? - * Don't see one at the moment, but easy to change the - * test here if so. - */ - if (!FILE_POSSIBLY_DELETED(errno) || - failures > 0) - { - Bitmapset *new_requests; - - /* - * We need to merge these unsatisfied requests with - * any others that have arrived since we started. - */ - new_requests = entry->requests[forknum]; - entry->requests[forknum] = - bms_join(new_requests, requests); - - errno = save_errno; - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", - path))); - } - else - ereport(DEBUG1, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\" but retrying: %m", - path))); - pfree(path); - - /* - * Absorb incoming requests and check to see if a cancel - * arrived for this relation fork. - */ - AbsorbFsyncRequests(); - absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ - - if (entry->canceled[forknum]) - break; - } /* end retry loop */ - } - bms_free(requests); - } - - /* - * We've finished everything that was requested before we started to - * scan the entry. If no new requests have been inserted meanwhile, - * remove the entry. Otherwise, update its cycle counter, as all the - * requests now in it must have arrived during this cycle. 
- */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - if (entry->requests[forknum] != NULL) - break; - } - if (forknum <= MAX_FORKNUM) - entry->cycle_ctr = mdsync_cycle_ctr; - else - { - /* Okay to remove it */ - if (hash_search(pendingOpsTable, &entry->rnode, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "pendingOpsTable corrupted"); - } - } /* end loop over hashtable entries */ - - /* Return sync performance metrics for report at checkpoint end */ - CheckpointStats.ckpt_sync_rels = processed; - CheckpointStats.ckpt_longest_sync = longest; - CheckpointStats.ckpt_agg_sync_time = total_elapsed; - - /* Flag successful completion of mdsync */ - mdsync_in_progress = false; -} - -/* - * mdpreckpt() -- Do pre-checkpoint work - * - * To distinguish unlink requests that arrived before this checkpoint - * started from those that arrived during the checkpoint, we use a cycle - * counter similar to the one we use for fsync requests. That cycle - * counter is incremented here. - * - * This must be called *before* the checkpoint REDO point is determined. - * That ensures that we won't delete files too soon. - * - * Note that we can't do anything here that depends on the assumption - * that the checkpoint will be completed. - */ -void -mdpreckpt(void) -{ - /* - * Any unlink requests arriving after this point will be assigned the next - * cycle counter, and won't be unlinked until next checkpoint. - */ - mdckpt_cycle_ctr++; -} - -/* - * mdpostckpt() -- Do post-checkpoint work - * - * Remove any lingering files that can now be safely removed. - */ -void -mdpostckpt(void) -{ - int absorb_counter; - - absorb_counter = UNLINKS_PER_ABSORB; - while (pendingUnlinks != NIL) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); - char *path; - - /* - * New entries are appended to the end, so if the entry is new we've - * reached the end of old entries. 
- * - * Note: if just the right number of consecutive checkpoints fail, we - * could be fooled here by cycle_ctr wraparound. However, the only - * consequence is that we'd delay unlinking for one more checkpoint, - * which is perfectly tolerable. - */ - if (entry->cycle_ctr == mdckpt_cycle_ctr) - break; - - /* Unlink the file */ - path = relpathperm(entry->rnode, MAIN_FORKNUM); - if (unlink(path) < 0) - { - /* - * There's a race condition, when the database is dropped at the - * same time that we process the pending unlink requests. If the - * DROP DATABASE deletes the file before we do, we will get ENOENT - * here. rmtree() also has to ignore ENOENT errors, to deal with - * the possibility that we delete the file first. - */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } - pfree(path); - - /* And remove the list entry */ - pendingUnlinks = list_delete_first(pendingUnlinks); - pfree(entry); - - /* - * As in mdsync, we don't want to stop absorbing fsync requests for a - * long time when there are many deletions to be done. We can safely - * call AbsorbFsyncRequests() at this point in the loop (note it might - * try to delete list entries). - */ - if (--absorb_counter <= 0) - { - AbsorbFsyncRequests(); - absorb_counter = UNLINKS_PER_ABSORB; - } - } -} - /* * register_dirty_segment() -- Mark a relation segment as needing fsync * * If there is a local pending-ops table, just make an entry in it for - * mdsync to process later. Otherwise, try to pass off the fsync request - * to the checkpointer process. If that fails, just do the fsync - * locally before returning (we hope this will not happen often enough - * to be a performance problem). + * ProcessSyncRequests to process later. Otherwise, try to pass off the + * fsync request to the checkpointer process. 
If that fails, just do the + * fsync locally before returning (we hope this will not happen often + * enough to be a performance problem). */ static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { + FileTag tag; + + INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno); + /* Temp relations should never be fsync'd */ Assert(!SmgrIsTemp(reln)); - if (pendingOpsTable) + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { - /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); - } - else - { - if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno)) - return; /* passed it off successfully */ - ereport(DEBUG1, (errmsg("could not forward fsync request because request queue is full"))); @@ -1423,254 +922,51 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* - * register_unlink() -- Schedule a file to be deleted after next checkpoint - * - * We don't bother passing in the fork number, because this is only used - * with main forks. - * - * As with register_dirty_segment, this could involve either a local or - * a remote pending-ops table. + * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint */ static void -register_unlink(RelFileNodeBackend rnode) +register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno) { + FileTag tag; + + INIT_MD_FILETAG(tag, rnode.node, forknum, segno); + /* Should never be used with temp relations */ Assert(!RelFileNodeBackendIsTemp(rnode)); - if (pendingOpsTable) - { - /* push it into local pending-ops table */ - RememberFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST); - } - else - { - /* - * Notify the checkpointer about it. If we fail to queue the request - * message, we have to sleep and try again, because we can't simply - * delete the file now. Ugly, but hopefully won't happen often. - * - * XXX should we just leave the file orphaned instead? 
- */ - Assert(IsUnderPostmaster); - while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM, - UNLINK_RELATION_REQUEST)) - pg_usleep(10000L); /* 10 msec seems a good number */ - } + RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); } /* - * RememberFsyncRequest() -- callback from checkpointer side of fsync request - * - * We stuff fsync requests into the local hash table for execution - * during the checkpointer's next checkpoint. UNLINK requests go into a - * separate linked list, however, because they get processed separately. - * - * The range of possible segment numbers is way less than the range of - * BlockNumber, so we can reserve high values of segno for special purposes. - * We define three: - * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation, - * either for one fork, or all forks if forknum is InvalidForkNumber - * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database - * - UNLINK_RELATION_REQUEST is a request to delete the file after the next - * checkpoint. - * Note also that we're assuming real segment numbers don't exceed INT_MAX. - * - * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash - * table has to be searched linearly, but dropping a database is a pretty - * heavyweight operation anyhow, so we'll live with it.) 
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment */ -void -RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) +static void +register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, + BlockNumber segno) { - Assert(pendingOpsTable); + FileTag tag; - if (segno == FORGET_RELATION_FSYNC) - { - /* Remove any pending requests for the relation (one or all forks) */ - PendingOperationEntry *entry; + INIT_MD_FILETAG(tag, rnode.node, forknum, segno); - entry = (PendingOperationEntry *) hash_search(pendingOpsTable, - &rnode, - HASH_FIND, - NULL); - if (entry) - { - /* - * We can't just delete the entry since mdsync could have an - * active hashtable scan. Instead we delete the bitmapsets; this - * is safe because of the way mdsync is coded. We also set the - * "canceled" flags so that mdsync can tell that a cancel arrived - * for the fork(s). - */ - if (forknum == InvalidForkNumber) - { - /* remove requests for all forks */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - else - { - /* remove requests for single fork */ - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - } - else if (segno == FORGET_DATABASE_FSYNC) - { - /* Remove any pending requests for the entire database */ - HASH_SEQ_STATUS hstat; - PendingOperationEntry *entry; - ListCell *cell, - *prev, - *next; - - /* Remove fsync requests */ - hash_seq_init(&hstat, pendingOpsTable); - while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) - { - if (entry->rnode.dbNode == rnode.dbNode) - { - /* remove requests for all forks */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - bms_free(entry->requests[forknum]); - entry->requests[forknum] = NULL; - entry->canceled[forknum] = true; - } - } - } - - /* Remove unlink requests */ - 
prev = NULL; - for (cell = list_head(pendingUnlinks); cell; cell = next) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); - - next = lnext(cell); - if (entry->rnode.dbNode == rnode.dbNode) - { - pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); - pfree(entry); - } - else - prev = cell; - } - } - else if (segno == UNLINK_RELATION_REQUEST) - { - /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingUnlinkEntry *entry; - - /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */ - Assert(forknum == MAIN_FORKNUM); - - entry = palloc(sizeof(PendingUnlinkEntry)); - entry->rnode = rnode; - entry->cycle_ctr = mdckpt_cycle_ctr; - - pendingUnlinks = lappend(pendingUnlinks, entry); - - MemoryContextSwitchTo(oldcxt); - } - else - { - /* Normal case: enter a request to fsync this segment */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingOperationEntry *entry; - bool found; - - entry = (PendingOperationEntry *) hash_search(pendingOpsTable, - &rnode, - HASH_ENTER, - &found); - /* if new entry, initialize it */ - if (!found) - { - entry->cycle_ctr = mdsync_cycle_ctr; - MemSet(entry->requests, 0, sizeof(entry->requests)); - MemSet(entry->canceled, 0, sizeof(entry->canceled)); - } - - /* - * NB: it's intentional that we don't change cycle_ctr if the entry - * already exists. The cycle_ctr must represent the oldest fsync - * request that could be in the entry. - */ - - entry->requests[forknum] = bms_add_member(entry->requests[forknum], - (int) segno); - - MemoryContextSwitchTo(oldcxt); - } -} - -/* - * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork - * - * forknum == InvalidForkNumber means all forks, although this code doesn't - * actually know that, since it's just forwarding the request elsewhere. 
- */ -void -ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) -{ - if (pendingOpsTable) - { - /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC); - } - else if (IsUnderPostmaster) - { - /* - * Notify the checkpointer about it. If we fail to queue the cancel - * message, we have to sleep and try again ... ugly, but hopefully - * won't happen often. - * - * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an - * error would leave the no-longer-used file still present on disk, - * which would be bad, so I'm inclined to assume that the checkpointer - * will always empty the queue soon. - */ - while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC)) - pg_usleep(10000L); /* 10 msec seems a good number */ - - /* - * Note we don't wait for the checkpointer to actually absorb the - * cancel message; see mdsync() for the implications. - */ - } + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); } /* * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB */ void -ForgetDatabaseFsyncRequests(Oid dbid) +ForgetDatabaseSyncRequests(Oid dbid) { + FileTag tag; RelFileNode rnode; rnode.dbNode = dbid; rnode.spcNode = 0; rnode.relNode = 0; - if (pendingOpsTable) - { - /* standalone backend or startup process: fsync state is local */ - RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC); - } - else if (IsUnderPostmaster) - { - /* see notes in ForgetRelationFsyncRequests */ - while (!ForwardFsyncRequest(rnode, InvalidForkNumber, - FORGET_DATABASE_FSYNC)) - pg_usleep(10000L); /* 10 msec seems a good number */ - } + INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber); + + RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); } /* @@ -1951,3 +1247,72 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* note that this calculation will ignore any partial block at 
EOF */ return (BlockNumber) (len / BLCKSZ); } + +/* + * Sync a file to disk, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdsyncfiletag(const FileTag *ftag, char *path) +{ + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + MdfdVec *v; + char *p; + + /* Provide the path for informational messages. */ + p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + /* Try to open the requested segment. */ + v = _mdfd_getseg(reln, ftag->forknum, ftag->segno, false, + EXTENSION_RETURN_NULL); + if (v == NULL) + { + errno = ENOENT; + return -1; + } + + /* Try to fsync the file. */ + return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC); +} + +/* + * Unlink a file, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdunlinkfiletag(const FileTag *ftag, char *path) +{ + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + char *p; + + /* Compute the path. */ + p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + /* Try to unlink the file. */ + return unlink(path); +} + +/* + * Check if a given candidate request matches a given tag, when processing + * a SYNC_FILTER_REQUEST request. This will be called for all pending + * requests to find out whether to forget them. + */ +bool +mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) +{ + /* + * For now we only use filter requests as a way to drop all scheduled + * callbacks relating to a given database, when dropping the database. + * We'll return true for all candidates that have the same database OID as + * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten. 
+ */ + return ftag->rnode.dbNode == candidate->rnode.dbNode; +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index f6de9df9e6..8191118b61 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -21,6 +21,7 @@ #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/ipc.h" +#include "storage/md.h" #include "storage/smgr.h" #include "utils/hsearch.h" #include "utils/inval.h" @@ -60,12 +61,8 @@ typedef struct f_smgr void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_pre_ckpt) (void); /* may be NULL */ - void (*smgr_sync) (void); /* may be NULL */ - void (*smgr_post_ckpt) (void); /* may be NULL */ } f_smgr; - static const f_smgr smgrsw[] = { /* magnetic disk */ { @@ -83,15 +80,11 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - .smgr_pre_ckpt = mdpreckpt, - .smgr_sync = mdsync, - .smgr_post_ckpt = mdpostckpt } }; static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. @@ -705,52 +698,6 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); } - -/* - * smgrpreckpt() -- Prepare for checkpoint. - */ -void -smgrpreckpt(void) -{ - int i; - - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_pre_ckpt) - smgrsw[i].smgr_pre_ckpt(); - } -} - -/* - * smgrsync() -- Sync files to disk during checkpoint. - */ -void -smgrsync(void) -{ - int i; - - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_sync) - smgrsw[i].smgr_sync(); - } -} - -/* - * smgrpostckpt() -- Post-checkpoint cleanup. 
- */ -void -smgrpostckpt(void) -{ - int i; - - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_post_ckpt) - smgrsw[i].smgr_post_ckpt(); - } -} - /* * AtEOXact_SMgr * diff --git a/src/backend/storage/sync/Makefile b/src/backend/storage/sync/Makefile new file mode 100644 index 0000000000..cfc60cadb4 --- /dev/null +++ b/src/backend/storage/sync/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/sync +# +# IDENTIFICATION +# src/backend/storage/sync/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/sync +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = sync.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c new file mode 100644 index 0000000000..f77519d7d1 --- /dev/null +++ b/src/backend/storage/sync/sync.c @@ -0,0 +1,598 @@ +/*------------------------------------------------------------------------- + * + * sync.c + * File synchronization management code. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/sync/sync.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlogutils.h" +#include "access/xlog.h" +#include "commands/tablespace.h" +#include "portability/instr_time.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/md.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/inval.h" + +static MemoryContext pendingOpsCxt; /* context for the pending ops state */ + +/* + * In some contexts (currently, standalone backends and the checkpointer) + * we keep track of pending fsync operations: we need to remember all relation + * segments that have been written since the last checkpoint, so that we can + * fsync them down to disk before completing the next checkpoint. This hash + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of merging duplicate requests. + * + * We use a similar mechanism to remember no-longer-needed files that can + * be deleted after the next checkpoint, but we use a linked list instead of + * a hash table, because we don't expect there to be any duplicate requests. + * + * These mechanisms are only used for non-temp relations; we never fsync + * temp rels, nor do we need to postpone their deletion (see comments in + * mdunlink). + * + * (Regular backends do not track pending operations locally, but forward + * them to the checkpointer.) 
+ */ +typedef uint16 CycleCtr; /* can be any convenient integer size */ + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */ + bool canceled; /* canceled is true if we canceled "recently" */ +} PendingFsyncEntry; + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */ +} PendingUnlinkEntry; + +static HTAB *pendingOps = NULL; +static List *pendingUnlinks = NIL; +static MemoryContext pendingOpsCxt; /* context for the above */ + +static CycleCtr sync_cycle_ctr = 0; +static CycleCtr checkpoint_cycle_ctr = 0; + +/* Intervals for calling AbsorbSyncRequests */ +#define FSYNCS_PER_ABSORB 10 +#define UNLINKS_PER_ABSORB 10 + +/* + * Function pointers for handling sync and unlink requests. + */ +typedef struct SyncOps +{ + int (*sync_syncfiletag) (const FileTag *ftag, char *path); + int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); + bool (*sync_filetagmatches) (const FileTag *ftag, + const FileTag *candidate); +} SyncOps; + +static const SyncOps syncsw[] = { + /* magnetic disk */ + { + .sync_syncfiletag = mdsyncfiletag, + .sync_unlinkfiletag = mdunlinkfiletag, + .sync_filetagmatches = mdfiletagmatches + } +}; + +/* + * Initialize data structures for the file sync tracking. + */ +void +InitSync(void) +{ + /* + * Create pending-operations hashtable if we need it. Currently, we need + * it if we are standalone (not under a postmaster) or if we are a startup + * or checkpointer auxiliary process. + */ + if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess()) + { + HASHCTL hash_ctl; + + /* + * XXX: The checkpointer needs to add entries to the pending ops table + * when absorbing fsync requests. That is done within a critical + * section, which isn't usually allowed, but we make an exception. 
It + * means that there's a theoretical possibility that you run out of + * memory while absorbing fsync requests, which leads to a PANIC. + * Fortunately the hash table is small so that's unlikely to happen in + * practice. + */ + pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(FileTag); + hash_ctl.entrysize = sizeof(PendingFsyncEntry); + hash_ctl.hcxt = pendingOpsCxt; + pendingOps = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + pendingUnlinks = NIL; + } + +} + +/* + * SyncPreCheckpoint() -- Do pre-checkpoint work + * + * To distinguish unlink requests that arrived before this checkpoint + * started from those that arrived during the checkpoint, we use a cycle + * counter similar to the one we use for fsync requests. That cycle + * counter is incremented here. + * + * This must be called *before* the checkpoint REDO point is determined. + * That ensures that we won't delete files too soon. + * + * Note that we can't do anything here that depends on the assumption + * that the checkpoint will be completed. + */ +void +SyncPreCheckpoint(void) +{ + /* + * Any unlink requests arriving after this point will be assigned the next + * cycle counter, and won't be unlinked until next checkpoint. + */ + checkpoint_cycle_ctr++; +} + +/* + * SyncPostCheckpoint() -- Do post-checkpoint work + * + * Remove any lingering files that can now be safely removed. + */ +void +SyncPostCheckpoint(void) +{ + int absorb_counter; + + absorb_counter = UNLINKS_PER_ABSORB; + while (pendingUnlinks != NIL) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); + char path[MAXPGPATH]; + + /* + * New entries are appended to the end, so if the entry is new we've + * reached the end of old entries. 
+ * + * Note: if just the right number of consecutive checkpoints fail, we + * could be fooled here by cycle_ctr wraparound. However, the only + * consequence is that we'd delay unlinking for one more checkpoint, + * which is perfectly tolerable. + */ + if (entry->cycle_ctr == checkpoint_cycle_ctr) + break; + + /* Unlink the file */ + if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, + path) < 0) + { + /* + * There's a race condition, when the database is dropped at the + * same time that we process the pending unlink requests. If the + * DROP DATABASE deletes the file before we do, we will get ENOENT + * here. rmtree() also has to ignore ENOENT errors, to deal with + * the possibility that we delete the file first. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } + + /* And remove the list entry */ + pendingUnlinks = list_delete_first(pendingUnlinks); + pfree(entry); + + /* + * As in ProcessSyncRequests, we don't want to stop absorbing fsync + * requests for a long time when there are many deletions to be done. + * We can safely call AbsorbSyncRequests() at this point in the loop + * (note it might try to delete list entries). + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = UNLINKS_PER_ABSORB; + } + } +} + +/* + + * ProcessSyncRequests() -- Process queued fsync requests. + */ +void +ProcessSyncRequests(void) +{ + static bool sync_in_progress = false; + + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *entry; + int absorb_counter; + + /* Statistics on sync times */ + int processed = 0; + instr_time sync_start, + sync_end, + sync_diff; + uint64 elapsed; + uint64 longest = 0; + uint64 total_elapsed = 0; + + /* + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOps. 
+ */ + if (!pendingOps) + elog(ERROR, "cannot sync without a pendingOps table"); + + /* + * If we are in the checkpointer, the sync had better include all fsync + * requests that were queued by backends up to this point. The tightest + * race condition that could occur is that a buffer that must be written + * and fsync'd for the checkpoint could have been dumped by a backend just + * before it was visited by BufferSync(). We know the backend will have + * queued an fsync request before clearing the buffer's dirtybit, so we + * are safe as long as we do an Absorb after completing BufferSync(). + */ + AbsorbSyncRequests(); + + /* + * To avoid excess fsync'ing (in the worst case, maybe a never-terminating + * checkpoint), we want to ignore fsync requests that are entered into the + * hashtable after this point --- they should be processed next time, + * instead. We use sync_cycle_ctr to tell old entries apart from new + * ones: new ones will have cycle_ctr equal to the incremented value of + * sync_cycle_ctr. + * + * In normal circumstances, all entries present in the table at this point + * will have cycle_ctr exactly equal to the current (about to be old) + * value of sync_cycle_ctr. However, if we fail partway through the + * fsync'ing loop, then older values of cycle_ctr might remain when we + * come back here to try again. Repeated checkpoint failures would + * eventually wrap the counter around to the point where an old entry + * might appear new, causing us to skip it, possibly allowing a checkpoint + * to succeed that should not have. To forestall wraparound, any time the + * previous ProcessSyncRequests() failed to complete, run through the + * table and forcibly set cycle_ctr = sync_cycle_ctr. + * + * Think not to merge this loop with the main loop, as the problem is + * exactly that that loop may fail before having visited all the entries. 
+ * From a performance point of view it doesn't matter anyway, as this path + * will never be taken in a system that's functioning normally. + */ + if (sync_in_progress) + { + /* prior try failed, so update any stale cycle_ctr values */ + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + entry->cycle_ctr = sync_cycle_ctr; + } + } + + /* Advance counter so that new hashtable entries are distinguishable */ + sync_cycle_ctr++; + + /* Set flag to detect failure if we don't reach the end of the loop */ + sync_in_progress = true; + + /* Now scan the hashtable for fsync requests to process */ + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + int failures; + + /* + * If fsync is off then we don't have to bother opening the file at + * all. (We delay checking until this point so that changing fsync on + * the fly behaves sensibly.) + */ + if (!enableFsync) + continue; + + /* + * If the entry is new then don't process it this time; leave it for next time. + * Note "continue" bypasses the hash-remove call at the bottom of the + * loop. + */ + if (entry->cycle_ctr == sync_cycle_ctr) + continue; + + /* Else assert we haven't missed it */ + Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr); + + /* + * If in checkpointer, we want to absorb pending requests every so + * often to prevent overflow of the fsync request queue. It is + * unspecified whether newly-added entries will be visited by + * hash_seq_search, but we don't care since we don't need to process + * them anyway. + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } + + /* + * The fsync table could contain requests to fsync segments that have + * been deleted (unlinked) by the time we get to them. 
Rather than + * just hoping an ENOENT (or EACCES on Windows) error can be ignored, + * what we do on error is absorb pending requests and then retry. + * Since mdunlink() queues a "cancel" message before actually + * unlinking, the fsync request is guaranteed to be marked canceled + * after the absorb if it really was this case. DROP DATABASE likewise + * has to tell us to forget fsync requests before it starts deletions. + */ + for (failures = 0; !entry->canceled; failures++) + { + char path[MAXPGPATH]; + + INSTR_TIME_SET_CURRENT(sync_start); + if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag, + path) == 0) + { + /* Success; update statistics about sync timing */ + INSTR_TIME_SET_CURRENT(sync_end); + sync_diff = sync_end; + INSTR_TIME_SUBTRACT(sync_diff, sync_start); + elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); + if (elapsed > longest) + longest = elapsed; + total_elapsed += elapsed; + processed++; + + if (log_checkpoints) + elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", + processed, + path, + (double) elapsed / 1000); + + break; /* out of retry loop */ + } + + /* + * It is possible that the relation has been dropped or truncated + * since the fsync request was entered. Therefore, allow ENOENT, + * but only if we didn't fail already on this file. + */ + if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\" but retrying: %m", + path))); + + /* + * Absorb incoming requests and check to see if a cancel arrived + * for this relation fork. + */ + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; /* might as well... 
*/ + } /* end retry loop */ + + /* We are done with this entry, remove it */ + if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOps corrupted"); + } /* end loop over hashtable entries */ + + /* Return sync performance metrics for report at checkpoint end */ + CheckpointStats.ckpt_sync_rels = processed; + CheckpointStats.ckpt_longest_sync = longest; + CheckpointStats.ckpt_agg_sync_time = total_elapsed; + + /* Flag successful completion of ProcessSyncRequests */ + sync_in_progress = false; +} + +/* + * RememberSyncRequest() -- callback from checkpointer side of sync request + * + * We stuff fsync requests into the local hash table for execution + * during the checkpointer's next checkpoint. UNLINK requests go into a + * separate linked list, however, because they get processed separately. + * + * See sync.h for more information on the types of sync requests supported. + */ +void +RememberSyncRequest(const FileTag *ftag, SyncRequestType type) +{ + Assert(pendingOps); + + if (type == SYNC_FORGET_REQUEST) + { + PendingFsyncEntry *entry; + + /* Cancel previously entered request */ + entry = (PendingFsyncEntry *) hash_search(pendingOps, + (void *) ftag, + HASH_FIND, + NULL); + if (entry != NULL) + entry->canceled = true; + } + else if (type == SYNC_FILTER_REQUEST) + { + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *entry; + ListCell *cell, + *prev, + *next; + + /* Cancel matching fsync requests */ + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + if (entry->tag.handler == ftag->handler && + syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag)) + entry->canceled = true; + } + + /* Remove matching unlink requests */ + prev = NULL; + for (cell = list_head(pendingUnlinks); cell; cell = next) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); + + next = lnext(cell); + if (entry->tag.handler == ftag->handler && + 
syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag)) + { + pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); + pfree(entry); + } + else + prev = cell; + } + } + else if (type == SYNC_UNLINK_REQUEST) + { + /* Unlink request: put it in the linked list */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingUnlinkEntry *entry; + + entry = palloc(sizeof(PendingUnlinkEntry)); + entry->tag = *ftag; + entry->cycle_ctr = checkpoint_cycle_ctr; + + pendingUnlinks = lappend(pendingUnlinks, entry); + + MemoryContextSwitchTo(oldcxt); + } + else + { + /* Normal case: enter a request to fsync this segment */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingFsyncEntry *entry; + bool found; + + Assert(type == SYNC_REQUEST); + + entry = (PendingFsyncEntry *) hash_search(pendingOps, + (void *) ftag, + HASH_ENTER, + &found); + /* if new entry, initialize it */ + if (!found) + { + entry->cycle_ctr = sync_cycle_ctr; + entry->canceled = false; + } + + /* + * NB: it's intentional that we don't change cycle_ctr if the entry + * already exists. The cycle_ctr must represent the oldest fsync + * request that could be in the entry. + */ + + MemoryContextSwitchTo(oldcxt); + } +} + +/* + * Register the sync request locally, or forward it to the checkpointer. + * + * If retryOnError is true, we'll keep trying if there is no space in the + * queue. Return true if we succeeded, or false if there wasn't space. + */ +bool +RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, + bool retryOnError) +{ + bool ret; + + if (pendingOps != NULL) + { + /* standalone backend or startup process: fsync state is local */ + RememberSyncRequest(ftag, type); + return true; + } + + for (;;) + { + /* + * Notify the checkpointer about it. If we fail to queue a message + * in retryOnError mode, we have to sleep and try again ... ugly, but + * hopefully won't happen often. + * + * XXX should we CHECK_FOR_INTERRUPTS in this loop? 
Escaping with an + * error in the case of SYNC_UNLINK_REQUEST would leave the + * no-longer-used file still present on disk, which would be bad, so + * I'm inclined to assume that the checkpointer will always empty the + * queue soon. + */ + ret = ForwardSyncRequest(ftag, type); + + /* + * If we are successful in queueing the request, or we failed and were + * instructed not to retry on error, break. + */ + if (ret || (!ret && !retryOnError)) + break; + + pg_usleep(10000L); + } + + return ret; +} + +/* + * In archive recovery, we rely on checkpointer to do fsyncs, but we will have + * already created the pendingOps during initialization of the startup + * process. Calling this function drops the local pendingOps so that + * subsequent requests will be forwarded to checkpointer. + */ +void +EnableSyncRequestForwarding(void) +{ + /* Perform any pending fsyncs we may have queued up, then drop table */ + if (pendingOps) + { + ProcessSyncRequests(); + hash_destroy(pendingOps); + } + pendingOps = NULL; + + /* + * We should not have any pending unlink requests, since mdunlink doesn't + * queue unlink requests when isRedo. 
+ */ + Assert(pendingUnlinks == NIL); +} diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 66b4ee864d..e9f72b5069 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -51,6 +51,7 @@ #include "storage/proc.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" +#include "storage/sync.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/fmgroids.h" @@ -557,6 +558,7 @@ BaseInit(void) /* Do local initialization of file, storage and buffer managers */ InitFileAccess(); + InitSync(); smgrinit(); InitBufferPoolAccess(); } diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 53b8f5fe3c..630366f49e 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -17,6 +17,8 @@ #include "storage/block.h" #include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/sync.h" /* GUC options */ @@ -31,9 +33,9 @@ extern void CheckpointerMain(void) pg_attribute_noreturn(); extern void RequestCheckpoint(int flags); extern void CheckpointWriteDelay(int flags, double progress); -extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, - BlockNumber segno); -extern void AbsorbFsyncRequests(void); +extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type); + +extern void AbsorbSyncRequests(void); extern Size CheckpointerShmemSize(void); extern void CheckpointerShmemInit(void); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 74c34757fb..a03b4d14a2 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -54,6 +54,18 @@ extern PGDLLIMPORT bool data_sync_retry; */ extern int max_safe_fds; +/* + * On Windows, we have to interpret EACCES as possibly meaning the same as + * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, + * that's what you get. Ugh. 
This code is designed so that we don't + * actually believe these cases are okay without further evidence (namely, + * a pending fsync request getting canceled ... see ProcessSyncRequests). + */ +#ifndef WIN32 +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) +#else +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) +#endif /* * prototypes for functions in fd.c diff --git a/src/include/storage/md.h b/src/include/storage/md.h new file mode 100644 index 0000000000..a6758a10dc --- /dev/null +++ b/src/include/storage/md.h @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------- + * + * md.h + * magnetic disk storage manager public interface declarations. + * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/md.h + * + *------------------------------------------------------------------------- + */ +#ifndef MD_H +#define MD_H + +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/sync.h" + +/* md storage manager functionality */ +extern void mdinit(void); +extern void mdclose(SMgrRelation reln, ForkNumber forknum); +extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool mdexists(SMgrRelation reln, ForkNumber forknum); +extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void mdextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void mdwrite(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, 
BlockNumber nblocks); +extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); +extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); + +extern void ForgetDatabaseSyncRequests(Oid dbid); +extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo); + +/* md sync callbacks */ +extern int mdsyncfiletag(const FileTag *ftag, char *path); +extern int mdunlinkfiletag(const FileTag *ftag, char *path); +extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); + +#endif /* MD_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 8e98273878..770193e285 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,7 +18,6 @@ #include "storage/block.h" #include "storage/relfilenode.h" - /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -106,43 +105,6 @@ extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); -extern void smgrpreckpt(void); -extern void smgrsync(void); -extern void smgrpostckpt(void); extern void AtEOXact_SMgr(void); - -/* internals: move me elsewhere -- ay 7/94 */ - -/* in md.c */ -extern void mdinit(void); -extern void mdclose(SMgrRelation reln, ForkNumber forknum); -extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool mdexists(SMgrRelation reln, ForkNumber forknum); -extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void mdextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void 
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); -extern void mdwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); -extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); -extern void mdpreckpt(void); -extern void mdsync(void); -extern void mdpostckpt(void); - -extern void SetForwardFsyncRequests(void); -extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, - BlockNumber segno); -extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); -extern void ForgetDatabaseFsyncRequests(Oid dbid); -extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo); - #endif /* SMGR_H */ diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h new file mode 100644 index 0000000000..124a49ea98 --- /dev/null +++ b/src/include/storage/sync.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * sync.h + * File synchronization management code. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sync.h + * + *------------------------------------------------------------------------- + */ +#ifndef SYNC_H +#define SYNC_H + +#include "storage/relfilenode.h" + +/* + * Type of sync request. These are used to manage the set of pending + * requests to call a sync handler's sync or unlink functions at the next + * checkpoint. 
+ */ +typedef enum SyncRequestType +{ + SYNC_REQUEST, /* schedule a call of sync function */ + SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */ + SYNC_FORGET_REQUEST, /* forget all calls for a tag */ + SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */ +} SyncRequestType; + +/* + * Which set of functions to use to handle a given request. See the function + * table in sync.c. + */ +typedef enum SyncRequestHandler +{ + SYNC_HANDLER_MD = 0 /* md smgr */ +} SyncRequestHandler; + +/* + * A tag identifying a file. Currently it has the members required for md.c's + * usage, but sync.c has no knowledge of the internal structure, and it is + * liable to change as required by future handlers. + */ +typedef struct FileTag +{ + int16 handler; /* SyncRequestHandler value, saving space */ + int16 forknum; /* ForkNumber, saving space */ + RelFileNode rnode; + uint32 segno; +} FileTag; + +extern void InitSync(void); +extern void SyncPreCheckpoint(void); +extern void SyncPostCheckpoint(void); +extern void ProcessSyncRequests(void); +extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type); +extern void EnableSyncRequestForwarding(void); +extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, + bool retryOnError); + +#endif /* SYNC_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index f31929664a..e09f9353ed 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -651,6 +651,7 @@ File FileFdwExecutionState FileFdwPlanState FileNameMap +FileTag FindSplitData FixedParallelExecutorState FixedParallelState @@ -1700,7 +1701,7 @@ PathKeysComparison PathTarget Pattern_Prefix_Status Pattern_Type -PendingOperationEntry +PendingFsyncEntry PendingRelDelete PendingUnlinkEntry PendingWriteback @@ -2276,7 +2277,10 @@ Subscription SubscriptionInfo SubscriptionRelState Syn +SyncOps SyncRepConfigData +SyncRequestHandler +SyncRequestType SysScanDesc
SyscacheCallbackFunction SystemRowsSamplerData