From c34bb00581487e9fac9f8524bc6986d305fcfe3a Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 29 Jul 2005 03:22:33 +0000 Subject: [PATCH] Use O_DIRECT if available when using O_SYNC for wal_sync_method. Also, write multiple WAL buffers out in one write() operation. ITAGAKI Takahiro --------------------------------------------------------------------------- > If we disable writeback-cache and use open_sync, the per-page writing > behavior in WAL module will show up as bad result. O_DIRECT is similar > to O_DSYNC (at least on linux), so that the benefit of it will disappear > behind the slow disk revolution. > > In the current source, WAL is written as: > for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); } > Is this intentional? Can we rewrite it as follows? > write(&buffers[0], N * BLCKSZ); > > In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff). > Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff). > These two patches are independent, so they can be applied either or both. > > > I tested them on my machine and the results as follows. It shows that > direct-io and gather-write is the best choice when writeback-cache is off. > Are these two patches worth trying if they are used together? > > > | writeback | fsync= | fdata | open_ | fsync_ | open_ > patch | cache | false | sync | sync | direct | direct > ------------+-----------+--------+-------+-------+--------+--------- > direct io | off | 124.2 | 105.7 | 48.3 | 48.3 | 48.2 > direct io | on | 129.1 | 112.3 | 114.1 | 142.9 | 144.5 > gather-write| off | 124.3 | 108.7 | 105.4 | (N/A) | (N/A) > both | off | 131.5 | 115.5 | 114.4 | 145.4 | 145.2 > > - 20runs * pgbench -s 100 -c 50 -t 200 > - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8) > - using 2 ATA disks: > - hda(reiserfs) includes system and wal. > - hdc(jfs) includes database files. writeback-cache is always on. > > --- > ITAGAKI Takahiro --- src/backend/access/transam/xlog.c | 197 ++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 48 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1204ae34bf..fc9264cfcd 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -47,6 +47,20 @@ #include "utils/relcache.h" +/* + * Becauase O_DIRECT bypasses the kernel buffers, and because we never + * read those buffers except during crash recovery, it is a win to use + * it in all cases where we sync on each write(). We could allow O_DIRECT + * with fsync(), but because skipping the kernel buffer forces writes out + * quickly, it seems best just to use it for O_SYNC. It is hard to imagine + * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT. + */ +#ifdef O_DIRECT +#define PG_O_DIRECT O_DIRECT +#else +#define PG_O_DIRECT 0 +#endif + /* * This chunk of hackery attempts to determine which file sync methods * are available on the current platform, and to choose an appropriate @@ -54,24 +68,50 @@ * configure determined whether fdatasync() is. */ #if defined(O_SYNC) -#define OPEN_SYNC_FLAG O_SYNC +#define CMP_OPEN_SYNC_FLAG O_SYNC #else #if defined(O_FSYNC) -#define OPEN_SYNC_FLAG O_FSYNC +#define CMP_OPEN_SYNC_FLAG O_FSYNC #endif #endif +#define OPEN_SYNC_FLAG (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT) #if defined(O_DSYNC) #if defined(OPEN_SYNC_FLAG) -#if O_DSYNC != OPEN_SYNC_FLAG -#define OPEN_DATASYNC_FLAG O_DSYNC +#if O_DSYNC != CMP_OPEN_SYNC_FLAG +#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) #endif #else /* !defined(OPEN_SYNC_FLAG) */ /* Win32 only has O_DSYNC */ -#define OPEN_DATASYNC_FLAG O_DSYNC +#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) #endif #endif +/* + * Limitation of buffer-alignment for direct io depend on OS and filesystem, + * but BLCKSZ is assumed to be enough for it. + */ +#ifdef O_DIRECT +#define ALIGNOF_XLOG_BUFFER BLCKSZ +#else +#define ALIGNOF_XLOG_BUFFER MAXIMUM_ALIGNOF +#endif + +/* + * Switch the alignment routine because ShmemAlloc() returns a max-aligned + * buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF. + */ +#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF +#define XLOG_BUFFER_ALIGN(LEN) MAXALIGN((LEN)) +#else +#define XLOG_BUFFER_ALIGN(LEN) ((LEN) + (ALIGNOF_XLOG_BUFFER)) +#endif +/* assume sizeof(ptrdiff_t) == sizeof(void*) */ +#define POINTERALIGN(ALIGNVAL,PTR) \ + ((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1)))) +#define XLOG_BUFFER_POINTERALIGN(PTR) \ + POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR)) + #if defined(OPEN_DATASYNC_FLAG) #define DEFAULT_SYNC_METHOD_STR "open_datasync" #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN @@ -469,6 +509,17 @@ static void ReadControlFile(void); static char *str_time(time_t tnow); static void issue_xlog_fsync(void); +/* XLog gather-write staffs */ +typedef struct XLogPages +{ + char *head; /* Head of first page */ + int size; /* Total bytes of pages == count(pages) * BLCKSZ */ + int offset; /* Offset in xlog segment file */ +} XLogPages; +static void XLogPageReset(XLogPages *pages); +static void XLogPageWrite(XLogPages *pages, int index); +static void XLogPageFlush(XLogPages *pages, int index); + #ifdef WAL_DEBUG static void xlog_outrec(char *buf, XLogRecord *record); #endif @@ -1245,9 +1296,10 @@ static void XLogWrite(XLogwrtRqst WriteRqst) { XLogCtlWrite *Write = &XLogCtl->Write; - char *from; bool ispartialpage; bool use_existent; + int currentIndex = Write->curridx; + XLogPages pages; /* We should always be inside a critical section here */ Assert(CritSectionCount > 0); @@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst) */ LogwrtResult = Write->LogwrtResult; + XLogPageReset(&pages); + while (XLByteLT(LogwrtResult.Write, WriteRqst.Write)) { /* @@ -1266,14 +1320,14 @@ XLogWrite(XLogwrtRqst WriteRqst) * end of the last page that's been initialized by * AdvanceXLInsertBuffer. */ - if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx])) + if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[currentIndex])) elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, - XLogCtl->xlblocks[Write->curridx].xlogid, - XLogCtl->xlblocks[Write->curridx].xrecoff); + XLogCtl->xlblocks[currentIndex].xlogid, + XLogCtl->xlblocks[currentIndex].xrecoff); /* Advance LogwrtResult.Write to end of current buffer page */ - LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx]; + LogwrtResult.Write = XLogCtl->xlblocks[currentIndex]; ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write); if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg)) @@ -1281,6 +1335,7 @@ XLogWrite(XLogwrtRqst WriteRqst) /* * Switch to new logfile segment. */ + XLogPageFlush(&pages, currentIndex); if (openLogFile >= 0) { if (close(openLogFile)) @@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst) openLogOff = 0; } - /* Need to seek in the file? */ - if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize) - { - openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize; - if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not seek in log file %u, segment %u to offset %u: %m", - openLogId, openLogSeg, openLogOff))); - } - - /* OK to write the page */ - from = XLogCtl->pages + Write->curridx * BLCKSZ; - errno = 0; - if (write(openLogFile, from, BLCKSZ) != BLCKSZ) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log file %u, segment %u at offset %u: %m", - openLogId, openLogSeg, openLogOff))); - } - openLogOff += BLCKSZ; + /* Add a page to buffer */ + XLogPageWrite(&pages, currentIndex); /* * If we just wrote the whole last page of a logfile segment, @@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst) * This is also the right place to notify the Archiver that the * segment is ready to copy to archival storage. */ - if (openLogOff >= XLogSegSize && !ispartialpage) + if (openLogOff + pages.size >= XLogSegSize && !ispartialpage) { + XLogPageFlush(&pages, currentIndex); issue_xlog_fsync(); LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */ @@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst) LogwrtResult.Write = WriteRqst.Write; break; } - Write->curridx = NextBufIdx(Write->curridx); + currentIndex = NextBufIdx(currentIndex); } + XLogPageFlush(&pages, currentIndex); /* * If asked to flush, do so @@ -3584,7 +3618,7 @@ XLOGShmemSize(void) if (XLOGbuffers < MinXLOGbuffers) XLOGbuffers = MinXLOGbuffers; - return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers) + return XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers) + BLCKSZ * XLOGbuffers + MAXALIGN(sizeof(ControlFileData)); } @@ -3601,7 +3635,7 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", - MAXALIGN(sizeof(XLogCtlData) + + XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers) + BLCKSZ * XLOGbuffers, &foundXLog); @@ -3630,9 +3664,9 @@ XLOGShmemInit(void) * Here, on the other hand, we must MAXALIGN to ensure the page * buffers have worst-case alignment. */ - XLogCtl->pages = - ((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) + - sizeof(XLogRecPtr) * XLOGbuffers); + XLogCtl->pages = XLOG_BUFFER_POINTERALIGN( + ((char *) XLogCtl) + + sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers); memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers); /* @@ -3690,10 +3724,9 @@ BootStrapXLOG(void) /* First timeline ID is always 1 */ ThisTimeLineID = 1; - /* Use malloc() to ensure buffer is MAXALIGNED */ - buffer = (char *) malloc(BLCKSZ); - page = (XLogPageHeader) buffer; - memset(buffer, 0, BLCKSZ); + buffer = (char *) malloc(BLCKSZ + ALIGNOF_XLOG_BUFFER); + page = (XLogPageHeader) XLOG_BUFFER_POINTERALIGN(buffer); + memset(page, 0, BLCKSZ); /* Set up information for the initial checkpoint record */ checkPoint.redo.xlogid = 0; @@ -3745,7 +3778,7 @@ BootStrapXLOG(void) /* Write the first page with the initial record */ errno = 0; - if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ) + if (write(openLogFile, page, BLCKSZ) != BLCKSZ) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) @@ -5837,3 +5870,71 @@ remove_backup_label(void) errmsg("could not remove file \"%s\": %m", BACKUP_LABEL_FILE))); } + + +/* XLog gather-write staffs */ + +static void +XLogPageReset(XLogPages *pages) +{ + memset(pages, 0, sizeof(*pages)); +} + +static void +XLogPageWrite(XLogPages *pages, int index) +{ + char *page = XLogCtl->pages + index * BLCKSZ; + int size = BLCKSZ; + int offset = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize; + + if (pages->head + pages->size == page + && pages->offset + pages->size == offset) + { /* Pages are continuous. Append new page. */ + pages->size += size; + } + else + { /* Pages are not continuous. Flush and clear. */ + XLogPageFlush(pages, PrevBufIdx(index)); + pages->head = page; + pages->size = size; + pages->offset = offset; + } +} + +static void +XLogPageFlush(XLogPages *pages, int index) +{ + if (!pages->head) + { /* No needs to write pages. */ + XLogCtl->Write.curridx = index; + return; + } + + /* Need to seek in the file? */ + if (openLogOff != pages->offset) + { + openLogOff = pages->offset; + if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not seek in log file %u, segment %u to offset %u: %m", + openLogId, openLogSeg, openLogOff))); + } + + /* OK to write the page */ + errno = 0; + if (write(openLogFile, pages->head, pages->size) != pages->size) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %u, segment %u at offset %u: %m", + openLogId, openLogSeg, openLogOff))); + } + + openLogOff += pages->size; + XLogCtl->Write.curridx = index; + XLogPageReset(pages); +}