Per previous discussions, get rid of use of sync(2) in favor of

explicitly fsync'ing every (non-temp) file we have written since the
last checkpoint.  In the vast majority of cases, the burden of the
fsyncs should fall on the bgwriter process not on backends.  (To this
end, we assume that an fsync issued by the bgwriter will force out
blocks written to the same file by other processes using other file
descriptors.  Anyone have a problem with that?)  This makes the world
safe for WIN32, which ain't even got sync(2), and really makes the world
safe for Unixen as well, because sync(2) never had the semantics we need:
it offers no way to wait for the requested I/O to finish.

Along the way, fix a bug I recently introduced in xlog recovery:
file truncation replay failed to clear bufmgr buffers for the dropped
blocks, which could result in 'PANIC:  heap_delete_redo: no block'
later on in xlog replay.
This commit is contained in:
Tom Lane 2004-05-31 03:48:10 +00:00
parent f024086db3
commit 9b178555fc
13 changed files with 779 additions and 250 deletions

View File

@ -13,7 +13,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
Assert(status == TRANSACTION_STATUS_COMMITTED ||
status == TRANSACTION_STATUS_ABORTED);
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
byteptr += byteno;
@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
/* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
/*
@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid)
char *byteptr;
XidStatus status;
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
byteptr += byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
return status;
}
@ -169,16 +169,16 @@ BootStrapCLOG(void)
{
int slotno;
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Create and zero the first page of the commit log */
slotno = ZeroCLOGPage(0, false);
/* Make sure it's written out */
SimpleLruWritePage(ClogCtl, slotno);
SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
/*
@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact)
pageno = TransactionIdToPage(newestXact);
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Zero the page and make an XLOG entry about it */
ZeroCLOGPage(pageno, true);
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
memcpy(&pageno, XLogRecGetData(record), sizeof(int));
LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
slotno = ZeroCLOGPage(pageno, false);
SimpleLruWritePage(ClogCtl, slotno);
SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
LWLockRelease(ClogCtl->locks->ControlLock);
LWLockRelease(ClogCtl->ControlLock);
}
}

View File

@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -17,6 +17,7 @@
#include <unistd.h>
#include "access/slru.h"
#include "access/clog.h" /* only for NUM_CLOG_BUFFERS */
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
@ -100,6 +101,8 @@ typedef enum
*/
typedef struct SlruSharedData
{
LWLockId ControlLock;
/*
* Info for each buffer slot. Page number is undefined when status is
* EMPTY. lru_count is essentially the number of page switches since
@ -110,6 +113,7 @@ typedef struct SlruSharedData
SlruPageStatus page_status[NUM_CLOG_BUFFERS];
int page_number[NUM_CLOG_BUFFERS];
unsigned int page_lru_count[NUM_CLOG_BUFFERS];
LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */
/*
* latest_page_number is the page number of the current end of the
@ -118,12 +122,24 @@ typedef struct SlruSharedData
*/
int latest_page_number;
} SlruSharedData;
typedef SlruSharedData *SlruShared;
#define SlruFileName(ctl, path, seg) \
snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
/*
 * During SimpleLruFlush(), we will usually not need to write/fsync more
 * than one or two physical files, but we may need to write several pages
 * per file. We can consolidate the I/O requests by leaving files open
 * until control returns to SimpleLruFlush(). This data structure remembers
 * which files are open.
 *
 * fd[] and segno[] are parallel arrays: entry i describes one open segment
 * file, and only entries 0 .. num_files-1 are meaningful. The writer looks
 * up an already-open file by segment number and appends a new entry when it
 * has to open one. The arrays are sized NUM_CLOG_BUFFERS, matching the
 * number of buffer slots a flush iterates over.
 */
typedef struct SlruFlushData
{
int num_files; /* # files actually open (valid entries below) */
int fd[NUM_CLOG_BUFFERS]; /* their FD's */
int segno[NUM_CLOG_BUFFERS]; /* their segment numbers (pageno / SLRU_PAGES_PER_SEGMENT) */
} SlruFlushData;
/*
* Macro to mark a buffer slot "most recently used".
*/
@ -145,14 +161,17 @@ typedef enum
SLRU_SEEK_FAILED,
SLRU_READ_FAILED,
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
SlruFlush fdata);
static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
@ -165,24 +184,16 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
int
SimpleLruShmemSize(void)
{
return MAXALIGN(sizeof(SlruSharedData))
+ BLCKSZ * NUM_CLOG_BUFFERS
+ MAXALIGN(sizeof(SlruLockData))
;
return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS;
}
void
SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
{
bool found;
char *ptr;
SlruShared shared;
SlruLock locks;
bool found;
ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found);
shared = (SlruShared) ptr;
locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) +
BLCKSZ * NUM_CLOG_BUFFERS);
shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found);
if (!IsUnderPostmaster)
{
@ -192,18 +203,18 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
Assert(!found);
locks->ControlLock = LWLockAssign();
memset(shared, 0, sizeof(SlruSharedData));
shared->ControlLock = LWLockAssign();
bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
{
locks->BufferLocks[slotno] = LWLockAssign();
shared->page_buffer[slotno] = bufptr;
shared->page_status[slotno] = SLRU_PAGE_EMPTY;
shared->page_lru_count[slotno] = 1;
shared->BufferLocks[slotno] = LWLockAssign();
bufptr += BLCKSZ;
}
@ -213,10 +224,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
Assert(found);
/* Initialize the unshared control struct */
ctl->locks = locks;
ctl->shared = shared;
ctl->ControlLock = shared->ControlLock;
/* Init directory path */
/* Initialize unshared copy of directory path */
snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
}
@ -232,7 +243,7 @@ int
SimpleLruZeroPage(SlruCtl ctl, int pageno)
{
int slotno;
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
/* Find a suitable buffer slot for the page */
slotno = SlruSelectLRUPage(ctl, pageno);
@ -270,7 +281,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
char *
SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
{
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
/* Outer loop handles restart if we lose the buffer to someone else */
for (;;)
@ -313,8 +324,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
SlruRecentlyUsed(shared, slotno);
/* Release shared lock, grab per-buffer lock instead */
LWLockRelease(ctl->locks->ControlLock);
LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
LWLockRelease(shared->ControlLock);
LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
/*
* Check to see if someone else already did the read, or took the
@ -323,8 +334,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
if (shared->page_number[slotno] != pageno ||
shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
{
LWLockRelease(ctl->locks->BufferLocks[slotno]);
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
LWLockRelease(shared->BufferLocks[slotno]);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
continue;
}
@ -332,14 +343,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
/* Re-acquire shared control lock and update page state */
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
Assert(shared->page_number[slotno] == pageno &&
shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);
shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
LWLockRelease(ctl->locks->BufferLocks[slotno]);
LWLockRelease(shared->BufferLocks[slotno]);
/* Now it's okay to ereport if we failed */
if (!ok)
@ -364,11 +375,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
* Control lock must be held at entry, and will be held at exit.
*/
void
SimpleLruWritePage(SlruCtl ctl, int slotno)
SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
int pageno;
bool ok;
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
/* Do nothing if page does not need writing */
if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
@ -378,8 +389,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
pageno = shared->page_number[slotno];
/* Release shared lock, grab per-buffer lock instead */
LWLockRelease(ctl->locks->ControlLock);
LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
LWLockRelease(shared->ControlLock);
LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
/*
* Check to see if someone else already did the write, or took the
@ -392,8 +403,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
{
LWLockRelease(ctl->locks->BufferLocks[slotno]);
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
LWLockRelease(shared->BufferLocks[slotno]);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
return;
}
@ -412,10 +423,19 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
/* Okay, do the write */
ok = SlruPhysicalWritePage(ctl, pageno, slotno);
ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
/* If we failed, and we're in a flush, better close the files */
if (!ok && fdata)
{
int i;
for (i = 0; i < fdata->num_files; i++)
close(fdata->fd[i]);
}
/* Re-acquire shared control lock and update page state */
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
Assert(shared->page_number[slotno] == pageno &&
(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
@ -425,7 +445,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
LWLockRelease(ctl->locks->BufferLocks[slotno]);
LWLockRelease(shared->BufferLocks[slotno]);
/* Now it's okay to ereport if we failed */
if (!ok)
@ -445,7 +465,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
static bool
SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
int offset = rpageno * BLCKSZ;
@ -482,6 +502,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
close(fd);
return false;
}
@ -490,6 +511,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
slru_errcause = SLRU_READ_FAILED;
slru_errno = errno;
close(fd);
return false;
}
@ -511,50 +533,80 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
* info in static variables to let SlruReportIOError make the report.
*
* For now, assume it's not worth keeping a file pointer open across
* read/write operations. We could cache one virtual file pointer ...
* independent read/write operations. We do batch operations during
* SimpleLruFlush, though.
*
* fdata is NULL for a standalone write, pointer to open-file info during
* SimpleLruFlush.
*/
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
int offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd;
SlruFileName(ctl, path, segno);
int fd = -1;
/*
* If the file doesn't already exist, we should create it. It is
* possible for this to need to happen when writing a page that's not
* first in its segment; we assume the OS can cope with that. (Note:
* it might seem that it'd be okay to create files only when
* SimpleLruZeroPage is called for the first page of a segment.
* However, if after a crash and restart the REDO logic elects to
* replay the log from a checkpoint before the latest one, then it's
* possible that we will get commands to set transaction status of
* transactions that have already been truncated from the commit log.
* Easiest way to deal with that is to accept references to
* nonexistent files here and in SlruPhysicalReadPage.)
* During a Flush, we may already have the desired file open.
*/
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fdata)
{
int i;
for (i = 0; i < fdata->num_files; i++)
{
if (fdata->segno[i] == segno)
{
fd = fdata->fd[i];
break;
}
}
}
if (fd < 0)
{
if (errno != ENOENT)
{
slru_errcause = SLRU_OPEN_FAILED;
slru_errno = errno;
return false;
}
fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
/*
* If the file doesn't already exist, we should create it. It is
* possible for this to need to happen when writing a page that's not
* first in its segment; we assume the OS can cope with that.
* (Note: it might seem that it'd be okay to create files only when
* SimpleLruZeroPage is called for the first page of a segment.
* However, if after a crash and restart the REDO logic elects to
* replay the log from a checkpoint before the latest one, then it's
* possible that we will get commands to set transaction status of
* transactions that have already been truncated from the commit log.
* Easiest way to deal with that is to accept references to
* nonexistent files here and in SlruPhysicalReadPage.)
*/
SlruFileName(ctl, path, segno);
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
{
slru_errcause = SLRU_CREATE_FAILED;
slru_errno = errno;
return false;
if (errno != ENOENT)
{
slru_errcause = SLRU_OPEN_FAILED;
slru_errno = errno;
return false;
}
fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
S_IRUSR | S_IWUSR);
if (fd < 0)
{
slru_errcause = SLRU_CREATE_FAILED;
slru_errno = errno;
return false;
}
}
if (fdata)
{
fdata->fd[fdata->num_files] = fd;
fdata->segno[fdata->num_files] = segno;
fdata->num_files++;
}
}
@ -562,6 +614,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
{
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
if (!fdata)
close(fd);
return false;
}
@ -573,14 +627,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
errno = ENOSPC;
slru_errcause = SLRU_WRITE_FAILED;
slru_errno = errno;
if (!fdata)
close(fd);
return false;
}
if (close(fd))
/*
* If not part of Flush, need to fsync now. We assume this happens
* infrequently enough that it's not a performance issue.
*/
if (!fdata)
{
slru_errcause = SLRU_CLOSE_FAILED;
slru_errno = errno;
return false;
if (pg_fsync(fd))
{
slru_errcause = SLRU_FSYNC_FAILED;
slru_errno = errno;
close(fd);
return false;
}
if (close(fd))
{
slru_errcause = SLRU_CLOSE_FAILED;
slru_errno = errno;
return false;
}
}
return true;
@ -637,6 +708,13 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
errdetail("could not write to file \"%s\" at offset %u: %m",
path, offset)));
break;
case SLRU_FSYNC_FAILED:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not access status of transaction %u", xid),
errdetail("could not fsync file \"%s\": %m",
path)));
break;
case SLRU_CLOSE_FAILED:
ereport(ERROR,
(errcode_for_file_access(),
@ -668,7 +746,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
static int
SlruSelectLRUPage(SlruCtl ctl, int pageno)
{
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
/* Outer loop handles restart after I/O */
for (;;)
@ -717,7 +795,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
(void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
InvalidTransactionId, false);
else
SimpleLruWritePage(ctl, bestslot);
SimpleLruWritePage(ctl, bestslot, NULL);
/*
* Now loop back and try again. This is the easiest way of
@ -733,7 +811,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
void
SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
{
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
shared->latest_page_number = pageno;
}
@ -744,16 +822,20 @@ SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
void
SimpleLruFlush(SlruCtl ctl, bool checkpoint)
{
#ifdef USE_ASSERT_CHECKING /* only used in Assert() */
SlruShared shared = (SlruShared) ctl->shared;
#endif
SlruShared shared = ctl->shared;
SlruFlushData fdata;
int slotno;
int pageno = 0;
int i;
bool ok;
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
fdata.num_files = 0;
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
{
SimpleLruWritePage(ctl, slotno);
SimpleLruWritePage(ctl, slotno, &fdata);
/*
* When called during a checkpoint, we cannot assert that the slot
@ -765,7 +847,32 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
shared->page_status[slotno] == SLRU_PAGE_CLEAN);
}
LWLockRelease(ctl->locks->ControlLock);
LWLockRelease(shared->ControlLock);
/*
* Now fsync and close any files that were open
*/
ok = true;
for (i = 0; i < fdata.num_files; i++)
{
if (pg_fsync(fdata.fd[i]))
{
slru_errcause = SLRU_FSYNC_FAILED;
slru_errno = errno;
pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
ok = false;
}
if (close(fdata.fd[i]))
{
slru_errcause = SLRU_CLOSE_FAILED;
slru_errno = errno;
pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
ok = false;
}
}
if (!ok)
SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
/*
@ -786,7 +893,7 @@ void
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
{
int slotno;
SlruShared shared = (SlruShared) ctl->shared;
SlruShared shared = ctl->shared;
/*
* The cutoff point is the start of the segment containing cutoffPage.
@ -805,7 +912,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
* have been flushed already during the checkpoint, we're just being
* extra careful here.)
*/
LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
restart:;
@ -817,7 +924,7 @@ restart:;
*/
if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
{
LWLockRelease(ctl->locks->ControlLock);
LWLockRelease(shared->ControlLock);
ereport(LOG,
(errmsg("could not truncate directory \"%s\": apparent wraparound",
ctl->Dir)));
@ -849,11 +956,11 @@ restart:;
(void) SimpleLruReadPage(ctl, shared->page_number[slotno],
InvalidTransactionId, false);
else
SimpleLruWritePage(ctl, slotno);
SimpleLruWritePage(ctl, slotno, NULL);
goto restart;
}
LWLockRelease(ctl->locks->ControlLock);
LWLockRelease(shared->ControlLock);
/* Now we can remove the old segment(s) */
(void) SlruScanDirectory(ctl, cutoffPage, true);
@ -878,7 +985,8 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
if (cldir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", ctl->Dir)));
errmsg("could not open directory \"%s\": %m",
ctl->Dir)));
errno = 0;
while ((clde = readdir(cldir)) != NULL)

View File

@ -27,14 +27,17 @@
*
* If the bgwriter exits unexpectedly, the postmaster treats that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
* should be killed by SIGQUIT and then a recovery cycle started.
* should be killed by SIGQUIT and then a recovery cycle started. (Even if
* shared memory isn't corrupted, we have lost information about which
* files need to be fsync'd for the next checkpoint, and so a system
* restart needs to be forced.)
*
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -55,13 +58,54 @@
#include "utils/guc.h"
/*
/*----------
* Shared memory area for communication between bgwriter and backends
*
* The ckpt counters allow backends to watch for completion of a checkpoint
* request they send. Here's how it works:
* * At start of a checkpoint, bgwriter increments ckpt_started.
* * On completion of a checkpoint, bgwriter sets ckpt_done to
* equal ckpt_started.
* * On failure of a checkpoint, bgwriter first increments ckpt_failed,
* then sets ckpt_done to equal ckpt_started.
* All three fields are declared sig_atomic_t to ensure they can be read
* and written without explicit locking. The algorithm for backends is:
* 1. Record current values of ckpt_failed and ckpt_started (in that
* order!).
* 2. Send signal to request checkpoint.
* 3. Sleep until ckpt_started changes. Now you know a checkpoint has
* begun since you started this algorithm (although *not* that it was
* specifically initiated by your signal).
* 4. Record new value of ckpt_started.
* 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo
* arithmetic here in case counters wrap around.) Now you know a
* checkpoint has started and completed, but not whether it was
* successful.
* 6. If ckpt_failed is different from the originally saved value,
* assume request failed; otherwise it was definitely successful.
*
* The requests array holds fsync requests sent by backends and not yet
* absorbed by the bgwriter.
*----------
*/
/*
 * One queued fsync request. A backend fills this in via
 * ForwardFsyncRequest(); the bgwriter later passes both fields to
 * RememberFsyncRequest() when it drains the queue.
 */
typedef struct
{
RelFileNode rnode; /* relation file the backend wrote to */
BlockNumber segno; /* segment within that relation -- NOTE(review): exact meaning is defined by the caller; confirm */
/* might add a request-type field later */
} BgWriterRequest;
/*
 * Layout of the bgwriter's shared memory area.
 *
 * The request queue (num_requests / requests[]) is read and modified while
 * holding BgWriterCommLock. requests[] is declared with one element but is
 * really max_requests entries long; the extra space is accounted for when
 * the shared memory area is sized.
 */
typedef struct
{
pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */
sig_atomic_t checkpoint_count; /* advances when checkpoint done */
sig_atomic_t ckpt_started; /* advances when checkpoint starts */
sig_atomic_t ckpt_done; /* advances when checkpoint done */
sig_atomic_t ckpt_failed; /* advances when checkpoint fails */
int num_requests; /* current # of requests */
int max_requests; /* allocated array size */
BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */
} BgWriterShmemStruct;
static BgWriterShmemStruct *BgWriterShmem;
@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false;
/*
* Private state
*/
static bool am_bg_writer = false;
static bool ckpt_active = false;
static time_t last_checkpoint_time;
@ -106,6 +154,7 @@ BackgroundWriterMain(void)
{
Assert(BgWriterShmem != NULL);
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
/*
* Properly accept or ignore signals the postmaster might send us
@ -180,6 +229,17 @@ BackgroundWriterMain(void)
*/
InError = false;
/* Warn any waiting backends that the checkpoint failed. */
if (ckpt_active)
{
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
bgs->ckpt_failed++;
bgs->ckpt_done = bgs->ckpt_started;
ckpt_active = false;
}
/*
* Exit interrupt holdoff section we implicitly established above.
*/
@ -214,8 +274,17 @@ BackgroundWriterMain(void)
long udelay;
/*
* Process any signals received recently.
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
*/
if (!PostmasterIsAlive(true))
exit(1);
/*
* Process any requests or signals received recently.
*/
AbsorbFsyncRequests();
if (got_SIGHUP)
{
got_SIGHUP = false;
@ -265,8 +334,20 @@ BackgroundWriterMain(void)
errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
}
/*
* Indicate checkpoint start to any waiting backends.
*/
ckpt_active = true;
BgWriterShmem->ckpt_started++;
CreateCheckPoint(false, force_checkpoint);
/*
* Indicate checkpoint completion to any waiting backends.
*/
BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
ckpt_active = false;
/*
* Note we record the checkpoint start time not end time as
* last_checkpoint_time. This is so that time-driven checkpoints
@ -274,14 +355,11 @@ BackgroundWriterMain(void)
*/
last_checkpoint_time = now;
/*
* Indicate checkpoint completion to any waiting backends.
*/
BgWriterShmem->checkpoint_count++;
/*
* After any checkpoint, close all smgr files. This is so we
* won't hang onto smgr references to deleted files indefinitely.
* (It is safe to do this because this process does not have a
* relcache, and so no dangling references could remain.)
*/
smgrcloseall();
@ -301,6 +379,8 @@ BackgroundWriterMain(void)
* we respond reasonably promptly when someone signals us,
* break down the sleep into 1-second increments, and check for
* interrupts after each nap.
*
* We absorb pending requests after each short sleep.
*/
udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
while (udelay > 1000000L)
@ -308,17 +388,11 @@ BackgroundWriterMain(void)
if (got_SIGHUP || checkpoint_requested || shutdown_requested)
break;
pg_usleep(1000000L);
AbsorbFsyncRequests();
udelay -= 1000000L;
}
if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
pg_usleep(udelay);
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
*/
if (!PostmasterIsAlive(true))
exit(1);
}
}
@ -387,10 +461,11 @@ int
BgWriterShmemSize(void)
{
/*
* This is not worth measuring right now, but may become so after we
* add fsync signaling ...
* Currently, the size of the requests[] array is arbitrarily set
* equal to NBuffers. This may prove too large or small ...
*/
return MAXALIGN(sizeof(BgWriterShmemStruct));
return MAXALIGN(sizeof(BgWriterShmemStruct) +
(NBuffers - 1) * sizeof(BgWriterRequest));
}
/*
@ -404,7 +479,7 @@ BgWriterShmemInit(void)
BgWriterShmem = (BgWriterShmemStruct *)
ShmemInitStruct("Background Writer Data",
sizeof(BgWriterShmemStruct),
BgWriterShmemSize(),
&found);
if (BgWriterShmem == NULL)
ereport(FATAL,
@ -414,6 +489,7 @@ BgWriterShmemInit(void)
return; /* already initialized */
MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
BgWriterShmem->max_requests = NBuffers;
}
/*
@ -427,8 +503,10 @@ BgWriterShmemInit(void)
void
RequestCheckpoint(bool waitforit)
{
volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
sig_atomic_t old_count = *count_ptr;
/* use volatile pointer to prevent code rearrangement */
volatile BgWriterShmemStruct *bgs = BgWriterShmem;
sig_atomic_t old_failed = bgs->ckpt_failed;
sig_atomic_t old_started = bgs->ckpt_started;
/*
* Send signal to request checkpoint. When waitforit is false,
@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit)
"could not signal for checkpoint: %m");
/*
* If requested, wait for completion. We detect completion by
* observing a change in checkpoint_count in shared memory.
* If requested, wait for completion. We detect completion according
* to the algorithm given above.
*/
if (waitforit)
{
while (*count_ptr == old_count)
while (bgs->ckpt_started == old_started)
{
CHECK_FOR_INTERRUPTS();
pg_usleep(1000000L);
pg_usleep(100000L);
}
old_started = bgs->ckpt_started;
/*
* We are waiting for ckpt_done >= old_started, in a modulo
* sense. This is a little tricky since we don't know the
* width or signedness of sig_atomic_t. We make the lowest
* common denominator assumption that it is only as wide
* as "char". This means that this algorithm will cope
* correctly as long as we don't sleep for more than 127
* completed checkpoints. (If we do, we will get another
* chance to exit after 128 more checkpoints...)
*/
while (((signed char) (bgs->ckpt_done - old_started)) < 0)
{
CHECK_FOR_INTERRUPTS();
pg_usleep(100000L);
}
if (bgs->ckpt_failed != old_failed)
ereport(ERROR,
(errmsg("checkpoint request failed"),
errhint("Consult the postmaster log for details.")));
}
}
/*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
*
* Whenever a backend is compelled to write directly to a relation
* (which should be seldom, if the bgwriter is getting its job done),
* the backend calls this routine to pass over knowledge that the relation
* is dirty and must be fsync'd before next checkpoint.
*
* If we are unable to pass over the request (at present, this can happen
* if the shared memory queue is full), we return false. That forces
* the backend to do its own fsync. We hope that will be even more seldom.
*
* Note: we presently make no attempt to eliminate duplicate requests
* in the requests[] queue. The bgwriter will have to eliminate dups
* internally anyway, so we may as well avoid holding the lock longer
* than we have to here.
*/
bool
ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
	bool		queued = false;

	/* Not running under a postmaster; probably shouldn't even get here */
	if (!IsUnderPostmaster)
		return false;

	Assert(BgWriterShmem != NULL);

	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);

	/*
	 * The request can only be handed over if a bgwriter has registered
	 * itself (nonzero PID) and the shared queue still has a free entry.
	 * Otherwise we report failure and the caller must cope on its own.
	 */
	if (BgWriterShmem->bgwriter_pid != 0 &&
		BgWriterShmem->num_requests < BgWriterShmem->max_requests)
	{
		BgWriterRequest *slot;

		/* Append at the tail of the queue, then bump the count */
		slot = &BgWriterShmem->requests[BgWriterShmem->num_requests];
		slot->rnode = rnode;
		slot->segno = segno;
		BgWriterShmem->num_requests++;
		queued = true;
	}

	LWLockRelease(BgWriterCommLock);

	return queued;
}
/*
* AbsorbFsyncRequests
* Retrieve queued fsync requests and pass them to local smgr.
*
* This is exported because it must be called during CreateCheckpoint;
* we have to be sure we have accepted all pending requests *after* we
* establish the checkpoint redo pointer. Since CreateCheckpoint
* sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
*/
void
AbsorbFsyncRequests(void)
{
	BgWriterRequest *pending = NULL;
	int			npending;
	int			i;

	/* Only the bgwriter process consumes the queue */
	if (!am_bg_writer)
		return;

	/*
	 * Take a private snapshot of the queue and empty it, so that the lock
	 * is held only for the duration of the copy and not while the requests
	 * are being processed.
	 */
	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);

	npending = BgWriterShmem->num_requests;
	if (npending > 0)
	{
		pending = (BgWriterRequest *) palloc(npending * sizeof(BgWriterRequest));
		memcpy(pending, BgWriterShmem->requests, npending * sizeof(BgWriterRequest));
	}
	BgWriterShmem->num_requests = 0;

	LWLockRelease(BgWriterCommLock);

	/* Hand each copied request to the local smgr, in queue order */
	for (i = 0; i < npending; i++)
		RememberFsyncRequest(pending[i].rnode, pending[i].segno);

	if (pending != NULL)
		pfree(pending);
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
* bothering to write them out first. This is NOT rollback-able,
* and so should be used only with extreme caution!
*
* There is no particularly good reason why this doesn't have a
* firstDelBlock parameter, except that current callers don't need it.
*
* We assume that the caller holds an exclusive lock on the relation,
* which should assure that no new buffers will be acquired for the rel
* meanwhile.
@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
void
DropRelationBuffers(Relation rel)
{
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
}
/* ---------------------------------------------------------------------
* DropRelFileNodeBuffers
*
* This is the same as DropRelationBuffers, except that the target
* relation is specified by RelFileNode and temp status.
* relation is specified by RelFileNode and temp status, and one
* may specify the first block to drop.
*
* This is NOT rollback-able. One legitimate use is to clear the
* buffer cache of buffers for a relation that is being deleted
@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
* --------------------------------------------------------------------
*/
void
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock)
{
int i;
BufferDesc *bufHdr;
@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
{
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
{
bufHdr = &BufferDescriptors[i - 1];
recheck:
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
{
/*
* If there is I/O in progress, better wait till it's done;

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
*
* NOTES:
*
@ -484,6 +484,7 @@ Insert(File file)
DO_DB(_dump_lru());
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
@ -685,6 +686,7 @@ filepath(const char *filename)
return buf;
}
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
FileAccess(File file)
{
@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
return returnCode;
}
/*
 * FileSync -- fsync the kernel file behind a virtual file descriptor.
 *
 * The file is first made accessible through FileAccess; if that fails,
 * its negative result is returned unchanged.  Otherwise the result of
 * pg_fsync on the underlying descriptor is returned.
 */
int
FileSync(File file)
{
	int			rc;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	rc = FileAccess(file);
	if (rc >= 0)
		rc = pg_fsync(VfdCache[file].fd);

	return rc;
}
long
FileSeek(File file, long offset, int whence)
{
int returnCode;
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
VfdCache[file].seekPos += offset;
break;
case SEEK_END:
FileAccess(file);
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
if (offset < 0)
elog(ERROR, "invalid seek offset: %ld", offset);
if (VfdCache[file].seekPos != offset)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
case SEEK_END:
VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
DO_DB(elog(LOG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
FileAccess(file);
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
return returnCode;
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -21,8 +21,10 @@
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
@ -33,37 +35,68 @@
* system's file size limit (often 2GBytes). In order to do that,
* we break relations up into chunks of < 2GBytes and store one chunk
* in each of several files that represent the relation. See the
* BLCKSZ and RELSEG_SIZE configuration constants in
* include/pg_config.h. All chunks except the last MUST have size exactly
* equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
* BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
* All chunks except the last MUST have size exactly equal to RELSEG_SIZE
* blocks --- see mdnblocks() and mdtruncate().
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
* cache is, therefore, just the head of a list of MdfdVec objects.
* But note the md_fd pointer can be NULL, indicating relation not open.
*
* Note that mdfd_chain == NULL does not necessarily mean the relation
* doesn't have another segment after this one; we may just not have
* opened the next segment yet. (We could not have "all segments are
* in the chain" as an invariant anyway, since another backend could
* extend the relation when we weren't looking.)
*
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
typedef struct _MdfdVec
{
File mdfd_vfd; /* fd number in fd.c's pool */
#ifndef LET_OS_MANAGE_FILESIZE
struct _MdfdVec *mdfd_chain; /* for large relations */
BlockNumber mdfd_segno; /* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
#endif
} MdfdVec;
static MemoryContext MdCxt; /* context for all md.c allocations */
/* routines declared here */
static MdfdVec *mdopen(SMgrRelation reln);
/*
* In some contexts (currently, standalone backends and the bgwriter process)
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
* table remembers the pending operations. We use a hash table not because
* we want to look up individual operations, but simply as a convenient way
* of eliminating duplicate requests.
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
*
* XXX for WIN32, may want to expand this to track pending deletes, too.
*/
/*
 * One pending fsync request, identifying a single segment of a relation.
 * The entire struct serves as the hash key of pendingOpsTable (mdinit sets
 * keysize to sizeof(PendingOperationEntry)), which is why the code that
 * builds an entry zeroes the whole struct first: any pad bytes must not
 * differ between otherwise-equal requests.
 */
typedef struct
{
RelFileNode rnode; /* the targeted relation */
BlockNumber segno; /* which segment */
} PendingOperationEntry;
static HTAB *pendingOpsTable = NULL;
/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
bool allowNotFound);
static BlockNumber _mdnblocks(File file, Size blcksz);
@ -79,6 +112,31 @@ mdinit(void)
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
/*
* Create pending-operations hashtable if we need it. Currently,
* we need it if we are standalone (not under a postmaster) OR
* if we are a bootstrap-mode subprocess of a postmaster (that is,
* a startup or bgwriter process).
*/
if (!IsUnderPostmaster || IsBootstrapProcessingMode())
{
HASHCTL hash_ctl;
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
hash_ctl.keysize = sizeof(PendingOperationEntry);
hash_ctl.entrysize = sizeof(PendingOperationEntry);
hash_ctl.hash = tag_hash;
hash_ctl.hcxt = MdCxt;
pendingOpsTable = hash_create("Pending Ops Table",
100L,
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
if (pendingOpsTable == NULL)
ereport(FATAL,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
return true;
}
@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
reln->md_fd = _fdvec_alloc();
reln->md_fd->mdfd_vfd = fd;
reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
#endif
@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
int nbytes;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum);
v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
return false;
}
if (!register_dirty_segment(reln, v))
return false;
#ifndef LET_OS_MANAGE_FILESIZE
Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
/*
* mdopen() -- Open the specified relation. ereport's on failure.
* (Optionally, can return NULL instead of ereport for ENOENT.)
*
* Note we only open the first segment, when there are multiple segments.
*/
static MdfdVec *
mdopen(SMgrRelation reln)
mdopen(SMgrRelation reln, bool allowNotFound)
{
MdfdVec *mdfd;
char *path;
File fd;
@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
if (fd < 0)
{
pfree(path);
if (allowNotFound && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open relation %u/%u: %m",
@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
pfree(path);
reln->md_fd = _fdvec_alloc();
reln->md_fd = mdfd = _fdvec_alloc();
reln->md_fd->mdfd_vfd = fd;
mdfd->mdfd_vfd = fd;
mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
mdfd->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
return reln->md_fd;
return mdfd;
}
/*
@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
int nbytes;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum);
v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
long seekpos;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum);
v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
return false;
if (!register_dirty_segment(reln, v))
return false;
return true;
}
@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
BlockNumber
mdnblocks(SMgrRelation reln)
{
MdfdVec *v = mdopen(reln);
MdfdVec *v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
if (nblocks == curnblk)
return nblocks; /* no work */
v = mdopen(reln);
v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
@ -575,61 +645,168 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
return nblocks;
}
/*
* mdcommit() -- Commit a transaction.
*/
bool
mdcommit(void)
{
/*
* We don't actually have to do anything here...
*/
return true;
}
/*
* mdabort() -- Abort a transaction.
*/
bool
mdabort(void)
{
/*
* We don't actually have to do anything here...
*/
return true;
}
/*
* mdsync() -- Sync previous writes to stable storage.
*
* This is only called during checkpoints, and checkpoints should only
* occur in processes that have created a pendingOpsTable.
*/
bool
mdsync(void)
{
sync();
if (IsUnderPostmaster)
pg_usleep(2000000L);
sync();
HASH_SEQ_STATUS hstat;
PendingOperationEntry *entry;
if (!pendingOpsTable)
return false;
/*
* If we are in the bgwriter, the sync had better include all fsync
* requests that were queued by backends before the checkpoint REDO
* point was determined. We go that a little better by accepting
* all requests queued up to the point where we start fsync'ing.
*/
AbsorbFsyncRequests();
hash_seq_init(&hstat, pendingOpsTable);
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
{
/*
* If fsync is off then we don't have to bother opening the file
* at all. (We delay checking until this point so that changing
* fsync on the fly behaves sensibly.)
*/
if (enableFsync)
{
SMgrRelation reln;
MdfdVec *seg;
/*
* Find or create an smgr hash entry for this relation.
* This may seem a bit unclean -- md calling smgr? But it's
* really the best solution. It ensures that the open file
* reference isn't permanently leaked if we get an error here.
* (You may say "but an unreferenced SMgrRelation is still a
* leak!" Not really, because the only case in which a checkpoint
* is done by a process that isn't about to shut down is in the
* bgwriter, and it will periodically do smgrcloseall(). This
* fact justifies our not closing the reln in the success path
* either, which is a good thing since in non-bgwriter cases
* we couldn't safely do that.) Furthermore, in many cases
* the relation will have been dirtied through this same smgr
* relation, and so we can save a file open/close cycle.
*/
reln = smgropen(entry->rnode);
/*
* It is possible that the relation has been dropped or truncated
* since the fsync request was entered. Therefore, we have to
* allow file-not-found errors. This applies both during
* _mdfd_getseg() and during FileSync, since fd.c might have
* closed the file behind our back.
*/
seg = _mdfd_getseg(reln,
entry->segno * ((BlockNumber) RELSEG_SIZE),
true);
if (seg)
{
if (FileSync(seg->mdfd_vfd) < 0 &&
errno != ENOENT)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not fsync segment %u of relation %u/%u: %m",
entry->segno,
entry->rnode.tblNode,
entry->rnode.relNode)));
return false;
}
}
}
/* Okay, delete this entry */
if (hash_search(pendingOpsTable, entry,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
return true;
}
/*
 * register_dirty_segment() -- Note that a relation segment needs an fsync
 *
 * When this process owns a pending-ops table, the request is recorded
 * there for mdsync to handle later.  When it does not, we attempt to hand
 * the request to the background writer.  Should either of those fail, the
 * segment is fsync'd right here before returning (this is expected to be
 * rare enough not to matter for performance).
 *
 * Returns false only if the local fsync itself fails; errno is then valid
 * for error reporting.
 */
static bool
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
	if (pendingOpsTable == NULL)
	{
		/* No local table: try to pass the request to the bgwriter */
		if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
			return true;
	}
	else
	{
		PendingOperationEntry key;

		/* ensure any pad bytes in the struct are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = reln->smgr_rnode;
		key.segno = seg->mdfd_segno;

		if (hash_search(pendingOpsTable, &key, HASH_ENTER, NULL) != NULL)
			return true;
		/* out of memory: fall through to do it locally */
	}

	/* Could not queue the request anywhere, so sync the segment ourselves */
	return FileSync(seg->mdfd_vfd) >= 0;
}
/*
 * RememberFsyncRequest() -- bgwriter-side callback for an fsync request
 *
 * Records the request in the local pending-ops hash table so that it is
 * carried out during the bgwriter's next checkpoint.  Failure to enter
 * the request is reported as a FATAL out-of-memory error.
 */
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
	PendingOperationEntry key;

	Assert(pendingOpsTable);

	/* ensure any pad bytes in the struct are zeroed */
	MemSet(&key, 0, sizeof(key));
	key.rnode = rnode;
	key.segno = segno;

	if (hash_search(pendingOpsTable, &key, HASH_ENTER, NULL) != NULL)
		return;

	ereport(FATAL,
			(errcode(ERRCODE_OUT_OF_MEMORY),
			 errmsg("out of memory")));
}
/*
* _fdvec_alloc() -- Make a MdfdVec object.
*/
static MdfdVec *
_fdvec_alloc(void)
{
MdfdVec *v;
v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
v->mdfd_vfd = -1;
#ifndef LET_OS_MANAGE_FILESIZE
v->mdfd_chain = NULL;
#endif
return v;
return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
}
#ifndef LET_OS_MANAGE_FILESIZE
/*
* Open the specified segment of the relation,
* and make a MdfdVec object for it. Returns NULL on failure.
@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
char *path,
*fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(reln->smgr_rnode);
if (segno > 0)
{
/* be sure we have enough space for the '.segno' */
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%u", path, segno);
pfree(path);
@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
/* fill the entry */
v->mdfd_vfd = fd;
v->mdfd_segno = segno;
v->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
/* all done */
return v;
}
#endif
#endif /* LET_OS_MANAGE_FILESIZE */
/*
* _mdfd_getseg() -- Find the segment of the relation holding the
* specified block. ereport's on failure.
* specified block. ereport's on failure.
* (Optionally, can return NULL instead of ereport for ENOENT.)
*/
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
{
MdfdVec *v = mdopen(reln);
MdfdVec *v = mdopen(reln, allowNotFound);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber segno;
BlockNumber i;
BlockNumber segstogo;
BlockNumber nextsegno;
for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
segno > 0;
i++, segno--)
if (!v)
return NULL; /* only possible if allowNotFound */
for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
segstogo > 0;
nextsegno++, segstogo--)
{
if (v->mdfd_chain == NULL)
{
/*
@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
* one new segment per call, so this restriction seems
* reasonable.
*/
v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
v->mdfd_chain = _mdfd_openseg(reln,
nextsegno,
(segstogo == 1) ? O_CREAT : 0);
if (v->mdfd_chain == NULL)
{
if (allowNotFound && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
i,
nextsegno,
reln->smgr_rnode.tblNode,
reln->smgr_rnode.relNode,
blkno)));
}
}
v = v->mdfd_chain;
}

View File

@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -56,7 +56,7 @@ typedef struct f_smgr
static const f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
}
};
@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
* Get rid of any leftover buffers for the rel (shouldn't be any in the
* commit case, but there can be in the abort case).
*/
DropRelFileNodeBuffers(rnode, isTemp);
DropRelFileNodeBuffers(rnode, isTemp, 0);
/*
* Tell the free space map to forget this relation. It won't be accessed
@ -638,7 +638,7 @@ smgrcommit(void)
if (smgrsw[i].smgr_commit)
{
if (! (*(smgrsw[i].smgr_commit)) ())
elog(FATAL, "transaction commit failed on %s: %m",
elog(ERROR, "transaction commit failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
@ -658,7 +658,7 @@ smgrabort(void)
if (smgrsw[i].smgr_abort)
{
if (! (*(smgrsw[i].smgr_abort)) ())
elog(FATAL, "transaction abort failed on %s: %m",
elog(ERROR, "transaction abort failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
@ -678,7 +678,7 @@ smgrsync(void)
if (smgrsw[i].smgr_sync)
{
if (! (*(smgrsw[i].smgr_sync)) ())
elog(PANIC, "storage sync failed on %s: %m",
elog(ERROR, "storage sync failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
reln = smgropen(xlrec->rnode);
/*
* First, force bufmgr to drop any buffers it has for the to-be-
* truncated blocks. We must do this, else subsequent XLogReadBuffer
* operations will not re-extend the file properly.
*/
DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
/* Can't use smgrtruncate because it would try to xlog */
/*

View File

@ -6,26 +6,17 @@
* Portions Copyright (c) 2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
*/
#ifndef SLRU_H
#define SLRU_H
#include "access/xlog.h"
#include "storage/lwlock.h"
/* exported because lwlock.c needs it */
#define NUM_CLOG_BUFFERS 8
/*
* Note: the separation between SlruLockData and SlruSharedData is purely
* historical; the structs could be combined.
*/
typedef struct SlruLockData
{
LWLockId ControlLock;
LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */
} SlruLockData;
typedef SlruLockData *SlruLock;
/* Opaque structs known only in slru.c */
typedef struct SlruSharedData *SlruShared;
typedef struct SlruFlushData *SlruFlush;
/*
* SlruCtlData is an unshared structure that points to the active information
@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock;
*/
typedef struct SlruCtlData
{
void *shared; /* pointer to SlruSharedData */
SlruLock locks;
SlruShared shared;
LWLockId ControlLock;
/*
* Dir is set during SimpleLruShmemInit and does not change thereafter.
* The value is automatically inherited by backends via fork, and
* doesn't need to be in shared memory.
* Dir is set during SimpleLruInit and does not change thereafter.
* Since it's always the same, it doesn't need to be in shared memory.
*/
char Dir[MAXPGPATH];
@ -51,13 +42,16 @@ typedef struct SlruCtlData
bool (*PagePrecedes) (int, int);
} SlruCtlData;
typedef SlruCtlData *SlruCtl;
extern int SimpleLruShmemSize(void);
extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
TransactionId xid, bool forwrite);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);

View File

@ -5,13 +5,17 @@
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef _BGWRITER_H
#define _BGWRITER_H
#include "storage/block.h"
#include "storage/relfilenode.h"
/* GUC options */
extern int BgWriterDelay;
extern int BgWriterPercent;
@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void);
extern void RequestCheckpoint(bool waitforit);
extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
extern void AbsorbFsyncRequests(void);
extern int BgWriterShmemSize(void);
extern void BgWriterShmemInit(void);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
extern void RelationTruncate(Relation rel, BlockNumber nblocks);
extern int FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
extern void DropRelationBuffers(Relation rel);
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock);
extern void DropBuffers(Oid dbid);
#ifdef NOT_USED

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -15,7 +15,7 @@
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
* File {Close, Read, Write, Seek, Tell, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@ -66,6 +66,7 @@ extern void FileClose(File file);
extern void FileUnlink(File file);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
extern int FileSync(File file);
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -37,6 +37,7 @@ typedef enum LWLockId
ControlFileLock,
CheckpointLock,
RelCacheInitLock,
BgWriterCommLock,
NumFixedLWLocks, /* must be last except for
* MaxDynamicLWLock */

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
* $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer);
extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
extern BlockNumber mdnblocks(SMgrRelation reln);
extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
extern bool mdcommit(void);
extern bool mdabort(void);
extern bool mdsync(void);
extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
/* smgrtype.c */
extern Datum smgrout(PG_FUNCTION_ARGS);
extern Datum smgrin(PG_FUNCTION_ARGS);