Remove the restriction that the relmap must be 512 bytes.

Instead of relying on the ability to atomically overwrite the
entire relmap file in one shot, write a new one and durably
rename it into place. Removing the struct padding and the
calculation showing why the map is exactly 512 bytes, and change
the maximum number of entries to a nearby round number.

Patch by me, reviewed by Andres Freund and Dilip Kumar.

Discussion: http://postgr.es/m/CA+TgmoZq5%3DLWDK7kHaUbmWXxcaTuw_QwafgG9dr-BaPym_U8WQ%40mail.gmail.com
Discussion: http://postgr.es/m/CAFiTN-ttOXLX75k_WzRo9ar=VvxFhrHi+rJxns997F+yvkm==A@mail.gmail.com
This commit is contained in:
Robert Haas 2022-07-26 14:56:25 -04:00
parent e530be2c5c
commit d8cd0c6c95
4 changed files with 58 additions and 46 deletions

View File

@ -1409,8 +1409,8 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting for a read of the relation map file.</entry> <entry>Waiting for a read of the relation map file.</entry>
</row> </row>
<row> <row>
<entry><literal>RelationMapSync</literal></entry> <entry><literal>RelationMapReplace</literal></entry>
<entry>Waiting for the relation map file to reach durable storage.</entry> <entry>Waiting for durable replacement of a relation map file.</entry>
</row> </row>
<row> <row>
<entry><literal>RelationMapWrite</literal></entry> <entry><literal>RelationMapWrite</literal></entry>

View File

@ -633,8 +633,8 @@ pgstat_get_wait_io(WaitEventIO w)
case WAIT_EVENT_RELATION_MAP_READ: case WAIT_EVENT_RELATION_MAP_READ:
event_name = "RelationMapRead"; event_name = "RelationMapRead";
break; break;
case WAIT_EVENT_RELATION_MAP_SYNC: case WAIT_EVENT_RELATION_MAP_REPLACE:
event_name = "RelationMapSync"; event_name = "RelationMapReplace";
break; break;
case WAIT_EVENT_RELATION_MAP_WRITE: case WAIT_EVENT_RELATION_MAP_WRITE:
event_name = "RelationMapWrite"; event_name = "RelationMapWrite";

View File

@ -60,21 +60,26 @@
/* /*
* The map file is critical data: we have no automatic method for recovering * The map file is critical data: we have no automatic method for recovering
* from loss or corruption of it. We use a CRC so that we can detect * from loss or corruption of it. We use a CRC so that we can detect
* corruption. To minimize the risk of failed updates, the map file should * corruption. Since the file might be more than one standard-size disk
* be kept to no more than one standard-size disk sector (ie 512 bytes), * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
* and we use overwrite-in-place rather than playing renaming games. * a new file and rename it into place, atomically replacing the original file.
* The struct layout below is designed to occupy exactly 512 bytes, which
* might make filesystem updates a bit more efficient.
* *
* Entries in the mappings[] array are in no particular order. We could * Entries in the mappings[] array are in no particular order. We could
* speed searching by insisting on OID order, but it really shouldn't be * speed searching by insisting on OID order, but it really shouldn't be
* worth the trouble given the intended size of the mapping sets. * worth the trouble given the intended size of the mapping sets.
*/ */
#define RELMAPPER_FILENAME "pg_filenode.map" #define RELMAPPER_FILENAME "pg_filenode.map"
#define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */ #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
#define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */ /*
* There's no need for this constant to have any particular value, and we
* can raise it as necessary if we end up with more mapped relations. For
* now, we just pick a round number that is modestly larger than the expected
* number of mappings.
*/
#define MAX_MAPPINGS 64
typedef struct RelMapping typedef struct RelMapping
{ {
@ -88,7 +93,6 @@ typedef struct RelMapFile
int32 num_mappings; /* number of valid RelMapping entries */ int32 num_mappings; /* number of valid RelMapping entries */
RelMapping mappings[MAX_MAPPINGS]; RelMapping mappings[MAX_MAPPINGS];
pg_crc32c crc; /* CRC of all above */ pg_crc32c crc; /* CRC of all above */
int32 pad; /* to make the struct size be 512 exactly */
} RelMapFile; } RelMapFile;
/* /*
@ -877,6 +881,7 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
{ {
int fd; int fd;
char mapfilename[MAXPGPATH]; char mapfilename[MAXPGPATH];
char maptempfilename[MAXPGPATH];
/* /*
* Fill in the overhead fields and update CRC. * Fill in the overhead fields and update CRC.
@ -890,17 +895,47 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
FIN_CRC32C(newmap->crc); FIN_CRC32C(newmap->crc);
/* /*
* Open the target file. We prefer to do this before entering the * Construct filenames -- a temporary file that we'll create to write the
* critical section, so that an open() failure need not force PANIC. * data initially, and then the permanent name to which we will rename it.
*/ */
snprintf(mapfilename, sizeof(mapfilename), "%s/%s", snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
dbpath, RELMAPPER_FILENAME); dbpath, RELMAPPER_FILENAME);
fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY); snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
dbpath, RELMAPPER_TEMP_FILENAME);
/*
* Open a temporary file. If a file already exists with this name, it must
* be left over from a previous crash, so we can overwrite it. Concurrent
* calls to this function are not allowed.
*/
fd = OpenTransientFile(maptempfilename,
O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
if (fd < 0) if (fd < 0)
ereport(ERROR, ereport(ERROR,
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", errmsg("could not open file \"%s\": %m",
mapfilename))); maptempfilename)));
/* Write new data to the file. */
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
maptempfilename)));
}
pgstat_report_wait_end();
/* And close the file. */
if (CloseTransientFile(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m",
maptempfilename)));
if (write_wal) if (write_wal)
{ {
@ -924,40 +959,17 @@ write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
XLogFlush(lsn); XLogFlush(lsn);
} }
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
errno = ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
mapfilename)));
}
pgstat_report_wait_end();
/* /*
* We choose to fsync the data to disk before considering the task done. * durable_rename() does all the hard work of making sure that we rename
* It would be possible to relax this if it turns out to be a performance * the temporary file into place in a crash-safe manner.
* issue, but it would complicate checkpointing --- see notes for *
* CheckPointRelationMap. * NB: Although we instruct durable_rename() to use ERROR, we will often
* be in a critical section at this point; if so, ERROR will become PANIC.
*/ */
pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC); pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
if (pg_fsync(fd) != 0) durable_rename(maptempfilename, mapfilename, ERROR);
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m",
mapfilename)));
pgstat_report_wait_end(); pgstat_report_wait_end();
if (CloseTransientFile(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m",
mapfilename)));
/* /*
* Now that the file is safely on disk, send sinval message to let other * Now that the file is safely on disk, send sinval message to let other
* backends know to re-read it. We must do this inside the critical * backends know to re-read it. We must do this inside the critical

View File

@ -194,7 +194,7 @@ typedef enum
WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE, WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE,
WAIT_EVENT_LOGICAL_REWRITE_WRITE, WAIT_EVENT_LOGICAL_REWRITE_WRITE,
WAIT_EVENT_RELATION_MAP_READ, WAIT_EVENT_RELATION_MAP_READ,
WAIT_EVENT_RELATION_MAP_SYNC, WAIT_EVENT_RELATION_MAP_REPLACE,
WAIT_EVENT_RELATION_MAP_WRITE, WAIT_EVENT_RELATION_MAP_WRITE,
WAIT_EVENT_REORDER_BUFFER_READ, WAIT_EVENT_REORDER_BUFFER_READ,
WAIT_EVENT_REORDER_BUFFER_WRITE, WAIT_EVENT_REORDER_BUFFER_WRITE,