diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 2a99a286c6..5d87408ce9 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -9,10 +9,16 @@ #include "postgres_fe.h" +#include "access/visibilitymap.h" #include "pg_upgrade.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/checksum_impl.h" +#include #include +#define BITS_PER_HEAPBLOCK_OLD 1 #ifndef WIN32 @@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force) #endif +/* + * rewriteVisibilityMap() + * + * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's + * visibility map included one bit per heap page; it now includes two. + * When upgrading a cluster from before that time to a current PostgreSQL + * version, we could refuse to copy visibility maps from the old cluster + * to the new cluster; the next VACUUM would recreate them, but at the + * price of scanning the entire table. So, instead, we rewrite the old + * visibility maps in the new format. That way, the all-visible bit + * remains set for the pages for which it was set previously. The + * all-frozen bit is never set by this conversion; we leave that to + * VACUUM. + */ +const char * +rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force) +{ + int src_fd = 0; + int dst_fd = 0; + char buffer[BLCKSZ]; + ssize_t bytesRead; + ssize_t src_filesize; + int rewriteVmBytesPerPage; + BlockNumber new_blkno = 0; + struct stat statbuf; + + /* Compute we need how many old page bytes to rewrite a new page */ + rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; + + if ((fromfile == NULL) || (tofile == NULL)) + return "Invalid old file or new file"; + + if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0) + return getErrorText(); + + if (fstat(src_fd, &statbuf) != 0) + { + close(src_fd); + return getErrorText(); + } + + if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0) + { + close(src_fd); + return getErrorText(); + } + + /* Save old file size */ + src_filesize = statbuf.st_size; + + /* + * Turn each visibility map page into 2 pages one by one. Each new page + * has the same page header as the old one. If the last section of last + * page is empty, we skip it, mostly to avoid turning one-page visibility + * maps for small relations into two pages needlessly. + */ + while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ) + { + char *old_cur; + char *old_break; + char *old_blkend; + PageHeaderData pageheader; + bool old_lastblk = ((BLCKSZ * (new_blkno + 1)) == src_filesize); + + /* Save the page header data */ + memcpy(&pageheader, buffer, SizeOfPageHeaderData); + + /* + * These old_* variables point to old visibility map page. old_cur + * points to current position on old page. old_blkend points to end of + * old block. old_break points to old page break position for + * rewriting a new page. After wrote a new page, old_break proceeds + * rewriteVmBytesPerPage bytes. + */ + old_cur = buffer + SizeOfPageHeaderData; + old_blkend = buffer + bytesRead; + old_break = old_cur + rewriteVmBytesPerPage; + + while (old_blkend >= old_break) + { + char new_vmbuf[BLCKSZ]; + char *new_cur = new_vmbuf; + bool empty = true; + bool old_lastpart; + + /* Copy page header in advance */ + memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData); + + /* Rewrite the last part of the old page? */ + old_lastpart = old_lastblk && (old_blkend == old_break); + + new_cur += SizeOfPageHeaderData; + + /* Process old page bytes one by one, and turn it into new page. */ + while (old_break > old_cur) + { + uint16 new_vmbits = 0; + int i; + + /* Generate new format bits while keeping old information */ + for (i = 0; i < BITS_PER_BYTE; i++) + { + uint8 byte = *(uint8 *) old_cur; + + if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i))) + { + empty = false; + new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i); + } + } + + /* Copy new visibility map bit to new format page */ + memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK); + + old_cur += BITS_PER_HEAPBLOCK_OLD; + new_cur += BITS_PER_HEAPBLOCK; + } + + /* If the last part of the old page is empty, skip to write it */ + if (old_lastpart && empty) + break; + + /* Set new checksum for a visibility map page (if enabled) */ + if (old_cluster.controldata.data_checksum_version != 0 && + new_cluster.controldata.data_checksum_version != 0) + ((PageHeader) new_vmbuf)->pd_checksum = + pg_checksum_page(new_vmbuf, new_blkno); + + if (write(dst_fd, new_vmbuf, BLCKSZ) != BLCKSZ) + { + close(dst_fd); + close(src_fd); + return getErrorText(); + } + + old_break += rewriteVmBytesPerPage; + new_blkno++; + } + } + + /* Close files */ + close(dst_fd); + close(src_fd); + + return NULL; + +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 61228780d4..89beb73955 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -109,6 +109,10 @@ extern char *output_files[]; */ #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 +/* + * The format of visibility map is changed with this 9.6 commit, + */ +#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011 /* * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85, * ("Improve concurrency of foreign key locking") which also updated catalog @@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir); const char *copyFile(const char *src, const char *dst, bool force); const char *linkFile(const char *src, const char *dst); +const char *rewriteVisibilityMap(const char *fromfile, const char *tofile, + bool force); void check_hard_link(void); FILE *fopen_priv(const char *path, const char *mode); diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index b20f073ef7..0c1a8220bb 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -11,12 +11,13 @@ #include "pg_upgrade.h" +#include #include "catalog/pg_class.h" #include "access/transam.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *suffix); +static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit); /* @@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) { int mapnum; bool vm_crashsafe_match = true; + bool vm_must_add_frozenbit = false; /* * Do the old and new cluster disagree on the crash-safetiness of the vm @@ -141,13 +143,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) vm_crashsafe_match = false; + /* + * Do we need to rewrite visibilitymap? + */ + if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER && + new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) + vm_must_add_frozenbit = true; + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { /* transfer primary file */ - transfer_relfile(&maps[mapnum], ""); + transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) @@ -155,9 +164,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) /* * Copy/link any fsm and vm files, if they exist */ - transfer_relfile(&maps[mapnum], "_fsm"); + transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); if (vm_crashsafe_match) - transfer_relfile(&maps[mapnum], "_vm"); + transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); } } } @@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) /* * transfer_relfile() * - * Copy or link file from old cluster to new one. + * Copy or link file from old cluster to new one. If vm_must_add_frozenbit + * is true, visibility map forks are converted and rewritten, even in link + * mode. */ static void -transfer_relfile(FileNameMap *map, const char *type_suffix) +transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) { const char *msg; char old_file[MAXPGPATH]; char new_file[MAXPGPATH]; - int fd; int segno; char extent_suffix[65]; + struct stat statbuf; /* * Now copy/link any related segments as well. Remember, PG breaks large @@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) if (type_suffix[0] != '\0' || segno != 0) { /* Did file open fail? */ - if ((fd = open(old_file, O_RDONLY, 0)) == -1) + if (stat(old_file, &statbuf) != 0) { /* File does not exist? That's OK, just return */ if (errno == ENOENT) @@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) map->nspname, map->relname, old_file, new_file, getErrorText()); } - close(fd); + + /* If file is empty, just return */ + if (statbuf.st_size == 0) + return; } unlink(new_file); @@ -232,7 +246,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = copyFile(old_file, new_file, true)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = copyFile(old_file, new_file, true); + + if (msg) pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); } @@ -240,7 +260,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = linkFile(old_file, new_file)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = linkFile(old_file, new_file); + + if (msg) pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); }