diff --git a/src/bin/pg_rewind/copy_fetch.c b/src/bin/pg_rewind/copy_fetch.c index e4b8ce6aaf..1cd4449314 100644 --- a/src/bin/pg_rewind/copy_fetch.c +++ b/src/bin/pg_rewind/copy_fetch.c @@ -207,9 +207,9 @@ copy_executeFileMap(filemap_t *map) file_entry_t *entry; int i; - for (i = 0; i < map->narray; i++) + for (i = 0; i < map->nentries; i++) { - entry = map->array[i]; + entry = map->entries[i]; execute_pagemap(&entry->target_pages_to_overwrite, entry->path); switch (entry->action) diff --git a/src/bin/pg_rewind/fetch.c b/src/bin/pg_rewind/fetch.c index f18fe5386e..f41d0f295e 100644 --- a/src/bin/pg_rewind/fetch.c +++ b/src/bin/pg_rewind/fetch.c @@ -37,7 +37,7 @@ fetchSourceFileList(void) * Fetch all relation data files that are marked in the given data page map. */ void -executeFileMap(void) +execute_file_actions(filemap_t *filemap) { if (datadir_source) copy_executeFileMap(filemap); diff --git a/src/bin/pg_rewind/fetch.h b/src/bin/pg_rewind/fetch.h index 7cf8b6ea09..b20df8b153 100644 --- a/src/bin/pg_rewind/fetch.h +++ b/src/bin/pg_rewind/fetch.h @@ -25,7 +25,7 @@ */ extern void fetchSourceFileList(void); extern char *fetchFile(const char *filename, size_t *filesize); -extern void executeFileMap(void); +extern void execute_file_actions(filemap_t *filemap); /* in libpq_fetch.c */ extern void libpqProcessFileList(void); diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index d756c28ca8..314b064b22 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -3,6 +3,19 @@ * filemap.c * A data structure for keeping track of files that have changed. * + * This source file contains the logic to decide what to do with different + * kinds of files, and the data structure to support it. Before modifying + * anything, pg_rewind collects information about all the files and their + * attributes in the target and source data directories. It also scans the + * WAL log in the target, and collects information about data blocks that + * were changed. All this information is stored in a hash table, using the + * file path relative to the root of the data directory as the key. + * + * After collecting all the information required, the decide_file_actions() + * function scans the hash table and decides what action needs to be taken + * for each file. Finally, it sorts the array to the final order that the + * actions should be executed in. + * * Copyright (c) 2013-2020, PostgreSQL Global Development Group * *------------------------------------------------------------------------- @@ -14,22 +27,41 @@ #include #include "catalog/pg_tablespace_d.h" +#include "common/hashfn.h" #include "common/string.h" #include "datapagemap.h" #include "filemap.h" #include "pg_rewind.h" #include "storage/fd.h" -filemap_t *filemap = NULL; +/* + * Define a hash table which we can use to store information about the files + * appearing in source and target systems. + */ +static uint32 hash_string_pointer(const char *s); +#define SH_PREFIX filehash +#define SH_ELEMENT_TYPE file_entry_t +#define SH_KEY_TYPE const char * +#define SH_KEY path +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +#define FILEHASH_INITIAL_SIZE 1000 + +static filehash_hash *filehash; static bool isRelDataFile(const char *path); static char *datasegpath(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); -static int path_cmp(const void *a, const void *b); -static file_entry_t *get_filemap_entry(const char *path, bool create); +static file_entry_t *insert_filehash_entry(const char *path); +static file_entry_t *lookup_filehash_entry(const char *path); static int final_filemap_cmp(const void *a, const void *b); -static void filemap_list_to_array(filemap_t *map); static bool check_file_excluded(const char *path, bool is_source); /* @@ -131,54 +163,26 @@ static const struct exclude_list_item excludeFiles[] = }; /* - * Create a new file map (stored in the global pointer "filemap"). + * Initialize the hash table for the file map. */ void -filemap_create(void) +filehash_init(void) { - filemap_t *map; - - map = pg_malloc(sizeof(filemap_t)); - map->first = map->last = NULL; - map->nlist = 0; - map->array = NULL; - map->narray = 0; - - Assert(filemap == NULL); - filemap = map; + filehash = filehash_create(FILEHASH_INITIAL_SIZE, NULL); } -/* Look up or create entry for 'path' */ +/* Look up entry for 'path', creating a new one if it doesn't exist */ static file_entry_t * -get_filemap_entry(const char *path, bool create) +insert_filehash_entry(const char *path) { - filemap_t *map = filemap; file_entry_t *entry; - file_entry_t **e; - file_entry_t key; - file_entry_t *key_ptr; + bool found; - if (map->array) + entry = filehash_insert(filehash, path, &found); + if (!found) { - key.path = (char *) path; - key_ptr = &key; - e = bsearch(&key_ptr, map->array, map->narray, sizeof(file_entry_t *), - path_cmp); - } - else - e = NULL; - - if (e) - entry = *e; - else if (!create) - entry = NULL; - else - { - /* Create a new entry for this file */ - entry = pg_malloc(sizeof(file_entry_t)); entry->path = pg_strdup(path); entry->isrelfile = isRelDataFile(path); - entry->action = FILE_ACTION_UNDECIDED; entry->target_exists = false; entry->target_type = FILE_TYPE_UNDEFINED; @@ -192,21 +196,18 @@ get_filemap_entry(const char *path, bool create) entry->source_size = 0; entry->source_link_target = NULL; - entry->next = NULL; - - if (map->last) - { - map->last->next = entry; - map->last = entry; - } - else - map->first = map->last = entry; - map->nlist++; + entry->action = FILE_ACTION_UNDECIDED; } return entry; } +static file_entry_t * +lookup_filehash_entry(const char *path) +{ + return filehash_lookup(filehash, path); +} + /* * Callback for processing source file list. * @@ -220,8 +221,6 @@ process_source_file(const char *path, file_type_t type, size_t size, { file_entry_t *entry; - Assert(filemap->array == NULL); - /* * Pretend that pg_wal is a directory, even if it's really a symlink. We * don't want to mess with the symlink itself, nor complain if it's a @@ -238,7 +237,9 @@ process_source_file(const char *path, file_type_t type, size_t size, pg_fatal("data file \"%s\" in source is not a regular file", path); /* Remember this source file */ - entry = get_filemap_entry(path, true); + entry = insert_filehash_entry(path); + if (entry->source_exists) + pg_fatal("duplicate source file \"%s\"", path); entry->source_exists = true; entry->source_type = type; entry->source_size = size; @@ -248,15 +249,12 @@ process_source_file(const char *path, file_type_t type, size_t size, /* * Callback for processing target file list. * - * All source files must be already processed before calling this. We record - * the type and size of file, so that decide_file_action() can later decide - * what to do with it. + * Record the type and size of the file, like process_source_file() does. */ void process_target_file(const char *path, file_type_t type, size_t size, const char *link_target) { - filemap_t *map = filemap; file_entry_t *entry; /* @@ -264,21 +262,6 @@ process_target_file(const char *path, file_type_t type, size_t size, * from the target data folder all paths which have been filtered out from * the source data folder when processing the source files. */ - if (map->array == NULL) - { - /* on first call, initialize lookup array */ - if (map->nlist == 0) - { - /* should not happen */ - pg_fatal("source file list is empty"); - } - - filemap_list_to_array(map); - - Assert(map->array != NULL); - - qsort(map->array, map->narray, sizeof(file_entry_t *), path_cmp); - } /* * Like in process_source_file, pretend that pg_wal is always a directory. @@ -287,7 +270,9 @@ process_target_file(const char *path, file_type_t type, size_t size, type = FILE_TYPE_DIRECTORY; /* Remember this target file */ - entry = get_filemap_entry(path, true); + entry = insert_filehash_entry(path); + if (entry->target_exists) + pg_fatal("duplicate source file \"%s\"", path); entry->target_exists = true; entry->target_type = type; entry->target_size = size; @@ -301,7 +286,7 @@ process_target_file(const char *path, file_type_t type, size_t size, * if so, records it in 'target_pages_to_overwrite' bitmap. * * NOTE: All the files on both systems must have already been added to the - * file map! + * hash table! */ void process_target_wal_block_change(ForkNumber forknum, RelFileNode rnode, @@ -312,47 +297,45 @@ process_target_wal_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno_inseg; int segno; - Assert(filemap->array); - segno = blkno / RELSEG_SIZE; blkno_inseg = blkno % RELSEG_SIZE; path = datasegpath(rnode, forknum, segno); - entry = get_filemap_entry(path, false); + entry = lookup_filehash_entry(path); pfree(path); + /* + * If the block still exists in both systems, remember it. Otherwise we + * can safely ignore it. + * + * If the block is beyond the EOF in the source system, or the file + * doesn't exist in the source at all, we're going to truncate/remove it + * away from the target anyway. Likewise, if it doesn't exist in the + * target anymore, we will copy it over with the "tail" from the source + * system, anyway. + * + * It is possible to find WAL for a file that doesn't exist on either + * system anymore. It means that the relation was dropped later in the + * target system, and independently on the source system too, or that it + * was created and dropped in the target system and it never existed in + * the source. Either way, we can safely ignore it. + */ if (entry) { - int64 end_offset; - Assert(entry->isrelfile); if (entry->target_type != FILE_TYPE_REGULAR) pg_fatal("unexpected page modification for non-regular file \"%s\"", entry->path); - /* - * If the block beyond the EOF in the source system, no need to - * remember it now, because we're going to truncate it away from the - * target anyway. Also no need to remember the block if it's beyond - * the current EOF in the target system; we will copy it over with the - * "tail" from the source system, anyway. - */ - end_offset = (blkno_inseg + 1) * BLCKSZ; - if (end_offset <= entry->source_size && - end_offset <= entry->target_size) - datapagemap_add(&entry->target_pages_to_overwrite, blkno_inseg); - } - else - { - /* - * If we don't have any record of this file in the file map, it means - * that it's a relation that doesn't exist in the source system. It - * could exist in the target system; we haven't moved the target-only - * entries from the linked list to the array yet! But in any case, if - * it doesn't exist in the source it will be removed from the target - * too, and we can safely ignore it. - */ + if (entry->target_exists && entry->source_exists) + { + off_t end_offset; + + end_offset = (blkno_inseg + 1) * BLCKSZ; + if (end_offset <= entry->source_size && end_offset <= entry->target_size) + datapagemap_add(&entry->target_pages_to_overwrite, blkno_inseg); + } } } @@ -423,34 +406,6 @@ check_file_excluded(const char *path, bool is_source) return false; } -/* - * Convert the linked list of entries in map->first/last to the array, - * map->array. - */ -static void -filemap_list_to_array(filemap_t *map) -{ - int narray; - file_entry_t *entry, - *next; - - map->array = (file_entry_t **) - pg_realloc(map->array, - (map->nlist + map->narray) * sizeof(file_entry_t *)); - - narray = map->narray; - for (entry = map->first; entry != NULL; entry = next) - { - map->array[narray++] = entry; - next = entry->next; - entry->next = NULL; - } - Assert(narray == map->nlist + map->narray); - map->narray = narray; - map->nlist = 0; - map->first = map->last = NULL; -} - static const char * action_to_str(file_action_t action) { @@ -478,32 +433,31 @@ action_to_str(file_action_t action) * Calculate the totals needed for progress reports. */ void -calculate_totals(void) +calculate_totals(filemap_t *filemap) { file_entry_t *entry; int i; - filemap_t *map = filemap; - map->total_size = 0; - map->fetch_size = 0; + filemap->total_size = 0; + filemap->fetch_size = 0; - for (i = 0; i < map->narray; i++) + for (i = 0; i < filemap->nentries; i++) { - entry = map->array[i]; + entry = filemap->entries[i]; if (entry->source_type != FILE_TYPE_REGULAR) continue; - map->total_size += entry->source_size; + filemap->total_size += entry->source_size; if (entry->action == FILE_ACTION_COPY) { - map->fetch_size += entry->source_size; + filemap->fetch_size += entry->source_size; continue; } if (entry->action == FILE_ACTION_COPY_TAIL) - map->fetch_size += (entry->source_size - entry->target_size); + filemap->fetch_size += (entry->source_size - entry->target_size); if (entry->target_pages_to_overwrite.bitmapsize > 0) { @@ -512,7 +466,7 @@ calculate_totals(void) iter = datapagemap_iterate(&entry->target_pages_to_overwrite); while (datapagemap_next(iter, &blk)) - map->fetch_size += BLCKSZ; + filemap->fetch_size += BLCKSZ; pg_free(iter); } @@ -520,15 +474,14 @@ calculate_totals(void) } void -print_filemap(void) +print_filemap(filemap_t *filemap) { - filemap_t *map = filemap; file_entry_t *entry; int i; - for (i = 0; i < map->narray; i++) + for (i = 0; i < filemap->nentries; i++) { - entry = map->array[i]; + entry = filemap->entries[i]; if (entry->action != FILE_ACTION_NONE || entry->target_pages_to_overwrite.bitmapsize > 0) { @@ -650,15 +603,6 @@ datasegpath(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) return path; } -static int -path_cmp(const void *a, const void *b) -{ - file_entry_t *fa = *((file_entry_t **) a); - file_entry_t *fb = *((file_entry_t **) b); - - return strcmp(fa->path, fb->path); -} - /* * In the final stage, the filemap is sorted so that removals come last. * From disk space usage point of view, it would be better to do removals @@ -834,22 +778,52 @@ decide_file_action(file_entry_t *entry) /* * Decide what to do with each file. + * + * Returns a 'filemap' with the entries in the order that their actions + * should be executed. */ -void +filemap_t * decide_file_actions(void) { int i; + filehash_iterator it; + file_entry_t *entry; + filemap_t *filemap; - filemap_list_to_array(filemap); - - for (i = 0; i < filemap->narray; i++) + filehash_start_iterate(filehash, &it); + while ((entry = filehash_iterate(filehash, &it)) != NULL) { - file_entry_t *entry = filemap->array[i]; - entry->action = decide_file_action(entry); } - /* Sort the actions to the order that they should be performed */ - qsort(filemap->array, filemap->narray, sizeof(file_entry_t *), + /* + * Turn the hash table into an array, and sort in the order that the + * actions should be performed. + */ + filemap = pg_malloc(offsetof(filemap_t, entries) + + filehash->members * sizeof(file_entry_t *)); + filemap->nentries = filehash->members; + filehash_start_iterate(filehash, &it); + i = 0; + while ((entry = filehash_iterate(filehash, &it)) != NULL) + { + filemap->entries[i++] = entry; + } + + qsort(&filemap->entries, filemap->nentries, sizeof(file_entry_t *), final_filemap_cmp); + + return filemap; +} + + +/* + * Helper function for filemap hash table. + */ +static uint32 +hash_string_pointer(const char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); } diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h index 3d42355873..6f03447d7e 100644 --- a/src/bin/pg_rewind/filemap.h +++ b/src/bin/pg_rewind/filemap.h @@ -12,15 +12,6 @@ #include "storage/block.h" #include "storage/relfilenode.h" -/* - * For every file found in the local or remote system, we have a file entry - * that contains information about the file on both systems. For relation - * files, there is also a page map that marks pages in the file that were - * changed in the target after the last common checkpoint. Each entry also - * contains an 'action' field, which says what we are going to do with the - * file. - */ - /* these enum values are sorted in the order we want actions to be processed */ typedef enum { @@ -45,9 +36,21 @@ typedef enum FILE_TYPE_SYMLINK } file_type_t; +/* + * For every file found in the local or remote system, we have a file entry + * that contains information about the file on both systems. For relation + * files, there is also a page map that marks pages in the file that were + * changed in the target after the last common checkpoint. + * + * When gathering information, these are kept in a hash table, private to + * filemap.c. decide_file_actions() fills in the 'action' field, sorts all + * the entries, and returns them in an array, ready for executing the actions. + */ typedef struct file_entry_t { - char *path; + uint32 status; /* hash status */ + + const char *path; bool isrelfile; /* is it a relation data file? */ /* @@ -76,44 +79,25 @@ typedef struct file_entry_t * What will we do to the file? */ file_action_t action; - - struct file_entry_t *next; } file_entry_t; +/* + * This contains the final decisions on what to do with each file. + * 'entries' array contains an entry for each file, sorted in the order + * that their actions should executed. + */ typedef struct filemap_t { - /* - * New entries are accumulated to a linked list, in process_source_file - * and process_target_file. - */ - file_entry_t *first; - file_entry_t *last; - int nlist; /* number of entries currently in list */ - - /* - * After processing all the remote files, the entries in the linked list - * are moved to this array. After processing local files, too, all the - * local entries are added to the array by decide_file_actions(), and - * sorted in the final order. After decide_file_actions(), all the entries - * are in the array, and the linked list is empty. - */ - file_entry_t **array; - int narray; /* current length of array */ - - /* - * Summary information. - */ + /* Summary information, filled by calculate_totals() */ uint64 total_size; /* total size of the source cluster */ uint64 fetch_size; /* number of bytes that needs to be copied */ + + int nentries; /* size of 'entries' array */ + file_entry_t *entries[FLEXIBLE_ARRAY_MEMBER]; } filemap_t; -extern filemap_t *filemap; - -extern void filemap_create(void); -extern void calculate_totals(void); -extern void print_filemap(void); - /* Functions for populating the filemap */ +extern void filehash_init(void); extern void process_source_file(const char *path, file_type_t type, size_t size, const char *link_target); extern void process_target_file(const char *path, file_type_t type, @@ -121,6 +105,9 @@ extern void process_target_file(const char *path, file_type_t type, extern void process_target_wal_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno); -extern void decide_file_actions(void); + +extern filemap_t *decide_file_actions(void); +extern void calculate_totals(filemap_t *filemap); +extern void print_filemap(filemap_t *filemap); #endif /* FILEMAP_H */ diff --git a/src/bin/pg_rewind/libpq_fetch.c b/src/bin/pg_rewind/libpq_fetch.c index 2fc4a784bd..16d451ae16 100644 --- a/src/bin/pg_rewind/libpq_fetch.c +++ b/src/bin/pg_rewind/libpq_fetch.c @@ -460,9 +460,9 @@ libpq_executeFileMap(filemap_t *map) PQresultErrorMessage(res)); PQclear(res); - for (i = 0; i < map->narray; i++) + for (i = 0; i < map->nentries; i++) { - entry = map->array[i]; + entry = map->entries[i]; /* If this is a relation file, copy the modified blocks */ execute_pagemap(&entry->target_pages_to_overwrite, entry->path); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 4760090d06..574d7f7163 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -129,6 +129,7 @@ main(int argc, char **argv) TimeLineID endtli; ControlFileData ControlFile_new; bool writerecoveryconf = false; + filemap_t *filemap; pg_logging_init(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind")); @@ -368,13 +369,16 @@ main(int argc, char **argv) (uint32) (chkptrec >> 32), (uint32) chkptrec, chkpttli); + /* Initialize the hash table to track the status of each file */ + filehash_init(); + /* * Collect information about all files in the target and source systems. */ - filemap_create(); if (showprogress) pg_log_info("reading source file list"); fetchSourceFileList(); + if (showprogress) pg_log_info("reading target file list"); traverse_datadir(datadir_target, &process_target_file); @@ -395,13 +399,13 @@ main(int argc, char **argv) * We have collected all information we need from both systems. Decide * what to do with each file. */ - decide_file_actions(); + filemap = decide_file_actions(); if (showprogress) - calculate_totals(); + calculate_totals(filemap); /* this is too verbose even for verbose mode */ if (debug) - print_filemap(); + print_filemap(filemap); /* * Ok, we're ready to start copying things over. @@ -421,7 +425,7 @@ main(int argc, char **argv) * modified the target directory and there is no turning back! */ - executeFileMap(); + execute_file_actions(filemap); progress_report(true);