postgresql/src/bin/pg_verifybackup/parse_manifest.c

/*-------------------------------------------------------------------------
 *
 * parse_manifest.c
 *	  Parse a backup manifest in JSON format.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/bin/pg_verifybackup/parse_manifest.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres_fe.h"

#include "parse_manifest.h"
#include "common/jsonapi.h"

/*
 * Semantic states for JSON manifest parsing.
 */
typedef enum
{
	JM_EXPECT_TOPLEVEL_START,
	JM_EXPECT_TOPLEVEL_END,
	JM_EXPECT_TOPLEVEL_FIELD,
	JM_EXPECT_VERSION_VALUE,
	JM_EXPECT_FILES_START,
	JM_EXPECT_FILES_NEXT,
	JM_EXPECT_THIS_FILE_FIELD,
	JM_EXPECT_THIS_FILE_VALUE,
	JM_EXPECT_WAL_RANGES_START,
	JM_EXPECT_WAL_RANGES_NEXT,
	JM_EXPECT_THIS_WAL_RANGE_FIELD,
	JM_EXPECT_THIS_WAL_RANGE_VALUE,
	JM_EXPECT_MANIFEST_CHECKSUM_VALUE,
	JM_EXPECT_EOF
} JsonManifestSemanticState;

/*
 * Possible fields for one file as described by the manifest.
 */
typedef enum
{
	JMFF_PATH,
	JMFF_ENCODED_PATH,
	JMFF_SIZE,
	JMFF_LAST_MODIFIED,
	JMFF_CHECKSUM_ALGORITHM,
	JMFF_CHECKSUM
} JsonManifestFileField;

/*
 * Possible fields for one file as described by the manifest.
 */
typedef enum
{
	JMWRF_TIMELINE,
	JMWRF_START_LSN,
	JMWRF_END_LSN
} JsonManifestWALRangeField;

/*
 * Internal state used while decoding the JSON-format backup manifest.
 *
 * One instance lives on the stack of json_parse_manifest and is handed to
 * every parser callback as the opaque semantic state pointer.  The string
 * members hold scalar tokens exactly as delivered by the JSON parser; they
 * are interpreted only when the enclosing object (or the manifest) ends.
 */
typedef struct
{
	/* Caller-supplied context providing the per-item and error callbacks. */
	JsonManifestParseContext *context;
	/* Where we are in the document; see JsonManifestSemanticState. */
	JsonManifestSemanticState state;

	/* These fields are used for parsing objects in the list of files. */
	JsonManifestFileField file_field;	/* key whose value comes next */
	char	   *pathname;		/* "Path" value, or decoded "Encoded-Path" */
	char	   *encoded_pathname;	/* "Encoded-Path" value (hex string) */
	char	   *size;			/* "Size" value, still as text */
	char	   *algorithm;		/* "Checksum-Algorithm" value, as text */
	/* NOTE(review): not referenced by any code visible in this file. */
	pg_checksum_type checksum_algorithm;
	char	   *checksum;		/* "Checksum" value (hex string) */

	/* These fields are used for parsing objects in the list of WAL ranges. */
	JsonManifestWALRangeField wal_range_field;	/* key whose value comes next */
	char	   *timeline;		/* "Timeline" value, still as text */
	char	   *start_lsn;		/* "Start-LSN" value, still as text */
	char	   *end_lsn;		/* "End-LSN" value, still as text */

	/* Miscellaneous other stuff. */
	bool		saw_version_field;	/* true once the first top-level key was seen */
	char	   *manifest_checksum;	/* "Manifest-Checksum" value (hex string) */
} JsonManifestParseState;

/* Semantic-action callbacks registered with the JSON parser. */
static JsonParseErrorType json_manifest_object_start(void *state);
static JsonParseErrorType json_manifest_object_end(void *state);
static JsonParseErrorType json_manifest_array_start(void *state);
static JsonParseErrorType json_manifest_array_end(void *state);
static JsonParseErrorType json_manifest_object_field_start(void *state, char *fname,
														   bool isnull);
static JsonParseErrorType json_manifest_scalar(void *state, char *token,
											   JsonTokenType tokentype);

/* Post-processing of collected values, checksum verification, and errors. */
static void json_manifest_finalize_file(JsonManifestParseState *parse);
static void json_manifest_finalize_wal_range(JsonManifestParseState *parse);
static void verify_manifest_checksum(JsonManifestParseState *parse,
									 char *buffer, size_t size);
static void json_manifest_parse_failure(JsonManifestParseContext *context,
										char *msg);

/* Low-level decoding helpers for hex strings and LSNs. */
static int	hexdecode_char(char c);
static bool hexdecode_string(uint8 *result, char *input, int nbytes);
static bool parse_xlogrecptr(XLogRecPtr *result, char *input);

/*
 * Main entrypoint to parse a JSON-format backup manifest.
 *
 * Caller should set up the parsing context and then invoke this function.
 * 'buffer' holds the complete manifest text and 'size' is its length in
 * bytes.  For each file whose information is extracted from the manifest,
 * context->perfile_cb is invoked; likewise context->perwalrange_cb is
 * invoked for each WAL range.  In case of trouble, context->error_cb is
 * invoked and is expected not to return.
 *
 * After the document has been parsed, the checksum stored in the manifest is
 * verified against the buffer contents.
 */
void
json_parse_manifest(JsonManifestParseContext *context, char *buffer,
					size_t size)
{
	JsonLexContext *lex;
	JsonParseErrorType json_error;
	JsonSemAction sem;
	JsonManifestParseState parse;

	/*
	 * Set up our private parsing context.  Zero the whole struct first so
	 * that no member is ever examined while indeterminate; in particular,
	 * verify_manifest_checksum tests manifest_checksum against NULL.
	 */
	memset(&parse, 0, sizeof(parse));
	parse.context = context;
	parse.state = JM_EXPECT_TOPLEVEL_START;
	parse.saw_version_field = false;

	/* Create a JSON lexing context. */
	lex = makeJsonLexContextCstringLen(buffer, size, PG_UTF8, true);

	/* Set up semantic actions. */
	sem.semstate = &parse;
	sem.object_start = json_manifest_object_start;
	sem.object_end = json_manifest_object_end;
	sem.array_start = json_manifest_array_start;
	sem.array_end = json_manifest_array_end;
	sem.object_field_start = json_manifest_object_field_start;
	sem.object_field_end = NULL;
	sem.array_element_start = NULL;
	sem.array_element_end = NULL;
	sem.scalar = json_manifest_scalar;

	/* Run the actual JSON parser. */
	json_error = pg_parse_json(lex, &sem);
	if (json_error != JSON_SUCCESS)
		json_manifest_parse_failure(context, "parsing failed");

	/* The state machine must have seen the top-level object close. */
	if (parse.state != JM_EXPECT_EOF)
		json_manifest_parse_failure(context, "manifest ended unexpectedly");

	/* Verify the manifest checksum. */
	verify_manifest_checksum(&parse, buffer, size);
}

/*
 * Invoked at the start of each object in the JSON document.
 *
 * The document as a whole is expected to be an object; each file and each
 * WAL range is also expected to be an object. If we're anywhere else in the
 * document, it's an error.
 */
static JsonParseErrorType
json_manifest_object_start(void *state)
{
	JsonManifestParseState *parse = state;

	switch (parse->state)
	{
		case JM_EXPECT_TOPLEVEL_START:
			parse->state = JM_EXPECT_TOPLEVEL_FIELD;
			break;
		case JM_EXPECT_FILES_NEXT:
			parse->state = JM_EXPECT_THIS_FILE_FIELD;
			parse->pathname = NULL;
			parse->encoded_pathname = NULL;
			parse->size = NULL;
			parse->algorithm = NULL;
			parse->checksum = NULL;
			break;
		case JM_EXPECT_WAL_RANGES_NEXT:
			parse->state = JM_EXPECT_THIS_WAL_RANGE_FIELD;
			parse->timeline = NULL;
			parse->start_lsn = NULL;
			parse->end_lsn = NULL;
			break;
		default:
			json_manifest_parse_failure(parse->context,
										"unexpected object start");
			break;
	}

	return JSON_SUCCESS;
}

/*
 * Invoked at the end of each object in the JSON document.
 *
 * The possible cases here are the same as for json_manifest_object_start.
 * There's nothing special to do at the end of the document, but when we
 * reach the end of an object representing a particular file or WAL range,
 * we must call json_manifest_finalize_file() to save the associated details.
 */
static JsonParseErrorType
json_manifest_object_end(void *state)
{
	JsonManifestParseState *parse = state;

	switch (parse->state)
	{
		case JM_EXPECT_TOPLEVEL_END:
			parse->state = JM_EXPECT_EOF;
			break;
		case JM_EXPECT_THIS_FILE_FIELD:
			json_manifest_finalize_file(parse);
			parse->state = JM_EXPECT_FILES_NEXT;
			break;
		case JM_EXPECT_THIS_WAL_RANGE_FIELD:
			json_manifest_finalize_wal_range(parse);
			parse->state = JM_EXPECT_WAL_RANGES_NEXT;
			break;
		default:
			json_manifest_parse_failure(parse->context,
										"unexpected object end");
			break;
	}

	return JSON_SUCCESS;
}

/*
 * Invoked at the start of each array in the JSON document.
 *
 * Within the toplevel object, the value associated with the "Files" key
 * should be an array. Similarly for the "WAL-Ranges" key. No other arrays
 * are expected.
 */
static JsonParseErrorType
json_manifest_array_start(void *state)
{
	JsonManifestParseState *parse = state;

	switch (parse->state)
	{
		case JM_EXPECT_FILES_START:
			parse->state = JM_EXPECT_FILES_NEXT;
			break;
		case JM_EXPECT_WAL_RANGES_START:
			parse->state = JM_EXPECT_WAL_RANGES_NEXT;
			break;
		default:
			json_manifest_parse_failure(parse->context,
										"unexpected array start");
			break;
	}

	return JSON_SUCCESS;
}

/*
 * Invoked at the end of each array in the JSON document.
 *
 * The cases here are analogous to those in json_manifest_array_start.
 */
static JsonParseErrorType
json_manifest_array_end(void *state)
{
	JsonManifestParseState *parse = state;

	switch (parse->state)
	{
		case JM_EXPECT_FILES_NEXT:
		case JM_EXPECT_WAL_RANGES_NEXT:
			parse->state = JM_EXPECT_TOPLEVEL_FIELD;
			break;
		default:
			json_manifest_parse_failure(parse->context,
										"unexpected array end");
			break;
	}

	return JSON_SUCCESS;
}

/*
 * Invoked at the start of each object field in the JSON document.
 */
static JsonParseErrorType
json_manifest_object_field_start(void *state, char *fname, bool isnull)
{
	JsonManifestParseState *parse = state;

	switch (parse->state)
	{
		case JM_EXPECT_TOPLEVEL_FIELD:

			/*
			 * Inside toplevel object. The version indicator should always be
			 * the first field.
			 */
			if (!parse->saw_version_field)
			{
				if (strcmp(fname, "PostgreSQL-Backup-Manifest-Version") != 0)
					json_manifest_parse_failure(parse->context,
												"expected version indicator");
				parse->state = JM_EXPECT_VERSION_VALUE;
				parse->saw_version_field = true;
				break;
			}

			/* Is this the list of files? */
			if (strcmp(fname, "Files") == 0)
			{
				parse->state = JM_EXPECT_FILES_START;
				break;
			}

			/* Is this the list of WAL ranges? */
			if (strcmp(fname, "WAL-Ranges") == 0)
			{
				parse->state = JM_EXPECT_WAL_RANGES_START;
				break;
			}

			/* Is this the manifest checksum? */
			if (strcmp(fname, "Manifest-Checksum") == 0)
			{
				parse->state = JM_EXPECT_MANIFEST_CHECKSUM_VALUE;
				break;
			}

			/* It's not a field we recognize. */
			json_manifest_parse_failure(parse->context,
										"unrecognized top-level field");
			break;

		case JM_EXPECT_THIS_FILE_FIELD:
			/* Inside object for one file; which key have we got? */
			if (strcmp(fname, "Path") == 0)
				parse->file_field = JMFF_PATH;
			else if (strcmp(fname, "Encoded-Path") == 0)
				parse->file_field = JMFF_ENCODED_PATH;
			else if (strcmp(fname, "Size") == 0)
				parse->file_field = JMFF_SIZE;
			else if (strcmp(fname, "Last-Modified") == 0)
				parse->file_field = JMFF_LAST_MODIFIED;
			else if (strcmp(fname, "Checksum-Algorithm") == 0)
				parse->file_field = JMFF_CHECKSUM_ALGORITHM;
			else if (strcmp(fname, "Checksum") == 0)
				parse->file_field = JMFF_CHECKSUM;
			else
				json_manifest_parse_failure(parse->context,
											"unexpected file field");
			parse->state = JM_EXPECT_THIS_FILE_VALUE;
			break;

		case JM_EXPECT_THIS_WAL_RANGE_FIELD:
			/* Inside object for one file; which key have we got? */
			if (strcmp(fname, "Timeline") == 0)
				parse->wal_range_field = JMWRF_TIMELINE;
			else if (strcmp(fname, "Start-LSN") == 0)
				parse->wal_range_field = JMWRF_START_LSN;
			else if (strcmp(fname, "End-LSN") == 0)
				parse->wal_range_field = JMWRF_END_LSN;
			else
				json_manifest_parse_failure(parse->context,
											"unexpected WAL range field");
			parse->state = JM_EXPECT_THIS_WAL_RANGE_VALUE;
			break;

		default:
			json_manifest_parse_failure(parse->context,
										"unexpected object field");
			break;
	}

	return JSON_SUCCESS;
}

/*
 * Parser callback: a scalar value has been read.
 *
 * Field names never arrive here; json_manifest_object_field_start deals with
 * those.  For a value inside a file or WAL range object, that function has
 * already recorded which key the value belongs to; for top-level values the
 * parse state alone identifies the key.
 *
 * The manifest version is checked immediately.  Every other value is merely
 * stashed in the parse state (or, for "Last-Modified", discarded) and is
 * interpreted later, when the enclosing object or the manifest as a whole
 * has been read.  'tokentype' is not examined.
 */
static JsonParseErrorType
json_manifest_scalar(void *state, char *token, JsonTokenType tokentype)
{
	JsonManifestParseState *mstate = (JsonManifestParseState *) state;

	if (mstate->state == JM_EXPECT_VERSION_VALUE)
	{
		/* Version 1 is the only one we accept. */
		if (strcmp(token, "1") != 0)
			json_manifest_parse_failure(mstate->context,
										"unexpected manifest version");
		mstate->state = JM_EXPECT_TOPLEVEL_FIELD;
	}
	else if (mstate->state == JM_EXPECT_THIS_FILE_VALUE)
	{
		JsonManifestFileField which = mstate->file_field;

		if (which == JMFF_PATH)
			mstate->pathname = token;
		else if (which == JMFF_ENCODED_PATH)
			mstate->encoded_pathname = token;
		else if (which == JMFF_SIZE)
			mstate->size = token;
		else if (which == JMFF_LAST_MODIFIED)
			pfree(token);		/* unused */
		else if (which == JMFF_CHECKSUM_ALGORITHM)
			mstate->algorithm = token;
		else if (which == JMFF_CHECKSUM)
			mstate->checksum = token;

		/* Back to expecting the next key of this file, or its end. */
		mstate->state = JM_EXPECT_THIS_FILE_FIELD;
	}
	else if (mstate->state == JM_EXPECT_THIS_WAL_RANGE_VALUE)
	{
		JsonManifestWALRangeField which = mstate->wal_range_field;

		if (which == JMWRF_TIMELINE)
			mstate->timeline = token;
		else if (which == JMWRF_START_LSN)
			mstate->start_lsn = token;
		else if (which == JMWRF_END_LSN)
			mstate->end_lsn = token;

		/* Back to expecting the next key of this range, or its end. */
		mstate->state = JM_EXPECT_THIS_WAL_RANGE_FIELD;
	}
	else if (mstate->state == JM_EXPECT_MANIFEST_CHECKSUM_VALUE)
	{
		/* Nothing but the end of the manifest may follow the checksum. */
		mstate->manifest_checksum = token;
		mstate->state = JM_EXPECT_TOPLEVEL_END;
	}
	else
		json_manifest_parse_failure(mstate->context, "unexpected scalar");

	return JSON_SUCCESS;
}

/*
 * Do additional parsing and sanity-checking of the details gathered for one
 * file, and invoke the per-file callback so that the caller gets those
 * details. This happens for each file when the corresponding JSON object is
 * completely parsed.
 *
 * Exactly one of "Path" and "Encoded-Path" must have been given, "Size" is
 * mandatory, and a checksum may appear only together with an algorithm.
 * Any violation is reported through the error callback, which is expected
 * not to return.
 *
 * The path name and the decoded checksum are handed to the callback and are
 * not freed here (presumably the callback takes ownership -- confirm against
 * the callers).  The remaining strings collected for this file are released
 * before returning.
 */
static void
json_manifest_finalize_file(JsonManifestParseState *parse)
{
	JsonManifestParseContext *context = parse->context;
	size_t		size;
	char	   *ep;
	int			checksum_string_length;
	pg_checksum_type checksum_type;
	int			checksum_length;
	uint8	   *checksum_payload;

	/* Pathname and size are required. */
	if (parse->pathname == NULL && parse->encoded_pathname == NULL)
		json_manifest_parse_failure(parse->context, "missing path name");
	if (parse->pathname != NULL && parse->encoded_pathname != NULL)
		json_manifest_parse_failure(parse->context,
									"both path name and encoded path name");
	if (parse->size == NULL)
		json_manifest_parse_failure(parse->context, "missing size");
	if (parse->algorithm == NULL && parse->checksum != NULL)
		json_manifest_parse_failure(parse->context,
									"checksum without algorithm");

	/* Decode encoded pathname, if that's what we have. */
	if (parse->encoded_pathname != NULL)
	{
		int			encoded_length = strlen(parse->encoded_pathname);
		int			raw_length = encoded_length / 2;

		/* Two hex digits per byte, plus room for the terminator. */
		parse->pathname = palloc(raw_length + 1);
		if (encoded_length % 2 != 0 ||
			!hexdecode_string((uint8 *) parse->pathname,
							  parse->encoded_pathname,
							  raw_length))
			json_manifest_parse_failure(parse->context,
										"could not decode file name");
		parse->pathname[raw_length] = '\0';
		pfree(parse->encoded_pathname);
		parse->encoded_pathname = NULL;
	}

	/*
	 * Parse size.  Besides rejecting trailing garbage, insist that strtoul
	 * consumed at least one character: for an empty string it returns 0 with
	 * ep pointing at the terminator, which would otherwise pass as size 0.
	 */
	size = strtoul(parse->size, &ep, 10);
	if (ep == parse->size || *ep != '\0')
		json_manifest_parse_failure(parse->context,
									"file size is not an integer");

	/* Parse the checksum algorithm, if it's present. */
	if (parse->algorithm == NULL)
		checksum_type = CHECKSUM_TYPE_NONE;
	else if (!pg_checksum_parse_type(parse->algorithm, &checksum_type))
		context->error_cb(context, "unrecognized checksum algorithm: \"%s\"",
						  parse->algorithm);

	/* Parse the checksum payload, if it's present. */
	checksum_string_length = parse->checksum == NULL ? 0
		: strlen(parse->checksum);
	if (checksum_string_length == 0)
	{
		checksum_length = 0;
		checksum_payload = NULL;
	}
	else
	{
		checksum_length = checksum_string_length / 2;
		checksum_payload = palloc(checksum_length);
		if (checksum_string_length % 2 != 0 ||
			!hexdecode_string(checksum_payload, parse->checksum,
							  checksum_length))
			context->error_cb(context,
							  "invalid checksum for file \"%s\": \"%s\"",
							  parse->pathname, parse->checksum);
	}

	/* Invoke the callback with the details we've gathered. */
	context->perfile_cb(context, parse->pathname, size,
						checksum_type, checksum_length, checksum_payload);

	/* Free memory we no longer need. */
	if (parse->size != NULL)
	{
		pfree(parse->size);
		parse->size = NULL;
	}
	if (parse->algorithm != NULL)
	{
		pfree(parse->algorithm);
		parse->algorithm = NULL;
	}
	if (parse->checksum != NULL)
	{
		pfree(parse->checksum);
		parse->checksum = NULL;
	}
}

/*
 * Do additional parsing and sanity-checking of the details gathered for one
 * WAL range, and invoke the per-WAL-range callback so that the caller gets
 * those details. This happens for each WAL range when the corresponding JSON
 * object is completely parsed.
 *
 * All three of "Timeline", "Start-LSN" and "End-LSN" are mandatory.  Any
 * problem is reported through the error callback, which is expected not to
 * return.  The strings collected for this range are released before
 * returning.
 */
static void
json_manifest_finalize_wal_range(JsonManifestParseState *parse)
{
	JsonManifestParseContext *context = parse->context;
	TimeLineID	tli;
	XLogRecPtr	start_lsn,
				end_lsn;
	char	   *ep;

	/* Make sure all fields are present. */
	if (parse->timeline == NULL)
		json_manifest_parse_failure(parse->context, "missing timeline");
	if (parse->start_lsn == NULL)
		json_manifest_parse_failure(parse->context, "missing start LSN");
	if (parse->end_lsn == NULL)
		json_manifest_parse_failure(parse->context, "missing end LSN");

	/*
	 * Parse timeline.  Besides rejecting trailing garbage, insist that
	 * strtoul consumed at least one character: for an empty string it
	 * returns 0 with ep pointing at the terminator, which would otherwise
	 * pass as timeline 0.
	 */
	tli = strtoul(parse->timeline, &ep, 10);
	if (ep == parse->timeline || *ep != '\0')
		json_manifest_parse_failure(parse->context,
									"timeline is not an integer");

	/* Parse both LSNs. */
	if (!parse_xlogrecptr(&start_lsn, parse->start_lsn))
		json_manifest_parse_failure(parse->context,
									"could not parse start LSN");
	if (!parse_xlogrecptr(&end_lsn, parse->end_lsn))
		json_manifest_parse_failure(parse->context,
									"could not parse end LSN");

	/* Invoke the callback with the details we've gathered. */
	context->perwalrange_cb(context, tli, start_lsn, end_lsn);

	/* Free memory we no longer need. */
	if (parse->timeline != NULL)
	{
		pfree(parse->timeline);
		parse->timeline = NULL;
	}
	if (parse->start_lsn != NULL)
	{
		pfree(parse->start_lsn);
		parse->start_lsn = NULL;
	}
	if (parse->end_lsn != NULL)
	{
		pfree(parse->end_lsn);
		parse->end_lsn = NULL;
	}
}

/*
 * Check the manifest against the checksum it carries.
 *
 * The final line of the manifest holds the checksum itself and therefore
 * cannot be part of the checksummed data.  We compute SHA-256 over
 * everything up to and including the newline that ends the second-to-last
 * line, and compare the result with the hex-encoded value that was read from
 * the "Manifest-Checksum" field.  Any problem is reported through the error
 * callback, which is expected not to return.
 */
static void
verify_manifest_checksum(JsonManifestParseState *parse, char *buffer,
						 size_t size)
{
	JsonManifestParseContext *context = parse->context;
	size_t		pos;
	size_t		newline_count = 0;
	size_t		last_newline = 0;
	size_t		prev_newline = 0;
	size_t		covered_length;
	pg_cryptohash_ctx *hash_ctx;
	uint8		computed[PG_SHA256_DIGEST_LENGTH];
	uint8		claimed[PG_SHA256_DIGEST_LENGTH];

	/* Walk the buffer, remembering where the two most recent newlines are. */
	for (pos = 0; pos < size; pos++)
	{
		if (buffer[pos] != '\n')
			continue;

		newline_count++;
		prev_newline = last_newline;
		last_newline = pos;
	}

	/*
	 * The computation below only makes sense if there are at least two lines
	 * and the buffer ends exactly at the final newline.
	 */
	if (newline_count < 2)
		json_manifest_parse_failure(context,
									"expected at least 2 lines");
	if (last_newline != size - 1)
		json_manifest_parse_failure(context,
									"last line not newline-terminated");

	/* Hash everything through the end of the second-to-last line. */
	covered_length = prev_newline + 1;
	hash_ctx = pg_cryptohash_create(PG_SHA256);
	if (hash_ctx == NULL)
		context->error_cb(context, "out of memory");
	if (pg_cryptohash_init(hash_ctx) < 0)
		context->error_cb(context, "could not initialize checksum of manifest");
	if (pg_cryptohash_update(hash_ctx, (uint8 *) buffer, covered_length) < 0)
		context->error_cb(context, "could not update checksum of manifest");
	if (pg_cryptohash_final(hash_ctx, computed, sizeof(computed)) < 0)
		context->error_cb(context, "could not finalize checksum of manifest");

	/* Decode the checksum stored in the manifest and compare. */
	if (parse->manifest_checksum == NULL)
		context->error_cb(context, "manifest has no checksum");
	if (!(strlen(parse->manifest_checksum) == PG_SHA256_DIGEST_LENGTH * 2 &&
		  hexdecode_string(claimed, parse->manifest_checksum,
						   PG_SHA256_DIGEST_LENGTH)))
		context->error_cb(context, "invalid manifest checksum: \"%s\"",
						  parse->manifest_checksum);
	if (memcmp(computed, claimed, PG_SHA256_DIGEST_LENGTH) != 0)
		context->error_cb(context, "manifest checksum mismatch");

	pg_cryptohash_free(hash_ctx);
}

/*
 * Report a parse error.
 *
 * This is intended to be used for fairly low-level failures that probably
 * shouldn't occur unless somebody has deliberately constructed a bad manifest,
 * or unless the server is generating bad manifests due to some bug. msg should
 * be a short string giving some hint as to what the problem is.
 *
 * The message is passed to context->error_cb with the prefix "could not
 * parse backup manifest: ".  As noted at json_parse_manifest, error_cb is
 * expected not to return, and callers in this file are written on that
 * assumption.
 */
static void
json_manifest_parse_failure(JsonManifestParseContext *context, char *msg)
{
	context->error_cb(context, "could not parse backup manifest: %s", msg);
}

/*
 * Map one hexadecimal digit character to its numeric value, 0 through 15.
 *
 * The letters a-f are accepted in either case.  Any character that is not a
 * hexadecimal digit yields -1.
 */
static int
hexdecode_char(char c)
{
	int			value = -1;

	if ('0' <= c && c <= '9')
		value = c - '0';
	else if ('a' <= c && c <= 'f')
		value = 10 + (c - 'a');
	else if ('A' <= c && c <= 'F')
		value = 10 + (c - 'A');

	return value;
}

/*
 * Decode a hex string into a byte string, 2 hex chars per byte.
 *
 * 'input' supplies the hex digits, 'result' receives 'nbytes' decoded bytes;
 * no terminator is written.  Returns false if invalid characters are
 * encountered; otherwise true.  On failure, the bytes of 'result' decoded so
 * far have been written and the rest are untouched.
 *
 * Decoding stops at the first invalid character.  In particular, if the
 * input ends early, the terminating NUL is rejected as an invalid digit and
 * nothing beyond it is read.
 */
static bool
hexdecode_string(uint8 *result, char *input, int nbytes)
{
	int			i;

	for (i = 0; i < nbytes; ++i)
	{
		int			n1;
		int			n2;

		/*
		 * Check the high digit before even looking at the low one, so that
		 * a string terminator in the high position never causes a read of
		 * the byte that follows it.
		 */
		n1 = hexdecode_char(input[i * 2]);
		if (n1 < 0)
			return false;

		n2 = hexdecode_char(input[i * 2 + 1]);
		if (n2 < 0)
			return false;

		result[i] = n1 * 16 + n2;
	}

	return true;
}

/*
 * Parse an XLogRecPtr written as two hexadecimal numbers separated by a
 * slash, the first giving the upper 32 bits and the second the lower 32.
 *
 * On success, stores the combined value in *result and returns true.  If
 * sscanf cannot convert both numbers, returns false and leaves *result
 * untouched.
 */
static bool
parse_xlogrecptr(XLogRecPtr *result, char *input)
{
	uint32		upper_half;
	uint32		lower_half;
	int			nconverted;

	nconverted = sscanf(input, "%X/%X", &upper_half, &lower_half);
	if (nconverted == 2)
	{
		*result = (((uint64) upper_half) << 32) | lower_half;
		return true;
	}

	return false;
}