From 0ee9d685dd80910a269eb44036dc59df511c6d88 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 13 Mar 2023 15:19:00 -0400 Subject: [PATCH] Fix JSON error reporting for many cases of erroneous string values. The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and misbehave in unpredictable ways. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because the context would stop at the end of the prior token instead of continuing to where the actually-bad input is. To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context about token_terminator being sufficiently far advanced. Per report from Nikolay Shaplov. Back-patch to all supported versions. 
Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro --- src/backend/utils/adt/jsonfuncs.c | 1 + src/common/jsonapi.c | 77 +++++++++++-------- src/test/regress/expected/json_encoding.out | 24 +++--- src/test/regress/expected/json_encoding_1.out | 24 +++--- 4 files changed, 72 insertions(+), 54 deletions(-) diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index b342c81f27..f6a074aa7d 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -656,6 +656,7 @@ report_json_context(JsonLexContext *lex) line_start = lex->line_start; context_start = line_start; context_end = lex->token_terminator; + Assert(context_end >= context_start); /* Advance until we are close enough to context_end */ while (context_end - context_start >= 50) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index ade13aed3a..3d0fbfa7be 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -675,6 +675,14 @@ json_lex(JsonLexContext *lex) /* * The next token in the input stream is known to be a string; lex it. + * + * If lex->strval isn't NULL, fill it with the decoded string. + * Set lex->token_terminator to the end of the decoded input, and in + * success cases, transfer its previous value to lex->prev_token_terminator. + * Return JSON_SUCCESS or an error code. + * + * Note: be careful that all error exits advance lex->token_terminator + * to the point after the character we detected the error on. 
*/ static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) @@ -683,6 +691,19 @@ json_lex_string(JsonLexContext *lex) int len; int hi_surrogate = -1; + /* Convenience macros for error exits */ +#define FAIL_AT_CHAR_START(code) \ + do { \ + lex->token_terminator = s; \ + return code; \ + } while (0) +#define FAIL_AT_CHAR_END(code) \ + do { \ + lex->token_terminator = \ + s + pg_encoding_mblen_bounded(lex->input_encoding, s); \ + return code; \ + } while (0) + if (lex->strval != NULL) resetStringInfo(lex->strval); @@ -695,18 +716,14 @@ json_lex_string(JsonLexContext *lex) len++; /* Premature end of the string. */ if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == '"') break; else if ((unsigned char) *s < 32) { /* Per RFC4627, these characters MUST be escaped. */ /* Since *s isn't printable, exclude it from the context string */ - lex->token_terminator = s; - return JSON_ESCAPING_REQUIRED; + FAIL_AT_CHAR_START(JSON_ESCAPING_REQUIRED); } else if (*s == '\\') { @@ -714,10 +731,7 @@ json_lex_string(JsonLexContext *lex) s++; len++; if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == 'u') { int i; @@ -728,10 +742,7 @@ json_lex_string(JsonLexContext *lex) s++; len++; if (len >= lex->input_length) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s >= '0' && *s <= '9') ch = (ch * 16) + (*s - '0'); else if (*s >= 'a' && *s <= 'f') @@ -739,10 +750,7 @@ json_lex_string(JsonLexContext *lex) else if (*s >= 'A' && *s <= 'F') ch = (ch * 16) + (*s - 'A') + 10; else - { - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_UNICODE_ESCAPE_FORMAT; - } + FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT); } if (lex->strval != NULL) { @@ -752,20 +760,20 @@ 
json_lex_string(JsonLexContext *lex) if (is_utf16_surrogate_first(ch)) { if (hi_surrogate != -1) - return JSON_UNICODE_HIGH_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE); hi_surrogate = ch; continue; } else if (is_utf16_surrogate_second(ch)) { if (hi_surrogate == -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); ch = surrogate_pair_to_codepoint(hi_surrogate, ch); hi_surrogate = -1; } if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); /* * Reject invalid cases. We can't have a value above @@ -775,7 +783,7 @@ json_lex_string(JsonLexContext *lex) if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ - return JSON_UNICODE_CODE_POINT_ZERO; + FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO); } /* @@ -812,14 +820,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, (char) ch); } else - return JSON_UNICODE_HIGH_ESCAPE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE); #endif /* FRONTEND */ } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); switch (*s) { @@ -844,10 +852,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, '\t'); break; default: - /* Not a valid string escape, so signal error. */ + + /* + * Not a valid string escape, so signal error. We + * adjust token_start so that just the escape sequence + * is reported, not the whole string. + */ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else if (strchr("\"\\/bfnrt", *s) == NULL) @@ -860,15 +872,14 @@ json_lex_string(JsonLexContext *lex) * shown it's not a performance win. 
*/ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); appendStringInfoChar(lex->strval, *s); } @@ -876,12 +887,18 @@ json_lex_string(JsonLexContext *lex) } if (hi_surrogate != -1) + { + lex->token_terminator = s + 1; return JSON_UNICODE_LOW_SURROGATE; + } /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; return JSON_SUCCESS; + +#undef FAIL_AT_CHAR_START +#undef FAIL_AT_CHAR_END } /* diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out index f343f74fe1..fa41b40103 100644 --- a/src/test/regress/expected/json_encoding.out +++ b/src/test/regress/expected/json_encoding.out @@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... 
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... 
SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... 
SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out index e2fc131b0f..938f8e24aa 100644 --- a/src/test/regress/expected/json_encoding_1.out +++ b/src/test/regress/expected/json_encoding_1.out @@ -52,19 +52,19 @@ ERROR: conversion between UTF8 and SQL_ASCII is not supported select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -113,7 +113,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... 
select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -151,7 +151,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -168,25 +168,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... 
-- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; ERROR: conversion between UTF8 and SQL_ASCII is not supported @@ -209,7 +209,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -237,7 +237,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape --------------------