diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index bdfc48cdf5..7a36f74dad 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -675,6 +675,7 @@ report_json_context(JsonLexContext *lex) line_start = lex->line_start; context_start = line_start; context_end = lex->token_terminator; + Assert(context_end >= context_start); /* Advance until we are close enough to context_end */ while (context_end - context_start >= 50) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index e4ff3f3602..2e86589cfd 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -697,6 +697,14 @@ json_lex(JsonLexContext *lex) /* * The next token in the input stream is known to be a string; lex it. + * + * If lex->strval isn't NULL, fill it with the decoded string. + * Set lex->token_terminator to the end of the decoded input, and in + * success cases, transfer its previous value to lex->prev_token_terminator. + * Return JSON_SUCCESS or an error code. + * + * Note: be careful that all error exits advance lex->token_terminator + * to the point after the character we detected the error on. */ static inline JsonParseErrorType json_lex_string(JsonLexContext *lex) @@ -705,6 +713,19 @@ json_lex_string(JsonLexContext *lex) char *const end = lex->input + lex->input_length; int hi_surrogate = -1; + /* Convenience macros for error exits */ +#define FAIL_AT_CHAR_START(code) \ + do { \ + lex->token_terminator = s; \ + return code; \ + } while (0) +#define FAIL_AT_CHAR_END(code) \ + do { \ + lex->token_terminator = \ + s + pg_encoding_mblen_bounded(lex->input_encoding, s); \ + return code; \ + } while (0) + if (lex->strval != NULL) resetStringInfo(lex->strval); @@ -715,10 +736,7 @@ json_lex_string(JsonLexContext *lex) s++; /* Premature end of the string. */ if (s >= end) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == '"') break; else if (*s == '\\') @@ -726,10 +744,7 @@ json_lex_string(JsonLexContext *lex) /* OK, we have an escape character. */ s++; if (s >= end) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s == 'u') { int i; @@ -739,10 +754,7 @@ json_lex_string(JsonLexContext *lex) { s++; if (s >= end) - { - lex->token_terminator = s; - return JSON_INVALID_TOKEN; - } + FAIL_AT_CHAR_START(JSON_INVALID_TOKEN); else if (*s >= '0' && *s <= '9') ch = (ch * 16) + (*s - '0'); else if (*s >= 'a' && *s <= 'f') @@ -750,10 +762,7 @@ json_lex_string(JsonLexContext *lex) else if (*s >= 'A' && *s <= 'F') ch = (ch * 16) + (*s - 'A') + 10; else - { - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_UNICODE_ESCAPE_FORMAT; - } + FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT); } if (lex->strval != NULL) { @@ -763,20 +772,20 @@ json_lex_string(JsonLexContext *lex) if (is_utf16_surrogate_first(ch)) { if (hi_surrogate != -1) - return JSON_UNICODE_HIGH_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE); hi_surrogate = ch; continue; } else if (is_utf16_surrogate_second(ch)) { if (hi_surrogate == -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); ch = surrogate_pair_to_codepoint(hi_surrogate, ch); hi_surrogate = -1; } if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); /* * Reject invalid cases. We can't have a value above @@ -786,7 +795,7 @@ json_lex_string(JsonLexContext *lex) if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ - return JSON_UNICODE_CODE_POINT_ZERO; + FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO); } /* @@ -800,7 +809,7 @@ json_lex_string(JsonLexContext *lex) char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf)) - return JSON_UNICODE_UNTRANSLATABLE; + FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE); appendStringInfoString(lex->strval, cbuf); } #else @@ -820,14 +829,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, (char) ch); } else - return JSON_UNICODE_HIGH_ESCAPE; + FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE); #endif /* FRONTEND */ } } else if (lex->strval != NULL) { if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); switch (*s) { @@ -852,10 +861,14 @@ json_lex_string(JsonLexContext *lex) appendStringInfoChar(lex->strval, '\t'); break; default: - /* Not a valid string escape, so signal error. */ + + /* + * Not a valid string escape, so signal error. We + * adjust token_start so that just the escape sequence + * is reported, not the whole string. + */ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else if (strchr("\"\\/bfnrt", *s) == NULL) @@ -868,8 +881,7 @@ json_lex_string(JsonLexContext *lex) * shown it's not a performance win. */ lex->token_start = s; - lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s); - return JSON_ESCAPING_INVALID; + FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID); } } else @@ -877,7 +889,7 @@ json_lex_string(JsonLexContext *lex) char *p = s; if (hi_surrogate != -1) - return JSON_UNICODE_LOW_SURROGATE; + FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE); /* * Skip to the first byte that requires special handling, so we @@ -917,12 +929,18 @@ json_lex_string(JsonLexContext *lex) } if (hi_surrogate != -1) + { + lex->token_terminator = s + 1; return JSON_UNICODE_LOW_SURROGATE; + } /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; return JSON_SUCCESS; + +#undef FAIL_AT_CHAR_START +#undef FAIL_AT_CHAR_END } /* diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out index f18ba9ebb2..fe729db8c9 100644 --- a/src/test/regress/expected/json_encoding.out +++ b/src/test/regress/expected/json_encoding.out @@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out index 77bdaf63a1..5c8d91ad0b 100644 --- a/src/test/regress/expected/json_encoding_1.out +++ b/src/test/regress/expected/json_encoding_1.out @@ -50,23 +50,23 @@ SELECT '"\uaBcD"'::json; -- OK, uppercase and lower case both OK select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; ERROR: unsupported Unicode escape sequence DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ude04... select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -101,7 +101,7 @@ select json '{ "a": "null \\u0000 escape" }' as not_an_escape; select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8; ERROR: unsupported Unicode escape sequence DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere; correct_everywhere -------------------- @@ -117,7 +117,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -155,7 +155,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -163,45 +163,45 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text); ^ DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\uaBcD... -- handling of unicode surrogate pairs SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8; ERROR: unsupported Unicode escape sequence LINE 1: SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc3... ^ DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ude04... SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as corr... ^ DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere; correct_everywhere ----------------------------- @@ -219,7 +219,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -231,7 +231,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a'... ^ DETAIL: Unicode escape value could not be translated to the server's encoding SQL_ASCII. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere; correct_everywhere -------------------- @@ -249,7 +249,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape --------------------