diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index 6f0fe94d63..c020210e20 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -750,6 +750,13 @@ json_lex(JsonLexContext *lex) /* * The next token in the input stream is known to be a string; lex it. + * + * If lex->strval isn't NULL, fill it with the decoded string. + * Set lex->token_terminator to the end of the decoded input, and in + * success cases, transfer its previous value to lex->prev_token_terminator. + * + * Note: be careful that all error cases advance lex->token_terminator + * to the point after the character we detected the error on. */ static inline void json_lex_string(JsonLexContext *lex) @@ -837,33 +844,42 @@ json_lex_string(JsonLexContext *lex) if (ch >= 0xd800 && ch <= 0xdbff) { if (hi_surrogate != -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode high surrogate must not follow a high surrogate."), report_json_context(lex))); + } hi_surrogate = (ch & 0x3ff) << 10; continue; } else if (ch >= 0xdc00 && ch <= 0xdfff) { if (hi_surrogate == -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode low surrogate must follow a high surrogate."), report_json_context(lex))); + } ch = 0x10000 + hi_surrogate + (ch & 0x3ff); hi_surrogate = -1; } if (hi_surrogate != -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode low surrogate must follow a high surrogate."), report_json_context(lex))); + } /* * For UTF8, replace the escape sequence by the actual @@ -875,6 +891,7 @@ json_lex_string(JsonLexContext *lex) if (ch == 0) { /* We can't allow this, since our TEXT type doesn't */ + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), errmsg("unsupported Unicode escape sequence"), @@ -898,24 +915,27 @@ json_lex_string(JsonLexContext *lex) } else { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), errmsg("unsupported Unicode escape sequence"), errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."), report_json_context(lex))); } - } } else if (lex->strval != NULL) { if (hi_surrogate != -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode low surrogate must follow a high surrogate."), report_json_context(lex))); + } switch (*s) { @@ -968,16 +988,18 @@ json_lex_string(JsonLexContext *lex) extract_mb_char(s)), report_json_context(lex))); } - } else if (lex->strval != NULL) { if (hi_surrogate != -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode low surrogate must follow a high surrogate."), report_json_context(lex))); + } appendStringInfoChar(lex->strval, *s); } @@ -985,11 +1007,14 @@ json_lex_string(JsonLexContext *lex) } if (hi_surrogate != -1) + { + lex->token_terminator = s + pg_mblen(s); ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "json"), errdetail("Unicode low surrogate must follow a high surrogate."), report_json_context(lex))); + } /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out index d8d34f4ff6..3156c63c6f 100644 --- a/src/test/regress/expected/json_encoding.out +++ b/src/test/regress/expected/json_encoding.out @@ -41,19 +41,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -106,7 +106,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -144,7 +144,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -165,25 +165,25 @@ ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -208,7 +208,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -238,7 +238,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out index 79ed78e1c5..d320bfa811 100644 --- a/src/test/regress/expected/json_encoding_1.out +++ b/src/test/regress/expected/json_encoding_1.out @@ -35,23 +35,23 @@ SELECT '"\uaBcD"'::json; -- OK, uppercase and lower case both OK select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8; ERROR: unsupported Unicode escape sequence DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ude04... select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... --handling of simple unicode escapes select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; correct_in_utf8 @@ -86,7 +86,7 @@ select json '{ "a": "null \\u0000 escape" }' as not_an_escape; select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8; ERROR: unsupported Unicode escape sequence DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere; correct_everywhere -------------------- @@ -102,7 +102,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape; select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails; ERROR: unsupported Unicode escape sequence DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape -------------------- @@ -140,7 +140,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT '"\u0000"'::jsonb; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\u0000... -- use octet_length here so we don't get an odd unicode char in the -- output SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK @@ -148,45 +148,45 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text); ^ DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: ... +CONTEXT: JSON data, line 1: "\uaBcD... -- handling of unicode surrogate pairs SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8; ERROR: unsupported Unicode escape sequence LINE 1: SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc3... ^ DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ude04... SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; ^ DETAIL: Unicode high surrogate must not follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d... SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ud83dX... SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate ERROR: invalid input syntax for type json LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; ^ DETAIL: Unicode low surrogate must follow a high surrogate. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "\ude04... -- handling of simple unicode escapes SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8; ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as corr... ^ DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere; correct_everywhere ----------------------------- @@ -204,7 +204,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails; ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape; not_an_escape ------------------------------ @@ -216,7 +216,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a'... ^ DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "the Copyright \u00a9... SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere; correct_everywhere -------------------- @@ -234,7 +234,7 @@ ERROR: unsupported Unicode escape sequence LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai... ^ DETAIL: \u0000 cannot be converted to text. -CONTEXT: JSON data, line 1: { "a":... +CONTEXT: JSON data, line 1: { "a": "null \u0000... SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape; not_an_escape --------------------