diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 8e30b82273..4cc9e59270 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -1338,9 +1338,10 @@ SELECT b, char_length(b) FROM test2; There are two other fixed-length character types in PostgreSQL, shown in . The name - type exists only for the storage of identifiers - in the internal system catalogs and is not intended for use by the general user. Its + linkend="datatype-character-special-table"/>. + These are not intended for general-purpose use, only for use + in the internal system catalogs. + The name type is used to store identifiers. Its length is currently defined as 64 bytes (63 usable characters plus terminator) but should be referenced using the constant NAMEDATALEN in C source code. @@ -1348,7 +1349,8 @@ SELECT b, char_length(b) FROM test2; is therefore adjustable for special uses); the default maximum length might change in a future release. The type "char" (note the quotes) is different from char(1) in that it - only uses one byte of storage. It is internally used in the system + only uses one byte of storage, and therefore can store only a single + ASCII character. It is used in the system catalogs as a simplistic enumeration type. diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c index 0df41c2253..e50293bf14 100644 --- a/src/backend/utils/adt/char.c +++ b/src/backend/utils/adt/char.c @@ -20,6 +20,11 @@ #include "libpq/pqformat.h" #include "utils/builtins.h" +#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7')) +#define TOOCTAL(c) ((c) + '0') +#define FROMOCTAL(c) ((unsigned char) (c) - '0') + + /***************************************************************************** * USER I/O ROUTINES * *****************************************************************************/ @@ -27,31 +32,53 @@ /* * charin - converts "x" to 'x' * - * Note that an empty input string will implicitly be converted to \0. + * This accepts the formats charout produces. If we have multibyte input + * that is not in the form '\ooo', then we take its first byte as the value + * and silently discard the rest; this is a backwards-compatibility provision. */ Datum charin(PG_FUNCTION_ARGS) { char *ch = PG_GETARG_CSTRING(0); + if (strlen(ch) == 4 && ch[0] == '\\' && + ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3])) + PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) + + (FROMOCTAL(ch[2]) << 3) + + FROMOCTAL(ch[3])); + /* This will do the right thing for a zero-length input string */ PG_RETURN_CHAR(ch[0]); } /* * charout - converts 'x' to "x" * - * Note that if the char value is \0, the resulting string will appear - * to be empty (null-terminated after zero characters). So this is the - * inverse of the charin() function for such data. + * The possible output formats are: + * 1. 0x00 is represented as an empty string. + * 2. 0x01..0x7F are represented as a single ASCII byte. + * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits). + * Case 3 is meant to match the traditional "escape" format of bytea. */ Datum charout(PG_FUNCTION_ARGS) { char ch = PG_GETARG_CHAR(0); - char *result = (char *) palloc(2); + char *result = (char *) palloc(5); - result[0] = ch; - result[1] = '\0'; + if (IS_HIGHBIT_SET(ch)) + { + result[0] = '\\'; + result[1] = TOOCTAL(((unsigned char) ch) >> 6); + result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07); + result[3] = TOOCTAL(((unsigned char) ch) & 07); + result[4] = '\0'; + } + else + { + /* This produces acceptable results for 0x00 as well */ + result[0] = ch; + result[1] = '\0'; + } PG_RETURN_CSTRING(result); } @@ -176,15 +203,20 @@ Datum text_char(PG_FUNCTION_ARGS) { text *arg1 = PG_GETARG_TEXT_PP(0); + char *ch = VARDATA_ANY(arg1); char result; /* - * An empty input string is converted to \0 (for consistency with charin). - * If the input is longer than one character, the excess data is silently - * discarded. + * Conversion rules are the same as in charin(), but here we need to + * handle the empty-string case honestly. */ - if (VARSIZE_ANY_EXHDR(arg1) > 0) - result = *(VARDATA_ANY(arg1)); + if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' && + ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3])) + result = (FROMOCTAL(ch[1]) << 6) + + (FROMOCTAL(ch[2]) << 3) + + FROMOCTAL(ch[3]); + else if (VARSIZE_ANY_EXHDR(arg1) > 0) + result = ch[0]; else result = '\0'; @@ -195,13 +227,21 @@ Datum char_text(PG_FUNCTION_ARGS) { char arg1 = PG_GETARG_CHAR(0); - text *result = palloc(VARHDRSZ + 1); + text *result = palloc(VARHDRSZ + 4); /* - * Convert \0 to an empty string, for consistency with charout (and - * because the text stuff doesn't like embedded nulls all that well). + * Conversion rules are the same as in charout(), but here we need to be + * honest about converting 0x00 to an empty string. */ - if (arg1 != '\0') + if (IS_HIGHBIT_SET(arg1)) + { + SET_VARSIZE(result, VARHDRSZ + 4); + (VARDATA(result))[0] = '\\'; + (VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6); + (VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07); + (VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07); + } + else if (arg1 != '\0') { SET_VARSIZE(result, VARHDRSZ + 1); *(VARDATA(result)) = arg1; diff --git a/src/test/regress/expected/char.out b/src/test/regress/expected/char.out index 2d78f90f3b..ea9b0b8eeb 100644 --- a/src/test/regress/expected/char.out +++ b/src/test/regress/expected/char.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/expected/char_1.out b/src/test/regress/expected/char_1.out index fa6644d692..ffd31551de 100644 --- a/src/test/regress/expected/char_1.out +++ b/src/test/regress/expected/char_1.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/expected/char_2.out b/src/test/regress/expected/char_2.out index 09434a44cd..56818f824b 100644 --- a/src/test/regress/expected/char_2.out +++ b/src/test/regress/expected/char_2.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/sql/char.sql b/src/test/regress/sql/char.sql index 9c83c45e34..120fed53e5 100644 --- a/src/test/regress/sql/char.sql +++ b/src/test/regress/sql/char.sql @@ -2,8 +2,8 @@ -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; @@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL; INSERT INTO CHAR_TBL (f1) VALUES ('abcde'); SELECT * FROM CHAR_TBL; + +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- + +SELECT 'a'::"char"; +SELECT '\101'::"char"; +SELECT '\377'::"char"; +SELECT 'a'::"char"::text; +SELECT '\377'::"char"::text; +SELECT '\000'::"char"::text; +SELECT 'a'::text::"char"; +SELECT '\377'::text::"char"; +SELECT ''::text::"char";