diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 7114eb7b52..55bbb20dac 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en The builtin provider uses built-in operations. Only - the C locale is supported for this provider. + the C and C.UTF-8 locales are + supported for this provider. The C locale behavior is identical to the C locale in the libc provider. When using this locale, the behavior may depend on the database encoding. + + The C.UTF-8 locale is available only when the + database encoding is UTF-8, and the behavior is + based on Unicode. The collation uses the code point values only. The + regular expression character classes are based on the "POSIX + Compatible" semantics, and the case mapping is the "simple" variant. + @@ -878,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; + + pg_c_utf8 + + + This collation sorts by Unicode code point values rather than natural + language order. For the functions lower, + initcap, and upper, it uses + Unicode simple case mapping. For pattern matching (including regular + expressions), it uses the POSIX Compatible variant of Unicode Compatibility + Properties. Behavior is efficient and stable within a + Postgres major version. This collation is + only available for encoding UTF8. + + + + C (equivalent to POSIX) diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 98cd7d56be..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM If provider is builtin, then locale must be specified and set to - C. + either C or C.UTF-8. 
diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 233ff1755d..7653cb902e 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -166,8 +166,9 @@ CREATE DATABASE name If is - builtin, then locale - must be specified and set to C. + builtin, then locale or + builtin_locale must be specified and set to + either C or C.UTF-8. @@ -228,9 +229,11 @@ CREATE DATABASE name linkend="create-database-locale-provider">locale provider must be builtin. The default is the setting of if specified; otherwise the same - setting as the template database. Currently, the only available - locale for the builtin provider is - C. + setting as the template database. + + + The locales available for the builtin provider are + C and C.UTF-8. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 4760570f6a..377c3cb20a 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -288,8 +288,9 @@ PostgreSQL documentation If is builtin, - must be specified and set to - C. + or must be + specified and set to C or + C.UTF-8. 
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa..85f3238eb0 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + pg_regex_strategy = PG_REGEX_BUILTIN; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: 
return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments 
above */ if (c <= (pg_wchar) 127) @@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5f483b8dbc..8160d78ec6 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1679,6 +1681,34 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strlower(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strlower(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize); + } + + 
Assert(dst[needed] == '\0'); + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1799,6 +1829,34 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strupper(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strupper(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1920,6 +1978,60 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + size_t srclen = nbytes; + unsigned char *dst; + size_t dstsize; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* overflow paranoia */ + if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* result is at most srclen codepoints plus terminating NUL */ + dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1; + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2; + int u1len = unicode_utf8len(u1); + int u2len; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_uppercase_simple(u1); + + u2len = unicode_utf8len(u2); + + Assert(dstoff + u2len + 1 <= dstsize); + + wasalnum = pg_u_isalnum(u2, true); + + 
unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + Assert(dstoff + 1 <= dstsize); + *(dst + dstoff) = '\0'; + dstoff++; + + /* allocate result buffer of the right size and free workspace */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 3f311e9907..e10d328fc3 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1270,8 +1270,14 @@ lookup_collation_cache(Oid collation, bool set_flags) if (collform->collprovider == COLLPROVIDER_BUILTIN) { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + cache_entry->collate_is_c = true; - cache_entry->ctype_is_c = true; + cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0); } else if (collform->collprovider == COLLPROVIDER_LIBC) { @@ -1670,7 +1676,6 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); - Assert(collform->collprovider != COLLPROVIDER_BUILTIN); if (collform->collprovider == COLLPROVIDER_LIBC) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); else @@ -1725,7 +1730,13 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; - /* the builtin collation provider is not versioned */ + /* + * The only two supported locales (C and C.UTF-8) are both based on memcmp + * and are not expected to change. + * + * Note that the character semantics may change for some locales, but the + * collation version only tracks changes to sort order. 
+ */ if (collprovider == COLLPROVIDER_BUILTIN) return NULL; @@ -2505,13 +2516,17 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, int builtin_locale_encoding(const char *locale) { - if (strcmp(locale, "C") != 0) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid locale name \"%s\" for builtin provider", - locale))); + if (strcmp(locale, "C") == 0) + return -1; + if (strcmp(locale, "C.UTF-8") == 0) + return PG_UTF8; - return -1; + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", + locale))); + + return 0; /* keep compiler quiet */ } @@ -2525,13 +2540,28 @@ builtin_locale_encoding(const char *locale) const char * builtin_validate_locale(int encoding, const char *locale) { - if (strcmp(locale, "C") != 0) + const char *canonical_name = NULL; + int required_encoding; + + if (strcmp(locale, "C") == 0) + canonical_name = "C"; + else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) + canonical_name = "C.UTF-8"; + + if (!canonical_name) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("invalid locale name \"%s\" for builtin provider", locale))); - return "C"; + required_encoding = builtin_locale_encoding(canonical_name); + if (required_encoding >= 0 && encoding != required_encoding) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("encoding \"%s\" does not match locale \"%s\"", + pg_encoding_to_char(encoding), locale))); + + return canonical_name; } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index c2daff1717..30e17bd1d1 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2403,9 +2403,16 @@ setlocales(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C") != 0) + if (strcmp(datlocale, "C") == 0) + canonname = "C"; + else if (strcmp(datlocale, "C.UTF-8") == 0 || + strcmp(datlocale, "C.UTF8") == 0) + canonname = "C.UTF-8"; + else pg_fatal("invalid locale name 
\"%s\" for builtin provider", datlocale); + + datlocale = canonname; } else if (locale_provider == COLLPROVIDER_ICU) { @@ -2695,6 +2702,13 @@ setup_locale_encoding(void) !check_locale_encoding(lc_collate, encodingid)) exit(1); /* check_locale_encoding printed the error */ + if (locale_provider == COLLPROVIDER_BUILTIN) + { + if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8) + pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", + datlocale, "UTF-8"); + } + if (locale_provider == COLLPROVIDER_ICU && !check_icu_locale_encoding(encodingid)) exit(1); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 3478f58b02..c63d3206d9 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -196,6 +196,23 @@ command_ok( ], 'locale provider builtin with --locale'); +command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E UTF-8', + '--builtin-locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.UTF-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII' +); + command_ok( [ 'initdb', '--no-sync', diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index ed79c0930b..3e67121a8d 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -140,7 +140,7 @@ if ($oldnode->pg_version >= '17devel') { $original_enc_name = "UTF-8"; $original_provider = "b"; - $original_datlocale = "C"; + $original_datlocale = "C.UTF-8"; } elsif ($oldnode->pg_version >= 15 && $ENV{with_icu} eq 'yes') { diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index dfd635bfab..0b371ea4df 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -139,6 
+139,24 @@ $node->command_ok( ], 'create database with provider "builtin" and LC_CTYPE=C'); +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E UTF-8', '--builtin-locale=C.UTF8', + 'tbuiltin5' + ], + 'create database with --builtin-locale C.UTF-8 and -E UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.UTF-8', + 'tbuiltin6' + ], + 'create database with --builtin-locale C.UTF-8 and -E LATIN1'); + $node->command_fails( [ 'createdb', '-T', diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 295560a7ff..be18328ea5 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202403191 +#define CATALOG_VERSION_NO 202403192 #endif diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 938432e8a4..083b0cdcca 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -30,5 +30,8 @@ descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '811', descr => 'sorts by Unicode code point; Unicode and POSIX character semantics', + collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', + colllocale => 'C.UTF-8' }, ] diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..eff0ef21ac --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,136 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test PG_C_UTF8 +-- +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +ERROR: invalid locale name "C_UTF8" for builtin provider +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF-8'); +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE test_pg_c_utf8; +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); + lower +------- + ασ +(1 row) + +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); + lower +------- + αͺσͺ +(1 row) + +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + lower +------- + α΄σ΄ +(1 row) + +-- 
properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1d8a414eea..e48cb4b7a3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..1f5f9ef491 --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,67 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test PG_C_UTF8 +-- + +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF-8'); + +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + +DROP TABLE test_pg_c_utf8; + +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed