Add standard collation UNICODE

This adds a new predefined collation named UNICODE, which sorts using the
Unicode Collation Algorithm with the Default Unicode Collation Element
Table, as specified by the SQL standard.

This collation can only be used if PostgreSQL is built with ICU support.

Reviewed-by: Jeff Davis <pgsql@j-davis.com>
Discussion: https://www.postgresql.org/message-id/flat/1293e382-2093-a2bf-a397-c04e8f83d3c2@enterprisedb.com
This commit is contained in:
Peter Eisentraut 2023-03-10 13:35:00 +01:00
parent 6ad5793a49
commit 0d21d4b9bc
5 changed files with 46 additions and 7 deletions

View File

@ -659,9 +659,34 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
</para>
<para>
Additionally, the SQL standard collation name <literal>ucs_basic</literal>
is available for encoding <literal>UTF8</literal>. It is equivalent
to <literal>C</literal> and sorts by Unicode code point.
Additionally, two SQL standard collation names are available:
<variablelist>
<varlistentry>
<term><literal>unicode</literal></term>
<listitem>
<para>
This collation sorts using the Unicode Collation Algorithm with the
Default Unicode Collation Element Table. It is available in all
encodings. ICU support is required to use this collation. (This
collation has the same behavior as the ICU root locale; see <xref
linkend="collation-managing-predefined-icu-und-x-icu"/>.)
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>ucs_basic</literal></term>
<listitem>
<para>
This collation sorts by Unicode code point. It is only available for
encoding <literal>UTF8</literal>. (This collation has the same
behavior as the libc locale specification <literal>C</literal> in
<literal>UTF8</literal> encoding.)
</para>
</listitem>
</varlistentry>
</variablelist>
</para>
</sect3>

View File

@ -1493,10 +1493,14 @@ static void
setup_collation(FILE *cmdfd)
{
/*
* Add an SQL-standard name. We don't want to pin this, so it doesn't go
* in pg_collation.h. But add it before reading system collations, so
* that it wins if libc defines a locale named ucs_basic.
* Add SQL-standard names. We don't want to pin these, so they don't go
* in pg_collation.dat. But add them before reading system collations, so
* that they win if libc defines a locale with the same name.
*/
PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, colliculocale)"
"VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'unicode', 'pg_catalog'::regnamespace, %u, '%c', true, -1, 'und');\n\n",
BOOTSTRAP_SUPERUSERID, COLLPROVIDER_ICU);
PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, collcollate, collctype)"
"VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'ucs_basic', 'pg_catalog'::regnamespace, %u, '%c', true, %d, 'C', 'C');\n\n",
BOOTSTRAP_SUPERUSERID, COLLPROVIDER_LIBC, PG_UTF8);

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202303081
#define CATALOG_VERSION_NO 202303101
#endif

View File

@ -1151,6 +1151,15 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
2 | äbc
(4 rows)
SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
a | b
---+-----
1 | abc
4 | ABC
2 | äbc
3 | bbc
(4 rows)
-- test ICU collation customization
-- test the attributes handled by icu_set_collation_attributes()
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');

View File

@ -447,6 +447,7 @@ drop type textrange_en_us;
-- standard collations
SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
-- test ICU collation customization