Add standard collation UNICODE

This adds a new predefined collation named UNICODE, which sorts according to the default specification of the Unicode Collation Algorithm, as defined by the SQL standard. The collation is only usable if PostgreSQL is built with ICU support. Reviewed-by: Jeff Davis <pgsql@j-davis.com> Discussion: https://www.postgresql.org/message-id/flat/1293e382-2093-a2bf-a397-c04e8f83d3c2@enterprisedb.com
2023-03-10 13:35:00 +01:00 · 2023-03-10 13:35:00 +01:00 · 0d21d4b9bc
parent 6ad5793a49
commit 0d21d4b9bc
5 changed files with 46 additions and 7 deletions
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -659,9 +659,34 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
   </para>

   <para>
-    Additionally, the SQL standard collation name <literal>ucs_basic</literal>
-    is available for encoding <literal>UTF8</literal>.  It is equivalent
-    to <literal>C</literal> and sorts by Unicode code point.
+    Additionally, two SQL standard collation names are available:
+
+    <variablelist>
+     <varlistentry>
+      <term><literal>unicode</literal></term>
+      <listitem>
+       <para>
+        This collation sorts using the Unicode Collation Algorithm with the
+        Default Unicode Collation Element Table.  It is available in all
+        encodings.  ICU support is required to use this collation.  (This
+        collation has the same behavior as the ICU root locale; see <xref
+        linkend="collation-managing-predefined-icu-und-x-icu"/>.)
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>ucs_basic</literal></term>
+      <listitem>
+       <para>
+        This collation sorts by Unicode code point.  It is only available for
+        encoding <literal>UTF8</literal>.  (This collation has the same
+        behavior as the libc locale specification <literal>C</literal> in
+        <literal>UTF8</literal> encoding.)
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
   </para>
  </sect3>

--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -1493,10 +1493,14 @@ static void
 setup_collation(FILE *cmdfd)
 {
 	/*
-	 * Add an SQL-standard name.  We don't want to pin this, so it doesn't go
-	 * in pg_collation.h.  But add it before reading system collations, so
-	 * that it wins if libc defines a locale named ucs_basic.
+	 * Add SQL-standard names.  We don't want to pin these, so they don't go
+	 * in pg_collation.dat.  But add them before reading system collations, so
+	 * that they win if libc defines a locale with the same name.
 	 */
+	PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, colliculocale)"
+				  "VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'unicode', 'pg_catalog'::regnamespace, %u, '%c', true, -1, 'und');\n\n",
+				  BOOTSTRAP_SUPERUSERID, COLLPROVIDER_ICU);
+
 	PG_CMD_PRINTF("INSERT INTO pg_collation (oid, collname, collnamespace, collowner, collprovider, collisdeterministic, collencoding, collcollate, collctype)"
 				  "VALUES (pg_nextoid('pg_catalog.pg_collation', 'oid', 'pg_catalog.pg_collation_oid_index'), 'ucs_basic', 'pg_catalog'::regnamespace, %u, '%c', true, %d, 'C', 'C');\n\n",
 				  BOOTSTRAP_SUPERUSERID, COLLPROVIDER_LIBC, PG_UTF8);
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202303081
+#define CATALOG_VERSION_NO	202303101

 #endif
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1151,6 +1151,15 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
 2 | äbc
 (4 rows)

+SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+ 2 | äbc
+ 3 | bbc
+(4 rows)
+
 -- test ICU collation customization
 -- test the attributes handled by icu_set_collation_attributes()
 CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -447,6 +447,7 @@ drop type textrange_en_us;
 -- standard collations

 SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
+SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;


 -- test ICU collation customization