postgresql/src/test/regress/expected/collate.icu.utf8.out

/*
 * This test is for ICU collations.
 */
/* skip test if not UTF8 server encoding or no ICU collations installed */
SELECT getdatabaseencoding() <> 'UTF8' OR
       (SELECT count(*) FROM pg_collation WHERE collprovider = 'i' AND collname <> 'unicode') = 0
       AS skip_test \gset
\if :skip_test
\quit
\endif
SET client_encoding TO UTF8;
CREATE SCHEMA collate_tests;
SET search_path = collate_tests;
CREATE TABLE collate_test1 (
    a int,
    b text COLLATE "en-x-icu" NOT NULL
);
\d collate_test1
        Table "collate_tests.collate_test1"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 a      | integer |           |          | 
 b      | text    | en-x-icu  | not null | 

CREATE TABLE collate_test_fail (
    a int,
    b text COLLATE "ja_JP.eucjp-x-icu"
);
ERROR:  collation "ja_JP.eucjp-x-icu" for encoding "UTF8" does not exist
LINE 3:     b text COLLATE "ja_JP.eucjp-x-icu"
                   ^
CREATE TABLE collate_test_fail (
    a int,
    b text COLLATE "foo-x-icu"
);
ERROR:  collation "foo-x-icu" for encoding "UTF8" does not exist
LINE 3:     b text COLLATE "foo-x-icu"
                   ^
CREATE TABLE collate_test_fail (
    a int COLLATE "en-x-icu",
    b text
);
ERROR:  collations are not supported by type integer
LINE 2:     a int COLLATE "en-x-icu",
                  ^
CREATE TABLE collate_test_like (
    LIKE collate_test1
);
\d collate_test_like
      Table "collate_tests.collate_test_like"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 a      | integer |           |          | 
 b      | text    | en-x-icu  | not null | 

CREATE TABLE collate_test2 (
    a int,
    b text COLLATE "sv-x-icu"
);
CREATE TABLE collate_test3 (
    a int,
    b text COLLATE "C"
);
INSERT INTO collate_test1 VALUES (1, 'abc'), (2, 'äbc'), (3, 'bbc'), (4, 'ABC');
INSERT INTO collate_test2 SELECT * FROM collate_test1;
INSERT INTO collate_test3 SELECT * FROM collate_test1;
SELECT * FROM collate_test1 WHERE b >= 'bbc';
 a |  b  
---+-----
 3 | bbc
(1 row)

SELECT * FROM collate_test2 WHERE b >= 'bbc';
 a |  b  
---+-----
 2 | äbc
 3 | bbc
(2 rows)

SELECT * FROM collate_test3 WHERE b >= 'bbc';
 a |  b  
---+-----
 2 | äbc
 3 | bbc
(2 rows)

SELECT * FROM collate_test3 WHERE b >= 'BBC';
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
(3 rows)

SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 a |  b  
---+-----
 2 | äbc
 3 | bbc
(2 rows)

SELECT * FROM collate_test1 WHERE b >= 'bbc' COLLATE "C";
 a |  b  
---+-----
 2 | äbc
 3 | bbc
(2 rows)

SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "C";
 a |  b  
---+-----
 2 | äbc
 3 | bbc
(2 rows)

SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "en-x-icu";
ERROR:  collation mismatch between explicit collations "C" and "en-x-icu"
LINE 1: ...* FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "e...
                                                             ^
CREATE DOMAIN testdomain_sv AS text COLLATE "sv-x-icu";
CREATE DOMAIN testdomain_i AS int COLLATE "sv-x-icu"; -- fails
ERROR:  collations are not supported by type integer
CREATE TABLE collate_test4 (
    a int,
    b testdomain_sv
);
INSERT INTO collate_test4 SELECT * FROM collate_test1;
SELECT a, b FROM collate_test4 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

CREATE TABLE collate_test5 (
    a int,
    b testdomain_sv COLLATE "en-x-icu"
);
INSERT INTO collate_test5 SELECT * FROM collate_test1;
SELECT a, b FROM collate_test5 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, b FROM collate_test1 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, b FROM collate_test2 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b FROM collate_test3 ORDER BY b;
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

-- star expansion
SELECT * FROM collate_test1 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT * FROM collate_test2 ORDER BY b;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT * FROM collate_test3 ORDER BY b;
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

-- constant expression folding
SELECT 'bbc' COLLATE "en-x-icu" > 'äbc' COLLATE "en-x-icu" AS "true";
 true 
------
 t
(1 row)

SELECT 'bbc' COLLATE "sv-x-icu" > 'äbc' COLLATE "sv-x-icu" AS "false";
 false 
-------
 f
(1 row)

-- upper/lower
CREATE TABLE collate_test10 (
    a int,
    x text COLLATE "en-x-icu",
    y text COLLATE "tr-x-icu"
);
INSERT INTO collate_test10 VALUES (1, 'hij', 'hij'), (2, 'HIJ', 'HIJ');
SELECT a, lower(x), lower(y), upper(x), upper(y), initcap(x), initcap(y) FROM collate_test10;
 a | lower | lower | upper | upper | initcap | initcap 
---+-------+-------+-------+-------+---------+---------
 1 | hij   | hij   | HIJ   | HİJ   | Hij     | Hij
 2 | hij   | hıj   | HIJ   | HIJ   | Hij     | Hıj
(2 rows)

SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10;
 a | lower | lower 
---+-------+-------
 1 | hij   | hij
 2 | hij   | hij
(2 rows)

SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
 a |  x  |  y  
---+-----+-----
 2 | HIJ | HIJ
 1 | hij | hij
(2 rows)

-- LIKE/ILIKE
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
 a |  b  
---+-----
 1 | abc
(1 row)

SELECT * FROM collate_test1 WHERE b LIKE 'abc%';
 a |  b  
---+-----
 1 | abc
(1 row)

SELECT * FROM collate_test1 WHERE b LIKE '%bc%';
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
(3 rows)

SELECT * FROM collate_test1 WHERE b ILIKE 'abc';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

SELECT * FROM collate_test1 WHERE b ILIKE 'abc%';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

SELECT * FROM collate_test1 WHERE b ILIKE '%bc%';
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
 4 | ABC
(4 rows)

SELECT 'Türkiye' COLLATE "en-x-icu" ILIKE '%KI%' AS "true";
 true 
------
 t
(1 row)

SELECT 'Türkiye' COLLATE "tr-x-icu" ILIKE '%KI%' AS "false";
 false 
-------
 f
(1 row)

SELECT 'bıt' ILIKE 'BIT' COLLATE "en-x-icu" AS "false";
 false 
-------
 f
(1 row)

SELECT 'bıt' ILIKE 'BIT' COLLATE "tr-x-icu" AS "true";
 true 
------
 t
(1 row)

-- The following actually exercises the selectivity estimation for ILIKE.
SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 relname 
---------
(0 rows)

-- regular expressions
SELECT * FROM collate_test1 WHERE b ~ '^abc$';
 a |  b  
---+-----
 1 | abc
(1 row)

SELECT * FROM collate_test1 WHERE b ~ '^abc';
 a |  b  
---+-----
 1 | abc
(1 row)

SELECT * FROM collate_test1 WHERE b ~ 'bc';
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
(3 rows)

SELECT * FROM collate_test1 WHERE b ~* '^abc$';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

SELECT * FROM collate_test1 WHERE b ~* '^abc';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

SELECT * FROM collate_test1 WHERE b ~* 'bc';
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
 4 | ABC
(4 rows)

CREATE TABLE collate_test6 (
    a int,
    b text COLLATE "en-x-icu"
);
INSERT INTO collate_test6 VALUES (1, 'abc'), (2, 'ABC'), (3, '123'), (4, 'ab1'),
                                 (5, 'a1!'), (6, 'a c'), (7, '!.;'), (8, '   '),
                                 (9, 'äbç'), (10, 'ÄBÇ');
SELECT b,
       b ~ '^[[:alpha:]]+$' AS is_alpha,
       b ~ '^[[:upper:]]+$' AS is_upper,
       b ~ '^[[:lower:]]+$' AS is_lower,
       b ~ '^[[:digit:]]+$' AS is_digit,
       b ~ '^[[:alnum:]]+$' AS is_alnum,
       b ~ '^[[:graph:]]+$' AS is_graph,
       b ~ '^[[:print:]]+$' AS is_print,
       b ~ '^[[:punct:]]+$' AS is_punct,
       b ~ '^[[:space:]]+$' AS is_space
FROM collate_test6;
  b  | is_alpha | is_upper | is_lower | is_digit | is_alnum | is_graph | is_print | is_punct | is_space 
-----+----------+----------+----------+----------+----------+----------+----------+----------+----------
 abc | t        | f        | t        | f        | t        | t        | t        | f        | f
 ABC | t        | t        | f        | f        | t        | t        | t        | f        | f
 123 | f        | f        | f        | t        | t        | t        | t        | f        | f
 ab1 | f        | f        | f        | f        | t        | t        | t        | f        | f
 a1! | f        | f        | f        | f        | f        | t        | t        | f        | f
 a c | f        | f        | f        | f        | f        | f        | t        | f        | f
 !.; | f        | f        | f        | f        | f        | t        | t        | t        | f
     | f        | f        | f        | f        | f        | f        | t        | f        | t
 äbç | t        | f        | t        | f        | t        | t        | t        | f        | f
 ÄBÇ | t        | t        | f        | f        | t        | t        | t        | f        | f
(10 rows)

SELECT 'Türkiye' COLLATE "en-x-icu" ~* 'KI' AS "true";
 true 
------
 t
(1 row)

SELECT 'Türkiye' COLLATE "tr-x-icu" ~* 'KI' AS "true";  -- true with ICU
 true 
------
 t
(1 row)

SELECT 'bıt' ~* 'BIT' COLLATE "en-x-icu" AS "false";
 false 
-------
 f
(1 row)

SELECT 'bıt' ~* 'BIT' COLLATE "tr-x-icu" AS "false";  -- false with ICU
 false 
-------
 f
(1 row)

-- The following actually exercises the selectivity estimation for ~*.
SELECT relname FROM pg_class WHERE relname ~* '^abc';
 relname 
---------
(0 rows)

/* not run by default because it requires tr_TR system locale
-- to_char

SET lc_time TO 'tr_TR';
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr-x-icu");
*/
-- backwards parsing
CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
CREATE VIEW collview3 AS SELECT a, lower((x || x) COLLATE "C") FROM collate_test10;
SELECT table_name, view_definition FROM information_schema.views
  WHERE table_name LIKE 'collview%' ORDER BY 1;
 table_name |              view_definition               
------------+--------------------------------------------
 collview1  |  SELECT a,                                +
            |     b                                     +
            |    FROM collate_test1                     +
            |   WHERE ((b COLLATE "C") >= 'bbc'::text);
 collview2  |  SELECT a,                                +
            |     b                                     +
            |    FROM collate_test1                     +
            |   ORDER BY (b COLLATE "C");
 collview3  |  SELECT a,                                +
            |     lower(((x || x) COLLATE "C")) AS lower+
            |    FROM collate_test10;
(3 rows)

-- collation propagation in various expression types
SELECT a, coalesce(b, 'foo') FROM collate_test1 ORDER BY 2;
 a | coalesce 
---+----------
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, coalesce(b, 'foo') FROM collate_test2 ORDER BY 2;
 a | coalesce 
---+----------
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, coalesce(b, 'foo') FROM collate_test3 ORDER BY 2;
 a | coalesce 
---+----------
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, lower(coalesce(x, 'foo')), lower(coalesce(y, 'foo')) FROM collate_test10;
 a | lower | lower 
---+-------+-------
 1 | hij   | hij
 2 | hij   | hıj
(2 rows)

SELECT a, b, greatest(b, 'CCC') FROM collate_test1 ORDER BY 3;
 a |  b  | greatest 
---+-----+----------
 1 | abc | CCC
 2 | äbc | CCC
 3 | bbc | CCC
 4 | ABC | CCC
(4 rows)

SELECT a, b, greatest(b, 'CCC') FROM collate_test2 ORDER BY 3;
 a |  b  | greatest 
---+-----+----------
 1 | abc | CCC
 3 | bbc | CCC
 4 | ABC | CCC
 2 | äbc | äbc
(4 rows)

SELECT a, b, greatest(b, 'CCC') FROM collate_test3 ORDER BY 3;
 a |  b  | greatest 
---+-----+----------
 4 | ABC | CCC
 1 | abc | abc
 3 | bbc | bbc
 2 | äbc | äbc
(4 rows)

SELECT a, x, y, lower(greatest(x, 'foo')), lower(greatest(y, 'foo')) FROM collate_test10;
 a |  x  |  y  | lower | lower 
---+-----+-----+-------+-------
 1 | hij | hij | hij   | hij
 2 | HIJ | HIJ | hij   | hıj
(2 rows)

SELECT a, nullif(b, 'abc') FROM collate_test1 ORDER BY 2;
 a | nullif 
---+--------
 4 | ABC
 2 | äbc
 3 | bbc
 1 | 
(4 rows)

SELECT a, nullif(b, 'abc') FROM collate_test2 ORDER BY 2;
 a | nullif 
---+--------
 4 | ABC
 3 | bbc
 2 | äbc
 1 | 
(4 rows)

SELECT a, nullif(b, 'abc') FROM collate_test3 ORDER BY 2;
 a | nullif 
---+--------
 4 | ABC
 3 | bbc
 2 | äbc
 1 | 
(4 rows)

SELECT a, lower(nullif(x, 'foo')), lower(nullif(y, 'foo')) FROM collate_test10;
 a | lower | lower 
---+-------+-------
 1 | hij   | hij
 2 | hij   | hıj
(2 rows)

SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test1 ORDER BY 2;
 a |  b   
---+------
 4 | ABC
 2 | äbc
 1 | abcd
 3 | bbc
(4 rows)

SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test2 ORDER BY 2;
 a |  b   
---+------
 4 | ABC
 1 | abcd
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test3 ORDER BY 2;
 a |  b   
---+------
 4 | ABC
 1 | abcd
 3 | bbc
 2 | äbc
(4 rows)

CREATE DOMAIN testdomain AS text;
SELECT a, b::testdomain FROM collate_test1 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, b::testdomain FROM collate_test2 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b::testdomain FROM collate_test3 ORDER BY 2;
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b::testdomain_sv FROM collate_test3 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, lower(x::testdomain), lower(y::testdomain) FROM collate_test10;
 a | lower | lower 
---+-------+-------
 1 | hij   | hij
 2 | hij   | hıj
(2 rows)

SELECT min(b), max(b) FROM collate_test1;
 min | max 
-----+-----
 abc | bbc
(1 row)

SELECT min(b), max(b) FROM collate_test2;
 min | max 
-----+-----
 abc | äbc
(1 row)

SELECT min(b), max(b) FROM collate_test3;
 min | max 
-----+-----
 ABC | äbc
(1 row)

SELECT array_agg(b ORDER BY b) FROM collate_test1;
     array_agg     
-------------------
 {abc,ABC,äbc,bbc}
(1 row)

SELECT array_agg(b ORDER BY b) FROM collate_test2;
     array_agg     
-------------------
 {abc,ABC,bbc,äbc}
(1 row)

SELECT array_agg(b ORDER BY b) FROM collate_test3;
     array_agg     
-------------------
 {ABC,abc,bbc,äbc}
(1 row)

SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test1 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 1 | abc
 4 | ABC
 4 | ABC
 2 | äbc
 2 | äbc
 3 | bbc
 3 | bbc
(8 rows)

SELECT a, b FROM collate_test2 UNION SELECT a, b FROM collate_test2 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b FROM collate_test3 WHERE a < 4 INTERSECT SELECT a, b FROM collate_test3 WHERE a > 1 ORDER BY 2;
 a |  b  
---+-----
 3 | bbc
 2 | äbc
(2 rows)

SELECT a, b FROM collate_test3 EXCEPT SELECT a, b FROM collate_test3 WHERE a < 2 ORDER BY 2;
 a |  b  
---+-----
 4 | ABC
 3 | bbc
 2 | äbc
(3 rows)

SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
ERROR:  could not determine which collation to use for string comparison
HINT:  Use the COLLATE clause to set the collation explicitly.
SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3; -- ok
 a |  b  
---+-----
 1 | abc
 2 | äbc
 3 | bbc
 4 | ABC
 1 | abc
 2 | äbc
 3 | bbc
 4 | ABC
(8 rows)

SELECT a, b FROM collate_test1 UNION SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
LINE 1: SELECT a, b FROM collate_test1 UNION SELECT a, b FROM collat...
                                                       ^
HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
SELECT a, b COLLATE "C" FROM collate_test1 UNION SELECT a, b FROM collate_test3 ORDER BY 2; -- ok
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, b FROM collate_test1 INTERSECT SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
LINE 1: ...ELECT a, b FROM collate_test1 INTERSECT SELECT a, b FROM col...
                                                             ^
HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
SELECT a, b FROM collate_test1 EXCEPT SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
LINE 1: SELECT a, b FROM collate_test1 EXCEPT SELECT a, b FROM colla...
                                                        ^
HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
CREATE TABLE test_u AS SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3; -- fail
ERROR:  no collation was derived for column "b" with collatable type text
HINT:  Use the COLLATE clause to set the collation explicitly.
-- ideally this would be a parse-time error, but for now it must be run-time:
select x < y from collate_test10; -- fail
ERROR:  could not determine which collation to use for string comparison
HINT:  Use the COLLATE clause to set the collation explicitly.
select x || y from collate_test10; -- ok, because || is not collation aware
 ?column? 
----------
 hijhij
 HIJHIJ
(2 rows)

select x, y from collate_test10 order by x || y; -- not so ok
ERROR:  collation mismatch between implicit collations "en-x-icu" and "tr-x-icu"
LINE 1: select x, y from collate_test10 order by x || y;
                                                      ^
HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
-- collation mismatch between recursive and non-recursive term
WITH RECURSIVE foo(x) AS
   (SELECT x FROM (VALUES('a' COLLATE "en-x-icu"),('b')) t(x)
   UNION ALL
   SELECT (x || 'c') COLLATE "de-x-icu" FROM foo WHERE length(x) < 10)
SELECT * FROM foo;
ERROR:  recursive query "foo" column 1 has collation "en-x-icu" in non-recursive term but collation "de-x-icu" overall
LINE 2:    (SELECT x FROM (VALUES('a' COLLATE "en-x-icu"),('b')) t(x...
                   ^
HINT:  Use the COLLATE clause to set the collation of the non-recursive term.
-- casting
SELECT CAST('42' AS text COLLATE "C");
ERROR:  syntax error at or near "COLLATE"
LINE 1: SELECT CAST('42' AS text COLLATE "C");
                                 ^
SELECT a, CAST(b AS varchar) FROM collate_test1 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, CAST(b AS varchar) FROM collate_test2 ORDER BY 2;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, CAST(b AS varchar) FROM collate_test3 ORDER BY 2;
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

-- propagation of collation in SQL functions (inlined and non-inlined cases)
-- and plpgsql functions too
CREATE FUNCTION mylt (text, text) RETURNS boolean LANGUAGE sql
    AS $$ select $1 < $2 $$;
CREATE FUNCTION mylt_noninline (text, text) RETURNS boolean LANGUAGE sql
    AS $$ select $1 < $2 limit 1 $$;
CREATE FUNCTION mylt_plpgsql (text, text) RETURNS boolean LANGUAGE plpgsql
    AS $$ begin return $1 < $2; end $$;
SELECT a.b AS a, b.b AS b, a.b < b.b AS lt,
       mylt(a.b, b.b), mylt_noninline(a.b, b.b), mylt_plpgsql(a.b, b.b)
FROM collate_test1 a, collate_test1 b
ORDER BY a.b, b.b;
  a  |  b  | lt | mylt | mylt_noninline | mylt_plpgsql 
-----+-----+----+------+----------------+--------------
 abc | abc | f  | f    | f              | f
 abc | ABC | t  | t    | t              | t
 abc | äbc | t  | t    | t              | t
 abc | bbc | t  | t    | t              | t
 ABC | abc | f  | f    | f              | f
 ABC | ABC | f  | f    | f              | f
 ABC | äbc | t  | t    | t              | t
 ABC | bbc | t  | t    | t              | t
 äbc | abc | f  | f    | f              | f
 äbc | ABC | f  | f    | f              | f
 äbc | äbc | f  | f    | f              | f
 äbc | bbc | t  | t    | t              | t
 bbc | abc | f  | f    | f              | f
 bbc | ABC | f  | f    | f              | f
 bbc | äbc | f  | f    | f              | f
 bbc | bbc | f  | f    | f              | f
(16 rows)

SELECT a.b AS a, b.b AS b, a.b < b.b COLLATE "C" AS lt,
       mylt(a.b, b.b COLLATE "C"), mylt_noninline(a.b, b.b COLLATE "C"),
       mylt_plpgsql(a.b, b.b COLLATE "C")
FROM collate_test1 a, collate_test1 b
ORDER BY a.b, b.b;
  a  |  b  | lt | mylt | mylt_noninline | mylt_plpgsql 
-----+-----+----+------+----------------+--------------
 abc | abc | f  | f    | f              | f
 abc | ABC | f  | f    | f              | f
 abc | äbc | t  | t    | t              | t
 abc | bbc | t  | t    | t              | t
 ABC | abc | t  | t    | t              | t
 ABC | ABC | f  | f    | f              | f
 ABC | äbc | t  | t    | t              | t
 ABC | bbc | t  | t    | t              | t
 äbc | abc | f  | f    | f              | f
 äbc | ABC | f  | f    | f              | f
 äbc | äbc | f  | f    | f              | f
 äbc | bbc | f  | f    | f              | f
 bbc | abc | f  | f    | f              | f
 bbc | ABC | f  | f    | f              | f
 bbc | äbc | t  | t    | t              | t
 bbc | bbc | f  | f    | f              | f
(16 rows)

-- collation override in plpgsql
CREATE FUNCTION mylt2 (x text, y text) RETURNS boolean LANGUAGE plpgsql AS $$
declare
  xx text := x;
  yy text := y;
begin
  return xx < yy;
end
$$;
SELECT mylt2('a', 'B' collate "en-x-icu") as t, mylt2('a', 'B' collate "C") as f;
 t | f 
---+---
 t | f
(1 row)

CREATE OR REPLACE FUNCTION
  mylt2 (x text, y text) RETURNS boolean LANGUAGE plpgsql AS $$
declare
  xx text COLLATE "POSIX" := x;
  yy text := y;
begin
  return xx < yy;
end
$$;
SELECT mylt2('a', 'B') as f;
 f 
---
 f
(1 row)

SELECT mylt2('a', 'B' collate "C") as fail; -- conflicting collations
ERROR:  could not determine which collation to use for string comparison
HINT:  Use the COLLATE clause to set the collation explicitly.
CONTEXT:  PL/pgSQL function mylt2(text,text) line 6 at RETURN
SELECT mylt2('a', 'B' collate "POSIX") as f;
 f 
---
 f
(1 row)

-- polymorphism
SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test1)) ORDER BY 1;
 unnest 
--------
 abc
 ABC
 äbc
 bbc
(4 rows)

SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test2)) ORDER BY 1;
 unnest 
--------
 abc
 ABC
 bbc
 äbc
(4 rows)

SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test3)) ORDER BY 1;
 unnest 
--------
 ABC
 abc
 bbc
 äbc
(4 rows)

CREATE FUNCTION dup (anyelement) RETURNS anyelement
    AS 'select $1' LANGUAGE sql;
SELECT a, dup(b) FROM collate_test1 ORDER BY 2;
 a | dup 
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

SELECT a, dup(b) FROM collate_test2 ORDER BY 2;
 a | dup 
---+-----
 1 | abc
 4 | ABC
 3 | bbc
 2 | äbc
(4 rows)

SELECT a, dup(b) FROM collate_test3 ORDER BY 2;
 a | dup 
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

-- indexes
CREATE INDEX collate_test1_idx1 ON collate_test1 (b);
CREATE INDEX collate_test1_idx2 ON collate_test1 (b COLLATE "C");
CREATE INDEX collate_test1_idx3 ON collate_test1 ((b COLLATE "C")); -- this is different grammatically
CREATE INDEX collate_test1_idx4 ON collate_test1 (((b||'foo') COLLATE "POSIX"));
CREATE INDEX collate_test1_idx5 ON collate_test1 (a COLLATE "C"); -- fail
ERROR:  collations are not supported by type integer
CREATE INDEX collate_test1_idx6 ON collate_test1 ((a COLLATE "C")); -- fail
ERROR:  collations are not supported by type integer
LINE 1: ...ATE INDEX collate_test1_idx6 ON collate_test1 ((a COLLATE "C...
                                                             ^
SELECT relname, pg_get_indexdef(oid) FROM pg_class WHERE relname LIKE 'collate_test%_idx%' ORDER BY 1;
      relname       |                                                  pg_get_indexdef                                                  
--------------------+-------------------------------------------------------------------------------------------------------------------
 collate_test1_idx1 | CREATE INDEX collate_test1_idx1 ON collate_tests.collate_test1 USING btree (b)
 collate_test1_idx2 | CREATE INDEX collate_test1_idx2 ON collate_tests.collate_test1 USING btree (b COLLATE "C")
 collate_test1_idx3 | CREATE INDEX collate_test1_idx3 ON collate_tests.collate_test1 USING btree (b COLLATE "C")
 collate_test1_idx4 | CREATE INDEX collate_test1_idx4 ON collate_tests.collate_test1 USING btree (((b || 'foo'::text)) COLLATE "POSIX")
(4 rows)

set enable_seqscan = off;
explain (costs off)
select * from collate_test1 where b ilike 'abc';
          QUERY PLAN           
-------------------------------
 Seq Scan on collate_test1
   Filter: (b ~~* 'abc'::text)
(2 rows)

select * from collate_test1 where b ilike 'abc';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

explain (costs off)
select * from collate_test1 where b ilike 'ABC';
          QUERY PLAN           
-------------------------------
 Seq Scan on collate_test1
   Filter: (b ~~* 'ABC'::text)
(2 rows)

select * from collate_test1 where b ilike 'ABC';
 a |  b  
---+-----
 1 | abc
 4 | ABC
(2 rows)

reset enable_seqscan;
-- schema manipulation commands
CREATE ROLE regress_test_role;
CREATE SCHEMA test_schema;
-- We need to do this this way to cope with varying names for encodings:
SET client_min_messages TO WARNING;
SET icu_validation_level = disabled;
do $$
BEGIN
  EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN datlocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
END
$$;
CREATE COLLATION test0 FROM "C"; -- fail, duplicate name
ERROR:  collation "test0" already exists
do $$
BEGIN
  EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' ||
          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN datlocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
END
$$;
RESET icu_validation_level;
RESET client_min_messages;
CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
ERROR:  parameter "locale" must be specified
SET icu_validation_level = ERROR;
CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
ERROR:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
HINT:  To disable ICU locale validation, set the parameter icu_validation_level to "disabled".
CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
ERROR:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
RESET icu_validation_level;
CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
WARNING:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
WARNING:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
HINT:  To disable ICU locale validation, set the parameter icu_validation_level to "disabled".
CREATE COLLATION test4 FROM nonsense;
ERROR:  collation "nonsense" for encoding "UTF8" does not exist
CREATE COLLATION test5 FROM test0;
SELECT collname FROM pg_collation WHERE collname LIKE 'test%' ORDER BY 1;
 collname 
----------
 test0
 test1
 test5
(3 rows)

ALTER COLLATION test1 RENAME TO test11;
ALTER COLLATION test0 RENAME TO test11; -- fail
ERROR:  collation "test11" already exists in schema "collate_tests"
ALTER COLLATION test1 RENAME TO test22; -- fail
ERROR:  collation "test1" for encoding "UTF8" does not exist
ALTER COLLATION test11 OWNER TO regress_test_role;
ALTER COLLATION test11 OWNER TO nonsense;
ERROR:  role "nonsense" does not exist
ALTER COLLATION test11 SET SCHEMA test_schema;
COMMENT ON COLLATION test0 IS 'US English';
SELECT collname, nspname, obj_description(pg_collation.oid, 'pg_collation')
    FROM pg_collation JOIN pg_namespace ON (collnamespace = pg_namespace.oid)
    WHERE collname LIKE 'test%'
    ORDER BY 1;
 collname |    nspname    | obj_description 
----------+---------------+-----------------
 test0    | collate_tests | US English
 test11   | test_schema   | 
 test5    | collate_tests | 
(3 rows)

DROP COLLATION test0, test_schema.test11, test5;
DROP COLLATION test0; -- fail
ERROR:  collation "test0" for encoding "UTF8" does not exist
DROP COLLATION IF EXISTS test0;
NOTICE:  collation "test0" does not exist, skipping
SELECT collname FROM pg_collation WHERE collname LIKE 'test%';
 collname 
----------
(0 rows)

DROP SCHEMA test_schema;
DROP ROLE regress_test_role;
-- ALTER
ALTER COLLATION "en-x-icu" REFRESH VERSION;
NOTICE:  version has not changed
-- also test for database while we are here
SELECT current_database() AS datname \gset
ALTER DATABASE :"datname" REFRESH COLLATION VERSION;
NOTICE:  version has not changed
-- dependencies
CREATE COLLATION test0 FROM "C";
CREATE TABLE collate_dep_test1 (a int, b text COLLATE test0);
CREATE DOMAIN collate_dep_dom1 AS text COLLATE test0;
CREATE TYPE collate_dep_test2 AS (x int, y text COLLATE test0);
CREATE VIEW collate_dep_test3 AS SELECT text 'foo' COLLATE test0 AS foo;
CREATE TABLE collate_dep_test4t (a int, b text);
CREATE INDEX collate_dep_test4i ON collate_dep_test4t (b COLLATE test0);
DROP COLLATION test0 RESTRICT; -- fail
ERROR:  cannot drop collation test0 because other objects depend on it
DETAIL:  column b of table collate_dep_test1 depends on collation test0
type collate_dep_dom1 depends on collation test0
column y of composite type collate_dep_test2 depends on collation test0
view collate_dep_test3 depends on collation test0
index collate_dep_test4i depends on collation test0
HINT:  Use DROP ... CASCADE to drop the dependent objects too.
DROP COLLATION test0 CASCADE;
NOTICE:  drop cascades to 5 other objects
DETAIL:  drop cascades to column b of table collate_dep_test1
drop cascades to type collate_dep_dom1
drop cascades to column y of composite type collate_dep_test2
drop cascades to view collate_dep_test3
drop cascades to index collate_dep_test4i
\d collate_dep_test1
      Table "collate_tests.collate_dep_test1"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 a      | integer |           |          | 

\d collate_dep_test2
 Composite type "collate_tests.collate_dep_test2"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 x      | integer |           |          | 

DROP TABLE collate_dep_test1, collate_dep_test4t;
DROP TYPE collate_dep_test2;
-- test range types and collations
create type textrange_c as range(subtype=text, collation="C");
create type textrange_en_us as range(subtype=text, collation="en-x-icu");
select textrange_c('A','Z') @> 'b'::text;
 ?column? 
----------
 f
(1 row)

select textrange_en_us('A','Z') @> 'b'::text;
 ?column? 
----------
 t
(1 row)

drop type textrange_c;
drop type textrange_en_us;
-- standard collations
SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
 a |  b  
---+-----
 4 | ABC
 1 | abc
 3 | bbc
 2 | äbc
(4 rows)

SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
 a |  b  
---+-----
 1 | abc
 4 | ABC
 2 | äbc
 3 | bbc
(4 rows)

-- test ICU collation customization
-- test the attributes handled by icu_set_collation_attributes()
SET client_min_messages=WARNING;
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
RESET client_min_messages;
SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

SET client_min_messages=WARNING;
CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
RESET client_min_messages;
SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
NOTICE:  using standard form "und-u-kf-lower" for ICU locale "@colCaseFirst=lower"
CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
NOTICE:  using standard form "und-u-kf-upper" for ICU locale "@colCaseFirst=upper"
SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
NOTICE:  using standard form "und-u-ka-shifted" for ICU locale "@colAlternate=shifted"
SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

SET client_min_messages=WARNING;
CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
RESET client_min_messages;
SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
NOTICE:  using standard form "und-u-kn-lower" for ICU locale "@colNumeric=lower"
ERROR:  could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR
-- test that attributes not handled by icu_set_collation_attributes()
-- (handled by ucol_open() directly) also work
CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook');
NOTICE:  using standard form "de-u-co-phonebk" for ICU locale "de@collation=phonebook"
SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

-- rules
CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g');
NOTICE:  using standard form "und" for ICU locale ""
CREATE TABLE test7 (a text);
-- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax
INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green');
SELECT * FROM test7 ORDER BY a COLLATE "en-x-icu";
     a     
-----------
 Abernathy
 apple
 bird
 Boston
 Graham
 green
(6 rows)

SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1;
     a     
-----------
 Abernathy
 apple
 green
 bird
 Boston
 Graham
(6 rows)

DROP TABLE test7;
CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!');
NOTICE:  using standard form "und" for ICU locale ""
ERROR:  could not open collator for locale "und" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR
-- nondeterministic collations
CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
NOTICE:  using standard form "und" for ICU locale ""
CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
NOTICE:  using standard form "und" for ICU locale ""
CREATE TABLE test6 (a int, b text);
-- same string in different normal forms
INSERT INTO test6 VALUES (1, U&'\00E4bc');
INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
SELECT * FROM test6;
 a |  b  
---+-----
 1 | äbc
 2 | äbc
(2 rows)

SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
 a |  b  
---+-----
 1 | äbc
(1 row)

SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
 a |  b  
---+-----
 1 | äbc
 2 | äbc
(2 rows)

-- same with arrays
CREATE TABLE test6a (a int, b text[]);
INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']);
INSERT INTO test6a VALUES (2, ARRAY[U&'\0061\0308bc']);
SELECT * FROM test6a;
 a |   b   
---+-------
 1 | {äbc}
 2 | {äbc}
(2 rows)

SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_det;
 a |   b   
---+-------
 1 | {äbc}
(1 row)

SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet;
 a |   b   
---+-------
 1 | {äbc}
 2 | {äbc}
(2 rows)

CREATE COLLATION case_sensitive (provider = icu, locale = '');
NOTICE:  using standard form "und" for ICU locale ""
CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
NOTICE:  using standard form "und-u-ks-level2" for ICU locale "@colStrength=secondary"
SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 ?column? | ?column? 
----------+----------
 t        | f
(1 row)

SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 ?column? | ?column? 
----------+----------
 t        | t
(1 row)

-- test language tags
CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
 ?column? 
----------
 t
(1 row)

CREATE COLLATION lt_upperfirst (provider = icu, locale = 'und-u-kf-upper');
SELECT 'Z' COLLATE lt_upperfirst < 'z' COLLATE lt_upperfirst;
 ?column? 
----------
 t
(1 row)

CREATE TABLE test1cs (x text COLLATE case_sensitive);
CREATE TABLE test2cs (x text COLLATE case_sensitive);
CREATE TABLE test3cs (x text COLLATE case_sensitive);
INSERT INTO test1cs VALUES ('abc'), ('def'), ('ghi');
INSERT INTO test2cs VALUES ('ABC'), ('ghi');
INSERT INTO test3cs VALUES ('abc'), ('ABC'), ('def'), ('ghi');
SELECT x FROM test3cs WHERE x = 'abc';
  x  
-----
 abc
(1 row)

SELECT x FROM test3cs WHERE x <> 'abc';
  x  
-----
 ABC
 def
 ghi
(3 rows)

SELECT x FROM test3cs WHERE x LIKE 'a%';
  x  
-----
 abc
(1 row)

SELECT x FROM test3cs WHERE x ILIKE 'a%';
  x  
-----
 abc
 ABC
(2 rows)

SELECT x FROM test3cs WHERE x SIMILAR TO 'a%';
  x  
-----
 abc
(1 row)

SELECT x FROM test3cs WHERE x ~ 'a';
  x  
-----
 abc
(1 row)

SET enable_hashagg TO off;
SELECT x FROM test1cs UNION SELECT x FROM test2cs ORDER BY x;
  x  
-----
 abc
 ABC
 def
 ghi
(4 rows)

SELECT x FROM test2cs UNION SELECT x FROM test1cs ORDER BY x;
  x  
-----
 abc
 ABC
 def
 ghi
(4 rows)

SELECT x FROM test1cs INTERSECT SELECT x FROM test2cs;
  x  
-----
 ghi
(1 row)

SELECT x FROM test2cs INTERSECT SELECT x FROM test1cs;
  x  
-----
 ghi
(1 row)

SELECT x FROM test1cs EXCEPT SELECT x FROM test2cs;
  x  
-----
 abc
 def
(2 rows)

SELECT x FROM test2cs EXCEPT SELECT x FROM test1cs;
  x  
-----
 ABC
(1 row)

SELECT DISTINCT x FROM test3cs ORDER BY x;
  x  
-----
 abc
 ABC
 def
 ghi
(4 rows)

RESET enable_hashagg;
SELECT count(DISTINCT x) FROM test3cs;
 count 
-------
     4
(1 row)

SELECT x, count(*) FROM test3cs GROUP BY x ORDER BY x;
  x  | count 
-----+-------
 abc |     1
 ABC |     1
 def |     1
 ghi |     1
(4 rows)

SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3cs ORDER BY x;
  x  | row_number | rank 
-----+------------+------
 abc |          1 |    1
 ABC |          2 |    2
 def |          3 |    3
 ghi |          4 |    4
(4 rows)

CREATE UNIQUE INDEX ON test1cs (x);  -- ok
INSERT INTO test1cs VALUES ('ABC');  -- ok
CREATE UNIQUE INDEX ON test3cs (x);  -- ok
SELECT string_to_array('ABC,DEF,GHI' COLLATE case_sensitive, ',', 'abc');
 string_to_array 
-----------------
 {ABC,DEF,GHI}
(1 row)

SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
   string_to_array   
---------------------
 {A,B,C,D,E,F,G,H,I}
(1 row)

CREATE TABLE test1ci (x text COLLATE case_insensitive);
CREATE TABLE test2ci (x text COLLATE case_insensitive);
CREATE TABLE test3ci (x text COLLATE case_insensitive);
CREATE INDEX ON test3ci (x text_pattern_ops);  -- error
ERROR:  nondeterministic collations are not supported for operator class "text_pattern_ops"
INSERT INTO test1ci VALUES ('abc'), ('def'), ('ghi');
INSERT INTO test2ci VALUES ('ABC'), ('ghi');
INSERT INTO test3ci VALUES ('abc'), ('ABC'), ('def'), ('ghi');
SELECT x FROM test3ci WHERE x = 'abc';
  x  
-----
 abc
 ABC
(2 rows)

SELECT x FROM test3ci WHERE x <> 'abc';
  x  
-----
 def
 ghi
(2 rows)

SELECT x FROM test3ci WHERE x LIKE 'a%';
ERROR:  nondeterministic collations are not supported for LIKE
SELECT x FROM test3ci WHERE x ILIKE 'a%';
ERROR:  nondeterministic collations are not supported for ILIKE
SELECT x FROM test3ci WHERE x SIMILAR TO 'a%';
ERROR:  nondeterministic collations are not supported for regular expressions
SELECT x FROM test3ci WHERE x ~ 'a';
ERROR:  nondeterministic collations are not supported for regular expressions
SELECT x FROM test1ci UNION SELECT x FROM test2ci ORDER BY x;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT x FROM test2ci UNION SELECT x FROM test1ci ORDER BY x;
  x  
-----
 ABC
 def
 ghi
(3 rows)

SELECT x FROM test1ci INTERSECT SELECT x FROM test2ci ORDER BY x;
  x  
-----
 abc
 ghi
(2 rows)

SELECT x FROM test2ci INTERSECT SELECT x FROM test1ci ORDER BY x;
  x  
-----
 ABC
 ghi
(2 rows)

SELECT x FROM test1ci EXCEPT SELECT x FROM test2ci;
  x  
-----
 def
(1 row)

SELECT x FROM test2ci EXCEPT SELECT x FROM test1ci;
 x 
---
(0 rows)

SELECT DISTINCT x FROM test3ci ORDER BY x;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT count(DISTINCT x) FROM test3ci;
 count 
-------
     3
(1 row)

SELECT x, count(*) FROM test3ci GROUP BY x ORDER BY x;
  x  | count 
-----+-------
 abc |     2
 def |     1
 ghi |     1
(3 rows)

SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3ci ORDER BY x;
  x  | row_number | rank 
-----+------------+------
 abc |          1 |    1
 ABC |          2 |    1
 def |          3 |    3
 ghi |          4 |    4
(4 rows)

CREATE UNIQUE INDEX ON test1ci (x);  -- ok
INSERT INTO test1ci VALUES ('ABC');  -- error
ERROR:  duplicate key value violates unique constraint "test1ci_x_idx"
DETAIL:  Key (x)=(ABC) already exists.
CREATE UNIQUE INDEX ON test3ci (x);  -- error
ERROR:  could not create unique index "test3ci_x_idx"
DETAIL:  Key (x)=(abc) is duplicated.
SELECT string_to_array('ABC,DEF,GHI' COLLATE case_insensitive, ',', 'abc');
ERROR:  nondeterministic collations are not supported for substring searches
SELECT string_to_array('ABCDEFGHI' COLLATE case_insensitive, NULL, 'b');
    string_to_array     
------------------------
 {A,NULL,C,D,E,F,G,H,I}
(1 row)

-- bpchar
CREATE TABLE test1bpci (x char(3) COLLATE case_insensitive);
CREATE TABLE test2bpci (x char(3) COLLATE case_insensitive);
CREATE TABLE test3bpci (x char(3) COLLATE case_insensitive);
CREATE INDEX ON test3bpci (x bpchar_pattern_ops);  -- error
ERROR:  nondeterministic collations are not supported for operator class "bpchar_pattern_ops"
INSERT INTO test1bpci VALUES ('abc'), ('def'), ('ghi');
INSERT INTO test2bpci VALUES ('ABC'), ('ghi');
INSERT INTO test3bpci VALUES ('abc'), ('ABC'), ('def'), ('ghi');
SELECT x FROM test3bpci WHERE x = 'abc';
  x  
-----
 abc
 ABC
(2 rows)

SELECT x FROM test3bpci WHERE x <> 'abc';
  x  
-----
 def
 ghi
(2 rows)

SELECT x FROM test3bpci WHERE x LIKE 'a%';
ERROR:  nondeterministic collations are not supported for LIKE
SELECT x FROM test3bpci WHERE x ILIKE 'a%';
ERROR:  nondeterministic collations are not supported for ILIKE
SELECT x FROM test3bpci WHERE x SIMILAR TO 'a%';
ERROR:  nondeterministic collations are not supported for regular expressions
SELECT x FROM test3bpci WHERE x ~ 'a';
ERROR:  nondeterministic collations are not supported for regular expressions
SELECT x FROM test1bpci UNION SELECT x FROM test2bpci ORDER BY x;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT x FROM test2bpci UNION SELECT x FROM test1bpci ORDER BY x;
  x  
-----
 ABC
 def
 ghi
(3 rows)

SELECT x FROM test1bpci INTERSECT SELECT x FROM test2bpci ORDER BY x;
  x  
-----
 abc
 ghi
(2 rows)

SELECT x FROM test2bpci INTERSECT SELECT x FROM test1bpci ORDER BY x;
  x  
-----
 ABC
 ghi
(2 rows)

SELECT x FROM test1bpci EXCEPT SELECT x FROM test2bpci;
  x  
-----
 def
(1 row)

SELECT x FROM test2bpci EXCEPT SELECT x FROM test1bpci;
 x 
---
(0 rows)

SELECT DISTINCT x FROM test3bpci ORDER BY x;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT count(DISTINCT x) FROM test3bpci;
 count 
-------
     3
(1 row)

SELECT x, count(*) FROM test3bpci GROUP BY x ORDER BY x;
  x  | count 
-----+-------
 abc |     2
 def |     1
 ghi |     1
(3 rows)

SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3bpci ORDER BY x;
  x  | row_number | rank 
-----+------------+------
 abc |          1 |    1
 ABC |          2 |    1
 def |          3 |    3
 ghi |          4 |    4
(4 rows)

CREATE UNIQUE INDEX ON test1bpci (x);  -- ok
INSERT INTO test1bpci VALUES ('ABC');  -- error
ERROR:  duplicate key value violates unique constraint "test1bpci_x_idx"
DETAIL:  Key (x)=(ABC) already exists.
CREATE UNIQUE INDEX ON test3bpci (x);  -- error
ERROR:  could not create unique index "test3bpci_x_idx"
DETAIL:  Key (x)=(abc) is duplicated.
SELECT string_to_array('ABC,DEF,GHI'::char(11) COLLATE case_insensitive, ',', 'abc');
ERROR:  nondeterministic collations are not supported for substring searches
SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b');
    string_to_array     
------------------------
 {A,NULL,C,D,E,F,G,H,I}
(1 row)

-- This tests the issue described in match_pattern_prefix().  In the
-- absence of that check, the case_insensitive tests below would
-- return no rows where they should logically return one.
CREATE TABLE test4c (x text COLLATE "C");
INSERT INTO test4c VALUES ('abc');
CREATE INDEX ON test4c (x);
SET enable_seqscan = off;
SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_sensitive;  -- ok, no rows
 x 
---
(0 rows)

SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive;  -- ok, no rows
 x 
---
(0 rows)

SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- error
ERROR:  nondeterministic collations are not supported for LIKE
SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- error
ERROR:  nondeterministic collations are not supported for LIKE
RESET enable_seqscan;
-- Unicode special case: different variants of Greek lower case sigma.
-- A naive implementation like citext that just does lower(x) =
-- lower(y) will do the wrong thing here, because lower('Σ') is 'σ'
-- but upper('ς') is 'Σ'.
SELECT 'ὀδυσσεύς' = 'ὈΔΥΣΣΕΎΣ' COLLATE case_sensitive;
 ?column? 
----------
 f
(1 row)

SELECT 'ὀδυσσεύς' = 'ὈΔΥΣΣΕΎΣ' COLLATE case_insensitive;
 ?column? 
----------
 t
(1 row)

-- name vs. text comparison operators
SELECT relname FROM pg_class WHERE relname = 'PG_CLASS'::text COLLATE case_insensitive;
 relname  
----------
 pg_class
(1 row)

SELECT relname FROM pg_class WHERE 'PG_CLASS'::text = relname COLLATE case_insensitive;
 relname  
----------
 pg_class
(1 row)

SELECT typname FROM pg_type WHERE typname LIKE 'int_' AND typname <> 'INT2'::text
  COLLATE case_insensitive ORDER BY typname;
 typname 
---------
 int4
 int8
(2 rows)

SELECT typname FROM pg_type WHERE typname LIKE 'int_' AND 'INT2'::text <> typname
  COLLATE case_insensitive ORDER BY typname;
 typname 
---------
 int4
 int8
(2 rows)

-- test case adapted from subselect.sql
CREATE TEMP TABLE outer_text (f1 text COLLATE case_insensitive, f2 text);
INSERT INTO outer_text VALUES ('a', 'a');
INSERT INTO outer_text VALUES ('b', 'a');
INSERT INTO outer_text VALUES ('A', NULL);
INSERT INTO outer_text VALUES ('B', NULL);
CREATE TEMP TABLE inner_text (c1 text COLLATE case_insensitive, c2 text);
INSERT INTO inner_text VALUES ('a', NULL);
SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text);
 f1 | f2 
----+----
 b  | a
 B  | 
(2 rows)

-- accents
SET client_min_messages=WARNING;
CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false);
RESET client_min_messages;
CREATE TABLE test4 (a int, b text);
INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
SELECT * FROM test4 WHERE b = 'cote';
 a |  b   
---+------
 1 | cote
(1 row)

SELECT * FROM test4 WHERE b = 'cote' COLLATE ignore_accents;
 a |  b   
---+------
 1 | cote
 2 | côte
 3 | coté
 4 | côté
(4 rows)

SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents;  -- still case-sensitive
 a | b 
---+---
(0 rows)

SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
 a |  b   
---+------
 1 | cote
(1 row)

-- foreign keys (should use collation of primary key)
-- PK is case-sensitive, FK is case-insensitive
CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY);
INSERT INTO test10pk VALUES ('abc'), ('def'), ('ghi');
CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE);
INSERT INTO test10fk VALUES ('abc');  -- ok
INSERT INTO test10fk VALUES ('ABC');  -- error
ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
DETAIL:  Key (x)=(ABC) is not present in table "test10pk".
INSERT INTO test10fk VALUES ('xyz');  -- error
ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
DETAIL:  Key (x)=(xyz) is not present in table "test10pk".
SELECT * FROM test10pk;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT * FROM test10fk;
  x  
-----
 abc
(1 row)

-- restrict update even though the values are "equal" in the FK table
UPDATE test10fk SET x = 'ABC' WHERE x = 'abc';  -- error
ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
DETAIL:  Key (x)=(ABC) is not present in table "test10pk".
SELECT * FROM test10fk;
  x  
-----
 abc
(1 row)

DELETE FROM test10pk WHERE x = 'abc';
SELECT * FROM test10pk;
  x  
-----
 def
 ghi
(2 rows)

SELECT * FROM test10fk;
 x 
---
(0 rows)

-- PK is case-insensitive, FK is case-sensitive
CREATE TABLE test11pk (x text COLLATE case_insensitive PRIMARY KEY);
INSERT INTO test11pk VALUES ('abc'), ('def'), ('ghi');
CREATE TABLE test11fk (x text COLLATE case_sensitive REFERENCES test11pk (x) ON UPDATE CASCADE ON DELETE CASCADE);
INSERT INTO test11fk VALUES ('abc');  -- ok
INSERT INTO test11fk VALUES ('ABC');  -- ok
INSERT INTO test11fk VALUES ('xyz');  -- error
ERROR:  insert or update on table "test11fk" violates foreign key constraint "test11fk_x_fkey"
DETAIL:  Key (x)=(xyz) is not present in table "test11pk".
SELECT * FROM test11pk;
  x  
-----
 abc
 def
 ghi
(3 rows)

SELECT * FROM test11fk;
  x  
-----
 abc
 ABC
(2 rows)

-- cascade update even though the values are "equal" in the PK table
UPDATE test11pk SET x = 'ABC' WHERE x = 'abc';
SELECT * FROM test11fk;
  x  
-----
 ABC
 ABC
(2 rows)

DELETE FROM test11pk WHERE x = 'abc';
SELECT * FROM test11pk;
  x  
-----
 def
 ghi
(2 rows)

SELECT * FROM test11fk;
 x 
---
(0 rows)

-- partitioning
CREATE TABLE test20 (a int, b text COLLATE case_insensitive) PARTITION BY LIST (b);
CREATE TABLE test20_1 PARTITION OF test20 FOR VALUES IN ('abc');
INSERT INTO test20 VALUES (1, 'abc');
INSERT INTO test20 VALUES (2, 'ABC');
SELECT * FROM test20_1;
 a |  b  
---+-----
 1 | abc
 2 | ABC
(2 rows)

CREATE TABLE test21 (a int, b text COLLATE case_insensitive) PARTITION BY RANGE (b);
CREATE TABLE test21_1 PARTITION OF test21 FOR VALUES FROM ('ABC') TO ('DEF');
INSERT INTO test21 VALUES (1, 'abc');
INSERT INTO test21 VALUES (2, 'ABC');
SELECT * FROM test21_1;
 a |  b  
---+-----
 1 | abc
 2 | ABC
(2 rows)

CREATE TABLE test22 (a int, b text COLLATE case_sensitive) PARTITION BY HASH (b);
CREATE TABLE test22_0 PARTITION OF test22 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test22_1 PARTITION OF test22 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test22 VALUES (1, 'def');
INSERT INTO test22 VALUES (2, 'DEF');
-- they end up in different partitions
SELECT (SELECT count(*) FROM test22_0) = (SELECT count(*) FROM test22_1);
 ?column? 
----------
 t
(1 row)

-- same with arrays
CREATE TABLE test22a (a int, b text[] COLLATE case_sensitive) PARTITION BY HASH (b);
CREATE TABLE test22a_0 PARTITION OF test22a FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test22a_1 PARTITION OF test22a FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test22a VALUES (1, ARRAY['def']);
INSERT INTO test22a VALUES (2, ARRAY['DEF']);
-- they end up in different partitions
SELECT (SELECT count(*) FROM test22a_0) = (SELECT count(*) FROM test22a_1);
 ?column? 
----------
 t
(1 row)

CREATE TABLE test23 (a int, b text COLLATE case_insensitive) PARTITION BY HASH (b);
CREATE TABLE test23_0 PARTITION OF test23 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test23_1 PARTITION OF test23 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test23 VALUES (1, 'def');
INSERT INTO test23 VALUES (2, 'DEF');
-- they end up in the same partition (but it's platform-dependent which one)
SELECT (SELECT count(*) FROM test23_0) <> (SELECT count(*) FROM test23_1);
 ?column? 
----------
 t
(1 row)

-- same with arrays
CREATE TABLE test23a (a int, b text[] COLLATE case_insensitive) PARTITION BY HASH (b);
CREATE TABLE test23a_0 PARTITION OF test23a FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test23a_1 PARTITION OF test23a FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test23a VALUES (1, ARRAY['def']);
INSERT INTO test23a VALUES (2, ARRAY['DEF']);
-- they end up in the same partition (but it's platform-dependent which one)
SELECT (SELECT count(*) FROM test23a_0) <> (SELECT count(*) FROM test23a_1);
 ?column? 
----------
 t
(1 row)

CREATE TABLE test30 (a int, b char(3) COLLATE case_insensitive) PARTITION BY LIST (b);
CREATE TABLE test30_1 PARTITION OF test30 FOR VALUES IN ('abc');
INSERT INTO test30 VALUES (1, 'abc');
INSERT INTO test30 VALUES (2, 'ABC');
SELECT * FROM test30_1;
 a |  b  
---+-----
 1 | abc
 2 | ABC
(2 rows)

CREATE TABLE test31 (a int, b char(3) COLLATE case_insensitive) PARTITION BY RANGE (b);
CREATE TABLE test31_1 PARTITION OF test31 FOR VALUES FROM ('ABC') TO ('DEF');
INSERT INTO test31 VALUES (1, 'abc');
INSERT INTO test31 VALUES (2, 'ABC');
SELECT * FROM test31_1;
 a |  b  
---+-----
 1 | abc
 2 | ABC
(2 rows)

CREATE TABLE test32 (a int, b char(3) COLLATE case_sensitive) PARTITION BY HASH (b);
CREATE TABLE test32_0 PARTITION OF test32 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test32_1 PARTITION OF test32 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test32 VALUES (1, 'def');
INSERT INTO test32 VALUES (2, 'DEF');
-- they end up in different partitions
SELECT (SELECT count(*) FROM test32_0) = (SELECT count(*) FROM test32_1);
 ?column? 
----------
 t
(1 row)

CREATE TABLE test33 (a int, b char(3) COLLATE case_insensitive) PARTITION BY HASH (b);
CREATE TABLE test33_0 PARTITION OF test33 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
CREATE TABLE test33_1 PARTITION OF test33 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
INSERT INTO test33 VALUES (1, 'def');
INSERT INTO test33 VALUES (2, 'DEF');
-- they end up in the same partition (but it's platform-dependent which one)
SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
 ?column? 
----------
 t
(1 row)

-- cleanup
RESET search_path;
SET client_min_messages TO warning;
DROP SCHEMA collate_tests CASCADE;
RESET client_min_messages;
-- leave a collation for pg_upgrade test
CREATE COLLATION coll_icu_upgrade FROM "und-x-icu";
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								/*
 								 * This test is for ICU collations.
 								 */
-												Run UTF8-requiring collation tests by default

The tests collate.icu.utf8 and collate.linux.utf8 were previously only
run when explicitly selected via EXTRA_TESTS.  They require a UTF8
database, because the error messages in the expected files refer to
that, and they use some non-ASCII characters in the tests.  Since
users can select any locale and encoding for the regression test run,
it was not possible to include these tests automatically.

To fix, use psql's \if facility to check various prerequisites such as
platform and the server encoding and quit the tests at the very
beginning if the configuration is not adequate.  We then need to
maintain alternative expected files for these tests, but they are very
tiny and never need to change after this.

These two tests are now run automatically as part of the regression
tests.

Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://www.postgresql.org/message-id/flat/052295c2-a2e1-9a21-bd36-8fbff8686cf3%402ndquadrant.com

											
										
										
											2019-07-31 09:42:15 +02:00
+								/* skip test if not UTF8 server encoding or no ICU collations installed */
 								SELECT getdatabaseencoding() <> 'UTF8' OR
-												Fix tests for non-ICU build

missed in 0d21d4b9bc

											
										
										
											2023-03-10 14:27:55 +01:00
+								       (SELECT count(*) FROM pg_collation WHERE collprovider = 'i' AND collname <> 'unicode') = 0
-												Run UTF8-requiring collation tests by default

The tests collate.icu.utf8 and collate.linux.utf8 were previously only
run when explicitly selected via EXTRA_TESTS.  They require a UTF8
database, because the error messages in the expected files refer to
that, and they use some non-ASCII characters in the tests.  Since
users can select any locale and encoding for the regression test run,
it was not possible to include these tests automatically.

To fix, use psql's \if facility to check various prerequisites such as
platform and the server encoding and quit the tests at the very
beginning if the configuration is not adequate.  We then need to
maintain alternative expected files for these tests, but they are very
tiny and never need to change after this.

These two tests are now run automatically as part of the regression
tests.

Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://www.postgresql.org/message-id/flat/052295c2-a2e1-9a21-bd36-8fbff8686cf3%402ndquadrant.com

											
										
										
											2019-07-31 09:42:15 +02:00
+								       AS skip_test \gset
 								\if :skip_test
 								\quit
 								\endif
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								SET client_encoding TO UTF8;
 								CREATE SCHEMA collate_tests;
 								SET search_path = collate_tests;
 								CREATE TABLE collate_test1 (
 								    a int,
 								    b text COLLATE "en-x-icu" NOT NULL
 								);
 								\d collate_test1
 								        Table "collate_tests.collate_test1"
 								 Column |  Type   | Collation | Nullable | Default
 								--------+---------+-----------+----------+---------
 								 a      | integer |           |          |
 								 b      | text    | en-x-icu  | not null |
 								CREATE TABLE collate_test_fail (
 								    a int,
 								    b text COLLATE "ja_JP.eucjp-x-icu"
 								);
 								ERROR:  collation "ja_JP.eucjp-x-icu" for encoding "UTF8" does not exist
 								LINE 3:     b text COLLATE "ja_JP.eucjp-x-icu"
 								                   ^
 								CREATE TABLE collate_test_fail (
 								    a int,
 								    b text COLLATE "foo-x-icu"
 								);
 								ERROR:  collation "foo-x-icu" for encoding "UTF8" does not exist
 								LINE 3:     b text COLLATE "foo-x-icu"
 								                   ^
 								CREATE TABLE collate_test_fail (
 								    a int COLLATE "en-x-icu",
 								    b text
 								);
 								ERROR:  collations are not supported by type integer
 								LINE 2:     a int COLLATE "en-x-icu",
 								                  ^
 								CREATE TABLE collate_test_like (
 								    LIKE collate_test1
 								);
 								\d collate_test_like
 								      Table "collate_tests.collate_test_like"
 								 Column |  Type   | Collation | Nullable | Default
 								--------+---------+-----------+----------+---------
 								 a      | integer |           |          |
 								 b      | text    | en-x-icu  | not null |
 								CREATE TABLE collate_test2 (
 								    a int,
 								    b text COLLATE "sv-x-icu"
 								);
 								CREATE TABLE collate_test3 (
 								    a int,
 								    b text COLLATE "C"
 								);
 								INSERT INTO collate_test1 VALUES (1, 'abc'), (2, 'äbc'), (3, 'bbc'), (4, 'ABC');
 								INSERT INTO collate_test2 SELECT * FROM collate_test1;
 								INSERT INTO collate_test3 SELECT * FROM collate_test1;
 								SELECT * FROM collate_test1 WHERE b >= 'bbc';
 								 a |  b
 								---+-----
 								 3 | bbc
 								(1 row)
 								SELECT * FROM collate_test2 WHERE b >= 'bbc';
 								 a |  b
 								---+-----
 								 2 | äbc
 								 3 | bbc
 								(2 rows)
 								SELECT * FROM collate_test3 WHERE b >= 'bbc';
 								 a |  b
 								---+-----
 								 2 | äbc
 								 3 | bbc
 								(2 rows)
 								SELECT * FROM collate_test3 WHERE b >= 'BBC';
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								(3 rows)
 								SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 								 a |  b
 								---+-----
 								 2 | äbc
 								 3 | bbc
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b >= 'bbc' COLLATE "C";
 								 a |  b
 								---+-----
 								 2 | äbc
 								 3 | bbc
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "C";
 								 a |  b
 								---+-----
 								 2 | äbc
 								 3 | bbc
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "en-x-icu";
 								ERROR:  collation mismatch between explicit collations "C" and "en-x-icu"
 								LINE 1: ...* FROM collate_test1 WHERE b COLLATE "C" >= 'bbc' COLLATE "e...
 								                                                             ^
 								CREATE DOMAIN testdomain_sv AS text COLLATE "sv-x-icu";
 								CREATE DOMAIN testdomain_i AS int COLLATE "sv-x-icu"; -- fails
 								ERROR:  collations are not supported by type integer
 								CREATE TABLE collate_test4 (
 								    a int,
 								    b testdomain_sv
 								);
 								INSERT INTO collate_test4 SELECT * FROM collate_test1;
 								SELECT a, b FROM collate_test4 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								CREATE TABLE collate_test5 (
 								    a int,
 								    b testdomain_sv COLLATE "en-x-icu"
 								);
 								INSERT INTO collate_test5 SELECT * FROM collate_test1;
 								SELECT a, b FROM collate_test5 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, b FROM collate_test1 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, b FROM collate_test2 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b FROM collate_test3 ORDER BY b;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								-- star expansion
 								SELECT * FROM collate_test1 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT * FROM collate_test2 ORDER BY b;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT * FROM collate_test3 ORDER BY b;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								-- constant expression folding
 								SELECT 'bbc' COLLATE "en-x-icu" > 'äbc' COLLATE "en-x-icu" AS "true";
 								 true
 								------
 								 t
 								(1 row)
 								SELECT 'bbc' COLLATE "sv-x-icu" > 'äbc' COLLATE "sv-x-icu" AS "false";
 								 false
 								-------
 								 f
 								(1 row)
 								-- upper/lower
 								CREATE TABLE collate_test10 (
 								    a int,
 								    x text COLLATE "en-x-icu",
 								    y text COLLATE "tr-x-icu"
 								);
 								INSERT INTO collate_test10 VALUES (1, 'hij', 'hij'), (2, 'HIJ', 'HIJ');
 								SELECT a, lower(x), lower(y), upper(x), upper(y), initcap(x), initcap(y) FROM collate_test10;
 								 a | lower | lower | upper | upper | initcap | initcap
 								---+-------+-------+-------+-------+---------+---------
 								 1 | hij   | hij   | HIJ   | HİJ   | Hij     | Hij
 								 2 | hij   | hıj   | HIJ   | HIJ   | Hij     | Hıj
 								(2 rows)
 								SELECT a, lower(x COLLATE "C"), lower(y COLLATE "C") FROM collate_test10;
 								 a | lower | lower
 								---+-------+-------
 								 1 | hij   | hij
 								 2 | hij   | hij
 								(2 rows)
 								SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
 								 a |  x  |  y
 								---+-----+-----
 								 2 | HIJ | HIJ
 								 1 | hij | hij
 								(2 rows)
 								-- LIKE/ILIKE
 								SELECT * FROM collate_test1 WHERE b LIKE 'abc';
 								 a |  b
 								---+-----
 								 1 | abc
 								(1 row)
 								SELECT * FROM collate_test1 WHERE b LIKE 'abc%';
 								 a |  b
 								---+-----
 								 1 | abc
 								(1 row)
 								SELECT * FROM collate_test1 WHERE b LIKE '%bc%';
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								(3 rows)
 								SELECT * FROM collate_test1 WHERE b ILIKE 'abc';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b ILIKE 'abc%';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b ILIKE '%bc%';
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								 4 | ABC
 								(4 rows)
 								SELECT 'Türkiye' COLLATE "en-x-icu" ILIKE '%KI%' AS "true";
 								 true
 								------
 								 t
 								(1 row)
 								SELECT 'Türkiye' COLLATE "tr-x-icu" ILIKE '%KI%' AS "false";
 								 false
 								-------
 								 f
 								(1 row)
 								SELECT 'bıt' ILIKE 'BIT' COLLATE "en-x-icu" AS "false";
 								 false
 								-------
 								 f
 								(1 row)
 								SELECT 'bıt' ILIKE 'BIT' COLLATE "tr-x-icu" AS "true";
 								 true
 								------
 								 t
 								(1 row)
 								-- The following actually exercises the selectivity estimation for ILIKE.
 								SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 								 relname
 								---------
 								(0 rows)
 								-- regular expressions
 								SELECT * FROM collate_test1 WHERE b ~ '^abc$';
 								 a |  b
 								---+-----
 								 1 | abc
 								(1 row)
 								SELECT * FROM collate_test1 WHERE b ~ '^abc';
 								 a |  b
 								---+-----
 								 1 | abc
 								(1 row)
 								SELECT * FROM collate_test1 WHERE b ~ 'bc';
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								(3 rows)
 								SELECT * FROM collate_test1 WHERE b ~* '^abc$';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b ~* '^abc';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								SELECT * FROM collate_test1 WHERE b ~* 'bc';
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								 4 | ABC
 								(4 rows)
 								CREATE TABLE collate_test6 (
 								    a int,
 								    b text COLLATE "en-x-icu"
 								);
 								INSERT INTO collate_test6 VALUES (1, 'abc'), (2, 'ABC'), (3, '123'), (4, 'ab1'),
 								                                 (5, 'a1!'), (6, 'a c'), (7, '!.;'), (8, '   '),
 								                                 (9, 'äbç'), (10, 'ÄBÇ');
 								SELECT b,
 								       b ~ '^[[:alpha:]]+$' AS is_alpha,
 								       b ~ '^[[:upper:]]+$' AS is_upper,
 								       b ~ '^[[:lower:]]+$' AS is_lower,
 								       b ~ '^[[:digit:]]+$' AS is_digit,
 								       b ~ '^[[:alnum:]]+$' AS is_alnum,
 								       b ~ '^[[:graph:]]+$' AS is_graph,
 								       b ~ '^[[:print:]]+$' AS is_print,
 								       b ~ '^[[:punct:]]+$' AS is_punct,
 								       b ~ '^[[:space:]]+$' AS is_space
 								FROM collate_test6;
 								  b  | is_alpha | is_upper | is_lower | is_digit | is_alnum | is_graph | is_print | is_punct | is_space
 								-----+----------+----------+----------+----------+----------+----------+----------+----------+----------
 								 abc | t        | f        | t        | f        | t        | t        | t        | f        | f
 								 ABC | t        | t        | f        | f        | t        | t        | t        | f        | f
 								 123 | f        | f        | f        | t        | t        | t        | t        | f        | f
 								 ab1 | f        | f        | f        | f        | t        | t        | t        | f        | f
 								 a1! | f        | f        | f        | f        | f        | t        | t        | f        | f
 								 a c | f        | f        | f        | f        | f        | f        | t        | f        | f
 								 !.; | f        | f        | f        | f        | f        | t        | t        | t        | f
 								     | f        | f        | f        | f        | f        | f        | t        | f        | t
 								 äbç | t        | f        | t        | f        | t        | t        | t        | f        | f
 								 ÄBÇ | t        | t        | f        | f        | t        | t        | t        | f        | f
 								(10 rows)
 								SELECT 'Türkiye' COLLATE "en-x-icu" ~* 'KI' AS "true";
 								 true
 								------
 								 t
 								(1 row)
 								SELECT 'Türkiye' COLLATE "tr-x-icu" ~* 'KI' AS "true";  -- true with ICU
 								 true
 								------
 								 t
 								(1 row)
 								SELECT 'bıt' ~* 'BIT' COLLATE "en-x-icu" AS "false";
 								 false
 								-------
 								 f
 								(1 row)
 								SELECT 'bıt' ~* 'BIT' COLLATE "tr-x-icu" AS "false";  -- false with ICU
 								 false
 								-------
 								 f
 								(1 row)
 								-- The following actually exercises the selectivity estimation for ~*.
 								SELECT relname FROM pg_class WHERE relname ~* '^abc';
 								 relname
 								---------
 								(0 rows)
 								/* not run by default because it requires tr_TR system locale
 								-- to_char
 								SET lc_time TO 'tr_TR';
 								SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
 								SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr-x-icu");
 								*/
 								-- backwards parsing
 								CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 								CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
 								CREATE VIEW collview3 AS SELECT a, lower((x || x) COLLATE "C") FROM collate_test10;
 								SELECT table_name, view_definition FROM information_schema.views
 								  WHERE table_name LIKE 'collview%' ORDER BY 1;
-												Get rid of the "new" and "old" entries in a view's rangetable.

The rule system needs "old" and/or "new" pseudo-RTEs in rule actions
that are ON INSERT/UPDATE/DELETE.  Historically it's put such entries
into the ON SELECT rules of views as well, but those are really quite
vestigial.  The only thing we've used them for is to carry the
view's relid forward to AcquireExecutorLocks (so that we can
re-lock the view to verify it hasn't changed before re-using a plan)
and to carry its relid and permissions data forward to execution-time
permissions checks.  What we can do instead of that is to retain
these fields of the RTE_RELATION RTE for the view even after we
convert it to an RTE_SUBQUERY RTE.  This requires a tiny amount of
extra complication in the planner and AcquireExecutorLocks, but on
the other hand we can get rid of the logic that moves that data from
one place to another.

The principal immediate benefit of doing this, aside from a small
saving in the pg_rewrite data for views, is that these pseudo-RTEs
no longer trigger ruleutils.c's heuristic about qualifying variable
names when the rangetable's length is more than 1.  That results
in quite a number of small simplifications in regression test outputs,
which are all to the good IMO.

Bump catversion because we need to dump a few more fields of
RTE_SUBQUERY RTEs.  While those will always be zeroes anyway in
stored rules (because we'd never populate them until query rewrite)
they are useful for debugging, and it seems like we'd better make
sure to transmit such RTEs accurately in plans sent to parallel
workers.  I don't think the executor actually examines these fields
after startup, but someday it might.

This is a second attempt at committing 1b4d280ea.  The difference
from the first time is that now we can add some filtering rules to
AdjustUpgrade.pm to allow cross-version upgrade testing to pass
despite all the cosmetic changes in CREATE VIEW outputs.

Amit Langote (filtering rules by me)

Discussion: https://postgr.es/m/CA+HiwqEf7gPN4Hn+LoZ4tP2q_Qt7n3vw7-6fJKOf92tSEnX6Gg@mail.gmail.com
Discussion: https://postgr.es/m/891521.1673657296@sss.pgh.pa.us

											
										
										
											2023-01-18 19:23:57 +01:00
+								 table_name |              view_definition
 								------------+--------------------------------------------
 								 collview1  |  SELECT a,                                +
 								            |     b                                     +
 								            |    FROM collate_test1                     +
 								            |   WHERE ((b COLLATE "C") >= 'bbc'::text);
 								 collview2  |  SELECT a,                                +
 								            |     b                                     +
 								            |    FROM collate_test1                     +
 								            |   ORDER BY (b COLLATE "C");
 								 collview3  |  SELECT a,                                +
 								            |     lower(((x || x) COLLATE "C")) AS lower+
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								            |    FROM collate_test10;
 								(3 rows)
 								-- collation propagation in various expression types
 								SELECT a, coalesce(b, 'foo') FROM collate_test1 ORDER BY 2;
 								 a | coalesce
 								---+----------
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, coalesce(b, 'foo') FROM collate_test2 ORDER BY 2;
 								 a | coalesce
 								---+----------
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, coalesce(b, 'foo') FROM collate_test3 ORDER BY 2;
 								 a | coalesce
 								---+----------
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, lower(coalesce(x, 'foo')), lower(coalesce(y, 'foo')) FROM collate_test10;
 								 a | lower | lower
 								---+-------+-------
 								 1 | hij   | hij
 								 2 | hij   | hıj
 								(2 rows)
 								SELECT a, b, greatest(b, 'CCC') FROM collate_test1 ORDER BY 3;
 								 a |  b  | greatest
 								---+-----+----------
 								 1 | abc | CCC
 								 2 | äbc | CCC
 								 3 | bbc | CCC
 								 4 | ABC | CCC
 								(4 rows)
 								SELECT a, b, greatest(b, 'CCC') FROM collate_test2 ORDER BY 3;
 								 a |  b  | greatest
 								---+-----+----------
 								 1 | abc | CCC
 								 3 | bbc | CCC
 								 4 | ABC | CCC
 								 2 | äbc | äbc
 								(4 rows)
 								SELECT a, b, greatest(b, 'CCC') FROM collate_test3 ORDER BY 3;
 								 a |  b  | greatest
 								---+-----+----------
 								 4 | ABC | CCC
 								 1 | abc | abc
 								 3 | bbc | bbc
 								 2 | äbc | äbc
 								(4 rows)
 								SELECT a, x, y, lower(greatest(x, 'foo')), lower(greatest(y, 'foo')) FROM collate_test10;
 								 a |  x  |  y  | lower | lower
 								---+-----+-----+-------+-------
 								 1 | hij | hij | hij   | hij
 								 2 | HIJ | HIJ | hij   | hıj
 								(2 rows)
 								SELECT a, nullif(b, 'abc') FROM collate_test1 ORDER BY 2;
 								 a | nullif
 								---+--------
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								 1 |
 								(4 rows)
 								SELECT a, nullif(b, 'abc') FROM collate_test2 ORDER BY 2;
 								 a | nullif
 								---+--------
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								 1 |
 								(4 rows)
 								SELECT a, nullif(b, 'abc') FROM collate_test3 ORDER BY 2;
 								 a | nullif
 								---+--------
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								 1 |
 								(4 rows)
 								SELECT a, lower(nullif(x, 'foo')), lower(nullif(y, 'foo')) FROM collate_test10;
 								 a | lower | lower
 								---+-------+-------
 								 1 | hij   | hij
 								 2 | hij   | hıj
 								(2 rows)
 								SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test1 ORDER BY 2;
 								 a |  b
 								---+------
 								 4 | ABC
 								 2 | äbc
 								 1 | abcd
 								 3 | bbc
 								(4 rows)
 								SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test2 ORDER BY 2;
 								 a |  b
 								---+------
 								 4 | ABC
 								 1 | abcd
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, CASE b WHEN 'abc' THEN 'abcd' ELSE b END FROM collate_test3 ORDER BY 2;
 								 a |  b
 								---+------
 								 4 | ABC
 								 1 | abcd
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								CREATE DOMAIN testdomain AS text;
 								SELECT a, b::testdomain FROM collate_test1 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, b::testdomain FROM collate_test2 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b::testdomain FROM collate_test3 ORDER BY 2;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b::testdomain_sv FROM collate_test3 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, lower(x::testdomain), lower(y::testdomain) FROM collate_test10;
 								 a | lower | lower
 								---+-------+-------
 								 1 | hij   | hij
 								 2 | hij   | hıj
 								(2 rows)
 								SELECT min(b), max(b) FROM collate_test1;
 								 min | max
 								-----+-----
 								 abc | bbc
 								(1 row)
 								SELECT min(b), max(b) FROM collate_test2;
 								 min | max
 								-----+-----
 								 abc | äbc
 								(1 row)
 								SELECT min(b), max(b) FROM collate_test3;
 								 min | max
 								-----+-----
 								 ABC | äbc
 								(1 row)
 								SELECT array_agg(b ORDER BY b) FROM collate_test1;
 								     array_agg
 								-------------------
 								 {abc,ABC,äbc,bbc}
 								(1 row)
 								SELECT array_agg(b ORDER BY b) FROM collate_test2;
 								     array_agg
 								-------------------
 								 {abc,ABC,bbc,äbc}
 								(1 row)
 								SELECT array_agg(b ORDER BY b) FROM collate_test3;
 								     array_agg
 								-------------------
 								 {ABC,abc,bbc,äbc}
 								(1 row)
 								SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test1 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 1 | abc
 								 4 | ABC
 								 4 | ABC
 								 2 | äbc
 								 2 | äbc
 								 3 | bbc
 								 3 | bbc
 								(8 rows)
 								SELECT a, b FROM collate_test2 UNION SELECT a, b FROM collate_test2 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b FROM collate_test3 WHERE a < 4 INTERSECT SELECT a, b FROM collate_test3 WHERE a > 1 ORDER BY 2;
 								 a |  b
 								---+-----
 								 3 | bbc
 								 2 | äbc
 								(2 rows)
 								SELECT a, b FROM collate_test3 EXCEPT SELECT a, b FROM collate_test3 WHERE a < 2 ORDER BY 2;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(3 rows)
 								SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
 								ERROR:  could not determine which collation to use for string comparison
 								HINT:  Use the COLLATE clause to set the collation explicitly.
 								SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3; -- ok
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								 4 | ABC
 								 1 | abc
 								 2 | äbc
 								 3 | bbc
 								 4 | ABC
 								(8 rows)
 								SELECT a, b FROM collate_test1 UNION SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
 								ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
 								LINE 1: SELECT a, b FROM collate_test1 UNION SELECT a, b FROM collat...
 								                                                       ^
 								HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
 								SELECT a, b COLLATE "C" FROM collate_test1 UNION SELECT a, b FROM collate_test3 ORDER BY 2; -- ok
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, b FROM collate_test1 INTERSECT SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
 								ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
 								LINE 1: ...ELECT a, b FROM collate_test1 INTERSECT SELECT a, b FROM col...
 								                                                             ^
 								HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
 								SELECT a, b FROM collate_test1 EXCEPT SELECT a, b FROM collate_test3 ORDER BY 2; -- fail
 								ERROR:  collation mismatch between implicit collations "en-x-icu" and "C"
 								LINE 1: SELECT a, b FROM collate_test1 EXCEPT SELECT a, b FROM colla...
 								                                                        ^
 								HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
 								CREATE TABLE test_u AS SELECT a, b FROM collate_test1 UNION ALL SELECT a, b FROM collate_test3; -- fail
 								ERROR:  no collation was derived for column "b" with collatable type text
 								HINT:  Use the COLLATE clause to set the collation explicitly.
 								-- ideally this would be a parse-time error, but for now it must be run-time:
 								select x < y from collate_test10; -- fail
 								ERROR:  could not determine which collation to use for string comparison
 								HINT:  Use the COLLATE clause to set the collation explicitly.
 								select x || y from collate_test10; -- ok, because || is not collation aware
 								 ?column?
 								----------
 								 hijhij
 								 HIJHIJ
 								(2 rows)
 								select x, y from collate_test10 order by x || y; -- not so ok
 								ERROR:  collation mismatch between implicit collations "en-x-icu" and "tr-x-icu"
 								LINE 1: select x, y from collate_test10 order by x || y;
 								                                                      ^
 								HINT:  You can choose the collation by applying the COLLATE clause to one or both expressions.
 								-- collation mismatch between recursive and non-recursive term
 								WITH RECURSIVE foo(x) AS
 								   (SELECT x FROM (VALUES('a' COLLATE "en-x-icu"),('b')) t(x)
 								   UNION ALL
 								   SELECT (x || 'c') COLLATE "de-x-icu" FROM foo WHERE length(x) < 10)
 								SELECT * FROM foo;
 								ERROR:  recursive query "foo" column 1 has collation "en-x-icu" in non-recursive term but collation "de-x-icu" overall
 								LINE 2:    (SELECT x FROM (VALUES('a' COLLATE "en-x-icu"),('b')) t(x...
 								                   ^
 								HINT:  Use the COLLATE clause to set the collation of the non-recursive term.
 								-- casting
 								SELECT CAST('42' AS text COLLATE "C");
 								ERROR:  syntax error at or near "COLLATE"
 								LINE 1: SELECT CAST('42' AS text COLLATE "C");
 								                                 ^
 								SELECT a, CAST(b AS varchar) FROM collate_test1 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, CAST(b AS varchar) FROM collate_test2 ORDER BY 2;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, CAST(b AS varchar) FROM collate_test3 ORDER BY 2;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								-- propagation of collation in SQL functions (inlined and non-inlined cases)
 								-- and plpgsql functions too
 								CREATE FUNCTION mylt (text, text) RETURNS boolean LANGUAGE sql
 								    AS $$ select $1 < $2 $$;
 								CREATE FUNCTION mylt_noninline (text, text) RETURNS boolean LANGUAGE sql
 								    AS $$ select $1 < $2 limit 1 $$;
 								CREATE FUNCTION mylt_plpgsql (text, text) RETURNS boolean LANGUAGE plpgsql
 								    AS $$ begin return $1 < $2; end $$;
 								SELECT a.b AS a, b.b AS b, a.b < b.b AS lt,
 								       mylt(a.b, b.b), mylt_noninline(a.b, b.b), mylt_plpgsql(a.b, b.b)
 								FROM collate_test1 a, collate_test1 b
 								ORDER BY a.b, b.b;
 								  a  |  b  | lt | mylt | mylt_noninline | mylt_plpgsql
 								-----+-----+----+------+----------------+--------------
 								 abc | abc | f  | f    | f              | f
 								 abc | ABC | t  | t    | t              | t
 								 abc | äbc | t  | t    | t              | t
 								 abc | bbc | t  | t    | t              | t
 								 ABC | abc | f  | f    | f              | f
 								 ABC | ABC | f  | f    | f              | f
 								 ABC | äbc | t  | t    | t              | t
 								 ABC | bbc | t  | t    | t              | t
 								 äbc | abc | f  | f    | f              | f
 								 äbc | ABC | f  | f    | f              | f
 								 äbc | äbc | f  | f    | f              | f
 								 äbc | bbc | t  | t    | t              | t
 								 bbc | abc | f  | f    | f              | f
 								 bbc | ABC | f  | f    | f              | f
 								 bbc | äbc | f  | f    | f              | f
 								 bbc | bbc | f  | f    | f              | f
 								(16 rows)
 								SELECT a.b AS a, b.b AS b, a.b < b.b COLLATE "C" AS lt,
 								       mylt(a.b, b.b COLLATE "C"), mylt_noninline(a.b, b.b COLLATE "C"),
 								       mylt_plpgsql(a.b, b.b COLLATE "C")
 								FROM collate_test1 a, collate_test1 b
 								ORDER BY a.b, b.b;
 								  a  |  b  | lt | mylt | mylt_noninline | mylt_plpgsql
 								-----+-----+----+------+----------------+--------------
 								 abc | abc | f  | f    | f              | f
 								 abc | ABC | f  | f    | f              | f
 								 abc | äbc | t  | t    | t              | t
 								 abc | bbc | t  | t    | t              | t
 								 ABC | abc | t  | t    | t              | t
 								 ABC | ABC | f  | f    | f              | f
 								 ABC | äbc | t  | t    | t              | t
 								 ABC | bbc | t  | t    | t              | t
 								 äbc | abc | f  | f    | f              | f
 								 äbc | ABC | f  | f    | f              | f
 								 äbc | äbc | f  | f    | f              | f
 								 äbc | bbc | f  | f    | f              | f
 								 bbc | abc | f  | f    | f              | f
 								 bbc | ABC | f  | f    | f              | f
 								 bbc | äbc | t  | t    | t              | t
 								 bbc | bbc | f  | f    | f              | f
 								(16 rows)
 								-- collation override in plpgsql
 								CREATE FUNCTION mylt2 (x text, y text) RETURNS boolean LANGUAGE plpgsql AS $$
 								declare
 								  xx text := x;
 								  yy text := y;
 								begin
 								  return xx < yy;
 								end
 								$$;
 								SELECT mylt2('a', 'B' collate "en-x-icu") as t, mylt2('a', 'B' collate "C") as f;
 								 t | f
 								---+---
 								 t | f
 								(1 row)
 								CREATE OR REPLACE FUNCTION
 								  mylt2 (x text, y text) RETURNS boolean LANGUAGE plpgsql AS $$
 								declare
 								  xx text COLLATE "POSIX" := x;
 								  yy text := y;
 								begin
 								  return xx < yy;
 								end
 								$$;
 								SELECT mylt2('a', 'B') as f;
 								 f
 								---
 								 f
 								(1 row)
 								SELECT mylt2('a', 'B' collate "C") as fail; -- conflicting collations
 								ERROR:  could not determine which collation to use for string comparison
 								HINT:  Use the COLLATE clause to set the collation explicitly.
 								CONTEXT:  PL/pgSQL function mylt2(text,text) line 6 at RETURN
 								SELECT mylt2('a', 'B' collate "POSIX") as f;
 								 f
 								---
 								 f
 								(1 row)
 								-- polymorphism
 								SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test1)) ORDER BY 1;
 								 unnest
 								--------
 								 abc
 								 ABC
 								 äbc
 								 bbc
 								(4 rows)
 								SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test2)) ORDER BY 1;
 								 unnest
 								--------
 								 abc
 								 ABC
 								 bbc
 								 äbc
 								(4 rows)
 								SELECT * FROM unnest((SELECT array_agg(b ORDER BY b) FROM collate_test3)) ORDER BY 1;
 								 unnest
 								--------
 								 ABC
 								 abc
 								 bbc
 								 äbc
 								(4 rows)
 								CREATE FUNCTION dup (anyelement) RETURNS anyelement
 								    AS 'select $1' LANGUAGE sql;
 								SELECT a, dup(b) FROM collate_test1 ORDER BY 2;
 								 a | dup
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
 								SELECT a, dup(b) FROM collate_test2 ORDER BY 2;
 								 a | dup
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								SELECT a, dup(b) FROM collate_test3 ORDER BY 2;
 								 a | dup
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
 								-- indexes
 								CREATE INDEX collate_test1_idx1 ON collate_test1 (b);
 								CREATE INDEX collate_test1_idx2 ON collate_test1 (b COLLATE "C");
 								CREATE INDEX collate_test1_idx3 ON collate_test1 ((b COLLATE "C")); -- this is different grammatically
 								CREATE INDEX collate_test1_idx4 ON collate_test1 (((b||'foo') COLLATE "POSIX"));
 								CREATE INDEX collate_test1_idx5 ON collate_test1 (a COLLATE "C"); -- fail
 								ERROR:  collations are not supported by type integer
 								CREATE INDEX collate_test1_idx6 ON collate_test1 ((a COLLATE "C")); -- fail
 								ERROR:  collations are not supported by type integer
 								LINE 1: ...ATE INDEX collate_test1_idx6 ON collate_test1 ((a COLLATE "C...
 								                                                             ^
 								SELECT relname, pg_get_indexdef(oid) FROM pg_class WHERE relname LIKE 'collate_test%_idx%' ORDER BY 1;
-												Avoid using unsafe search_path settings during dump and restore.

Historically, pg_dump has "set search_path = foo, pg_catalog" when
dumping an object in schema "foo", and has also caused that setting
to be used while restoring the object.  This is problematic because
functions and operators in schema "foo" could capture references meant
to refer to pg_catalog entries, both in the queries issued by pg_dump
and those issued during the subsequent restore run.  That could
result in dump/restore misbehavior, or in privilege escalation if a
nefarious user installs trojan-horse functions or operators.

This patch changes pg_dump so that it does not change the search_path
dynamically.  The emitted restore script sets the search_path to what
was used at dump time, and then leaves it alone thereafter.  Created
objects are placed in the correct schema, regardless of the active
search_path, by dint of schema-qualifying their names in the CREATE
commands, as well as in subsequent ALTER and ALTER-like commands.

Since this change requires a change in the behavior of pg_restore
when processing an archive file made according to this new convention,
bump the archive file version number; old versions of pg_restore will
therefore refuse to process files made with new versions of pg_dump.

Security: CVE-2018-1058

											
										
										
											2018-02-26 16:18:21 +01:00
+								      relname       |                                                  pg_get_indexdef
 								--------------------+-------------------------------------------------------------------------------------------------------------------
 								 collate_test1_idx1 | CREATE INDEX collate_test1_idx1 ON collate_tests.collate_test1 USING btree (b)
 								 collate_test1_idx2 | CREATE INDEX collate_test1_idx2 ON collate_tests.collate_test1 USING btree (b COLLATE "C")
 								 collate_test1_idx3 | CREATE INDEX collate_test1_idx3 ON collate_tests.collate_test1 USING btree (b COLLATE "C")
 								 collate_test1_idx4 | CREATE INDEX collate_test1_idx4 ON collate_tests.collate_test1 USING btree (((b || 'foo'::text)) COLLATE "POSIX")
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								(4 rows)
-												Fix planner's test for case-foldable characters in ILIKE with ICU.

As coded, the ICU-collation path in pattern_char_isalpha() failed
to consider regular ASCII letters to be case-varying.  This led to
like_fixed_prefix treating too much of an ILIKE pattern as being a
fixed prefix, so that indexscans derived from an ILIKE clause might
miss entries that they should find.

Per bug #15892 from James Inform.  This is an oversight in the original
ICU patch (commit eccfef81e), so back-patch to v10 where that came in.

Discussion: https://postgr.es/m/15892-e5d2bea3e8a04a1b@postgresql.org

											
										
										
											2019-08-12 19:15:47 +02:00
+								set enable_seqscan = off;
 								explain (costs off)
 								select * from collate_test1 where b ilike 'abc';
 								          QUERY PLAN
 								-------------------------------
 								 Seq Scan on collate_test1
 								   Filter: (b ~~* 'abc'::text)
 								(2 rows)
 								select * from collate_test1 where b ilike 'abc';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								explain (costs off)
 								select * from collate_test1 where b ilike 'ABC';
 								          QUERY PLAN
 								-------------------------------
 								 Seq Scan on collate_test1
 								   Filter: (b ~~* 'ABC'::text)
 								(2 rows)
 								select * from collate_test1 where b ilike 'ABC';
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								(2 rows)
 								reset enable_seqscan;
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								-- schema manipulation commands
 								CREATE ROLE regress_test_role;
 								CREATE SCHEMA test_schema;
 								-- We need to do this this way to cope with varying names for encodings:
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								SET client_min_messages TO WARNING;
-												ICU: do not convert locale 'C' to 'en-US-u-va-posix'.

Older versions of ICU canonicalize "C" to "en-US-u-va-posix"; but
starting in ICU version 64, the "C" locale is considered
obsolete. Postgres commit ea1db8ae70 introduced code to always
canonicalize "C" to "en-US-u-va-posix" for consistency and
convenience, but it was deemed too confusing.

This commit removes that code, so that "C" is treated like other ICU
locale names: canonicalization is attempted, and if it fails, the
behavior is controlled by icu_validation_level.

A similar change was previously committed as f7faa9976c, then reverted
due to an ICU-version-dependent test failure. This commit un-reverts
it, omitting the test because we now expect the behavior to depend on
the version of ICU being used.

Discussion: https://postgr.es/m/3a200aca-4672-4b37-fc91-5d198a323503%40eisentraut.org
Discussion: https://postgr.es/m/f83f089ee1e9acd5dbbbf3353294d24e1f196e95.camel@j-davis.com
Discussion: https://postgr.es/m/37520ec1ae9591f83132f82dbd625f3fc2d69c16.camel@j-davis.com

											
										
										
											2023-06-21 22:18:25 +02:00
+								SET icu_validation_level = disabled;
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								do $$
 								BEGIN
 								  EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
-												Catalog changes preparing for builtin collation provider.

Rename pg_collation.colliculocale to colllocale, and
pg_database.daticulocale to datlocale. These names reflect that the
fields will be useful for the upcoming builtin provider as well, not
just for ICU.

This is purely a rename; no changes to the meaning of the fields.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2024-03-09 23:48:18 +01:00
+								          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN datlocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								END
 								$$;
 								CREATE COLLATION test0 FROM "C"; -- fail, duplicate name
 								ERROR:  collation "test0" already exists
 								do $$
 								BEGIN
-												Add option to use ICU as global locale provider

This adds the option to use ICU as the default locale provider for
either the whole cluster or a database.  New options for initdb,
createdb, and CREATE DATABASE are used to select this.

Since some (legacy) code still uses the libc locale facilities
directly, we still need to set the libc global locale settings even if
ICU is otherwise selected.  So pg_database now has three
locale-related fields: the existing datcollate and datctype, which are
always set, and a new daticulocale, which is only set if ICU is
selected.  A similar change is made in pg_collation for consistency,
but in that case, only the libc-related fields or the ICU-related
field is set, never both.

Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com

											
										
										
											2022-03-17 11:11:21 +01:00
+								  EXECUTE 'CREATE COLLATION test1 (provider = icu, locale = ' ||
-												Catalog changes preparing for builtin collation provider.

Rename pg_collation.colliculocale to colllocale, and
pg_database.daticulocale to datlocale. These names reflect that the
fields will be useful for the upcoming builtin provider as well, not
just for ICU.

This is purely a rename; no changes to the meaning of the fields.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2024-03-09 23:48:18 +01:00
+								          quote_literal((SELECT CASE WHEN datlocprovider='i' THEN datlocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								END
 								$$;
-												ICU: do not convert locale 'C' to 'en-US-u-va-posix'.

Older versions of ICU canonicalize "C" to "en-US-u-va-posix"; but
starting in ICU version 64, the "C" locale is considered
obsolete. Postgres commit ea1db8ae70 introduced code to always
canonicalize "C" to "en-US-u-va-posix" for consistency and
convenience, but it was deemed too confusing.

This commit removes that code, so that "C" is treated like other ICU
locale names: canonicalization is attempted, and if it fails, the
behavior is controlled by icu_validation_level.

A similar change was previously committed as f7faa9976c, then reverted
due to an ICU-version-dependent test failure. This commit un-reverts
it, omitting the test because we now expect the behavior to depend on
the version of ICU being used.

Discussion: https://postgr.es/m/3a200aca-4672-4b37-fc91-5d198a323503%40eisentraut.org
Discussion: https://postgr.es/m/f83f089ee1e9acd5dbbbf3353294d24e1f196e95.camel@j-davis.com
Discussion: https://postgr.es/m/37520ec1ae9591f83132f82dbd625f3fc2d69c16.camel@j-davis.com

											
										
										
											2023-06-21 22:18:25 +02:00
+								RESET icu_validation_level;
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								RESET client_min_messages;
-												Add option to use ICU as global locale provider

This adds the option to use ICU as the default locale provider for
either the whole cluster or a database.  New options for initdb,
createdb, and CREATE DATABASE are used to select this.

Since some (legacy) code still uses the libc locale facilities
directly, we still need to set the libc global locale settings even if
ICU is otherwise selected.  So pg_database now has three
locale-related fields: the existing datcollate and datctype, which are
always set, and a new daticulocale, which is only set if ICU is
selected.  A similar change is made in pg_collation for consistency,
but in that case, only the libc-related fields or the ICU-related
field is set, never both.

Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/5e756dd6-0e91-d778-96fd-b1bcb06c161a%402ndquadrant.com

											
										
										
											2022-03-17 11:11:21 +01:00
+								CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 								ERROR:  parameter "locale" must be specified
-												Reduce icu_validation_level default to WARNING.

Discussion: https://postgr.es/m/daa9f060aa2349ebc84444515efece49e7b32c5d.camel@j-davis.com

											
										
										
											2023-05-17 22:18:40 +02:00
+								SET icu_validation_level = ERROR;
-												Validate ICU locales.

For ICU collations, ensure that the locale's language exists in ICU,
and that the locale can be opened.

Basic validation helps avoid minor mistakes and misspellings, which
often fall back to the root locale instead of the intended
locale. It's even more important to avoid such mistakes in ICU
versions 54 and earlier, where the same (misspelled) locale string
could fall back to different locales depending on the environment.

Discussion: https://postgr.es/m/11b1eeb7e7667fdd4178497aeb796c48d26e69b9.camel@j-davis.com
Discussion: https://postgr.es/m/df2efad0cae7c65180df8e5ebb709e5eb4f2a82b.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-03-29 01:15:59 +02:00
+								CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
 								ERROR:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
-												Apply quotes more consistently to GUC names in logs

Quotes are applied to GUCs in a very inconsistent way across the code
base, with a mix of double quotes or no quotes used.  This commit
removes double quotes around all the GUC names that are obviously
referred to as parameters with non-English words (use of underscore,
mixed case, etc).

This is the result of a discussion with Álvaro Herrera, Nathan Bossart,
Laurenz Albe, Peter Eisentraut, Tom Lane and Daniel Gustafsson.

Author: Peter Smith
Discussion: https://postgr.es/m/CAHut+Pv-kSN8SkxSdoHano_wPubqcg5789ejhCDZAcLFceBR-w@mail.gmail.com

											
										
										
											2023-11-30 06:11:45 +01:00
+								HINT:  To disable ICU locale validation, set the parameter icu_validation_level to "disabled".
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
 								ERROR:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
-												Reduce icu_validation_level default to WARNING.

Discussion: https://postgr.es/m/daa9f060aa2349ebc84444515efece49e7b32c5d.camel@j-davis.com

											
										
										
											2023-05-17 22:18:40 +02:00
+								RESET icu_validation_level;
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
 								WARNING:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
-												Validate ICU locales.

For ICU collations, ensure that the locale's language exists in ICU,
and that the locale can be opened.

Basic validation helps avoid minor mistakes and misspellings, which
often fall back to the root locale instead of the intended
locale. It's even more important to avoid such mistakes in ICU
versions 54 and earlier, where the same (misspelled) locale string
could fall back to different locales depending on the environment.

Discussion: https://postgr.es/m/11b1eeb7e7667fdd4178497aeb796c48d26e69b9.camel@j-davis.com
Discussion: https://postgr.es/m/df2efad0cae7c65180df8e5ebb709e5eb4f2a82b.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-03-29 01:15:59 +02:00
+								CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
 								WARNING:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
-												Apply quotes more consistently to GUC names in logs

Quotes are applied to GUCs in a very inconsistent way across the code
base, with a mix of double quotes or no quotes used.  This commit
removes double quotes around all the GUC names that are obviously
referred to as parameters with non-English words (use of underscore,
mixed case, etc).

This is the result of a discussion with Álvaro Herrera, Nathan Bossart,
Laurenz Albe, Peter Eisentraut, Tom Lane and Daniel Gustafsson.

Author: Peter Smith
Discussion: https://postgr.es/m/CAHut+Pv-kSN8SkxSdoHano_wPubqcg5789ejhCDZAcLFceBR-w@mail.gmail.com

											
										
										
											2023-11-30 06:11:45 +01:00
+								HINT:  To disable ICU locale validation, set the parameter icu_validation_level to "disabled".
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
suffix appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								CREATE COLLATION test4 FROM nonsense;
 								ERROR:  collation "nonsense" for encoding "UTF8" does not exist
 								CREATE COLLATION test5 FROM test0;
 								SELECT collname FROM pg_collation WHERE collname LIKE 'test%' ORDER BY 1;
 								 collname
 								----------
 								 test0
 								 test1
 								 test5
 								(3 rows)
 								ALTER COLLATION test1 RENAME TO test11;
 								ALTER COLLATION test0 RENAME TO test11; -- fail
 								ERROR:  collation "test11" already exists in schema "collate_tests"
 								ALTER COLLATION test1 RENAME TO test22; -- fail
 								ERROR:  collation "test1" for encoding "UTF8" does not exist
 								ALTER COLLATION test11 OWNER TO regress_test_role;
 								ALTER COLLATION test11 OWNER TO nonsense;
 								ERROR:  role "nonsense" does not exist
 								ALTER COLLATION test11 SET SCHEMA test_schema;
 								COMMENT ON COLLATION test0 IS 'US English';
 								SELECT collname, nspname, obj_description(pg_collation.oid, 'pg_collation')
 								    FROM pg_collation JOIN pg_namespace ON (collnamespace = pg_namespace.oid)
 								    WHERE collname LIKE 'test%'
 								    ORDER BY 1;
 								 collname |    nspname    | obj_description
 								----------+---------------+-----------------
 								 test0    | collate_tests | US English
 								 test11   | test_schema   |
 								 test5    | collate_tests |
 								(3 rows)
 								DROP COLLATION test0, test_schema.test11, test5;
 								DROP COLLATION test0; -- fail
 								ERROR:  collation "test0" for encoding "UTF8" does not exist
 								DROP COLLATION IF EXISTS test0;
 								NOTICE:  collation "test0" does not exist, skipping
 								SELECT collname FROM pg_collation WHERE collname LIKE 'test%';
 								 collname
 								----------
 								(0 rows)
 								DROP SCHEMA test_schema;
 								DROP ROLE regress_test_role;
-												Revert per-index collation version tracking feature.

Design problems were discovered in the handling of composite types and
record types that would cause some relevant versions not to be recorded.
Misgivings were also expressed about the use of the pg_depend catalog
for this purpose.  We're out of time for this release so we'll revert
and try again.

Commits reverted:

1bf946bd: Doc: Document known problem with Windows collation versions.
cf002008: Remove no-longer-relevant test case.
ef387bed: Fix bogus collation-version-recording logic.
0fb0a050: Hide internal error for pg_collation_actual_version(<bad OID>).
ff942057: Suppress "warning: variable 'collcollate' set but not used".
d50e3b1f: Fix assertion in collation version lookup.
f24b1569: Rethink extraction of collation dependencies.
257836a7: Track collation versions for indexes.
cd6f479e: Add pg_depend.refobjversion.
7d1297df: Remove pg_collation.collversion.

Discussion: https://postgr.es/m/CA%2BhUKGLhj5t1fcjqAu8iD9B3ixJtsTNqyCCD4V0aTO9kAKAjjA%40mail.gmail.com

											
										
										
											2021-05-07 10:17:42 +02:00
+								-- ALTER
 								ALTER COLLATION "en-x-icu" REFRESH VERSION;
 								NOTICE:  version has not changed
-												Database-level collation version tracking

This adds to database objects the same version tracking that collation
objects have.  There is a new pg_database column datcollversion that
stores the version, a new function
pg_database_collation_actual_version() to get the version from the
operating system, and a new subcommand ALTER DATABASE ... REFRESH
COLLATION VERSION.

This was not originally added together with pg_collation.collversion,
since originally version tracking was only supported for ICU, and ICU
at the database level is not currently supported.  But we now have
version tracking for glibc (since PG13), FreeBSD (since PG14), and
Windows (since PG13), so this is useful to have now.

Reviewed-by: Julien Rouhaud <rjuju123@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/f0ff3190-29a3-5b39-a179-fa32eee57db6%40enterprisedb.com

											
										
										
											2022-02-14 08:09:04 +01:00
+								-- also test for database while we are here
 								SELECT current_database() AS datname \gset
 								ALTER DATABASE :"datname" REFRESH COLLATION VERSION;
 								NOTICE:  version has not changed
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								-- dependencies
 								CREATE COLLATION test0 FROM "C";
 								CREATE TABLE collate_dep_test1 (a int, b text COLLATE test0);
 								CREATE DOMAIN collate_dep_dom1 AS text COLLATE test0;
 								CREATE TYPE collate_dep_test2 AS (x int, y text COLLATE test0);
 								CREATE VIEW collate_dep_test3 AS SELECT text 'foo' COLLATE test0 AS foo;
 								CREATE TABLE collate_dep_test4t (a int, b text);
 								CREATE INDEX collate_dep_test4i ON collate_dep_test4t (b COLLATE test0);
 								DROP COLLATION test0 RESTRICT; -- fail
 								ERROR:  cannot drop collation test0 because other objects depend on it
-												Update non-default collation tests for getObjectDescription() changes.

Sigh, also missed in commit b86b7bfa3.  Per buildfarm.

											
										
										
											2018-05-24 23:41:52 +02:00
+								DETAIL:  column b of table collate_dep_test1 depends on collation test0
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								type collate_dep_dom1 depends on collation test0
-												Update non-default collation tests for getObjectDescription() changes.

Sigh, also missed in commit b86b7bfa3.  Per buildfarm.

											
										
										
											2018-05-24 23:41:52 +02:00
+								column y of composite type collate_dep_test2 depends on collation test0
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								view collate_dep_test3 depends on collation test0
 								index collate_dep_test4i depends on collation test0
 								HINT:  Use DROP ... CASCADE to drop the dependent objects too.
 								DROP COLLATION test0 CASCADE;
 								NOTICE:  drop cascades to 5 other objects
-												Update non-default collation tests for getObjectDescription() changes.

Sigh, also missed in commit b86b7bfa3.  Per buildfarm.

											
										
										
											2018-05-24 23:41:52 +02:00
+								DETAIL:  drop cascades to column b of table collate_dep_test1
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								drop cascades to type collate_dep_dom1
-												Update non-default collation tests for getObjectDescription() changes.

Sigh, also missed in commit b86b7bfa3.  Per buildfarm.

											
										
										
											2018-05-24 23:41:52 +02:00
+								drop cascades to column y of composite type collate_dep_test2
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								drop cascades to view collate_dep_test3
 								drop cascades to index collate_dep_test4i
 								\d collate_dep_test1
 								      Table "collate_tests.collate_dep_test1"
 								 Column |  Type   | Collation | Nullable | Default
 								--------+---------+-----------+----------+---------
 								 a      | integer |           |          |
 								\d collate_dep_test2
 								 Composite type "collate_tests.collate_dep_test2"
 								 Column |  Type   | Collation | Nullable | Default
 								--------+---------+-----------+----------+---------
 								 x      | integer |           |          |
 								DROP TABLE collate_dep_test1, collate_dep_test4t;
 								DROP TYPE collate_dep_test2;
 								-- test range types and collations
 								create type textrange_c as range(subtype=text, collation="C");
 								create type textrange_en_us as range(subtype=text, collation="en-x-icu");
 								select textrange_c('A','Z') @> 'b'::text;
 								 ?column?
 								----------
 								 f
 								(1 row)
 								select textrange_en_us('A','Z') @> 'b'::text;
 								 ?column?
 								----------
 								 t
 								(1 row)
 								drop type textrange_c;
 								drop type textrange_en_us;
-												Add a test for UCS_BASIC collation

											
										
										
											2023-03-10 11:00:51 +01:00
+								-- standard collations
 								SELECT * FROM collate_test2 ORDER BY b COLLATE UCS_BASIC;
 								 a |  b
 								---+-----
 								 4 | ABC
 								 1 | abc
 								 3 | bbc
 								 2 | äbc
 								(4 rows)
-												Add standard collation UNICODE

This adds a new predefined collation named UNICODE, which sorts by the
default Unicode collation algorithm specifications, per SQL standard.

This only works if ICU support is built.

Reviewed-by: Jeff Davis <pgsql@j-davis.com>
Discussion: https://www.postgresql.org/message-id/flat/1293e382-2093-a2bf-a397-c04e8f83d3c2@enterprisedb.com

											
										
										
											2023-03-10 13:35:00 +01:00
+								SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
 								 a |  b
 								---+-----
 								 1 | abc
 								 4 | ABC
 								 2 | äbc
 								 3 | bbc
 								(4 rows)
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								-- test ICU collation customization
-												Fix bug in support for collation attributes on older ICU versions

Unrecognized attribute names are supposed to be ignored.  But the code
would error out on an unrecognized attribute value even if it did not
recognize the attribute name.  So unrecognized attributes wouldn't
really be ignored unless the value happened to be one that matched a
recognized value.  This would break some important cases where the
attribute would be processed by ucol_open() directly.  Fix that and
add a test case.

The restructured code should also avoid compiler warnings about
initializing a UColAttribute value to -1, because the type might be an
unsigned enum.  (reported by Andres Freund)

											
										
										
											2019-03-19 09:37:46 +01:00
+								-- test the attributes handled by icu_set_collation_attributes()
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								SET client_min_messages=WARNING;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								RESET client_min_messages;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								SET client_min_messages=WARNING;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								RESET client_min_messages;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
 								CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und-u-kf-lower" for ICU locale "@colCaseFirst=lower"
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und-u-kf-upper" for ICU locale "@colCaseFirst=upper"
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
 								CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und-u-ka-shifted" for ICU locale "@colAlternate=shifted"
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								SET client_min_messages=WARNING;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								RESET client_min_messages;
-												Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com

											
										
										
											2019-03-17 08:16:33 +01:00
+								SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
 								CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und-u-kn-lower" for ICU locale "@colNumeric=lower"
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								ERROR:  could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR
-												Fix bug in support for collation attributes on older ICU versions

Unrecognized attribute names are supposed to be ignored.  But the code
would error out on an unrecognized attribute value even if it did not
recognize the attribute name.  So unrecognized attributes wouldn't
really be ignored unless the value happened to be one that matched a
recognized value.  This would break some important cases where the
attribute would be processed by ucol_open() directly.  Fix that and
add a test case.

The restructured code should also avoid compiler warnings about
initializing a UColAttribute value to -1, because the type might be an
unsigned enum.  (reported by Andres Freund)

											
										
										
											2019-03-19 09:37:46 +01:00
+								-- test that attributes not handled by icu_set_collation_attributes()
 								-- (handled by ucol_open() directly) also work
 								CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "de-u-co-phonebk" for ICU locale "de@collation=phonebook"
-												Fix bug in support for collation attributes on older ICU versions

Unrecognized attribute names are supposed to be ignored.  But the code
would error out on an unrecognized attribute value even if it did not
recognize the attribute name.  So unrecognized attributes wouldn't
really be ignored unless the value happened to be one that matched a
recognized value.  This would break some important cases where the
attribute would be processed by ucol_open() directly.  Fix that and
add a test case.

The restructured code should also avoid compiler warnings about
initializing a UColAttribute value to -1, because the type might be an
unsigned enum.  (reported by Andres Freund)

											
										
										
											2019-03-19 09:37:46 +01:00
+								SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
-												Allow tailoring of ICU locales with custom rules

This exposes the ICU facility to add custom collation rules to a
standard collation.

New options are added to CREATE COLLATION, CREATE DATABASE, createdb,
and initdb to set the rules.

Reviewed-by: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/flat/821c71a4-6ef0-d366-9acf-bb8e367f739f@enterprisedb.com

											
										
										
											2023-03-08 16:35:42 +01:00
+								-- rules
 								CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und" for ICU locale ""
-												Allow tailoring of ICU locales with custom rules

This exposes the ICU facility to add custom collation rules to a
standard collation.

New options are added to CREATE COLLATION, CREATE DATABASE, createdb,
and initdb to set the rules.

Reviewed-by: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/flat/821c71a4-6ef0-d366-9acf-bb8e367f739f@enterprisedb.com

											
										
										
											2023-03-08 16:35:42 +01:00
+								CREATE TABLE test7 (a text);
 								-- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax
 								INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green');
 								SELECT * FROM test7 ORDER BY a COLLATE "en-x-icu";
 								     a
 								-----------
 								 Abernathy
 								 apple
 								 bird
 								 Boston
 								 Graham
 								 green
 								(6 rows)
 								SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1;
 								     a
 								-----------
 								 Abernathy
 								 apple
 								 green
 								 bird
 								 Boston
 								 Graham
 								(6 rows)
 								DROP TABLE test7;
 								CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und" for ICU locale ""
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								ERROR:  could not open collator for locale "und" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								-- nondeterministic collations
-												Fix ICU tests for older ICU versions

Change the tests to use old-style ICU locale specifications so that
they can run on older ICU versions.

											
										
										
											2019-03-22 14:40:56 +01:00
+								CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und" for ICU locale ""
-												Fix ICU tests for older ICU versions

Change the tests to use old-style ICU locale specifications so that
they can run on older ICU versions.

											
										
										
											2019-03-22 14:40:56 +01:00
+								CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und" for ICU locale ""
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								CREATE TABLE test6 (a int, b text);
 								-- same string in different normal forms
 								INSERT INTO test6 VALUES (1, U&'\00E4bc');
 								INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
 								SELECT * FROM test6;
 								 a |  b
 								---+-----
 								 1 | äbc
 								 2 | äbc
 								(2 rows)
 								SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
 								 a |  b
 								---+-----
 								 1 | äbc
 								(1 row)
 								SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
 								 a |  b
 								---+-----
 								 1 | äbc
 								 2 | äbc
 								(2 rows)
-												Enable hash partitioning of text arrays

hash_array_extended() needs to pass PG_GET_COLLATION() to the hash
function of the element type.  Otherwise, the hash function of a
collation-aware data type such as text will error out, since the
introduction of nondeterministic collation made hash functions require
a collation, too.

The consequence of this is that before this change, hash partitioning
using an array over text in the partition key would not work.

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://www.postgresql.org/message-id/flat/32c1fdae-95c6-5dc6-058a-a90330a3b621%40enterprisedb.com

											
										
										
											2020-11-04 07:47:06 +01:00
+								-- same with arrays
 								CREATE TABLE test6a (a int, b text[]);
 								INSERT INTO test6a VALUES (1, ARRAY[U&'\00E4bc']);
 								INSERT INTO test6a VALUES (2, ARRAY[U&'\0061\0308bc']);
 								SELECT * FROM test6a;
 								 a |   b
 								---+-------
 								 1 | {äbc}
 								 2 | {äbc}
 								(2 rows)
 								SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_det;
 								 a |   b
 								---+-------
 								 1 | {äbc}
 								(1 row)
 								SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet;
 								 a |   b
 								---+-------
 								 1 | {äbc}
 								 2 | {äbc}
 								(2 rows)
-												Fix ICU tests for older ICU versions

Change the tests to use old-style ICU locale specifications so that
they can run on older ICU versions.

											
										
										
											2019-03-22 14:40:56 +01:00
+								CREATE COLLATION case_sensitive (provider = icu, locale = '');
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und" for ICU locale ""
-												Fix ICU tests for older ICU versions

Change the tests to use old-style ICU locale specifications so that
they can run on older ICU versions.

											
										
										
											2019-03-22 14:40:56 +01:00
+								CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
-												CREATE DATABASE: make LOCALE apply to all collation providers.

For CREATE DATABASE, make LOCALE parameter apply regardless of the
provider used. Also affects initdb and createdb --locale arguments.

Previously, LOCALE (and --locale) only affected the database default
collation when using the libc provider.

Discussion: https://postgr.es/m/1a63084d-221e-4075-619e-6b3e590f673e@enterprisedb.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-06-16 19:27:32 +02:00
+								NOTICE:  using standard form "und-u-ks-level2" for ICU locale "@colStrength=secondary"
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 								 ?column? | ?column?
 								----------+----------
 								 t        | f
 								(1 row)
 								SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 								 ?column? | ?column?
 								----------+----------
 								 t        | t
 								(1 row)
-												Support language tags in older ICU versions (53 and earlier).

By calling uloc_canonicalize() before parsing the attributes, the
existing locale attribute parsing logic works on language tags as
well.

Fix a small memory leak, too.

Discussion: http://postgr.es/m/60da0cecfb512a78b8666b31631a636215d8ce73.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-03-21 23:49:18 +01:00
+								-- test language tags
 								CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
 								SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Handle the "und" locale in ICU versions 54 and older.

The "und" locale is an alternative spelling of the root locale, but it
was not recognized until ICU 55. To maintain common behavior across
all supported ICU versions, check for "und" and replace with "root"
before opening.

Previously, the lack of support for "und" was dangerous, because
versions 54 and older fall back to the environment when a locale is
not found. If the user specified "und" for the language (which is
expected and documented), it could not only resolve to the wrong
collator, but it could unexpectedly change (which could lead to
corrupt indexes).

This effectively reverts commit d72900bded, which worked around the
problem for the built-in "unicode" collation, and is no longer
necessary.

Discussion: https://postgr.es/m/60da0cecfb512a78b8666b31631a636215d8ce73.camel@j-davis.com
Discussion: https://postgr.es/m/0c6fa66f2753217d2a40480a96bd2ccf023536a1.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-03-23 17:50:06 +01:00
+								CREATE COLLATION lt_upperfirst (provider = icu, locale = 'und-u-kf-upper');
 								SELECT 'Z' COLLATE lt_upperfirst < 'z' COLLATE lt_upperfirst;
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								CREATE TABLE test1cs (x text COLLATE case_sensitive);
 								CREATE TABLE test2cs (x text COLLATE case_sensitive);
 								CREATE TABLE test3cs (x text COLLATE case_sensitive);
 								INSERT INTO test1cs VALUES ('abc'), ('def'), ('ghi');
 								INSERT INTO test2cs VALUES ('ABC'), ('ghi');
 								INSERT INTO test3cs VALUES ('abc'), ('ABC'), ('def'), ('ghi');
 								SELECT x FROM test3cs WHERE x = 'abc';
 								  x
 								-----
 								 abc
 								(1 row)
 								SELECT x FROM test3cs WHERE x <> 'abc';
 								  x
 								-----
 								 ABC
 								 def
 								 ghi
 								(3 rows)
 								SELECT x FROM test3cs WHERE x LIKE 'a%';
 								  x
 								-----
 								 abc
 								(1 row)
 								SELECT x FROM test3cs WHERE x ILIKE 'a%';
 								  x
 								-----
 								 abc
 								 ABC
 								(2 rows)
 								SELECT x FROM test3cs WHERE x SIMILAR TO 'a%';
 								  x
 								-----
 								 abc
 								(1 row)
 								SELECT x FROM test3cs WHERE x ~ 'a';
 								  x
 								-----
 								 abc
 								(1 row)
-												Allow planner to use Merge Append to efficiently implement UNION

Until now, UNION queries have often been suboptimal as the planner has
only ever considered using an Append node and making the results unique
by either using a Hash Aggregate, or by Sorting the entire Append result
and running it through the Unique operator.  Both of these methods
always require reading all rows from the union subqueries.

Here we adjust the union planner so that it can request that each subquery
produce results in target list order so that these can be Merge Appended
together and made unique with a Unique node.  This can improve performance
significantly as the union child can make use of the likes of btree
indexes and/or Merge Joins to provide the top-level UNION with presorted
input.  This is especially good if the top-level UNION contains a LIMIT
node that limits the output rows to a small subset of the unioned rows as
cheap startup plans can be used.

Author: David Rowley
Reviewed-by: Richard Guo, Andy Fan
Discussion: https://postgr.es/m/CAApHDvpb_63XQodmxKUF8vb9M7CxyUyT4sWvEgqeQU-GB7QFoQ@mail.gmail.com

											
										
										
											2024-03-25 02:31:14 +01:00
+								SET enable_hashagg TO off;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								SELECT x FROM test1cs UNION SELECT x FROM test2cs ORDER BY x;
 								  x
 								-----
 								 abc
 								 ABC
 								 def
 								 ghi
 								(4 rows)
 								SELECT x FROM test2cs UNION SELECT x FROM test1cs ORDER BY x;
 								  x
 								-----
 								 abc
 								 ABC
 								 def
 								 ghi
 								(4 rows)
 								SELECT x FROM test1cs INTERSECT SELECT x FROM test2cs;
 								  x
 								-----
 								 ghi
 								(1 row)
 								SELECT x FROM test2cs INTERSECT SELECT x FROM test1cs;
 								  x
 								-----
 								 ghi
 								(1 row)
 								SELECT x FROM test1cs EXCEPT SELECT x FROM test2cs;
 								  x
 								-----
 								 abc
 								 def
 								(2 rows)
 								SELECT x FROM test2cs EXCEPT SELECT x FROM test1cs;
 								  x
 								-----
 								 ABC
 								(1 row)
 								SELECT DISTINCT x FROM test3cs ORDER BY x;
 								  x
 								-----
 								 abc
 								 ABC
 								 def
 								 ghi
 								(4 rows)
-												Allow planner to use Merge Append to efficiently implement UNION

Until now, UNION queries have often been suboptimal as the planner has
only ever considered using an Append node and making the results unique
by either using a Hash Aggregate, or by Sorting the entire Append result
and running it through the Unique operator.  Both of these methods
always require reading all rows from the union subqueries.

Here we adjust the union planner so that it can request that each subquery
produce results in target list order so that these can be Merge Appended
together and made unique with a Unique node.  This can improve performance
significantly as the union child can make use of the likes of btree
indexes and/or Merge Joins to provide the top-level UNION with presorted
input.  This is especially good if the top-level UNION contains a LIMIT
node that limits the output rows to a small subset of the unioned rows as
cheap startup plans can be used.

Author: David Rowley
Reviewed-by: Richard Guo, Andy Fan
Discussion: https://postgr.es/m/CAApHDvpb_63XQodmxKUF8vb9M7CxyUyT4sWvEgqeQU-GB7QFoQ@mail.gmail.com

											
										
										
											2024-03-25 02:31:14 +01:00
+								RESET enable_hashagg;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								SELECT count(DISTINCT x) FROM test3cs;
 								 count
 								-------
 								     4
 								(1 row)
 								SELECT x, count(*) FROM test3cs GROUP BY x ORDER BY x;
 								  x  | count
 								-----+-------
 								 abc |     1
 								 ABC |     1
 								 def |     1
 								 ghi |     1
 								(4 rows)
 								SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3cs ORDER BY x;
 								  x  | row_number | rank
 								-----+------------+------
 								 abc |          1 |    1
 								 ABC |          2 |    2
 								 def |          3 |    3
 								 ghi |          4 |    4
 								(4 rows)
 								CREATE UNIQUE INDEX ON test1cs (x);  -- ok
 								INSERT INTO test1cs VALUES ('ABC');  -- ok
 								CREATE UNIQUE INDEX ON test3cs (x);  -- ok
 								SELECT string_to_array('ABC,DEF,GHI' COLLATE case_sensitive, ',', 'abc');
 								 string_to_array
 								-----------------
 								 {ABC,DEF,GHI}
 								(1 row)
 								SELECT string_to_array('ABCDEFGHI' COLLATE case_sensitive, NULL, 'b');
 								   string_to_array
 								---------------------
 								 {A,B,C,D,E,F,G,H,I}
 								(1 row)
 								CREATE TABLE test1ci (x text COLLATE case_insensitive);
 								CREATE TABLE test2ci (x text COLLATE case_insensitive);
 								CREATE TABLE test3ci (x text COLLATE case_insensitive);
 								CREATE INDEX ON test3ci (x text_pattern_ops);  -- error
 								ERROR:  nondeterministic collations are not supported for operator class "text_pattern_ops"
 								INSERT INTO test1ci VALUES ('abc'), ('def'), ('ghi');
 								INSERT INTO test2ci VALUES ('ABC'), ('ghi');
 								INSERT INTO test3ci VALUES ('abc'), ('ABC'), ('def'), ('ghi');
 								SELECT x FROM test3ci WHERE x = 'abc';
 								  x
 								-----
 								 abc
 								 ABC
 								(2 rows)
 								SELECT x FROM test3ci WHERE x <> 'abc';
 								  x
 								-----
 								 def
 								 ghi
 								(2 rows)
 								SELECT x FROM test3ci WHERE x LIKE 'a%';
 								ERROR:  nondeterministic collations are not supported for LIKE
 								SELECT x FROM test3ci WHERE x ILIKE 'a%';
 								ERROR:  nondeterministic collations are not supported for ILIKE
 								SELECT x FROM test3ci WHERE x SIMILAR TO 'a%';
 								ERROR:  nondeterministic collations are not supported for regular expressions
 								SELECT x FROM test3ci WHERE x ~ 'a';
 								ERROR:  nondeterministic collations are not supported for regular expressions
 								SELECT x FROM test1ci UNION SELECT x FROM test2ci ORDER BY x;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT x FROM test2ci UNION SELECT x FROM test1ci ORDER BY x;
 								  x
 								-----
 								 ABC
 								 def
 								 ghi
 								(3 rows)
-												Add ORDER BY to regression test case

Apparently, the output order is different on different endianness, per
build farm member snapper.

											
										
										
											2019-03-25 08:15:38 +01:00
+								SELECT x FROM test1ci INTERSECT SELECT x FROM test2ci ORDER BY x;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								  x
 								-----
 								 abc
-												Add ORDER BY to regression test case

Apparently, the output order is different on different endianness, per
build farm member snapper.

											
										
										
											2019-03-25 08:15:38 +01:00
+								 ghi
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								(2 rows)
-												Add ORDER BY to regression test case

Apparently, the output order is different on different endianness, per
build farm member snapper.

											
										
										
											2019-03-25 08:15:38 +01:00
+								SELECT x FROM test2ci INTERSECT SELECT x FROM test1ci ORDER BY x;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								  x
 								-----
 								 ABC
-												Add ORDER BY to regression test case

Apparently, the output order is different on different endianness, per
build farm member snapper.

											
										
										
											2019-03-25 08:15:38 +01:00
+								 ghi
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								(2 rows)
 								SELECT x FROM test1ci EXCEPT SELECT x FROM test2ci;
 								  x
 								-----
 								 def
 								(1 row)
 								SELECT x FROM test2ci EXCEPT SELECT x FROM test1ci;
 								 x
 								---
 								(0 rows)
 								SELECT DISTINCT x FROM test3ci ORDER BY x;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT count(DISTINCT x) FROM test3ci;
 								 count
 								-------
 								     3
 								(1 row)
 								SELECT x, count(*) FROM test3ci GROUP BY x ORDER BY x;
 								  x  | count
 								-----+-------
 								 abc |     2
 								 def |     1
 								 ghi |     1
 								(3 rows)
 								SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3ci ORDER BY x;
 								  x  | row_number | rank
 								-----+------------+------
 								 abc |          1 |    1
 								 ABC |          2 |    1
 								 def |          3 |    3
 								 ghi |          4 |    4
 								(4 rows)
 								CREATE UNIQUE INDEX ON test1ci (x);  -- ok
 								INSERT INTO test1ci VALUES ('ABC');  -- error
 								ERROR:  duplicate key value violates unique constraint "test1ci_x_idx"
 								DETAIL:  Key (x)=(ABC) already exists.
 								CREATE UNIQUE INDEX ON test3ci (x);  -- error
 								ERROR:  could not create unique index "test3ci_x_idx"
 								DETAIL:  Key (x)=(abc) is duplicated.
 								SELECT string_to_array('ABC,DEF,GHI' COLLATE case_insensitive, ',', 'abc');
 								ERROR:  nondeterministic collations are not supported for substring searches
 								SELECT string_to_array('ABCDEFGHI' COLLATE case_insensitive, NULL, 'b');
 								    string_to_array
 								------------------------
 								 {A,NULL,C,D,E,F,G,H,I}
 								(1 row)
 								-- bpchar
 								CREATE TABLE test1bpci (x char(3) COLLATE case_insensitive);
 								CREATE TABLE test2bpci (x char(3) COLLATE case_insensitive);
 								CREATE TABLE test3bpci (x char(3) COLLATE case_insensitive);
 								CREATE INDEX ON test3bpci (x bpchar_pattern_ops);  -- error
 								ERROR:  nondeterministic collations are not supported for operator class "bpchar_pattern_ops"
 								INSERT INTO test1bpci VALUES ('abc'), ('def'), ('ghi');
 								INSERT INTO test2bpci VALUES ('ABC'), ('ghi');
 								INSERT INTO test3bpci VALUES ('abc'), ('ABC'), ('def'), ('ghi');
 								SELECT x FROM test3bpci WHERE x = 'abc';
 								  x
 								-----
 								 abc
 								 ABC
 								(2 rows)
 								SELECT x FROM test3bpci WHERE x <> 'abc';
 								  x
 								-----
 								 def
 								 ghi
 								(2 rows)
 								SELECT x FROM test3bpci WHERE x LIKE 'a%';
 								ERROR:  nondeterministic collations are not supported for LIKE
 								SELECT x FROM test3bpci WHERE x ILIKE 'a%';
 								ERROR:  nondeterministic collations are not supported for ILIKE
 								SELECT x FROM test3bpci WHERE x SIMILAR TO 'a%';
 								ERROR:  nondeterministic collations are not supported for regular expressions
 								SELECT x FROM test3bpci WHERE x ~ 'a';
 								ERROR:  nondeterministic collations are not supported for regular expressions
 								SELECT x FROM test1bpci UNION SELECT x FROM test2bpci ORDER BY x;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT x FROM test2bpci UNION SELECT x FROM test1bpci ORDER BY x;
 								  x
 								-----
 								 ABC
 								 def
 								 ghi
 								(3 rows)
-												Add ORDER BY to more ICU regression test cases.

Commit c77e12208 didn't fully fix the problem.  Per buildfarm
and local testing.

											
										
										
											2019-03-26 22:46:04 +01:00
+								SELECT x FROM test1bpci INTERSECT SELECT x FROM test2bpci ORDER BY x;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								  x
 								-----
 								 abc
-												Add ORDER BY to more ICU regression test cases.

Commit c77e12208 didn't fully fix the problem.  Per buildfarm
and local testing.

											
										
										
											2019-03-26 22:46:04 +01:00
+								 ghi
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								(2 rows)
-												Add ORDER BY to more ICU regression test cases.

Commit c77e12208 didn't fully fix the problem.  Per buildfarm
and local testing.

											
										
										
											2019-03-26 22:46:04 +01:00
+								SELECT x FROM test2bpci INTERSECT SELECT x FROM test1bpci ORDER BY x;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								  x
 								-----
 								 ABC
-												Add ORDER BY to more ICU regression test cases.

Commit c77e12208 didn't fully fix the problem.  Per buildfarm
and local testing.

											
										
										
											2019-03-26 22:46:04 +01:00
+								 ghi
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								(2 rows)
 								SELECT x FROM test1bpci EXCEPT SELECT x FROM test2bpci;
 								  x
 								-----
 								 def
 								(1 row)
 								SELECT x FROM test2bpci EXCEPT SELECT x FROM test1bpci;
 								 x
 								---
 								(0 rows)
 								SELECT DISTINCT x FROM test3bpci ORDER BY x;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT count(DISTINCT x) FROM test3bpci;
 								 count
 								-------
 								     3
 								(1 row)
 								SELECT x, count(*) FROM test3bpci GROUP BY x ORDER BY x;
 								  x  | count
 								-----+-------
 								 abc |     2
 								 def |     1
 								 ghi |     1
 								(3 rows)
 								SELECT x, row_number() OVER (ORDER BY x), rank() OVER (ORDER BY x) FROM test3bpci ORDER BY x;
 								  x  | row_number | rank
 								-----+------------+------
 								 abc |          1 |    1
 								 ABC |          2 |    1
 								 def |          3 |    3
 								 ghi |          4 |    4
 								(4 rows)
 								CREATE UNIQUE INDEX ON test1bpci (x);  -- ok
 								INSERT INTO test1bpci VALUES ('ABC');  -- error
 								ERROR:  duplicate key value violates unique constraint "test1bpci_x_idx"
 								DETAIL:  Key (x)=(ABC) already exists.
 								CREATE UNIQUE INDEX ON test3bpci (x);  -- error
 								ERROR:  could not create unique index "test3bpci_x_idx"
 								DETAIL:  Key (x)=(abc) is duplicated.
 								SELECT string_to_array('ABC,DEF,GHI'::char(11) COLLATE case_insensitive, ',', 'abc');
 								ERROR:  nondeterministic collations are not supported for substring searches
 								SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b');
 								    string_to_array
 								------------------------
 								 {A,NULL,C,D,E,F,G,H,I}
 								(1 row)
 								-- This tests the issue described in match_pattern_prefix().  In the
 								-- absence of that check, the case_insensitive tests below would
 								-- return no rows where they should logically return one.
 								CREATE TABLE test4c (x text COLLATE "C");
 								INSERT INTO test4c VALUES ('abc');
 								CREATE INDEX ON test4c (x);
 								SET enable_seqscan = off;
 								SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_sensitive;  -- ok, no rows
 								 x
 								---
 								(0 rows)
 								SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_sensitive;  -- ok, no rows
 								 x
 								---
 								(0 rows)
 								SELECT x FROM test4c WHERE x LIKE 'ABC' COLLATE case_insensitive;  -- error
 								ERROR:  nondeterministic collations are not supported for LIKE
 								SELECT x FROM test4c WHERE x LIKE 'ABC%' COLLATE case_insensitive;  -- error
 								ERROR:  nondeterministic collations are not supported for LIKE
 								RESET enable_seqscan;
 								-- Unicode special case: different variants of Greek lower case sigma.
 								-- A naive implementation like citext that just does lower(x) =
 								-- lower(y) will do the wrong thing here, because lower('Σ') is 'σ'
 								-- but upper('ς') is 'Σ'.
 								SELECT 'ὀδυσσεύς' = 'ὈΔΥΣΣΕΎΣ' COLLATE case_sensitive;
 								 ?column?
 								----------
 								 f
 								(1 row)
 								SELECT 'ὀδυσσεύς' = 'ὈΔΥΣΣΕΎΣ' COLLATE case_insensitive;
 								 ?column?
 								----------
 								 t
 								(1 row)
 								-- name vs. text comparison operators
 								SELECT relname FROM pg_class WHERE relname = 'PG_CLASS'::text COLLATE case_insensitive;
 								 relname
 								----------
 								 pg_class
 								(1 row)
 								SELECT relname FROM pg_class WHERE 'PG_CLASS'::text = relname COLLATE case_insensitive;
 								 relname
 								----------
 								 pg_class
 								(1 row)
-												Fix random regression failure in test case "collate.icu.utf8"

This is a fix similar to 2d7d67cc, where slight plan alteration can
cause a random failure of this regression test because of an incorrect
tuple ordering, except that this one involves lookups of pg_type.
Similarly to the other case, add ORDER BY clauses to ensure the output
order.

The failure has been seen at least once on buildfarm member skink.

Reported-by: Thomas Munro
Discussion: https://postgr.es/m/CA+hUKGLjR9ZBvhXcr9b-NSBHPw9aRgbjyzGE+kqLsT4vwX+nkQ@mail.gmail.com
Backpatch-through: 12

											
										
										
											2019-08-14 06:37:48 +02:00
+								SELECT typname FROM pg_type WHERE typname LIKE 'int_' AND typname <> 'INT2'::text
 								  COLLATE case_insensitive ORDER BY typname;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								 typname
 								---------
 								 int4
 								 int8
 								(2 rows)
-												Fix random regression failure in test case "collate.icu.utf8"

This is a fix similar to 2d7d67cc, where slight plan alteration can
cause a random failure of this regression test because of an incorrect
tuple ordering, except that this one involves lookups of pg_type.
Similarly to the other case, add ORDER BY clauses to ensure the output
order.

The failure has been seen at least once on buildfarm member skink.

Reported-by: Thomas Munro
Discussion: https://postgr.es/m/CA+hUKGLjR9ZBvhXcr9b-NSBHPw9aRgbjyzGE+kqLsT4vwX+nkQ@mail.gmail.com
Backpatch-through: 12

											
										
										
											2019-08-14 06:37:48 +02:00
+								SELECT typname FROM pg_type WHERE typname LIKE 'int_' AND 'INT2'::text <> typname
 								  COLLATE case_insensitive ORDER BY typname;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								 typname
 								---------
 								 int4
 								 int8
 								(2 rows)
 								-- test case adapted from subselect.sql
 								CREATE TEMP TABLE outer_text (f1 text COLLATE case_insensitive, f2 text);
 								INSERT INTO outer_text VALUES ('a', 'a');
 								INSERT INTO outer_text VALUES ('b', 'a');
 								INSERT INTO outer_text VALUES ('A', NULL);
 								INSERT INTO outer_text VALUES ('B', NULL);
 								CREATE TEMP TABLE inner_text (c1 text COLLATE case_insensitive, c2 text);
 								INSERT INTO inner_text VALUES ('a', NULL);
 								SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text);
 								 f1 | f2
 								----+----
 								 b  | a
 								 B  |
 								(2 rows)
 								-- accents
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								SET client_min_messages=WARNING;
-												Fix ICU tests for older ICU versions

Change the tests to use old-style ICU locale specifications so that
they can run on older ICU versions.

											
										
										
											2019-03-22 14:40:56 +01:00
+								CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false);
-												Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut

											
										
										
											2023-04-04 19:28:08 +02:00
+								RESET client_min_messages;
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								CREATE TABLE test4 (a int, b text);
 								INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
 								SELECT * FROM test4 WHERE b = 'cote';
 								 a |  b
 								---+------
 								 1 | cote
 								(1 row)
 								SELECT * FROM test4 WHERE b = 'cote' COLLATE ignore_accents;
 								 a |  b
 								---+------
 								 1 | cote
 								 2 | côte
 								 3 | coté
 								 4 | côté
 								(4 rows)
 								SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents;  -- still case-sensitive
 								 a | b
 								---+---
 								(0 rows)
 								SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
 								 a |  b
 								---+------
 								 1 | cote
 								(1 row)
 								-- foreign keys (should use collation of primary key)
 								-- PK is case-sensitive, FK is case-insensitive
 								CREATE TABLE test10pk (x text COLLATE case_sensitive PRIMARY KEY);
 								INSERT INTO test10pk VALUES ('abc'), ('def'), ('ghi');
 								CREATE TABLE test10fk (x text COLLATE case_insensitive REFERENCES test10pk (x) ON UPDATE CASCADE ON DELETE CASCADE);
 								INSERT INTO test10fk VALUES ('abc');  -- ok
 								INSERT INTO test10fk VALUES ('ABC');  -- error
 								ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
 								DETAIL:  Key (x)=(ABC) is not present in table "test10pk".
 								INSERT INTO test10fk VALUES ('xyz');  -- error
 								ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
 								DETAIL:  Key (x)=(xyz) is not present in table "test10pk".
 								SELECT * FROM test10pk;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT * FROM test10fk;
 								  x
 								-----
 								 abc
 								(1 row)
 								-- restrict update even though the values are "equal" in the FK table
 								UPDATE test10fk SET x = 'ABC' WHERE x = 'abc';  -- error
 								ERROR:  insert or update on table "test10fk" violates foreign key constraint "test10fk_x_fkey"
 								DETAIL:  Key (x)=(ABC) is not present in table "test10pk".
 								SELECT * FROM test10fk;
 								  x
 								-----
 								 abc
 								(1 row)
 								DELETE FROM test10pk WHERE x = 'abc';
 								SELECT * FROM test10pk;
 								  x
 								-----
 								 def
 								 ghi
 								(2 rows)
 								SELECT * FROM test10fk;
 								 x
 								---
 								(0 rows)
 								-- PK is case-insensitive, FK is case-sensitive
 								CREATE TABLE test11pk (x text COLLATE case_insensitive PRIMARY KEY);
 								INSERT INTO test11pk VALUES ('abc'), ('def'), ('ghi');
 								CREATE TABLE test11fk (x text COLLATE case_sensitive REFERENCES test11pk (x) ON UPDATE CASCADE ON DELETE CASCADE);
 								INSERT INTO test11fk VALUES ('abc');  -- ok
 								INSERT INTO test11fk VALUES ('ABC');  -- ok
 								INSERT INTO test11fk VALUES ('xyz');  -- error
 								ERROR:  insert or update on table "test11fk" violates foreign key constraint "test11fk_x_fkey"
 								DETAIL:  Key (x)=(xyz) is not present in table "test11pk".
 								SELECT * FROM test11pk;
 								  x
 								-----
 								 abc
 								 def
 								 ghi
 								(3 rows)
 								SELECT * FROM test11fk;
 								  x
 								-----
 								 abc
 								 ABC
 								(2 rows)
 								-- cascade update even though the values are "equal" in the PK table
 								UPDATE test11pk SET x = 'ABC' WHERE x = 'abc';
 								SELECT * FROM test11fk;
 								  x
 								-----
 								 ABC
 								 ABC
 								(2 rows)
 								DELETE FROM test11pk WHERE x = 'abc';
 								SELECT * FROM test11pk;
 								  x
 								-----
 								 def
 								 ghi
 								(2 rows)
 								SELECT * FROM test11fk;
 								 x
 								---
 								(0 rows)
 								-- partitioning
 								CREATE TABLE test20 (a int, b text COLLATE case_insensitive) PARTITION BY LIST (b);
 								CREATE TABLE test20_1 PARTITION OF test20 FOR VALUES IN ('abc');
 								INSERT INTO test20 VALUES (1, 'abc');
 								INSERT INTO test20 VALUES (2, 'ABC');
 								SELECT * FROM test20_1;
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | ABC
 								(2 rows)
 								CREATE TABLE test21 (a int, b text COLLATE case_insensitive) PARTITION BY RANGE (b);
 								CREATE TABLE test21_1 PARTITION OF test21 FOR VALUES FROM ('ABC') TO ('DEF');
 								INSERT INTO test21 VALUES (1, 'abc');
 								INSERT INTO test21 VALUES (2, 'ABC');
 								SELECT * FROM test21_1;
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | ABC
 								(2 rows)
 								CREATE TABLE test22 (a int, b text COLLATE case_sensitive) PARTITION BY HASH (b);
 								CREATE TABLE test22_0 PARTITION OF test22 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test22_1 PARTITION OF test22 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test22 VALUES (1, 'def');
 								INSERT INTO test22 VALUES (2, 'DEF');
 								-- they end up in different partitions
 								SELECT (SELECT count(*) FROM test22_0) = (SELECT count(*) FROM test22_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Enable hash partitioning of text arrays

hash_array_extended() needs to pass PG_GET_COLLATION() to the hash
function of the element type.  Otherwise, the hash function of a
collation-aware data type such as text will error out, since the
introduction of nondeterministic collation made hash functions require
a collation, too.

The consequence of this is that before this change, hash partitioning
using an array over text in the partition key would not work.

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://www.postgresql.org/message-id/flat/32c1fdae-95c6-5dc6-058a-a90330a3b621%40enterprisedb.com

											
										
										
											2020-11-04 07:47:06 +01:00
+								-- same with arrays
 								CREATE TABLE test22a (a int, b text[] COLLATE case_sensitive) PARTITION BY HASH (b);
 								CREATE TABLE test22a_0 PARTITION OF test22a FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test22a_1 PARTITION OF test22a FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test22a VALUES (1, ARRAY['def']);
 								INSERT INTO test22a VALUES (2, ARRAY['DEF']);
 								-- they end up in different partitions
 								SELECT (SELECT count(*) FROM test22a_0) = (SELECT count(*) FROM test22a_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								CREATE TABLE test23 (a int, b text COLLATE case_insensitive) PARTITION BY HASH (b);
 								CREATE TABLE test23_0 PARTITION OF test23 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test23_1 PARTITION OF test23 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test23 VALUES (1, 'def');
 								INSERT INTO test23 VALUES (2, 'DEF');
 								-- they end up in the same partition (but it's platform-dependent which one)
 								SELECT (SELECT count(*) FROM test23_0) <> (SELECT count(*) FROM test23_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Enable hash partitioning of text arrays

hash_array_extended() needs to pass PG_GET_COLLATION() to the hash
function of the element type.  Otherwise, the hash function of a
collation-aware data type such as text will error out, since the
introduction of nondeterministic collation made hash functions require
a collation, too.

The consequence of this is that before this change, hash partitioning
using an array over text in the partition key would not work.

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://www.postgresql.org/message-id/flat/32c1fdae-95c6-5dc6-058a-a90330a3b621%40enterprisedb.com

											
										
										
											2020-11-04 07:47:06 +01:00
+								-- same with arrays
 								CREATE TABLE test23a (a int, b text[] COLLATE case_insensitive) PARTITION BY HASH (b);
 								CREATE TABLE test23a_0 PARTITION OF test23a FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test23a_1 PARTITION OF test23a FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test23a VALUES (1, ARRAY['def']);
 								INSERT INTO test23a VALUES (2, ARRAY['DEF']);
 								-- they end up in the same partition (but it's platform-dependent which one)
 								SELECT (SELECT count(*) FROM test23a_0) <> (SELECT count(*) FROM test23a_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
-												Collations with nondeterministic comparison

This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com

											
										
										
											2019-03-22 12:09:32 +01:00
+								CREATE TABLE test30 (a int, b char(3) COLLATE case_insensitive) PARTITION BY LIST (b);
 								CREATE TABLE test30_1 PARTITION OF test30 FOR VALUES IN ('abc');
 								INSERT INTO test30 VALUES (1, 'abc');
 								INSERT INTO test30 VALUES (2, 'ABC');
 								SELECT * FROM test30_1;
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | ABC
 								(2 rows)
 								CREATE TABLE test31 (a int, b char(3) COLLATE case_insensitive) PARTITION BY RANGE (b);
 								CREATE TABLE test31_1 PARTITION OF test31 FOR VALUES FROM ('ABC') TO ('DEF');
 								INSERT INTO test31 VALUES (1, 'abc');
 								INSERT INTO test31 VALUES (2, 'ABC');
 								SELECT * FROM test31_1;
 								 a |  b
 								---+-----
 								 1 | abc
 								 2 | ABC
 								(2 rows)
 								CREATE TABLE test32 (a int, b char(3) COLLATE case_sensitive) PARTITION BY HASH (b);
 								CREATE TABLE test32_0 PARTITION OF test32 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test32_1 PARTITION OF test32 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test32 VALUES (1, 'def');
 								INSERT INTO test32 VALUES (2, 'DEF');
 								-- they end up in different partitions
 								SELECT (SELECT count(*) FROM test32_0) = (SELECT count(*) FROM test32_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
 								CREATE TABLE test33 (a int, b char(3) COLLATE case_insensitive) PARTITION BY HASH (b);
 								CREATE TABLE test33_0 PARTITION OF test33 FOR VALUES WITH (MODULUS 2, REMAINDER 0);
 								CREATE TABLE test33_1 PARTITION OF test33 FOR VALUES WITH (MODULUS 2, REMAINDER 1);
 								INSERT INTO test33 VALUES (1, 'def');
 								INSERT INTO test33 VALUES (2, 'DEF');
 								-- they end up in the same partition (but it's platform-dependent which one)
 								SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1);
 								 ?column?
 								----------
 								 t
 								(1 row)
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								-- cleanup
-												Fix planner's test for case-foldable characters in ILIKE with ICU.

As coded, the ICU-collation path in pattern_char_isalpha() failed
to consider regular ASCII letters to be case-varying.  This led to
like_fixed_prefix treating too much of an ILIKE pattern as being a
fixed prefix, so that indexscans derived from an ILIKE clause might
miss entries that they should find.

Per bug #15892 from James Inform.  This is an oversight in the original
ICU patch (commit eccfef81e), so back-patch to v10 where that came in.

Discussion: https://postgr.es/m/15892-e5d2bea3e8a04a1b@postgresql.org

											
										
										
											2019-08-12 19:15:47 +02:00
+								RESET search_path;
-												Hide cascade messages in collate tests

These are not relevant to the tests and would just uselessly bloat
patches.

											
										
										
											2019-02-06 22:17:57 +01:00
+								SET client_min_messages TO warning;
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								DROP SCHEMA collate_tests CASCADE;
-												Fix planner's test for case-foldable characters in ILIKE with ICU.

As coded, the ICU-collation path in pattern_char_isalpha() failed
to consider regular ASCII letters to be case-varying.  This led to
like_fixed_prefix treating too much of an ILIKE pattern as being a
fixed prefix, so that indexscans derived from an ILIKE clause might
miss entries that they should find.

Per bug #15892 from James Inform.  This is an oversight in the original
ICU patch (commit eccfef81e), so back-patch to v10 where that came in.

Discussion: https://postgr.es/m/15892-e5d2bea3e8a04a1b@postgresql.org

											
										
										
											2019-08-12 19:15:47 +02:00
+								RESET client_min_messages;
-												ICU support

Add a column collprovider to pg_collation that determines which library
provides the collation data.  The existing choices are default and libc,
and this adds an icu choice, which uses the ICU4C library.

The pg_locale_t type is changed to a union that contains the
provider-specific locale handles.  Users of locale information are
changed to look into that struct for the appropriate handle to use.

Also add a collversion column that records the version of the collation
when it is created, and check at run time whether it is still the same.
This detects potentially incompatible library upgrades that can corrupt
indexes and other structures.  This is currently only supported by
ICU-provided collations.

initdb initializes the default collation set as before from the `locale
-a` output but also adds all available ICU locales with a "-x-icu"
appended.

Currently, ICU-provided collations can only be explicitly named
collations.  The global database locales are still always libc-provided.

ICU support is enabled by configure --with-icu.

Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>

											
										
										
											2017-03-23 20:25:34 +01:00
+								-- leave a collation for pg_upgrade test
 								CREATE COLLATION coll_icu_upgrade FROM "und-x-icu";