From 0b13b2a7712b6f91590b7589a314240a14520c2f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 23 Jun 2017 14:19:48 -0400 Subject: [PATCH] Rethink behavior of pg_import_system_collations(). Marco Atzeri reported that initdb would fail if "locale -a" reported the same locale name more than once. All previous versions of Postgres implicitly de-duplicated the results of "locale -a", but the rewrite to move the collation import logic into C had lost that property. It had also lost the property that locale names matching built-in collation names were silently ignored. The simplest way to fix this is to make initdb run the function in if-not-exists mode, which means that there's no real use-case for non if-not-exists mode; we might as well just drop the boolean argument and simplify the function's definition to be "add any collations not already known". This change also gets rid of some odd corner cases caused by the fact that aliases were added in if-not-exists mode even if the function argument said otherwise. While at it, adjust the behavior so that pg_import_system_collations() doesn't spew "collation foo already exists, skipping" messages during a re-run; that's completely unhelpful, especially since there are often hundreds of them. And make it return a count of the number of collations it did add, which seems like it might be helpful. Also, re-integrate the previous coding's property that it would make a deterministic selection of which alias to use if there were conflicting possibilities. This would only come into play if "locale -a" reports multiple equivalent locale names, say "de_DE.utf8" and "de_DE.UTF-8", but that hardly seems out of the question. In passing, fix incorrect behavior in pg_import_system_collations()'s ICU code path: it neglected CommandCounterIncrement, which would result in failures if ICU returns duplicate names, and it would try to create comments even if a new collation hadn't been created. Also, reorder operations in initdb so that the 'ucs_basic' collation is created before calling pg_import_system_collations() not after. This prevents a failure if "locale -a" were to report a locale named that. There's no reason to think that that ever happens in the wild, but the old coding would have survived it, so let's be equally robust. Discussion: https://postgr.es/m/20c74bc3-d6ca-243d-1bbc-12f17fa4fe9a@gmail.com --- doc/src/sgml/func.sgml | 24 +- src/backend/catalog/pg_collation.c | 19 +- src/backend/commands/collationcmds.c | 333 ++++++++++++++++---------- src/bin/initdb/initdb.c | 12 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_collation_fn.h | 3 +- src/include/catalog/pg_proc.h | 5 +- 7 files changed, 257 insertions(+), 141 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index e073f7b57c..36319222e6 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19711,9 +19711,9 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); pg_import_system_collations - pg_import_system_collations(if_not_exists boolean, schema regnamespace) + pg_import_system_collations(schema regnamespace) - void + integer Import operating system collations @@ -19730,18 +19730,20 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); - pg_import_system_collations populates the system - catalog pg_collation with collations based on all the - locales it finds on the operating system. This is + pg_import_system_collations adds collations to the system + catalog pg_collation based on all the + locales it finds in the operating system. This is what initdb uses; see for more details. If additional locales are installed into the operating system later on, this function - can be run again to add collations for the new locales. In that case, the - parameter if_not_exists should be set to true to - skip over existing collations. The schema - parameter would typically be pg_catalog, but that is - not a requirement. (Collation objects based on locales that are no longer - present on the operating system are never removed by this function.) + can be run again to add collations for the new locales. Locales that + match existing entries in pg_collation will be skipped. + (But collation objects based on locales that are no longer + present in the operating system are not removed by this function.) + The schema parameter would typically + be pg_catalog, but that is not a requirement; + the collations could be installed into some other schema as well. + The function returns the number of new collation objects it created. diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index 6b1b72f245..ca62896ecb 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -37,6 +37,11 @@ * CollationCreate * * Add a new tuple to pg_collation. + * + * if_not_exists: if true, don't fail on duplicate name, just print a notice + * and return InvalidOid. + * quiet: if true, don't fail on duplicate name, just silently return + * InvalidOid (overrides if_not_exists). */ Oid CollationCreate(const char *collname, Oid collnamespace, @@ -45,7 +50,8 @@ CollationCreate(const char *collname, Oid collnamespace, int32 collencoding, const char *collcollate, const char *collctype, const char *collversion, - bool if_not_exists) + bool if_not_exists, + bool quiet) { Relation rel; TupleDesc tupDesc; @@ -77,7 +83,9 @@ CollationCreate(const char *collname, Oid collnamespace, Int32GetDatum(collencoding), ObjectIdGetDatum(collnamespace))) { - if (if_not_exists) + if (quiet) + return InvalidOid; + else if (if_not_exists) { ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_OBJECT), @@ -119,7 +127,12 @@ CollationCreate(const char *collname, Oid collnamespace, Int32GetDatum(-1), ObjectIdGetDatum(collnamespace)))) { - if (if_not_exists) + if (quiet) + { + heap_close(rel, NoLock); + return InvalidOid; + } + else if (if_not_exists) { heap_close(rel, NoLock); ereport(NOTICE, diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 1a62b3e30a..a0b3b23816 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -37,6 +37,14 @@ #include "utils/syscache.h" +typedef struct +{ + char *localename; /* name of locale, as per "locale -a" */ + char *alias; /* shortened alias for same */ + int enc; /* encoding */ +} CollAliasData; + + /* * CREATE COLLATION */ @@ -196,7 +204,8 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e collcollate, collctype, collversion, - if_not_exists); + if_not_exists, + false); /* not quiet */ if (!OidIsValid(newoid)) return InvalidObjectAddress; @@ -344,13 +353,18 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) } +/* will we use "locale -a" in pg_import_system_collations? */ +#if defined(HAVE_LOCALE_T) && !defined(WIN32) +#define READ_LOCALE_A_OUTPUT +#endif + +#ifdef READ_LOCALE_A_OUTPUT /* * "Normalize" a libc locale name, stripping off encoding tags such as * ".utf8" (e.g., "en_US.utf8" -> "en_US", but "br_FR.iso885915@euro" * -> "br_FR@euro"). Return true if a new, different name was * generated. */ -pg_attribute_unused() static bool normalize_libc_locale_name(char *new, const char *old) { @@ -379,6 +393,20 @@ normalize_libc_locale_name(char *new, const char *old) return changed; } +/* + * qsort comparator for CollAliasData items + */ +static int +cmpaliases(const void *a, const void *b) +{ + const CollAliasData *ca = (const CollAliasData *) a; + const CollAliasData *cb = (const CollAliasData *) b; + + /* comparing localename is enough because other fields are derived */ + return strcmp(ca->localename, cb->localename); +} +#endif /* READ_LOCALE_A_OUTPUT */ + #ifdef USE_ICU /* @@ -429,140 +457,190 @@ get_icu_locale_comment(const char *localename) #endif /* USE_ICU */ +/* + * pg_import_system_collations: add known system collations to pg_collation + */ Datum pg_import_system_collations(PG_FUNCTION_ARGS) { - bool if_not_exists = PG_GETARG_BOOL(0); - Oid nspid = PG_GETARG_OID(1); + Oid nspid = PG_GETARG_OID(0); + int ncreated = 0; -#if defined(HAVE_LOCALE_T) && !defined(WIN32) - FILE *locale_a_handle; - char localebuf[NAMEDATALEN]; /* we assume ASCII so this is fine */ - int count = 0; - List *aliaslist = NIL; - List *localelist = NIL; - List *enclist = NIL; - ListCell *lca, - *lcl, - *lce; -#endif + /* silence compiler warning if we have no locale implementation at all */ + (void) nspid; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to import system collations")))); -#if !(defined(HAVE_LOCALE_T) && !defined(WIN32)) && !defined(USE_ICU) - /* silence compiler warnings */ - (void) if_not_exists; - (void) nspid; -#endif - -#if defined(HAVE_LOCALE_T) && !defined(WIN32) - locale_a_handle = OpenPipeStream("locale -a", "r"); - if (locale_a_handle == NULL) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not execute command \"%s\": %m", - "locale -a"))); - - while (fgets(localebuf, sizeof(localebuf), locale_a_handle)) + /* Load collations known to libc, using "locale -a" to enumerate them */ +#ifdef READ_LOCALE_A_OUTPUT { - int i; - size_t len; - int enc; - bool skip; - char alias[NAMEDATALEN]; + FILE *locale_a_handle; + char localebuf[NAMEDATALEN]; /* we assume ASCII so this is fine */ + int nvalid = 0; + Oid collid; + CollAliasData *aliases; + int naliases, + maxaliases, + i; - len = strlen(localebuf); + /* expansible array of aliases */ + maxaliases = 100; + aliases = (CollAliasData *) palloc(maxaliases * sizeof(CollAliasData)); + naliases = 0; - if (len == 0 || localebuf[len - 1] != '\n') + locale_a_handle = OpenPipeStream("locale -a", "r"); + if (locale_a_handle == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not execute command \"%s\": %m", + "locale -a"))); + + while (fgets(localebuf, sizeof(localebuf), locale_a_handle)) { - elog(DEBUG1, "locale name too long, skipped: \"%s\"", localebuf); - continue; - } - localebuf[len - 1] = '\0'; + size_t len; + int enc; + bool skip; + char alias[NAMEDATALEN]; - /* - * Some systems have locale names that don't consist entirely of ASCII - * letters (such as "bokmål" or "français"). This is - * pretty silly, since we need the locale itself to interpret the - * non-ASCII characters. We can't do much with those, so we filter - * them out. - */ - skip = false; - for (i = 0; i < len; i++) - { - if (IS_HIGHBIT_SET(localebuf[i])) + len = strlen(localebuf); + + if (len == 0 || localebuf[len - 1] != '\n') { - skip = true; - break; + elog(DEBUG1, "locale name too long, skipped: \"%s\"", localebuf); + continue; + } + localebuf[len - 1] = '\0'; + + /* + * Some systems have locale names that don't consist entirely of + * ASCII letters (such as "bokmål" or "français"). + * This is pretty silly, since we need the locale itself to + * interpret the non-ASCII characters. We can't do much with + * those, so we filter them out. + */ + skip = false; + for (i = 0; i < len; i++) + { + if (IS_HIGHBIT_SET(localebuf[i])) + { + skip = true; + break; + } + } + if (skip) + { + elog(DEBUG1, "locale name has non-ASCII characters, skipped: \"%s\"", localebuf); + continue; + } + + enc = pg_get_encoding_from_locale(localebuf, false); + if (enc < 0) + { + /* error message printed by pg_get_encoding_from_locale() */ + continue; + } + if (!PG_VALID_BE_ENCODING(enc)) + continue; /* ignore locales for client-only encodings */ + if (enc == PG_SQL_ASCII) + continue; /* C/POSIX are already in the catalog */ + + /* count valid locales found in operating system */ + nvalid++; + + /* + * Create a collation named the same as the locale, but quietly + * doing nothing if it already exists. This is the behavior we + * need even at initdb time, because some versions of "locale -a" + * can report the same locale name more than once. And it's + * convenient for later import runs, too, since you just about + * always want to add on new locales without a lot of chatter + * about existing ones. + */ + collid = CollationCreate(localebuf, nspid, GetUserId(), + COLLPROVIDER_LIBC, enc, + localebuf, localebuf, + get_collation_actual_version(COLLPROVIDER_LIBC, localebuf), + true, true); + if (OidIsValid(collid)) + { + ncreated++; + + /* Must do CCI between inserts to handle duplicates correctly */ + CommandCounterIncrement(); + } + + /* + * Generate aliases such as "en_US" in addition to "en_US.utf8" + * for ease of use. Note that collation names are unique per + * encoding only, so this doesn't clash with "en_US" for LATIN1, + * say. + * + * However, it might conflict with a name we'll see later in the + * "locale -a" output. So save up the aliases and try to add them + * after we've read all the output. + */ + if (normalize_libc_locale_name(alias, localebuf)) + { + if (naliases >= maxaliases) + { + maxaliases *= 2; + aliases = (CollAliasData *) + repalloc(aliases, maxaliases * sizeof(CollAliasData)); + } + aliases[naliases].localename = pstrdup(localebuf); + aliases[naliases].alias = pstrdup(alias); + aliases[naliases].enc = enc; + naliases++; } } - if (skip) - { - elog(DEBUG1, "locale name has non-ASCII characters, skipped: \"%s\"", localebuf); - continue; - } - enc = pg_get_encoding_from_locale(localebuf, false); - if (enc < 0) - { - /* error message printed by pg_get_encoding_from_locale() */ - continue; - } - if (!PG_VALID_BE_ENCODING(enc)) - continue; /* ignore locales for client-only encodings */ - if (enc == PG_SQL_ASCII) - continue; /* C/POSIX are already in the catalog */ - - count++; - - CollationCreate(localebuf, nspid, GetUserId(), COLLPROVIDER_LIBC, enc, - localebuf, localebuf, - get_collation_actual_version(COLLPROVIDER_LIBC, localebuf), - if_not_exists); - - CommandCounterIncrement(); + ClosePipeStream(locale_a_handle); /* - * Generate aliases such as "en_US" in addition to "en_US.utf8" for - * ease of use. Note that collation names are unique per encoding - * only, so this doesn't clash with "en_US" for LATIN1, say. - * - * However, it might conflict with a name we'll see later in the - * "locale -a" output. So save up the aliases and try to add them - * after we've read all the output. + * Before processing the aliases, sort them by locale name. The point + * here is that if "locale -a" gives us multiple locale names with the + * same encoding and base name, say "en_US.utf8" and "en_US.utf-8", we + * want to pick a deterministic one of them. First in ASCII sort + * order is a good enough rule. (Before PG 10, the code corresponding + * to this logic in initdb.c had an additional ordering rule, to + * prefer the locale name exactly matching the alias, if any. We + * don't need to consider that here, because we would have already + * created such a pg_collation entry above, and that one will win.) */ - if (normalize_libc_locale_name(alias, localebuf)) + if (naliases > 1) + qsort((void *) aliases, naliases, sizeof(CollAliasData), cmpaliases); + + /* Now add aliases, ignoring any that match pre-existing entries */ + for (i = 0; i < naliases; i++) { - aliaslist = lappend(aliaslist, pstrdup(alias)); - localelist = lappend(localelist, pstrdup(localebuf)); - enclist = lappend_int(enclist, enc); + char *locale = aliases[i].localename; + char *alias = aliases[i].alias; + int enc = aliases[i].enc; + + collid = CollationCreate(alias, nspid, GetUserId(), + COLLPROVIDER_LIBC, enc, + locale, locale, + get_collation_actual_version(COLLPROVIDER_LIBC, locale), + true, true); + if (OidIsValid(collid)) + { + ncreated++; + + CommandCounterIncrement(); + } } + + /* Give a warning if "locale -a" seems to be malfunctioning */ + if (nvalid == 0) + ereport(WARNING, + (errmsg("no usable system locales were found"))); } +#endif /* READ_LOCALE_A_OUTPUT */ - ClosePipeStream(locale_a_handle); - - /* Now try to add any aliases we created */ - forthree(lca, aliaslist, lcl, localelist, lce, enclist) - { - char *alias = (char *) lfirst(lca); - char *locale = (char *) lfirst(lcl); - int enc = lfirst_int(lce); - - CollationCreate(alias, nspid, GetUserId(), COLLPROVIDER_LIBC, enc, - locale, locale, - get_collation_actual_version(COLLPROVIDER_LIBC, locale), - true); - CommandCounterIncrement(); - } - - if (count == 0) - ereport(WARNING, - (errmsg("no usable system locales were found"))); -#endif /* not HAVE_LOCALE_T && not WIN32 */ - + /* Load collations known to ICU */ #ifdef USE_ICU if (!is_encoding_supported_by_icu(GetDatabaseEncoding())) { @@ -597,13 +675,20 @@ pg_import_system_collations(PG_FUNCTION_ARGS) langtag = get_icu_language_tag(name); collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name; collid = CollationCreate(psprintf("%s-x-icu", langtag), - nspid, GetUserId(), COLLPROVIDER_ICU, -1, + nspid, GetUserId(), + COLLPROVIDER_ICU, -1, collcollate, collcollate, get_collation_actual_version(COLLPROVIDER_ICU, collcollate), - if_not_exists); + true, true); + if (OidIsValid(collid)) + { + ncreated++; - CreateComments(collid, CollationRelationId, 0, - get_icu_locale_comment(name)); + CommandCounterIncrement(); + + CreateComments(collid, CollationRelationId, 0, + get_icu_locale_comment(name)); + } /* * Add keyword variants @@ -624,12 +709,20 @@ pg_import_system_collations(PG_FUNCTION_ARGS) langtag = get_icu_language_tag(localeid); collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid; collid = CollationCreate(psprintf("%s-x-icu", langtag), - nspid, GetUserId(), COLLPROVIDER_ICU, -1, + nspid, GetUserId(), + COLLPROVIDER_ICU, -1, collcollate, collcollate, get_collation_actual_version(COLLPROVIDER_ICU, collcollate), - if_not_exists); - CreateComments(collid, CollationRelationId, 0, - get_icu_locale_comment(localeid)); + true, true); + if (OidIsValid(collid)) + { + ncreated++; + + CommandCounterIncrement(); + + CreateComments(collid, CollationRelationId, 0, + get_icu_locale_comment(localeid)); + } } if (U_FAILURE(status)) ereport(ERROR, @@ -638,7 +731,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) uenum_close(en); } } -#endif +#endif /* USE_ICU */ - PG_RETURN_VOID(); + PG_RETURN_INT32(ncreated); } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index b76eb1eae4..7303bbe892 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -1636,10 +1636,16 @@ setup_description(FILE *cmdfd) static void setup_collation(FILE *cmdfd) { - PG_CMD_PUTS("SELECT pg_import_system_collations(if_not_exists => false, schema => 'pg_catalog');\n\n"); + /* + * Add an SQL-standard name. We don't want to pin this, so it doesn't go + * in pg_collation.h. But add it before reading system collations, so + * that it wins if libc defines a locale named ucs_basic. + */ + PG_CMD_PRINTF3("INSERT INTO pg_collation (collname, collnamespace, collowner, collprovider, collencoding, collcollate, collctype) VALUES ('ucs_basic', 'pg_catalog'::regnamespace, %u, '%c', %d, 'C', 'C');\n\n", + BOOTSTRAP_SUPERUSERID, COLLPROVIDER_LIBC, PG_UTF8); - /* Add an SQL-standard name */ - PG_CMD_PRINTF3("INSERT INTO pg_collation (collname, collnamespace, collowner, collprovider, collencoding, collcollate, collctype) VALUES ('ucs_basic', 'pg_catalog'::regnamespace, %u, '%c', %d, 'C', 'C');\n\n", BOOTSTRAP_SUPERUSERID, COLLPROVIDER_LIBC, PG_UTF8); + /* Now import all collations we can find in the operating system */ + PG_CMD_PUTS("SELECT pg_import_system_collations('pg_catalog');\n\n"); } /* diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index f4d2770e7e..14f916cde1 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201706202 +#define CATALOG_VERSION_NO 201706231 #endif diff --git a/src/include/catalog/pg_collation_fn.h b/src/include/catalog/pg_collation_fn.h index 26b8f0d3d1..0ef31389d5 100644 --- a/src/include/catalog/pg_collation_fn.h +++ b/src/include/catalog/pg_collation_fn.h @@ -20,7 +20,8 @@ extern Oid CollationCreate(const char *collname, Oid collnamespace, int32 collencoding, const char *collcollate, const char *collctype, const char *collversion, - bool if_not_exists); + bool if_not_exists, + bool quiet); extern void RemoveCollationById(Oid collationOid); #endif /* PG_COLLATION_FN_H */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1191b4ab1b..8b33b4e0ea 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5462,11 +5462,12 @@ DESCR("pg_controldata recovery state information as a function"); DATA(insert OID = 3444 ( pg_control_init PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{23,23,23,23,23,23,23,23,23,16,16,23}" "{o,o,o,o,o,o,o,o,o,o,o,o}" "{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float4_pass_by_value,float8_pass_by_value,data_page_checksum_version}" _null_ _null_ pg_control_init _null_ _null_ _null_ )); DESCR("pg_controldata init state information as a function"); -DATA(insert OID = 3445 ( pg_import_system_collations PGNSP PGUID 12 100 0 0 0 f f f f t f v r 2 0 2278 "16 4089" _null_ _null_ "{if_not_exists,schema}" _null_ _null_ pg_import_system_collations _null_ _null_ _null_ )); +/* collation management functions */ +DATA(insert OID = 3445 ( pg_import_system_collations PGNSP PGUID 12 100 0 0 0 f f f f t f v r 1 0 23 "4089" _null_ _null_ _null_ _null_ _null_ pg_import_system_collations _null_ _null_ _null_ )); DESCR("import collations from operating system"); DATA(insert OID = 3448 ( pg_collation_actual_version PGNSP PGUID 12 100 0 0 0 f f f f t f v s 1 0 25 "26" _null_ _null_ _null_ _null_ _null_ pg_collation_actual_version _null_ _null_ _null_ )); -DESCR("import collations from operating system"); +DESCR("get actual version of collation from operating system"); /* system management/monitoring related functions */ DATA(insert OID = 3353 ( pg_ls_logdir PGNSP PGUID 12 10 20 0 0 f f f f t t v s 0 0 2249 "" "{25,20,1184}" "{o,o,o}" "{name,size,modification}" _null_ _null_ pg_ls_logdir _null_ _null_ _null_ ));