Slightly simplify the special rules used to generate unaccent.rules

As noted by Thomas Munro, CLDR 36 has added SOUND RECORDING COPYRIGHT
(U+2117), and we use CLDR 41, so this can be removed from the set of
special cases.

The set of regression tests is expanded for degree signs, which are two
of the special cases, and for a fancy case with U+210C in Latin-ASCII.xml
that we discovered when diving into what could be done for Cyrillic
characters (this last part is material for a future patch, not tackled
yet).

While on it, some of the assertions of generate_unaccent_rules.py are
expanded to report the codepoint on which a failure is found, something
useful for debugging.

Extracted from a larger patch by the same author.

Author: Przemysław Sztoch
Discussion: https://postgr.es/m/8478da0d-3b61-d24f-80b4-ce2f5e971c60@sztoch.pl
This commit is contained in:
Michael Paquier 2022-07-05 16:17:51 +09:00
parent 84ad713cf8
commit e3dd7c06e6
3 changed files with 56 additions and 3 deletions

View File

@ -37,6 +37,18 @@ SELECT unaccent('À'); -- Remove combining diacritical 0x0300
A
(1 row)
SELECT unaccent('℃℉'); -- degree signs
unaccent
----------
°C°F
(1 row)
SELECT unaccent('℗'); -- sound recording copyright
unaccent
----------
(P)
(1 row)
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
@ -67,6 +79,18 @@ SELECT unaccent('unaccent', 'À');
A
(1 row)
SELECT unaccent('unaccent', '℃℉');
unaccent
----------
°C°F
(1 row)
SELECT unaccent('unaccent', '℗');
unaccent
----------
(P)
(1 row)
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
@ -97,3 +121,23 @@ SELECT ts_lexize('unaccent', 'À');
{A}
(1 row)
SELECT ts_lexize('unaccent', '℃℉');
ts_lexize
-----------
{°C°F}
(1 row)
SELECT ts_lexize('unaccent', '℗');
ts_lexize
-----------
{(P)}
(1 row)
-- Controversial case. Black-Letter Capital H (U+210C) is translated by
-- Latin-ASCII.xml as 'x', but it should be 'H'.
SELECT unaccent('ℌ');
unaccent
----------
x
(1 row)

View File

@ -134,12 +134,12 @@ def get_plain_letter(codepoint, table):
return table[codepoint.combining_ids[0]]
# Should not come here
assert(False)
assert False, 'Codepoint U+%0.2X' % codepoint.id
elif is_plain_letter(codepoint):
return codepoint
# Should not come here
assert(False)
assert False, 'Codepoint U+%0.2X' % codepoint.id
def is_ligature(codepoint, table):
@ -212,7 +212,6 @@ def special_cases():
# Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS
charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT
charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
return charactersSet

View File

@ -10,15 +10,25 @@ SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜');
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
SELECT unaccent('℃℉'); -- degree signs
SELECT unaccent('℗'); -- sound recording copyright
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜');
SELECT unaccent('unaccent', 'À');
SELECT unaccent('unaccent', '℃℉');
SELECT unaccent('unaccent', '℗');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'À');
SELECT ts_lexize('unaccent', '℃℉');
SELECT ts_lexize('unaccent', '℗');
-- Controversial case. Black-Letter Capital H (U+210C) is translated by
-- Latin-ASCII.xml as 'x', but it should be 'H'.
SELECT unaccent('ℌ');