Extend the default rules file for contrib/unaccent with Vietnamese letters.

Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter.  The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
This commit is contained in:
Tom Lane 2017-08-16 16:51:56 -04:00
parent 2b74303637
commit ec0a69e49b
2 changed files with 145 additions and 8 deletions

View File

@ -48,24 +48,47 @@ def is_mark(codepoint):
return codepoint.general_category in ("Mn", "Me", "Mc") return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks(codepoint, table):
    """Return True for a letter combined with one or more marks.

    The base of the composition may itself be a letter with marks, in
    which case the check recurses through ``table`` on that base.

    codepoint -- object exposing ``combining_ids``, a sequence of ids
                 (presumably its decomposition; confirm against the
                 code that builds the table).
    table     -- mapping from id to codepoint object.

    Returns False when the codepoint has zero or one combining id, when
    none of the ids after the first refers to a mark, or when the base is
    neither a plain letter nor a letter with marks.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

    combining_ids = codepoint.combining_ids

    # Letter may have no combining characters, in which case it has
    # no marks.
    if len(combining_ids) == 1:
        return False

    # A letter without diacritical marks has none of them.  An empty
    # tail also ends up here, because any() of nothing is False.
    if not any(is_mark(table[i]) for i in combining_ids[1:]):
        return False

    # Check if the base letter of this letter has marks: the base must
    # be a plain letter or, recursively, a letter with marks.
    base = table[combining_ids[0]]
    if not is_plain_letter(base) and not is_letter_with_marks(base, table):
        return False

    return True
def is_letter(codepoint, table):
    """Tell whether codepoint is a letter, with or without diacritics.

    The result is that of is_plain_letter(codepoint) when it is truthy,
    otherwise that of is_letter_with_marks(codepoint, table).
    """
    plain = is_plain_letter(codepoint)
    return plain or is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    If the base of this codepoint itself has more than one combining
    character, do a recursive lookup on the table to find out its plain
    base letter.

    codepoint -- object exposing ``combining_ids``.
    table     -- mapping from id to codepoint object.

    Raises AssertionError when the codepoint is neither a plain letter
    nor a letter with marks, or when its base cannot be resolved to a
    plain letter.  The exception is raised explicitly instead of through
    an ``assert`` statement so that the check survives ``python -O``.
    """
    if is_letter_with_marks(codepoint, table):
        base = table[codepoint.combining_ids[0]]
        if len(base.combining_ids) > 1:
            # The base is itself composed; keep unwrapping it.
            return get_plain_letter(base, table)
        if is_plain_letter(base):
            return base

        # Should not come here
        raise AssertionError("base of a letter with marks is not a letter")

    if is_plain_letter(codepoint):
        return codepoint

    # Should not come here
    raise AssertionError("codepoint is not a letter")
def is_ligature(codepoint, table): def is_ligature(codepoint, table):
"""Return true for letters combined with letters.""" """Return true for letters combined with letters."""

View File

@ -254,6 +254,18 @@
ǒ o ǒ o
Ǔ U Ǔ U
ǔ u ǔ u
Ǖ U
ǖ u
Ǘ U
ǘ u
Ǚ U
ǚ u
Ǜ U
ǜ u
Ǟ A
ǟ a
Ǡ A
ǡ a
Ǥ G Ǥ G
ǥ g ǥ g
Ǧ G Ǧ G
@ -262,6 +274,8 @@
ǩ k ǩ k
Ǫ O Ǫ O
ǫ o ǫ o
Ǭ O
ǭ o
ǰ j ǰ j
DZ DZ DZ DZ
Dz Dz Dz Dz
@ -270,6 +284,8 @@
ǵ g ǵ g
Ǹ N Ǹ N
ǹ n ǹ n
Ǻ A
ǻ a
Ȁ A Ȁ A
ȁ a ȁ a
Ȃ A Ȃ A
@ -307,8 +323,14 @@
ȧ a ȧ a
Ȩ E Ȩ E
ȩ e ȩ e
Ȫ O
ȫ o
Ȭ O
ȭ o
Ȯ O Ȯ O
ȯ o ȯ o
Ȱ O
ȱ o
Ȳ Y Ȳ Y
ȳ y ȳ y
ȴ l ȴ l
@ -441,6 +463,8 @@
ḅ b ḅ b
Ḇ B Ḇ B
ḇ b ḇ b
Ḉ C
ḉ c
Ḋ D Ḋ D
ḋ d ḋ d
Ḍ D Ḍ D
@ -451,10 +475,16 @@
ḑ d ḑ d
Ḓ D Ḓ D
ḓ d ḓ d
Ḕ E
ḕ e
Ḗ E
ḗ e
Ḙ E Ḙ E
ḙ e ḙ e
Ḛ E Ḛ E
ḛ e ḛ e
Ḝ E
ḝ e
Ḟ F Ḟ F
ḟ f ḟ f
Ḡ G Ḡ G
@ -471,6 +501,8 @@
ḫ h ḫ h
Ḭ I Ḭ I
ḭ i ḭ i
Ḯ I
ḯ i
Ḱ K Ḱ K
ḱ k ḱ k
Ḳ K Ḳ K
@ -479,6 +511,8 @@
ḵ k ḵ k
Ḷ L Ḷ L
ḷ l ḷ l
Ḹ L
ḹ l
Ḻ L Ḻ L
ḻ l ḻ l
Ḽ L Ḽ L
@ -497,6 +531,14 @@
ṉ n ṉ n
Ṋ N Ṋ N
ṋ n ṋ n
Ṍ O
ṍ o
Ṏ O
ṏ o
Ṑ O
ṑ o
Ṓ O
ṓ o
Ṕ P Ṕ P
ṕ p ṕ p
Ṗ P Ṗ P
@ -505,12 +547,20 @@
ṙ r ṙ r
Ṛ R Ṛ R
ṛ r ṛ r
Ṝ R
ṝ r
Ṟ R Ṟ R
ṟ r ṟ r
Ṡ S Ṡ S
ṡ s ṡ s
Ṣ S Ṣ S
ṣ s ṣ s
Ṥ S
ṥ s
Ṧ S
ṧ s
Ṩ S
ṩ s
Ṫ T Ṫ T
ṫ t ṫ t
Ṭ T Ṭ T
@ -525,6 +575,10 @@
ṵ u ṵ u
Ṷ U Ṷ U
ṷ u ṷ u
Ṹ U
ṹ u
Ṻ U
ṻ u
Ṽ V Ṽ V
ṽ v ṽ v
Ṿ V Ṿ V
@ -563,12 +617,42 @@
ạ a ạ a
Ả A Ả A
ả a ả a
Ấ A
ấ a
Ầ A
ầ a
Ẩ A
ẩ a
Ẫ A
ẫ a
Ậ A
ậ a
Ắ A
ắ a
Ằ A
ằ a
Ẳ A
ẳ a
Ẵ A
ẵ a
Ặ A
ặ a
Ẹ E Ẹ E
ẹ e ẹ e
Ẻ E Ẻ E
ẻ e ẻ e
Ẽ E Ẽ E
ẽ e ẽ e
Ế E
ế e
Ề E
ề e
Ể E
ể e
Ễ E
ễ e
Ệ E
ệ e
Ỉ I Ỉ I
ỉ i ỉ i
Ị I Ị I
@ -577,10 +661,40 @@
ọ o ọ o
Ỏ O Ỏ O
ỏ o ỏ o
Ố O
ố o
Ồ O
ồ o
Ổ O
ổ o
Ỗ O
ỗ o
Ộ O
ộ o
Ớ O
ớ o
Ờ O
ờ o
Ở O
ở o
Ỡ O
ỡ o
Ợ O
ợ o
Ụ U Ụ U
ụ u ụ u
Ủ U Ủ U
ủ u ủ u
Ứ U
ứ u
Ừ U
ừ u
Ử U
ử u
Ữ U
ữ u
Ự U
ự u
Ỳ Y Ỳ Y
ỳ y ỳ y
Ỵ Y Ỵ Y