Extend the default rules file for contrib/unaccent with Vietnamese letters.

Improve generate_unaccent_rules.py to handle composed characters whose base
is another composed character rather than a plain letter.  The net effect
of this is to add a bunch of multi-accented Vietnamese characters to
unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming
by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
This commit is contained in:
Tom Lane 2017-08-16 16:51:56 -04:00
parent 2b74303637
commit ec0a69e49b
2 changed files with 145 additions and 8 deletions

View File

@ -48,24 +48,47 @@ def is_mark(codepoint):
return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks(codepoint, table):
    """Return True for letters combined with one or more marks.

    The base of the decomposition may be a plain letter or, recursively,
    another letter with marks, so multi-accented characters are accepted.

    Args:
        codepoint: entry whose ``combining_ids`` lists the ids of its
            decomposition, base first.
        table: lookup from id to codepoint entry, used to resolve the
            members of ``combining_ids``.

    Returns:
        True if the decomposition has at least one mark and its base is a
        letter (with or without marks), False otherwise.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

    # Letter may have no combining characters, in which case it has
    # no marks.
    if len(codepoint.combining_ids) == 1:
        return False

    # A letter without diacritical marks has none of them.
    if not any(is_mark(table[i]) for i in codepoint.combining_ids[1:]):
        return False

    # Check if the base letter of this letter has marks: the base must be
    # either a plain letter or itself a letter with marks.
    base = table[codepoint.combining_ids[0]]
    if not is_plain_letter(base) and not is_letter_with_marks(base, table):
        return False

    return True
def is_letter(codepoint, table):
    """Return True for a letter with or without diacritical marks.

    Args:
        codepoint: entry to classify.
        table: lookup from id to codepoint entry, passed through to
            is_letter_with_marks() to resolve the decomposition.
    """
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    If the base of this codepoint is itself a composed character (it has
    more than one combining id), do a recursive lookup on the table to
    find its plain base letter.

    Args:
        codepoint: entry to reduce; expected to be a plain letter or a
            letter with marks.
        table: lookup from id to codepoint entry.

    Returns:
        The table entry (or ``codepoint`` itself) for the plain letter.

    Raises:
        AssertionError: if ``codepoint`` is neither a plain letter nor a
            letter with marks, or its base cannot be reduced to a plain
            letter.
    """
    if is_letter_with_marks(codepoint, table):
        base = table[codepoint.combining_ids[0]]
        if len(base.combining_ids) > 1:
            # The base is itself composed: keep stripping marks.
            return get_plain_letter(base, table)
        if is_plain_letter(base):
            return base

        # Should not come here.  Raised explicitly rather than with a bare
        # assert so the check is not stripped when running under -O.
        raise AssertionError("base of composed character is not a letter")

    if is_plain_letter(codepoint):
        return codepoint

    # Should not come here
    raise AssertionError("codepoint is not a letter")
def is_ligature(codepoint, table):
"""Return true for letters combined with letters."""

View File

@ -254,6 +254,18 @@
ǒ o
Ǔ U
ǔ u
Ǖ U
ǖ u
Ǘ U
ǘ u
Ǚ U
ǚ u
Ǜ U
ǜ u
Ǟ A
ǟ a
Ǡ A
ǡ a
Ǥ G
ǥ g
Ǧ G
@ -262,6 +274,8 @@
ǩ k
Ǫ O
ǫ o
Ǭ O
ǭ o
ǰ j
Ǳ DZ
ǲ Dz
@ -270,6 +284,8 @@
ǵ g
Ǹ N
ǹ n
Ǻ A
ǻ a
Ȁ A
ȁ a
Ȃ A
@ -307,8 +323,14 @@
ȧ a
Ȩ E
ȩ e
Ȫ O
ȫ o
Ȭ O
ȭ o
Ȯ O
ȯ o
Ȱ O
ȱ o
Ȳ Y
ȳ y
ȴ l
@ -441,6 +463,8 @@
ḅ b
Ḇ B
ḇ b
Ḉ C
ḉ c
Ḋ D
ḋ d
Ḍ D
@ -451,10 +475,16 @@
ḑ d
Ḓ D
ḓ d
Ḕ E
ḕ e
Ḗ E
ḗ e
Ḙ E
ḙ e
Ḛ E
ḛ e
Ḝ E
ḝ e
Ḟ F
ḟ f
Ḡ G
@ -471,6 +501,8 @@
ḫ h
Ḭ I
ḭ i
Ḯ I
ḯ i
Ḱ K
ḱ k
Ḳ K
@ -479,6 +511,8 @@
ḵ k
Ḷ L
ḷ l
Ḹ L
ḹ l
Ḻ L
ḻ l
Ḽ L
@ -497,6 +531,14 @@
ṉ n
Ṋ N
ṋ n
Ṍ O
ṍ o
Ṏ O
ṏ o
Ṑ O
ṑ o
Ṓ O
ṓ o
Ṕ P
ṕ p
Ṗ P
@ -505,12 +547,20 @@
ṙ r
Ṛ R
ṛ r
Ṝ R
ṝ r
Ṟ R
ṟ r
Ṡ S
ṡ s
Ṣ S
ṣ s
Ṥ S
ṥ s
Ṧ S
ṧ s
Ṩ S
ṩ s
Ṫ T
ṫ t
Ṭ T
@ -525,6 +575,10 @@
ṵ u
Ṷ U
ṷ u
Ṹ U
ṹ u
Ṻ U
ṻ u
Ṽ V
ṽ v
Ṿ V
@ -563,12 +617,42 @@
ạ a
Ả A
ả a
Ấ A
ấ a
Ầ A
ầ a
Ẩ A
ẩ a
Ẫ A
ẫ a
Ậ A
ậ a
Ắ A
ắ a
Ằ A
ằ a
Ẳ A
ẳ a
Ẵ A
ẵ a
Ặ A
ặ a
Ẹ E
ẹ e
Ẻ E
ẻ e
Ẽ E
ẽ e
Ế E
ế e
Ề E
ề e
Ể E
ể e
Ễ E
ễ e
Ệ E
ệ e
Ỉ I
ỉ i
Ị I
@ -577,10 +661,40 @@
ọ o
Ỏ O
ỏ o
Ố O
ố o
Ồ O
ồ o
Ổ O
ổ o
Ỗ O
ỗ o
Ộ O
ộ o
Ớ O
ớ o
Ờ O
ờ o
Ở O
ở o
Ỡ O
ỡ o
Ợ O
ợ o
Ụ U
ụ u
Ủ U
ủ u
Ứ U
ứ u
Ừ U
ừ u
Ử U
ử u
Ữ U
ữ u
Ự U
ự u
Ỳ Y
ỳ y
Ỵ Y