Improve script generating unaccent rules

The script now uses the standard Unicode transliterator Latin-ASCII. Author: Leonard Benedetti
2016-03-16 16:47:03 +03:00 · 2016-03-16 16:47:03 +03:00 · 9a206d063c
parent 3aff33aa68
commit 9a206d063c
2 changed files with 761 additions and 55 deletions
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@ -1,20 +1,33 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
 #
 # This script builds unaccent.rules on standard output when given the
-# contents of UnicodeData.txt[1] on standard input.  Optionally includes
-# ligature expansion, if --expand-ligatures is given on the command line.
+# paths to UnicodeData.txt [1] and Latin-ASCII.xml [2] as command line
+# arguments.  By default it also expands ligatures and applies the
+# Unicode CLDR Latin-ASCII transliterator; both can be disabled with the
+# "--no-ligatures-expansion" command line option.
 #
 # The approach is to use the Unicode decomposition data to identify
 # precomposed codepoints that are equivalent to a ligature of several
 # letters, or a base letter with any number of diacritical marks.
-# There is also a small set of special cases for codepoints that we
-# traditionally support even though Unicode doesn't consider them to
-# be ligatures or letters with marks.
 #
-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+# This approach handles most letters with diacritical marks and some
+# ligatures.  However, several characters (notably a majority of
+# ligatures) have no decomposition.  To handle these cases, one can use
+# a standard Unicode transliterator available in the Common Locale Data
+# Repository (CLDR): Latin-ASCII.  This transliterator maps Unicode
+# characters to ASCII-range equivalents.  Unless "--no-ligatures-expansion"
+# is given, the XML file of this transliterator [2] -- passed as a
+# command line argument -- will be parsed and used.
+#
+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+

 import re
+import argparse
 import sys
+import xml.etree.ElementTree as ET

 def print_record(codepoint, letter):
    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
    assert(is_ligature(codepoint, table))
    return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]

-def main(expand_ligatures):
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the code point of the original character and "trg" its substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules of the form "src \u2192 trg ;" (U+2192 is the arrow)
+    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
+        matches = rulePattern.search(rule.text)
+
+        # The regular expression captures four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char. None if group 2 matched.
+        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1 matched.
+        #
+        # Group 3: "trg" text between quotes (quotes excluded). None if group 4 matched.
+        # Group 4: plain, unquoted "trg" text. None if group 3 matched.
+        #
+        # Rules that do not match the pattern are silently skipped.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # "'" and '"' are backslash-escaped in the rule text; unescape them
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the parser of unaccent only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c); isspace() is true only
+            # when every character is whitespace
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Returns the special cases which are not handled by other methods,
+    as a set of tuples (code point, substitute string)"""
+    charactersSet = set()
+
+    # Cyrillic
+    charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO
+    charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO
+
+    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+    charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS
+    charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT
+    charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
+
+    return charactersSet
+
+def main(args):
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    all = []

+    # unordered set to ensure uniqueness
+    charactersSet = set()
+
+    # read file UnicodeData.txt
+    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+
    # read everything we need into memory
-    for line in sys.stdin.readlines():
+    for line in unicodeDataFile:
        fields = line.split(";")
        if len(fields) > 5:
            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@ -89,35 +160,34 @@ def main(expand_ligatures):
        if codepoint.general_category.startswith('L') and \
           len(codepoint.combining_ids) > 1:
            if is_letter_with_marks(codepoint, table):
-                print_record(codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id))
-            elif expand_ligatures and is_ligature(codepoint, table):
-                print_record(codepoint.id,
+                charactersSet.add((codepoint.id,
+                             chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
                             "".join(unichr(combining_codepoint.id)
                                     for combining_codepoint \
-                                     in get_plain_letters(codepoint, table)))
+                                     in get_plain_letters(codepoint, table))))

-    # some special cases
-    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
-    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
-    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
-    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
-    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
-    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
-    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
-    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
-    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
-    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
-    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
-    print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
-    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
-    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
-    if expand_ligatures:
-        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
-        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
-        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
-        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
-        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort for more convenient display
+    charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])

 if __name__ == "__main__":
-    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
+    # Command line interface: both input files are passed as paths.
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+
+    # "--latin-ascii-file" is not declared with required=True because it is
+    # only needed when ligature expansion is enabled, so it is checked here.
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
+        sys.exit(1)
+
+    main(args)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@ -1,9 +1,18 @@
+©	(C)
+«	<<
+	-
+®	(R)
+»	>>
+¼	 1/4
+½	 1/2
+¾	 3/4
 À	A
 Á	A
 Â	A
 Ã	A
 Ä	A
 Å	A
+Æ	AE
 Ç	C
 È	E
 É	E
@ -13,23 +22,29 @@
 Í	I
 Î	I
 Ï	I
+Ð	D
 Ñ	N
 Ò	O
 Ó	O
 Ô	O
 Õ	O
 Ö	O
+×	*
+Ø	O
 Ù	U
 Ú	U
 Û	U
 Ü	U
 Ý	Y
+Þ	TH
+ß	ss
 à	a
 á	a
 â	a
 ã	a
 ä	a
 å	a
+æ	ae
 ç	c
 è	e
 é	e
@ -39,17 +54,21 @@
 í	i
 î	i
 ï	i
+ð	d
 ñ	n
 ò	o
 ó	o
 ô	o
 õ	o
 ö	o
+÷	/
+ø	o
 ù	u
 ú	u
 û	u
 ü	u
 ý	y
+þ	th
 ÿ	y
 Ā	A
 ā	a
@ -67,6 +86,8 @@
 č	c
 Ď	D
 ď	d
+Đ	D
+đ	d
 Ē	E
 ē	e
 Ĕ	E
@ -87,6 +108,8 @@
 ģ	g
 Ĥ	H
 ĥ	h
+Ħ	H
+ħ	h
 Ĩ	I
 ĩ	i
 Ī	I
@ -96,30 +119,41 @@
 Į	I
 į	i
 İ	I
+ı	i
 Ĳ	IJ
 ĳ	ij
 Ĵ	J
 ĵ	j
 Ķ	K
 ķ	k
+ĸ	q
 Ĺ	L
 ĺ	l
 Ļ	L
 ļ	l
 Ľ	L
 ľ	l
+Ŀ	L
+ŀ	l
+Ł	L
+ł	l
 Ń	N
 ń	n
 Ņ	N
 ņ	n
 Ň	N
 ň	n
+ŉ	'n
+Ŋ	N
+ŋ	n
 Ō	O
 ō	o
 Ŏ	O
 ŏ	o
 Ő	O
 ő	o
+Œ	OE
+œ	oe
 Ŕ	R
 ŕ	r
 Ŗ	R
@ -138,6 +172,8 @@
 ţ	t
 Ť	T
 ť	t
+Ŧ	T
+ŧ	t
 Ũ	U
 ũ	u
 Ū	U
@ -161,10 +197,46 @@
 ż	z
 Ž	Z
 ž	z
+ſ	s
+ƀ	b
+Ɓ	B
+Ƃ	B
+ƃ	b
+Ƈ	C
+ƈ	c
+Ɖ	D
+Ɗ	D
+Ƌ	D
+ƌ	d
+Ɛ	E
+Ƒ	F
+ƒ	f
+Ɠ	G
+ƕ	hv
+Ɩ	I
+Ɨ	I
+Ƙ	K
+ƙ	k
+ƚ	l
+Ɲ	N
+ƞ	n
 Ơ	O
 ơ	o
+Ƣ	OI
+ƣ	oi
+Ƥ	P
+ƥ	p
+ƫ	t
+Ƭ	T
+ƭ	t
+Ʈ	T
 Ư	U
 ư	u
+Ʋ	V
+Ƴ	Y
+ƴ	y
+Ƶ	Z
+ƶ	z
 Ǆ	DZ
 ǅ	Dz
 ǆ	dz
@ -182,6 +254,8 @@
 ǒ	o
 Ǔ	U
 ǔ	u
+Ǥ	G
+ǥ	g
 Ǧ	G
 ǧ	g
 Ǩ	K
@ -226,6 +300,9 @@
 ț	t
 Ȟ	H
 ȟ	h
+ȡ	d
+Ȥ	Z
+ȥ	z
 Ȧ	A
 ȧ	a
 Ȩ	E
@ -234,6 +311,128 @@
 ȯ	o
 Ȳ	Y
 ȳ	y
+ȴ	l
+ȵ	n
+ȶ	t
+ȷ	j
+ȸ	db
+ȹ	qp
+Ⱥ	A
+Ȼ	C
+ȼ	c
+Ƚ	L
+Ⱦ	T
+ȿ	s
+ɀ	z
+Ƀ	B
+Ʉ	U
+Ɇ	E
+ɇ	e
+Ɉ	J
+ɉ	j
+Ɍ	R
+ɍ	r
+Ɏ	Y
+ɏ	y
+ɓ	b
+ɕ	c
+ɖ	d
+ɗ	d
+ɛ	e
+ɟ	j
+ɠ	g
+ɡ	g
+ɢ	G
+ɦ	h
+ɧ	h
+ɨ	i
+ɪ	I
+ɫ	l
+ɬ	l
+ɭ	l
+ɱ	m
+ɲ	n
+ɳ	n
+ɴ	N
+ɶ	OE
+ɼ	r
+ɽ	r
+ɾ	r
+ʀ	R
+ʂ	s
+ʈ	t
+ʉ	u
+ʋ	v
+ʏ	Y
+ʐ	z
+ʑ	z
+ʙ	B
+ʛ	G
+ʜ	H
+ʝ	j
+ʟ	L
+ʠ	q
+ʣ	dz
+ʥ	dz
+ʦ	ts
+ʪ	ls
+ʫ	lz
+Ё	Е
+ё	е
+ᴀ	A
+ᴁ	AE
+ᴃ	B
+ᴄ	C
+ᴅ	D
+ᴆ	D
+ᴇ	E
+ᴊ	J
+ᴋ	K
+ᴌ	L
+ᴍ	M
+ᴏ	O
+ᴘ	P
+ᴛ	T
+ᴜ	U
+ᴠ	V
+ᴡ	W
+ᴢ	Z
+ᵫ	ue
+ᵬ	b
+ᵭ	d
+ᵮ	f
+ᵯ	m
+ᵰ	n
+ᵱ	p
+ᵲ	r
+ᵳ	r
+ᵴ	s
+ᵵ	t
+ᵶ	z
+ᵺ	th
+ᵻ	I
+ᵽ	p
+ᵾ	U
+ᶀ	b
+ᶁ	d
+ᶂ	f
+ᶃ	g
+ᶄ	k
+ᶅ	l
+ᶆ	m
+ᶇ	n
+ᶈ	p
+ᶉ	r
+ᶊ	s
+ᶌ	v
+ᶍ	x
+ᶎ	z
+ᶏ	a
+ᶑ	d
+ᶒ	e
+ᶓ	e
+ᶖ	i
+ᶙ	u
 Ḁ	A
 ḁ	a
 Ḃ	B
@ -356,6 +555,10 @@
 ẗ	t
 ẘ	w
 ẙ	y
+ẚ	a
+ẜ	s
+ẝ	s
+ẞ	SS
 Ạ	A
 ạ	a
 Ả	A
@ -386,28 +589,461 @@
 ỷ	y
 Ỹ	Y
 ỹ	y
+Ỻ	LL
+ỻ	ll
+Ỽ	V
+ỽ	v
+Ỿ	Y
+ỿ	y
+‐	-
+‑	-
+‒	-
+–	-
+—	-
+―	-
+‖	||
+‘	'
+’	'
+‚	,
+‛	'
+“	"
+”	"
+„	,,
+‟	"
+․	.
+‥	..
+…	...
+′	'
+″	"
+‹	<
+›	>
+‼	!!
+⁄	/
+⁅	[
+⁆	]
+⁇	??
+⁈	?!
+⁉	!?
+⁎	*
+₠	CE
+₢	Cr
+₣	Fr.
+₤	L.
+₧	Pts
+₹	Rs
+₺	TL
+℀	a/c
+℁	a/s
+ℂ	C
+℃	°C
+℅	c/o
+℆	c/u
+℉	°F
+ℊ	g
+ℋ	H
+ℌ	x
+ℍ	H
+ℎ	h
+ℐ	I
+ℑ	I
+ℒ	L
+ℓ	l
+ℕ	N
+№	No
+℗	(P)
+ℙ	P
+ℚ	Q
+ℛ	R
+ℜ	R
+ℝ	R
+℞	Rx
+℡	TEL
+ℤ	Z
+ℨ	Z
+ℬ	B
+ℭ	C
+ℯ	e
+ℰ	E
+ℱ	F
+ℳ	M
+ℴ	o
+ℹ	i
+℻	FAX
+ⅅ	D
+ⅆ	d
+ⅇ	e
+ⅈ	i
+ⅉ	j
+⅓	 1/3
+⅔	 2/3
+⅕	 1/5
+⅖	 2/5
+⅗	 3/5
+⅘	 4/5
+⅙	 1/6
+⅚	 5/6
+⅛	 1/8
+⅜	 3/8
+⅝	 5/8
+⅞	 7/8
+⅟	 1/
+Ⅰ	I
+Ⅱ	II
+Ⅲ	III
+Ⅳ	IV
+Ⅴ	V
+Ⅵ	VI
+Ⅶ	VII
+Ⅷ	VIII
+Ⅸ	IX
+Ⅹ	X
+Ⅺ	XI
+Ⅻ	XII
+Ⅼ	L
+Ⅽ	C
+Ⅾ	D
+Ⅿ	M
+ⅰ	i
+ⅱ	ii
+ⅲ	iii
+ⅳ	iv
+ⅴ	v
+ⅵ	vi
+ⅶ	vii
+ⅷ	viii
+ⅸ	ix
+ⅹ	x
+ⅺ	xi
+ⅻ	xii
+ⅼ	l
+ⅽ	c
+ⅾ	d
+ⅿ	m
+−	-
+∕	/
+∖	\
+∣	|
+∥	||
+≪	<<
+≫	>>
+⑴	(1)
+⑵	(2)
+⑶	(3)
+⑷	(4)
+⑸	(5)
+⑹	(6)
+⑺	(7)
+⑻	(8)
+⑼	(9)
+⑽	(10)
+⑾	(11)
+⑿	(12)
+⒀	(13)
+⒁	(14)
+⒂	(15)
+⒃	(16)
+⒄	(17)
+⒅	(18)
+⒆	(19)
+⒇	(20)
+⒈	1.
+⒉	2.
+⒊	3.
+⒋	4.
+⒌	5.
+⒍	6.
+⒎	7.
+⒏	8.
+⒐	9.
+⒑	10.
+⒒	11.
+⒓	12.
+⒔	13.
+⒕	14.
+⒖	15.
+⒗	16.
+⒘	17.
+⒙	18.
+⒚	19.
+⒛	20.
+⒜	(a)
+⒝	(b)
+⒞	(c)
+⒟	(d)
+⒠	(e)
+⒡	(f)
+⒢	(g)
+⒣	(h)
+⒤	(i)
+⒥	(j)
+⒦	(k)
+⒧	(l)
+⒨	(m)
+⒩	(n)
+⒪	(o)
+⒫	(p)
+⒬	(q)
+⒭	(r)
+⒮	(s)
+⒯	(t)
+⒰	(u)
+⒱	(v)
+⒲	(w)
+⒳	(x)
+⒴	(y)
+⒵	(z)
+⦅	((
+⦆	))
+⩴	::=
+⩵	==
+⩶	===
+、	,
+。	.
+〇	0
+〈	<
+〉	>
+《	<<
+》	>>
+〔	[
+〕	]
+〘	[
+〙	]
+〚	[
+〛	]
+〝	"
+〞	"
+㍱	hPa
+㍲	da
+㍳	AU
+㍴	bar
+㍵	oV
+㍶	pc
+㍷	dm
+㍺	IU
+㎀	pA
+㎁	nA
+㎃	mA
+㎄	kA
+㎅	KB
+㎆	MB
+㎇	GB
+㎈	cal
+㎉	kcal
+㎊	pF
+㎋	nF
+㎎	mg
+㎏	kg
+㎐	Hz
+㎑	kHz
+㎒	MHz
+㎓	GHz
+㎔	THz
+㎙	fm
+㎚	nm
+㎜	mm
+㎝	cm
+㎞	km
+㎧	m/s
+㎩	Pa
+㎪	kPa
+㎫	MPa
+㎬	GPa
+㎭	rad
+㎮	rad/s
+㎰	ps
+㎱	ns
+㎳	ms
+㎴	pV
+㎵	nV
+㎷	mV
+㎸	kV
+㎹	MV
+㎺	pW
+㎻	nW
+㎽	mW
+㎾	kW
+㎿	MW
+㏂	a.m.
+㏃	Bq
+㏄	cc
+㏅	cd
+㏆	C/kg
+㏇	Co.
+㏈	dB
+㏉	Gy
+㏊	ha
+㏋	HP
+㏌	in
+㏍	KK
+㏎	KM
+㏏	kt
+㏐	lm
+㏑	ln
+㏒	log
+㏓	lx
+㏔	mb
+㏕	mil
+㏖	mol
+㏗	pH
+㏘	p.m.
+㏙	PPM
+㏚	PR
+㏛	sr
+㏜	Sv
+㏝	Wb
+㏞	V/m
+㏟	A/m
 ﬀ	ff
 ﬁ	fi
 ﬂ	fl
 ﬃ	ffi
 ﬄ	ffl
+ﬅ	st
 ﬆ	st
-Ø	O
-ø	o
-Đ	D
-đ	d
-ı	i
-Ħ	H
-ħ	h
-Ł	L
-ł	l
-ŉ	'n
-Ŧ	T
-ŧ	t
-Ё	Е
-ё	е
-Æ	AE
-ß	ss
-æ	ae
-Œ	OE
-œ	oe
+︐	,
+︑	,
+︒	.
+︓	:
+︔	;
+︕	!
+︖	?
+︙	...
+︰	..
+︱	-
+︲	-
+︵	(
+︶	)
+︷	{
+︸	}
+︹	[
+︺	]
+︽	<<
+︾	>>
+︿	<
+﹀	>
+﹇	[
+﹈	]
+﹐	,
+﹑	,
+﹒	.
+﹔	;
+﹕	:
+﹖	?
+﹗	!
+﹘	-
+﹙	(
+﹚	)
+﹛	{
+﹜	}
+﹝	[
+﹞	]
+﹟	#
+﹠	&
+﹡	*
+﹢	+
+﹣	-
+﹤	<
+﹥	>
+﹦	=
+﹨	\
+﹩	$
+﹪	%
+﹫	@
+！	!
+＂	"
+＃	#
+＄	$
+％	%
+＆	&
+＇	'
+（	(
+）	)
+＊	*
+＋	+
+，	,
+－	-
+．	.
+／	/
+０	0
+１	1
+２	2
+３	3
+４	4
+５	5
+６	6
+７	7
+８	8
+９	9
+：	:
+；	;
+＜	<
+＝	=
+＞	>
+？	?
+＠	@
+Ａ	A
+Ｂ	B
+Ｃ	C
+Ｄ	D
+Ｅ	E
+Ｆ	F
+Ｇ	G
+Ｈ	H
+Ｉ	I
+Ｊ	J
+Ｋ	K
+Ｌ	L
+Ｍ	M
+Ｎ	N
+Ｏ	O
+Ｐ	P
+Ｑ	Q
+Ｒ	R
+Ｓ	S
+Ｔ	T
+Ｕ	U
+Ｖ	V
+Ｗ	W
+Ｘ	X
+Ｙ	Y
+Ｚ	Z
+［	[
+＼	\
+］	]
+＾	^
+＿	_
+｀	`
+ａ	a
+ｂ	b
+ｃ	c
+ｄ	d
+ｅ	e
+ｆ	f
+ｇ	g
+ｈ	h
+ｉ	i
+ｊ	j
+ｋ	k
+ｌ	l
+ｍ	m
+ｎ	n
+ｏ	o
+ｐ	p
+ｑ	q
+ｒ	r
+ｓ	s
+ｔ	t
+ｕ	u
+ｖ	v
+ｗ	w
+ｘ	x
+ｙ	y
+ｚ	z
+｛	{
+｜	|
+｝	}
+～	~
+｟	((
+｠	))
+｡	.
+､	,