From ddf590b8115212ea061f9428f20f4c36d8e25e62 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 9 Mar 2022 10:51:41 +0100 Subject: [PATCH] pycodestyle (PEP 8) cleanup in Python scripts These are mainly whitespace changes. I didn't fix "E501 line too long", which would require more significant surgery. --- contrib/unaccent/generate_unaccent_rules.py | 54 +++++++++++++-------- src/test/locale/sort-test.py | 10 ++-- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index bc667eaf15..c405e231b3 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -38,10 +38,10 @@ sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer) # For now we are being conservative by including only Latin and Greek. This # could be extended in future based on feedback from people with relevant # language knowledge. -PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case - (ord('A'), ord('Z')), # Latin upper case - (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA - (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA +PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case + (ord('A'), ord('Z')), # Latin upper case + (0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA + (0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA # Combining marks follow a "base" character, and result in a composite # character. Example: "U&'A\0300'"produces "À".There are three types of @@ -51,9 +51,10 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case # https://en.wikipedia.org/wiki/Combining_character # https://www.unicode.org/charts/PDF/U0300.pdf # https://www.unicode.org/charts/PDF/U20D0.pdf -COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA - (0x20dd, 0x20E0), # Me: Symbols - (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle +COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA + (0x20dd, 0x20E0), # Me: Symbols + (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle + def print_record(codepoint, letter): if letter: @@ -63,12 +64,14 @@ def print_record(codepoint, letter): print(output) + class Codepoint: def __init__(self, id, general_category, combining_ids): self.id = id self.general_category = general_category self.combining_ids = combining_ids + def is_mark_to_remove(codepoint): """Return true if this is a combining mark to remove.""" if not is_mark(codepoint): @@ -79,17 +82,20 @@ def is_mark_to_remove(codepoint): return True return False + def is_plain_letter(codepoint): """Return true if codepoint represents a "plain letter".""" for begin, end in PLAIN_LETTER_RANGES: - if codepoint.id >= begin and codepoint.id <= end: - return True + if codepoint.id >= begin and codepoint.id <= end: + return True return False + def is_mark(codepoint): """Returns true for diacritical marks (combining codepoints).""" return codepoint.general_category in ("Mn", "Me", "Mc") + def is_letter_with_marks(codepoint, table): """Returns true for letters combined with one or more marks.""" # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values @@ -105,16 +111,18 @@ def is_letter_with_marks(codepoint, table): # Check if the base letter of this letter has marks. codepoint_base = codepoint.combining_ids[0] - if (is_plain_letter(table[codepoint_base]) is False and \ - is_letter_with_marks(table[codepoint_base], table) is False): + if is_plain_letter(table[codepoint_base]) is False and \ + is_letter_with_marks(table[codepoint_base], table) is False: return False return True + def is_letter(codepoint, table): """Return true for letter with or without diacritical marks.""" return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table) + def get_plain_letter(codepoint, table): """Return the base codepoint without marks. If this codepoint has more than one combining character, do a recursive lookup on the table to @@ -133,15 +141,18 @@ def get_plain_letter(codepoint, table): # Should not come here assert(False) + def is_ligature(codepoint, table): """Return true for letters combined with letters.""" return all(is_letter(table[i], table) for i in codepoint.combining_ids) + def get_plain_letters(codepoint, table): """Return a list of plain letters from a ligature.""" assert(is_ligature(codepoint, table)) return [get_plain_letter(table[id], table) for id in codepoint.combining_ids] + def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): """Parse the XML file and return a set of tuples (src, trg), where "src" is the original character and "trg" the substitute.""" @@ -189,21 +200,23 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath): return charactersSet + def special_cases(): """Returns the special cases which are not handled by other methods""" charactersSet = set() # Cyrillic - charactersSet.add((0x0401, "\u0415")) # CYRILLIC CAPITAL LETTER IO - charactersSet.add((0x0451, "\u0435")) # CYRILLIC SMALL LETTER IO + charactersSet.add((0x0401, "\u0415")) # CYRILLIC CAPITAL LETTER IO + charactersSet.add((0x0451, "\u0435")) # CYRILLIC SMALL LETTER IO # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F) - charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS - charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT - charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT + charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS + charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT + charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT return charactersSet + def main(args): # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings decomposition_type_pattern = re.compile(" *<[^>]*> *") @@ -238,12 +251,12 @@ def main(args): len(codepoint.combining_ids) > 1: if is_letter_with_marks(codepoint, table): charactersSet.add((codepoint.id, - chr(get_plain_letter(codepoint, table).id))) + chr(get_plain_letter(codepoint, table).id))) elif args.noLigaturesExpansion is False and is_ligature(codepoint, table): charactersSet.add((codepoint.id, - "".join(chr(combining_codepoint.id) - for combining_codepoint \ - in get_plain_letters(codepoint, table)))) + "".join(chr(combining_codepoint.id) + for combining_codepoint + in get_plain_letters(codepoint, table)))) elif is_mark_to_remove(codepoint): charactersSet.add((codepoint.id, None)) @@ -258,6 +271,7 @@ def main(args): for characterPair in charactersList: print_record(characterPair[0], characterPair[1]) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.') parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath') diff --git a/src/test/locale/sort-test.py b/src/test/locale/sort-test.py index 53019038ab..21d6e78eb5 100755 --- a/src/test/locale/sort-test.py +++ b/src/test/locale/sort-test.py @@ -1,18 +1,20 @@ #! /usr/bin/env python -import sys, string, locale +import locale +import sys + locale.setlocale(locale.LC_ALL, "") if len(sys.argv) != 2: - sys.stderr.write("Usage: sort.py filename\n") - sys.exit(1) + sys.stderr.write("Usage: sort.py filename\n") + sys.exit(1) infile = open(sys.argv[1], 'r') list = infile.readlines() infile.close() for i in range(0, len(list)): - list[i] = list[i][:-1] # chop! + list[i] = list[i][:-1] # chop! list.sort(key=locale.strxfrm) print('\n'.join(list))