mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-09-28 16:51:48 +02:00
Update unicode.org URLs
Use https, consistent host name, remove references to ftp. Also update the URLs for CLDR, which has moved from Trac to GitHub.
This commit is contained in:
parent
9abb2bfc04
commit
bdb839cbde
@ -24,9 +24,9 @@
|
|||||||
# Latin-ASCII.xml, the latest data sets released can be browsed directly
|
# Latin-ASCII.xml, the latest data sets released can be browsed directly
|
||||||
# via [3]. Note that this script is compatible with at least release 29.
|
# via [3]. Note that this script is compatible with at least release 29.
|
||||||
#
|
#
|
||||||
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
|
||||||
# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
|
# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
|
||||||
# [3] https://unicode.org/cldr/trac/browser/tags
|
# [3] https://github.com/unicode-org/cldr/tags
|
||||||
|
|
||||||
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
|
||||||
# The approach is to be Python3 compatible with Python2 "backports".
|
# The approach is to be Python3 compatible with Python2 "backports".
|
||||||
@ -113,7 +113,7 @@ def is_mark(codepoint):
|
|||||||
|
|
||||||
def is_letter_with_marks(codepoint, table):
|
def is_letter_with_marks(codepoint, table):
|
||||||
"""Returns true for letters combined with one or more marks."""
|
"""Returns true for letters combined with one or more marks."""
|
||||||
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
|
# See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
|
||||||
|
|
||||||
# Letter may have no combining characters, in which case it has
|
# Letter may have no combining characters, in which case it has
|
||||||
# no marks.
|
# no marks.
|
||||||
@ -226,7 +226,7 @@ def special_cases():
|
|||||||
return charactersSet
|
return charactersSet
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
|
# https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
|
||||||
decomposition_type_pattern = re.compile(" *<[^>]*> *")
|
decomposition_type_pattern = re.compile(" *<[^>]*> *")
|
||||||
|
|
||||||
table = {}
|
table = {}
|
||||||
@ -243,7 +243,7 @@ def main(args):
|
|||||||
for line in unicodeDataFile:
|
for line in unicodeDataFile:
|
||||||
fields = line.split(";")
|
fields = line.split(";")
|
||||||
if len(fields) > 5:
|
if len(fields) > 5:
|
||||||
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
|
# https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
|
||||||
general_category = fields[2]
|
general_category = fields[2]
|
||||||
decomposition = fields[5]
|
decomposition = fields[5]
|
||||||
decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
|
decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
|
||||||
@ -281,8 +281,8 @@ def main(args):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
|
parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
|
||||||
parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
|
parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
|
||||||
parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
|
parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
|
||||||
parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
|
parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
@ -728,7 +728,7 @@
|
|||||||
<term><acronym>UTF</acronym></term>
|
<term><acronym>UTF</acronym></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
<ulink url="http://www.unicode.org/">Unicode Transformation
|
<ulink url="https://www.unicode.org/">Unicode Transformation
|
||||||
Format</ulink>
|
Format</ulink>
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
@ -832,12 +832,12 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
|
|||||||
</varlistentry>
|
</varlistentry>
|
||||||
</variablelist>
|
</variablelist>
|
||||||
|
|
||||||
See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
|
See <ulink url="https://www.unicode.org/reports/tr35/tr35-collation.html">Unicode
|
||||||
Technical Standard #35</ulink>
|
Technical Standard #35</ulink>
|
||||||
and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
|
and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
|
||||||
details. The list of possible collation types (<literal>co</literal>
|
details. The list of possible collation types (<literal>co</literal>
|
||||||
subtag) can be found in
|
subtag) can be found in
|
||||||
the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
|
the <ulink url="https://github.com/unicode-org/cldr/blob/master/common/bcp47/collation.xml">CLDR
|
||||||
repository</ulink>.
|
repository</ulink>.
|
||||||
The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
|
The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
|
||||||
Explorer</ulink> can be used to check the details of a particular locale
|
Explorer</ulink> can be used to check the details of a particular locale
|
||||||
@ -900,7 +900,7 @@ CREATE COLLATION french FROM "fr-x-icu";
|
|||||||
different Unicode normal forms. It is up to the collation provider to
|
different Unicode normal forms. It is up to the collation provider to
|
||||||
actually implement such insensitive comparisons; the deterministic flag
|
actually implement such insensitive comparisons; the deterministic flag
|
||||||
only determines whether ties are to be broken using bytewise comparison.
|
only determines whether ties are to be broken using bytewise comparison.
|
||||||
See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
|
See also <ulink url="https://www.unicode.org/reports/tr10">Unicode Technical
|
||||||
Standard 10</ulink> for more information on the terminology.
|
Standard 10</ulink> for more information on the terminology.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
@ -1926,7 +1926,7 @@ RESET client_encoding;
|
|||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<term><ulink url="http://www.unicode.org/"></ulink></term>
|
<term><ulink url="https://www.unicode.org/"></ulink></term>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
|
@ -119,7 +119,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
|
|||||||
#DOWNLOAD = curl -o $@
|
#DOWNLOAD = curl -o $@
|
||||||
|
|
||||||
BIG5.TXT CNS11643.TXT:
|
BIG5.TXT CNS11643.TXT:
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
|
||||||
|
|
||||||
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
|
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
|
||||||
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
|
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
|
||||||
@ -131,19 +131,19 @@ GB2312.TXT:
|
|||||||
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
|
||||||
|
|
||||||
JIS0212.TXT:
|
JIS0212.TXT:
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
|
||||||
|
|
||||||
JOHAB.TXT KSX1001.TXT:
|
JOHAB.TXT KSX1001.TXT:
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
|
||||||
|
|
||||||
KOI8-R.TXT KOI8-U.TXT:
|
KOI8-R.TXT KOI8-U.TXT:
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
|
||||||
|
|
||||||
$(ISO8859TEXTS):
|
$(ISO8859TEXTS):
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
|
||||||
|
|
||||||
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
|
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
|
||||||
|
|
||||||
$(filter CP8%,$(WINTEXTS)):
|
$(filter CP8%,$(WINTEXTS)):
|
||||||
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
|
||||||
|
@ -8,8 +8,8 @@
|
|||||||
# map files provided by Unicode organization.
|
# map files provided by Unicode organization.
|
||||||
# Unfortunately it is prohibited by the organization
|
# Unfortunately it is prohibited by the organization
|
||||||
# to distribute the map files. So if you try to use this script,
|
# to distribute the map files. So if you try to use this script,
|
||||||
# you have to obtain the map files from the organization's ftp site.
|
# you have to obtain the map files from the organization's download site.
|
||||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
# https://www.unicode.org/Public/MAPPINGS/
|
||||||
#
|
#
|
||||||
# Our "big5" comes from BIG5.TXT, with the addition of the characters
|
# Our "big5" comes from BIG5.TXT, with the addition of the characters
|
||||||
# in the range 0xf9d6-0xf9dc from CP950.TXT.
|
# in the range 0xf9d6-0xf9dc from CP950.TXT.
|
||||||
|
@ -8,8 +8,8 @@
|
|||||||
# map files provided by Unicode organization.
|
# map files provided by Unicode organization.
|
||||||
# Unfortunately it is prohibited by the organization
|
# Unfortunately it is prohibited by the organization
|
||||||
# to distribute the map files. So if you try to use this script,
|
# to distribute the map files. So if you try to use this script,
|
||||||
# you have to obtain the map files from the organization's ftp site.
|
# you have to obtain the map files from the organization's download site.
|
||||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
# https://www.unicode.org/Public/MAPPINGS/
|
||||||
# We assume the file includes three tab-separated columns:
|
# We assume the file includes three tab-separated columns:
|
||||||
# JOHAB code in hex
|
# JOHAB code in hex
|
||||||
# UCS-2 code in hex
|
# UCS-2 code in hex
|
||||||
|
@ -8,8 +8,8 @@
|
|||||||
# map files provided by Unicode organization.
|
# map files provided by Unicode organization.
|
||||||
# Unfortunately it is prohibited by the organization
|
# Unfortunately it is prohibited by the organization
|
||||||
# to distribute the map files. So if you try to use this script,
|
# to distribute the map files. So if you try to use this script,
|
||||||
# you have to obtain the map files from the organization's ftp site.
|
# you have to obtain the map files from the organization's download site.
|
||||||
# ftp://www.unicode.org/Public/MAPPINGS/
|
# https://www.unicode.org/Public/MAPPINGS/
|
||||||
# We assume the file includes three tab-separated columns:
|
# We assume the file includes three tab-separated columns:
|
||||||
# source character set code in hex
|
# source character set code in hex
|
||||||
# UCS-2 code in hex
|
# UCS-2 code in hex
|
||||||
|
@ -23,7 +23,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
|
|||||||
# These files are part of the Unicode Character Database. Download
|
# These files are part of the Unicode Character Database. Download
|
||||||
# them on demand.
|
# them on demand.
|
||||||
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
|
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
|
||||||
$(DOWNLOAD) http://unicode.org/Public/UNIDATA/$(@F)
|
$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)
|
||||||
|
|
||||||
# Generation of conversion tables used for string normalization with
|
# Generation of conversion tables used for string normalization with
|
||||||
# UTF-8 strings.
|
# UTF-8 strings.
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
* Normalize a Unicode string to NFKC form
|
* Normalize a Unicode string to NFKC form
|
||||||
*
|
*
|
||||||
* This implements Unicode normalization, per the documentation at
|
* This implements Unicode normalization, per the documentation at
|
||||||
* http://www.unicode.org/reports/tr15/.
|
* https://www.unicode.org/reports/tr15/.
|
||||||
*
|
*
|
||||||
* Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
|
* Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
|
||||||
*
|
*
|
||||||
@ -109,7 +109,7 @@ get_decomposed_size(pg_wchar code)
|
|||||||
/*
|
/*
|
||||||
* Fast path for Hangul characters not stored in tables to save memory as
|
* Fast path for Hangul characters not stored in tables to save memory as
|
||||||
* decomposition is algorithmic. See
|
* decomposition is algorithmic. See
|
||||||
* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||||
* the matter.
|
* the matter.
|
||||||
*/
|
*/
|
||||||
if (code >= SBASE && code < SBASE + SCOUNT)
|
if (code >= SBASE && code < SBASE + SCOUNT)
|
||||||
@ -234,7 +234,7 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
|
|||||||
/*
|
/*
|
||||||
* Fast path for Hangul characters not stored in tables to save memory as
|
* Fast path for Hangul characters not stored in tables to save memory as
|
||||||
* decomposition is algorithmic. See
|
* decomposition is algorithmic. See
|
||||||
* http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
|
||||||
* the matter.
|
* the matter.
|
||||||
*/
|
*/
|
||||||
if (code >= SBASE && code < SBASE + SCOUNT)
|
if (code >= SBASE && code < SBASE + SCOUNT)
|
||||||
@ -362,7 +362,7 @@ unicode_normalize_kc(const pg_wchar *input)
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
|
* Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
|
||||||
* a sequence of two adjacent characters in a string is an
|
* a sequence of two adjacent characters in a string is an
|
||||||
* exchangeable pair if the combining class (from the Unicode
|
* exchangeable pair if the combining class (from the Unicode
|
||||||
* Character Database) for the first character is greater than the
|
* Character Database) for the first character is greater than the
|
||||||
|
Loading…
Reference in New Issue
Block a user