Additional unicode primitive functions.

Introduce unicode_version(), icu_unicode_version(), and
unicode_assigned().

The latter requires introducing a new lookup table for the Unicode
General Category, which is generated along with the other Unicode
lookup tables.

Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis 2023-11-01 22:47:06 -07:00
parent 7021d3b176
commit a02b37fc08
18 changed files with 4924 additions and 22 deletions

View File

@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>unicode_assigned</primary>
</indexterm>
<function>unicode_assigned</function> ( <type>text</type> )
<returnvalue>boolean</returnvalue>
</para>
<para>
Returns <literal>true</literal> if all characters in the string are
assigned Unicode codepoints; <literal>false</literal> otherwise. This
function can only be used when the server encoding is
<literal>UTF8</literal>.
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
This is equivalent to <function>current_user</function>.
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>version</primary>
</indexterm>
<function>version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string describing the <productname>PostgreSQL</productname>
server's version. You can also get this information from
<xref linkend="guc-server-version"/>, or for a machine-readable
version use <xref linkend="guc-server-version-num"/>. Software
developers should use <varname>server_version_num</varname> (available
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
parsing the text version.
</para></entry>
</row>
</tbody>
</tgroup>
</table>
@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");
</sect2>
<sect2 id="functions-info-version">
<title>Version Information Functions</title>
<para>
The functions shown in <xref linkend="functions-version"/>
print version information.
</para>
<table id="functions-version">
<title>Version Information Functions</title>
<tgroup cols="1">
<thead>
<row>
<entry role="func_table_entry"><para role="func_signature">
Function
</para>
<para>
Description
</para></entry>
</row>
</thead>
<tbody>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>version</primary>
</indexterm>
<function>version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string describing the <productname>PostgreSQL</productname>
server's version. You can also get this information from
<xref linkend="guc-server-version"/>, or for a machine-readable
version use <xref linkend="guc-server-version-num"/>. Software
developers should use <varname>server_version_num</varname> (available
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
parsing the text version.
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>unicode_version</primary>
</indexterm>
<function>unicode_version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string representing the version of Unicode used by
<productname>PostgreSQL</productname>.
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>icu_unicode_version</primary>
</indexterm>
<function>icu_unicode_version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string representing the version of Unicode used by ICU, if
the server was built with ICU support; otherwise returns
<literal>NULL</literal>.
</para></entry>
</row>
</tbody>
</tgroup>
</table>
</sect2>
</sect1>
<sect1 id="functions-admin">

View File

@ -23,7 +23,9 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "common/unicode_category.h"
#include "common/unicode_norm.h"
#include "common/unicode_version.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
return form;
}
/*
* Returns version of Unicode used by Postgres in "major.minor" format (the
* same format as the Unicode version reported by ICU). The third component
* ("update version") never involves additions to the character repertoire and
* is unimportant for most purposes.
*
* See: https://unicode.org/versions/
*/
Datum
unicode_version(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
}
/*
* Returns version of Unicode used by ICU, if enabled; otherwise NULL.
*/
Datum
icu_unicode_version(PG_FUNCTION_ARGS)
{
#ifndef USE_ICU
	/* Built without ICU support: there is no ICU Unicode version to report. */
	PG_RETURN_NULL();
#else
	text	   *icu_version_text = cstring_to_text(U_UNICODE_VERSION);

	PG_RETURN_TEXT_P(icu_version_text);
#endif
}
/*
* Check whether the string contains only assigned Unicode code
* points. Requires that the database encoding is UTF-8.
*/
Datum
unicode_assigned(PG_FUNCTION_ARGS)
{
	text	   *input = PG_GETARG_TEXT_PP(0);
	unsigned char *cursor;
	int			remaining;

	/* The lookup below decodes UTF-8, so refuse any other server encoding. */
	if (GetDatabaseEncoding() != PG_UTF8)
		ereport(ERROR,
				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));

	/* Count the characters (not bytes) so we know how many to decode. */
	remaining = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
	cursor = (unsigned char *) VARDATA_ANY(input);

	/* Decode one code point at a time; stop at the first unassigned one. */
	for (; remaining > 0; remaining--)
	{
		pg_wchar	codepoint = utf8_to_unicode(cursor);

		if (unicode_category(codepoint) == PG_U_UNASSIGNED)
			PG_RETURN_BOOL(false);

		/* advance past the multibyte sequence just examined */
		cursor += pg_utf_mblen(cursor);
	}

	PG_RETURN_BOOL(true);
}
Datum
unicode_normalize_func(PG_FUNCTION_ARGS)
{

View File

@ -78,6 +78,7 @@ OBJS_COMMON = \
scram-common.o \
string.o \
stringinfo.o \
unicode_category.o \
unicode_norm.o \
username.o \
wait_error.o \

View File

@ -30,6 +30,7 @@ common_sources = files(
'scram-common.c',
'string.c',
'stringinfo.c',
'unicode_category.c',
'unicode_norm.c',
'username.c',
'wait_error.c',

View File

@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
LDFLAGS_INTERNAL += $(ICU_LIBS)
CPPFLAGS += $(ICU_CFLAGS)
# By default, do nothing.
all:
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
mv $^ $(top_srcdir)/src/include/common/
$(MAKE) category-check
$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
unicode_version.h: generate-unicode_version.pl
$(PERL) $< --version $(UNICODE_VERSION)
unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
$(PERL) $<
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_hashfunc.h: unicode_norm_table.h
@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
$(PERL) $^ >$@
# Test suite
category-check: category_test
./category_test
normalization-check: norm_test
./norm_test
category_test: category_test.o ../unicode_category.o | submake-common
norm_test: norm_test.o ../unicode_norm.o | submake-common
norm_test.o: norm_test_table.h
@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
clean:
rm -f $(OBJS) norm_test norm_test.o
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
distclean: clean
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h

View File

@ -0,0 +1,108 @@
/*-------------------------------------------------------------------------
* category_test.c
* Program to test Unicode general category functions.
*
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/common/unicode/category_test.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef USE_ICU
#include <unicode/uchar.h>
#endif
#include "common/unicode_category.h"
#include "common/unicode_version.h"
/*
* Parse version into integer for easy comparison.
*/
#ifdef USE_ICU
/*
 * Convert a Unicode version string beginning with "major.minor" into the
 * integer major * 100 + minor, so that two versions can be compared
 * numerically.  Anything following the minor number is ignored.
 *
 * The parse result is checked at run time rather than with Assert(), so a
 * malformed version string is reported even in builds without assertion
 * checking; in that case an error is printed and the program exits with
 * status 1.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	/* minor must stay below 100 or it would collide with the major part */
	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		fprintf(stderr, "could not parse Unicode version \"%s\"\n", version);
		exit(1);
	}

	return major * 100 + minor;
}
#endif
/*
* Exhaustively test that the Unicode category for each codepoint matches that
* returned by ICU.
*/
int
main(int argc, char **argv)
{
#ifndef USE_ICU
	/* Without ICU there is nothing to compare against. */
	printf("ICU support required for test; skipping.\n");
	exit(0);
#else
	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);
	int			unassigned_in_pg = 0;
	int			unassigned_in_icu = 0;
	UChar32		code;

	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

	/* Walk every code point and compare our category with ICU's. */
	for (code = 0; code <= 0x10ffff; code++)
	{
		uint8_t		pg_category = unicode_category(code);
		uint8_t		icu_category = u_charType(code);

		if (pg_category == icu_category)
			continue;

		/*
		 * When the Unicode versions differ, a code point assigned in the
		 * newer version may still be unassigned in the older one.  Such
		 * mismatches are tolerated and merely counted, at the price of the
		 * test no longer being exhaustive for those code points.
		 */
		if (pg_category == PG_U_UNASSIGNED && pg_version < icu_version)
		{
			unassigned_in_pg++;
			continue;
		}
		if (icu_category == PG_U_UNASSIGNED && icu_version < pg_version)
		{
			unassigned_in_icu++;
			continue;
		}

		/* A genuine disagreement: report both categories and fail. */
		printf("FAILURE for codepoint %06x\n", code);
		printf("Postgres category: %02d %s %s\n", pg_category,
			   unicode_category_abbrev(pg_category),
			   unicode_category_string(pg_category));
		printf("ICU category: %02d %s %s\n", icu_category,
			   unicode_category_abbrev(icu_category),
			   unicode_category_string(icu_category));
		printf("\n");
		exit(1);
	}

	if (unassigned_in_pg > 0)
		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
			   unassigned_in_pg);
	if (unassigned_in_icu > 0)
		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
			   unassigned_in_icu);

	printf("category_test: All tests successful!\n");
	exit(0);
#endif
}

View File

@ -0,0 +1,204 @@
#!/usr/bin/perl
#
# Generate a code point category table and its lookup utilities, using
# Unicode data files as input.
#
# Input: UnicodeData.txt
# Output: unicode_category_table.h
#
# Copyright (c) 2000-2023, PostgreSQL Global Development Group
use strict;
use warnings;
use Getopt::Long;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
# Overview: read UnicodeData.txt, collapse it into a list of contiguous
# codepoint ranges that share one General Category (filling gaps with the
# unassigned category), and write that list out as a C table in
# unicode_category_table.h.

# Abbreviation used for codepoints that have no entry in UnicodeData.txt.
my $CATEGORY_UNASSIGNED = 'Cn';
# Directory for the generated header; UnicodeData.txt is read from this same
# directory (see the open() below).
my $output_path = '.';
GetOptions('outdir:s' => \$output_path);
my $output_table_file = "$output_path/unicode_category_table.h";
my $FH;
# Read entries from UnicodeData.txt into a list of codepoint ranges
# and their general category.
my @category_ranges = ();
# State of the range currently being accumulated; all three stay undefined
# until the first line of the file has been read.
my $range_start = undef;
my $range_end = undef;
my $range_category = undef;
# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
# codepoints represents a range, and $gap_category is equal to the
# category for both (which must match). Otherwise, the gap represents
# unassigned code points.
my $gap_category = undef;
open($FH, '<', "$output_path/UnicodeData.txt")
or die "Could not open $output_path/UnicodeData.txt: $!.";
while (my $line = <$FH>)
{
# Semicolon-separated fields: [0] codepoint in hex, [1] character name,
# [2] general category abbreviation.  Later fields are not used here.
my @elts = split(';', $line);
my $code = hex($elts[0]);
my $name = $elts[1];
my $category = $elts[2];
die "codepoint out of range" if $code > 0x10FFFF;
die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED;
# First line of the file: sanity-check it and start the first range.
if (!defined($range_start)) {
my $code_str = sprintf "0x%06x", $code;
die if defined($range_end) || defined($range_category) || defined($gap_category);
die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000;
# initialize
$range_start = $code;
$range_end = $code;
$range_category = $category;
if ($name =~ /<.*, First>$/) {
$gap_category = $category;
} else {
$gap_category = $CATEGORY_UNASSIGNED;
}
next;
}
# Gap in codepoints detected. If it's a different category than
# the current range, emit the current range and initialize a new
# range representing the gap.
if ($range_end + 1 != $code && $range_category ne $gap_category) {
push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
$range_start = $range_end + 1;
$range_end = $code - 1;
$range_category = $gap_category;
}
# different category; new range
if ($range_category ne $category) {
push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
$range_start = $code;
$range_end = $code;
$range_category = $category;
}
# Track whether we are inside a <..., First> / <..., Last> pair, which
# determines what category the next gap in codepoints will receive.
if ($name =~ /<.*, First>$/) {
die "<..., First> entry unexpectedly follows another <..., First> entry"
if $gap_category ne $CATEGORY_UNASSIGNED;
$gap_category = $category;
}
elsif ($name =~ /<.*, Last>$/) {
die "<..., First> and <..., Last> entries have mismatching general category"
if $gap_category ne $category;
$gap_category = $CATEGORY_UNASSIGNED;
}
else {
die "unexpected entry found between <..., First> and <..., Last>"
if $gap_category ne $CATEGORY_UNASSIGNED;
}
# Extend the current range to include this codepoint.
$range_end = $code;
}
close $FH;
die "<..., First> entry with no corresponding <..., Last> entry"
if $gap_category ne $CATEGORY_UNASSIGNED;
# emit final range
push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
# emit range for any unassigned code points after last entry
if ($range_end < 0x10FFFF) {
$range_start = $range_end + 1;
$range_end = 0x10FFFF;
$range_category = $CATEGORY_UNASSIGNED;
push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
}
# Number of table entries; used as the array dimension in the generated C.
my $num_ranges = scalar @category_ranges;
# Map from two-letter General Category abbreviation to the name of the C
# constant written into the generated table.
# See: https://www.unicode.org/reports/tr44/#General_Category_Values
my $categories = {
Cn => 'PG_U_UNASSIGNED',
Lu => 'PG_U_UPPERCASE_LETTER',
Ll => 'PG_U_LOWERCASE_LETTER',
Lt => 'PG_U_TITLECASE_LETTER',
Lm => 'PG_U_MODIFIER_LETTER',
Lo => 'PG_U_OTHER_LETTER',
Mn => 'PG_U_NONSPACING_MARK',
Me => 'PG_U_ENCLOSING_MARK',
Mc => 'PG_U_SPACING_MARK',
Nd => 'PG_U_DECIMAL_NUMBER',
Nl => 'PG_U_LETTER_NUMBER',
No => 'PG_U_OTHER_NUMBER',
Zs => 'PG_U_SPACE_SEPARATOR',
Zl => 'PG_U_LINE_SEPARATOR',
Zp => 'PG_U_PARAGRAPH_SEPARATOR',
Cc => 'PG_U_CONTROL',
Cf => 'PG_U_FORMAT',
Co => 'PG_U_PRIVATE_USE',
Cs => 'PG_U_SURROGATE',
Pd => 'PG_U_DASH_PUNCTUATION',
Ps => 'PG_U_OPEN_PUNCTUATION',
Pe => 'PG_U_CLOSE_PUNCTUATION',
Pc => 'PG_U_CONNECTOR_PUNCTUATION',
Po => 'PG_U_OTHER_PUNCTUATION',
Sm => 'PG_U_MATH_SYMBOL',
Sc => 'PG_U_CURRENCY_SYMBOL',
Sk => 'PG_U_MODIFIER_SYMBOL',
So => 'PG_U_OTHER_SYMBOL',
Pi => 'PG_U_INITIAL_PUNCTUATION',
Pf => 'PG_U_FINAL_PUNCTUATION'
};
# Start writing out the output files
open my $OT, '>', $output_table_file
or die "Could not open output file $output_table_file: $!\n";
# Fixed preamble of the generated header: file banner, the range struct, and
# the opening of the array ($num_ranges is interpolated as its dimension).
print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_category_table.h
* Category table for Unicode character classification.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_category_table.h
*
*-------------------------------------------------------------------------
*/
#include "common/unicode_category.h"
/*
* File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
* do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
* here.
*/
typedef struct
{
uint32 first; /* Unicode codepoint */
uint32 last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
/* table of Unicode codepoint ranges and their categories */
static const pg_category_range unicode_categories[$num_ranges] =
{
HEADER
# One initializer per range, separated by ",\n" so that the last entry has no
# trailing comma.
my $firsttime = 1;
foreach my $range (@category_ranges) {
printf $OT ",\n" unless $firsttime;
$firsttime = 0;
my $category = $categories->{$range->{category}};
# An abbreviation missing from the map means the input used a category this
# script does not know how to translate.
die "category missing: $range->{category}" unless $category;
printf $OT "\t{0x%06x, 0x%06x, %s}", $range->{start}, $range->{end}, $category;
}
print $OT "\n};\n";

View File

@ -0,0 +1,46 @@
#!/usr/bin/perl
#
# Generate header file with Unicode version used by Postgres.
#
# Output: unicode_version.h
#
# Copyright (c) 2000-2023, PostgreSQL Global Development Group
use strict;
use warnings;
use Getopt::Long;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
# Directory in which unicode_version.h is written.
my $output_path = '.';
# Full Unicode version as passed by the build system, e.g. "15.1.0".
my $version_str = undef;
GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str);
# Only the major and minor components go into the header.  Validate the
# argument up front: a missing or malformed --version would otherwise produce
# a header with a bogus version string instead of an error.
my ($version_major, $version_minor) =
  defined($version_str)
  ? $version_str =~ /^(\d+)\.(\d+)(?:\.\d+)*$/
  : ();
die "$0: --version must be given as MAJOR.MINOR[.UPDATE]\n"
  unless defined($version_minor);
my $unicode_version_str = sprintf "%d.%d", $version_major, $version_minor;
my $output_file = "$output_path/unicode_version.h";
# Start writing out the output files
open my $OT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";
print $OT <<HEADER;
/*-------------------------------------------------------------------------
*
* unicode_version.h
* Unicode version used by Postgres.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_version.h
*
*-------------------------------------------------------------------------
*/
#define PG_UNICODE_VERSION "$unicode_version_str"
HEADER
# Buffered write errors only surface at close, so check it.
close $OT
  or die "Could not close output file $output_file: $!\n";

View File

@ -24,6 +24,25 @@ endforeach
update_unicode_targets = []
update_unicode_targets += \
custom_target('unicode_version.h',
output: ['unicode_version.h'],
command: [
perl, files('generate-unicode_version.pl'),
'--outdir', '@OUTDIR@', '--version', UNICODE_VERSION],
build_by_default: false,
)
update_unicode_targets += \
custom_target('unicode_category_table.h',
input: [unicode_data['UnicodeData.txt']],
output: ['unicode_category_table.h'],
command: [
perl, files('generate-unicode_category_table.pl'),
'--outdir', '@OUTDIR@', '@INPUT@'],
build_by_default: false,
)
update_unicode_targets += \
custom_target('unicode_norm_table.h',
input: [unicode_data['UnicodeData.txt'], unicode_data['CompositionExclusions.txt']],
@ -73,6 +92,17 @@ norm_test_table = custom_target('norm_test_table.h',
inc = include_directories('.')
category_test = executable('category_test',
['category_test.c'],
dependencies: [frontend_port_code, icu],
include_directories: inc,
link_with: [common_static, pgport_static],
build_by_default: false,
kwargs: default_bin_args + {
'install': false,
}
)
norm_test = executable('norm_test',
['norm_test.c', norm_test_table],
dependencies: [frontend_port_code],
@ -86,6 +116,16 @@ norm_test = executable('norm_test',
update_unicode_dep = []
if not meson.is_cross_build()
update_unicode_dep += custom_target('category_test.run',
output: 'category_test.run',
input: update_unicode_targets,
command: [category_test, UNICODE_VERSION],
build_by_default: false,
build_always_stale: true,
)
endif
if not meson.is_cross_build()
update_unicode_dep += custom_target('norm_test.run',
output: 'norm_test.run',

View File

@ -81,6 +81,6 @@ main(int argc, char **argv)
}
}
printf("All tests successful!\n");
printf("norm_test: All tests successful!\n");
exit(0);
}

View File

@ -0,0 +1,195 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category of Unicode characters.
*
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/common/unicode_category.c
*
*-------------------------------------------------------------------------
*/
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif
#include "common/unicode_category.h"
#include "common/unicode_category_table.h"
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
unicode_category(pg_wchar ucs)
{
	int			lo = 0;
	int			hi = lengthof(unicode_categories) - 1;

	/* The code point must lie within the span covered by the table. */
	Assert(ucs >= unicode_categories[0].first &&
		   ucs <= unicode_categories[hi].last);

	/* Binary search over the ranges, which are ordered by code point. */
	while (lo <= hi)
	{
		int			probe = (lo + hi) / 2;
		const pg_category_range *range = &unicode_categories[probe];

		if (ucs < range->first)
			hi = probe - 1;
		else if (ucs > range->last)
			lo = probe + 1;
		else
			return range->category;
	}

	/* Not expected to be reached: no range contained the code point. */
	Assert(false);
	return (pg_unicode_category) - 1;
}
/*
* Description of Unicode general category.
*/
const char *
unicode_category_string(pg_unicode_category category)
{
	/* Long names, indexed by the numeric value of the category. */
	static const char *const category_names[] = {
		[PG_U_UNASSIGNED] = "Unassigned",
		[PG_U_UPPERCASE_LETTER] = "Uppercase_Letter",
		[PG_U_LOWERCASE_LETTER] = "Lowercase_Letter",
		[PG_U_TITLECASE_LETTER] = "Titlecase_Letter",
		[PG_U_MODIFIER_LETTER] = "Modifier_Letter",
		[PG_U_OTHER_LETTER] = "Other_Letter",
		[PG_U_NONSPACING_MARK] = "Nonspacing_Mark",
		[PG_U_ENCLOSING_MARK] = "Enclosing_Mark",
		[PG_U_SPACING_MARK] = "Spacing_Mark",
		[PG_U_DECIMAL_NUMBER] = "Decimal_Number",
		[PG_U_LETTER_NUMBER] = "Letter_Number",
		[PG_U_OTHER_NUMBER] = "Other_Number",
		[PG_U_SPACE_SEPARATOR] = "Space_Separator",
		[PG_U_LINE_SEPARATOR] = "Line_Separator",
		[PG_U_PARAGRAPH_SEPARATOR] = "Paragraph_Separator",
		[PG_U_CONTROL] = "Control",
		[PG_U_FORMAT] = "Format",
		[PG_U_PRIVATE_USE] = "Private_Use",
		[PG_U_SURROGATE] = "Surrogate",
		[PG_U_DASH_PUNCTUATION] = "Dash_Punctuation",
		[PG_U_OPEN_PUNCTUATION] = "Open_Punctuation",
		[PG_U_CLOSE_PUNCTUATION] = "Close_Punctuation",
		[PG_U_CONNECTOR_PUNCTUATION] = "Connector_Punctuation",
		[PG_U_OTHER_PUNCTUATION] = "Other_Punctuation",
		[PG_U_MATH_SYMBOL] = "Math_Symbol",
		[PG_U_CURRENCY_SYMBOL] = "Currency_Symbol",
		[PG_U_MODIFIER_SYMBOL] = "Modifier_Symbol",
		[PG_U_OTHER_SYMBOL] = "Other_Symbol",
		[PG_U_INITIAL_PUNCTUATION] = "Initial_Punctuation",
		[PG_U_FINAL_PUNCTUATION] = "Final_Punctuation"
	};

	/* The unsigned cast makes negative values fail the bounds check too. */
	if ((unsigned int) category < lengthof(category_names) &&
		category_names[category] != NULL)
		return category_names[category];

	Assert(false);
	return "Unrecognized";		/* keep compiler quiet */
}
/*
* Short code for Unicode general category.
*/
const char *
unicode_category_abbrev(pg_unicode_category category)
{
	/* Two-letter abbreviations, indexed by the numeric value of the category. */
	static const char *const category_abbrevs[] = {
		[PG_U_UNASSIGNED] = "Cn",
		[PG_U_UPPERCASE_LETTER] = "Lu",
		[PG_U_LOWERCASE_LETTER] = "Ll",
		[PG_U_TITLECASE_LETTER] = "Lt",
		[PG_U_MODIFIER_LETTER] = "Lm",
		[PG_U_OTHER_LETTER] = "Lo",
		[PG_U_NONSPACING_MARK] = "Mn",
		[PG_U_ENCLOSING_MARK] = "Me",
		[PG_U_SPACING_MARK] = "Mc",
		[PG_U_DECIMAL_NUMBER] = "Nd",
		[PG_U_LETTER_NUMBER] = "Nl",
		[PG_U_OTHER_NUMBER] = "No",
		[PG_U_SPACE_SEPARATOR] = "Zs",
		[PG_U_LINE_SEPARATOR] = "Zl",
		[PG_U_PARAGRAPH_SEPARATOR] = "Zp",
		[PG_U_CONTROL] = "Cc",
		[PG_U_FORMAT] = "Cf",
		[PG_U_PRIVATE_USE] = "Co",
		[PG_U_SURROGATE] = "Cs",
		[PG_U_DASH_PUNCTUATION] = "Pd",
		[PG_U_OPEN_PUNCTUATION] = "Ps",
		[PG_U_CLOSE_PUNCTUATION] = "Pe",
		[PG_U_CONNECTOR_PUNCTUATION] = "Pc",
		[PG_U_OTHER_PUNCTUATION] = "Po",
		[PG_U_MATH_SYMBOL] = "Sm",
		[PG_U_CURRENCY_SYMBOL] = "Sc",
		[PG_U_MODIFIER_SYMBOL] = "Sk",
		[PG_U_OTHER_SYMBOL] = "So",
		[PG_U_INITIAL_PUNCTUATION] = "Pi",
		[PG_U_FINAL_PUNCTUATION] = "Pf"
	};

	/* The unsigned cast makes negative values fail the bounds check too. */
	if ((unsigned int) category < lengthof(category_abbrevs) &&
		category_abbrevs[category] != NULL)
		return category_abbrevs[category];

	Assert(false);
	return "??";				/* keep compiler quiet */
}

View File

@ -12019,6 +12019,18 @@
proname => 'pg_partition_root', prorettype => 'regclass',
proargtypes => 'regclass', prosrc => 'pg_partition_root' },
{ oid => '4549', descr => 'Unicode version used by Postgres',
proname => 'unicode_version', prorettype => 'text', proargtypes => '',
prosrc => 'unicode_version' },
{ oid => '6099', descr => 'Unicode version used by ICU, if enabled',
proname => 'icu_unicode_version', prorettype => 'text', proargtypes => '',
prosrc => 'icu_unicode_version' },
{ oid => '6105', descr => 'check valid Unicode',
proname => 'unicode_assigned', prorettype => 'bool', proargtypes => 'text',
prosrc => 'unicode_assigned' },
{ oid => '4350', descr => 'Unicode normalization',
proname => 'normalize', prorettype => 'text', proargtypes => 'text text',
prosrc => 'unicode_normalize_func' },

View File

@ -0,0 +1,68 @@
/*-------------------------------------------------------------------------
*
* unicode_category.h
* Routines for determining the category of Unicode characters.
*
* These definitions can be used by both frontend and backend code.
*
* Copyright (c) 2017-2023, PostgreSQL Global Development Group
*
* src/include/common/unicode_category.h
*
*-------------------------------------------------------------------------
*/
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
#include "mb/pg_wchar.h"
/*
* Unicode General Category Values
*
* See: https://www.unicode.org/reports/tr44/#General_Category_Values
*
* The Unicode stability policy guarantees: "The enumeration of
* General_Category property values is fixed. No new values will be
* added". See: https://www.unicode.org/policies/stability_policy.html
*
* Numeric values chosen to match corresponding ICU UCharCategory.
*/
/*
 * One enumerator per Unicode General Category.  The trailing comment on each
 * line is the category's two-letter abbreviation, the same string that
 * unicode_category_abbrev() returns for it.
 *
 * The explicit numeric values are significant: category_test.c compares the
 * result of unicode_category() directly against ICU's u_charType(), so they
 * must not be renumbered.
 */
typedef enum pg_unicode_category
{
PG_U_UNASSIGNED = 0, /* Cn */
PG_U_UPPERCASE_LETTER = 1, /* Lu */
PG_U_LOWERCASE_LETTER = 2, /* Ll */
PG_U_TITLECASE_LETTER = 3, /* Lt */
PG_U_MODIFIER_LETTER = 4, /* Lm */
PG_U_OTHER_LETTER = 5, /* Lo */
PG_U_NONSPACING_MARK = 6, /* Mn */
PG_U_ENCLOSING_MARK = 7, /* Me */
PG_U_SPACING_MARK = 8, /* Mc */
PG_U_DECIMAL_NUMBER = 9, /* Nd */
PG_U_LETTER_NUMBER = 10, /* Nl */
PG_U_OTHER_NUMBER = 11, /* No */
PG_U_SPACE_SEPARATOR = 12, /* Zs */
PG_U_LINE_SEPARATOR = 13, /* Zl */
PG_U_PARAGRAPH_SEPARATOR = 14, /* Zp */
PG_U_CONTROL = 15, /* Cc */
PG_U_FORMAT = 16, /* Cf */
PG_U_PRIVATE_USE = 17, /* Co */
PG_U_SURROGATE = 18, /* Cs */
PG_U_DASH_PUNCTUATION = 19, /* Pd */
PG_U_OPEN_PUNCTUATION = 20, /* Ps */
PG_U_CLOSE_PUNCTUATION = 21, /* Pe */
PG_U_CONNECTOR_PUNCTUATION = 22, /* Pc */
PG_U_OTHER_PUNCTUATION = 23, /* Po */
PG_U_MATH_SYMBOL = 24, /* Sm */
PG_U_CURRENCY_SYMBOL = 25, /* Sc */
PG_U_MODIFIER_SYMBOL = 26, /* Sk */
PG_U_OTHER_SYMBOL = 27, /* So */
PG_U_INITIAL_PUNCTUATION = 28, /* Pi */
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
/* Look up the General Category of a code point. */
extern pg_unicode_category unicode_category(pg_wchar ucs);
/* Long name ("Uppercase_Letter") and two-letter abbreviation ("Lu") of a category. */
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
#endif /* UNICODE_CATEGORY_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,14 @@
/*-------------------------------------------------------------------------
*
* unicode_version.h
* Unicode version used by Postgres.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/common/unicode_version.h
*
*-------------------------------------------------------------------------
*/
#define PG_UNICODE_VERSION "15.1"

View File

@ -27,6 +27,10 @@ CREATE TABLE icu (def text, en text COLLATE "en-x-icu", upfirst text COLLATE upp
INSERT INTO icu VALUES ('a', 'a', 'a'), ('b', 'b', 'b'), ('A', 'A', 'A'), ('B', 'B', 'B');
});
is( $node1->safe_psql('dbicu', q{SELECT icu_unicode_version() IS NOT NULL}),
qq(t),
'ICU unicode version defined');
is( $node1->safe_psql('dbicu', q{SELECT def FROM icu ORDER BY def}),
qq(A
a

View File

@ -8,6 +8,24 @@ SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
t
(1 row)
SELECT unicode_version() IS NOT NULL;
?column?
----------
t
(1 row)
SELECT unicode_assigned(U&'abc');
unicode_assigned
------------------
t
(1 row)
SELECT unicode_assigned(U&'abc\+10FFFF');
unicode_assigned
------------------
f
(1 row)
SELECT normalize('');
normalize
-----------

View File

@ -5,6 +5,10 @@ SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
SELECT U&'\0061\0308bc' <> U&'\00E4bc' COLLATE "C" AS sanity_check;
SELECT unicode_version() IS NOT NULL;
SELECT unicode_assigned(U&'abc');
SELECT unicode_assigned(U&'abc\+10FFFF');
SELECT normalize('');
SELECT normalize(U&'\0061\0308\24D1c') = U&'\00E4\24D1c' COLLATE "C" AS test_default;
SELECT normalize(U&'\0061\0308\24D1c', NFC) = U&'\00E4\24D1c' COLLATE "C" AS test_nfc;