2023-11-02 06:47:06 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
* unicode_category.c
|
|
|
|
* Determine general category of Unicode characters.
|
|
|
|
*
|
2024-01-04 02:49:05 +01:00
|
|
|
* Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
|
2023-11-02 06:47:06 +01:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/common/unicode_category.c
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef FRONTEND
|
|
|
|
#include "postgres.h"
|
|
|
|
#else
|
|
|
|
#include "postgres_fe.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "common/unicode_category.h"
|
|
|
|
#include "common/unicode_category_table.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unicode general category for the given codepoint.
|
|
|
|
*/
|
|
|
|
pg_unicode_category
|
|
|
|
unicode_category(pg_wchar ucs)
|
|
|
|
{
|
|
|
|
int min = 0;
|
|
|
|
int mid;
|
|
|
|
int max = lengthof(unicode_categories) - 1;
|
|
|
|
|
2023-12-08 00:44:03 +01:00
|
|
|
Assert(ucs <= 0x10ffff);
|
2023-11-02 06:47:06 +01:00
|
|
|
|
|
|
|
while (max >= min)
|
|
|
|
{
|
|
|
|
mid = (min + max) / 2;
|
|
|
|
if (ucs > unicode_categories[mid].last)
|
|
|
|
min = mid + 1;
|
|
|
|
else if (ucs < unicode_categories[mid].first)
|
|
|
|
max = mid - 1;
|
|
|
|
else
|
|
|
|
return unicode_categories[mid].category;
|
|
|
|
}
|
|
|
|
|
2023-12-08 00:44:03 +01:00
|
|
|
return PG_U_UNASSIGNED;
|
2023-11-02 06:47:06 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Description of Unicode general category.
|
|
|
|
*/
|
|
|
|
const char *
|
|
|
|
unicode_category_string(pg_unicode_category category)
|
|
|
|
{
|
|
|
|
switch (category)
|
|
|
|
{
|
|
|
|
case PG_U_UNASSIGNED:
|
|
|
|
return "Unassigned";
|
|
|
|
case PG_U_UPPERCASE_LETTER:
|
|
|
|
return "Uppercase_Letter";
|
|
|
|
case PG_U_LOWERCASE_LETTER:
|
|
|
|
return "Lowercase_Letter";
|
|
|
|
case PG_U_TITLECASE_LETTER:
|
|
|
|
return "Titlecase_Letter";
|
|
|
|
case PG_U_MODIFIER_LETTER:
|
|
|
|
return "Modifier_Letter";
|
|
|
|
case PG_U_OTHER_LETTER:
|
|
|
|
return "Other_Letter";
|
|
|
|
case PG_U_NONSPACING_MARK:
|
|
|
|
return "Nonspacing_Mark";
|
|
|
|
case PG_U_ENCLOSING_MARK:
|
|
|
|
return "Enclosing_Mark";
|
|
|
|
case PG_U_SPACING_MARK:
|
|
|
|
return "Spacing_Mark";
|
|
|
|
case PG_U_DECIMAL_NUMBER:
|
|
|
|
return "Decimal_Number";
|
|
|
|
case PG_U_LETTER_NUMBER:
|
|
|
|
return "Letter_Number";
|
|
|
|
case PG_U_OTHER_NUMBER:
|
|
|
|
return "Other_Number";
|
|
|
|
case PG_U_SPACE_SEPARATOR:
|
|
|
|
return "Space_Separator";
|
|
|
|
case PG_U_LINE_SEPARATOR:
|
|
|
|
return "Line_Separator";
|
|
|
|
case PG_U_PARAGRAPH_SEPARATOR:
|
|
|
|
return "Paragraph_Separator";
|
|
|
|
case PG_U_CONTROL:
|
|
|
|
return "Control";
|
|
|
|
case PG_U_FORMAT:
|
|
|
|
return "Format";
|
|
|
|
case PG_U_PRIVATE_USE:
|
|
|
|
return "Private_Use";
|
|
|
|
case PG_U_SURROGATE:
|
|
|
|
return "Surrogate";
|
|
|
|
case PG_U_DASH_PUNCTUATION:
|
|
|
|
return "Dash_Punctuation";
|
|
|
|
case PG_U_OPEN_PUNCTUATION:
|
|
|
|
return "Open_Punctuation";
|
|
|
|
case PG_U_CLOSE_PUNCTUATION:
|
|
|
|
return "Close_Punctuation";
|
|
|
|
case PG_U_CONNECTOR_PUNCTUATION:
|
|
|
|
return "Connector_Punctuation";
|
|
|
|
case PG_U_OTHER_PUNCTUATION:
|
|
|
|
return "Other_Punctuation";
|
|
|
|
case PG_U_MATH_SYMBOL:
|
|
|
|
return "Math_Symbol";
|
|
|
|
case PG_U_CURRENCY_SYMBOL:
|
|
|
|
return "Currency_Symbol";
|
|
|
|
case PG_U_MODIFIER_SYMBOL:
|
|
|
|
return "Modifier_Symbol";
|
|
|
|
case PG_U_OTHER_SYMBOL:
|
|
|
|
return "Other_Symbol";
|
|
|
|
case PG_U_INITIAL_PUNCTUATION:
|
|
|
|
return "Initial_Punctuation";
|
|
|
|
case PG_U_FINAL_PUNCTUATION:
|
|
|
|
return "Final_Punctuation";
|
|
|
|
}
|
|
|
|
|
|
|
|
Assert(false);
|
|
|
|
return "Unrecognized"; /* keep compiler quiet */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Short code for Unicode general category.
|
|
|
|
*/
|
|
|
|
const char *
|
|
|
|
unicode_category_abbrev(pg_unicode_category category)
|
|
|
|
{
|
|
|
|
switch (category)
|
|
|
|
{
|
|
|
|
case PG_U_UNASSIGNED:
|
|
|
|
return "Cn";
|
|
|
|
case PG_U_UPPERCASE_LETTER:
|
|
|
|
return "Lu";
|
|
|
|
case PG_U_LOWERCASE_LETTER:
|
|
|
|
return "Ll";
|
|
|
|
case PG_U_TITLECASE_LETTER:
|
|
|
|
return "Lt";
|
|
|
|
case PG_U_MODIFIER_LETTER:
|
|
|
|
return "Lm";
|
|
|
|
case PG_U_OTHER_LETTER:
|
|
|
|
return "Lo";
|
|
|
|
case PG_U_NONSPACING_MARK:
|
|
|
|
return "Mn";
|
|
|
|
case PG_U_ENCLOSING_MARK:
|
|
|
|
return "Me";
|
|
|
|
case PG_U_SPACING_MARK:
|
|
|
|
return "Mc";
|
|
|
|
case PG_U_DECIMAL_NUMBER:
|
|
|
|
return "Nd";
|
|
|
|
case PG_U_LETTER_NUMBER:
|
|
|
|
return "Nl";
|
|
|
|
case PG_U_OTHER_NUMBER:
|
|
|
|
return "No";
|
|
|
|
case PG_U_SPACE_SEPARATOR:
|
|
|
|
return "Zs";
|
|
|
|
case PG_U_LINE_SEPARATOR:
|
|
|
|
return "Zl";
|
|
|
|
case PG_U_PARAGRAPH_SEPARATOR:
|
|
|
|
return "Zp";
|
|
|
|
case PG_U_CONTROL:
|
|
|
|
return "Cc";
|
|
|
|
case PG_U_FORMAT:
|
|
|
|
return "Cf";
|
|
|
|
case PG_U_PRIVATE_USE:
|
|
|
|
return "Co";
|
|
|
|
case PG_U_SURROGATE:
|
|
|
|
return "Cs";
|
|
|
|
case PG_U_DASH_PUNCTUATION:
|
|
|
|
return "Pd";
|
|
|
|
case PG_U_OPEN_PUNCTUATION:
|
|
|
|
return "Ps";
|
|
|
|
case PG_U_CLOSE_PUNCTUATION:
|
|
|
|
return "Pe";
|
|
|
|
case PG_U_CONNECTOR_PUNCTUATION:
|
|
|
|
return "Pc";
|
|
|
|
case PG_U_OTHER_PUNCTUATION:
|
|
|
|
return "Po";
|
|
|
|
case PG_U_MATH_SYMBOL:
|
|
|
|
return "Sm";
|
|
|
|
case PG_U_CURRENCY_SYMBOL:
|
|
|
|
return "Sc";
|
|
|
|
case PG_U_MODIFIER_SYMBOL:
|
|
|
|
return "Sk";
|
|
|
|
case PG_U_OTHER_SYMBOL:
|
|
|
|
return "So";
|
|
|
|
case PG_U_INITIAL_PUNCTUATION:
|
|
|
|
return "Pi";
|
|
|
|
case PG_U_FINAL_PUNCTUATION:
|
|
|
|
return "Pf";
|
|
|
|
}
|
|
|
|
|
|
|
|
Assert(false);
|
|
|
|
return "??"; /* keep compiler quiet */
|
|
|
|
}
|