postgresql/src/common/unicode_category.c

194 lines
4.3 KiB
C

/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category of Unicode characters.
*
* Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/common/unicode_category.c
*
*-------------------------------------------------------------------------
*/
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif
#include "common/unicode_category.h"
#include "common/unicode_category_table.h"
/*
 * unicode_category
 *		Look up the Unicode general category of a codepoint.
 *
 * Performs a binary search over the unicode_categories table, comparing
 * ucs against the [first, last] bounds of each probed entry (the search
 * relies on the table being ordered by codepoint).  Returns the category
 * stored in the entry whose range contains ucs, or PG_U_UNASSIGNED when
 * no entry contains it.
 *
 * ucs is asserted to be no greater than 0x10ffff.
 */
pg_unicode_category
unicode_category(pg_wchar ucs)
{
	int			lo = 0;
	int			hi = lengthof(unicode_categories) - 1;

	Assert(ucs <= 0x10ffff);

	/* Narrow [lo, hi] until it is empty or the probed entry contains ucs. */
	while (lo <= hi)
	{
		int			probe = lo + (hi - lo) / 2;

		if (ucs > unicode_categories[probe].last)
		{
			/* ucs lies above this entry; discard the lower half */
			lo = probe + 1;
		}
		else if (ucs < unicode_categories[probe].first)
		{
			/* ucs lies below this entry; discard the upper half */
			hi = probe - 1;
		}
		else
		{
			/* first <= ucs <= last: found it */
			return unicode_categories[probe].category;
		}
	}

	/* No table entry covers this codepoint. */
	return PG_U_UNASSIGNED;
}
/*
 * unicode_category_string
 *		Long descriptive name of a Unicode general category.
 *
 * Returns a pointer to a constant string such as "Uppercase_Letter".
 * A value that matches none of the listed categories trips Assert(false);
 * if execution continues past the assertion, "Unrecognized" is returned.
 */
const char *
unicode_category_string(pg_unicode_category category)
{
	const char *name = NULL;

	/*
	 * There is no default label: a value not listed here leaves name unset
	 * and is dealt with after the switch.
	 */
	switch (category)
	{
		case PG_U_UNASSIGNED:
			name = "Unassigned";
			break;
		case PG_U_UPPERCASE_LETTER:
			name = "Uppercase_Letter";
			break;
		case PG_U_LOWERCASE_LETTER:
			name = "Lowercase_Letter";
			break;
		case PG_U_TITLECASE_LETTER:
			name = "Titlecase_Letter";
			break;
		case PG_U_MODIFIER_LETTER:
			name = "Modifier_Letter";
			break;
		case PG_U_OTHER_LETTER:
			name = "Other_Letter";
			break;
		case PG_U_NONSPACING_MARK:
			name = "Nonspacing_Mark";
			break;
		case PG_U_ENCLOSING_MARK:
			name = "Enclosing_Mark";
			break;
		case PG_U_SPACING_MARK:
			name = "Spacing_Mark";
			break;
		case PG_U_DECIMAL_NUMBER:
			name = "Decimal_Number";
			break;
		case PG_U_LETTER_NUMBER:
			name = "Letter_Number";
			break;
		case PG_U_OTHER_NUMBER:
			name = "Other_Number";
			break;
		case PG_U_SPACE_SEPARATOR:
			name = "Space_Separator";
			break;
		case PG_U_LINE_SEPARATOR:
			name = "Line_Separator";
			break;
		case PG_U_PARAGRAPH_SEPARATOR:
			name = "Paragraph_Separator";
			break;
		case PG_U_CONTROL:
			name = "Control";
			break;
		case PG_U_FORMAT:
			name = "Format";
			break;
		case PG_U_PRIVATE_USE:
			name = "Private_Use";
			break;
		case PG_U_SURROGATE:
			name = "Surrogate";
			break;
		case PG_U_DASH_PUNCTUATION:
			name = "Dash_Punctuation";
			break;
		case PG_U_OPEN_PUNCTUATION:
			name = "Open_Punctuation";
			break;
		case PG_U_CLOSE_PUNCTUATION:
			name = "Close_Punctuation";
			break;
		case PG_U_CONNECTOR_PUNCTUATION:
			name = "Connector_Punctuation";
			break;
		case PG_U_OTHER_PUNCTUATION:
			name = "Other_Punctuation";
			break;
		case PG_U_MATH_SYMBOL:
			name = "Math_Symbol";
			break;
		case PG_U_CURRENCY_SYMBOL:
			name = "Currency_Symbol";
			break;
		case PG_U_MODIFIER_SYMBOL:
			name = "Modifier_Symbol";
			break;
		case PG_U_OTHER_SYMBOL:
			name = "Other_Symbol";
			break;
		case PG_U_INITIAL_PUNCTUATION:
			name = "Initial_Punctuation";
			break;
		case PG_U_FINAL_PUNCTUATION:
			name = "Final_Punctuation";
			break;
	}

	if (name != NULL)
		return name;

	/* Not one of the categories handled above. */
	Assert(false);
	return "Unrecognized";		/* keep compiler quiet */
}
/*
 * unicode_category_abbrev
 *		Two-letter short code of a Unicode general category.
 *
 * Returns a pointer to a constant string such as "Lu".  A value that
 * matches none of the listed categories trips Assert(false); if execution
 * continues past the assertion, "??" is returned.
 */
const char *
unicode_category_abbrev(pg_unicode_category category)
{
	const char *abbrev = NULL;

	/*
	 * There is no default label: a value not listed here leaves abbrev unset
	 * and is dealt with after the switch.
	 */
	switch (category)
	{
		case PG_U_UNASSIGNED:
			abbrev = "Cn";
			break;
		case PG_U_UPPERCASE_LETTER:
			abbrev = "Lu";
			break;
		case PG_U_LOWERCASE_LETTER:
			abbrev = "Ll";
			break;
		case PG_U_TITLECASE_LETTER:
			abbrev = "Lt";
			break;
		case PG_U_MODIFIER_LETTER:
			abbrev = "Lm";
			break;
		case PG_U_OTHER_LETTER:
			abbrev = "Lo";
			break;
		case PG_U_NONSPACING_MARK:
			abbrev = "Mn";
			break;
		case PG_U_ENCLOSING_MARK:
			abbrev = "Me";
			break;
		case PG_U_SPACING_MARK:
			abbrev = "Mc";
			break;
		case PG_U_DECIMAL_NUMBER:
			abbrev = "Nd";
			break;
		case PG_U_LETTER_NUMBER:
			abbrev = "Nl";
			break;
		case PG_U_OTHER_NUMBER:
			abbrev = "No";
			break;
		case PG_U_SPACE_SEPARATOR:
			abbrev = "Zs";
			break;
		case PG_U_LINE_SEPARATOR:
			abbrev = "Zl";
			break;
		case PG_U_PARAGRAPH_SEPARATOR:
			abbrev = "Zp";
			break;
		case PG_U_CONTROL:
			abbrev = "Cc";
			break;
		case PG_U_FORMAT:
			abbrev = "Cf";
			break;
		case PG_U_PRIVATE_USE:
			abbrev = "Co";
			break;
		case PG_U_SURROGATE:
			abbrev = "Cs";
			break;
		case PG_U_DASH_PUNCTUATION:
			abbrev = "Pd";
			break;
		case PG_U_OPEN_PUNCTUATION:
			abbrev = "Ps";
			break;
		case PG_U_CLOSE_PUNCTUATION:
			abbrev = "Pe";
			break;
		case PG_U_CONNECTOR_PUNCTUATION:
			abbrev = "Pc";
			break;
		case PG_U_OTHER_PUNCTUATION:
			abbrev = "Po";
			break;
		case PG_U_MATH_SYMBOL:
			abbrev = "Sm";
			break;
		case PG_U_CURRENCY_SYMBOL:
			abbrev = "Sc";
			break;
		case PG_U_MODIFIER_SYMBOL:
			abbrev = "Sk";
			break;
		case PG_U_OTHER_SYMBOL:
			abbrev = "So";
			break;
		case PG_U_INITIAL_PUNCTUATION:
			abbrev = "Pi";
			break;
		case PG_U_FINAL_PUNCTUATION:
			abbrev = "Pf";
			break;
	}

	if (abbrev != NULL)
		return abbrev;

	/* Not one of the categories handled above. */
	Assert(false);
	return "??";				/* keep compiler quiet */
}