Add unicode_strtitle() for Unicode Default Case Conversion.

This moves the titlecasing implementation for the builtin provider
out of formatting.c and into unicode_case.c, alongside
unicode_strlower() and unicode_strupper(). The new function accepts an
arbitrary word boundary callback.

Simple for now, but can be extended to support the Unicode Default
Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
This commit is contained in:
Jeff Davis 2024-03-29 17:35:07 -07:00
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions

View File

@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
return result; return result;
} }
/*
 * Iteration state for initcap_wbnext(), the word boundary callback that
 * str_initcap() passes to unicode_strtitle() for the builtin provider.
 *
 * The caller initializes offset to 0 and init to false before the first
 * call, and must reset both before reusing the state for another pass.
 */
struct WordBoundaryState
{
const char *str;	/* string being scanned (UTF-8) */
size_t len;	/* length of str in bytes; also returned as the final boundary */
size_t offset;	/* byte offset of the next character to examine */
bool init;	/* false until the first boundary has been returned */
bool prev_alnum;	/* pg_u_isalnum() result at the most recently returned boundary */
};
/*
 * Word boundary callback for initcap.
 *
 * Reports a boundary at every character whose pg_u_isalnum() classification
 * differs from the classification recorded at the previous boundary.  The
 * first character examined is always reported as a boundary.  Once the input
 * is exhausted (offset reaches len, or a NUL byte is found), the total length
 * is returned.
 *
 * "state" must point to a struct WordBoundaryState.
 */
static size_t
initcap_wbnext(void *state)
{
	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

	for (;;)
	{
		size_t		start = wb->offset;
		pg_wchar	cp;
		bool		is_alnum;

		/* stop at end of input or at an embedded terminator */
		if (start >= wb->len || wb->str[start] == '\0')
			break;

		cp = utf8_to_unicode((const unsigned char *) wb->str + start);
		is_alnum = pg_u_isalnum(cp, true);

		/* step past this character whether or not it starts a new run */
		wb->offset = start + unicode_utf8len(cp);

		/* same class as the current run: keep scanning */
		if (wb->init && is_alnum == wb->prev_alnum)
			continue;

		/* classification changed (or very first character): boundary here */
		wb->init = true;
		wb->prev_alnum = is_alnum;
		return start;
	}

	return wb->len;
}
/* /*
* collation-aware, wide-character-aware initcap function * collation-aware, wide-character-aware initcap function
* *
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
#endif #endif
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
{ {
const unsigned char *src = (unsigned char *) buff; const char *src = buff;
size_t srclen = nbytes; size_t srclen = nbytes;
unsigned char *dst;
size_t dstsize; size_t dstsize;
int srcoff = 0; char *dst;
int dstoff = 0; size_t needed;
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
.offset = 0,
.init = false,
.prev_alnum = false,
};
Assert(GetDatabaseEncoding() == PG_UTF8); Assert(GetDatabaseEncoding() == PG_UTF8);
/* overflow paranoia */ /* first try buffer of equal size plus terminating NUL */
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN)) dstsize = srclen + 1;
ereport(ERROR, dst = palloc(dstsize);
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
/* result is at most srclen codepoints plus terminating NUL */ needed = unicode_strtitle(dst, dstsize, src, srclen,
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1; initcap_wbnext, &wbstate);
dst = (unsigned char *) palloc(dstsize); if (needed + 1 > dstsize)
while (srcoff < nbytes)
{ {
pg_wchar u1 = utf8_to_unicode(src + srcoff); /* reset iterator */
pg_wchar u2; wbstate.offset = 0;
int u1len = unicode_utf8len(u1); wbstate.init = false;
int u2len;
if (wasalnum) /* grow buffer if needed and retry */
u2 = unicode_lowercase_simple(u1); dstsize = needed + 1;
else dst = repalloc(dst, dstsize);
u2 = unicode_uppercase_simple(u1); needed = unicode_strtitle(dst, dstsize, src, srclen,
initcap_wbnext, &wbstate);
u2len = unicode_utf8len(u2); Assert(needed + 1 == dstsize);
Assert(dstoff + u2len + 1 <= dstsize);
wasalnum = pg_u_isalnum(u2, true);
unicode_to_utf8(u2, dst + dstoff);
srcoff += u1len;
dstoff += u2len;
} }
Assert(dstoff + 1 <= dstsize); result = dst;
*(dst + dstoff) = '\0';
dstoff++;
/* allocate result buffer of the right size and free workspace */
result = palloc(dstoff);
memcpy(result, dst, dstoff);
pfree(dst);
} }
else else
{ {

View File

@ -21,8 +21,9 @@
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
static const pg_case_map *find_case_map(pg_wchar ucs); static const pg_case_map *find_case_map(pg_wchar ucs);
static size_t convert_case(char *dst, size_t dstsize, const char *src, static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
ssize_t srclen, CaseKind casekind); CaseKind str_casekind, WordBoundaryNext wbnext,
void *wbstate);
pg_wchar pg_wchar
unicode_lowercase_simple(pg_wchar code) unicode_lowercase_simple(pg_wchar code)
@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
size_t size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen) unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{ {
return convert_case(dst, dstsize, src, srclen, CaseLower); return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
}
/*
 * unicode_strtitle()
 *
 * Convert src to titlecase, and return the result length (not including
 * terminating NUL).
 *
 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
 *
 * Result string is stored in dst, truncating if larger than dstsize. If
 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating. Likewise, when the returned length
 * plus one exceeds dstsize, the caller can enlarge the buffer to that size
 * and call again.
 *
 * Titlecasing requires knowledge about word boundaries, which is provided by
 * the callback wbnext. A word boundary is the offset of the start of a word
 * or the offset of the character immediately following a word. Characters
 * found at a boundary offset are mapped with the simple uppercase mapping;
 * all other characters are mapped with the simple lowercase mapping.
 *
 * The caller is expected to initialize and free the callback state
 * wbstate. The callback should first return offset 0 for the first boundary;
 * then the offset of each subsequent word boundary; then the total length of
 * the string to indicate the final boundary. Both wbnext and wbstate must be
 * non-NULL (convert_case() asserts this). Because the callback is advanced
 * during conversion, its state must be reset before converting the same
 * string a second time.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
WordBoundaryNext wbnext, void *wbstate)
{
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
wbstate);
} }
/* /*
@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
size_t size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen) unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
{ {
return convert_case(dst, dstsize, src, srclen, CaseUpper); return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
} }
/* /*
* Implement Unicode Default Case Conversion algorithm. * If str_casekind is CaseLower or CaseUpper, map each character in the string
* for which a mapping is available.
* *
* Map each character in the string for which a mapping is available. * If str_casekind is CaseTitle, maps characters found on a word boundary to
* uppercase and other characters to lowercase.
*/ */
static size_t static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind casekind) CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
{ {
/* character CaseKind varies while titlecasing */
CaseKind chr_casekind = str_casekind;
size_t srcoff = 0; size_t srcoff = 0;
size_t result_len = 0; size_t result_len = 0;
size_t boundary = 0;
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
(str_casekind != CaseTitle && !wbnext && !wbstate));
if (str_casekind == CaseTitle)
{
boundary = wbnext(wbstate);
Assert(boundary == 0); /* start of text is always a boundary */
}
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0') while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{ {
@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
int u1len = unicode_utf8len(u1); int u1len = unicode_utf8len(u1);
const pg_case_map *casemap = find_case_map(u1); const pg_case_map *casemap = find_case_map(u1);
if (str_casekind == CaseTitle)
{
if (srcoff == boundary)
{
chr_casekind = CaseUpper;
boundary = wbnext(wbstate);
}
else
chr_casekind = CaseLower;
}
/* perform mapping, update result_len, and write to dst */
if (casemap) if (casemap)
{ {
pg_wchar u2 = casemap->simplemap[casekind]; pg_wchar u2 = casemap->simplemap[chr_casekind];
pg_wchar u2len = unicode_utf8len(u2); pg_wchar u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize) if (result_len + u2len <= dstsize)

View File

@ -16,11 +16,16 @@
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
/*
 * Word boundary callback used by unicode_strtitle().
 *
 * Each call returns the offset of the next word boundary in the source
 * string: offset 0 on the first call, then each subsequent boundary, and
 * finally the total length of the string. wbstate is caller-managed state
 * passed through unchanged.
 */
typedef size_t (*WordBoundaryNext) (void *wbstate);
pg_wchar unicode_lowercase_simple(pg_wchar ucs); pg_wchar unicode_lowercase_simple(pg_wchar ucs);
pg_wchar unicode_titlecase_simple(pg_wchar ucs); pg_wchar unicode_titlecase_simple(pg_wchar ucs);
pg_wchar unicode_uppercase_simple(pg_wchar ucs); pg_wchar unicode_uppercase_simple(pg_wchar ucs);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src, size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen); ssize_t srclen);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
ssize_t srclen, WordBoundaryNext wbnext,
void *wbstate);
size_t unicode_strupper(char *dst, size_t dstsize, const char *src, size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
ssize_t srclen); ssize_t srclen);