diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 79df80704d..8736ada4be 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 	return result;
 }
 
+struct WordBoundaryState
+{
+	const char *str;			/* text being scanned; decoded as UTF-8 */
+	size_t		len;			/* scan stops at this byte offset */
+	size_t		offset;			/* byte offset of next char to examine */
+	bool		init;			/* true once first boundary was returned */
+	bool		prev_alnum;		/* pg_u_isalnum() result at last boundary */
+};
+
+/*
+ * Simple word boundary iterator that draws boundaries each time the result of
+ * pg_u_isalnum() changes.
+ */
+static size_t
+initcap_wbnext(void *state)
+{
+	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+
+	while (wbstate->offset < wbstate->len &&
+		   wbstate->str[wbstate->offset] != '\0')
+	{
+		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str +
+										wbstate->offset);
+		bool		curr_alnum = pg_u_isalnum(u, true);
+
+		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
+		{
+			size_t		prev_offset = wbstate->offset;
+
+			wbstate->init = true;
+			wbstate->offset += unicode_utf8len(u);
+			wbstate->prev_alnum = curr_alnum;
+			return prev_offset;
+		}
+
+		wbstate->offset += unicode_utf8len(u);
+	}
+
+	return wbstate->len;
+}
+
 /*
  * collation-aware, wide-character-aware initcap function
  *
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 #endif
 	if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 	{
-		const unsigned char *src = (unsigned char *) buff;
+		const char *src = buff;
 		size_t		srclen = nbytes;
-		unsigned char *dst;
 		size_t		dstsize;
-		int			srcoff = 0;
-		int			dstoff = 0;
+		char	   *dst;
+		size_t		needed;
+		struct WordBoundaryState wbstate = {
+			.str = src,
+			.len = srclen,
+			.offset = 0,
+			.init = false,
+			.prev_alnum = false,
+		};
 
 		Assert(GetDatabaseEncoding() == PG_UTF8);
 
-		/* overflow paranoia */
-		if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		/* first try buffer of equal size plus terminating NUL */
+		dstsize = srclen + 1;
+		dst = palloc(dstsize);
 
-		/* result is at most srclen codepoints plus terminating NUL */
-		dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
-		dst = (unsigned char *) palloc(dstsize);
-
-		while (srcoff < nbytes)
+		needed = unicode_strtitle(dst, dstsize, src, srclen,
+								  initcap_wbnext, &wbstate);
+		if (needed + 1 > dstsize)
 		{
-			pg_wchar	u1 = utf8_to_unicode(src + srcoff);
-			pg_wchar	u2;
-			int			u1len = unicode_utf8len(u1);
-			int			u2len;
+			/* reset iterator */
+			wbstate.offset = 0;
+			wbstate.init = false;
 
-			if (wasalnum)
-				u2 = unicode_lowercase_simple(u1);
-			else
-				u2 = unicode_uppercase_simple(u1);
-
-			u2len = unicode_utf8len(u2);
-
-			Assert(dstoff + u2len + 1 <= dstsize);
-
-			wasalnum = pg_u_isalnum(u2, true);
-
-			unicode_to_utf8(u2, dst + dstoff);
-			srcoff += u1len;
-			dstoff += u2len;
+			/* grow buffer if needed and retry */
+			dstsize = needed + 1;
+			dst = repalloc(dst, dstsize);
+			needed = unicode_strtitle(dst, dstsize, src, srclen,
+									  initcap_wbnext, &wbstate);
+			Assert(needed + 1 == dstsize);
 		}
 
-		Assert(dstoff + 1 <= dstsize);
-		*(dst + dstoff) = '\0';
-		dstoff++;
-
-		/* allocate result buffer of the right size and free workspace */
-		result = palloc(dstoff);
-		memcpy(result, dst, dstoff);
-		pfree(dst);
+		result = dst;
 	}
 	else
 	{
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 5e77490006..bc423b0890 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"
 
 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-						   ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   void *wbstate);
 
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase, and return the result length (not including
+ * terminating NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state
+ * wbstate. The callback should first return offset 0 for the first boundary;
+ * then the offset of each subsequent word boundary; then the total length of
+ * the string to indicate the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 WordBoundaryNext wbnext, void *wbstate)
+{
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+						wbstate);
 }
 
 /*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 }
 
 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
  *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, maps characters found on a word boundary to
+ * uppercase and other characters to lowercase.
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind casekind)
+			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
+	/* character CaseKind varies while titlecasing */
+	CaseKind	chr_casekind = str_casekind;
 	size_t		srcoff = 0;
 	size_t		result_len = 0;
+	size_t		boundary = 0;	/* next offset from wbnext (CaseTitle) */
+
+	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+		   (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+	if (str_casekind == CaseTitle)
+	{
+		boundary = wbnext(wbstate);
+		Assert(boundary == 0);	/* start of text is always a boundary */
+	}
 
 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		int			u1len = unicode_utf8len(u1);
 		const pg_case_map *casemap = find_case_map(u1);
 
+		if (str_casekind == CaseTitle)
+		{
+			if (srcoff == boundary)
+			{
+				chr_casekind = CaseUpper;
+				boundary = wbnext(wbstate);
+			}
+			else
+				chr_casekind = CaseLower;
+		}
+
+		/* perform mapping, update result_len, and write to dst */
 		if (casemap)
 		{
-			pg_wchar	u2 = casemap->simplemap[casekind];
+			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
 
 			if (result_len + u2len <= dstsize)
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index df36d8db21..c0c3382e79 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -16,11 +16,16 @@
 
 #include "mb/pg_wchar.h"
 
+typedef size_t (*WordBoundaryNext) (void *wbstate);
+
 pg_wchar	unicode_lowercase_simple(pg_wchar ucs);
 pg_wchar	unicode_titlecase_simple(pg_wchar ucs);
 pg_wchar	unicode_uppercase_simple(pg_wchar ucs);
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);
+size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
+							 ssize_t srclen, WordBoundaryNext wbnext,
+							 void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);
 