Add unicode_strtitle() for Unicode Default Case Conversion.

This moves the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, where it sits alongside unicode_strlower() and unicode_strupper(). The new unicode_strtitle() accepts an arbitrary word boundary callback. The implementation is simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
2024-03-29 17:35:07 -07:00 · 2024-03-29 17:35:07 -07:00 · 46e5441fa5
parent a96a8b15fa
commit 46e5441fa5
3 changed files with 140 additions and 48 deletions
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 	return result;
 }
 /*
 * Iteration state for initcap_wbnext(). The caller fills in str and len and
 * starts with offset = 0 and init = false; resetting offset and init rewinds
 * the iterator for another pass over the same text.
 */
 struct WordBoundaryState
 {
 	const char *str;			/* text to scan; decoded as UTF-8 */
 	size_t		len;			/* scanning stops at this byte offset */
 	size_t		offset;			/* byte offset of next character to examine */
 	bool		init;			/* has the first boundary been returned? */
 	bool		prev_alnum;		/* pg_u_isalnum() result at the last boundary */
 };
 /*
 * Word boundary callback for initcap: report the byte offset of the next
 * boundary in the text described by *state (a struct WordBoundaryState).
 *
 * A boundary is the first character examined, and afterwards every character
 * whose pg_u_isalnum() result differs from that of the previous boundary.
 * Once the text is exhausted (offset reaches len, or a NUL byte is found),
 * the total length is returned, and keeps being returned on further calls.
 */
 static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;
 	for (;;)
 	{
 		size_t		start = wb->offset;
 		pg_wchar	cp;
 		bool		is_alnum;
 		bool		at_boundary;
 		/* out of input: fall through to report the final boundary */
 		if (start >= wb->len || wb->str[start] == '\0')
 			break;
 		cp = utf8_to_unicode((unsigned char *) wb->str + start);
 		is_alnum = pg_u_isalnum(cp, true);
 		at_boundary = !wb->init || is_alnum != wb->prev_alnum;
 		/* the character is consumed whether or not it starts a new run */
 		wb->offset = start + unicode_utf8len(cp);
 		if (at_boundary)
 		{
 			wb->init = true;
 			wb->prev_alnum = is_alnum;
 			return start;
 		}
 	}
 	return wb->len;
 }
 /*
 * collation-aware, wide-character-aware initcap function
 *
@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 #endif
 		if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
 		{
-			const unsigned char *src = (unsigned char *) buff;
+			const char *src = buff;
 			size_t		srclen = nbytes;
 			unsigned char *dst;
 			size_t		dstsize;
-			int			srcoff = 0;
+			char	   *dst;
-			int			dstoff = 0;
+			size_t		needed;
 			struct WordBoundaryState wbstate = {
 				.str = src,
 				.len = srclen,
 				.offset = 0,
 				.init = false,
 				.prev_alnum = false,
 			};
 			Assert(GetDatabaseEncoding() == PG_UTF8);
-			/* overflow paranoia */
+			/* first try buffer of equal size plus terminating NUL */
-			if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
+			dstsize = srclen + 1;
-				ereport(ERROR,
+			dst = palloc(dstsize);
 						(errcode(ERRCODE_OUT_OF_MEMORY),
 						 errmsg("out of memory")));
-			/* result is at most srclen codepoints plus terminating NUL */
+			needed = unicode_strtitle(dst, dstsize, src, srclen,
-			dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
+									  initcap_wbnext, &wbstate);
-			dst = (unsigned char *) palloc(dstsize);
+			if (needed + 1 > dstsize)
 			while (srcoff < nbytes)
 			{
-				pg_wchar	u1 = utf8_to_unicode(src + srcoff);
+				/* reset iterator */
-				pg_wchar	u2;
+				wbstate.offset = 0;
-				int			u1len = unicode_utf8len(u1);
+				wbstate.init = false;
 				int			u2len;
-				if (wasalnum)
+				/* grow buffer if needed and retry */
-					u2 = unicode_lowercase_simple(u1);
+				dstsize = needed + 1;
-				else
+				dst = repalloc(dst, dstsize);
-					u2 = unicode_uppercase_simple(u1);
+				needed = unicode_strtitle(dst, dstsize, src, srclen,
-
+										  initcap_wbnext, &wbstate);
-				u2len = unicode_utf8len(u2);
+				Assert(needed + 1 == dstsize);
 				Assert(dstoff + u2len + 1 <= dstsize);
 				wasalnum = pg_u_isalnum(u2, true);
 				unicode_to_utf8(u2, dst + dstoff);
 				srcoff += u1len;
 				dstoff += u2len;
 			}
-			Assert(dstoff + 1 <= dstsize);
+			result = dst;
 			*(dst + dstoff) = '\0';
 			dstoff++;
 			/* allocate result buffer of the right size and free workspace */
 			result = palloc(dstoff);
 			memcpy(result, dst, dstoff);
 			pfree(dst);
 		}
 		else
 		{
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"
 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-						   ssize_t srclen, CaseKind casekind);
+						   CaseKind str_casekind, WordBoundaryNext wbnext,
 						   void *wbstate);
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
 }
 /*
 * unicode_strtitle()
 *
 * Titlecase the UTF-8 string src into dst and return the length of the
 * result, not counting the terminating NUL.
 *
 * Input: src holds srclen bytes; a negative srclen means src is
 * NUL-terminated instead.
 *
 * Output: a result larger than dstsize is truncated. dst is NUL-terminated
 * only when dstsize is greater than the result length. With dstsize zero,
 * dst may be NULL, so a first call can be used just to learn the buffer size
 * to allocate.
 *
 * Word boundaries: titlecasing depends on where words start and end, and
 * that knowledge comes from the wbnext callback. A boundary is either the
 * offset at which a word starts or the offset of the character right after a
 * word. The callback must return 0 first, then each following boundary in
 * turn, and finally the total length of the string.
 *
 * The callback state wbstate belongs to the caller, who must initialize it
 * beforehand and release it afterwards.
 */
 size_t
 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 				 WordBoundaryNext wbnext, void *wbstate)
 {
 	size_t		result_len;
 	/* CaseTitle is the mode in which convert_case() consults wbnext */
 	result_len = convert_case(dst, dstsize, src, srclen, CaseTitle,
 							  wbnext, wbstate);
 	return result_len;
 }
 /*
@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 }
 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, maps characters found on a word boundary to
 * uppercase and other characters to lowercase.
 */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind casekind)
+			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
 	/* character CaseKind varies while titlecasing */
 	CaseKind	chr_casekind = str_casekind;
 	size_t		srcoff = 0;
 	size_t		result_len = 0;
 	size_t		boundary = 0;
 	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 		   (str_casekind != CaseTitle && !wbnext && !wbstate));
 	if (str_casekind == CaseTitle)
 	{
 		boundary = wbnext(wbstate);
 		Assert(boundary == 0);	/* start of text is always a boundary */
 	}
 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		int			u1len = unicode_utf8len(u1);
 		const		pg_case_map *casemap = find_case_map(u1);
 		if (str_casekind == CaseTitle)
 		{
 			if (srcoff == boundary)
 			{
 				chr_casekind = CaseUpper;
 				boundary = wbnext(wbstate);
 			}
 			else
 				chr_casekind = CaseLower;
 		}
 		/* perform mapping, update result_len, and write to dst */
 		if (casemap)
 		{
-			pg_wchar	u2 = casemap->simplemap[casekind];
+			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
 			if (result_len + u2len <= dstsize)
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@ -16,11 +16,16 @@
 #include "mb/pg_wchar.h"
 /*
 * Word boundary callback taken by unicode_strtitle(). Each call returns the
 * offset of the next word boundary in the source string: 0 on the first
 * call, and the total string length once no boundaries remain. wbstate is
 * caller-owned state handed back to the callback on every call.
 */
 typedef size_t (*WordBoundaryNext) (void *wbstate);
 /* mappings of a single code point */
 pg_wchar	unicode_lowercase_simple(pg_wchar ucs);
 pg_wchar	unicode_titlecase_simple(pg_wchar ucs);
 pg_wchar	unicode_uppercase_simple(pg_wchar ucs);
 /* whole-string conversions; each returns a length as size_t */
 size_t		unicode_strlower(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);
 size_t		unicode_strtitle(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen, WordBoundaryNext wbnext,
 							 void *wbstate);
 size_t		unicode_strupper(char *dst, size_t dstsize, const char *src,
 							 ssize_t srclen);