Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same

page about the maximum UTF8 sequence length we support (4 bytes since 8.1, 3 before that). pg_utf2wchar_with_len never got updated to support 4-byte characters at all, and in any case had a buffer-overrun risk in that it could produce multiple pg_wchars from what mblen claims to be just one UTF8 character. The only reason we don't have a major security hole is that most callers allocate worst-case output buffers; the sole exception in released versions appears to be pre-8.2 iwchareq() (i.e., ILIKE), which can be crashed due to zeroing out its return address --- but as far as I can see that can't be exploited for anything more than a crash, due to inability to control what gets written there. Per report from James Russell and Michael Fuhr. Pre-8.1, the risk is much lower, but I still think pg_utf2wchar_with_len's behavior given an incomplete final character risks buffer overrun, so back-patch that logic change anyway. This patch also makes sure that UTF8 sequences exceeding the supported length (whichever it is) are consistently treated as error cases, rather than being treated like a valid shorter sequence in some places.
2007-01-24 17:12:17 +00:00 · 2007-01-24 17:12:17 +00:00 · 0887fa1117
parent 07cf99ac6f
commit 0887fa1117
1 changed files with 43 additions and 17 deletions
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@ -1,7 +1,7 @@
 /*
 * conversion functions between pg_wchar and multibyte streams.
 * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.58 2006/10/04 00:30:02 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.59 2007/01/24 17:12:17 tgl Exp $
 *
 * WIN1250 client encoding updated by Pavel Behal
 *
@ -364,46 +364,60 @@ pg_johab_dsplen(const unsigned char *s)
 }

 /*
- * convert UTF8 string to pg_wchar (UCS-2)
- * caller should allocate enough space for "to"
+ * convert UTF8 string to pg_wchar (UCS-4)
+ * caller must allocate enough space for "to", including a trailing zero!
 * len: length of from.
 * "from" not necessarily null terminated.
 */
 static int
 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 {
-	unsigned char c1,
-				c2,
-				c3;
 	int			cnt = 0;
+	uint32		c1,
+				c2,
+				c3,
+				c4;

 	while (len > 0 && *from)
 	{
-		if (!IS_HIGHBIT_SET(*from))
+		if ((*from & 0x80) == 0)
 		{
 			*to = *from++;
 			len--;
 		}
-		else if ((*from & 0xe0) == 0xc0 && len >= 2)
+		else if ((*from & 0xe0) == 0xc0)
 		{
+			if (len < 2)
+				break;			/* drop trailing incomplete char */
 			c1 = *from++ & 0x1f;
 			c2 = *from++ & 0x3f;
-			*to = c1 << 6;
-			*to |= c2;
+			*to = (c1 << 6) | c2;
 			len -= 2;
 		}
-		else if ((*from & 0xe0) == 0xe0 && len >= 3)
+		else if ((*from & 0xf0) == 0xe0)
 		{
+			if (len < 3)
+				break;			/* drop trailing incomplete char */
 			c1 = *from++ & 0x0f;
 			c2 = *from++ & 0x3f;
 			c3 = *from++ & 0x3f;
-			*to = c1 << 12;
-			*to |= c2 << 6;
-			*to |= c3;
+			*to = (c1 << 12) | (c2 << 6) | c3;
 			len -= 3;
 		}
+		else if ((*from & 0xf8) == 0xf0)
+		{
+			if (len < 4)
+				break;			/* drop trailing incomplete char */
+			c1 = *from++ & 0x07;
+			c2 = *from++ & 0x3f;
+			c3 = *from++ & 0x3f;
+			c4 = *from++ & 0x3f;
+			*to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+			len -= 4;
+		}
 		else
 		{
+			/* treat a bogus char as length 1; not ours to raise error */
 			*to = *from++;
 			len--;
 		}
@ -415,12 +429,20 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 }

 /*
- * returns the byte length of a UTF8 character pointed to by s
+ * Return the byte length of a UTF8 character pointed to by s
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 4 bytes; hence do NOT return a value larger than 4.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
 */
 int
 pg_utf_mblen(const unsigned char *s)
 {
-	int			len = 1;
+	int			len;

 	if ((*s & 0x80) == 0)
 		len = 1;
@ -430,10 +452,14 @@ pg_utf_mblen(const unsigned char *s)
 		len = 3;
 	else if ((*s & 0xf8) == 0xf0)
 		len = 4;
+#ifdef NOT_USED
 	else if ((*s & 0xfc) == 0xf8)
 		len = 5;
 	else if ((*s & 0xfe) == 0xfc)
 		len = 6;
+#endif
+	else
+		len = 1;
 	return len;
 }

@ -596,7 +622,7 @@ utf2ucs(const unsigned char *c)
 		return (pg_wchar) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
-	else if ((*c & 0xf0) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0)
 		return (pg_wchar) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |