Fix performance regression in pg_strtointNN_safe functions

Between 6fcda9aba and 1b6f632a3, the pg_strtoint functions became quite a bit slower in v16, despite efforts in 6b423ec67 to speed these up. Since the majority of cases for these functions will only contain base-10 digits, perhaps prefixed by a '-', it makes sense to have a special case for this and just fall back on the more complex version which processes hex, octal, binary and underscores if the fast path version fails to parse the string. While we're here, update the header comments for these functions to mention that hex, octal and binary formats along with underscore separators are now supported. Author: Andres Freund, David Rowley Reported-by: Masahiko Sawada Reviewed-by: Dean Rasheed, John Naylor Discussion: https://postgr.es/m/CAD21AoDvDmUQeJtZrau1ovnT_smN940%3DKp6mszNGK3bq9yRN6g%40mail.gmail.com Backpatch-through: 16, where 6fcda9aba and 1b6f632a3 were added
2023-08-02 12:05:41 +12:00 · 2023-08-02 12:05:41 +12:00 · 3845577cb5
parent 36ab831f9a
commit 3845577cb5
1 changed files with 279 additions and 27 deletions
--- a/src/backend/utils/adt/numutils.c
+++ b/src/backend/utils/adt/numutils.c
@ -97,9 +97,18 @@ static const int8 hexlookup[128] = {
 };

 /*
- * Convert input string to a signed 16 bit integer.
+ * Convert input string to a signed 16 bit integer.  Input strings may be
+ * expressed in base-10, hexadecimal, octal, or binary format, all of which
+ * can be prefixed by an optional sign character, either '+' (the default) or
+ * '-' for negative numbers.  Hex strings are recognized by the digits being
+ * prefixed by 0x or 0X while octal strings are recognized by the 0o or 0O
+ * prefix.  The binary representation is recognized by the 0b or 0B prefix.
 *
- * Allows any number of leading or trailing whitespace characters.
+ * Allows any number of leading or trailing whitespace characters.  Digits may
+ * optionally be separated by a single underscore character.  These can only
+ * come between digits and not before or after the digits.  Underscores have
+ * no effect on the return value and are supported only to assist in improving
+ * the human readability of the input strings.
 *
 * pg_strtoint16() will throw ereport() upon bad input format or overflow;
 * while pg_strtoint16_safe() instead returns such complaints in *escontext,
@ -122,9 +131,84 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 	const char *firstdigit;
 	uint16		tmp = 0;
 	bool		neg = false;
+	unsigned char digit;
+
+	/*
+	 * The majority of cases are likely to be base-10 digits without any
+	 * underscore separator characters.  We'll first try to parse the string
+	 * with the assumption that's the case and only fallback on a slower
+	 * implementation which handles hex, octal and binary strings and
+	 * underscores if the fastpath version cannot parse the string.
+	 */
+
+	/* leave it up to the slow path to look for leading spaces */
+
+	if (*ptr == '-')
+	{
+		ptr++;
+		neg = true;
+	}
+
+	/* a leading '+' is uncommon so leave that for the slow path */
+
+	/* process the first digit */
+	digit = (*ptr - '0');
+
+	/*
+	 * Exploit unsigned arithmetic to save having to check both the upper and
+	 * lower bounds of the digit.
+	 */
+	if (likely(digit < 10))
+	{
+		ptr++;
+		tmp = digit;
+	}
+	else
+	{
+		/* we need at least one digit */
+		goto slow;
+	}
+
+	/* process remaining digits */
+	for (;;)
+	{
+		digit = (*ptr - '0');
+
+		if (digit >= 10)
+			break;
+
+		ptr++;
+
+		if (unlikely(tmp > -(PG_INT16_MIN / 10)))
+			goto out_of_range;
+
+		tmp = tmp * 10 + digit;
+	}
+
+	/* when the string does not end in a digit, let the slow path handle it */
+	if (unlikely(*ptr != '\0'))
+		goto slow;
+
+	if (neg)
+	{
+		/* check the negative equivalent will fit without overflowing */
+		if (unlikely(tmp > (uint16) (-(PG_INT16_MIN + 1)) + 1))
+			goto out_of_range;
+		return -((int16) tmp);
+	}
+
+	if (unlikely(tmp > PG_INT16_MAX))
+		goto out_of_range;
+
+	return (int16) tmp;
+
+slow:
+	tmp = 0;
+	ptr = s;
+	/* no need to reset neg */

 	/* skip leading spaces */
-	while (likely(*ptr) && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	/* handle sign */
@ -141,7 +225,7 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (isxdigit((unsigned char) *ptr))
 			{
@ -165,7 +249,7 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '7')
 			{
@ -189,7 +273,7 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '1')
 			{
@ -213,9 +297,9 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr;

-		while (*ptr)
+		for (;;)
 		{
-			if (isdigit((unsigned char) *ptr))
+			if (*ptr >= '0' && *ptr <= '9')
 			{
 				if (unlikely(tmp > -(PG_INT16_MIN / 10)))
 					goto out_of_range;
@ -242,7 +326,7 @@ pg_strtoint16_safe(const char *s, Node *escontext)
 		goto invalid_syntax;

 	/* allow trailing whitespace, but not other trailing chars */
-	while (*ptr != '\0' && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	if (unlikely(*ptr != '\0'))
@ -275,9 +359,18 @@ invalid_syntax:
 }

 /*
- * Convert input string to a signed 32 bit integer.
+ * Convert input string to a signed 32 bit integer.  Input strings may be
+ * expressed in base-10, hexadecimal, octal, or binary format, all of which
+ * can be prefixed by an optional sign character, either '+' (the default) or
+ * '-' for negative numbers.  Hex strings are recognized by the digits being
+ * prefixed by 0x or 0X while octal strings are recognized by the 0o or 0O
+ * prefix.  The binary representation is recognized by the 0b or 0B prefix.
 *
- * Allows any number of leading or trailing whitespace characters.
+ * Allows any number of leading or trailing whitespace characters.  Digits may
+ * optionally be separated by a single underscore character.  These can only
+ * come between digits and not before or after the digits.  Underscores have
+ * no effect on the return value and are supported only to assist in improving
+ * the human readability of the input strings.
 *
 * pg_strtoint32() will throw ereport() upon bad input format or overflow;
 * while pg_strtoint32_safe() instead returns such complaints in *escontext,
@ -300,9 +393,84 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 	const char *firstdigit;
 	uint32		tmp = 0;
 	bool		neg = false;
+	unsigned char digit;
+
+	/*
+	 * The majority of cases are likely to be base-10 digits without any
+	 * underscore separator characters.  We'll first try to parse the string
+	 * with the assumption that's the case and only fallback on a slower
+	 * implementation which handles hex, octal and binary strings and
+	 * underscores if the fastpath version cannot parse the string.
+	 */
+
+	/* leave it up to the slow path to look for leading spaces */
+
+	if (*ptr == '-')
+	{
+		ptr++;
+		neg = true;
+	}
+
+	/* a leading '+' is uncommon so leave that for the slow path */
+
+	/* process the first digit */
+	digit = (*ptr - '0');
+
+	/*
+	 * Exploit unsigned arithmetic to save having to check both the upper and
+	 * lower bounds of the digit.
+	 */
+	if (likely(digit < 10))
+	{
+		ptr++;
+		tmp = digit;
+	}
+	else
+	{
+		/* we need at least one digit */
+		goto slow;
+	}
+
+	/* process remaining digits */
+	for (;;)
+	{
+		digit = (*ptr - '0');
+
+		if (digit >= 10)
+			break;
+
+		ptr++;
+
+		if (unlikely(tmp > -(PG_INT32_MIN / 10)))
+			goto out_of_range;
+
+		tmp = tmp * 10 + digit;
+	}
+
+	/* when the string does not end in a digit, let the slow path handle it */
+	if (unlikely(*ptr != '\0'))
+		goto slow;
+
+	if (neg)
+	{
+		/* check the negative equivalent will fit without overflowing */
+		if (unlikely(tmp > (uint32) (-(PG_INT32_MIN + 1)) + 1))
+			goto out_of_range;
+		return -((int32) tmp);
+	}
+
+	if (unlikely(tmp > PG_INT32_MAX))
+		goto out_of_range;
+
+	return (int32) tmp;
+
+slow:
+	tmp = 0;
+	ptr = s;
+	/* no need to reset neg */

 	/* skip leading spaces */
-	while (likely(*ptr) && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	/* handle sign */
@ -319,7 +487,7 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (isxdigit((unsigned char) *ptr))
 			{
@ -343,7 +511,7 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '7')
 			{
@ -367,7 +535,7 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '1')
 			{
@ -391,9 +559,9 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr;

-		while (*ptr)
+		for (;;)
 		{
-			if (isdigit((unsigned char) *ptr))
+			if (*ptr >= '0' && *ptr <= '9')
 			{
 				if (unlikely(tmp > -(PG_INT32_MIN / 10)))
 					goto out_of_range;
@ -420,7 +588,7 @@ pg_strtoint32_safe(const char *s, Node *escontext)
 		goto invalid_syntax;

 	/* allow trailing whitespace, but not other trailing chars */
-	while (*ptr != '\0' && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	if (unlikely(*ptr != '\0'))
@ -453,9 +621,18 @@ invalid_syntax:
 }

 /*
- * Convert input string to a signed 64 bit integer.
+ * Convert input string to a signed 64 bit integer.  Input strings may be
+ * expressed in base-10, hexadecimal, octal, or binary format, all of which
+ * can be prefixed by an optional sign character, either '+' (the default) or
+ * '-' for negative numbers.  Hex strings are recognized by the digits being
+ * prefixed by 0x or 0X while octal strings are recognized by the 0o or 0O
+ * prefix.  The binary representation is recognized by the 0b or 0B prefix.
 *
- * Allows any number of leading or trailing whitespace characters.
+ * Allows any number of leading or trailing whitespace characters.  Digits may
+ * optionally be separated by a single underscore character.  These can only
+ * come between digits and not before or after the digits.  Underscores have
+ * no effect on the return value and are supported only to assist in improving
+ * the human readability of the input strings.
 *
 * pg_strtoint64() will throw ereport() upon bad input format or overflow;
 * while pg_strtoint64_safe() instead returns such complaints in *escontext,
@ -478,9 +655,84 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 	const char *firstdigit;
 	uint64		tmp = 0;
 	bool		neg = false;
+	unsigned char digit;
+
+	/*
+	 * The majority of cases are likely to be base-10 digits without any
+	 * underscore separator characters.  We'll first try to parse the string
+	 * with the assumption that's the case and only fallback on a slower
+	 * implementation which handles hex, octal and binary strings and
+	 * underscores if the fastpath version cannot parse the string.
+	 */
+
+	/* leave it up to the slow path to look for leading spaces */
+
+	if (*ptr == '-')
+	{
+		ptr++;
+		neg = true;
+	}
+
+	/* a leading '+' is uncommon so leave that for the slow path */
+
+	/* process the first digit */
+	digit = (*ptr - '0');
+
+	/*
+	 * Exploit unsigned arithmetic to save having to check both the upper and
+	 * lower bounds of the digit.
+	 */
+	if (likely(digit < 10))
+	{
+		ptr++;
+		tmp = digit;
+	}
+	else
+	{
+		/* we need at least one digit */
+		goto slow;
+	}
+
+	/* process remaining digits */
+	for (;;)
+	{
+		digit = (*ptr - '0');
+
+		if (digit >= 10)
+			break;
+
+		ptr++;
+
+		if (unlikely(tmp > -(PG_INT64_MIN / 10)))
+			goto out_of_range;
+
+		tmp = tmp * 10 + digit;
+	}
+
+	/* when the string does not end in a digit, let the slow path handle it */
+	if (unlikely(*ptr != '\0'))
+		goto slow;
+
+	if (neg)
+	{
+		/* check the negative equivalent will fit without overflowing */
+		if (unlikely(tmp > (uint64) (-(PG_INT64_MIN + 1)) + 1))
+			goto out_of_range;
+		return -((int64) tmp);
+	}
+
+	if (unlikely(tmp > PG_INT64_MAX))
+		goto out_of_range;
+
+	return (int64) tmp;
+
+slow:
+	tmp = 0;
+	ptr = s;
+	/* no need to reset neg */

 	/* skip leading spaces */
-	while (*ptr && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	/* handle sign */
@ -497,7 +749,7 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (isxdigit((unsigned char) *ptr))
 			{
@ -521,7 +773,7 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '7')
 			{
@ -545,7 +797,7 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr += 2;

-		while (*ptr)
+		for (;;)
 		{
 			if (*ptr >= '0' && *ptr <= '1')
 			{
@ -569,9 +821,9 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 	{
 		firstdigit = ptr;

-		while (*ptr)
+		for (;;)
 		{
-			if (isdigit((unsigned char) *ptr))
+			if (*ptr >= '0' && *ptr <= '9')
 			{
 				if (unlikely(tmp > -(PG_INT64_MIN / 10)))
 					goto out_of_range;
@ -598,7 +850,7 @@ pg_strtoint64_safe(const char *s, Node *escontext)
 		goto invalid_syntax;

 	/* allow trailing whitespace, but not other trailing chars */
-	while (*ptr != '\0' && isspace((unsigned char) *ptr))
+	while (isspace((unsigned char) *ptr))
 		ptr++;

 	if (unlikely(*ptr != '\0'))