Support negative indexes in split_part().

This provides a handy way to get, say, the last field of the string. Use of a negative index in this way has precedent in the nearby left() and right() functions. The implementation scans the string twice when N < -1, but it seems likely that N = -1 will be the huge majority of actual use cases, so I'm not really excited about adding complexity to avoid that. Nikhil Benesch, reviewed by Jacob Champion; cosmetic tweakage by me Discussion: https://postgr.es/m/cbb7f861-6162-3a51-9823-97bc3aa0b638@gmail.com
2020-11-13 13:49:48 -05:00 · 2020-11-13 13:49:48 -05:00 · ec0294fb2c
parent 3bf44303b9
commit ec0294fb2c
4 changed files with 172 additions and 17 deletions
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@ -3356,11 +3356,17 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
       <para>
        Splits <parameter>string</parameter> at occurrences
        of <parameter>delimiter</parameter> and returns
-        the <parameter>n</parameter>'th field (counting from one).
+        the <parameter>n</parameter>'th field (counting from one),
        or when <parameter>n</parameter> is negative, returns
        the |<parameter>n</parameter>|'th-from-last field.
       </para>
       <para>
        <literal>split_part('abc~@~def~@~ghi', '~@~', 2)</literal>
        <returnvalue>def</returnvalue>
       </para>
       <para>
        <literal>split_part('abc,def,ghi,jkl', ',', -2)</literal>
        <returnvalue>ghi</returnvalue>
       </para></entry>
      </row>
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -51,7 +51,7 @@ typedef struct varlena VarString;
 typedef struct
 {
 	bool		is_multibyte;	/* T if multibyte encoding */
-	bool		is_multibyte_char_in_char;
+	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */
 	char	   *str1;			/* haystack string */
 	char	   *str2;			/* needle string */
@ -1439,8 +1439,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
 /*
 * Return a pointer to the current match.
 *
- * The returned pointer points into correct position in the original
+ * The returned pointer points into the original haystack string.
 * the haystack string.
 */
 static char *
 text_position_get_match_ptr(TextPositionState *state)
@ -1471,12 +1470,27 @@ text_position_get_match_pos(TextPositionState *state)
 	}
 }
 /*
 * Reset search state to the initial state installed by text_position_setup.
 *
 * The next call to text_position_next will search from the beginning
 * of the string.
 */
 static void
 text_position_reset(TextPositionState *state)
 {
 	state->last_match = NULL;
 	state->refpoint = state->str1;
 	state->refpos = 0;
 }
 static void
 text_position_cleanup(TextPositionState *state)
 {
 	/* no cleanup needed */
 }
 static void
 check_collation_set(Oid collid)
 {
@ -4581,9 +4595,8 @@ replace_text_regexp(text *src_text, void *regexp,
 /*
 * split_part
- * parse input string
+ * parse input string based on provided field separator
- * return ord item (1 based)
+ * return N'th item (1 based, negative counts from end)
 * based on provided field separator
 */
 Datum
 split_part(PG_FUNCTION_ARGS)
@ -4600,10 +4613,10 @@ split_part(PG_FUNCTION_ARGS)
 	bool		found;
 	/* field number is 1 based */
-	if (fldnum < 1)
+	if (fldnum == 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("field position must be greater than zero")));
+				 errmsg("field position must not be zero")));
 	inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
 	fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
@ -4612,32 +4625,72 @@ split_part(PG_FUNCTION_ARGS)
 	if (inputstring_len < 1)
 		PG_RETURN_TEXT_P(cstring_to_text(""));
-	/* empty field separator */
+	/* handle empty field separator */
 	if (fldsep_len < 1)
 	{
-		/* if first field, return input string, else empty string */
+		/* if first or last field, return input string, else empty string */
-		if (fldnum == 1)
+		if (fldnum == 1 || fldnum == -1)
 			PG_RETURN_TEXT_P(inputstring);
 		else
 			PG_RETURN_TEXT_P(cstring_to_text(""));
 	}
 	/* find the first field separator */
 	text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
 	/* identify bounds of first field */
 	start_ptr = VARDATA_ANY(inputstring);
 	found = text_position_next(&state);
 	/* special case if fldsep not found at all */
 	if (!found)
 	{
 		text_position_cleanup(&state);
-		/* if field 1 requested, return input string, else empty string */
+		/* if first or last field, return input string, else empty string */
-		if (fldnum == 1)
+		if (fldnum == 1 || fldnum == -1)
 			PG_RETURN_TEXT_P(inputstring);
 		else
 			PG_RETURN_TEXT_P(cstring_to_text(""));
 	}
 	/*
 	 * take care of a negative field number (i.e. count from the right) by
 	 * converting to a positive field number; we need total number of fields
 	 */
 	if (fldnum < 0)
 	{
 		/* we found a fldsep, so there are at least two fields */
 		int			numfields = 2;
 		while (text_position_next(&state))
 			numfields++;
 		/* special case of last field does not require an extra pass */
 		if (fldnum == -1)
 		{
 			start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
 			end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
 			text_position_cleanup(&state);
 			PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
 													  end_ptr - start_ptr));
 		}
 		/* else, convert fldnum to positive notation */
 		fldnum += numfields + 1;
 		/* if nonexistent field, return empty string */
 		if (fldnum <= 0)
 		{
 			text_position_cleanup(&state);
 			PG_RETURN_TEXT_P(cstring_to_text(""));
 		}
 		/* reset to pointing at first match, but now with positive fldnum */
 		text_position_reset(&state);
 		found = text_position_next(&state);
 		Assert(found);
 	}
 	/* identify bounds of first field */
 	start_ptr = VARDATA_ANY(inputstring);
 	end_ptr = text_position_get_match_ptr(&state);
 	while (found && --fldnum > 0)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@ -1552,8 +1552,56 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo";
 --
 -- test split_part
 --
 select split_part('','@',1) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('','@',-1) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase";
 joeuser@mydatabase 
 --------------------
 joeuser@mydatabase
 (1 row)
 select split_part('joeuser@mydatabase','',2) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase";
 joeuser@mydatabase 
 --------------------
 joeuser@mydatabase
 (1 row)
 select split_part('joeuser@mydatabase','',-2) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('joeuser@mydatabase','@',0) AS "an error";
-ERROR:  field position must be greater than zero
+ERROR:  field position must not be zero
 select split_part('joeuser@mydatabase','@@',1) AS "joeuser@mydatabase";
 joeuser@mydatabase 
 --------------------
 joeuser@mydatabase
 (1 row)
 select split_part('joeuser@mydatabase','@@',2) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('joeuser@mydatabase','@',1) AS "joeuser";
 joeuser 
 ---------
@ -1578,6 +1626,30 @@ select split_part('@joeuser@mydatabase@','@',2) AS "joeuser";
 joeuser
 (1 row)
 select split_part('joeuser@mydatabase','@',-1) AS "mydatabase";
 mydatabase 
 ------------
 mydatabase
 (1 row)
 select split_part('joeuser@mydatabase','@',-2) AS "joeuser";
 joeuser 
 ---------
 joeuser
 (1 row)
 select split_part('joeuser@mydatabase','@',-3) AS "empty string";
 empty string 
 --------------
 (1 row)
 select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase";
 mydatabase 
 ------------
 mydatabase
 (1 row)
 --
 -- test to_hex
 --
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@ -533,8 +533,24 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo";
 --
 -- test split_part
 --
 select split_part('','@',1) AS "empty string";
 select split_part('','@',-1) AS "empty string";
 select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase";
 select split_part('joeuser@mydatabase','',2) AS "empty string";
 select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase";
 select split_part('joeuser@mydatabase','',-2) AS "empty string";
 select split_part('joeuser@mydatabase','@',0) AS "an error";
 select split_part('joeuser@mydatabase','@@',1) AS "joeuser@mydatabase";
 select split_part('joeuser@mydatabase','@@',2) AS "empty string";
 select split_part('joeuser@mydatabase','@',1) AS "joeuser";
 select split_part('joeuser@mydatabase','@',2) AS "mydatabase";
@ -543,6 +559,14 @@ select split_part('joeuser@mydatabase','@',3) AS "empty string";
 select split_part('@joeuser@mydatabase@','@',2) AS "joeuser";
 select split_part('joeuser@mydatabase','@',-1) AS "mydatabase";
 select split_part('joeuser@mydatabase','@',-2) AS "joeuser";
 select split_part('joeuser@mydatabase','@',-3) AS "empty string";
 select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase";
 --
 -- test to_hex
 --