From ec0294fb2c8287fd673c57701cdcf6c6396b2f60 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 13 Nov 2020 13:49:48 -0500 Subject: [PATCH] Support negative indexes in split_part(). This provides a handy way to get, say, the last field of the string. Use of a negative index in this way has precedent in the nearby left() and right() functions. The implementation scans the string twice when N < -1, but it seems likely that N = -1 will be the huge majority of actual use cases, so I'm not really excited about adding complexity to avoid that. Nikhil Benesch, reviewed by Jacob Champion; cosmetic tweakage by me Discussion: https://postgr.es/m/cbb7f861-6162-3a51-9823-97bc3aa0b638@gmail.com --- doc/src/sgml/func.sgml | 8 ++- src/backend/utils/adt/varlena.c | 83 ++++++++++++++++++++++----- src/test/regress/expected/strings.out | 74 +++++++++++++++++++++++- src/test/regress/sql/strings.sql | 24 ++++++++ 4 files changed, 172 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 2783985b55..7c7d177c02 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3356,11 +3356,17 @@ repeat('Pg', 4) PgPgPgPg Splits string at occurrences of delimiter and returns - the n'th field (counting from one). + the n'th field (counting from one), + or when n is negative, returns + the |n|'th-from-last field. split_part('abc~@~def~@~ghi', '~@~', 2) def + + + split_part('abc,def,ghi,jkl', ',', -2) + ghi diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 5512e02940..ff9bf238f3 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -51,7 +51,7 @@ typedef struct varlena VarString; typedef struct { bool is_multibyte; /* T if multibyte encoding */ - bool is_multibyte_char_in_char; + bool is_multibyte_char_in_char; /* need to check char boundaries? */ char *str1; /* haystack string */ char *str2; /* needle string */ @@ -1439,8 +1439,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) /* * Return a pointer to the current match. * - * The returned pointer points into correct position in the original - * the haystack string. + * The returned pointer points into the original haystack string. */ static char * text_position_get_match_ptr(TextPositionState *state) @@ -1471,12 +1470,27 @@ text_position_get_match_pos(TextPositionState *state) } } +/* + * Reset search state to the initial state installed by text_position_setup. + * + * The next call to text_position_next will search from the beginning + * of the string. + */ +static void +text_position_reset(TextPositionState *state) +{ + state->last_match = NULL; + state->refpoint = state->str1; + state->refpos = 0; +} + static void text_position_cleanup(TextPositionState *state) { /* no cleanup needed */ } + static void check_collation_set(Oid collid) { @@ -4581,9 +4595,8 @@ replace_text_regexp(text *src_text, void *regexp, /* * split_part - * parse input string - * return ord item (1 based) - * based on provided field separator + * parse input string based on provided field separator + * return N'th item (1 based, negative counts from end) */ Datum split_part(PG_FUNCTION_ARGS) @@ -4600,10 +4613,10 @@ split_part(PG_FUNCTION_ARGS) bool found; /* field number is 1 based */ - if (fldnum < 1) + if (fldnum == 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("field position must be greater than zero"))); + errmsg("field position must not be zero"))); inputstring_len = VARSIZE_ANY_EXHDR(inputstring); fldsep_len = VARSIZE_ANY_EXHDR(fldsep); @@ -4612,32 +4625,72 @@ split_part(PG_FUNCTION_ARGS) if (inputstring_len < 1) PG_RETURN_TEXT_P(cstring_to_text("")); - /* empty field separator */ + /* handle empty field separator */ if (fldsep_len < 1) { - /* if first field, return input string, else empty string */ - if (fldnum == 1) + /* if first or last field, return input string, else empty string */ + if (fldnum == 1 || fldnum == -1) PG_RETURN_TEXT_P(inputstring); else PG_RETURN_TEXT_P(cstring_to_text("")); } + /* find the first field separator */ text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state); - /* identify bounds of first field */ - start_ptr = VARDATA_ANY(inputstring); found = text_position_next(&state); /* special case if fldsep not found at all */ if (!found) { text_position_cleanup(&state); - /* if field 1 requested, return input string, else empty string */ - if (fldnum == 1) + /* if first or last field, return input string, else empty string */ + if (fldnum == 1 || fldnum == -1) PG_RETURN_TEXT_P(inputstring); else PG_RETURN_TEXT_P(cstring_to_text("")); } + + /* + * take care of a negative field number (i.e. count from the right) by + * converting to a positive field number; we need total number of fields + */ + if (fldnum < 0) + { + /* we found a fldsep, so there are at least two fields */ + int numfields = 2; + + while (text_position_next(&state)) + numfields++; + + /* special case of last field does not require an extra pass */ + if (fldnum == -1) + { + start_ptr = text_position_get_match_ptr(&state) + fldsep_len; + end_ptr = VARDATA_ANY(inputstring) + inputstring_len; + text_position_cleanup(&state); + PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr, + end_ptr - start_ptr)); + } + + /* else, convert fldnum to positive notation */ + fldnum += numfields + 1; + + /* if nonexistent field, return empty string */ + if (fldnum <= 0) + { + text_position_cleanup(&state); + PG_RETURN_TEXT_P(cstring_to_text("")); + } + + /* reset to pointing at first match, but now with positive fldnum */ + text_position_reset(&state); + found = text_position_next(&state); + Assert(found); + } + + /* identify bounds of first field */ + start_ptr = VARDATA_ANY(inputstring); end_ptr = text_position_get_match_ptr(&state); while (found && --fldnum > 0) diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 8c034c9599..298b6c48c2 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -1552,8 +1552,56 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo"; -- -- test split_part -- +select split_part('','@',1) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('','@',-1) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase"; + joeuser@mydatabase +-------------------- + joeuser@mydatabase +(1 row) + +select split_part('joeuser@mydatabase','',2) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase"; + joeuser@mydatabase +-------------------- + joeuser@mydatabase +(1 row) + +select split_part('joeuser@mydatabase','',-2) AS "empty string"; + empty string +-------------- + +(1 row) + select split_part('joeuser@mydatabase','@',0) AS "an error"; -ERROR: field position must be greater than zero +ERROR: field position must not be zero +select split_part('joeuser@mydatabase','@@',1) AS "joeuser@mydatabase"; + joeuser@mydatabase +-------------------- + joeuser@mydatabase +(1 row) + +select split_part('joeuser@mydatabase','@@',2) AS "empty string"; + empty string +-------------- + +(1 row) + select split_part('joeuser@mydatabase','@',1) AS "joeuser"; joeuser --------- @@ -1578,6 +1626,30 @@ select split_part('@joeuser@mydatabase@','@',2) AS "joeuser"; joeuser (1 row) +select split_part('joeuser@mydatabase','@',-1) AS "mydatabase"; + mydatabase +------------ + mydatabase +(1 row) + +select split_part('joeuser@mydatabase','@',-2) AS "joeuser"; + joeuser +--------- + joeuser +(1 row) + +select split_part('joeuser@mydatabase','@',-3) AS "empty string"; + empty string +-------------- + +(1 row) + +select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase"; + mydatabase +------------ + mydatabase +(1 row) + -- -- test to_hex -- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 14901a2692..ad5221ab6b 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -533,8 +533,24 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo"; -- -- test split_part -- +select split_part('','@',1) AS "empty string"; + +select split_part('','@',-1) AS "empty string"; + +select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase"; + +select split_part('joeuser@mydatabase','',2) AS "empty string"; + +select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase"; + +select split_part('joeuser@mydatabase','',-2) AS "empty string"; + select split_part('joeuser@mydatabase','@',0) AS "an error"; +select split_part('joeuser@mydatabase','@@',1) AS "joeuser@mydatabase"; + +select split_part('joeuser@mydatabase','@@',2) AS "empty string"; + select split_part('joeuser@mydatabase','@',1) AS "joeuser"; select split_part('joeuser@mydatabase','@',2) AS "mydatabase"; @@ -543,6 +559,14 @@ select split_part('joeuser@mydatabase','@',3) AS "empty string"; select split_part('@joeuser@mydatabase@','@',2) AS "joeuser"; +select split_part('joeuser@mydatabase','@',-1) AS "mydatabase"; + +select split_part('joeuser@mydatabase','@',-2) AS "joeuser"; + +select split_part('joeuser@mydatabase','@',-3) AS "empty string"; + +select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase"; + -- -- test to_hex --