From 66f163068030b5c5fe792a0daee27822dac43791 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 2 Sep 2020 18:23:56 -0400 Subject: [PATCH] Add string_to_table() function. This splits a string at occurrences of a delimiter. It is exactly like string_to_array() except for producing a set of values instead of an array of values. Thus, the relationship of these two functions is the same as between regexp_split_to_table() and regexp_split_to_array(). Although the same results could be had from unnest(string_to_array()), this is somewhat faster than that, and anyway it seems reasonable to have it for symmetry with the regexp functions. Pavel Stehule, reviewed by Peter Smith Discussion: https://postgr.es/m/CAFj8pRD8HOpjq2TqeTBhSo_QkzjLOhXzGCpKJ4nCs7Y9SQkuPw@mail.gmail.com --- doc/src/sgml/func.sgml | 90 +++++++---- src/backend/utils/adt/varlena.c | 214 +++++++++++++++++++-------- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 18 ++- src/test/regress/expected/arrays.out | 108 ++++++++++++++ src/test/regress/sql/arrays.sql | 15 ++ 6 files changed, 354 insertions(+), 93 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 2efd80baa4..e2e618791e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3220,7 +3220,7 @@ repeat('Pg', 4) PgPgPgPg Splits string using a POSIX regular - expression as the delimiter; see + expression as the delimiter, producing an array of results; see . @@ -3239,7 +3239,7 @@ repeat('Pg', 4) PgPgPgPg Splits string using a POSIX regular - expression as the delimiter; see + expression as the delimiter, producing a set of results; see . @@ -3460,6 +3460,65 @@ repeat('Pg', 4) PgPgPgPg + + + + string_to_array + + string_to_array ( string text, delimiter text , null_string text ) + text[] + + + Splits the string at occurrences + of delimiter and forms the resulting fields + into a text array. + If delimiter is NULL, + each character in the string will become a + separate element in the array. + If delimiter is an empty string, then + the string is treated as a single field. + If null_string is supplied and is + not NULL, fields matching that string are + replaced by NULL. + + + string_to_array('xx~~yy~~zz', '~~', 'yy') + {xx,NULL,zz} + + + + + + + string_to_table + + string_to_table ( string text, delimiter text , null_string text ) + setof text + + + Splits the string at occurrences + of delimiter and returns the resulting fields + as a set of text rows. + If delimiter is NULL, + each character in the string will become a + separate row of the result. + If delimiter is an empty string, then + the string is treated as a single field. + If null_string is supplied and is + not NULL, fields matching that string are + replaced by NULL. + + + string_to_table('xx~^~yy~^~zz', '~^~', 'yy') + + + xx + NULL + zz + + + + @@ -17819,33 +17878,6 @@ SELECT NULLIF(value, '(none)') ... - - - - string_to_array - - string_to_array ( string text, delimiter text , null_string text ) - text[] - - - Splits the string at occurrences - of delimiter and forms the remaining data - into a text array. - If delimiter is NULL, - each character in the string will become a - separate element in the array. - If delimiter is an empty string, then - the string is treated as a single field. - If null_string is supplied and is - not NULL, fields matching that string are converted - to NULL entries. - - - string_to_array('xx~~yy~~zz', '~~', 'yy') - {xx,NULL,zz} - - - diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index df10bfb906..d7bc330541 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -26,6 +26,7 @@ #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "nodes/execnodes.h" #include "parser/scansup.h" #include "port/pg_bswap.h" #include "regex/regex.h" @@ -92,6 +93,17 @@ typedef struct pg_locale_t locale; } VarStringSortSupport; +/* + * Output data for split_text(): we output either to an array or a table. + * tupstore and tupdesc must be set up in advance to output to a table. + */ +typedef struct +{ + ArrayBuildState *astate; + Tuplestorestate *tupstore; + TupleDesc tupdesc; +} SplitTextOutputData; + /* * This should be large enough that most strings will fit, but small enough * that we feel comfortable putting it on the stack @@ -139,7 +151,11 @@ static bytea *bytea_substring(Datum str, bool length_not_specified); static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); static void appendStringInfoText(StringInfo str, const text *t); -static Datum text_to_array_internal(PG_FUNCTION_ARGS); +static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); +static void split_text_accum_result(SplitTextOutputData *tstate, + text *field_value, + text *null_string, + Oid collation); static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, const char *fldsep, const char *null_string); static StringInfo makeStringAggState(FunctionCallInfo fcinfo); @@ -4564,13 +4580,13 @@ replace_text_regexp(text *src_text, void *regexp, } /* - * split_text + * split_part * parse input string * return ord item (1 based) * based on provided field separator */ Datum -split_text(PG_FUNCTION_ARGS) +split_part(PG_FUNCTION_ARGS) { text *inputstring = PG_GETARG_TEXT_PP(0); text *fldsep = PG_GETARG_TEXT_PP(1); @@ -4599,7 +4615,6 @@ split_text(PG_FUNCTION_ARGS) /* empty field separator */ if (fldsep_len < 1) { - text_position_cleanup(&state); /* if first field, return input string, else empty string */ if (fldnum == 1) PG_RETURN_TEXT_P(inputstring); @@ -4679,7 +4694,19 @@ text_isequal(text *txt1, text *txt2, Oid collid) Datum text_to_array(PG_FUNCTION_ARGS) { - return text_to_array_internal(fcinfo); + SplitTextOutputData tstate; + + /* For array output, tstate should start as all zeroes */ + memset(&tstate, 0, sizeof(tstate)); + + if (!split_text(fcinfo, &tstate)) + PG_RETURN_NULL(); + + if (tstate.astate == NULL) + PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); + + PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate, + CurrentMemoryContext)); } /* @@ -4693,30 +4720,90 @@ text_to_array(PG_FUNCTION_ARGS) Datum text_to_array_null(PG_FUNCTION_ARGS) { - return text_to_array_internal(fcinfo); + return text_to_array(fcinfo); } /* - * common code for text_to_array and text_to_array_null functions + * text_to_table + * parse input string and return table of elements, + * based on provided field separator + */ +Datum +text_to_table(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; + SplitTextOutputData tstate; + MemoryContext old_cxt; + + /* check to see if caller supports us returning a tuplestore */ + if (rsi == NULL || !IsA(rsi, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsi->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* OK, prepare tuplestore in per-query memory */ + old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory); + + tstate.astate = NULL; + tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc); + tstate.tupstore = tuplestore_begin_heap(true, false, work_mem); + + MemoryContextSwitchTo(old_cxt); + + (void) split_text(fcinfo, &tstate); + + tuplestore_donestoring(tstate.tupstore); + + rsi->returnMode = SFRM_Materialize; + rsi->setResult = tstate.tupstore; + rsi->setDesc = tstate.tupdesc; + + return (Datum) 0; +} + +/* + * text_to_table_null + * parse input string and return table of elements, + * based on provided field separator and null string + * + * This is a separate entry point only to prevent the regression tests from + * complaining about different argument sets for the same internal function. + */ +Datum +text_to_table_null(PG_FUNCTION_ARGS) +{ + return text_to_table(fcinfo); +} + +/* + * Common code for text_to_array, text_to_array_null, text_to_table + * and text_to_table_null functions. * * These are not strict so we have to test for null inputs explicitly. + * Returns false if result is to be null, else returns true. + * + * Note that if the result is valid but empty (zero elements), we return + * without changing *tstate --- caller must handle that case, too. */ -static Datum -text_to_array_internal(PG_FUNCTION_ARGS) +static bool +split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) { text *inputstring; text *fldsep; text *null_string; + Oid collation = PG_GET_COLLATION(); int inputstring_len; int fldsep_len; char *start_ptr; text *result_text; - bool is_null; - ArrayBuildState *astate = NULL; /* when input string is NULL, then result is NULL too */ if (PG_ARGISNULL(0)) - PG_RETURN_NULL(); + return false; inputstring = PG_GETARG_TEXT_PP(0); @@ -4743,35 +4830,19 @@ text_to_array_internal(PG_FUNCTION_ARGS) inputstring_len = VARSIZE_ANY_EXHDR(inputstring); fldsep_len = VARSIZE_ANY_EXHDR(fldsep); - /* return empty array for empty input string */ + /* return empty set for empty input string */ if (inputstring_len < 1) - PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); + return true; - /* - * empty field separator: return the input string as a one-element - * array - */ + /* empty field separator: return input string as a one-element set */ if (fldsep_len < 1) { - Datum elems[1]; - bool nulls[1]; - int dims[1]; - int lbs[1]; - - /* single element can be a NULL too */ - is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false; - - elems[0] = PointerGetDatum(inputstring); - nulls[0] = is_null; - dims[0] = 1; - lbs[0] = 1; - /* XXX: this hardcodes assumptions about the text type */ - PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls, - 1, dims, lbs, - TEXTOID, -1, false, TYPALIGN_INT)); + split_text_accum_result(tstate, inputstring, + null_string, collation); + return true; } - text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state); + text_position_setup(inputstring, fldsep, collation, &state); start_ptr = VARDATA_ANY(inputstring); @@ -4797,16 +4868,12 @@ text_to_array_internal(PG_FUNCTION_ARGS) chunk_len = end_ptr - start_ptr; } - /* must build a temp text datum to pass to accumArrayResult */ + /* build a temp text datum to pass to split_text_accum_result */ result_text = cstring_to_text_with_len(start_ptr, chunk_len); - is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false; /* stash away this field */ - astate = accumArrayResult(astate, - PointerGetDatum(result_text), - is_null, - TEXTOID, - CurrentMemoryContext); + split_text_accum_result(tstate, result_text, + null_string, collation); pfree(result_text); @@ -4821,16 +4888,12 @@ text_to_array_internal(PG_FUNCTION_ARGS) else { /* - * When fldsep is NULL, each character in the inputstring becomes an - * element in the result array. The separator is effectively the - * space between characters. + * When fldsep is NULL, each character in the input string becomes a + * separate element in the result set. The separator is effectively + * the space between characters. */ inputstring_len = VARSIZE_ANY_EXHDR(inputstring); - /* return empty array for empty input string */ - if (inputstring_len < 1) - PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); - start_ptr = VARDATA_ANY(inputstring); while (inputstring_len > 0) @@ -4839,16 +4902,12 @@ text_to_array_internal(PG_FUNCTION_ARGS) CHECK_FOR_INTERRUPTS(); - /* must build a temp text datum to pass to accumArrayResult */ + /* build a temp text datum to pass to split_text_accum_result */ result_text = cstring_to_text_with_len(start_ptr, chunk_len); - is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false; /* stash away this field */ - astate = accumArrayResult(astate, - PointerGetDatum(result_text), - is_null, - TEXTOID, - CurrentMemoryContext); + split_text_accum_result(tstate, result_text, + null_string, collation); pfree(result_text); @@ -4857,8 +4916,47 @@ text_to_array_internal(PG_FUNCTION_ARGS) } } - PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, - CurrentMemoryContext)); + return true; +} + +/* + * Add text item to result set (table or array). + * + * This is also responsible for checking to see if the item matches + * the null_string, in which case we should emit NULL instead. + */ +static void +split_text_accum_result(SplitTextOutputData *tstate, + text *field_value, + text *null_string, + Oid collation) +{ + bool is_null = false; + + if (null_string && text_isequal(field_value, null_string, collation)) + is_null = true; + + if (tstate->tupstore) + { + Datum values[1]; + bool nulls[1]; + + values[0] = PointerGetDatum(field_value); + nulls[0] = is_null; + + tuplestore_putvalues(tstate->tupstore, + tstate->tupdesc, + values, + nulls); + } + else + { + tstate->astate = accumArrayResult(tstate->astate, + PointerGetDatum(field_value), + is_null, + TEXTOID, + CurrentMemoryContext); + } } /* diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 52ca61f8a8..c807f83bad 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202008301 +#define CATALOG_VERSION_NO 202009021 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1dd325e0e6..687509ba92 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1561,16 +1561,24 @@ { oid => '383', proname => 'array_cat', proisstrict => 'f', prorettype => 'anyarray', proargtypes => 'anyarray anyarray', prosrc => 'array_cat' }, -{ oid => '394', descr => 'split delimited text into text[]', +{ oid => '394', descr => 'split delimited text', proname => 'string_to_array', proisstrict => 'f', prorettype => '_text', proargtypes => 'text text', prosrc => 'text_to_array' }, +{ oid => '376', descr => 'split delimited text, with null string', + proname => 'string_to_array', proisstrict => 'f', prorettype => '_text', + proargtypes => 'text text text', prosrc => 'text_to_array_null' }, +{ oid => '8432', descr => 'split delimited text', + proname => 'string_to_table', proisstrict => 'f', prorows => '1000', + proretset => 't', prorettype => 'text', proargtypes => 'text text', + prosrc => 'text_to_table' }, +{ oid => '8433', descr => 'split delimited text, with null string', + proname => 'string_to_table', proisstrict => 'f', prorows => '1000', + proretset => 't', prorettype => 'text', proargtypes => 'text text text', + prosrc => 'text_to_table_null' }, { oid => '395', descr => 'concatenate array elements, using delimiter, into text', proname => 'array_to_string', provolatile => 's', prorettype => 'text', proargtypes => 'anyarray text', prosrc => 'array_to_text' }, -{ oid => '376', descr => 'split delimited text into text[], with null string', - proname => 'string_to_array', proisstrict => 'f', prorettype => '_text', - proargtypes => 'text text text', prosrc => 'text_to_array_null' }, { oid => '384', descr => 'concatenate array elements, using delimiter and null string, into text', proname => 'array_to_string', proisstrict => 'f', provolatile => 's', @@ -3547,7 +3555,7 @@ prosrc => 'regexp_matches' }, { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', - proargtypes => 'text text int4', prosrc => 'split_text' }, + proargtypes => 'text text int4', prosrc => 'split_part' }, { oid => '2765', descr => 'split string by pattern', proname => 'regexp_split_to_table', prorows => '1000', proretset => 't', prorettype => 'text', proargtypes => 'text text', diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index c730563f03..f9d9ad6aef 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -1755,6 +1755,114 @@ select string_to_array('1,2,3,4,*,6', ',', '*'); {1,2,3,4,NULL,6} (1 row) +select v, v is null as "is null" from string_to_table('1|2|3', '|') g(v); + v | is null +---+--------- + 1 | f + 2 | f + 3 | f +(3 rows) + +select v, v is null as "is null" from string_to_table('1|2|3|', '|') g(v); + v | is null +---+--------- + 1 | f + 2 | f + 3 | f + | f +(4 rows) + +select v, v is null as "is null" from string_to_table('1||2|3||', '||') g(v); + v | is null +-----+--------- + 1 | f + 2|3 | f + | f +(3 rows) + +select v, v is null as "is null" from string_to_table('1|2|3', '') g(v); + v | is null +-------+--------- + 1|2|3 | f +(1 row) + +select v, v is null as "is null" from string_to_table('', '|') g(v); + v | is null +---+--------- +(0 rows) + +select v, v is null as "is null" from string_to_table('1|2|3', NULL) g(v); + v | is null +---+--------- + 1 | f + | | f + 2 | f + | | f + 3 | f +(5 rows) + +select v, v is null as "is null" from string_to_table(NULL, '|') g(v); + v | is null +---+--------- +(0 rows) + +select v, v is null as "is null" from string_to_table('abc', '') g(v); + v | is null +-----+--------- + abc | f +(1 row) + +select v, v is null as "is null" from string_to_table('abc', '', 'abc') g(v); + v | is null +---+--------- + | t +(1 row) + +select v, v is null as "is null" from string_to_table('abc', ',') g(v); + v | is null +-----+--------- + abc | f +(1 row) + +select v, v is null as "is null" from string_to_table('abc', ',', 'abc') g(v); + v | is null +---+--------- + | t +(1 row) + +select v, v is null as "is null" from string_to_table('1,2,3,4,,6', ',') g(v); + v | is null +---+--------- + 1 | f + 2 | f + 3 | f + 4 | f + | f + 6 | f +(6 rows) + +select v, v is null as "is null" from string_to_table('1,2,3,4,,6', ',', '') g(v); + v | is null +---+--------- + 1 | f + 2 | f + 3 | f + 4 | f + | t + 6 | f +(6 rows) + +select v, v is null as "is null" from string_to_table('1,2,3,4,*,6', ',', '*') g(v); + v | is null +---+--------- + 1 | f + 2 | f + 3 | f + 4 | f + | t + 6 | f +(6 rows) + select array_to_string(NULL::int4[], ',') IS NULL; ?column? ---------- diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 25dd4e2c6d..2b689ae88f 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -544,6 +544,21 @@ select string_to_array('1,2,3,4,,6', ','); select string_to_array('1,2,3,4,,6', ',', ''); select string_to_array('1,2,3,4,*,6', ',', '*'); +select v, v is null as "is null" from string_to_table('1|2|3', '|') g(v); +select v, v is null as "is null" from string_to_table('1|2|3|', '|') g(v); +select v, v is null as "is null" from string_to_table('1||2|3||', '||') g(v); +select v, v is null as "is null" from string_to_table('1|2|3', '') g(v); +select v, v is null as "is null" from string_to_table('', '|') g(v); +select v, v is null as "is null" from string_to_table('1|2|3', NULL) g(v); +select v, v is null as "is null" from string_to_table(NULL, '|') g(v); +select v, v is null as "is null" from string_to_table('abc', '') g(v); +select v, v is null as "is null" from string_to_table('abc', '', 'abc') g(v); +select v, v is null as "is null" from string_to_table('abc', ',') g(v); +select v, v is null as "is null" from string_to_table('abc', ',', 'abc') g(v); +select v, v is null as "is null" from string_to_table('1,2,3,4,,6', ',') g(v); +select v, v is null as "is null" from string_to_table('1,2,3,4,,6', ',', '') g(v); +select v, v is null as "is null" from string_to_table('1,2,3,4,*,6', ',', '*') g(v); + select array_to_string(NULL::int4[], ',') IS NULL; select array_to_string('{}'::int4[], ','); select array_to_string(array[1,2,3,4,NULL,6], ',');