Add array_sample() and array_shuffle() functions.

These are useful in Monte Carlo applications.

Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself

Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
This commit is contained in:
Tom Lane 2023-04-07 11:47:07 -04:00
parent cd82e5c79d
commit 888f2ea0a8
6 changed files with 284 additions and 2 deletions

View File

@ -16053,7 +16053,7 @@ SELECT js,
js IS JSON ARRAY "array?"
FROM (VALUES
('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js);
js | json? | scalar? | object? | array?
js | json? | scalar? | object? | array?
------------+-------+---------+---------+--------
123 | t | t | f | f
"abc" | t | t | f | f
@ -18777,6 +18777,48 @@ SELECT NULLIF(value, '(none)') ...
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>array_sample</primary>
</indexterm>
<function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
<returnvalue>anyarray</returnvalue>
</para>
<para>
Returns an array of <parameter>n</parameter> items randomly selected
from <parameter>array</parameter>. <parameter>n</parameter> may not
exceed the length of <parameter>array</parameter>'s first dimension.
If <parameter>array</parameter> is multi-dimensional,
an <quote>item</quote> is a slice having a given first subscript.
</para>
<para>
<literal>array_sample(ARRAY[1,2,3,4,5,6], 3)</literal>
<returnvalue>{2,6,1}</returnvalue>
</para>
<para>
<literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
<returnvalue>{{5,6},{1,2}}</returnvalue>
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>array_shuffle</primary>
</indexterm>
<function>array_shuffle</function> ( <type>anyarray</type> )
<returnvalue>anyarray</returnvalue>
</para>
<para>
Randomly shuffles the first dimension of the array.
</para>
<para>
<literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
<returnvalue>{{5,6},{1,2},{3,4}}</returnvalue>
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm id="function-array-to-string">

View File

@ -15,6 +15,7 @@
#include "catalog/pg_type.h"
#include "libpq/pqformat.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/datum.h"
@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
}
/*
 * array_shuffle_n
 *		Build a new array made of n randomly selected items of the input.
 *
 * An "item" is everything sharing one subscript in the first dimension, so
 * for a multi-dimensional input each item is a whole slice.  Callers must
 * ensure that n does not exceed the length of the first dimension.
 *
 * If keep_lb is true the result keeps the input's first-dimension lower
 * bound; otherwise that bound becomes 1.  Dimensions and lower bounds of
 * all other dimensions are copied from the input unchanged.
 *
 * NOTE: it would be cleaner to fetch the elmlen/elmbyval/elmalign info from
 * the system catalogs using just elmtyp, but the caller is better placed to
 * cache that information across calls, so it hands us the type cache entry.
 */
static ArrayType *
array_shuffle_n(ArrayType *array, int n, bool keep_lb,
				Oid elmtyp, TypeCacheEntry *typentry)
{
	int			ndim = ARR_NDIM(array);
	int		   *dims = ARR_DIMS(array);
	int		   *lbs = ARR_LBOUND(array);
	int16		elmlen = typentry->typlen;
	bool		elmbyval = typentry->typbyval;
	char		elmalign = typentry->typalign;
	Datum	   *values;
	bool	   *isnull;
	int			nvalues;
	int			nitems;
	int			stride;
	int			out_dims[MAXDIM];
	int			out_lbs[MAXDIM];
	ArrayType  *result;

	/* Empty input or nothing requested: the answer is an empty array */
	if (ndim < 1 || dims[0] < 1 || n < 1)
		return construct_empty_array(elmtyp);

	deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
					  &values, &isnull, &nvalues);

	nitems = dims[0];			/* number of first-dimension items */
	stride = nvalues / nitems;	/* number of datums making up one item */

	Assert(n <= nitems);		/* anything else is a caller bug */

	/*
	 * Partial Fisher-Yates shuffle.  For each of the first n positions, pick
	 * a random item index between that position and the last item
	 * (inclusive) and exchange the two items, datum by datum along with
	 * their null flags.  After n rounds the leading n items are the sample.
	 */
	for (int pos = 0; pos < n; pos++)
	{
		int			pick = (int) pg_prng_uint64_range(&pg_global_prng_state,
													  pos, nitems - 1);
		Datum	   *cur_vals = values + pos * stride;
		bool	   *cur_nulls = isnull + pos * stride;
		Datum	   *pick_vals = values + pick * stride;
		bool	   *pick_nulls = isnull + pick * stride;

		for (int k = 0; k < stride; k++)
		{
			Datum		save_val = cur_vals[k];
			bool		save_null = cur_nulls[k];

			cur_vals[k] = pick_vals[k];
			cur_nulls[k] = pick_nulls[k];
			pick_vals[k] = save_val;
			pick_nulls[k] = save_null;
		}
	}

	/*
	 * Describe the result: same shape as the input, except that the first
	 * dimension has length n and, unless keep_lb, a lower bound of 1.
	 */
	for (int d = 0; d < ndim; d++)
	{
		out_dims[d] = (d == 0) ? n : dims[d];
		out_lbs[d] = (d == 0 && !keep_lb) ? 1 : lbs[d];
	}

	result = construct_md_array(values, isnull, ndim, out_dims, out_lbs,
								elmtyp, elmlen, elmbyval, elmalign);

	/* The deconstructed workspace is no longer needed */
	pfree(values);
	pfree(isnull);

	return result;
}
/*
 * array_shuffle
 *
 * SQL-callable: return an array shaped like the input (including its lower
 * bounds) whose first-dimension items appear in random order.
 */
Datum
array_shuffle(PG_FUNCTION_ARGS)
{
	ArrayType  *array = PG_GETARG_ARRAYTYPE_P(0);
	int			nitem;
	Oid			elmtyp;
	TypeCacheEntry *cached;

	/* Length of the first dimension, treating a zero-dimensional array as 0 */
	nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];

	/* With fewer than two items there is nothing to reorder */
	if (nitem < 2)
		PG_RETURN_ARRAYTYPE_P(array);

	elmtyp = ARR_ELEMTYPE(array);

	/*
	 * Reuse the type cache entry remembered in fn_extra when it matches the
	 * current element type; otherwise look it up and remember it.
	 */
	cached = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
	if (cached == NULL || cached->type_id != elmtyp)
	{
		cached = lookup_type_cache(elmtyp, 0);
		fcinfo->flinfo->fn_extra = (void *) cached;
	}

	/* Shuffling is sampling every item while keeping the lower bound */
	PG_RETURN_ARRAYTYPE_P(array_shuffle_n(array, nitem, true, elmtyp, cached));
}
/*
 * array_sample
 *
 * SQL-callable: return an array of n first-dimension items picked at random
 * from the input array.  Raises an error unless n lies between 0 and the
 * length of the input's first dimension.
 */
Datum
array_sample(PG_FUNCTION_ARGS)
{
	ArrayType  *array = PG_GETARG_ARRAYTYPE_P(0);
	int			n = PG_GETARG_INT32(1);
	int			nitem = 0;
	Oid			elmtyp;
	TypeCacheEntry *cached;

	/* A zero-dimensional array has no items to sample from */
	if (ARR_NDIM(array) >= 1)
		nitem = ARR_DIMS(array)[0];

	if (!(n >= 0 && n <= nitem))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("sample size must be between 0 and %d", nitem)));

	elmtyp = ARR_ELEMTYPE(array);

	/*
	 * Reuse the type cache entry remembered in fn_extra when it matches the
	 * current element type; otherwise look it up and remember it.
	 */
	cached = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
	if (cached == NULL || cached->type_id != elmtyp)
	{
		cached = lookup_type_cache(elmtyp, 0);
		fcinfo->flinfo->fn_extra = (void *) cached;
	}

	/* The sample's first dimension always starts at subscript 1 */
	PG_RETURN_ARRAYTYPE_P(array_shuffle_n(array, n, false, elmtyp, cached));
}

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202304051
#define CATALOG_VERSION_NO 202304071
#endif

View File

@ -1717,6 +1717,12 @@
{ oid => '6172', descr => 'remove last N elements of array',
proname => 'trim_array', prorettype => 'anyarray',
proargtypes => 'anyarray int4', prosrc => 'trim_array' },
{ oid => '8464', descr => 'shuffle array',
proname => 'array_shuffle', provolatile => 'v', prorettype => 'anyarray',
proargtypes => 'anyarray', prosrc => 'array_shuffle' },
{ oid => '8465', descr => 'take samples from array',
proname => 'array_sample', provolatile => 'v', prorettype => 'anyarray',
proargtypes => 'anyarray int4', prosrc => 'array_sample' },
{ oid => '3816', descr => 'array typanalyze',
proname => 'array_typanalyze', provolatile => 's', prorettype => 'bool',
proargtypes => 'internal', prosrc => 'array_typanalyze' },

View File

@ -2472,3 +2472,57 @@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
ERROR: number of elements to trim must be between 0 and 3
SELECT trim_array(ARRAY[]::int[], 1); -- fail
ERROR: number of elements to trim must be between 0 and 0
-- array_shuffle
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
?column?
----------
t
(1 row)
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
?column?
----------
t
(1 row)
SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
array_dims
-------------
[-1:2][2:3]
(1 row)
SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
array_dims
-----------------
[1:3][1:2][1:2]
(1 row)
-- array_sample
SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
?column?
----------
t
(1 row)
SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
array_length
--------------
3
(1 row)
SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
array_dims
------------
[1:3][2:3]
(1 row)
SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
array_dims
-----------------
[1:2][1:2][1:2]
(1 row)
SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
ERROR: sample size must be between 0 and 6
SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
ERROR: sample size must be between 0 and 6

View File

@ -761,3 +761,17 @@ FROM
SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
SELECT trim_array(ARRAY[]::int[], 1); -- fail
-- array_shuffle
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
-- array_sample
SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail