postgresql/src/backend/utils/adt/jsonb_gin.c
Bruce Momjian ee94300446 Update copyright for 2016
Backpatch certain files through 9.1
2016-01-02 13:33:40 -05:00

612 lines
15 KiB
C

/*-------------------------------------------------------------------------
*
* jsonb_gin.c
* GIN support functions for jsonb
*
* Copyright (c) 2014-2016, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/utils/adt/jsonb_gin.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/gin.h"
#include "access/hash.h"
#include "access/stratnum.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "utils/builtins.h"
#include "utils/jsonb.h"
/*
 * One level of the partial-hash stack maintained by gin_extract_jsonb_path
 * while it walks a jsonb value.
 */
typedef struct PathHashStack
{
/* hash accumulated so far for this nesting level */
uint32 hash;
/* entry for the enclosing level; NULL for the bottom-of-stack sentinel */
struct PathHashStack *parent;
} PathHashStack;
/* Forward declarations of the key-building helpers defined later in this file */
static Datum make_text_key(char flag, const char *str, int len);
static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key);
/*
*
* jsonb_ops GIN opclass support functions
*
*/
/*
 * gin_compare_jsonb
 *      Comparison support function for jsonb_ops GIN keys.
 *
 * Arguments 0 and 1 are the two text keys.  The result is the int32 produced
 * by varstr_cmp() when invoked with the C collation.
 */
Datum
gin_compare_jsonb(PG_FUNCTION_ARGS)
{
    text       *lhs = PG_GETARG_TEXT_PP(0);
    text       *rhs = PG_GETARG_TEXT_PP(1);
    int32       cmp;

    /* Same comparison bttextcmp performs, except pinned to C collation */
    cmp = varstr_cmp(VARDATA_ANY(lhs), VARSIZE_ANY_EXHDR(lhs),
                     VARDATA_ANY(rhs), VARSIZE_ANY_EXHDR(rhs),
                     C_COLLATION_OID);

    /* Release any detoasted copies the argument fetch made */
    PG_FREE_IF_COPY(lhs, 0);
    PG_FREE_IF_COPY(rhs, 1);

    PG_RETURN_INT32(cmp);
}
/*
 * gin_extract_jsonb
 *      Value-extraction support function for the jsonb_ops opclass.
 *
 * Iterates over the jsonb passed as argument 0 and builds one key (via
 * make_scalar_key) for every WJB_KEY, WJB_ELEM and WJB_VALUE token the
 * iterator reports.  The number of keys is stored through the int32 pointer
 * passed as argument 1.
 *
 * Returns the palloc'd array of keys, or NULL if the root container is empty.
 */
Datum
gin_extract_jsonb(PG_FUNCTION_ARGS)
{
    Jsonb      *jb = (Jsonb *) PG_GETARG_JSONB(0);
    int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
    int         capacity = 2 * JB_ROOT_COUNT(jb);
    int         nkeys = 0;
    Datum      *keys;
    JsonbIterator *iter;
    JsonbIteratorToken tok;
    JsonbValue  val;

    /* An empty root container cannot yield any keys */
    if (capacity == 0)
    {
        *nentries = 0;
        PG_RETURN_POINTER(NULL);
    }

    /* Twice the root count is only a first guess; the array grows below */
    keys = (Datum *) palloc(sizeof(Datum) * capacity);

    iter = JsonbIteratorInit(&jb->root);

    for (;;)
    {
        tok = JsonbIteratorNext(&iter, &val, false);
        if (tok == WJB_DONE)
            break;

        /* Nested containers can take us past the current allocation */
        if (nkeys >= capacity)
        {
            capacity *= 2;
            keys = (Datum *) repalloc(keys, sizeof(Datum) * capacity);
        }

        if (tok == WJB_KEY)
            keys[nkeys++] = make_scalar_key(&val, true);
        else if (tok == WJB_ELEM)
        {
            /* String array elements are indexed as if keys, see jsonb.h */
            keys[nkeys++] = make_scalar_key(&val, (val.type == jbvString));
        }
        else if (tok == WJB_VALUE)
            keys[nkeys++] = make_scalar_key(&val, false);

        /* every other token is structural and produces no key */
    }

    *nentries = nkeys;

    PG_RETURN_POINTER(keys);
}
Datum
gin_extract_jsonb_query(PG_FUNCTION_ARGS)
{
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
StrategyNumber strategy = PG_GETARG_UINT16(2);
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
Datum *entries;
if (strategy == JsonbContainsStrategyNumber)
{
/* Query is a jsonb, so just apply gin_extract_jsonb... */
entries = (Datum *)
DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb,
PG_GETARG_DATUM(0),
PointerGetDatum(nentries)));
/* ...although "contains {}" requires a full index scan */
if (*nentries == 0)
*searchMode = GIN_SEARCH_MODE_ALL;
}
else if (strategy == JsonbExistsStrategyNumber)
{
/* Query is a text string, which we treat as a key */
text *query = PG_GETARG_TEXT_PP(0);
*nentries = 1;
entries = (Datum *) palloc(sizeof(Datum));
entries[0] = make_text_key(JGINFLAG_KEY,
VARDATA_ANY(query),
VARSIZE_ANY_EXHDR(query));
}
else if (strategy == JsonbExistsAnyStrategyNumber ||
strategy == JsonbExistsAllStrategyNumber)
{
/* Query is a text array; each element is treated as a key */
ArrayType *query = PG_GETARG_ARRAYTYPE_P(0);
Datum *key_datums;
bool *key_nulls;
int key_count;
int i,
j;
deconstruct_array(query,
TEXTOID, -1, false, 'i',
&key_datums, &key_nulls, &key_count);
entries = (Datum *) palloc(sizeof(Datum) * key_count);
for (i = 0, j = 0; i < key_count; i++)
{
/* Nulls in the array are ignored */
if (key_nulls[i])
continue;
entries[j++] = make_text_key(JGINFLAG_KEY,
VARDATA_ANY(key_datums[i]),
VARSIZE_ANY_EXHDR(key_datums[i]));
}
*nentries = j;
/* ExistsAll with no keys should match everything */
if (j == 0 && strategy == JsonbExistsAllStrategyNumber)
*searchMode = GIN_SEARCH_MODE_ALL;
}
else
{
elog(ERROR, "unrecognized strategy number: %d", strategy);
entries = NULL; /* keep compiler quiet */
}
PG_RETURN_POINTER(entries);
}
Datum
gin_consistent_jsonb(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
StrategyNumber strategy = PG_GETARG_UINT16(1);
/* Jsonb *query = PG_GETARG_JSONB(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = true;
int32 i;
if (strategy == JsonbContainsStrategyNumber)
{
/*
* We must always recheck, since we can't tell from the index whether
* the positions of the matched items match the structure of the query
* object. (Even if we could, we'd also have to worry about hashed
* keys and the index's failure to distinguish keys from string array
* elements.) However, the tuple certainly doesn't match unless it
* contains all the query keys.
*/
*recheck = true;
for (i = 0; i < nkeys; i++)
{
if (!check[i])
{
res = false;
break;
}
}
}
else if (strategy == JsonbExistsStrategyNumber)
{
/*
* Although the key is certainly present in the index, we must recheck
* because (1) the key might be hashed, and (2) the index match might
* be for a key that's not at top level of the JSON object. For (1),
* we could look at the query key to see if it's hashed and not
* recheck if not, but the index lacks enough info to tell about (2).
*/
*recheck = true;
res = true;
}
else if (strategy == JsonbExistsAnyStrategyNumber)
{
/* As for plain exists, we must recheck */
*recheck = true;
res = true;
}
else if (strategy == JsonbExistsAllStrategyNumber)
{
/* As for plain exists, we must recheck */
*recheck = true;
/* ... but unless all the keys are present, we can say "false" */
for (i = 0; i < nkeys; i++)
{
if (!check[i])
{
res = false;
break;
}
}
}
else
elog(ERROR, "unrecognized strategy number: %d", strategy);
PG_RETURN_BOOL(res);
}
Datum
gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
{
GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
StrategyNumber strategy = PG_GETARG_UINT16(1);
/* Jsonb *query = PG_GETARG_JSONB(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
GinTernaryValue res = GIN_MAYBE;
int32 i;
/*
* Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
* corresponds to always forcing recheck in the regular consistent
* function, for the reasons listed there.
*/
if (strategy == JsonbContainsStrategyNumber ||
strategy == JsonbExistsAllStrategyNumber)
{
/* All extracted keys must be present */
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_FALSE)
{
res = GIN_FALSE;
break;
}
}
}
else if (strategy == JsonbExistsStrategyNumber ||
strategy == JsonbExistsAnyStrategyNumber)
{
/* At least one extracted key must be present */
res = GIN_FALSE;
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_TRUE ||
check[i] == GIN_MAYBE)
{
res = GIN_MAYBE;
break;
}
}
}
else
elog(ERROR, "unrecognized strategy number: %d", strategy);
PG_RETURN_GIN_TERNARY_VALUE(res);
}
/*
*
* jsonb_path_ops GIN opclass support functions
*
* In a jsonb_path_ops index, the GIN keys are uint32 hashes, one per JSON
* value; but the JSON key(s) leading to each value are also included in its
* hash computation. This means we can only support containment queries,
* but the index can distinguish, for example, {"foo": 42} from {"bar": 42}
* since different hashes will be generated.
*
*/
/*
 * gin_extract_jsonb_path
 *      Value-extraction support function for the jsonb_path_ops opclass.
 *
 * Iterates over the jsonb passed as argument 0 and emits one uint32 hash per
 * WJB_ELEM / WJB_VALUE token.  Each hash also incorporates the keys of the
 * enclosing objects, which are carried along on a stack of partial hashes.
 * The number of hashes is stored through the int32 pointer in argument 1.
 *
 * Returns the palloc'd array of hashes, or NULL if the root container is
 * empty.
 */
Datum
gin_extract_jsonb_path(PG_FUNCTION_ARGS)
{
    Jsonb      *jb = PG_GETARG_JSONB(0);
    int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
    int         capacity = 2 * JB_ROOT_COUNT(jb);
    int         nhashes = 0;
    Datum      *hashes;
    JsonbIterator *iter;
    JsonbIteratorToken tok;
    JsonbValue  val;
    PathHashStack bottom;
    PathHashStack *top;

    /* An empty root container cannot yield any keys */
    if (capacity == 0)
    {
        *nentries = 0;
        PG_RETURN_POINTER(NULL);
    }

    /* Twice the root count is only a first guess; the array grows below */
    hashes = (Datum *) palloc(sizeof(Datum) * capacity);

    /* The stack of partial hashes starts out as a lone sentinel entry */
    bottom.parent = NULL;
    bottom.hash = 0;
    top = &bottom;

    iter = JsonbIteratorInit(&jb->root);

    for (;;)
    {
        PathHashStack *outer;

        tok = JsonbIteratorNext(&iter, &val, false);
        if (tok == WJB_DONE)
            break;

        /* Nested containers can take us past the current allocation */
        if (nhashes >= capacity)
        {
            capacity *= 2;
            hashes = (Datum *) repalloc(hashes, sizeof(Datum) * capacity);
        }

        switch (tok)
        {
            case WJB_BEGIN_ARRAY:
            case WJB_BEGIN_OBJECT:
                /* Entering a container: push a new stack level */
                outer = top;
                top = (PathHashStack *) palloc(sizeof(PathHashStack));

                /*
                 * Seed the new level with the enclosing level's hash, so
                 * that hashes of nested values cover the outer keys as well
                 * as their own.
                 *
                 * An array nested directly in another array leaves the
                 * hashes of the innermost scalar elements unchanged, which
                 * seems inconsequential.
                 */
                top->hash = outer->hash;
                top->parent = outer;
                break;

            case WJB_KEY:
                /* Fold the key in; the hash then awaits the key's value */
                JsonbHashScalarValue(&val, &top->hash);
                break;

            case WJB_ELEM:
            case WJB_VALUE:
                /* Fold the element or value into the prepared hash ... */
                JsonbHashScalarValue(&val, &top->hash);
                /* ... emit it as an index entry ... */
                hashes[nhashes++] = UInt32GetDatum(top->hash);
                /* ... and rewind for the next key, value, or sub-object */
                top->hash = top->parent->hash;
                break;

            case WJB_END_ARRAY:
            case WJB_END_OBJECT:
                /* Leaving a container: pop its stack level */
                outer = top->parent;
                pfree(top);
                top = outer;
                /* Rewind for the next key, value, or sub-object */
                top->hash = (top->parent != NULL) ? top->parent->hash : 0;
                break;

            default:
                elog(ERROR, "invalid JsonbIteratorNext rc: %d", (int) tok);
        }
    }

    *nentries = nhashes;

    PG_RETURN_POINTER(hashes);
}
Datum
gin_extract_jsonb_query_path(PG_FUNCTION_ARGS)
{
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
StrategyNumber strategy = PG_GETARG_UINT16(2);
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
Datum *entries;
if (strategy != JsonbContainsStrategyNumber)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/* Query is a jsonb, so just apply gin_extract_jsonb_path ... */
entries = (Datum *)
DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_path,
PG_GETARG_DATUM(0),
PointerGetDatum(nentries)));
/* ... although "contains {}" requires a full index scan */
if (*nentries == 0)
*searchMode = GIN_SEARCH_MODE_ALL;
PG_RETURN_POINTER(entries);
}
Datum
gin_consistent_jsonb_path(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
StrategyNumber strategy = PG_GETARG_UINT16(1);
/* Jsonb *query = PG_GETARG_JSONB(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = true;
int32 i;
if (strategy != JsonbContainsStrategyNumber)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/*
* jsonb_path_ops is necessarily lossy, not only because of hash
* collisions but also because it doesn't preserve complete information
* about the structure of the JSON object. Besides, there are some
* special rules around the containment of raw scalars in arrays that are
* not handled here. So we must always recheck a match. However, if not
* all of the keys are present, the tuple certainly doesn't match.
*/
*recheck = true;
for (i = 0; i < nkeys; i++)
{
if (!check[i])
{
res = false;
break;
}
}
PG_RETURN_BOOL(res);
}
Datum
gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS)
{
GinTernaryValue *check = (GinTernaryValue *) PG_GETARG_POINTER(0);
StrategyNumber strategy = PG_GETARG_UINT16(1);
/* Jsonb *query = PG_GETARG_JSONB(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
GinTernaryValue res = GIN_MAYBE;
int32 i;
if (strategy != JsonbContainsStrategyNumber)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/*
* Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
* corresponds to always forcing recheck in the regular consistent
* function, for the reasons listed there.
*/
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_FALSE)
{
res = GIN_FALSE;
break;
}
}
PG_RETURN_GIN_TERNARY_VALUE(res);
}
/*
* Construct a jsonb_ops GIN key from a flag byte and a textual representation
* (which need not be null-terminated). This function is responsible
* for hashing overlength text representations; it will add the
* JGINFLAG_HASHED bit to the flag value if it does that.
*/
static Datum
make_text_key(char flag, const char *str, int len)
{
text *item;
char hashbuf[10];
if (len > JGIN_MAXLENGTH)
{
uint32 hashval;
hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
str = hashbuf;
len = 8;
flag |= JGINFLAG_HASHED;
}
/*
* Now build the text Datum. For simplicity we build a 4-byte-header
* varlena text Datum here, but we expect it will get converted to short
* header format when stored in the index.
*/
item = (text *) palloc(VARHDRSZ + len + 1);
SET_VARSIZE(item, VARHDRSZ + len + 1);
*VARDATA(item) = flag;
memcpy(VARDATA(item) + 1, str, len);
return PointerGetDatum(item);
}
/*
 * Turn a scalar JsonbValue into the text key that represents it in a
 * jsonb_ops index.
 *
 * is_key must be true when the value is an object key, and also when it is
 * a string array element (those are treated as keys, see jsonb.h).  Nulls,
 * booleans and numerics are never keys, which is asserted below.
 *
 * Raises an error if the value is not one of the scalar types handled here.
 */
static Datum
make_scalar_key(const JsonbValue *scalarVal, bool is_key)
{
    switch (scalarVal->type)
    {
        case jbvNull:
            Assert(!is_key);
            return make_text_key(JGINFLAG_NULL, "", 0);

        case jbvBool:
            Assert(!is_key);
            if (scalarVal->val.boolean)
                return make_text_key(JGINFLAG_BOOL, "t", 1);
            return make_text_key(JGINFLAG_BOOL, "f", 1);

        case jbvNumeric:
            {
                char       *numstr;
                Datum       numkey;

                Assert(!is_key);

                /*
                 * Numerically equal values have to yield equal strings, so
                 * a normalized representation without trailing zeroes is
                 * needed.
                 *
                 * Text is a fairly bulky way to store numerics, but it is a
                 * notationally convenient way of keeping a "union" type in
                 * the GIN B-Tree, and indexing Jsonb strings takes
                 * precedence.
                 */
                numstr = numeric_normalize(scalarVal->val.numeric);
                numkey = make_text_key(JGINFLAG_NUM, numstr, strlen(numstr));
                pfree(numstr);
                return numkey;
            }

        case jbvString:
            if (is_key)
                return make_text_key(JGINFLAG_KEY,
                                     scalarVal->val.string.val,
                                     scalarVal->val.string.len);
            return make_text_key(JGINFLAG_STR,
                                 scalarVal->val.string.val,
                                 scalarVal->val.string.len);

        default:
            elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type);
            break;
    }

    return 0;                   /* keep compiler quiet */
}