Improve key representation for GIN jsonb_ops, and fix existence-search bug.

Change the key representation so that values that would exceed 127 bytes
are hashed into short strings, and so that the original JSON datatype of
each value is recorded in the index.  The hashing rule eliminates the major
objection to having this opclass be the default for jsonb, namely that it
could fail for plausible input data (due to GIN's restrictions on maximum
key length).  Preserving datatype information doesn't really buy us much
right now, but it requires no extra space compared to the previous way,
and it might be useful later.

Also, change the consistency-checking functions to request recheck for
exists (jsonb ? text) and related operators.  The original analysis that
this is an exactly checkable query was incorrect, since the index does
not preserve information about whether a key appears at top level in
the indexed JSON object.  Add a test case demonstrating the problem.

Make some other, mostly cosmetic improvements to the code in jsonb_gin.c
as well.

catversion bump due to on-disk data format change in jsonb_ops indexes.
This commit is contained in:
Tom Lane 2014-05-09 08:41:26 -04:00
parent ff7bbb0176
commit 46dddf7673
7 changed files with 340 additions and 318 deletions

View File

@ -14,6 +14,7 @@
#include "postgres.h"
#include "access/gin.h"
#include "access/hash.h"
#include "access/skey.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
@ -26,14 +27,15 @@ typedef struct PathHashStack
struct PathHashStack *parent;
} PathHashStack;
static text *make_text_key(const char *str, int len, char flag);
static text *make_scalar_key(const JsonbValue *scalarVal, char flag);
static Datum make_text_key(char flag, const char *str, int len);
static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key);
/*
*
* jsonb_ops GIN opclass support functions
*
*/
Datum
gin_compare_jsonb(PG_FUNCTION_ARGS)
{
@ -65,80 +67,49 @@ gin_extract_jsonb(PG_FUNCTION_ARGS)
{
Jsonb *jb = (Jsonb *) PG_GETARG_JSONB(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL;
int total = 2 * JB_ROOT_COUNT(jb);
int i = 0,
r;
JsonbIterator *it;
JsonbValue v;
int i = 0,
r;
Datum *entries;
/* If the root level is empty, we certainly have no keys */
if (total == 0)
{
*nentries = 0;
PG_RETURN_POINTER(NULL);
}
/* Otherwise, use 2 * root count as initial estimate of result size */
entries = (Datum *) palloc(sizeof(Datum) * total);
it = JsonbIteratorInit(&jb->root);
while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
{
/* Since we recurse into the object, we might need more space */
if (i >= total)
{
total *= 2;
entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
}
/*
* Serialize keys and elements equivalently, but only when elements
* are Jsonb strings. Otherwise, serialize elements as values. Array
* elements are indexed as keys, for the benefit of
* JsonbExistsStrategyNumber. Our definition of existence does not
* allow for checking the existence of a non-jbvString element (just
* like the definition of the underlying operator), because the
* operator takes a text rhs argument (which is taken as a proxy for
* an equivalent Jsonb string).
*
* The way existence is represented does not preclude an alternative
* existence operator, that takes as its rhs value an arbitrarily
* internally-typed Jsonb. The only reason that isn't the case here
* is that the existence operator is only really intended to determine
* if an object has a certain key (object pair keys are of course
* invariably strings), which is extended to jsonb arrays. You could
* think of the default Jsonb definition of existence as being
* equivalent to a definition where all types of scalar array elements
* are keys that we can check the existence of, while just forbidding
* non-string notation. This inflexibility prevents the user from
* having to qualify that the rhs string is a raw scalar string (that
* is, naturally no internal string quoting in required for the text
* argument), and allows us to not set the reset flag for
* JsonbExistsStrategyNumber, since we know that keys are strings for
* both objects and arrays, and don't have to further account for type
* mismatch. Not having to set the reset flag makes it less than
* tempting to tighten up the definition of existence to preclude
* array elements entirely, which would arguably be a simpler
* alternative. In any case the infrastructure used to implement the
* existence operator could trivially support this hypothetical,
* slightly distinct definition of existence.
*/
switch (r)
{
case WJB_KEY:
/* Serialize key separately, for existence strategies */
entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
entries[i++] = make_scalar_key(&v, true);
break;
case WJB_ELEM:
if (v.type == jbvString)
entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM));
else
entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
/* Pretend string array elements are keys, see jsonb.h */
entries[i++] = make_scalar_key(&v, (v.type == jbvString));
break;
case WJB_VALUE:
entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL));
entries[i++] = make_scalar_key(&v, false);
break;
default:
continue;
/* we can ignore structural items */
break;
}
}
@ -163,30 +134,30 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS)
PG_GETARG_DATUM(0),
PointerGetDatum(nentries)));
/* ...although "contains {}" requires a full index scan */
if (entries == NULL)
if (*nentries == 0)
*searchMode = GIN_SEARCH_MODE_ALL;
}
else if (strategy == JsonbExistsStrategyNumber)
{
/* Query is a text string, which we treat as a key */
text *query = PG_GETARG_TEXT_PP(0);
text *item;
*nentries = 1;
entries = (Datum *) palloc(sizeof(Datum));
item = make_text_key(VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query),
JKEYELEM);
entries[0] = PointerGetDatum(item);
entries[0] = make_text_key(JGINFLAG_KEY,
VARDATA_ANY(query),
VARSIZE_ANY_EXHDR(query));
}
else if (strategy == JsonbExistsAnyStrategyNumber ||
strategy == JsonbExistsAllStrategyNumber)
{
/* Query is a text array; each element is treated as a key */
ArrayType *query = PG_GETARG_ARRAYTYPE_P(0);
Datum *key_datums;
bool *key_nulls;
int key_count;
int i,
j;
text *item;
deconstruct_array(query,
TEXTOID, -1, false, 'i',
@ -194,15 +165,14 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS)
entries = (Datum *) palloc(sizeof(Datum) * key_count);
for (i = 0, j = 0; i < key_count; ++i)
for (i = 0, j = 0; i < key_count; i++)
{
/* Nulls in the array are ignored */
if (key_nulls[i])
continue;
item = make_text_key(VARDATA(key_datums[i]),
VARSIZE(key_datums[i]) - VARHDRSZ,
JKEYELEM);
entries[j++] = PointerGetDatum(item);
entries[j++] = make_text_key(JGINFLAG_KEY,
VARDATA_ANY(key_datums[i]),
VARSIZE_ANY_EXHDR(key_datums[i]));
}
*nentries = j;
@ -236,13 +206,12 @@ gin_consistent_jsonb(PG_FUNCTION_ARGS)
if (strategy == JsonbContainsStrategyNumber)
{
/*
* Index doesn't have information about correspondence of Jsonb keys
* and values (as distinct from GIN keys, which a key/value pair is
* stored as), so invariably we recheck. Besides, there are some
* special rules around the containment of raw scalar arrays and
* regular arrays that are not represented here. However, if all of
* the keys are not present, that's sufficient reason to return false
* and finish immediately.
* We must always recheck, since we can't tell from the index whether
* the positions of the matched items match the structure of the query
* object. (Even if we could, we'd also have to worry about hashed
* keys and the index's failure to distinguish keys from string array
* elements.) However, the tuple certainly doesn't match unless it
* contains all the query keys.
*/
*recheck = true;
for (i = 0; i < nkeys; i++)
@ -256,20 +225,27 @@ gin_consistent_jsonb(PG_FUNCTION_ARGS)
}
else if (strategy == JsonbExistsStrategyNumber)
{
/* Existence of key guaranteed in default search mode */
*recheck = false;
/*
* Although the key is certainly present in the index, we must recheck
* because (1) the key might be hashed, and (2) the index match might
* be for a key that's not at top level of the JSON object. For (1),
* we could look at the query key to see if it's hashed and not
* recheck if not, but the index lacks enough info to tell about (2).
*/
*recheck = true;
res = true;
}
else if (strategy == JsonbExistsAnyStrategyNumber)
{
/* Existence of key guaranteed in default search mode */
*recheck = false;
/* As for plain exists, we must recheck */
*recheck = true;
res = true;
}
else if (strategy == JsonbExistsAllStrategyNumber)
{
/* Testing for the presence of all keys gives an exact result */
*recheck = false;
/* As for plain exists, we must recheck */
*recheck = true;
/* ... but unless all the keys are present, we can say "false" */
for (i = 0; i < nkeys; i++)
{
if (!check[i])
@ -295,19 +271,18 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
GinTernaryValue res = GIN_TRUE;
GinTernaryValue res = GIN_MAYBE;
int32 i;
if (strategy == JsonbContainsStrategyNumber)
/*
* Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
* corresponds to always forcing recheck in the regular consistent
* function, for the reasons listed there.
*/
if (strategy == JsonbContainsStrategyNumber ||
strategy == JsonbExistsAllStrategyNumber)
{
bool has_maybe = false;
/*
* All extracted keys must be present. Combination of GIN_MAYBE and
* GIN_TRUE gives GIN_MAYBE result because then all keys may be
* present.
*/
/* All extracted keys must be present */
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_FALSE)
@ -315,55 +290,21 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
res = GIN_FALSE;
break;
}
if (check[i] == GIN_MAYBE)
{
res = GIN_MAYBE;
has_maybe = true;
}
}
/*
* Index doesn't have information about correspondence of Jsonb keys
* and values (as distinct from GIN keys, which a key/value pair is
* stored as), so invariably we recheck. This is also reflected in
* how GIN_MAYBE is given in response to there being no GIN_MAYBE
* input.
*/
if (!has_maybe && res == GIN_TRUE)
res = GIN_MAYBE;
}
else if (strategy == JsonbExistsStrategyNumber ||
strategy == JsonbExistsAnyStrategyNumber)
{
/* Existence of key guaranteed in default search mode */
/* At least one extracted key must be present */
res = GIN_FALSE;
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_TRUE)
{
res = GIN_TRUE;
break;
}
if (check[i] == GIN_MAYBE)
if (check[i] == GIN_TRUE ||
check[i] == GIN_MAYBE)
{
res = GIN_MAYBE;
}
}
}
else if (strategy == JsonbExistsAllStrategyNumber)
{
/* Testing for the presence of all keys gives an exact result */
for (i = 0; i < nkeys; i++)
{
if (check[i] == GIN_FALSE)
{
res = GIN_FALSE;
break;
}
if (check[i] == GIN_MAYBE)
{
res = GIN_MAYBE;
}
}
}
else
@ -376,7 +317,151 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS)
*
* jsonb_hash_ops GIN opclass support functions
*
* In a jsonb_hash_ops index, the GIN keys are uint32 hashes, one per JSON
* value; but the JSON key(s) leading to each value are also included in its
* hash computation. This means we can only support containment queries,
* but the index can distinguish, for example, {"foo": 42} from {"bar": 42}
* since different hashes will be generated.
*
*/
Datum
gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
{
Jsonb *jb = PG_GETARG_JSONB(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
int total = 2 * JB_ROOT_COUNT(jb);
JsonbIterator *it;
JsonbValue v;
PathHashStack tail;
PathHashStack *stack;
int i = 0,
r;
Datum *entries;
/* If the root level is empty, we certainly have no keys */
if (total == 0)
{
*nentries = 0;
PG_RETURN_POINTER(NULL);
}
/* Otherwise, use 2 * root count as initial estimate of result size */
entries = (Datum *) palloc(sizeof(Datum) * total);
/* We keep a stack of hashes corresponding to parent key levels */
tail.parent = NULL;
tail.hash = 0;
stack = &tail;
it = JsonbIteratorInit(&jb->root);
while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
{
PathHashStack *parent;
/* Since we recurse into the object, we might need more space */
if (i >= total)
{
total *= 2;
entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
}
switch (r)
{
case WJB_BEGIN_ARRAY:
case WJB_BEGIN_OBJECT:
/* Push a stack level for this object */
parent = stack;
stack = (PathHashStack *) palloc(sizeof(PathHashStack));
if (parent->parent)
{
/*
* We pass forward hashes from previous container nesting
* levels so that nested arrays with an outermost nested
* object will have element hashes mixed with the
* outermost key. It's also somewhat useful to have
* nested objects' innermost values have hashes that are a
* function of not just their own key, but outer keys too.
*
* Nesting an array within another array will not alter
* innermost scalar element hash values, but that seems
* inconsequential.
*/
stack->hash = parent->hash;
}
else
{
/*
* At the outermost level, initialize hash with container
* type proxy value. Note that this makes JB_FARRAY and
* JB_FOBJECT part of the on-disk representation, but they
* are that in the base jsonb object storage already.
*/
stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT;
}
stack->parent = parent;
break;
case WJB_KEY:
/* initialize hash from parent */
stack->hash = stack->parent->hash;
/* and mix in this key */
JsonbHashScalarValue(&v, &stack->hash);
/* hash is now ready to incorporate the value */
break;
case WJB_ELEM:
/* array elements use parent hash mixed with element's hash */
stack->hash = stack->parent->hash;
/* FALL THRU */
case WJB_VALUE:
/* mix the element or value's hash into the prepared hash */
JsonbHashScalarValue(&v, &stack->hash);
/* and emit an index entry */
entries[i++] = UInt32GetDatum(stack->hash);
/* Note: we assume we'll see KEY before another VALUE */
break;
case WJB_END_ARRAY:
case WJB_END_OBJECT:
/* Pop the stack */
parent = stack->parent;
pfree(stack);
stack = parent;
break;
default:
elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
}
}
*nentries = i;
PG_RETURN_POINTER(entries);
}
Datum
gin_extract_jsonb_query_hash(PG_FUNCTION_ARGS)
{
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
StrategyNumber strategy = PG_GETARG_UINT16(2);
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
Datum *entries;
if (strategy != JsonbContainsStrategyNumber)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/* Query is a jsonb, so just apply gin_extract_jsonb_hash ... */
entries = (Datum *)
DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_hash,
PG_GETARG_DATUM(0),
PointerGetDatum(nentries)));
/* ... although "contains {}" requires a full index scan */
if (*nentries == 0)
*searchMode = GIN_SEARCH_MODE_ALL;
PG_RETURN_POINTER(entries);
}
Datum
gin_consistent_jsonb_hash(PG_FUNCTION_ARGS)
{
@ -395,13 +480,13 @@ gin_consistent_jsonb_hash(PG_FUNCTION_ARGS)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/*
* jsonb_hash_ops index doesn't have information about correspondence of
* Jsonb keys and values (as distinct from GIN keys, which a key/value
* pair is stored as), so invariably we recheck. Besides, there are some
* jsonb_hash_ops is necessarily lossy, not only because of hash
* collisions but also because it doesn't preserve complete information
* about the structure of the JSON object. Besides, there are some
* special rules around the containment of raw scalar arrays and regular
* arrays that are not represented here. However, if all of the keys are
* not present, that's sufficient reason to return false and finish
* immediately.
* arrays that are not handled here. So we must always recheck a match.
* However, if not all of the keys are present, the tuple certainly
* doesn't match.
*/
*recheck = true;
for (i = 0; i < nkeys; i++)
@ -426,17 +511,16 @@ gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS)
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
GinTernaryValue res = GIN_TRUE;
GinTernaryValue res = GIN_MAYBE;
int32 i;
bool has_maybe = false;
if (strategy != JsonbContainsStrategyNumber)
elog(ERROR, "unrecognized strategy number: %d", strategy);
/*
* All extracted keys must be present. A combination of GIN_MAYBE and
* GIN_TRUE induces a GIN_MAYBE result, because then all keys may be
* present.
* Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this
* corresponds to always forcing recheck in the regular consistent
* function, for the reasons listed there.
*/
for (i = 0; i < nkeys; i++)
{
@ -445,161 +529,39 @@ gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS)
res = GIN_FALSE;
break;
}
if (check[i] == GIN_MAYBE)
{
res = GIN_MAYBE;
has_maybe = true;
}
}
/*
* jsonb_hash_ops index doesn't have information about correspondence of
* Jsonb keys and values (as distinct from GIN keys, which for this
* opclass are a hash of a pair, or a hash of just an element), so
* invariably we recheck. This is also reflected in how GIN_MAYBE is
* given in response to there being no GIN_MAYBE input.
*/
if (!has_maybe && res == GIN_TRUE)
res = GIN_MAYBE;
PG_RETURN_GIN_TERNARY_VALUE(res);
}
/*
 * Extract GIN keys for jsonb_hash_ops from the jsonb in argument 0.
 *
 * Emits one uint32 hash Datum per array element or object value token
 * reported by the iterator, stores the number of Datums through argument 1,
 * and returns the palloc'd array (NULL when the root container is empty).
 */
Datum
gin_extract_jsonb_hash(PG_FUNCTION_ARGS)
{
Jsonb *jb = PG_GETARG_JSONB(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
/* initial size estimate for the result array: twice the root count */
int total = 2 * JB_ROOT_COUNT(jb);
JsonbIterator *it;
JsonbValue v;
PathHashStack tail;
PathHashStack *stack;
int i = 0,
r;
Datum *entries = NULL;
/* An empty root container yields no keys at all */
if (total == 0)
{
*nentries = 0;
PG_RETURN_POINTER(NULL);
}
entries = (Datum *) palloc(sizeof(Datum) * total);
it = JsonbIteratorInit(&jb->root);
/* "tail" is the bottom-of-stack sentinel; its parent is NULL */
tail.parent = NULL;
tail.hash = 0;
stack = &tail;
while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
{
PathHashStack *tmp;
/* At most one entry is added per token, so doubling here suffices */
if (i >= total)
{
total *= 2;
entries = (Datum *) repalloc(entries, sizeof(Datum) * total);
}
switch (r)
{
case WJB_BEGIN_ARRAY:
case WJB_BEGIN_OBJECT:
/* Push a new stack level for the container being entered */
tmp = stack;
stack = (PathHashStack *) palloc(sizeof(PathHashStack));
/*
* Nesting an array within another array will not alter
* innermost scalar element hash values, but that seems
* inconsequential
*/
if (tmp->parent)
{
/*
* We pass forward hashes from previous container nesting
* levels so that nested arrays with an outermost nested
* object will have element hashes mixed with the
* outermost key. It's also somewhat useful to have
* nested objects' innermost values have hashes that are a
* function of not just their own key, but outer keys too.
*/
stack->hash = tmp->hash;
}
else
{
/*
* At the outermost nesting level (the previous stack top is
* the sentinel), initialize with a stable container type
* proxy value
*/
stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT;
}
stack->parent = tmp;
break;
case WJB_KEY:
/* Initialize hash from parent, then mix in this key */
stack->hash = stack->parent->hash;
JsonbHashScalarValue(&v, &stack->hash);
/* the hash is left in place for the value that follows */
break;
case WJB_ELEM:
/* Array elements restart from the parent level's hash */
stack->hash = stack->parent->hash;
/* FALL THRU (intentional): mix in and emit the element like a value */
case WJB_VALUE:
/* Element/value case: mix the scalar in and emit an index entry */
JsonbHashScalarValue(&v, &stack->hash);
entries[i++] = UInt32GetDatum(stack->hash);
break;
case WJB_END_ARRAY:
case WJB_END_OBJECT:
/* Pop the stack */
tmp = stack->parent;
pfree(stack);
stack = tmp;
break;
default:
elog(ERROR, "invalid JsonbIteratorNext rc: %d", r);
}
}
/* Report how many entries were actually produced */
*nentries = i;
PG_RETURN_POINTER(entries);
}
/*
 * gin_extract_jsonb_query_hash
 *
 * GIN query-key extraction for jsonb_hash_ops.
 *
 * Arguments (fmgr): 0 = query jsonb, 1 = int32 *nentries (output),
 * 2 = strategy number, 6 = int32 *searchMode (output).
 *
 * Only JsonbContainsStrategyNumber is supported; any other strategy raises
 * an error.  Returns the array of hash keys extracted from the query.
 */
Datum
gin_extract_jsonb_query_hash(PG_FUNCTION_ARGS)
{
    int32      *nentries = (int32 *) PG_GETARG_POINTER(1);
    StrategyNumber strategy = PG_GETARG_UINT16(2);
    int32      *searchMode = (int32 *) PG_GETARG_POINTER(6);
    Datum      *entries;

    if (strategy != JsonbContainsStrategyNumber)
        elog(ERROR, "unrecognized strategy number: %d", strategy);

    /* Query is a jsonb, so just apply gin_extract_jsonb_hash ... */
    entries = (Datum *)
        DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_hash,
                                            PG_GETARG_DATUM(0),
                                            PointerGetDatum(nentries)));

    /*
     * ... although a query that yields no keys (such as "contains {}")
     * requires a full index scan.
     *
     * Test the entry count, not the pointer: gin_extract_jsonb_hash returns
     * NULL only when the root container is empty.  A non-empty root in
     * which no scalar element or value is encountered still comes back as
     * a palloc'd array with *nentries == 0, and that case must request the
     * full scan too.
     */
    if (*nentries == 0)
        *searchMode = GIN_SEARCH_MODE_ALL;

    PG_RETURN_POINTER(entries);
}
/*
* Build a text value from a cstring and flag suitable for storage as a key
* value
* Construct a jsonb_ops GIN key from a flag byte and a textual representation
* (which need not be null-terminated). This function is responsible
* for hashing overlength text representations; it will add the
* JGINFLAG_HASHED bit to the flag value if it does that.
*/
static text *
make_text_key(const char *str, int len, char flag)
static Datum
make_text_key(char flag, const char *str, int len)
{
text *item;
char hashbuf[10];
if (len > JGIN_MAXLENGTH)
{
uint32 hashval;
hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
str = hashbuf;
len = 8;
flag |= JGINFLAG_HASHED;
}
/*
* Now build the text Datum. For simplicity we build a 4-byte-header
* varlena text Datum here, but we expect it will get converted to short
* header format when stored in the index.
*/
item = (text *) palloc(VARHDRSZ + len + 1);
SET_VARSIZE(item, VARHDRSZ + len + 1);
@ -607,31 +569,39 @@ make_text_key(const char *str, int len, char flag)
memcpy(VARDATA(item) + 1, str, len);
return item;
return PointerGetDatum(item);
}
/*
* Create a textual representation of a jsonbValue for GIN storage.
* Create a textual representation of a JsonbValue that will serve as a GIN
* key in a jsonb_ops index. is_key is true if the JsonbValue is a key,
* or if it is a string array element (since we pretend those are keys,
* see jsonb.h).
*/
static text *
make_scalar_key(const JsonbValue *scalarVal, char flag)
static Datum
make_scalar_key(const JsonbValue *scalarVal, bool is_key)
{
text *item;
Datum item;
char *cstr;
switch (scalarVal->type)
{
case jbvNull:
item = make_text_key("n", 1, flag);
Assert(!is_key);
item = make_text_key(JGINFLAG_NULL, "", 0);
break;
case jbvBool:
item = make_text_key(scalarVal->val.boolean ? "t" : "f", 1, flag);
Assert(!is_key);
item = make_text_key(JGINFLAG_BOOL,
scalarVal->val.boolean ? "t" : "f", 1);
break;
case jbvNumeric:
Assert(!is_key);
/*
* A normalized textual representation, free of trailing zeroes is
* is required.
* A normalized textual representation, free of trailing zeroes,
* is required so that numerically equal values will produce equal
* strings.
*
* It isn't ideal that numerics are stored in a relatively bulky
* textual format. However, it's a notationally convenient way of
@ -639,15 +609,18 @@ make_scalar_key(const JsonbValue *scalarVal, char flag)
* strings takes precedence.
*/
cstr = numeric_normalize(scalarVal->val.numeric);
item = make_text_key(cstr, strlen(cstr), flag);
item = make_text_key(JGINFLAG_NUM, cstr, strlen(cstr));
pfree(cstr);
break;
case jbvString:
item = make_text_key(scalarVal->val.string.val, scalarVal->val.string.len,
flag);
item = make_text_key(is_key ? JGINFLAG_KEY : JGINFLAG_STR,
scalarVal->val.string.val,
scalarVal->val.string.len);
break;
default:
elog(ERROR, "invalid jsonb scalar type");
elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type);
item = 0; /* keep compiler quiet */
break;
}
return item;

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201405051
#define CATALOG_VERSION_NO 201405091
#endif

View File

@ -29,25 +29,41 @@ typedef enum
WJB_END_OBJECT
} JsonbIteratorToken;
/*
* When using a GIN index for jsonb, we choose to index both keys and values.
* The storage format is text, with K, or V prepended to the string to indicate
* key/element or value/element.
*
* Jsonb Keys and string array elements are treated equivalently when
* serialized to text index storage. One day we may wish to create an opclass
* that only indexes values, but for now keys and values are stored in GIN
* indexes in a way that doesn't really consider their relationship to each
* other.
*/
#define JKEYELEM 'K'
#define JVAL 'V'
/* Strategy numbers for GIN index opclasses */
#define JsonbContainsStrategyNumber 7
#define JsonbExistsStrategyNumber 9
#define JsonbExistsAnyStrategyNumber 10
#define JsonbExistsAllStrategyNumber 11
/*
* In the standard jsonb_ops GIN opclass for jsonb, we choose to index both
* keys and values. The storage format is text. The first byte of the text
* string distinguishes whether this is a key (always a string), null value,
* boolean value, numeric value, or string value. However, array elements
* that are strings are marked as though they were keys; this imprecision
* supports the definition of the "exists" operator, which treats array
* elements like keys. The remainder of the text string is empty for a null
* value, "t" or "f" for a boolean value, a normalized print representation of
* a numeric value, or the text of a string value. However, if the length of
* this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash
* the text representation and store an 8-hex-digit representation of the
* uint32 hash value, marking the prefix byte with an additional bit to
* distinguish that this has happened. Hashing long strings saves space and
* ensures that we won't overrun the maximum entry length for a GIN index.
* (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit. It's chosen
* to ensure that the on-disk text datum will have a short varlena header.)
* Note that when any hashed item appears in a query, we must recheck index
* matches against the heap tuple; currently, this costs nothing because we
* must always recheck for other reasons.
*/
#define JGINFLAG_KEY 0x01 /* key (or string array element) */
#define JGINFLAG_NULL 0x02 /* null value */
#define JGINFLAG_BOOL 0x03 /* boolean value */
#define JGINFLAG_NUM 0x04 /* numeric value */
#define JGINFLAG_STR 0x05 /* string value (if not an array element) */
#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */
#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */
/* Convenience macros */
#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d))
#define JsonbGetDatum(p) PointerGetDatum(p)
@ -332,12 +348,12 @@ extern Datum gin_consistent_jsonb_hash(PG_FUNCTION_ARGS);
extern Datum gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS);
/* Support functions */
extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
uint32 flags,
JsonbValue *key);
uint32 flags,
JsonbValue *key);
extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader,
uint32 i);
uint32 i);
extern JsonbValue *pushJsonbValue(JsonbParseState **pstate,
JsonbIteratorToken seq, JsonbValue *scalarVal);
extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container);

View File

@ -1006,4 +1006,7 @@
{"wait":null, "line":1000}
{"age":25}
{"age":25.0}
{"foo": {"bar": "baz"}}
{"foo": {"blah": "baz"}}
{"fool": {"bar": "baz"}}
{}

View File

@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'public';
@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
@ -1591,7 +1603,7 @@ RESET enable_seqscan;
SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow;
count
-------
4788
4791
(1 row)
SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key;
@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO
abstract | 161
array | 5
age | 2
(24 rows)
foo | 2
fool | 1
(26 rows)
-- sort/hash
SELECT count(distinct j) FROM testjsonb;
count
-------
891
894
(1 row)
SET enable_hashagg = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SET enable_hashagg = on;
@ -1642,7 +1656,7 @@ SET enable_sort = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j);
@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
RESET enable_seqscan;

View File

@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'public';
@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public';
194
(1 row)
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
count
-------
0
(1 row)
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
count
-------
@ -1591,7 +1603,7 @@ RESET enable_seqscan;
SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow;
count
-------
4788
4791
(1 row)
SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key;
@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO
abstract | 161
array | 5
age | 2
(24 rows)
foo | 2
fool | 1
(26 rows)
-- sort/hash
SELECT count(distinct j) FROM testjsonb;
count
-------
891
894
(1 row)
SET enable_hashagg = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SET enable_hashagg = on;
@ -1642,7 +1656,7 @@ SET enable_sort = off;
SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2;
count
-------
891
894
(1 row)
SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j);
@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j @> '{}';
count
-------
1009
1012
(1 row)
RESET enable_seqscan;

View File

@ -334,6 +334,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"wait":"CC", "public":true}';
SELECT count(*) FROM testjsonb WHERE j @> '{"age":25}';
SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}';
SELECT count(*) FROM testjsonb WHERE j ? 'public';
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled'];
@ -350,6 +351,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}';
-- exercise GIN_SEARCH_MODE_ALL
SELECT count(*) FROM testjsonb WHERE j @> '{}';
SELECT count(*) FROM testjsonb WHERE j ? 'public';
SELECT count(*) FROM testjsonb WHERE j ? 'bar';
SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled'];
SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled'];