diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c index 592036ac58..57a0b2c8a3 100644 --- a/src/backend/utils/adt/jsonb_gin.c +++ b/src/backend/utils/adt/jsonb_gin.c @@ -14,6 +14,7 @@ #include "postgres.h" #include "access/gin.h" +#include "access/hash.h" #include "access/skey.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" @@ -26,14 +27,15 @@ typedef struct PathHashStack struct PathHashStack *parent; } PathHashStack; -static text *make_text_key(const char *str, int len, char flag); -static text *make_scalar_key(const JsonbValue *scalarVal, char flag); +static Datum make_text_key(char flag, const char *str, int len); +static Datum make_scalar_key(const JsonbValue *scalarVal, bool is_key); /* * * jsonb_ops GIN opclass support functions * */ + Datum gin_compare_jsonb(PG_FUNCTION_ARGS) { @@ -65,80 +67,49 @@ gin_extract_jsonb(PG_FUNCTION_ARGS) { Jsonb *jb = (Jsonb *) PG_GETARG_JSONB(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - Datum *entries = NULL; int total = 2 * JB_ROOT_COUNT(jb); - int i = 0, - r; JsonbIterator *it; JsonbValue v; + int i = 0, + r; + Datum *entries; + /* If the root level is empty, we certainly have no keys */ if (total == 0) { *nentries = 0; PG_RETURN_POINTER(NULL); } + /* Otherwise, use 2 * root count as initial estimate of result size */ entries = (Datum *) palloc(sizeof(Datum) * total); it = JsonbIteratorInit(&jb->root); while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) { + /* Since we recurse into the object, we might need more space */ if (i >= total) { total *= 2; entries = (Datum *) repalloc(entries, sizeof(Datum) * total); } - /* - * Serialize keys and elements equivalently, but only when elements - * are Jsonb strings. Otherwise, serialize elements as values. Array - * elements are indexed as keys, for the benefit of - * JsonbExistsStrategyNumber. Our definition of existence does not - * allow for checking the existence of a non-jbvString element (just - * like the definition of the underlying operator), because the - * operator takes a text rhs argument (which is taken as a proxy for - * an equivalent Jsonb string). - * - * The way existence is represented does not preclude an alternative - * existence operator, that takes as its rhs value an arbitrarily - * internally-typed Jsonb. The only reason that isn't the case here - * is that the existence operator is only really intended to determine - * if an object has a certain key (object pair keys are of course - * invariably strings), which is extended to jsonb arrays. You could - * think of the default Jsonb definition of existence as being - * equivalent to a definition where all types of scalar array elements - * are keys that we can check the existence of, while just forbidding - * non-string notation. This inflexibility prevents the user from - * having to qualify that the rhs string is a raw scalar string (that - * is, naturally no internal string quoting in required for the text - * argument), and allows us to not set the reset flag for - * JsonbExistsStrategyNumber, since we know that keys are strings for - * both objects and arrays, and don't have to further account for type - * mismatch. Not having to set the reset flag makes it less than - * tempting to tighten up the definition of existence to preclude - * array elements entirely, which would arguably be a simpler - * alternative. In any case the infrastructure used to implement the - * existence operator could trivially support this hypothetical, - * slightly distinct definition of existence. - */ switch (r) { case WJB_KEY: - /* Serialize key separately, for existence strategies */ - entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM)); + entries[i++] = make_scalar_key(&v, true); break; case WJB_ELEM: - if (v.type == jbvString) - entries[i++] = PointerGetDatum(make_scalar_key(&v, JKEYELEM)); - else - entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL)); + /* Pretend string array elements are keys, see jsonb.h */ + entries[i++] = make_scalar_key(&v, (v.type == jbvString)); break; case WJB_VALUE: - entries[i++] = PointerGetDatum(make_scalar_key(&v, JVAL)); + entries[i++] = make_scalar_key(&v, false); break; default: - continue; + /* we can ignore structural items */ + break; } } @@ -163,30 +134,30 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) PG_GETARG_DATUM(0), PointerGetDatum(nentries))); /* ...although "contains {}" requires a full index scan */ - if (entries == NULL) + if (*nentries == 0) *searchMode = GIN_SEARCH_MODE_ALL; } else if (strategy == JsonbExistsStrategyNumber) { + /* Query is a text string, which we treat as a key */ text *query = PG_GETARG_TEXT_PP(0); - text *item; *nentries = 1; entries = (Datum *) palloc(sizeof(Datum)); - item = make_text_key(VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query), - JKEYELEM); - entries[0] = PointerGetDatum(item); + entries[0] = make_text_key(JGINFLAG_KEY, + VARDATA_ANY(query), + VARSIZE_ANY_EXHDR(query)); } else if (strategy == JsonbExistsAnyStrategyNumber || strategy == JsonbExistsAllStrategyNumber) { + /* Query is a text array; each element is treated as a key */ ArrayType *query = PG_GETARG_ARRAYTYPE_P(0); Datum *key_datums; bool *key_nulls; int key_count; int i, j; - text *item; deconstruct_array(query, TEXTOID, -1, false, 'i', @@ -194,15 +165,14 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) entries = (Datum *) palloc(sizeof(Datum) * key_count); - for (i = 0, j = 0; i < key_count; ++i) + for (i = 0, j = 0; i < key_count; i++) { /* Nulls in the array are ignored */ if (key_nulls[i]) continue; - item = make_text_key(VARDATA(key_datums[i]), - VARSIZE(key_datums[i]) - VARHDRSZ, - JKEYELEM); - entries[j++] = PointerGetDatum(item); + entries[j++] = make_text_key(JGINFLAG_KEY, + VARDATA_ANY(key_datums[i]), + VARSIZE_ANY_EXHDR(key_datums[i])); } *nentries = j; @@ -236,13 +206,12 @@ gin_consistent_jsonb(PG_FUNCTION_ARGS) if (strategy == JsonbContainsStrategyNumber) { /* - * Index doesn't have information about correspondence of Jsonb keys - * and values (as distinct from GIN keys, which a key/value pair is - * stored as), so invariably we recheck. Besides, there are some - * special rules around the containment of raw scalar arrays and - * regular arrays that are not represented here. However, if all of - * the keys are not present, that's sufficient reason to return false - * and finish immediately. + * We must always recheck, since we can't tell from the index whether + * the positions of the matched items match the structure of the query + * object. (Even if we could, we'd also have to worry about hashed + * keys and the index's failure to distinguish keys from string array + * elements.) However, the tuple certainly doesn't match unless it + * contains all the query keys. */ *recheck = true; for (i = 0; i < nkeys; i++) @@ -256,20 +225,27 @@ gin_consistent_jsonb(PG_FUNCTION_ARGS) } else if (strategy == JsonbExistsStrategyNumber) { - /* Existence of key guaranteed in default search mode */ - *recheck = false; + /* + * Although the key is certainly present in the index, we must recheck + * because (1) the key might be hashed, and (2) the index match might + * be for a key that's not at top level of the JSON object. For (1), + * we could look at the query key to see if it's hashed and not + * recheck if not, but the index lacks enough info to tell about (2). + */ + *recheck = true; res = true; } else if (strategy == JsonbExistsAnyStrategyNumber) { - /* Existence of key guaranteed in default search mode */ - *recheck = false; + /* As for plain exists, we must recheck */ + *recheck = true; res = true; } else if (strategy == JsonbExistsAllStrategyNumber) { - /* Testing for the presence of all keys gives an exact result */ - *recheck = false; + /* As for plain exists, we must recheck */ + *recheck = true; + /* ... but unless all the keys are present, we can say "false" */ for (i = 0; i < nkeys; i++) { if (!check[i]) @@ -295,19 +271,18 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS) int32 nkeys = PG_GETARG_INT32(3); /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ - GinTernaryValue res = GIN_TRUE; - + GinTernaryValue res = GIN_MAYBE; int32 i; - if (strategy == JsonbContainsStrategyNumber) + /* + * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this + * corresponds to always forcing recheck in the regular consistent + * function, for the reasons listed there. + */ + if (strategy == JsonbContainsStrategyNumber || + strategy == JsonbExistsAllStrategyNumber) { - bool has_maybe = false; - - /* - * All extracted keys must be present. Combination of GIN_MAYBE and - * GIN_TRUE gives GIN_MAYBE result because then all keys may be - * present. - */ + /* All extracted keys must be present */ for (i = 0; i < nkeys; i++) { if (check[i] == GIN_FALSE) @@ -315,55 +290,21 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS) res = GIN_FALSE; break; } - if (check[i] == GIN_MAYBE) - { - res = GIN_MAYBE; - has_maybe = true; - } } - - /* - * Index doesn't have information about correspondence of Jsonb keys - * and values (as distinct from GIN keys, which a key/value pair is - * stored as), so invariably we recheck. This is also reflected in - * how GIN_MAYBE is given in response to there being no GIN_MAYBE - * input. - */ - if (!has_maybe && res == GIN_TRUE) - res = GIN_MAYBE; } else if (strategy == JsonbExistsStrategyNumber || strategy == JsonbExistsAnyStrategyNumber) { - /* Existence of key guaranteed in default search mode */ + /* At least one extracted key must be present */ res = GIN_FALSE; for (i = 0; i < nkeys; i++) { - if (check[i] == GIN_TRUE) - { - res = GIN_TRUE; - break; - } - if (check[i] == GIN_MAYBE) + if (check[i] == GIN_TRUE || + check[i] == GIN_MAYBE) { res = GIN_MAYBE; - } - } - } - else if (strategy == JsonbExistsAllStrategyNumber) - { - /* Testing for the presence of all keys gives an exact result */ - for (i = 0; i < nkeys; i++) - { - if (check[i] == GIN_FALSE) - { - res = GIN_FALSE; break; } - if (check[i] == GIN_MAYBE) - { - res = GIN_MAYBE; - } } } else @@ -376,7 +317,151 @@ gin_triconsistent_jsonb(PG_FUNCTION_ARGS) * * jsonb_hash_ops GIN opclass support functions * + * In a jsonb_hash_ops index, the GIN keys are uint32 hashes, one per JSON + * value; but the JSON key(s) leading to each value are also included in its + * hash computation. This means we can only support containment queries, + * but the index can distinguish, for example, {"foo": 42} from {"bar": 42} + * since different hashes will be generated. + * */ + +Datum +gin_extract_jsonb_hash(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB(0); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + int total = 2 * JB_ROOT_COUNT(jb); + JsonbIterator *it; + JsonbValue v; + PathHashStack tail; + PathHashStack *stack; + int i = 0, + r; + Datum *entries; + + /* If the root level is empty, we certainly have no keys */ + if (total == 0) + { + *nentries = 0; + PG_RETURN_POINTER(NULL); + } + + /* Otherwise, use 2 * root count as initial estimate of result size */ + entries = (Datum *) palloc(sizeof(Datum) * total); + + /* We keep a stack of hashes corresponding to parent key levels */ + tail.parent = NULL; + tail.hash = 0; + stack = &tail; + + it = JsonbIteratorInit(&jb->root); + + while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + PathHashStack *parent; + + /* Since we recurse into the object, we might need more space */ + if (i >= total) + { + total *= 2; + entries = (Datum *) repalloc(entries, sizeof(Datum) * total); + } + + switch (r) + { + case WJB_BEGIN_ARRAY: + case WJB_BEGIN_OBJECT: + /* Push a stack level for this object */ + parent = stack; + stack = (PathHashStack *) palloc(sizeof(PathHashStack)); + + if (parent->parent) + { + /* + * We pass forward hashes from previous container nesting + * levels so that nested arrays with an outermost nested + * object will have element hashes mixed with the + * outermost key. It's also somewhat useful to have + * nested objects' innermost values have hashes that are a + * function of not just their own key, but outer keys too. + * + * Nesting an array within another array will not alter + * innermost scalar element hash values, but that seems + * inconsequential. + */ + stack->hash = parent->hash; + } + else + { + /* + * At the outermost level, initialize hash with container + * type proxy value. Note that this makes JB_FARRAY and + * JB_FOBJECT part of the on-disk representation, but they + * are that in the base jsonb object storage already. + */ + stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT; + } + stack->parent = parent; + break; + case WJB_KEY: + /* initialize hash from parent */ + stack->hash = stack->parent->hash; + /* and mix in this key */ + JsonbHashScalarValue(&v, &stack->hash); + /* hash is now ready to incorporate the value */ + break; + case WJB_ELEM: + /* array elements use parent hash mixed with element's hash */ + stack->hash = stack->parent->hash; + /* FALL THRU */ + case WJB_VALUE: + /* mix the element or value's hash into the prepared hash */ + JsonbHashScalarValue(&v, &stack->hash); + /* and emit an index entry */ + entries[i++] = UInt32GetDatum(stack->hash); + /* Note: we assume we'll see KEY before another VALUE */ + break; + case WJB_END_ARRAY: + case WJB_END_OBJECT: + /* Pop the stack */ + parent = stack->parent; + pfree(stack); + stack = parent; + break; + default: + elog(ERROR, "invalid JsonbIteratorNext rc: %d", r); + } + } + + *nentries = i; + + PG_RETURN_POINTER(entries); +} + +Datum +gin_extract_jsonb_query_hash(PG_FUNCTION_ARGS) +{ + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); + Datum *entries; + + if (strategy != JsonbContainsStrategyNumber) + elog(ERROR, "unrecognized strategy number: %d", strategy); + + /* Query is a jsonb, so just apply gin_extract_jsonb_hash ... */ + entries = (Datum *) + DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_hash, + PG_GETARG_DATUM(0), + PointerGetDatum(nentries))); + + /* ... although "contains {}" requires a full index scan */ + if (*nentries == 0) + *searchMode = GIN_SEARCH_MODE_ALL; + + PG_RETURN_POINTER(entries); +} + Datum gin_consistent_jsonb_hash(PG_FUNCTION_ARGS) { @@ -395,13 +480,13 @@ gin_consistent_jsonb_hash(PG_FUNCTION_ARGS) elog(ERROR, "unrecognized strategy number: %d", strategy); /* - * jsonb_hash_ops index doesn't have information about correspondence of - * Jsonb keys and values (as distinct from GIN keys, which a key/value - * pair is stored as), so invariably we recheck. Besides, there are some + * jsonb_hash_ops is necessarily lossy, not only because of hash + * collisions but also because it doesn't preserve complete information + * about the structure of the JSON object. Besides, there are some * special rules around the containment of raw scalar arrays and regular - * arrays that are not represented here. However, if all of the keys are - * not present, that's sufficient reason to return false and finish - * immediately. + * arrays that are not handled here. So we must always recheck a match. + * However, if not all of the keys are present, the tuple certainly + * doesn't match. */ *recheck = true; for (i = 0; i < nkeys; i++) @@ -426,17 +511,16 @@ gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS) int32 nkeys = PG_GETARG_INT32(3); /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ - GinTernaryValue res = GIN_TRUE; + GinTernaryValue res = GIN_MAYBE; int32 i; - bool has_maybe = false; if (strategy != JsonbContainsStrategyNumber) elog(ERROR, "unrecognized strategy number: %d", strategy); /* - * All extracted keys must be present. A combination of GIN_MAYBE and - * GIN_TRUE induces a GIN_MAYBE result, because then all keys may be - * present. + * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this + * corresponds to always forcing recheck in the regular consistent + * function, for the reasons listed there. */ for (i = 0; i < nkeys; i++) { @@ -445,161 +529,39 @@ gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS) res = GIN_FALSE; break; } - if (check[i] == GIN_MAYBE) - { - res = GIN_MAYBE; - has_maybe = true; - } } - /* - * jsonb_hash_ops index doesn't have information about correspondence of - * Jsonb keys and values (as distinct from GIN keys, which for this - * opclass are a hash of a pair, or a hash of just an element), so - * invariably we recheck. This is also reflected in how GIN_MAYBE is - * given in response to there being no GIN_MAYBE input. - */ - if (!has_maybe && res == GIN_TRUE) - res = GIN_MAYBE; - PG_RETURN_GIN_TERNARY_VALUE(res); } -Datum -gin_extract_jsonb_hash(PG_FUNCTION_ARGS) -{ - Jsonb *jb = PG_GETARG_JSONB(0); - int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - int total = 2 * JB_ROOT_COUNT(jb); - JsonbIterator *it; - JsonbValue v; - PathHashStack tail; - PathHashStack *stack; - int i = 0, - r; - Datum *entries = NULL; - - if (total == 0) - { - *nentries = 0; - PG_RETURN_POINTER(NULL); - } - - entries = (Datum *) palloc(sizeof(Datum) * total); - - it = JsonbIteratorInit(&jb->root); - - tail.parent = NULL; - tail.hash = 0; - stack = &tail; - - while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) - { - PathHashStack *tmp; - - if (i >= total) - { - total *= 2; - entries = (Datum *) repalloc(entries, sizeof(Datum) * total); - } - - switch (r) - { - case WJB_BEGIN_ARRAY: - case WJB_BEGIN_OBJECT: - tmp = stack; - stack = (PathHashStack *) palloc(sizeof(PathHashStack)); - - /* - * Nesting an array within another array will not alter - * innermost scalar element hash values, but that seems - * inconsequential - */ - if (tmp->parent) - { - /* - * We pass forward hashes from previous container nesting - * levels so that nested arrays with an outermost nested - * object will have element hashes mixed with the - * outermost key. It's also somewhat useful to have - * nested objects innermost values have hashes that are a - * function of not just their own key, but outer keys too. - */ - stack->hash = tmp->hash; - } - else - { - /* - * At least nested level, initialize with stable container - * type proxy value - */ - stack->hash = (r == WJB_BEGIN_ARRAY) ? JB_FARRAY : JB_FOBJECT; - } - stack->parent = tmp; - break; - case WJB_KEY: - /* Initialize hash from parent */ - stack->hash = stack->parent->hash; - JsonbHashScalarValue(&v, &stack->hash); - break; - case WJB_ELEM: - /* Elements have parent hash mixed in separately */ - stack->hash = stack->parent->hash; - case WJB_VALUE: - /* Element/value case */ - JsonbHashScalarValue(&v, &stack->hash); - entries[i++] = UInt32GetDatum(stack->hash); - break; - case WJB_END_ARRAY: - case WJB_END_OBJECT: - /* Pop the stack */ - tmp = stack->parent; - pfree(stack); - stack = tmp; - break; - default: - elog(ERROR, "invalid JsonbIteratorNext rc: %d", r); - } - } - - *nentries = i; - - PG_RETURN_POINTER(entries); -} - -Datum -gin_extract_jsonb_query_hash(PG_FUNCTION_ARGS) -{ - int32 *nentries = (int32 *) PG_GETARG_POINTER(1); - StrategyNumber strategy = PG_GETARG_UINT16(2); - int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); - Datum *entries; - - if (strategy != JsonbContainsStrategyNumber) - elog(ERROR, "unrecognized strategy number: %d", strategy); - - /* Query is a jsonb, so just apply gin_extract_jsonb... */ - entries = (Datum *) - DatumGetPointer(DirectFunctionCall2(gin_extract_jsonb_hash, - PG_GETARG_DATUM(0), - PointerGetDatum(nentries))); - - /* ...although "contains {}" requires a full index scan */ - if (entries == NULL) - *searchMode = GIN_SEARCH_MODE_ALL; - - PG_RETURN_POINTER(entries); -} - /* - * Build a text value from a cstring and flag suitable for storage as a key - * value + * Construct a jsonb_ops GIN key from a flag byte and a textual representation + * (which need not be null-terminated). This function is responsible + * for hashing overlength text representations; it will add the + * JGINFLAG_HASHED bit to the flag value if it does that. */ -static text * -make_text_key(const char *str, int len, char flag) +static Datum +make_text_key(char flag, const char *str, int len) { text *item; + char hashbuf[10]; + if (len > JGIN_MAXLENGTH) + { + uint32 hashval; + + hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len)); + snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval); + str = hashbuf; + len = 8; + flag |= JGINFLAG_HASHED; + } + + /* + * Now build the text Datum. For simplicity we build a 4-byte-header + * varlena text Datum here, but we expect it will get converted to short + * header format when stored in the index. + */ item = (text *) palloc(VARHDRSZ + len + 1); SET_VARSIZE(item, VARHDRSZ + len + 1); @@ -607,31 +569,39 @@ make_text_key(const char *str, int len, char flag) memcpy(VARDATA(item) + 1, str, len); - return item; + return PointerGetDatum(item); } /* - * Create a textual representation of a jsonbValue for GIN storage. + * Create a textual representation of a JsonbValue that will serve as a GIN + * key in a jsonb_ops index. is_key is true if the JsonbValue is a key, + * or if it is a string array element (since we pretend those are keys, + * see jsonb.h). */ -static text * -make_scalar_key(const JsonbValue *scalarVal, char flag) +static Datum +make_scalar_key(const JsonbValue *scalarVal, bool is_key) { - text *item; + Datum item; char *cstr; switch (scalarVal->type) { case jbvNull: - item = make_text_key("n", 1, flag); + Assert(!is_key); + item = make_text_key(JGINFLAG_NULL, "", 0); break; case jbvBool: - item = make_text_key(scalarVal->val.boolean ? "t" : "f", 1, flag); + Assert(!is_key); + item = make_text_key(JGINFLAG_BOOL, + scalarVal->val.boolean ? "t" : "f", 1); break; case jbvNumeric: + Assert(!is_key); /* - * A normalized textual representation, free of trailing zeroes is - * is required. + * A normalized textual representation, free of trailing zeroes, + * is required so that numerically equal values will produce equal + * strings. * * It isn't ideal that numerics are stored in a relatively bulky * textual format. However, it's a notationally convenient way of @@ -639,15 +609,18 @@ make_scalar_key(const JsonbValue *scalarVal, char flag) * strings takes precedence. */ cstr = numeric_normalize(scalarVal->val.numeric); - item = make_text_key(cstr, strlen(cstr), flag); + item = make_text_key(JGINFLAG_NUM, cstr, strlen(cstr)); pfree(cstr); break; case jbvString: - item = make_text_key(scalarVal->val.string.val, scalarVal->val.string.len, - flag); + item = make_text_key(is_key ? JGINFLAG_KEY : JGINFLAG_STR, + scalarVal->val.string.val, + scalarVal->val.string.len); break; default: - elog(ERROR, "invalid jsonb scalar type"); + elog(ERROR, "unrecognized jsonb scalar type: %d", scalarVal->type); + item = 0; /* keep compiler quiet */ + break; } return item; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2eb78128be..f37a78a264 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201405051 +#define CATALOG_VERSION_NO 201405091 #endif diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h index fc746c8b74..1a6409ac0d 100644 --- a/src/include/utils/jsonb.h +++ b/src/include/utils/jsonb.h @@ -29,25 +29,41 @@ typedef enum WJB_END_OBJECT } JsonbIteratorToken; -/* - * When using a GIN index for jsonb, we choose to index both keys and values. - * The storage format is text, with K, or V prepended to the string to indicate - * key/element or value/element. - * - * Jsonb Keys and string array elements are treated equivalently when - * serialized to text index storage. One day we may wish to create an opclass - * that only indexes values, but for now keys and values are stored in GIN - * indexes in a way that doesn't really consider their relationship to each - * other. - */ -#define JKEYELEM 'K' -#define JVAL 'V' - +/* Strategy numbers for GIN index opclasses */ #define JsonbContainsStrategyNumber 7 #define JsonbExistsStrategyNumber 9 #define JsonbExistsAnyStrategyNumber 10 #define JsonbExistsAllStrategyNumber 11 +/* + * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both + * keys and values. The storage format is text. The first byte of the text + * string distinguishes whether this is a key (always a string), null value, + * boolean value, numeric value, or string value. However, array elements + * that are strings are marked as though they were keys; this imprecision + * supports the definition of the "exists" operator, which treats array + * elements like keys. The remainder of the text string is empty for a null + * value, "t" or "f" for a boolean value, a normalized print representation of + * a numeric value, or the text of a string value. However, if the length of + * this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash + * the text representation and store an 8-hex-digit representation of the + * uint32 hash value, marking the prefix byte with an additional bit to + * distinguish that this has happened. Hashing long strings saves space and + * ensures that we won't overrun the maximum entry length for a GIN index. + * (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit. It's chosen + * to ensure that the on-disk text datum will have a short varlena header.) + * Note that when any hashed item appears in a query, we must recheck index + * matches against the heap tuple; currently, this costs nothing because we + * must always recheck for other reasons. + */ +#define JGINFLAG_KEY 0x01 /* key (or string array element) */ +#define JGINFLAG_NULL 0x02 /* null value */ +#define JGINFLAG_BOOL 0x03 /* boolean value */ +#define JGINFLAG_NUM 0x04 /* numeric value */ +#define JGINFLAG_STR 0x05 /* string value (if not an array element) */ +#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ +#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ + /* Convenience macros */ #define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d)) #define JsonbGetDatum(p) PointerGetDatum(p) @@ -332,12 +348,12 @@ extern Datum gin_consistent_jsonb_hash(PG_FUNCTION_ARGS); extern Datum gin_triconsistent_jsonb_hash(PG_FUNCTION_ARGS); /* Support functions */ -extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); +extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader, - uint32 flags, - JsonbValue *key); + uint32 flags, + JsonbValue *key); extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader, - uint32 i); + uint32 i); extern JsonbValue *pushJsonbValue(JsonbParseState **pstate, JsonbIteratorToken seq, JsonbValue *scalarVal); extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container); diff --git a/src/test/regress/data/jsonb.data b/src/test/regress/data/jsonb.data index 1352ebe3ac..622501b236 100644 --- a/src/test/regress/data/jsonb.data +++ b/src/test/regress/data/jsonb.data @@ -1006,4 +1006,7 @@ {"wait":null, "line":1000} {"age":25} {"age":25.0} +{"foo": {"bar": "baz"}} +{"foo": {"blah": "baz"}} +{"fool": {"bar": "baz"}} {} diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out index 8bd0131100..c5a7d64ae4 100644 --- a/src/test/regress/expected/jsonb.out +++ b/src/test/regress/expected/jsonb.out @@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public'; 194 (1 row) +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; + count +------- + 0 +(1 row) + SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; count ------- @@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}'; SELECT count(*) FROM testjsonb WHERE j @> '{}'; count ------- - 1009 + 1012 (1 row) SELECT count(*) FROM testjsonb WHERE j ? 'public'; @@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public'; 194 (1 row) +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; + count +------- + 0 +(1 row) + SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; count ------- @@ -1591,7 +1603,7 @@ RESET enable_seqscan; SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow; count ------- - 4788 + 4791 (1 row) SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key; @@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO abstract | 161 array | 5 age | 2 -(24 rows) + foo | 2 + fool | 1 +(26 rows) -- sort/hash SELECT count(distinct j) FROM testjsonb; count ------- - 891 + 894 (1 row) SET enable_hashagg = off; SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2; count ------- - 891 + 894 (1 row) SET enable_hashagg = on; @@ -1642,7 +1656,7 @@ SET enable_sort = off; SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2; count ------- - 891 + 894 (1 row) SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j); @@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}'; SELECT count(*) FROM testjsonb WHERE j @> '{}'; count ------- - 1009 + 1012 (1 row) RESET enable_seqscan; diff --git a/src/test/regress/expected/jsonb_1.out b/src/test/regress/expected/jsonb_1.out index 35524fb9a7..0e3ebd161e 100644 --- a/src/test/regress/expected/jsonb_1.out +++ b/src/test/regress/expected/jsonb_1.out @@ -1483,6 +1483,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public'; 194 (1 row) +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; + count +------- + 0 +(1 row) + SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; count ------- @@ -1543,7 +1549,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}'; SELECT count(*) FROM testjsonb WHERE j @> '{}'; count ------- - 1009 + 1012 (1 row) SELECT count(*) FROM testjsonb WHERE j ? 'public'; @@ -1552,6 +1558,12 @@ SELECT count(*) FROM testjsonb WHERE j ? 'public'; 194 (1 row) +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; + count +------- + 0 +(1 row) + SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; count ------- @@ -1591,7 +1603,7 @@ RESET enable_seqscan; SELECT count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow; count ------- - 4788 + 4791 (1 row) SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GROUP BY key ORDER BY count DESC, key; @@ -1621,20 +1633,22 @@ SELECT key, count(*) FROM (SELECT (jsonb_each(j)).key FROM testjsonb) AS wow GRO abstract | 161 array | 5 age | 2 -(24 rows) + foo | 2 + fool | 1 +(26 rows) -- sort/hash SELECT count(distinct j) FROM testjsonb; count ------- - 891 + 894 (1 row) SET enable_hashagg = off; SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2; count ------- - 891 + 894 (1 row) SET enable_hashagg = on; @@ -1642,7 +1656,7 @@ SET enable_sort = off; SELECT count(*) FROM (SELECT j FROM (SELECT * FROM testjsonb UNION ALL SELECT * FROM testjsonb) js GROUP BY j) js2; count ------- - 891 + 894 (1 row) SELECT distinct * FROM (values (jsonb '{}' || ''),('{}')) v(j); @@ -1709,7 +1723,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}'; SELECT count(*) FROM testjsonb WHERE j @> '{}'; count ------- - 1009 + 1012 (1 row) RESET enable_seqscan; diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index 3ee43e9347..3e9048911b 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -334,6 +334,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"wait":"CC", "public":true}'; SELECT count(*) FROM testjsonb WHERE j @> '{"age":25}'; SELECT count(*) FROM testjsonb WHERE j @> '{"age":25.0}'; SELECT count(*) FROM testjsonb WHERE j ? 'public'; +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled']; @@ -350,6 +351,7 @@ SELECT count(*) FROM testjsonb WHERE j @> '{"array":["bar"]}'; -- excercise GIN_SEARCH_MODE_ALL SELECT count(*) FROM testjsonb WHERE j @> '{}'; SELECT count(*) FROM testjsonb WHERE j ? 'public'; +SELECT count(*) FROM testjsonb WHERE j ? 'bar'; SELECT count(*) FROM testjsonb WHERE j ?| ARRAY['public','disabled']; SELECT count(*) FROM testjsonb WHERE j ?& ARRAY['public','disabled'];