postgresql/src/backend/utils/adt/jsonb.c
Tom Lane def4c28cf9 Change JSONB's on-disk format for improved performance.
The original design used an array of offsets into the variable-length
portion of a JSONB container.  However, such an array is basically
uncompressible by simple compression techniques such as TOAST's LZ
compressor.  That's bad enough, but because the offset array is at the
front, it tended to trigger the give-up-after-1KB heuristic in the TOAST
code, so that the entire JSONB object was stored uncompressed; which was
the root cause of bug #11109 from Larry White.

To fix without losing the ability to extract a random array element in O(1)
time, change this scheme so that most of the JEntry array elements hold
lengths rather than offsets.  With data that's compressible at all, there
tend to be fewer distinct element lengths, so that there is scope for
compression of the JEntry array.  Every N'th entry is still an offset.
To determine the length or offset of any specific element, we might have
to examine up to N preceding JEntrys, but that's still O(1) so far as the
total container size is concerned.  Testing shows that this cost is
negligible compared to other costs of accessing a JSONB field, and that
the method does largely fix the incompressible-data problem.

While at it, rearrange the order of elements in a JSONB object so that
it's "all the keys, then all the values" not alternating keys and values.
This doesn't really make much difference right at the moment, but it will
allow providing a fast path for extracting individual object fields from
large JSONB values stored EXTERNAL (ie, uncompressed), analogously to the
existing optimization for substring extraction from large EXTERNAL text
values.

Bump catversion to denote the incompatibility in on-disk format.
We will need to fix pg_upgrade to disallow upgrading jsonb data stored
with 9.4 betas 1 and 2.

Heikki Linnakangas and Tom Lane
2014-09-29 12:29:21 -04:00

465 lines
11 KiB
C

/*-------------------------------------------------------------------------
*
* jsonb.c
* I/O routines for jsonb type
*
* Copyright (c) 2014, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/utils/adt/jsonb.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "libpq/pqformat.h"
#include "utils/builtins.h"
#include "utils/json.h"
#include "utils/jsonapi.h"
#include "utils/jsonb.h"
/*
 * Working state shared by the jsonb_in_* parser callbacks while a jsonb
 * value is being assembled from JSON text.
 */
typedef struct JsonbInState
{
/* builder state; its address is handed to every pushJsonbValue() call */
JsonbParseState *parseState;
/* result of the most recent pushJsonbValue() call; converted to a Jsonb once parsing is done */
JsonbValue *res;
} JsonbInState;
/*
 * Forward declarations of the file-local helpers.  The jsonb_in_* functions
 * are installed as JSON parser callbacks by jsonb_from_cstring(); the
 * remaining ones are plain helpers for input checking and text output.
 */
static inline Datum jsonb_from_cstring(char *json, int len);
static size_t checkStringLen(size_t len);
static void jsonb_in_object_start(void *pstate);
static void jsonb_in_object_end(void *pstate);
static void jsonb_in_array_start(void *pstate);
static void jsonb_in_array_end(void *pstate);
static void jsonb_in_object_field_start(void *pstate, char *fname, bool isnull);
static void jsonb_put_escaped_value(StringInfo out, JsonbValue *scalarVal);
static void jsonb_in_scalar(void *pstate, char *token, JsonTokenType tokentype);
/*
 * jsonb_in
 *
 * Input function for the jsonb type: takes the NUL-terminated text form and
 * hands it, together with its length, to jsonb_from_cstring().
 */
Datum
jsonb_in(PG_FUNCTION_ARGS)
{
    char       *input = PG_GETARG_CSTRING(0);
    int         input_len = strlen(input);

    return jsonb_from_cstring(input, input_len);
}
/*
 * jsonb_recv
 *
 * Binary receive function for the jsonb type.
 *
 * The binary representation is a version number followed by the value
 * rendered as text, so apart from the version check this does the same work
 * as the input function.  The version prefix leaves room to change the
 * binary format later; at present only version 1 is accepted and any other
 * value raises an error.
 */
Datum
jsonb_recv(PG_FUNCTION_ARGS)
{
    StringInfo  msg = (StringInfo) PG_GETARG_POINTER(0);
    int         version = pq_getmsgint(msg, 1);
    char       *payload;
    int         payload_len;

    if (version != 1)
        elog(ERROR, "unsupported jsonb version number %d", version);

    /* everything left in the message after the version is the JSON text */
    payload = pq_getmsgtext(msg, msg->len - msg->cursor, &payload_len);

    return jsonb_from_cstring(payload, payload_len);
}
/*
 * jsonb_out
 *
 * Output function for the jsonb type: renders the root container as a
 * C string, using the datum's size as the initial buffer-size estimate.
 */
Datum
jsonb_out(PG_FUNCTION_ARGS)
{
    Jsonb      *value = PG_GETARG_JSONB(0);

    PG_RETURN_CSTRING(JsonbToCString(NULL, &value->root, VARSIZE(value)));
}
/*
 * jsonb_send
 *
 * Binary send function for the jsonb type.  The message consists of a
 * version number (currently 1) followed by the value rendered as text.
 */
Datum
jsonb_send(PG_FUNCTION_ARGS)
{
    Jsonb      *value = PG_GETARG_JSONB(0);
    const int   wire_version = 1;
    StringInfo  rendered = makeStringInfo();
    StringInfoData msg;

    /* render into our own StringInfo so the text length is known */
    (void) JsonbToCString(rendered, &value->root, VARSIZE(value));

    pq_begintypsend(&msg);
    pq_sendint(&msg, wire_version, 1);
    pq_sendtext(&msg, rendered->data, rendered->len);

    /* the rendered text has been copied into the message; release it */
    pfree(rendered->data);
    pfree(rendered);

    PG_RETURN_BYTEA_P(pq_endtypsend(&msg));
}
/*
 * SQL function jsonb_typeof(jsonb) -> text
 *
 * Returns the name of the root value's JSON type: "object", "array",
 * "string", "number", "boolean" or "null".
 *
 * This function lives here because its json counterpart is in json.c, where
 * it relies on json parser internals that are not exposed elsewhere.
 */
Datum
jsonb_typeof(PG_FUNCTION_ARGS)
{
    Jsonb      *jb = PG_GETARG_JSONB(0);
    JsonbIterator *iter;
    JsonbValue  item;
    char       *type_name;

    /* Containers can be classified from the root header alone. */
    if (JB_ROOT_IS_OBJECT(jb))
        PG_RETURN_TEXT_P(cstring_to_text("object"));

    if (JB_ROOT_IS_ARRAY(jb) && !JB_ROOT_IS_SCALAR(jb))
        PG_RETURN_TEXT_P(cstring_to_text("array"));

    Assert(JB_ROOT_IS_SCALAR(jb));

    /*
     * A root scalar is stored as a one-element array: the first iterator
     * step yields that array, the second yields the scalar itself.
     */
    iter = JsonbIteratorInit(&jb->root);

    (void) JsonbIteratorNext(&iter, &item, true);
    Assert(item.type == jbvArray);
    (void) JsonbIteratorNext(&iter, &item, true);

    switch (item.type)
    {
        case jbvNull:
            type_name = "null";
            break;
        case jbvString:
            type_name = "string";
            break;
        case jbvNumeric:
            type_name = "number";
            break;
        case jbvBool:
            type_name = "boolean";
            break;
        default:
            elog(ERROR, "unknown jsonb scalar type");
    }

    PG_RETURN_TEXT_P(cstring_to_text(type_name));
}
/*
 * jsonb_from_cstring
 *
 * Converts 'len' bytes of JSON text at 'json' into a jsonb Datum.
 *
 * The regular json parser does the work; the jsonb_in_* callbacks installed
 * here receive the parse events and build the value incrementally in a
 * JsonbInState.
 */
static inline Datum
jsonb_from_cstring(char *json, int len)
{
    JsonbInState build;
    JsonSemAction actions;
    JsonLexContext *lexer;

    memset(&build, 0, sizeof(build));
    memset(&actions, 0, sizeof(actions));

    /* route every parse event we care about to the jsonb builders */
    actions.semstate = (void *) &build;
    actions.object_start = jsonb_in_object_start;
    actions.object_end = jsonb_in_object_end;
    actions.object_field_start = jsonb_in_object_field_start;
    actions.array_start = jsonb_in_array_start;
    actions.array_end = jsonb_in_array_end;
    actions.scalar = jsonb_in_scalar;

    lexer = makeJsonLexContextCstringLen(json, len, true);
    pg_parse_json(lexer, &actions);

    /* once parsing is finished, build.res holds the composed value */
    PG_RETURN_POINTER(JsonbValueToJsonb(build.res));
}
/*
 * checkStringLen
 *
 * Returns 'len' unchanged when it does not exceed JENTRY_OFFLENMASK;
 * otherwise reports an ERROR saying the string is too long for jsonb.
 */
static size_t
checkStringLen(size_t len)
{
    if (len <= JENTRY_OFFLENMASK)
        return len;

    ereport(ERROR,
            (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
             errmsg("string too long to represent as jsonb string"),
             errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.",
                       JENTRY_OFFLENMASK)));

    /* the ERROR report above does not return; this only satisfies the compiler */
    return len;
}
static void
jsonb_in_object_start(void *pstate)
{
JsonbInState *_state = (JsonbInState *) pstate;
_state->res = pushJsonbValue(&_state->parseState, WJB_BEGIN_OBJECT, NULL);
}
static void
jsonb_in_object_end(void *pstate)
{
JsonbInState *_state = (JsonbInState *) pstate;
_state->res = pushJsonbValue(&_state->parseState, WJB_END_OBJECT, NULL);
}
static void
jsonb_in_array_start(void *pstate)
{
JsonbInState *_state = (JsonbInState *) pstate;
_state->res = pushJsonbValue(&_state->parseState, WJB_BEGIN_ARRAY, NULL);
}
static void
jsonb_in_array_end(void *pstate)
{
JsonbInState *_state = (JsonbInState *) pstate;
_state->res = pushJsonbValue(&_state->parseState, WJB_END_ARRAY, NULL);
}
/*
 * Parser callback: an object field named 'fname' is starting.  The name is
 * pushed as a string key (WJB_KEY) after its length has been validated.
 * 'isnull' is part of the callback signature but is not used here.
 */
static void
jsonb_in_object_field_start(void *pstate, char *fname, bool isnull)
{
    JsonbInState *in = (JsonbInState *) pstate;
    JsonbValue  key;

    Assert(fname != NULL);

    key.type = jbvString;
    key.val.string.val = fname;
    key.val.string.len = checkStringLen(strlen(fname));

    in->res = pushJsonbValue(&in->parseState, WJB_KEY, &key);
}
/*
 * jsonb_put_escaped_value
 *
 * Appends the JSON text form of the scalar 'scalarVal' to 'out':
 *   - null     -> null
 *   - string   -> the string as escaped by escape_json()
 *   - numeric  -> the output of numeric_out
 *   - boolean  -> true / false
 * Any other value type raises an ERROR.
 *
 * Temporary strings created along the way are released before returning,
 * so that rendering a value with many scalars does not pile up one
 * allocation per scalar until the surrounding memory context is reset.
 */
static void
jsonb_put_escaped_value(StringInfo out, JsonbValue *scalarVal)
{
    switch (scalarVal->type)
    {
        case jbvNull:
            appendBinaryStringInfo(out, "null", 4);
            break;
        case jbvString:
            {
                /*
                 * The stored string is described by pointer plus length, so
                 * escape_json() is handed a terminated copy of exactly that
                 * many bytes; the copy is not needed once it is escaped.
                 */
                char       *copy = pnstrdup(scalarVal->val.string.val,
                                            scalarVal->val.string.len);

                escape_json(out, copy);
                pfree(copy);
            }
            break;
        case jbvNumeric:
            {
                char       *digits;

                digits = DatumGetCString(DirectFunctionCall1(numeric_out,
                                  PointerGetDatum(scalarVal->val.numeric)));
                appendStringInfoString(out, digits);
                /* the text has been copied into 'out' */
                pfree(digits);
            }
            break;
        case jbvBool:
            if (scalarVal->val.boolean)
                appendBinaryStringInfo(out, "true", 4);
            else
                appendBinaryStringInfo(out, "false", 5);
            break;
        default:
            elog(ERROR, "unknown jsonb scalar type");
    }
}
/*
 * Parser callback: a scalar token has been read.
 *
 * For jsonb we always want the de-escaped value, which is what 'token'
 * holds.  The token is converted to a JsonbValue and then attached to the
 * value under construction: as an array element, as an object value, or -
 * when no container is open - as the sole element of a raw-scalar array.
 */
static void
jsonb_in_scalar(void *pstate, char *token, JsonTokenType tokentype)
{
    JsonbInState *in = (JsonbInState *) pstate;
    JsonbValue  scalar;
    JsonbValue *parent;

    switch (tokentype)
    {
        case JSON_TOKEN_STRING:
            Assert(token != NULL);
            scalar.type = jbvString;
            scalar.val.string.val = token;
            scalar.val.string.len = checkStringLen(strlen(token));
            break;

        case JSON_TOKEN_NUMBER:

            /*
             * Numeric values need no size check: the maximum numeric size is
             * well below the JsonbValue restriction.
             */
            Assert(token != NULL);
            scalar.type = jbvNumeric;
            scalar.val.numeric = DatumGetNumeric(DirectFunctionCall3(numeric_in, CStringGetDatum(token), 0, -1));
            break;

        case JSON_TOKEN_TRUE:
        case JSON_TOKEN_FALSE:
            scalar.type = jbvBool;
            scalar.val.boolean = (tokentype == JSON_TOKEN_TRUE);
            break;

        case JSON_TOKEN_NULL:
            scalar.type = jbvNull;
            break;

        default:
            /* should not be possible */
            elog(ERROR, "invalid json token type");
            break;
    }

    if (in->parseState == NULL)
    {
        /* single scalar: wrap it in a one-element raw-scalar array */
        JsonbValue  wrapper;

        wrapper.type = jbvArray;
        wrapper.val.array.nElems = 1;
        wrapper.val.array.rawScalar = true;

        in->res = pushJsonbValue(&in->parseState, WJB_BEGIN_ARRAY, &wrapper);
        in->res = pushJsonbValue(&in->parseState, WJB_ELEM, &scalar);
        in->res = pushJsonbValue(&in->parseState, WJB_END_ARRAY, NULL);
        return;
    }

    /* otherwise the enclosing container decides how the scalar is pushed */
    parent = &in->parseState->contVal;

    if (parent->type == jbvArray)
        in->res = pushJsonbValue(&in->parseState, WJB_ELEM, &scalar);
    else if (parent->type == jbvObject)
        in->res = pushJsonbValue(&in->parseState, WJB_VALUE, &scalar);
    else
        elog(ERROR, "unexpected parent of nested structure");
}
/*
 * JsonbToCString
 *	   Converts jsonb value to a C-string.
 *
 * If the 'out' argument is non-null, the resulting C-string is stored inside
 * that StringInfo; otherwise a new StringInfo is allocated here.  The
 * resulting string is always returned.
 *
 * 'estimated_len' is used as the initial amount of space to reserve in the
 * output buffer; a negative value selects a small default (64 bytes).
 *
 * A typical case for passing the StringInfo in rather than NULL is where the
 * caller wants access to the len attribute without having to call strlen, e.g.
 * if they are converting it to a text* object.
 */
char *
JsonbToCString(StringInfo out, JsonbContainer *in, int estimated_len)
{
    bool        first = true;   /* no separator needed before next item */
    JsonbIterator *it;
    int         type = 0;
    JsonbValue  v;
    int         level = 0;      /* nesting depth, checked at the end */
    bool        redo_switch = false;
    bool        raw_scalar = false; /* saw an array flagged rawScalar */

    if (out == NULL)
        out = makeStringInfo();

    enlargeStringInfo(out, (estimated_len >= 0) ? estimated_len : 64);

    it = JsonbIteratorInit(in);

    while (redo_switch ||
           ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE))
    {
        redo_switch = false;
        switch (type)
        {
            case WJB_BEGIN_ARRAY:
                if (!first)
                    appendBinaryStringInfo(out, ", ", 2);
                first = true;

                /*
                 * An array flagged rawScalar is printed without brackets.
                 * Remember the flag now: by the time the matching
                 * WJB_END_ARRAY arrives, 'v' has been reused for the
                 * element(s) in between.
                 */
                if (!v.val.array.rawScalar)
                    appendStringInfoChar(out, '[');
                else
                    raw_scalar = true;
                level++;
                break;
            case WJB_BEGIN_OBJECT:
                if (!first)
                    appendBinaryStringInfo(out, ", ", 2);
                first = true;
                appendStringInfoCharMacro(out, '{');

                level++;
                break;
            case WJB_KEY:
                if (!first)
                    appendBinaryStringInfo(out, ", ", 2);
                first = true;

                /* json rules guarantee this is a string */
                jsonb_put_escaped_value(out, &v);
                appendBinaryStringInfo(out, ": ", 2);

                /* fetch the value that belongs to this key */
                type = JsonbIteratorNext(&it, &v, false);
                if (type == WJB_VALUE)
                {
                    first = false;
                    jsonb_put_escaped_value(out, &v);
                }
                else
                {
                    Assert(type == WJB_BEGIN_OBJECT || type == WJB_BEGIN_ARRAY);

                    /*
                     * We need to rerun the current switch() since we need to
                     * output the object which we just got from the iterator
                     * before calling the iterator again.
                     */
                    redo_switch = true;
                }
                break;
            case WJB_ELEM:
                if (!first)
                    appendBinaryStringInfo(out, ", ", 2);
                else
                    first = false;

                jsonb_put_escaped_value(out, &v);
                break;
            case WJB_END_ARRAY:
                level--;

                /*
                 * Use the flag saved at WJB_BEGIN_ARRAY rather than reading
                 * v.val.array here.
                 * NOTE(review): nothing visible in this file shows the
                 * iterator refilling 'v' for END tokens - confirm against
                 * JsonbIteratorNext().
                 */
                if (!raw_scalar)
                    appendStringInfoChar(out, ']');
                first = false;
                break;
            case WJB_END_OBJECT:
                level--;

                appendStringInfoCharMacro(out, '}');
                first = false;
                break;
            default:
                elog(ERROR, "unknown flag of jsonb iterator");
        }
    }

    Assert(level == 0);

    return out->data;
}