postgresql/src/backend/utils/adt/varlena.c

6533 lines
164 KiB
C

/*-------------------------------------------------------------------------
*
* varlena.c
* Functions for the variable-length built-in types.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/varlena.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include <limits.h>
#include "access/detoast.h"
#include "access/toast_compression.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "common/unicode_norm.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "parser/scansup.h"
#include "port/pg_bswap.h"
#include "regex/regex.h"
#include "utils/builtins.h"
#include "utils/bytea.h"
#include "utils/guc.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
#include "utils/varlena.h"
/* GUC variable */
int bytea_output = BYTEA_OUTPUT_HEX;
typedef struct varlena VarString;
/*
* State for text_position_* functions.
*/
typedef struct
{
bool is_multibyte_char_in_char; /* need to check char boundaries? */
char *str1; /* haystack string */
char *str2; /* needle string */
int len1; /* string lengths in bytes */
int len2;
/* Skip table for Boyer-Moore-Horspool search algorithm: */
int skiptablemask; /* mask for ANDing with skiptable subscripts */
int skiptable[256]; /* skip distance for given mismatched char */
char *last_match; /* pointer to last match in 'str1' */
/*
* Sometimes we need to convert the byte position of a match to a
* character position. These store the last position that was converted,
* so that on the next call, we can continue from that point, rather than
* count characters from the very beginning.
*/
char *refpoint; /* pointer within original haystack string */
int refpos; /* 0-based character offset of the same point */
} TextPositionState;
typedef struct
{
char *buf1; /* 1st string, or abbreviation original string
* buf */
char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
int buflen1; /* Allocated length of buf1 */
int buflen2; /* Allocated length of buf2 */
int last_len1; /* Length of last buf1 string/strxfrm() input */
int last_len2; /* Length of last buf2 string/strxfrm() blob */
int last_returned; /* Last comparison result (cache) */
bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
bool collate_c;
Oid typid; /* Actual datatype (text/bpchar/bytea/name) */
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
hyperLogLogState full_card; /* Full key cardinality state */
double prop_card; /* Required cardinality proportion */
pg_locale_t locale;
} VarStringSortSupport;
/*
* Output data for split_text(): we output either to an array or a table.
* tupstore and tupdesc must be set up in advance to output to a table.
*/
typedef struct
{
ArrayBuildState *astate;
Tuplestorestate *tupstore;
TupleDesc tupdesc;
} SplitTextOutputData;
/*
* This should be large enough that most strings will fit, but small enough
* that we feel comfortable putting it on the stack
*/
#define TEXTBUFLEN 1024
#define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
int32 start,
int32 length,
bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static void check_collation_set(Oid collid);
static int text_cmp(text *arg1, text *arg2, Oid collid);
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
int S,
int L,
bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
static void appendStringInfoText(StringInfo str, const text *t);
static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
static void split_text_accum_result(SplitTextOutputData *tstate,
text *field_value,
text *null_string,
Oid collation);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
int *value);
static const char *text_format_parse_format(const char *start_ptr,
const char *end_ptr,
int *argpos, int *widthpos,
int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
FmgrInfo *typOutputInfo,
Datum value, bool isNull,
int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
int flags, int width);
/*****************************************************************************
* CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
*****************************************************************************/
/*
* cstring_to_text
*
* Create a text value from a null-terminated C string.
*
* The new text value is freshly palloc'd with a full-size VARHDR.
*/
text *
cstring_to_text(const char *s)
{
return cstring_to_text_with_len(s, strlen(s));
}
/*
* cstring_to_text_with_len
*
* Same as cstring_to_text except the caller specifies the string length;
* the string need not be null_terminated.
*/
text *
cstring_to_text_with_len(const char *s, int len)
{
text *result = (text *) palloc(len + VARHDRSZ);
SET_VARSIZE(result, len + VARHDRSZ);
memcpy(VARDATA(result), s, len);
return result;
}
/*
* text_to_cstring
*
* Create a palloc'd, null-terminated C string from a text value.
*
* We support being passed a compressed or toasted text value.
* This is a bit bogus since such values shouldn't really be referred to as
* "text *", but it seems useful for robustness. If we didn't handle that
* case here, we'd need another routine that did, anyway.
*/
char *
text_to_cstring(const text *t)
{
/* must cast away the const, unfortunately */
text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
int len = VARSIZE_ANY_EXHDR(tunpacked);
char *result;
result = (char *) palloc(len + 1);
memcpy(result, VARDATA_ANY(tunpacked), len);
result[len] = '\0';
if (tunpacked != t)
pfree(tunpacked);
return result;
}
/*
* text_to_cstring_buffer
*
* Copy a text value into a caller-supplied buffer of size dst_len.
*
* The text string is truncated if necessary to fit. The result is
* guaranteed null-terminated (unless dst_len == 0).
*
* We support being passed a compressed or toasted text value.
* This is a bit bogus since such values shouldn't really be referred to as
* "text *", but it seems useful for robustness. If we didn't handle that
* case here, we'd need another routine that did, anyway.
*/
void
text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
{
/* must cast away the const, unfortunately */
text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
if (dst_len > 0)
{
dst_len--;
if (dst_len >= src_len)
dst_len = src_len;
else /* ensure truncation is encoding-safe */
dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
dst[dst_len] = '\0';
}
if (srcunpacked != src)
pfree(srcunpacked);
}
/*****************************************************************************
* USER I/O ROUTINES *
*****************************************************************************/
#define VAL(CH) ((CH) - '0')
#define DIG(VAL) ((VAL) + '0')
/*
* byteain - converts from printable representation of byte array
*
* Non-printable characters must be passed as '\nnn' (octal) and are
* converted to internal form. '\' must be passed as '\\'.
* ereport(ERROR, ...) if bad form.
*
* BUGS:
* The input is scanned twice.
* The error checking of input is minimal.
*/
Datum
byteain(PG_FUNCTION_ARGS)
{
char *inputText = PG_GETARG_CSTRING(0);
Node *escontext = fcinfo->context;
char *tp;
char *rp;
int bc;
bytea *result;
/* Recognize hex input */
if (inputText[0] == '\\' && inputText[1] == 'x')
{
size_t len = strlen(inputText);
bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
result = palloc(bc);
bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
escontext);
SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
PG_RETURN_BYTEA_P(result);
}
/* Else, it's the traditional escaped style */
for (bc = 0, tp = inputText; *tp != '\0'; bc++)
{
if (tp[0] != '\\')
tp++;
else if ((tp[0] == '\\') &&
(tp[1] >= '0' && tp[1] <= '3') &&
(tp[2] >= '0' && tp[2] <= '7') &&
(tp[3] >= '0' && tp[3] <= '7'))
tp += 4;
else if ((tp[0] == '\\') &&
(tp[1] == '\\'))
tp += 2;
else
{
/*
* one backslash, not followed by another or ### valid octal
*/
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type %s", "bytea")));
}
}
bc += VARHDRSZ;
result = (bytea *) palloc(bc);
SET_VARSIZE(result, bc);
tp = inputText;
rp = VARDATA(result);
while (*tp != '\0')
{
if (tp[0] != '\\')
*rp++ = *tp++;
else if ((tp[0] == '\\') &&
(tp[1] >= '0' && tp[1] <= '3') &&
(tp[2] >= '0' && tp[2] <= '7') &&
(tp[3] >= '0' && tp[3] <= '7'))
{
bc = VAL(tp[1]);
bc <<= 3;
bc += VAL(tp[2]);
bc <<= 3;
*rp++ = bc + VAL(tp[3]);
tp += 4;
}
else if ((tp[0] == '\\') &&
(tp[1] == '\\'))
{
*rp++ = '\\';
tp += 2;
}
else
{
/*
* We should never get here. The first pass should not allow it.
*/
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type %s", "bytea")));
}
}
PG_RETURN_BYTEA_P(result);
}
/*
* byteaout - converts to printable representation of byte array
*
* In the traditional escaped format, non-printable characters are
* printed as '\nnn' (octal) and '\' as '\\'.
*/
Datum
byteaout(PG_FUNCTION_ARGS)
{
bytea *vlena = PG_GETARG_BYTEA_PP(0);
char *result;
char *rp;
if (bytea_output == BYTEA_OUTPUT_HEX)
{
/* Print hex format */
rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
*rp++ = '\\';
*rp++ = 'x';
rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
}
else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
{
/* Print traditional escaped format */
char *vp;
uint64 len;
int i;
len = 1; /* empty string has 1 char */
vp = VARDATA_ANY(vlena);
for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
{
if (*vp == '\\')
len += 2;
else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
len += 4;
else
len++;
}
/*
* In principle len can't overflow uint32 if the input fit in 1GB, but
* for safety let's check rather than relying on palloc's internal
* check.
*/
if (len > MaxAllocSize)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg_internal("result of bytea output conversion is too large")));
rp = result = (char *) palloc(len);
vp = VARDATA_ANY(vlena);
for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
{
if (*vp == '\\')
{
*rp++ = '\\';
*rp++ = '\\';
}
else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
{
int val; /* holds unprintable chars */
val = *vp;
rp[0] = '\\';
rp[3] = DIG(val & 07);
val >>= 3;
rp[2] = DIG(val & 07);
val >>= 3;
rp[1] = DIG(val & 03);
rp += 4;
}
else
*rp++ = *vp;
}
}
else
{
elog(ERROR, "unrecognized bytea_output setting: %d",
bytea_output);
rp = result = NULL; /* keep compiler quiet */
}
*rp = '\0';
PG_RETURN_CSTRING(result);
}
/*
* bytearecv - converts external binary format to bytea
*/
Datum
bytearecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
bytea *result;
int nbytes;
nbytes = buf->len - buf->cursor;
result = (bytea *) palloc(nbytes + VARHDRSZ);
SET_VARSIZE(result, nbytes + VARHDRSZ);
pq_copymsgbytes(buf, VARDATA(result), nbytes);
PG_RETURN_BYTEA_P(result);
}
/*
* byteasend - converts bytea to binary format
*
* This is a special case: just copy the input...
*/
Datum
byteasend(PG_FUNCTION_ARGS)
{
bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
PG_RETURN_BYTEA_P(vlena);
}
Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)
{
StringInfo state;
state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
/* Append the value unless null, preceding it with the delimiter. */
if (!PG_ARGISNULL(1))
{
bytea *value = PG_GETARG_BYTEA_PP(1);
bool isfirst = false;
/*
* You might think we can just throw away the first delimiter, however
* we must keep it as we may be a parallel worker doing partial
* aggregation building a state to send to the main process. We need
* to keep the delimiter of every aggregation so that the combine
* function can properly join up the strings of two separately
* partially aggregated results. The first delimiter is only stripped
* off in the final function. To know how much to strip off the front
* of the string, we store the length of the first delimiter in the
* StringInfo's cursor field, which we don't otherwise need here.
*/
if (state == NULL)
{
state = makeStringAggState(fcinfo);
isfirst = true;
}
if (!PG_ARGISNULL(2))
{
bytea *delim = PG_GETARG_BYTEA_PP(2);
appendBinaryStringInfo(state, VARDATA_ANY(delim),
VARSIZE_ANY_EXHDR(delim));
if (isfirst)
state->cursor = VARSIZE_ANY_EXHDR(delim);
}
appendBinaryStringInfo(state, VARDATA_ANY(value),
VARSIZE_ANY_EXHDR(value));
}
/*
* The transition type for string_agg() is declared to be "internal",
* which is a pass-by-value type the same size as a pointer.
*/
if (state)
PG_RETURN_POINTER(state);
PG_RETURN_NULL();
}
Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
{
StringInfo state;
/* cannot be called directly because of internal-type argument */
Assert(AggCheckCallContext(fcinfo, NULL));
state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
if (state != NULL)
{
/* As per comment in transfn, strip data before the cursor position */
bytea *result;
int strippedlen = state->len - state->cursor;
result = (bytea *) palloc(strippedlen + VARHDRSZ);
SET_VARSIZE(result, strippedlen + VARHDRSZ);
memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
PG_RETURN_BYTEA_P(result);
}
else
PG_RETURN_NULL();
}
/*
* textin - converts "..." to internal representation
*/
Datum
textin(PG_FUNCTION_ARGS)
{
char *inputText = PG_GETARG_CSTRING(0);
PG_RETURN_TEXT_P(cstring_to_text(inputText));
}
/*
* textout - converts internal representation to "..."
*/
Datum
textout(PG_FUNCTION_ARGS)
{
Datum txt = PG_GETARG_DATUM(0);
PG_RETURN_CSTRING(TextDatumGetCString(txt));
}
/*
* textrecv - converts external binary format to text
*/
Datum
textrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
text *result;
char *str;
int nbytes;
str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
result = cstring_to_text_with_len(str, nbytes);
pfree(str);
PG_RETURN_TEXT_P(result);
}
/*
* textsend - converts text to binary format
*/
Datum
textsend(PG_FUNCTION_ARGS)
{
text *t = PG_GETARG_TEXT_PP(0);
StringInfoData buf;
pq_begintypsend(&buf);
pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
/*
* unknownin - converts "..." to internal representation
*/
Datum
unknownin(PG_FUNCTION_ARGS)
{
char *str = PG_GETARG_CSTRING(0);
/* representation is same as cstring */
PG_RETURN_CSTRING(pstrdup(str));
}
/*
* unknownout - converts internal representation to "..."
*/
Datum
unknownout(PG_FUNCTION_ARGS)
{
/* representation is same as cstring */
char *str = PG_GETARG_CSTRING(0);
PG_RETURN_CSTRING(pstrdup(str));
}
/*
* unknownrecv - converts external binary format to unknown
*/
Datum
unknownrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
char *str;
int nbytes;
str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
/* representation is same as cstring */
PG_RETURN_CSTRING(str);
}
/*
* unknownsend - converts unknown to binary format
*/
Datum
unknownsend(PG_FUNCTION_ARGS)
{
/* representation is same as cstring */
char *str = PG_GETARG_CSTRING(0);
StringInfoData buf;
pq_begintypsend(&buf);
pq_sendtext(&buf, str, strlen(str));
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
/* ========== PUBLIC ROUTINES ========== */
/*
* textlen -
* returns the logical length of a text*
* (which is less than the VARSIZE of the text*)
*/
Datum
textlen(PG_FUNCTION_ARGS)
{
Datum str = PG_GETARG_DATUM(0);
/* try to avoid decompressing argument */
PG_RETURN_INT32(text_length(str));
}
/*
* text_length -
* Does the real work for textlen()
*
* This is broken out so it can be called directly by other string processing
* functions. Note that the argument is passed as a Datum, to indicate that
* it may still be in compressed form. We can avoid decompressing it at all
* in some cases.
*/
static int32
text_length(Datum str)
{
/* fastpath when max encoding length is one */
if (pg_database_encoding_max_length() == 1)
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
else
{
text *t = DatumGetTextPP(str);
PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
VARSIZE_ANY_EXHDR(t)));
}
}
/*
* textoctetlen -
* returns the physical length of a text*
* (which is less than the VARSIZE of the text*)
*/
Datum
textoctetlen(PG_FUNCTION_ARGS)
{
Datum str = PG_GETARG_DATUM(0);
/* We need not detoast the input at all */
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
}
/*
* textcat -
* takes two text* and returns a text* that is the concatenation of
* the two.
*
* Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
* Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
* Allocate space for output in all cases.
* XXX - thomas 1997-07-10
*/
Datum
textcat(PG_FUNCTION_ARGS)
{
text *t1 = PG_GETARG_TEXT_PP(0);
text *t2 = PG_GETARG_TEXT_PP(1);
PG_RETURN_TEXT_P(text_catenate(t1, t2));
}
/*
* text_catenate
* Guts of textcat(), broken out so it can be used by other functions
*
* Arguments can be in short-header form, but not compressed or out-of-line
*/
static text *
text_catenate(text *t1, text *t2)
{
text *result;
int len1,
len2,
len;
char *ptr;
len1 = VARSIZE_ANY_EXHDR(t1);
len2 = VARSIZE_ANY_EXHDR(t2);
/* paranoia ... probably should throw error instead? */
if (len1 < 0)
len1 = 0;
if (len2 < 0)
len2 = 0;
len = len1 + len2 + VARHDRSZ;
result = (text *) palloc(len);
/* Set size of result string... */
SET_VARSIZE(result, len);
/* Fill data field of result string... */
ptr = VARDATA(result);
if (len1 > 0)
memcpy(ptr, VARDATA_ANY(t1), len1);
if (len2 > 0)
memcpy(ptr + len1, VARDATA_ANY(t2), len2);
return result;
}
/*
* charlen_to_bytelen()
* Compute the number of bytes occupied by n characters starting at *p
*
* It is caller's responsibility that there actually are n characters;
* the string need not be null-terminated.
*/
static int
charlen_to_bytelen(const char *p, int n)
{
if (pg_database_encoding_max_length() == 1)
{
/* Optimization for single-byte encodings */
return n;
}
else
{
const char *s;
for (s = p; n > 0; n--)
s += pg_mblen(s);
return s - p;
}
}
/*
* text_substr()
* Return a substring starting at the specified position.
* - thomas 1997-12-31
*
* Input:
* - string
* - starting position (is one-based)
* - string length
*
* If the starting position is zero or less, then return from the start of the string
* adjusting the length to be consistent with the "negative start" per SQL.
* If the length is less than zero, return the remaining string.
*
* Added multibyte support.
* - Tatsuo Ishii 1998-4-21
* Changed behavior if starting position is less than one to conform to SQL behavior.
* Formerly returned the entire string; now returns a portion.
* - Thomas Lockhart 1998-12-10
* Now uses faster TOAST-slicing interface
* - John Gray 2002-02-22
* Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
* behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
* error; if E < 1, return '', not entire string). Fixed MB related bug when
* S > LC and < LC + 4 sometimes garbage characters are returned.
* - Joe Conway 2002-08-10
*/
Datum
text_substr(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
PG_GETARG_INT32(1),
PG_GETARG_INT32(2),
false));
}
/*
* text_substr_no_len -
* Wrapper to avoid opr_sanity failure due to
* one function accepting a different number of args.
*/
Datum
text_substr_no_len(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
PG_GETARG_INT32(1),
-1, true));
}
/*
* text_substring -
* Does the real work for text_substr() and text_substr_no_len()
*
* This is broken out so it can be called directly by other string processing
* functions. Note that the argument is passed as a Datum, to indicate that
* it may still be in compressed/toasted form. We can avoid detoasting all
* of it in some cases.
*
* The result is always a freshly palloc'd datum.
*/
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
int32 eml = pg_database_encoding_max_length();
int32 S = start; /* start position */
int32 S1; /* adjusted start position */
int32 L1; /* adjusted substring length */
int32 E; /* end position */
/*
* SQL99 says S can be zero or negative, but we still must fetch from the
* start of the string.
*/
S1 = Max(S, 1);
/* life is easy if the encoding max length is 1 */
if (eml == 1)
{
if (length_not_specified) /* special case - get length to end of
* string */
L1 = -1;
else if (length < 0)
{
/* SQL99 says to throw an error for E < S, i.e., negative length */
ereport(ERROR,
(errcode(ERRCODE_SUBSTRING_ERROR),
errmsg("negative substring length not allowed")));
L1 = -1; /* silence stupider compilers */
}
else if (pg_add_s32_overflow(S, length, &E))
{
/*
* L could be large enough for S + L to overflow, in which case
* the substring must run to end of string.
*/
L1 = -1;
}
else
{
/*
* A zero or negative value for the end position can happen if the
* start was negative or one. SQL99 says to return a zero-length
* string.
*/
if (E < 1)
return cstring_to_text("");
L1 = E - S1;
}
/*
* If the start position is past the end of the string, SQL99 says to
* return a zero-length string -- DatumGetTextPSlice() will do that
* for us. We need only convert S1 to zero-based starting position.
*/
return DatumGetTextPSlice(str, S1 - 1, L1);
}
else if (eml > 1)
{
/*
* When encoding max length is > 1, we can't get LC without
* detoasting, so we'll grab a conservatively large slice now and go
* back later to do the right thing
*/
int32 slice_start;
int32 slice_size;
int32 slice_strlen;
text *slice;
int32 E1;
int32 i;
char *p;
char *s;
text *ret;
/*
* We need to start at position zero because there is no way to know
* in advance which byte offset corresponds to the supplied start
* position.
*/
slice_start = 0;
if (length_not_specified) /* special case - get length to end of
* string */
slice_size = L1 = -1;
else if (length < 0)
{
/* SQL99 says to throw an error for E < S, i.e., negative length */
ereport(ERROR,
(errcode(ERRCODE_SUBSTRING_ERROR),
errmsg("negative substring length not allowed")));
slice_size = L1 = -1; /* silence stupider compilers */
}
else if (pg_add_s32_overflow(S, length, &E))
{
/*
* L could be large enough for S + L to overflow, in which case
* the substring must run to end of string.
*/
slice_size = L1 = -1;
}
else
{
/*
* A zero or negative value for the end position can happen if the
* start was negative or one. SQL99 says to return a zero-length
* string.
*/
if (E < 1)
return cstring_to_text("");
/*
* if E is past the end of the string, the tuple toaster will
* truncate the length for us
*/
L1 = E - S1;
/*
* Total slice size in bytes can't be any longer than the start
* position plus substring length times the encoding max length.
* If that overflows, we can just use -1.
*/
if (pg_mul_s32_overflow(E, eml, &slice_size))
slice_size = -1;
}
/*
* If we're working with an untoasted source, no need to do an extra
* copying step.
*/
if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
VARATT_IS_EXTERNAL(DatumGetPointer(str)))
slice = DatumGetTextPSlice(str, slice_start, slice_size);
else
slice = (text *) DatumGetPointer(str);
/* see if we got back an empty string */
if (VARSIZE_ANY_EXHDR(slice) == 0)
{
if (slice != (text *) DatumGetPointer(str))
pfree(slice);
return cstring_to_text("");
}
/* Now we can get the actual length of the slice in MB characters */
slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
VARSIZE_ANY_EXHDR(slice));
/*
* Check that the start position wasn't > slice_strlen. If so, SQL99
* says to return a zero-length string.
*/
if (S1 > slice_strlen)
{
if (slice != (text *) DatumGetPointer(str))
pfree(slice);
return cstring_to_text("");
}
/*
* Adjust L1 and E1 now that we know the slice string length. Again
* remember that S1 is one based, and slice_start is zero based.
*/
if (L1 > -1)
E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
else
E1 = slice_start + 1 + slice_strlen;
/*
* Find the start position in the slice; remember S1 is not zero based
*/
p = VARDATA_ANY(slice);
for (i = 0; i < S1 - 1; i++)
p += pg_mblen(p);
/* hang onto a pointer to our start position */
s = p;
/*
* Count the actual bytes used by the substring of the requested
* length.
*/
for (i = S1; i < E1; i++)
p += pg_mblen(p);
ret = (text *) palloc(VARHDRSZ + (p - s));
SET_VARSIZE(ret, VARHDRSZ + (p - s));
memcpy(VARDATA(ret), s, (p - s));
if (slice != (text *) DatumGetPointer(str))
pfree(slice);
return ret;
}
else
elog(ERROR, "invalid backend encoding: encoding max length < 1");
/* not reached: suppress compiler warning */
return NULL;
}
/*
* textoverlay
* Replace specified substring of first string with second
*
* The SQL standard defines OVERLAY() in terms of substring and concatenation.
* This code is a direct implementation of what the standard says.
*/
Datum
textoverlay(PG_FUNCTION_ARGS)
{
text *t1 = PG_GETARG_TEXT_PP(0);
text *t2 = PG_GETARG_TEXT_PP(1);
int sp = PG_GETARG_INT32(2); /* substring start position */
int sl = PG_GETARG_INT32(3); /* substring length */
PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
}
Datum
textoverlay_no_len(PG_FUNCTION_ARGS)
{
text *t1 = PG_GETARG_TEXT_PP(0);
text *t2 = PG_GETARG_TEXT_PP(1);
int sp = PG_GETARG_INT32(2); /* substring start position */
int sl;
sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
}
static text *
text_overlay(text *t1, text *t2, int sp, int sl)
{
text *result;
text *s1;
text *s2;
int sp_pl_sl;
/*
* Check for possible integer-overflow cases. For negative sp, throw a
* "substring length" error because that's what should be expected
* according to the spec's definition of OVERLAY().
*/
if (sp <= 0)
ereport(ERROR,
(errcode(ERRCODE_SUBSTRING_ERROR),
errmsg("negative substring length not allowed")));
if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("integer out of range")));
s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
result = text_catenate(s1, t2);
result = text_catenate(result, s2);
return result;
}
/*
* textpos -
* Return the position of the specified substring.
* Implements the SQL POSITION() function.
* Ref: A Guide To The SQL Standard, Date & Darwen, 1997
* - thomas 1997-07-27
*/
Datum
textpos(PG_FUNCTION_ARGS)
{
text *str = PG_GETARG_TEXT_PP(0);
text *search_str = PG_GETARG_TEXT_PP(1);
PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
}
/*
* text_position -
* Does the real work for textpos()
*
* Inputs:
* t1 - string to be searched
* t2 - pattern to match within t1
* Result:
* Character index of the first matched char, starting from 1,
* or 0 if no match.
*
* This is broken out so it can be called directly by other string processing
* functions.
*/
static int
text_position(text *t1, text *t2, Oid collid)
{
TextPositionState state;
int result;
/* Empty needle always matches at position 1 */
if (VARSIZE_ANY_EXHDR(t2) < 1)
return 1;
/* Otherwise, can't match if haystack is shorter than needle */
if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
return 0;
text_position_setup(t1, t2, collid, &state);
if (!text_position_next(&state))
result = 0;
else
result = text_position_get_match_pos(&state);
text_position_cleanup(&state);
return result;
}
/*
* text_position_setup, text_position_next, text_position_cleanup -
* Component steps of text_position()
*
* These are broken out so that a string can be efficiently searched for
* multiple occurrences of the same pattern. text_position_next may be
* called multiple times, and it advances to the next match on each call.
* text_position_get_match_ptr() and text_position_get_match_pos() return
* a pointer or 1-based character position of the last match, respectively.
*
* The "state" variable is normally just a local variable in the caller.
*
* NOTE: text_position_next skips over the matched portion. For example,
* searching for "xx" in "xxx" returns only one match, not two.
*/
static void
text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
{
int len1 = VARSIZE_ANY_EXHDR(t1);
int len2 = VARSIZE_ANY_EXHDR(t2);
pg_locale_t mylocale = 0;
check_collation_set(collid);
if (!lc_collate_is_c(collid))
mylocale = pg_newlocale_from_collation(collid);
if (!pg_locale_deterministic(mylocale))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("nondeterministic collations are not supported for substring searches")));
Assert(len1 > 0);
Assert(len2 > 0);
/*
* Even with a multi-byte encoding, we perform the search using the raw
* byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
* because in UTF-8 the byte sequence of one character cannot contain
* another character. For other multi-byte encodings, we do the search
* initially as a simple byte search, ignoring multibyte issues, but
* verify afterwards that the match we found is at a character boundary,
* and continue the search if it was a false match.
*/
if (pg_database_encoding_max_length() == 1)
state->is_multibyte_char_in_char = false;
else if (GetDatabaseEncoding() == PG_UTF8)
state->is_multibyte_char_in_char = false;
else
state->is_multibyte_char_in_char = true;
state->str1 = VARDATA_ANY(t1);
state->str2 = VARDATA_ANY(t2);
state->len1 = len1;
state->len2 = len2;
state->last_match = NULL;
state->refpoint = state->str1;
state->refpos = 0;
/*
* Prepare the skip table for Boyer-Moore-Horspool searching. In these
* notes we use the terminology that the "haystack" is the string to be
* searched (t1) and the "needle" is the pattern being sought (t2).
*
* If the needle is empty or bigger than the haystack then there is no
* point in wasting cycles initializing the table. We also choose not to
* use B-M-H for needles of length 1, since the skip table can't possibly
* save anything in that case.
*/
if (len1 >= len2 && len2 > 1)
{
int searchlength = len1 - len2;
int skiptablemask;
int last;
int i;
const char *str2 = state->str2;
/*
* First we must determine how much of the skip table to use. The
* declaration of TextPositionState allows up to 256 elements, but for
* short search problems we don't really want to have to initialize so
* many elements --- it would take too long in comparison to the
* actual search time. So we choose a useful skip table size based on
* the haystack length minus the needle length. The closer the needle
* length is to the haystack length the less useful skipping becomes.
*
* Note: since we use bit-masking to select table elements, the skip
* table size MUST be a power of 2, and so the mask must be 2^N-1.
*/
if (searchlength < 16)
skiptablemask = 3;
else if (searchlength < 64)
skiptablemask = 7;
else if (searchlength < 128)
skiptablemask = 15;
else if (searchlength < 512)
skiptablemask = 31;
else if (searchlength < 2048)
skiptablemask = 63;
else if (searchlength < 4096)
skiptablemask = 127;
else
skiptablemask = 255;
state->skiptablemask = skiptablemask;
/*
* Initialize the skip table. We set all elements to the needle
* length, since this is the correct skip distance for any character
* not found in the needle.
*/
for (i = 0; i <= skiptablemask; i++)
state->skiptable[i] = len2;
/*
* Now examine the needle. For each character except the last one,
* set the corresponding table element to the appropriate skip
* distance. Note that when two characters share the same skip table
* entry, the one later in the needle must determine the skip
* distance.
*/
last = len2 - 1;
for (i = 0; i < last; i++)
state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
}
}
/*
* Advance to the next match, starting from the end of the previous match
* (or the beginning of the string, on first call). Returns true if a match
* is found.
*
* Note that this refuses to match an empty-string needle. Most callers
* will have handled that case specially and we'll never see it here.
*/
static bool
text_position_next(TextPositionState *state)
{
int needle_len = state->len2;
char *start_ptr;
char *matchptr;
if (needle_len <= 0)
return false; /* result for empty pattern */
/* Start from the point right after the previous match. */
if (state->last_match)
start_ptr = state->last_match + needle_len;
else
start_ptr = state->str1;
retry:
matchptr = text_position_next_internal(start_ptr, state);
if (!matchptr)
return false;
/*
* Found a match for the byte sequence. If this is a multibyte encoding,
* where one character's byte sequence can appear inside a longer
* multi-byte character, we need to verify that the match was at a
* character boundary, not in the middle of a multi-byte character.
*/
if (state->is_multibyte_char_in_char)
{
/* Walk one character at a time, until we reach the match. */
/* the search should never move backwards. */
Assert(state->refpoint <= matchptr);
while (state->refpoint < matchptr)
{
/* step to next character. */
state->refpoint += pg_mblen(state->refpoint);
state->refpos++;
/*
* If we stepped over the match's start position, then it was a
* false positive, where the byte sequence appeared in the middle
* of a multi-byte character. Skip it, and continue the search at
* the next character boundary.
*/
if (state->refpoint > matchptr)
{
start_ptr = state->refpoint;
goto retry;
}
}
}
state->last_match = matchptr;
return true;
}
/*
* Subroutine of text_position_next(). This searches for the raw byte
* sequence, ignoring any multi-byte encoding issues. Returns the first
* match starting at 'start_ptr', or NULL if no match is found.
*/
static char *
text_position_next_internal(char *start_ptr, TextPositionState *state)
{
int haystack_len = state->len1;
int needle_len = state->len2;
int skiptablemask = state->skiptablemask;
const char *haystack = state->str1;
const char *needle = state->str2;
const char *haystack_end = &haystack[haystack_len];
const char *hptr;
Assert(start_ptr >= haystack && start_ptr <= haystack_end);
if (needle_len == 1)
{
/* No point in using B-M-H for a one-character needle */
char nchar = *needle;
hptr = start_ptr;
while (hptr < haystack_end)
{
if (*hptr == nchar)
return (char *) hptr;
hptr++;
}
}
else
{
const char *needle_last = &needle[needle_len - 1];
/* Start at startpos plus the length of the needle */
hptr = start_ptr + needle_len - 1;
while (hptr < haystack_end)
{
/* Match the needle scanning *backward* */
const char *nptr;
const char *p;
nptr = needle_last;
p = hptr;
while (*nptr == *p)
{
/* Matched it all? If so, return 1-based position */
if (nptr == needle)
return (char *) p;
nptr--, p--;
}
/*
* No match, so use the haystack char at hptr to decide how far to
* advance. If the needle had any occurrence of that character
* (or more precisely, one sharing the same skiptable entry)
* before its last character, then we advance far enough to align
* the last such needle character with that haystack position.
* Otherwise we can advance by the whole needle length.
*/
hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
}
}
return 0; /* not found */
}
/*
* Return a pointer to the current match.
*
* The returned pointer points into the original haystack string.
*/
static char *
text_position_get_match_ptr(TextPositionState *state)
{
return state->last_match;
}
/*
* Return the offset of the current match.
*
* The offset is in characters, 1-based.
*/
static int
text_position_get_match_pos(TextPositionState *state)
{
/* Convert the byte position to char position. */
state->refpos += pg_mbstrlen_with_len(state->refpoint,
state->last_match - state->refpoint);
state->refpoint = state->last_match;
return state->refpos + 1;
}
/*
* Reset search state to the initial state installed by text_position_setup.
*
* The next call to text_position_next will search from the beginning
* of the string.
*/
static void
text_position_reset(TextPositionState *state)
{
state->last_match = NULL;
state->refpoint = state->str1;
state->refpos = 0;
}
static void
text_position_cleanup(TextPositionState *state)
{
/* no cleanup needed */
}
static void
check_collation_set(Oid collid)
{
if (!OidIsValid(collid))
{
/*
* This typically means that the parser could not resolve a conflict
* of implicit collations, so report it that way.
*/
ereport(ERROR,
(errcode(ERRCODE_INDETERMINATE_COLLATION),
errmsg("could not determine which collation to use for string comparison"),
errhint("Use the COLLATE clause to set the collation explicitly.")));
}
}
/* varstr_cmp()
* Comparison function for text strings with given lengths.
* Includes locale support, but must copy strings to temporary memory
* to allow null-termination for inputs to strcoll().
* Returns an integer less than, equal to, or greater than zero, indicating
* whether arg1 is less than, equal to, or greater than arg2.
*
* Note: many functions that depend on this are marked leakproof; therefore,
* avoid reporting the actual contents of the input when throwing errors.
* All errors herein should be things that can't happen except on corrupt
* data, anyway; otherwise we will have trouble with indexing strings that
* would cause them.
*/
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
int result;
check_collation_set(collid);
/*
* Unfortunately, there is no strncoll(), so in the non-C locale case we
* have to do some memory copying. This turns out to be significantly
* slower, so we optimize the case where LC_COLLATE is C. We also try to
* optimize relatively-short strings by avoiding palloc/pfree overhead.
*/
if (lc_collate_is_c(collid))
{
result = memcmp(arg1, arg2, Min(len1, len2));
if ((result == 0) && (len1 != len2))
result = (len1 < len2) ? -1 : 1;
}
else
{
pg_locale_t mylocale;
mylocale = pg_newlocale_from_collation(collid);
/*
* memcmp() can't tell us which of two unequal strings sorts first,
* but it's a cheap way to tell if they're equal. Testing shows that
* memcmp() followed by strcoll() is only trivially slower than
* strcoll() by itself, so we don't lose much if this doesn't work out
* very often, and if it does - for example, because there are many
* equal strings in the input - then we win big by avoiding expensive
* collation-aware comparisons.
*/
if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
return 0;
result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
/* Break tie if necessary. */
if (result == 0 && pg_locale_deterministic(mylocale))
{
result = memcmp(arg1, arg2, Min(len1, len2));
if ((result == 0) && (len1 != len2))
result = (len1 < len2) ? -1 : 1;
}
}
return result;
}
/* text_cmp()
* Internal comparison function for text strings.
* Returns -1, 0 or 1
*/
static int
text_cmp(text *arg1, text *arg2, Oid collid)
{
char *a1p,
*a2p;
int len1,
len2;
a1p = VARDATA_ANY(arg1);
a2p = VARDATA_ANY(arg2);
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
return varstr_cmp(a1p, len1, a2p, len2, collid);
}
/*
* Comparison functions for text strings.
*
* Note: btree indexes need these routines not to leak memory; therefore,
* be careful to free working copies of toasted datums. Most places don't
* need to be so careful.
*/
Datum
texteq(PG_FUNCTION_ARGS)
{
Oid collid = PG_GET_COLLATION();
bool locale_is_c = false;
pg_locale_t mylocale = 0;
bool result;
check_collation_set(collid);
if (lc_collate_is_c(collid))
locale_is_c = true;
else
mylocale = pg_newlocale_from_collation(collid);
if (locale_is_c || pg_locale_deterministic(mylocale))
{
Datum arg1 = PG_GETARG_DATUM(0);
Datum arg2 = PG_GETARG_DATUM(1);
Size len1,
len2;
/*
* Since we only care about equality or not-equality, we can avoid all
* the expense of strcoll() here, and just do bitwise comparison. In
* fact, we don't even have to do a bitwise comparison if we can show
* the lengths of the strings are unequal; which might save us from
* having to detoast one or both values.
*/
len1 = toast_raw_datum_size(arg1);
len2 = toast_raw_datum_size(arg2);
if (len1 != len2)
result = false;
else
{
text *targ1 = DatumGetTextPP(arg1);
text *targ2 = DatumGetTextPP(arg2);
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
len1 - VARHDRSZ) == 0);
PG_FREE_IF_COPY(targ1, 0);
PG_FREE_IF_COPY(targ2, 1);
}
}
else
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
result = (text_cmp(arg1, arg2, collid) == 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
}
PG_RETURN_BOOL(result);
}
Datum
textne(PG_FUNCTION_ARGS)
{
Oid collid = PG_GET_COLLATION();
bool locale_is_c = false;
pg_locale_t mylocale = 0;
bool result;
check_collation_set(collid);
if (lc_collate_is_c(collid))
locale_is_c = true;
else
mylocale = pg_newlocale_from_collation(collid);
if (locale_is_c || pg_locale_deterministic(mylocale))
{
Datum arg1 = PG_GETARG_DATUM(0);
Datum arg2 = PG_GETARG_DATUM(1);
Size len1,
len2;
/* See comment in texteq() */
len1 = toast_raw_datum_size(arg1);
len2 = toast_raw_datum_size(arg2);
if (len1 != len2)
result = true;
else
{
text *targ1 = DatumGetTextPP(arg1);
text *targ2 = DatumGetTextPP(arg2);
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
len1 - VARHDRSZ) != 0);
PG_FREE_IF_COPY(targ1, 0);
PG_FREE_IF_COPY(targ2, 1);
}
}
else
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
result = (text_cmp(arg1, arg2, collid) != 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
}
PG_RETURN_BOOL(result);
}
Datum
text_lt(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
bool result;
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
text_le(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
bool result;
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
text_gt(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
bool result;
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
text_ge(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
bool result;
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
text_starts_with(PG_FUNCTION_ARGS)
{
Datum arg1 = PG_GETARG_DATUM(0);
Datum arg2 = PG_GETARG_DATUM(1);
Oid collid = PG_GET_COLLATION();
pg_locale_t mylocale = 0;
bool result;
Size len1,
len2;
check_collation_set(collid);
if (!lc_collate_is_c(collid))
mylocale = pg_newlocale_from_collation(collid);
if (!pg_locale_deterministic(mylocale))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("nondeterministic collations are not supported for substring searches")));
len1 = toast_raw_datum_size(arg1);
len2 = toast_raw_datum_size(arg2);
if (len2 > len1)
result = false;
else
{
text *targ1 = text_substring(arg1, 1, len2, false);
text *targ2 = DatumGetTextPP(arg2);
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
VARSIZE_ANY_EXHDR(targ2)) == 0);
PG_FREE_IF_COPY(targ1, 0);
PG_FREE_IF_COPY(targ2, 1);
}
PG_RETURN_BOOL(result);
}
Datum
bttextcmp(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int32 result;
result = text_cmp(arg1, arg2, PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(result);
}
Datum
bttextsortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
Oid collid = ssup->ssup_collation;
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport */
varstr_sortsupport(ssup, TEXTOID, collid);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}
/*
* Generic sortsupport interface for character type's operator classes.
* Includes locale support, and support for BpChar semantics (i.e. removing
* trailing spaces before comparison).
*
* Relies on the assumption that text, VarChar, BpChar, and bytea all have the
* same representation. Callers that always use the C collation (e.g.
* non-collatable type callers like bytea) may have NUL bytes in their strings;
* this will not work with any other collation, though.
*/
void
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
{
bool abbreviate = ssup->abbreviate;
bool collate_c = false;
VarStringSortSupport *sss;
pg_locale_t locale = 0;
check_collation_set(collid);
/*
* If possible, set ssup->comparator to a function which can be used to
* directly compare two datums. If we can do this, we'll avoid the
* overhead of a trip through the fmgr layer for every comparison, which
* can be substantial.
*
* Most typically, we'll set the comparator to varlenafastcmp_locale,
* which uses strcoll() to perform comparisons. We use that for the
* BpChar case too, but type NAME uses namefastcmp_locale. However, if
* LC_COLLATE = C, we can make things quite a bit faster with
* varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
* memcmp() rather than strcoll().
*/
if (lc_collate_is_c(collid))
{
if (typid == BPCHAROID)
ssup->comparator = bpcharfastcmp_c;
else if (typid == NAMEOID)
{
ssup->comparator = namefastcmp_c;
/* Not supporting abbreviation with type NAME, for now */
abbreviate = false;
}
else
ssup->comparator = varstrfastcmp_c;
collate_c = true;
}
else
{
/*
* We need a collation-sensitive comparison. To make things faster,
* we'll figure out the collation based on the locale id and cache the
* result.
*/
locale = pg_newlocale_from_collation(collid);
/*
* We use varlenafastcmp_locale except for type NAME.
*/
if (typid == NAMEOID)
{
ssup->comparator = namefastcmp_locale;
/* Not supporting abbreviation with type NAME, for now */
abbreviate = false;
}
else
ssup->comparator = varlenafastcmp_locale;
}
/*
* Unfortunately, it seems that abbreviation for non-C collations is
* broken on many common platforms; see pg_strxfrm_enabled().
*
* Even apart from the risk of broken locales, it's possible that there
* are platforms where the use of abbreviated keys should be disabled at
* compile time. Having only 4 byte datums could make worst-case
* performance drastically more likely, for example. Moreover, macOS's
* strxfrm() implementation is known to not effectively concentrate a
* significant amount of entropy from the original string in earlier
* transformed blobs. It's possible that other supported platforms are
* similarly encumbered. So, if we ever get past disabling this
* categorically, we may still want or need to disable it for particular
* platforms.
*/
if (!collate_c && !pg_strxfrm_enabled(locale))
abbreviate = false;
/*
* If we're using abbreviated keys, or if we're using a locale-aware
* comparison, we need to initialize a VarStringSortSupport object. Both
* cases will make use of the temporary buffers we initialize here for
* scratch space (and to detect requirement for BpChar semantics from
* caller), and the abbreviation case requires additional state.
*/
if (abbreviate || !collate_c)
{
sss = palloc(sizeof(VarStringSortSupport));
sss->buf1 = palloc(TEXTBUFLEN);
sss->buflen1 = TEXTBUFLEN;
sss->buf2 = palloc(TEXTBUFLEN);
sss->buflen2 = TEXTBUFLEN;
/* Start with invalid values */
sss->last_len1 = -1;
sss->last_len2 = -1;
/* Initialize */
sss->last_returned = 0;
sss->locale = locale;
/*
* To avoid somehow confusing a strxfrm() blob and an original string,
* constantly keep track of the variety of data that buf1 and buf2
* currently contain.
*
* Comparisons may be interleaved with conversion calls. Frequently,
* conversions and comparisons are batched into two distinct phases,
* but the correctness of caching cannot hinge upon this. For
* comparison caching, buffer state is only trusted if cache_blob is
* found set to false, whereas strxfrm() caching only trusts the state
* when cache_blob is found set to true.
*
* Arbitrarily initialize cache_blob to true.
*/
sss->cache_blob = true;
sss->collate_c = collate_c;
sss->typid = typid;
ssup->ssup_extra = sss;
/*
* If possible, plan to use the abbreviated keys optimization. The
* core code may switch back to authoritative comparator should
* abbreviation be aborted.
*/
if (abbreviate)
{
sss->prop_card = 0.20;
initHyperLogLog(&sss->abbr_card, 10);
initHyperLogLog(&sss->full_card, 10);
ssup->abbrev_full_comparator = ssup->comparator;
ssup->comparator = ssup_datum_unsigned_cmp;
ssup->abbrev_converter = varstr_abbrev_convert;
ssup->abbrev_abort = varstr_abbrev_abort;
}
}
}
/*
* sortsupport comparison func (for C locale case)
*/
static int
varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
{
VarString *arg1 = DatumGetVarStringPP(x);
VarString *arg2 = DatumGetVarStringPP(y);
char *a1p,
*a2p;
int len1,
len2,
result;
a1p = VARDATA_ANY(arg1);
a2p = VARDATA_ANY(arg2);
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
result = memcmp(a1p, a2p, Min(len1, len2));
if ((result == 0) && (len1 != len2))
result = (len1 < len2) ? -1 : 1;
/* We can't afford to leak memory here. */
if (PointerGetDatum(arg1) != x)
pfree(arg1);
if (PointerGetDatum(arg2) != y)
pfree(arg2);
return result;
}
/*
* sortsupport comparison func (for BpChar C locale case)
*
* BpChar outsources its sortsupport to this module. Specialization for the
* varstr_sortsupport BpChar case, modeled on
* internal_bpchar_pattern_compare().
*/
static int
bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
{
BpChar *arg1 = DatumGetBpCharPP(x);
BpChar *arg2 = DatumGetBpCharPP(y);
char *a1p,
*a2p;
int len1,
len2,
result;
a1p = VARDATA_ANY(arg1);
a2p = VARDATA_ANY(arg2);
len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
result = memcmp(a1p, a2p, Min(len1, len2));
if ((result == 0) && (len1 != len2))
result = (len1 < len2) ? -1 : 1;
/* We can't afford to leak memory here. */
if (PointerGetDatum(arg1) != x)
pfree(arg1);
if (PointerGetDatum(arg2) != y)
pfree(arg2);
return result;
}
/*
* sortsupport comparison func (for NAME C locale case)
*/
static int
namefastcmp_c(Datum x, Datum y, SortSupport ssup)
{
Name arg1 = DatumGetName(x);
Name arg2 = DatumGetName(y);
return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
}
/*
* sortsupport comparison func (for locale case with all varlena types)
*/
static int
varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
{
VarString *arg1 = DatumGetVarStringPP(x);
VarString *arg2 = DatumGetVarStringPP(y);
char *a1p,
*a2p;
int len1,
len2,
result;
a1p = VARDATA_ANY(arg1);
a2p = VARDATA_ANY(arg2);
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
/* We can't afford to leak memory here. */
if (PointerGetDatum(arg1) != x)
pfree(arg1);
if (PointerGetDatum(arg2) != y)
pfree(arg2);
return result;
}
/*
* sortsupport comparison func (for locale case with NAME type)
*/
static int
namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
{
Name arg1 = DatumGetName(x);
Name arg2 = DatumGetName(y);
return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
NameStr(*arg2), strlen(NameStr(*arg2)),
ssup);
}
/*
* sortsupport comparison func for locale cases
*/
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
int result;
bool arg1_match;
/* Fast pre-check for equality, as discussed in varstr_cmp() */
if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
{
/*
* No change in buf1 or buf2 contents, so avoid changing last_len1 or
* last_len2. Existing contents of buffers might still be used by
* next call.
*
* It's fine to allow the comparison of BpChar padding bytes here,
* even though that implies that the memcmp() will usually be
* performed for BpChar callers (though multibyte characters could
* still prevent that from occurring). The memcmp() is still very
* cheap, and BpChar's funny semantics have us remove trailing spaces
* (not limited to padding), so we need make no distinction between
* padding space characters and "real" space characters.
*/
return 0;
}
if (sss->typid == BPCHAROID)
{
/* Get true number of bytes, ignoring trailing spaces */
len1 = bpchartruelen(a1p, len1);
len2 = bpchartruelen(a2p, len2);
}
if (len1 >= sss->buflen1)
{
sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
sss->buf1 = repalloc(sss->buf1, sss->buflen1);
}
if (len2 >= sss->buflen2)
{
sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
}
/*
* We're likely to be asked to compare the same strings repeatedly, and
* memcmp() is so much cheaper than strcoll() that it pays to try to cache
* comparisons, even though in general there is no reason to think that
* that will work out (every string datum may be unique). Caching does
* not slow things down measurably when it doesn't work out, and can speed
* things up by rather a lot when it does. In part, this is because the
* memcmp() compares data from cachelines that are needed in L1 cache even
* when the last comparison's result cannot be reused.
*/
arg1_match = true;
if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
{
arg1_match = false;
memcpy(sss->buf1, a1p, len1);
sss->buf1[len1] = '\0';
sss->last_len1 = len1;
}
/*
* If we're comparing the same two strings as last time, we can return the
* same answer without calling strcoll() again. This is more likely than
* it seems (at least with moderate to low cardinality sets), because
* quicksort compares the same pivot against many values.
*/
if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
{
memcpy(sss->buf2, a2p, len2);
sss->buf2[len2] = '\0';
sss->last_len2 = len2;
}
else if (arg1_match && !sss->cache_blob)
{
/* Use result cached following last actual strcoll() call */
return sss->last_returned;
}
result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
/* Break tie if necessary. */
if (result == 0 && pg_locale_deterministic(sss->locale))
result = strcmp(sss->buf1, sss->buf2);
/* Cache result, perhaps saving an expensive strcoll() call next time */
sss->cache_blob = false;
sss->last_returned = result;
return result;
}
/*
* Conversion routine for sortsupport. Converts original to abbreviated key
* representation. Our encoding strategy is simple -- pack the first 8 bytes
* of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
* stored in reverse order), and treat it as an unsigned integer. When the "C"
* locale is used, or in case of bytea, just memcpy() from original instead.
*/
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
const size_t max_prefix_bytes = sizeof(Datum);
VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
VarString *authoritative = DatumGetVarStringPP(original);
char *authoritative_data = VARDATA_ANY(authoritative);
/* working state */
Datum res;
char *pres;
int len;
uint32 hash;
pres = (char *) &res;
/* memset(), so any non-overwritten bytes are NUL */
memset(pres, 0, max_prefix_bytes);
len = VARSIZE_ANY_EXHDR(authoritative);
/* Get number of bytes, ignoring trailing spaces */
if (sss->typid == BPCHAROID)
len = bpchartruelen(authoritative_data, len);
/*
* If we're using the C collation, use memcpy(), rather than strxfrm(), to
* abbreviate keys. The full comparator for the C locale is always
* memcmp(). It would be incorrect to allow bytea callers (callers that
* always force the C collation -- bytea isn't a collatable type, but this
* approach is convenient) to use strxfrm(). This is because bytea
* strings may contain NUL bytes. Besides, this should be faster, too.
*
* More generally, it's okay that bytea callers can have NUL bytes in
* strings because abbreviated cmp need not make a distinction between
* terminating NUL bytes, and NUL bytes representing actual NULs in the
* authoritative representation. Hopefully a comparison at or past one
* abbreviated key's terminating NUL byte will resolve the comparison
* without consulting the authoritative representation; specifically, some
* later non-NUL byte in the longer string can resolve the comparison
* against a subsequent terminating NUL in the shorter string. There will
* usually be what is effectively a "length-wise" resolution there and
* then.
*
* If that doesn't work out -- if all bytes in the longer string
* positioned at or past the offset of the smaller string's (first)
* terminating NUL are actually representative of NUL bytes in the
* authoritative binary string (perhaps with some *terminating* NUL bytes
* towards the end of the longer string iff it happens to still be small)
* -- then an authoritative tie-breaker will happen, and do the right
* thing: explicitly consider string length.
*/
if (sss->collate_c)
memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
else
{
Size bsize;
/*
* We're not using the C collation, so fall back on strxfrm or ICU
* analogs.
*/
/* By convention, we use buffer 1 to store and NUL-terminate */
if (len >= sss->buflen1)
{
sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
sss->buf1 = repalloc(sss->buf1, sss->buflen1);
}
/* Might be able to reuse strxfrm() blob from last call */
if (sss->last_len1 == len && sss->cache_blob &&
memcmp(sss->buf1, authoritative_data, len) == 0)
{
memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
/* No change affecting cardinality, so no hashing required */
goto done;
}
memcpy(sss->buf1, authoritative_data, len);
/*
* pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
*/
sss->buf1[len] = '\0';
sss->last_len1 = len;
if (pg_strxfrm_prefix_enabled(sss->locale))
{
if (sss->buflen2 < max_prefix_bytes)
{
sss->buflen2 = Max(max_prefix_bytes,
Min(sss->buflen2 * 2, MaxAllocSize));
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
}
bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
max_prefix_bytes, sss->locale);
sss->last_len2 = bsize;
}
else
{
/*
* Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
* again. The pg_strxfrm() function leaves the result buffer
* content undefined if the result did not fit, so we need to
* retry until everything fits, even though we only need the first
* few bytes in the end.
*/
for (;;)
{
bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
sss->locale);
sss->last_len2 = bsize;
if (bsize < sss->buflen2)
break;
/*
* Grow buffer and retry.
*/
sss->buflen2 = Max(bsize + 1,
Min(sss->buflen2 * 2, MaxAllocSize));
sss->buf2 = repalloc(sss->buf2, sss->buflen2);
}
}
/*
* Every Datum byte is always compared. This is safe because the
* strxfrm() blob is itself NUL terminated, leaving no danger of
* misinterpreting any NUL bytes not intended to be interpreted as
* logically representing termination.
*
* (Actually, even if there were NUL bytes in the blob it would be
* okay. See remarks on bytea case above.)
*/
memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
}
/*
* Maintain approximate cardinality of both abbreviated keys and original,
* authoritative keys using HyperLogLog. Used as cheap insurance against
* the worst case, where we do many string transformations for no saving
* in full strcoll()-based comparisons. These statistics are used by
* varstr_abbrev_abort().
*
* First, Hash key proper, or a significant fraction of it. Mix in length
* in order to compensate for cases where differences are past
* PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
*/
hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
Min(len, PG_CACHE_LINE_SIZE)));
if (len > PG_CACHE_LINE_SIZE)
hash ^= DatumGetUInt32(hash_uint32((uint32) len));
addHyperLogLog(&sss->full_card, hash);
/* Hash abbreviated key */
#if SIZEOF_DATUM == 8
{
uint32 lohalf,
hihalf;
lohalf = (uint32) res;
hihalf = (uint32) (res >> 32);
hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
}
#else /* SIZEOF_DATUM != 8 */
hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif
addHyperLogLog(&sss->abbr_card, hash);
/* Cache result, perhaps saving an expensive strxfrm() call next time */
sss->cache_blob = true;
done:
/*
* Byteswap on little-endian machines.
*
* This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
* 3-way comparator) works correctly on all platforms. If we didn't do
* this, the comparator would have to call memcmp() with a pair of
* pointers to the first byte of each abbreviated key, which is slower.
*/
res = DatumBigEndianToNative(res);
/* Don't leak memory here */
if (PointerGetDatum(authoritative) != original)
pfree(authoritative);
return res;
}
/*
* Callback for estimating effectiveness of abbreviated key optimization, using
* heuristic rules. Returns value indicating if the abbreviation optimization
* should be aborted, based on its projected effectiveness.
*/
static bool
varstr_abbrev_abort(int memtupcount, SortSupport ssup)
{
VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
double abbrev_distinct,
key_distinct;
Assert(ssup->abbreviate);
/* Have a little patience */
if (memtupcount < 100)
return false;
abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
key_distinct = estimateHyperLogLog(&sss->full_card);
/*
* Clamp cardinality estimates to at least one distinct value. While
* NULLs are generally disregarded, if only NULL values were seen so far,
* that might misrepresent costs if we failed to clamp.
*/
if (abbrev_distinct <= 1.0)
abbrev_distinct = 1.0;
if (key_distinct <= 1.0)
key_distinct = 1.0;
/*
* In the worst case all abbreviated keys are identical, while at the same
* time there are differences within full key strings not captured in
* abbreviations.
*/
#ifdef TRACE_SORT
if (trace_sort)
{
double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
"(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
sss->prop_card);
}
#endif
/*
* If the number of distinct abbreviated keys approximately matches the
* number of distinct authoritative original keys, that's reason enough to
* proceed. We can win even with a very low cardinality set if most
* tie-breakers only memcmp(). This is by far the most important
* consideration.
*
* While comparisons that are resolved at the abbreviated key level are
* considerably cheaper than tie-breakers resolved with memcmp(), both of
* those two outcomes are so much cheaper than a full strcoll() once
* sorting is underway that it doesn't seem worth it to weigh abbreviated
* cardinality against the overall size of the set in order to more
* accurately model costs. Assume that an abbreviated comparison, and an
* abbreviated comparison with a cheap memcmp()-based authoritative
* resolution are equivalent.
*/
if (abbrev_distinct > key_distinct * sss->prop_card)
{
/*
* When we have exceeded 10,000 tuples, decay required cardinality
* aggressively for next call.
*
* This is useful because the number of comparisons required on
* average increases at a linearithmic rate, and at roughly 10,000
* tuples that factor will start to dominate over the linear costs of
* string transformation (this is a conservative estimate). The decay
* rate is chosen to be a little less aggressive than halving -- which
* (since we're called at points at which memtupcount has doubled)
* would never see the cost model actually abort past the first call
* following a decay. This decay rate is mostly a precaution against
* a sudden, violent swing in how well abbreviated cardinality tracks
* full key cardinality. The decay also serves to prevent a marginal
* case from being aborted too late, when too much has already been
* invested in string transformation.
*
* It's possible for sets of several million distinct strings with
* mere tens of thousands of distinct abbreviated keys to still
* benefit very significantly. This will generally occur provided
* each abbreviated key is a proxy for a roughly uniform number of the
* set's full keys. If it isn't so, we hope to catch that early and
* abort. If it isn't caught early, by the time the problem is
* apparent it's probably not worth aborting.
*/
if (memtupcount > 10000)
sss->prop_card *= 0.65;
return false;
}
/*
* Abort abbreviation strategy.
*
* The worst case, where all abbreviated keys are identical while all
* original strings differ will typically only see a regression of about
* 10% in execution time for small to medium sized lists of strings.
* Whereas on modern CPUs where cache stalls are the dominant cost, we can
* often expect very large improvements, particularly with sets of strings
* of moderately high to high abbreviated cardinality. There is little to
* lose but much to gain, which our strategy reflects.
*/
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
"(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
#endif
return true;
}
/*
* Generic equalimage support function for character type's operator classes.
* Disables the use of deduplication with nondeterministic collations.
*/
Datum
btvarstrequalimage(PG_FUNCTION_ARGS)
{
/* Oid opcintype = PG_GETARG_OID(0); */
Oid collid = PG_GET_COLLATION();
check_collation_set(collid);
if (lc_collate_is_c(collid) ||
collid == DEFAULT_COLLATION_OID ||
get_collation_isdeterministic(collid))
PG_RETURN_BOOL(true);
else
PG_RETURN_BOOL(false);
}
Datum
text_larger(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
text *result;
result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
PG_RETURN_TEXT_P(result);
}
Datum
text_smaller(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
text *result;
result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
PG_RETURN_TEXT_P(result);
}
/*
* Cross-type comparison functions for types text and name.
*/
Datum
nameeqtext(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
size_t len1 = strlen(NameStr(*arg1));
size_t len2 = VARSIZE_ANY_EXHDR(arg2);
Oid collid = PG_GET_COLLATION();
bool result;
check_collation_set(collid);
if (collid == C_COLLATION_OID)
result = (len1 == len2 &&
memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
else
result = (varstr_cmp(NameStr(*arg1), len1,
VARDATA_ANY(arg2), len2,
collid) == 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
texteqname(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
Name arg2 = PG_GETARG_NAME(1);
size_t len1 = VARSIZE_ANY_EXHDR(arg1);
size_t len2 = strlen(NameStr(*arg2));
Oid collid = PG_GET_COLLATION();
bool result;
check_collation_set(collid);
if (collid == C_COLLATION_OID)
result = (len1 == len2 &&
memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
else
result = (varstr_cmp(VARDATA_ANY(arg1), len1,
NameStr(*arg2), len2,
collid) == 0);
PG_FREE_IF_COPY(arg1, 0);
PG_RETURN_BOOL(result);
}
Datum
namenetext(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
size_t len1 = strlen(NameStr(*arg1));
size_t len2 = VARSIZE_ANY_EXHDR(arg2);
Oid collid = PG_GET_COLLATION();
bool result;
check_collation_set(collid);
if (collid == C_COLLATION_OID)
result = !(len1 == len2 &&
memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
else
result = !(varstr_cmp(NameStr(*arg1), len1,
VARDATA_ANY(arg2), len2,
collid) == 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
textnename(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
Name arg2 = PG_GETARG_NAME(1);
size_t len1 = VARSIZE_ANY_EXHDR(arg1);
size_t len2 = strlen(NameStr(*arg2));
Oid collid = PG_GET_COLLATION();
bool result;
check_collation_set(collid);
if (collid == C_COLLATION_OID)
result = !(len1 == len2 &&
memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
else
result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
NameStr(*arg2), len2,
collid) == 0);
PG_FREE_IF_COPY(arg1, 0);
PG_RETURN_BOOL(result);
}
Datum
btnametextcmp(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int32 result;
result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(result);
}
Datum
bttextnamecmp(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
Name arg2 = PG_GETARG_NAME(1);
int32 result;
result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
NameStr(*arg2), strlen(NameStr(*arg2)),
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_RETURN_INT32(result);
}
#define CmpCall(cmpfunc) \
DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
PG_GET_COLLATION(), \
PG_GETARG_DATUM(0), \
PG_GETARG_DATUM(1)))
Datum
namelttext(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
}
Datum
nameletext(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
}
Datum
namegttext(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
}
Datum
namegetext(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
}
Datum
textltname(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
}
Datum
textlename(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
}
Datum
textgtname(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
}
Datum
textgename(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
}
#undef CmpCall
/*
* The following operators support character-by-character comparison
* of text datums, to allow building indexes suitable for LIKE clauses.
* Note that the regular texteq/textne comparison operators, and regular
* support functions 1 and 2 with "C" collation are assumed to be
* compatible with these!
*/
static int
internal_text_pattern_compare(text *arg1, text *arg2)
{
int result;
int len1,
len2;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
if (result != 0)
return result;
else if (len1 < len2)
return -1;
else if (len1 > len2)
return 1;
else
return 0;
}
Datum
text_pattern_lt(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int result;
result = internal_text_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result < 0);
}
Datum
text_pattern_le(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int result;
result = internal_text_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result <= 0);
}
Datum
text_pattern_ge(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int result;
result = internal_text_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result >= 0);
}
Datum
text_pattern_gt(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int result;
result = internal_text_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result > 0);
}
Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)
{
text *arg1 = PG_GETARG_TEXT_PP(0);
text *arg2 = PG_GETARG_TEXT_PP(1);
int result;
result = internal_text_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(result);
}
Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport, forcing "C" collation */
varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}
/*-------------------------------------------------------------
* byteaoctetlen
*
* get the number of bytes contained in an instance of type 'bytea'
*-------------------------------------------------------------
*/
Datum
byteaoctetlen(PG_FUNCTION_ARGS)
{
Datum str = PG_GETARG_DATUM(0);
/* We need not detoast the input at all */
PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
}
/*
* byteacat -
* takes two bytea* and returns a bytea* that is the concatenation of
* the two.
*
* Cloned from textcat and modified as required.
*/
Datum
byteacat(PG_FUNCTION_ARGS)
{
bytea *t1 = PG_GETARG_BYTEA_PP(0);
bytea *t2 = PG_GETARG_BYTEA_PP(1);
PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
}
/*
* bytea_catenate
* Guts of byteacat(), broken out so it can be used by other functions
*
* Arguments can be in short-header form, but not compressed or out-of-line
*/
static bytea *
bytea_catenate(bytea *t1, bytea *t2)
{
bytea *result;
int len1,
len2,
len;
char *ptr;
len1 = VARSIZE_ANY_EXHDR(t1);
len2 = VARSIZE_ANY_EXHDR(t2);
/* paranoia ... probably should throw error instead? */
if (len1 < 0)
len1 = 0;
if (len2 < 0)
len2 = 0;
len = len1 + len2 + VARHDRSZ;
result = (bytea *) palloc(len);
/* Set size of result string... */
SET_VARSIZE(result, len);
/* Fill data field of result string... */
ptr = VARDATA(result);
if (len1 > 0)
memcpy(ptr, VARDATA_ANY(t1), len1);
if (len2 > 0)
memcpy(ptr + len1, VARDATA_ANY(t2), len2);
return result;
}
#define PG_STR_GET_BYTEA(str_) \
DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
/*
* bytea_substr()
* Return a substring starting at the specified position.
* Cloned from text_substr and modified as required.
*
* Input:
* - string
* - starting position (is one-based)
* - string length (optional)
*
* If the starting position is zero or less, then return from the start of the string
* adjusting the length to be consistent with the "negative start" per SQL.
* If the length is less than zero, an ERROR is thrown. If no third argument
* (length) is provided, the length to the end of the string is assumed.
*/
Datum
bytea_substr(PG_FUNCTION_ARGS)
{
PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
PG_GETARG_INT32(1),
PG_GETARG_INT32(2),
false));
}
/*
* bytea_substr_no_len -
* Wrapper to avoid opr_sanity failure due to
* one function accepting a different number of args.
*/
Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)
{
PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
PG_GETARG_INT32(1),
-1,
true));
}
static bytea *
bytea_substring(Datum str,
int S,
int L,
bool length_not_specified)
{
int32 S1; /* adjusted start position */
int32 L1; /* adjusted substring length */
int32 E; /* end position */
/*
* The logic here should generally match text_substring().
*/
S1 = Max(S, 1);
if (length_not_specified)
{
/*
* Not passed a length - DatumGetByteaPSlice() grabs everything to the
* end of the string if we pass it a negative value for length.
*/
L1 = -1;
}
else if (L < 0)
{
/* SQL99 says to throw an error for E < S, i.e., negative length */
ereport(ERROR,
(errcode(ERRCODE_SUBSTRING_ERROR),
errmsg("negative substring length not allowed")));
L1 = -1; /* silence stupider compilers */
}
else if (pg_add_s32_overflow(S, L, &E))
{
/*
* L could be large enough for S + L to overflow, in which case the
* substring must run to end of string.
*/
L1 = -1;
}
else
{
/*
* A zero or negative value for the end position can happen if the
* start was negative or one. SQL99 says to return a zero-length
* string.
*/
if (E < 1)
return PG_STR_GET_BYTEA("");
L1 = E - S1;
}
/*
* If the start position is past the end of the string, SQL99 says to
* return a zero-length string -- DatumGetByteaPSlice() will do that for
* us. We need only convert S1 to zero-based starting position.
*/
return DatumGetByteaPSlice(str, S1 - 1, L1);
}
/*
* byteaoverlay
* Replace specified substring of first string with second
*
* The SQL standard defines OVERLAY() in terms of substring and concatenation.
* This code is a direct implementation of what the standard says.
*/
Datum
byteaoverlay(PG_FUNCTION_ARGS)
{
bytea *t1 = PG_GETARG_BYTEA_PP(0);
bytea *t2 = PG_GETARG_BYTEA_PP(1);
int sp = PG_GETARG_INT32(2); /* substring start position */
int sl = PG_GETARG_INT32(3); /* substring length */
PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
}
Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)
{
bytea *t1 = PG_GETARG_BYTEA_PP(0);
bytea *t2 = PG_GETARG_BYTEA_PP(1);
int sp = PG_GETARG_INT32(2); /* substring start position */
int sl;
sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
}
static bytea *
bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
{
bytea *result;
bytea *s1;
bytea *s2;
int sp_pl_sl;
/*
* Check for possible integer-overflow cases. For negative sp, throw a
* "substring length" error because that's what should be expected
* according to the spec's definition of OVERLAY().
*/
if (sp <= 0)
ereport(ERROR,
(errcode(ERRCODE_SUBSTRING_ERROR),
errmsg("negative substring length not allowed")));
if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("integer out of range")));
s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
result = bytea_catenate(s1, t2);
result = bytea_catenate(result, s2);
return result;
}
/*
* bit_count
*/
Datum
bytea_bit_count(PG_FUNCTION_ARGS)
{
bytea *t1 = PG_GETARG_BYTEA_PP(0);
PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
}
/*
* byteapos -
* Return the position of the specified substring.
* Implements the SQL POSITION() function.
* Cloned from textpos and modified as required.
*/
Datum
byteapos(PG_FUNCTION_ARGS)
{
bytea *t1 = PG_GETARG_BYTEA_PP(0);
bytea *t2 = PG_GETARG_BYTEA_PP(1);
int pos;
int px,
p;
int len1,
len2;
char *p1,
*p2;
len1 = VARSIZE_ANY_EXHDR(t1);
len2 = VARSIZE_ANY_EXHDR(t2);
if (len2 <= 0)
PG_RETURN_INT32(1); /* result for empty pattern */
p1 = VARDATA_ANY(t1);
p2 = VARDATA_ANY(t2);
pos = 0;
px = (len1 - len2);
for (p = 0; p <= px; p++)
{
if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
{
pos = p + 1;
break;
};
p1++;
};
PG_RETURN_INT32(pos);
}
/*-------------------------------------------------------------
* byteaGetByte
*
* this routine treats "bytea" as an array of bytes.
* It returns the Nth byte (a number between 0 and 255).
*-------------------------------------------------------------
*/
Datum
byteaGetByte(PG_FUNCTION_ARGS)
{
bytea *v = PG_GETARG_BYTEA_PP(0);
int32 n = PG_GETARG_INT32(1);
int len;
int byte;
len = VARSIZE_ANY_EXHDR(v);
if (n < 0 || n >= len)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("index %d out of valid range, 0..%d",
n, len - 1)));
byte = ((unsigned char *) VARDATA_ANY(v))[n];
PG_RETURN_INT32(byte);
}
/*-------------------------------------------------------------
* byteaGetBit
*
* This routine treats a "bytea" type like an array of bits.
* It returns the value of the Nth bit (0 or 1).
*
*-------------------------------------------------------------
*/
Datum
byteaGetBit(PG_FUNCTION_ARGS)
{
bytea *v = PG_GETARG_BYTEA_PP(0);
int64 n = PG_GETARG_INT64(1);
int byteNo,
bitNo;
int len;
int byte;
len = VARSIZE_ANY_EXHDR(v);
if (n < 0 || n >= (int64) len * 8)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("index %lld out of valid range, 0..%lld",
(long long) n, (long long) len * 8 - 1)));
/* n/8 is now known < len, so safe to cast to int */
byteNo = (int) (n / 8);
bitNo = (int) (n % 8);
byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
if (byte & (1 << bitNo))
PG_RETURN_INT32(1);
else
PG_RETURN_INT32(0);
}
/*-------------------------------------------------------------
* byteaSetByte
*
* Given an instance of type 'bytea' creates a new one with
* the Nth byte set to the given value.
*
*-------------------------------------------------------------
*/
Datum
byteaSetByte(PG_FUNCTION_ARGS)
{
bytea *res = PG_GETARG_BYTEA_P_COPY(0);
int32 n = PG_GETARG_INT32(1);
int32 newByte = PG_GETARG_INT32(2);
int len;
len = VARSIZE(res) - VARHDRSZ;
if (n < 0 || n >= len)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("index %d out of valid range, 0..%d",
n, len - 1)));
/*
* Now set the byte.
*/
((unsigned char *) VARDATA(res))[n] = newByte;
PG_RETURN_BYTEA_P(res);
}
/*-------------------------------------------------------------
* byteaSetBit
*
* Given an instance of type 'bytea' creates a new one with
* the Nth bit set to the given value.
*
*-------------------------------------------------------------
*/
Datum
byteaSetBit(PG_FUNCTION_ARGS)
{
bytea *res = PG_GETARG_BYTEA_P_COPY(0);
int64 n = PG_GETARG_INT64(1);
int32 newBit = PG_GETARG_INT32(2);
int len;
int oldByte,
newByte;
int byteNo,
bitNo;
len = VARSIZE(res) - VARHDRSZ;
if (n < 0 || n >= (int64) len * 8)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
errmsg("index %lld out of valid range, 0..%lld",
(long long) n, (long long) len * 8 - 1)));
/* n/8 is now known < len, so safe to cast to int */
byteNo = (int) (n / 8);
bitNo = (int) (n % 8);
/*
* sanity check!
*/
if (newBit != 0 && newBit != 1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("new bit must be 0 or 1")));
/*
* Update the byte.
*/
oldByte = ((unsigned char *) VARDATA(res))[byteNo];
if (newBit == 0)
newByte = oldByte & (~(1 << bitNo));
else
newByte = oldByte | (1 << bitNo);
((unsigned char *) VARDATA(res))[byteNo] = newByte;
PG_RETURN_BYTEA_P(res);
}
/* text_name()
* Converts a text type to a Name type.
*/
Datum
text_name(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_PP(0);
Name result;
int len;
len = VARSIZE_ANY_EXHDR(s);
/* Truncate oversize input */
if (len >= NAMEDATALEN)
len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
/* We use palloc0 here to ensure result is zero-padded */
result = (Name) palloc0(NAMEDATALEN);
memcpy(NameStr(*result), VARDATA_ANY(s), len);
PG_RETURN_NAME(result);
}
/* name_text()
* Converts a Name type to a text type.
*/
Datum
name_text(PG_FUNCTION_ARGS)
{
Name s = PG_GETARG_NAME(0);
PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
}
/*
* textToQualifiedNameList - convert a text object to list of names
*
* This implements the input parsing needed by nextval() and other
* functions that take a text parameter representing a qualified name.
* We split the name at dots, downcase if not double-quoted, and
* truncate names if they're too long.
*/
List *
textToQualifiedNameList(text *textval)
{
char *rawname;
List *result = NIL;
List *namelist;
ListCell *l;
/* Convert to C string (handles possible detoasting). */
/* Note we rely on being able to modify rawname below. */
rawname = text_to_cstring(textval);
if (!SplitIdentifierString(rawname, '.', &namelist))
ereport(ERROR,
(errcode(ERRCODE_INVALID_NAME),
errmsg("invalid name syntax")));
if (namelist == NIL)
ereport(ERROR,
(errcode(ERRCODE_INVALID_NAME),
errmsg("invalid name syntax")));
foreach(l, namelist)
{
char *curname = (char *) lfirst(l);
result = lappend(result, makeString(pstrdup(curname)));
}
pfree(rawname);
list_free(namelist);
return result;
}
/*
* SplitIdentifierString --- parse a string containing identifiers
*
* This is the guts of textToQualifiedNameList, and is exported for use in
* other situations such as parsing GUC variables. In the GUC case, it's
* important to avoid memory leaks, so the API is designed to minimize the
* amount of stuff that needs to be allocated and freed.
*
* Inputs:
* rawstring: the input string; must be overwritable! On return, it's
* been modified to contain the separated identifiers.
* separator: the separator punctuation expected between identifiers
* (typically '.' or ','). Whitespace may also appear around
* identifiers.
* Outputs:
* namelist: filled with a palloc'd list of pointers to identifiers within
* rawstring. Caller should list_free() this even on error return.
*
* Returns true if okay, false if there is a syntax error in the string.
*
* Note that an empty string is considered okay here, though not in
* textToQualifiedNameList.
*/
bool
SplitIdentifierString(char *rawstring, char separator,
List **namelist)
{
char *nextp = rawstring;
bool done = false;
*namelist = NIL;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace */
if (*nextp == '\0')
return true; /* allow empty string */
/* At the top of the loop, we are at start of a new identifier. */
do
{
char *curname;
char *endp;
if (*nextp == '"')
{
/* Quoted name --- collapse quote-quote pairs, no downcasing */
curname = nextp + 1;
for (;;)
{
endp = strchr(nextp + 1, '"');
if (endp == NULL)
return false; /* mismatched quotes */
if (endp[1] != '"')
break; /* found end of quoted name */
/* Collapse adjacent quotes into one quote, and look again */
memmove(endp, endp + 1, strlen(endp));
nextp = endp;
}
/* endp now points at the terminating quote */
nextp = endp + 1;
}
else
{
/* Unquoted name --- extends to separator or whitespace */
char *downname;
int len;
curname = nextp;
while (*nextp && *nextp != separator &&
!scanner_isspace(*nextp))
nextp++;
endp = nextp;
if (curname == nextp)
return false; /* empty unquoted name not allowed */
/*
* Downcase the identifier, using same code as main lexer does.
*
* XXX because we want to overwrite the input in-place, we cannot
* support a downcasing transformation that increases the string
* length. This is not a problem given the current implementation
* of downcase_truncate_identifier, but we'll probably have to do
* something about this someday.
*/
len = endp - curname;
downname = downcase_truncate_identifier(curname, len, false);
Assert(strlen(downname) <= len);
strncpy(curname, downname, len); /* strncpy is required here */
pfree(downname);
}
while (scanner_isspace(*nextp))
nextp++; /* skip trailing whitespace */
if (*nextp == separator)
{
nextp++;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace for next */
/* we expect another name, so done remains false */
}
else if (*nextp == '\0')
done = true;
else
return false; /* invalid syntax */
/* Now safe to overwrite separator with a null */
*endp = '\0';
/* Truncate name if it's overlength */
truncate_identifier(curname, strlen(curname), false);
/*
* Finished isolating current name --- add it to list
*/
*namelist = lappend(*namelist, curname);
/* Loop back if we didn't reach end of string */
} while (!done);
return true;
}
/*
* SplitDirectoriesString --- parse a string containing file/directory names
*
* This works fine on file names too; the function name is historical.
*
* This is similar to SplitIdentifierString, except that the parsing
* rules are meant to handle pathnames instead of identifiers: there is
* no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
* and we apply canonicalize_path() to each extracted string. Because of the
* last, the returned strings are separately palloc'd rather than being
* pointers into rawstring --- but we still scribble on rawstring.
*
* Inputs:
* rawstring: the input string; must be modifiable!
* separator: the separator punctuation expected between directories
* (typically ',' or ';'). Whitespace may also appear around
* directories.
* Outputs:
* namelist: filled with a palloc'd list of directory names.
* Caller should list_free_deep() this even on error return.
*
* Returns true if okay, false if there is a syntax error in the string.
*
* Note that an empty string is considered okay here.
*/
bool
SplitDirectoriesString(char *rawstring, char separator,
List **namelist)
{
char *nextp = rawstring;
bool done = false;
*namelist = NIL;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace */
if (*nextp == '\0')
return true; /* allow empty string */
/* At the top of the loop, we are at start of a new directory. */
do
{
char *curname;
char *endp;
if (*nextp == '"')
{
/* Quoted name --- collapse quote-quote pairs */
curname = nextp + 1;
for (;;)
{
endp = strchr(nextp + 1, '"');
if (endp == NULL)
return false; /* mismatched quotes */
if (endp[1] != '"')
break; /* found end of quoted name */
/* Collapse adjacent quotes into one quote, and look again */
memmove(endp, endp + 1, strlen(endp));
nextp = endp;
}
/* endp now points at the terminating quote */
nextp = endp + 1;
}
else
{
/* Unquoted name --- extends to separator or end of string */
curname = endp = nextp;
while (*nextp && *nextp != separator)
{
/* trailing whitespace should not be included in name */
if (!scanner_isspace(*nextp))
endp = nextp + 1;
nextp++;
}
if (curname == endp)
return false; /* empty unquoted name not allowed */
}
while (scanner_isspace(*nextp))
nextp++; /* skip trailing whitespace */
if (*nextp == separator)
{
nextp++;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace for next */
/* we expect another name, so done remains false */
}
else if (*nextp == '\0')
done = true;
else
return false; /* invalid syntax */
/* Now safe to overwrite separator with a null */
*endp = '\0';
/* Truncate path if it's overlength */
if (strlen(curname) >= MAXPGPATH)
curname[MAXPGPATH - 1] = '\0';
/*
* Finished isolating current name --- add it to list
*/
curname = pstrdup(curname);
canonicalize_path(curname);
*namelist = lappend(*namelist, curname);
/* Loop back if we didn't reach end of string */
} while (!done);
return true;
}
/*
* SplitGUCList --- parse a string containing identifiers or file names
*
* This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
* presuming whether the elements will be taken as identifiers or file names.
* We assume the input has already been through flatten_set_variable_args(),
* so that we need never downcase (if appropriate, that was done already).
* Nor do we ever truncate, since we don't know the correct max length.
* We disallow embedded whitespace for simplicity (it shouldn't matter,
* because any embedded whitespace should have led to double-quoting).
* Otherwise the API is identical to SplitIdentifierString.
*
* XXX it's annoying to have so many copies of this string-splitting logic.
* However, it's not clear that having one function with a bunch of option
* flags would be much better.
*
* XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
* Be sure to update that if you have to change this.
*
* Inputs:
* rawstring: the input string; must be overwritable! On return, it's
* been modified to contain the separated identifiers.
* separator: the separator punctuation expected between identifiers
* (typically '.' or ','). Whitespace may also appear around
* identifiers.
* Outputs:
* namelist: filled with a palloc'd list of pointers to identifiers within
* rawstring. Caller should list_free() this even on error return.
*
* Returns true if okay, false if there is a syntax error in the string.
*/
bool
SplitGUCList(char *rawstring, char separator,
List **namelist)
{
char *nextp = rawstring;
bool done = false;
*namelist = NIL;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace */
if (*nextp == '\0')
return true; /* allow empty string */
/* At the top of the loop, we are at start of a new identifier. */
do
{
char *curname;
char *endp;
if (*nextp == '"')
{
/* Quoted name --- collapse quote-quote pairs */
curname = nextp + 1;
for (;;)
{
endp = strchr(nextp + 1, '"');
if (endp == NULL)
return false; /* mismatched quotes */
if (endp[1] != '"')
break; /* found end of quoted name */
/* Collapse adjacent quotes into one quote, and look again */
memmove(endp, endp + 1, strlen(endp));
nextp = endp;
}
/* endp now points at the terminating quote */
nextp = endp + 1;
}
else
{
/* Unquoted name --- extends to separator or whitespace */
curname = nextp;
while (*nextp && *nextp != separator &&
!scanner_isspace(*nextp))
nextp++;
endp = nextp;
if (curname == nextp)
return false; /* empty unquoted name not allowed */
}
while (scanner_isspace(*nextp))
nextp++; /* skip trailing whitespace */
if (*nextp == separator)
{
nextp++;
while (scanner_isspace(*nextp))
nextp++; /* skip leading whitespace for next */
/* we expect another name, so done remains false */
}
else if (*nextp == '\0')
done = true;
else
return false; /* invalid syntax */
/* Now safe to overwrite separator with a null */
*endp = '\0';
/*
* Finished isolating current name --- add it to list
*/
*namelist = lappend(*namelist, curname);
/* Loop back if we didn't reach end of string */
} while (!done);
return true;
}
/*****************************************************************************
* Comparison Functions used for bytea
*
* Note: btree indexes need these routines not to leak memory; therefore,
* be careful to free working copies of toasted datums. Most places don't
* need to be so careful.
*****************************************************************************/
Datum
byteaeq(PG_FUNCTION_ARGS)
{
Datum arg1 = PG_GETARG_DATUM(0);
Datum arg2 = PG_GETARG_DATUM(1);
bool result;
Size len1,
len2;
/*
* We can use a fast path for unequal lengths, which might save us from
* having to detoast one or both values.
*/
len1 = toast_raw_datum_size(arg1);
len2 = toast_raw_datum_size(arg2);
if (len1 != len2)
result = false;
else
{
bytea *barg1 = DatumGetByteaPP(arg1);
bytea *barg2 = DatumGetByteaPP(arg2);
result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
len1 - VARHDRSZ) == 0);
PG_FREE_IF_COPY(barg1, 0);
PG_FREE_IF_COPY(barg2, 1);
}
PG_RETURN_BOOL(result);
}
Datum
byteane(PG_FUNCTION_ARGS)
{
Datum arg1 = PG_GETARG_DATUM(0);
Datum arg2 = PG_GETARG_DATUM(1);
bool result;
Size len1,
len2;
/*
* We can use a fast path for unequal lengths, which might save us from
* having to detoast one or both values.
*/
len1 = toast_raw_datum_size(arg1);
len2 = toast_raw_datum_size(arg2);
if (len1 != len2)
result = true;
else
{
bytea *barg1 = DatumGetByteaPP(arg1);
bytea *barg2 = DatumGetByteaPP(arg2);
result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
len1 - VARHDRSZ) != 0);
PG_FREE_IF_COPY(barg1, 0);
PG_FREE_IF_COPY(barg2, 1);
}
PG_RETURN_BOOL(result);
}
Datum
bytealt(PG_FUNCTION_ARGS)
{
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
int len1,
len2;
int cmp;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
}
Datum
byteale(PG_FUNCTION_ARGS)
{
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
int len1,
len2;
int cmp;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
}
Datum
byteagt(PG_FUNCTION_ARGS)
{
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
int len1,
len2;
int cmp;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
}
Datum
byteage(PG_FUNCTION_ARGS)
{
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
int len1,
len2;
int cmp;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
}
Datum
byteacmp(PG_FUNCTION_ARGS)
{
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
int len1,
len2;
int cmp;
len1 = VARSIZE_ANY_EXHDR(arg1);
len2 = VARSIZE_ANY_EXHDR(arg2);
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
if ((cmp == 0) && (len1 != len2))
cmp = (len1 < len2) ? -1 : 1;
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(cmp);
}
Datum
bytea_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport, forcing "C" collation */
varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}
/*
* appendStringInfoText
*
* Append a text to str.
* Like appendStringInfoString(str, text_to_cstring(t)) but faster.
*/
static void
appendStringInfoText(StringInfo str, const text *t)
{
appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
}
/*
* replace_text
* replace all occurrences of 'old_sub_str' in 'orig_str'
* with 'new_sub_str' to form 'new_str'
*
* returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
* otherwise returns 'new_str'
*/
Datum
replace_text(PG_FUNCTION_ARGS)
{
text *src_text = PG_GETARG_TEXT_PP(0);
text *from_sub_text = PG_GETARG_TEXT_PP(1);
text *to_sub_text = PG_GETARG_TEXT_PP(2);
int src_text_len;
int from_sub_text_len;
TextPositionState state;
text *ret_text;
int chunk_len;
char *curr_ptr;
char *start_ptr;
StringInfoData str;
bool found;
src_text_len = VARSIZE_ANY_EXHDR(src_text);
from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
/* Return unmodified source string if empty source or pattern */
if (src_text_len < 1 || from_sub_text_len < 1)
{
PG_RETURN_TEXT_P(src_text);
}
text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
found = text_position_next(&state);
/* When the from_sub_text is not found, there is nothing to do. */
if (!found)
{
text_position_cleanup(&state);
PG_RETURN_TEXT_P(src_text);
}
curr_ptr = text_position_get_match_ptr(&state);
start_ptr = VARDATA_ANY(src_text);
initStringInfo(&str);
do
{
CHECK_FOR_INTERRUPTS();
/* copy the data skipped over by last text_position_next() */
chunk_len = curr_ptr - start_ptr;
appendBinaryStringInfo(&str, start_ptr, chunk_len);
appendStringInfoText(&str, to_sub_text);
start_ptr = curr_ptr + from_sub_text_len;
found = text_position_next(&state);
if (found)
curr_ptr = text_position_get_match_ptr(&state);
}
while (found);
/* copy trailing data */
chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
appendBinaryStringInfo(&str, start_ptr, chunk_len);
text_position_cleanup(&state);
ret_text = cstring_to_text_with_len(str.data, str.len);
pfree(str.data);
PG_RETURN_TEXT_P(ret_text);
}
/*
* check_replace_text_has_escape
*
* Returns 0 if text contains no backslashes that need processing.
* Returns 1 if text contains backslashes, but not regexp submatch specifiers.
* Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
*/
static int
check_replace_text_has_escape(const text *replace_text)
{
int result = 0;
const char *p = VARDATA_ANY(replace_text);
const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
while (p < p_end)
{
/* Find next escape char, if any. */
p = memchr(p, '\\', p_end - p);
if (p == NULL)
break;
p++;
/* Note: a backslash at the end doesn't require extra processing. */
if (p < p_end)
{
if (*p >= '1' && *p <= '9')
return 2; /* Found a submatch specifier, so done */
result = 1; /* Found some other sequence, keep looking */
p++;
}
}
return result;
}
/*
* appendStringInfoRegexpSubstr
*
* Append replace_text to str, substituting regexp back references for
* \n escapes. start_ptr is the start of the match in the source string,
* at logical character position data_pos.
*/
static void
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
regmatch_t *pmatch,
char *start_ptr, int data_pos)
{
const char *p = VARDATA_ANY(replace_text);
const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
while (p < p_end)
{
const char *chunk_start = p;
int so;
int eo;
/* Find next escape char, if any. */
p = memchr(p, '\\', p_end - p);
if (p == NULL)
p = p_end;
/* Copy the text we just scanned over, if any. */
if (p > chunk_start)
appendBinaryStringInfo(str, chunk_start, p - chunk_start);
/* Done if at end of string, else advance over escape char. */
if (p >= p_end)
break;
p++;
if (p >= p_end)
{
/* Escape at very end of input. Treat same as unexpected char */
appendStringInfoChar(str, '\\');
break;
}
if (*p >= '1' && *p <= '9')
{
/* Use the back reference of regexp. */
int idx = *p - '0';
so = pmatch[idx].rm_so;
eo = pmatch[idx].rm_eo;
p++;
}
else if (*p == '&')
{
/* Use the entire matched string. */
so = pmatch[0].rm_so;
eo = pmatch[0].rm_eo;
p++;
}
else if (*p == '\\')
{
/* \\ means transfer one \ to output. */
appendStringInfoChar(str, '\\');
p++;
continue;
}
else
{
/*
* If escape char is not followed by any expected char, just treat
* it as ordinary data to copy. (XXX would it be better to throw
* an error?)
*/
appendStringInfoChar(str, '\\');
continue;
}
if (so >= 0 && eo >= 0)
{
/*
* Copy the text that is back reference of regexp. Note so and eo
* are counted in characters not bytes.
*/
char *chunk_start;
int chunk_len;
Assert(so >= data_pos);
chunk_start = start_ptr;
chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
chunk_len = charlen_to_bytelen(chunk_start, eo - so);
appendBinaryStringInfo(str, chunk_start, chunk_len);
}
}
}
/*
* replace_text_regexp
*
* replace substring(s) in src_text that match pattern with replace_text.
* The replace_text can contain backslash markers to substitute
* (parts of) the matched text.
*
* cflags: regexp compile flags.
* collation: collation to use.
* search_start: the character (not byte) offset in src_text at which to
* begin searching.
* n: if 0, replace all matches; if > 0, replace only the N'th match.
*/
text *
replace_text_regexp(text *src_text, text *pattern_text,
text *replace_text,
int cflags, Oid collation,
int search_start, int n)
{
text *ret_text;
regex_t *re;
int src_text_len = VARSIZE_ANY_EXHDR(src_text);
int nmatches = 0;
StringInfoData buf;
regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
int nmatch = lengthof(pmatch);
pg_wchar *data;
size_t data_len;
int data_pos;
char *start_ptr;
int escape_status;
initStringInfo(&buf);
/* Convert data string to wide characters. */
data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
/* Check whether replace_text has escapes, especially regexp submatches. */
escape_status = check_replace_text_has_escape(replace_text);
/* If no regexp submatches, we can use REG_NOSUB. */
if (escape_status < 2)
{
cflags |= REG_NOSUB;
/* Also tell pg_regexec we only want the whole-match location. */
nmatch = 1;
}
/* Prepare the regexp. */
re = RE_compile_and_cache(pattern_text, cflags, collation);
/* start_ptr points to the data_pos'th character of src_text */
start_ptr = (char *) VARDATA_ANY(src_text);
data_pos = 0;
while (search_start <= data_len)
{
int regexec_result;
CHECK_FOR_INTERRUPTS();
regexec_result = pg_regexec(re,
data,
data_len,
search_start,
NULL, /* no details */
nmatch,
pmatch,
0);
if (regexec_result == REG_NOMATCH)
break;
if (regexec_result != REG_OKAY)
{
char errMsg[100];
pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
ereport(ERROR,
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
errmsg("regular expression failed: %s", errMsg)));
}
/*
* Count matches, and decide whether to replace this match.
*/
nmatches++;
if (n > 0 && nmatches != n)
{
/*
* No, so advance search_start, but not start_ptr/data_pos. (Thus,
* we treat the matched text as if it weren't matched, and copy it
* to the output later.)
*/
search_start = pmatch[0].rm_eo;
if (pmatch[0].rm_so == pmatch[0].rm_eo)
search_start++;
continue;
}
/*
* Copy the text to the left of the match position. Note we are given
* character not byte indexes.
*/
if (pmatch[0].rm_so - data_pos > 0)
{
int chunk_len;
chunk_len = charlen_to_bytelen(start_ptr,
pmatch[0].rm_so - data_pos);
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
/*
* Advance start_ptr over that text, to avoid multiple rescans of
* it if the replace_text contains multiple back-references.
*/
start_ptr += chunk_len;
data_pos = pmatch[0].rm_so;
}
/*
* Copy the replace_text, processing escapes if any are present.
*/
if (escape_status > 0)
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
start_ptr, data_pos);
else
appendStringInfoText(&buf, replace_text);
/* Advance start_ptr and data_pos over the matched text. */
start_ptr += charlen_to_bytelen(start_ptr,
pmatch[0].rm_eo - data_pos);
data_pos = pmatch[0].rm_eo;
/*
* If we only want to replace one occurrence, we're done.
*/
if (n > 0)
break;
/*
* Advance search position. Normally we start the next search at the
* end of the previous match; but if the match was of zero length, we
* have to advance by one character, or we'd just find the same match
* again.
*/
search_start = data_pos;
if (pmatch[0].rm_so == pmatch[0].rm_eo)
search_start++;
}
/*
* Copy the text to the right of the last match.
*/
if (data_pos < data_len)
{
int chunk_len;
chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
appendBinaryStringInfo(&buf, start_ptr, chunk_len);
}
ret_text = cstring_to_text_with_len(buf.data, buf.len);
pfree(buf.data);
pfree(data);
return ret_text;
}
/*
* split_part
* parse input string based on provided field separator
* return N'th item (1 based, negative counts from end)
*/
Datum
split_part(PG_FUNCTION_ARGS)
{
text *inputstring = PG_GETARG_TEXT_PP(0);
text *fldsep = PG_GETARG_TEXT_PP(1);
int fldnum = PG_GETARG_INT32(2);
int inputstring_len;
int fldsep_len;
TextPositionState state;
char *start_ptr;
char *end_ptr;
text *result_text;
bool found;
/* field number is 1 based */
if (fldnum == 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("field position must not be zero")));
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
/* return empty string for empty input string */
if (inputstring_len < 1)
PG_RETURN_TEXT_P(cstring_to_text(""));
/* handle empty field separator */
if (fldsep_len < 1)
{
/* if first or last field, return input string, else empty string */
if (fldnum == 1 || fldnum == -1)
PG_RETURN_TEXT_P(inputstring);
else
PG_RETURN_TEXT_P(cstring_to_text(""));
}
/* find the first field separator */
text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
found = text_position_next(&state);
/* special case if fldsep not found at all */
if (!found)
{
text_position_cleanup(&state);
/* if first or last field, return input string, else empty string */
if (fldnum == 1 || fldnum == -1)
PG_RETURN_TEXT_P(inputstring);
else
PG_RETURN_TEXT_P(cstring_to_text(""));
}
/*
* take care of a negative field number (i.e. count from the right) by
* converting to a positive field number; we need total number of fields
*/
if (fldnum < 0)
{
/* we found a fldsep, so there are at least two fields */
int numfields = 2;
while (text_position_next(&state))
numfields++;
/* special case of last field does not require an extra pass */
if (fldnum == -1)
{
start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
text_position_cleanup(&state);
PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
end_ptr - start_ptr));
}
/* else, convert fldnum to positive notation */
fldnum += numfields + 1;
/* if nonexistent field, return empty string */
if (fldnum <= 0)
{
text_position_cleanup(&state);
PG_RETURN_TEXT_P(cstring_to_text(""));
}
/* reset to pointing at first match, but now with positive fldnum */
text_position_reset(&state);
found = text_position_next(&state);
Assert(found);
}
/* identify bounds of first field */
start_ptr = VARDATA_ANY(inputstring);
end_ptr = text_position_get_match_ptr(&state);
while (found && --fldnum > 0)
{
/* identify bounds of next field */
start_ptr = end_ptr + fldsep_len;
found = text_position_next(&state);
if (found)
end_ptr = text_position_get_match_ptr(&state);
}
text_position_cleanup(&state);
if (fldnum > 0)
{
/* N'th field separator not found */
/* if last field requested, return it, else empty string */
if (fldnum == 1)
{
int last_len = start_ptr - VARDATA_ANY(inputstring);
result_text = cstring_to_text_with_len(start_ptr,
inputstring_len - last_len);
}
else
result_text = cstring_to_text("");
}
else
{
/* non-last field requested */
result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
}
PG_RETURN_TEXT_P(result_text);
}
/*
* Convenience function to return true when two text params are equal.
*/
static bool
text_isequal(text *txt1, text *txt2, Oid collid)
{
return DatumGetBool(DirectFunctionCall2Coll(texteq,
collid,
PointerGetDatum(txt1),
PointerGetDatum(txt2)));
}
/*
* text_to_array
* parse input string and return text array of elements,
* based on provided field separator
*/
Datum
text_to_array(PG_FUNCTION_ARGS)
{
SplitTextOutputData tstate;
/* For array output, tstate should start as all zeroes */
memset(&tstate, 0, sizeof(tstate));
if (!split_text(fcinfo, &tstate))
PG_RETURN_NULL();
if (tstate.astate == NULL)
PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
PG_RETURN_DATUM(makeArrayResult(tstate.astate,
CurrentMemoryContext));
}
/*
* text_to_array_null
* parse input string and return text array of elements,
* based on provided field separator and null string
*
* This is a separate entry point only to prevent the regression tests from
* complaining about different argument sets for the same internal function.
*/
Datum
text_to_array_null(PG_FUNCTION_ARGS)
{
return text_to_array(fcinfo);
}
/*
* text_to_table
* parse input string and return table of elements,
* based on provided field separator
*/
Datum
text_to_table(PG_FUNCTION_ARGS)
{
ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
SplitTextOutputData tstate;
tstate.astate = NULL;
InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
tstate.tupstore = rsi->setResult;
tstate.tupdesc = rsi->setDesc;
(void) split_text(fcinfo, &tstate);
return (Datum) 0;
}
/*
* text_to_table_null
* parse input string and return table of elements,
* based on provided field separator and null string
*
* This is a separate entry point only to prevent the regression tests from
* complaining about different argument sets for the same internal function.
*/
Datum
text_to_table_null(PG_FUNCTION_ARGS)
{
return text_to_table(fcinfo);
}
/*
* Common code for text_to_array, text_to_array_null, text_to_table
* and text_to_table_null functions.
*
* These are not strict so we have to test for null inputs explicitly.
* Returns false if result is to be null, else returns true.
*
* Note that if the result is valid but empty (zero elements), we return
* without changing *tstate --- caller must handle that case, too.
*/
static bool
split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
{
text *inputstring;
text *fldsep;
text *null_string;
Oid collation = PG_GET_COLLATION();
int inputstring_len;
int fldsep_len;
char *start_ptr;
text *result_text;
/* when input string is NULL, then result is NULL too */
if (PG_ARGISNULL(0))
return false;
inputstring = PG_GETARG_TEXT_PP(0);
/* fldsep can be NULL */
if (!PG_ARGISNULL(1))
fldsep = PG_GETARG_TEXT_PP(1);
else
fldsep = NULL;
/* null_string can be NULL or omitted */
if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
null_string = PG_GETARG_TEXT_PP(2);
else
null_string = NULL;
if (fldsep != NULL)
{
/*
* Normal case with non-null fldsep. Use the text_position machinery
* to search for occurrences of fldsep.
*/
TextPositionState state;
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
/* return empty set for empty input string */
if (inputstring_len < 1)
return true;
/* empty field separator: return input string as a one-element set */
if (fldsep_len < 1)
{
split_text_accum_result(tstate, inputstring,
null_string, collation);
return true;
}
text_position_setup(inputstring, fldsep, collation, &state);
start_ptr = VARDATA_ANY(inputstring);
for (;;)
{
bool found;
char *end_ptr;
int chunk_len;
CHECK_FOR_INTERRUPTS();
found = text_position_next(&state);
if (!found)
{
/* fetch last field */
chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
end_ptr = NULL; /* not used, but some compilers complain */
}
else
{
/* fetch non-last field */
end_ptr = text_position_get_match_ptr(&state);
chunk_len = end_ptr - start_ptr;
}
/* build a temp text datum to pass to split_text_accum_result */
result_text = cstring_to_text_with_len(start_ptr, chunk_len);
/* stash away this field */
split_text_accum_result(tstate, result_text,
null_string, collation);
pfree(result_text);
if (!found)
break;
start_ptr = end_ptr + fldsep_len;
}
text_position_cleanup(&state);
}
else
{
/*
* When fldsep is NULL, each character in the input string becomes a
* separate element in the result set. The separator is effectively
* the space between characters.
*/
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
start_ptr = VARDATA_ANY(inputstring);
while (inputstring_len > 0)
{
int chunk_len = pg_mblen(start_ptr);
CHECK_FOR_INTERRUPTS();
/* build a temp text datum to pass to split_text_accum_result */
result_text = cstring_to_text_with_len(start_ptr, chunk_len);
/* stash away this field */
split_text_accum_result(tstate, result_text,
null_string, collation);
pfree(result_text);
start_ptr += chunk_len;
inputstring_len -= chunk_len;
}
}
return true;
}
/*
* Add text item to result set (table or array).
*
* This is also responsible for checking to see if the item matches
* the null_string, in which case we should emit NULL instead.
*/
static void
split_text_accum_result(SplitTextOutputData *tstate,
text *field_value,
text *null_string,
Oid collation)
{
bool is_null = false;
if (null_string && text_isequal(field_value, null_string, collation))
is_null = true;
if (tstate->tupstore)
{
Datum values[1];
bool nulls[1];
values[0] = PointerGetDatum(field_value);
nulls[0] = is_null;
tuplestore_putvalues(tstate->tupstore,
tstate->tupdesc,
values,
nulls);
}
else
{
tstate->astate = accumArrayResult(tstate->astate,
PointerGetDatum(field_value),
is_null,
TEXTOID,
CurrentMemoryContext);
}
}
/*
* array_to_text
* concatenate Cstring representation of input array elements
* using provided field separator
*/
Datum
array_to_text(PG_FUNCTION_ARGS)
{
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
}
/*
* array_to_text_null
* concatenate Cstring representation of input array elements
* using provided field separator and null string
*
* This version is not strict so we have to test for null inputs explicitly.
*/
Datum
array_to_text_null(PG_FUNCTION_ARGS)
{
ArrayType *v;
char *fldsep;
char *null_string;
/* returns NULL when first or second parameter is NULL */
if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
PG_RETURN_NULL();
v = PG_GETARG_ARRAYTYPE_P(0);
fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
/* NULL null string is passed through as a null pointer */
if (!PG_ARGISNULL(2))
null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
else
null_string = NULL;
PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
}
/*
* common code for array_to_text and array_to_text_null functions
*/
static text *
array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
const char *fldsep, const char *null_string)
{
text *result;
int nitems,
*dims,
ndims;
Oid element_type;
int typlen;
bool typbyval;
char typalign;
StringInfoData buf;
bool printed = false;
char *p;
bits8 *bitmap;
int bitmask;
int i;
ArrayMetaState *my_extra;
ndims = ARR_NDIM(v);
dims = ARR_DIMS(v);
nitems = ArrayGetNItems(ndims, dims);
/* if there are no elements, return an empty string */
if (nitems == 0)
return cstring_to_text_with_len("", 0);
element_type = ARR_ELEMTYPE(v);
initStringInfo(&buf);
/*
* We arrange to look up info about element type, including its output
* conversion proc, only once per series of calls, assuming the element
* type doesn't change underneath us.
*/
my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
if (my_extra == NULL)
{
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
sizeof(ArrayMetaState));
my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
my_extra->element_type = ~element_type;
}
if (my_extra->element_type != element_type)
{
/*
* Get info about element type, including its output conversion proc
*/
get_type_io_data(element_type, IOFunc_output,
&my_extra->typlen, &my_extra->typbyval,
&my_extra->typalign, &my_extra->typdelim,
&my_extra->typioparam, &my_extra->typiofunc);
fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
fcinfo->flinfo->fn_mcxt);
my_extra->element_type = element_type;
}
typlen = my_extra->typlen;
typbyval = my_extra->typbyval;
typalign = my_extra->typalign;
p = ARR_DATA_PTR(v);
bitmap = ARR_NULLBITMAP(v);
bitmask = 1;
for (i = 0; i < nitems; i++)
{
Datum itemvalue;
char *value;
/* Get source element, checking for NULL */
if (bitmap && (*bitmap & bitmask) == 0)
{
/* if null_string is NULL, we just ignore null elements */
if (null_string != NULL)
{
if (printed)
appendStringInfo(&buf, "%s%s", fldsep, null_string);
else
appendStringInfoString(&buf, null_string);
printed = true;
}
}
else
{
itemvalue = fetch_att(p, typbyval, typlen);
value = OutputFunctionCall(&my_extra->proc, itemvalue);
if (printed)
appendStringInfo(&buf, "%s%s", fldsep, value);
else
appendStringInfoString(&buf, value);
printed = true;
p = att_addlength_pointer(p, typlen, p);
p = (char *) att_align_nominal(p, typalign);
}
/* advance bitmap pointer if any */
if (bitmap)
{
bitmask <<= 1;
if (bitmask == 0x100)
{
bitmap++;
bitmask = 1;
}
}
}
result = cstring_to_text_with_len(buf.data, buf.len);
pfree(buf.data);
return result;
}
#define HEXBASE 16
/*
* Convert an int32 to a string containing a base 16 (hex) representation of
* the number.
*/
Datum
to_hex32(PG_FUNCTION_ARGS)
{
uint32 value = (uint32) PG_GETARG_INT32(0);
char *ptr;
const char *digits = "0123456789abcdef";
char buf[32]; /* bigger than needed, but reasonable */
ptr = buf + sizeof(buf) - 1;
*ptr = '\0';
do
{
*--ptr = digits[value % HEXBASE];
value /= HEXBASE;
} while (ptr > buf && value);
PG_RETURN_TEXT_P(cstring_to_text(ptr));
}
/*
* Convert an int64 to a string containing a base 16 (hex) representation of
* the number.
*/
Datum
to_hex64(PG_FUNCTION_ARGS)
{
uint64 value = (uint64) PG_GETARG_INT64(0);
char *ptr;
const char *digits = "0123456789abcdef";
char buf[32]; /* bigger than needed, but reasonable */
ptr = buf + sizeof(buf) - 1;
*ptr = '\0';
do
{
*--ptr = digits[value % HEXBASE];
value /= HEXBASE;
} while (ptr > buf && value);
PG_RETURN_TEXT_P(cstring_to_text(ptr));
}
/*
* Return the size of a datum, possibly compressed
*
* Works on any data type
*/
Datum
pg_column_size(PG_FUNCTION_ARGS)
{
Datum value = PG_GETARG_DATUM(0);
int32 result;
int typlen;
/* On first call, get the input type's typlen, and save at *fn_extra */
if (fcinfo->flinfo->fn_extra == NULL)
{
/* Lookup the datatype of the supplied argument */
Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
typlen = get_typlen(argtypeid);
if (typlen == 0) /* should not happen */
elog(ERROR, "cache lookup failed for type %u", argtypeid);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
sizeof(int));
*((int *) fcinfo->flinfo->fn_extra) = typlen;
}
else
typlen = *((int *) fcinfo->flinfo->fn_extra);
if (typlen == -1)
{
/* varlena type, possibly toasted */
result = toast_datum_size(value);
}
else if (typlen == -2)
{
/* cstring */
result = strlen(DatumGetCString(value)) + 1;
}
else
{
/* ordinary fixed-width type */
result = typlen;
}
PG_RETURN_INT32(result);
}
/*
* Return the compression method stored in the compressed attribute. Return
* NULL for non varlena type or uncompressed data.
*/
Datum
pg_column_compression(PG_FUNCTION_ARGS)
{
int typlen;
char *result;
ToastCompressionId cmid;
/* On first call, get the input type's typlen, and save at *fn_extra */
if (fcinfo->flinfo->fn_extra == NULL)
{
/* Lookup the datatype of the supplied argument */
Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
typlen = get_typlen(argtypeid);
if (typlen == 0) /* should not happen */
elog(ERROR, "cache lookup failed for type %u", argtypeid);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
sizeof(int));
*((int *) fcinfo->flinfo->fn_extra) = typlen;
}
else
typlen = *((int *) fcinfo->flinfo->fn_extra);
if (typlen != -1)
PG_RETURN_NULL();
/* get the compression method id stored in the compressed varlena */
cmid = toast_get_compression_id((struct varlena *)
DatumGetPointer(PG_GETARG_DATUM(0)));
if (cmid == TOAST_INVALID_COMPRESSION_ID)
PG_RETURN_NULL();
/* convert compression method id to compression method name */
switch (cmid)
{
case TOAST_PGLZ_COMPRESSION_ID:
result = "pglz";
break;
case TOAST_LZ4_COMPRESSION_ID:
result = "lz4";
break;
default:
elog(ERROR, "invalid compression method id %d", cmid);
}
PG_RETURN_TEXT_P(cstring_to_text(result));
}
/*
* string_agg - Concatenates values and returns string.
*
* Syntax: string_agg(value text, delimiter text) RETURNS text
*
* Note: Any NULL values are ignored. The first-call delimiter isn't
* actually used at all, and on subsequent calls the delimiter precedes
* the associated value.
*/
/* subroutine to initialize state */
static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)
{
StringInfo state;
MemoryContext aggcontext;
MemoryContext oldcontext;
if (!AggCheckCallContext(fcinfo, &aggcontext))
{
/* cannot be called directly because of internal-type argument */
elog(ERROR, "string_agg_transfn called in non-aggregate context");
}
/*
* Create state in aggregate context. It'll stay there across subsequent
* calls.
*/
oldcontext = MemoryContextSwitchTo(aggcontext);
state = makeStringInfo();
MemoryContextSwitchTo(oldcontext);
return state;
}
Datum
string_agg_transfn(PG_FUNCTION_ARGS)
{
StringInfo state;
state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
/* Append the value unless null, preceding it with the delimiter. */
if (!PG_ARGISNULL(1))
{
text *value = PG_GETARG_TEXT_PP(1);
bool isfirst = false;
/*
* You might think we can just throw away the first delimiter, however
* we must keep it as we may be a parallel worker doing partial
* aggregation building a state to send to the main process. We need
* to keep the delimiter of every aggregation so that the combine
* function can properly join up the strings of two separately
* partially aggregated results. The first delimiter is only stripped
* off in the final function. To know how much to strip off the front
* of the string, we store the length of the first delimiter in the
* StringInfo's cursor field, which we don't otherwise need here.
*/
if (state == NULL)
{
state = makeStringAggState(fcinfo);
isfirst = true;
}
if (!PG_ARGISNULL(2))
{
text *delim = PG_GETARG_TEXT_PP(2);
appendStringInfoText(state, delim);
if (isfirst)
state->cursor = VARSIZE_ANY_EXHDR(delim);
}
appendStringInfoText(state, value);
}
/*
* The transition type for string_agg() is declared to be "internal",
* which is a pass-by-value type the same size as a pointer.
*/
if (state)
PG_RETURN_POINTER(state);
PG_RETURN_NULL();
}
/*
* string_agg_combine
* Aggregate combine function for string_agg(text) and string_agg(bytea)
*/
Datum
string_agg_combine(PG_FUNCTION_ARGS)
{
StringInfo state1;
StringInfo state2;
MemoryContext agg_context;
if (!AggCheckCallContext(fcinfo, &agg_context))
elog(ERROR, "aggregate function called in non-aggregate context");
state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
if (state2 == NULL)
{
/*
* NULL state2 is easy, just return state1, which we know is already
* in the agg_context
*/
if (state1 == NULL)
PG_RETURN_NULL();
PG_RETURN_POINTER(state1);
}
if (state1 == NULL)
{
/* We must copy state2's data into the agg_context */
MemoryContext old_context;
old_context = MemoryContextSwitchTo(agg_context);
state1 = makeStringAggState(fcinfo);
appendBinaryStringInfo(state1, state2->data, state2->len);
state1->cursor = state2->cursor;
MemoryContextSwitchTo(old_context);
}
else if (state2->len > 0)
{
/* Combine ... state1->cursor does not change in this case */
appendBinaryStringInfo(state1, state2->data, state2->len);
}
PG_RETURN_POINTER(state1);
}
/*
* string_agg_serialize
* Aggregate serialize function for string_agg(text) and string_agg(bytea)
*
* This is strict, so we need not handle NULL input
*/
Datum
string_agg_serialize(PG_FUNCTION_ARGS)
{
StringInfo state;
StringInfoData buf;
bytea *result;
/* cannot be called directly because of internal-type argument */
Assert(AggCheckCallContext(fcinfo, NULL));
state = (StringInfo) PG_GETARG_POINTER(0);
pq_begintypsend(&buf);
/* cursor */
pq_sendint(&buf, state->cursor, 4);
/* data */
pq_sendbytes(&buf, state->data, state->len);
result = pq_endtypsend(&buf);
PG_RETURN_BYTEA_P(result);
}
/*
* string_agg_deserialize
* Aggregate deserial function for string_agg(text) and string_agg(bytea)
*
* This is strict, so we need not handle NULL input
*/
Datum
string_agg_deserialize(PG_FUNCTION_ARGS)
{
bytea *sstate;
StringInfo result;
StringInfoData buf;
char *data;
int datalen;
/* cannot be called directly because of internal-type argument */
Assert(AggCheckCallContext(fcinfo, NULL));
sstate = PG_GETARG_BYTEA_PP(0);
/*
* Copy the bytea into a StringInfo so that we can "receive" it using the
* standard recv-function infrastructure.
*/
initStringInfo(&buf);
appendBinaryStringInfo(&buf,
VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate));
result = makeStringAggState(fcinfo);
/* cursor */
result->cursor = pq_getmsgint(&buf, 4);
/* data */
datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
data = (char *) pq_getmsgbytes(&buf, datalen);
appendBinaryStringInfo(result, data, datalen);
pq_getmsgend(&buf);
pfree(buf.data);
PG_RETURN_POINTER(result);
}
Datum
string_agg_finalfn(PG_FUNCTION_ARGS)
{
StringInfo state;
/* cannot be called directly because of internal-type argument */
Assert(AggCheckCallContext(fcinfo, NULL));
state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
if (state != NULL)
{
/* As per comment in transfn, strip data before the cursor position */
PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
state->len - state->cursor));
}
else
PG_RETURN_NULL();
}
/*
* Prepare cache with fmgr info for the output functions of the datatypes of
* the arguments of a concat-like function, beginning with argument "argidx".
* (Arguments before that will have corresponding slots in the resulting
* FmgrInfo array, but we don't fill those slots.)
*/
static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
{
FmgrInfo *foutcache;
int i;
/* We keep the info in fn_mcxt so it survives across calls */
foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
PG_NARGS() * sizeof(FmgrInfo));
for (i = argidx; i < PG_NARGS(); i++)
{
Oid valtype;
Oid typOutput;
bool typIsVarlena;
valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
if (!OidIsValid(valtype))
elog(ERROR, "could not determine data type of concat() input");
getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
}
fcinfo->flinfo->fn_extra = foutcache;
return foutcache;
}
/*
* Implementation of both concat() and concat_ws().
*
* sepstr is the separator string to place between values.
* argidx identifies the first argument to concatenate (counting from zero);
* note that this must be constant across any one series of calls.
*
* Returns NULL if result should be NULL, else text value.
*/
static text *
concat_internal(const char *sepstr, int argidx,
FunctionCallInfo fcinfo)
{
text *result;
StringInfoData str;
FmgrInfo *foutcache;
bool first_arg = true;
int i;
/*
* concat(VARIADIC some-array) is essentially equivalent to
* array_to_text(), ie concat the array elements with the given separator.
* So we just pass the case off to that code.
*/
if (get_fn_expr_variadic(fcinfo->flinfo))
{
ArrayType *arr;
/* Should have just the one argument */
Assert(argidx == PG_NARGS() - 1);
/* concat(VARIADIC NULL) is defined as NULL */
if (PG_ARGISNULL(argidx))
return NULL;
/*
* Non-null argument had better be an array. We assume that any call
* context that could let get_fn_expr_variadic return true will have
* checked that a VARIADIC-labeled parameter actually is an array. So
* it should be okay to just Assert that it's an array rather than
* doing a full-fledged error check.
*/
Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
/* OK, safe to fetch the array value */
arr = PG_GETARG_ARRAYTYPE_P(argidx);
/*
* And serialize the array. We tell array_to_text to ignore null
* elements, which matches the behavior of the loop below.
*/
return array_to_text_internal(fcinfo, arr, sepstr, NULL);
}
/* Normal case without explicit VARIADIC marker */
initStringInfo(&str);
/* Get output function info, building it if first time through */
foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
if (foutcache == NULL)
foutcache = build_concat_foutcache(fcinfo, argidx);
for (i = argidx; i < PG_NARGS(); i++)
{
if (!PG_ARGISNULL(i))
{
Datum value = PG_GETARG_DATUM(i);
/* add separator if appropriate */
if (first_arg)
first_arg = false;
else
appendStringInfoString(&str, sepstr);
/* call the appropriate type output function, append the result */
appendStringInfoString(&str,
OutputFunctionCall(&foutcache[i], value));
}
}
result = cstring_to_text_with_len(str.data, str.len);
pfree(str.data);
return result;
}
/*
* Concatenate all arguments. NULL arguments are ignored.
*/
Datum
text_concat(PG_FUNCTION_ARGS)
{
text *result;
result = concat_internal("", 0, fcinfo);
if (result == NULL)
PG_RETURN_NULL();
PG_RETURN_TEXT_P(result);
}
/*
* Concatenate all but first argument value with separators. The first
* parameter is used as the separator. NULL arguments are ignored.
*/
Datum
text_concat_ws(PG_FUNCTION_ARGS)
{
char *sep;
text *result;
/* return NULL when separator is NULL */
if (PG_ARGISNULL(0))
PG_RETURN_NULL();
sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
result = concat_internal(sep, 1, fcinfo);
if (result == NULL)
PG_RETURN_NULL();
PG_RETURN_TEXT_P(result);
}
/*
* Return first n characters in the string. When n is negative,
* return all but last |n| characters.
*/
Datum
text_left(PG_FUNCTION_ARGS)
{
int n = PG_GETARG_INT32(1);
if (n < 0)
{
text *str = PG_GETARG_TEXT_PP(0);
const char *p = VARDATA_ANY(str);
int len = VARSIZE_ANY_EXHDR(str);
int rlen;
n = pg_mbstrlen_with_len(p, len) + n;
rlen = pg_mbcharcliplen(p, len, n);
PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
}
else
PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
}
/*
* Return last n characters in the string. When n is negative,
* return all but first |n| characters.
*/
Datum
text_right(PG_FUNCTION_ARGS)
{
text *str = PG_GETARG_TEXT_PP(0);
const char *p = VARDATA_ANY(str);
int len = VARSIZE_ANY_EXHDR(str);
int n = PG_GETARG_INT32(1);
int off;
if (n < 0)
n = -n;
else
n = pg_mbstrlen_with_len(p, len) - n;
off = pg_mbcharcliplen(p, len, n);
PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
}
/*
* Return reversed string
*/
Datum
text_reverse(PG_FUNCTION_ARGS)
{
text *str = PG_GETARG_TEXT_PP(0);
const char *p = VARDATA_ANY(str);
int len = VARSIZE_ANY_EXHDR(str);
const char *endp = p + len;
text *result;
char *dst;
result = palloc(len + VARHDRSZ);
dst = (char *) VARDATA(result) + len;
SET_VARSIZE(result, len + VARHDRSZ);
if (pg_database_encoding_max_length() > 1)
{
/* multibyte version */
while (p < endp)
{
int sz;
sz = pg_mblen(p);
dst -= sz;
memcpy(dst, p, sz);
p += sz;
}
}
else
{
/* single byte version */
while (p < endp)
*(--dst) = *p++;
}
PG_RETURN_TEXT_P(result);
}
/*
* Support macros for text_format()
*/
#define TEXT_FORMAT_FLAG_MINUS 0x0001 /* is minus flag present? */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
do { \
if (++(ptr) >= (end_ptr)) \
ereport(ERROR, \
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
errmsg("unterminated format() type specifier"), \
errhint("For a single \"%%\" use \"%%%%\"."))); \
} while (0)
/*
* Returns a formatted string
*/
Datum
text_format(PG_FUNCTION_ARGS)
{
text *fmt;
StringInfoData str;
const char *cp;
const char *start_ptr;
const char *end_ptr;
text *result;
int arg;
bool funcvariadic;
int nargs;
Datum *elements = NULL;
bool *nulls = NULL;
Oid element_type = InvalidOid;
Oid prev_type = InvalidOid;
Oid prev_width_type = InvalidOid;
FmgrInfo typoutputfinfo;
FmgrInfo typoutputinfo_width;
/* When format string is null, immediately return null */
if (PG_ARGISNULL(0))
PG_RETURN_NULL();
/* If argument is marked VARIADIC, expand array into elements */
if (get_fn_expr_variadic(fcinfo->flinfo))
{
ArrayType *arr;
int16 elmlen;
bool elmbyval;
char elmalign;
int nitems;
/* Should have just the one argument */
Assert(PG_NARGS() == 2);
/* If argument is NULL, we treat it as zero-length array */
if (PG_ARGISNULL(1))
nitems = 0;
else
{
/*
* Non-null argument had better be an array. We assume that any
* call context that could let get_fn_expr_variadic return true
* will have checked that a VARIADIC-labeled parameter actually is
* an array. So it should be okay to just Assert that it's an
* array rather than doing a full-fledged error check.
*/
Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
/* OK, safe to fetch the array value */
arr = PG_GETARG_ARRAYTYPE_P(1);
/* Get info about array element type */
element_type = ARR_ELEMTYPE(arr);
get_typlenbyvalalign(element_type,
&elmlen, &elmbyval, &elmalign);
/* Extract all array elements */
deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
&elements, &nulls, &nitems);
}
nargs = nitems + 1;
funcvariadic = true;
}
else
{
/* Non-variadic case, we'll process the arguments individually */
nargs = PG_NARGS();
funcvariadic = false;
}
/* Setup for main loop. */
fmt = PG_GETARG_TEXT_PP(0);
start_ptr = VARDATA_ANY(fmt);
end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
initStringInfo(&str);
arg = 1; /* next argument position to print */
/* Scan format string, looking for conversion specifiers. */
for (cp = start_ptr; cp < end_ptr; cp++)
{
int argpos;
int widthpos;
int flags;
int width;
Datum value;
bool isNull;
Oid typid;
/*
* If it's not the start of a conversion specifier, just copy it to
* the output buffer.
*/
if (*cp != '%')
{
appendStringInfoCharMacro(&str, *cp);
continue;
}
ADVANCE_PARSE_POINTER(cp, end_ptr);
/* Easy case: %% outputs a single % */
if (*cp == '%')
{
appendStringInfoCharMacro(&str, *cp);
continue;
}
/* Parse the optional portions of the format specifier */
cp = text_format_parse_format(cp, end_ptr,
&argpos, &widthpos,
&flags, &width);
/*
* Next we should see the main conversion specifier. Whether or not
* an argument position was present, it's known that at least one
* character remains in the string at this point. Experience suggests
* that it's worth checking that that character is one of the expected
* ones before we try to fetch arguments, so as to produce the least
* confusing response to a mis-formatted specifier.
*/
if (strchr("sIL", *cp) == NULL)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
pg_mblen(cp), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
/* If indirect width was specified, get its value */
if (widthpos >= 0)
{
/* Collect the specified or next argument position */
if (widthpos > 0)
arg = widthpos;
if (arg >= nargs)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("too few arguments for format()")));
/* Get the value and type of the selected argument */
if (!funcvariadic)
{
value = PG_GETARG_DATUM(arg);
isNull = PG_ARGISNULL(arg);
typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
}
else
{
value = elements[arg - 1];
isNull = nulls[arg - 1];
typid = element_type;
}
if (!OidIsValid(typid))
elog(ERROR, "could not determine data type of format() input");
arg++;
/* We can treat NULL width the same as zero */
if (isNull)
width = 0;
else if (typid == INT4OID)
width = DatumGetInt32(value);
else if (typid == INT2OID)
width = DatumGetInt16(value);
else
{
/* For less-usual datatypes, convert to text then to int */
char *str;
if (typid != prev_width_type)
{
Oid typoutputfunc;
bool typIsVarlena;
getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
fmgr_info(typoutputfunc, &typoutputinfo_width);
prev_width_type = typid;
}
str = OutputFunctionCall(&typoutputinfo_width, value);
/* pg_strtoint32 will complain about bad data or overflow */
width = pg_strtoint32(str);
pfree(str);
}
}
/* Collect the specified or next argument position */
if (argpos > 0)
arg = argpos;
if (arg >= nargs)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("too few arguments for format()")));
/* Get the value and type of the selected argument */
if (!funcvariadic)
{
value = PG_GETARG_DATUM(arg);
isNull = PG_ARGISNULL(arg);
typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
}
else
{
value = elements[arg - 1];
isNull = nulls[arg - 1];
typid = element_type;
}
if (!OidIsValid(typid))
elog(ERROR, "could not determine data type of format() input");
arg++;
/*
* Get the appropriate typOutput function, reusing previous one if
* same type as previous argument. That's particularly useful in the
* variadic-array case, but often saves work even for ordinary calls.
*/
if (typid != prev_type)
{
Oid typoutputfunc;
bool typIsVarlena;
getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
fmgr_info(typoutputfunc, &typoutputfinfo);
prev_type = typid;
}
/*
* And now we can format the value.
*/
switch (*cp)
{
case 's':
case 'I':
case 'L':
text_format_string_conversion(&str, *cp, &typoutputfinfo,
value, isNull,
flags, width);
break;
default:
/* should not get here, because of previous check */
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
pg_mblen(cp), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
break;
}
}
/* Don't need deconstruct_array results anymore. */
if (elements != NULL)
pfree(elements);
if (nulls != NULL)
pfree(nulls);
/* Generate results. */
result = cstring_to_text_with_len(str.data, str.len);
pfree(str.data);
PG_RETURN_TEXT_P(result);
}
/*
* Parse contiguous digits as a decimal number.
*
* Returns true if some digits could be parsed.
* The value is returned into *value, and *ptr is advanced to the next
* character to be parsed.
*
* Note parsing invariant: at least one character is known available before
* string end (end_ptr) at entry, and this is still true at exit.
*/
static bool
text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
{
bool found = false;
const char *cp = *ptr;
int val = 0;
while (*cp >= '0' && *cp <= '9')
{
int8 digit = (*cp - '0');
if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
unlikely(pg_add_s32_overflow(val, digit, &val)))
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("number is out of range")));
ADVANCE_PARSE_POINTER(cp, end_ptr);
found = true;
}
*ptr = cp;
*value = val;
return found;
}
/*
* Parse a format specifier (generally following the SUS printf spec).
*
* We have already advanced over the initial '%', and we are looking for
* [argpos][flags][width]type (but the type character is not consumed here).
*
* Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
* Output parameters:
* argpos: argument position for value to be printed. -1 means unspecified.
* widthpos: argument position for width. Zero means the argument position
* was unspecified (ie, take the next arg) and -1 means no width
* argument (width was omitted or specified as a constant).
* flags: bitmask of flags.
* width: directly-specified width value. Zero means the width was omitted
* (note it's not necessary to distinguish this case from an explicit
* zero width value).
*
* The function result is the next character position to be parsed, ie, the
* location where the type character is/should be.
*
* Note parsing invariant: at least one character is known available before
* string end (end_ptr) at entry, and this is still true at exit.
*/
static const char *
text_format_parse_format(const char *start_ptr, const char *end_ptr,
int *argpos, int *widthpos,
int *flags, int *width)
{
const char *cp = start_ptr;
int n;
/* set defaults for output parameters */
*argpos = -1;
*widthpos = -1;
*flags = 0;
*width = 0;
/* try to identify first number */
if (text_format_parse_digits(&cp, end_ptr, &n))
{
if (*cp != '$')
{
/* Must be just a width and a type, so we're done */
*width = n;
return cp;
}
/* The number was argument position */
*argpos = n;
/* Explicit 0 for argument index is immediately refused */
if (n == 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("format specifies argument 0, but arguments are numbered from 1")));
ADVANCE_PARSE_POINTER(cp, end_ptr);
}
/* Handle flags (only minus is supported now) */
while (*cp == '-')
{
*flags |= TEXT_FORMAT_FLAG_MINUS;
ADVANCE_PARSE_POINTER(cp, end_ptr);
}
if (*cp == '*')
{
/* Handle indirect width */
ADVANCE_PARSE_POINTER(cp, end_ptr);
if (text_format_parse_digits(&cp, end_ptr, &n))
{
/* number in this position must be closed by $ */
if (*cp != '$')
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("width argument position must be ended by \"$\"")));
/* The number was width argument position */
*widthpos = n;
/* Explicit 0 for argument index is immediately refused */
if (n == 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("format specifies argument 0, but arguments are numbered from 1")));
ADVANCE_PARSE_POINTER(cp, end_ptr);
}
else
*widthpos = 0; /* width's argument position is unspecified */
}
else
{
/* Check for direct width specification */
if (text_format_parse_digits(&cp, end_ptr, &n))
*width = n;
}
/* cp should now be pointing at type character */
return cp;
}
/*
* Format a %s, %I, or %L conversion
*/
static void
text_format_string_conversion(StringInfo buf, char conversion,
FmgrInfo *typOutputInfo,
Datum value, bool isNull,
int flags, int width)
{
char *str;
/* Handle NULL arguments before trying to stringify the value. */
if (isNull)
{
if (conversion == 's')
text_format_append_string(buf, "", flags, width);
else if (conversion == 'L')
text_format_append_string(buf, "NULL", flags, width);
else if (conversion == 'I')
ereport(ERROR,
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("null values cannot be formatted as an SQL identifier")));
return;
}
/* Stringify. */
str = OutputFunctionCall(typOutputInfo, value);
/* Escape. */
if (conversion == 'I')
{
/* quote_identifier may or may not allocate a new string. */
text_format_append_string(buf, quote_identifier(str), flags, width);
}
else if (conversion == 'L')
{
char *qstr = quote_literal_cstr(str);
text_format_append_string(buf, qstr, flags, width);
/* quote_literal_cstr() always allocates a new string */
pfree(qstr);
}
else
text_format_append_string(buf, str, flags, width);
/* Cleanup. */
pfree(str);
}
/*
* Append str to buf, padding as directed by flags/width
*/
static void
text_format_append_string(StringInfo buf, const char *str,
int flags, int width)
{
bool align_to_left = false;
int len;
/* fast path for typical easy case */
if (width == 0)
{
appendStringInfoString(buf, str);
return;
}
if (width < 0)
{
/* Negative width: implicit '-' flag, then take absolute value */
align_to_left = true;
/* -INT_MIN is undefined */
if (width <= INT_MIN)
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("number is out of range")));
width = -width;
}
else if (flags & TEXT_FORMAT_FLAG_MINUS)
align_to_left = true;
len = pg_mbstrlen(str);
if (align_to_left)
{
/* left justify */
appendStringInfoString(buf, str);
if (len < width)
appendStringInfoSpaces(buf, width - len);
}
else
{
/* right justify */
if (len < width)
appendStringInfoSpaces(buf, width - len);
appendStringInfoString(buf, str);
}
}
/*
* text_format_nv - nonvariadic wrapper for text_format function.
*
* note: this wrapper is necessary to pass the sanity check in opr_sanity,
* which checks that all built-in functions that share the implementing C
* function take the same number of arguments.
*/
Datum
text_format_nv(PG_FUNCTION_ARGS)
{
return text_format(fcinfo);
}
/*
* Helper function for Levenshtein distance functions. Faster than memcmp(),
* for this use case.
*/
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
while (len > 0)
{
len--;
if (s1[len] != s2[len])
return false;
}
return true;
}
/* Expand each Levenshtein distance variant */
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
/*
* The following *ClosestMatch() functions can be used to determine whether a
* user-provided string resembles any known valid values, which is useful for
* providing hints in log messages, among other things. Use these functions
* like so:
*
* initClosestMatch(&state, source_string, max_distance);
*
* for (int i = 0; i < num_valid_strings; i++)
* updateClosestMatch(&state, valid_strings[i]);
*
* closestMatch = getClosestMatch(&state);
*/
/*
* Initialize the given state with the source string and maximum Levenshtein
* distance to consider.
*/
void
initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
{
Assert(state);
Assert(max_d >= 0);
state->source = source;
state->min_d = -1;
state->max_d = max_d;
state->match = NULL;
}
/*
* If the candidate string is a closer match than the current one saved (or
* there is no match saved), save it as the closest match.
*
* If the source or candidate string is NULL, empty, or too long, this function
* takes no action. Likewise, if the Levenshtein distance exceeds the maximum
* allowed or more than half the characters are different, no action is taken.
*/
void
updateClosestMatch(ClosestMatchState *state, const char *candidate)
{
int dist;
Assert(state);
if (state->source == NULL || state->source[0] == '\0' ||
candidate == NULL || candidate[0] == '\0')
return;
/*
* To avoid ERROR-ing, we check the lengths here instead of setting
* 'trusted' to false in the call to varstr_levenshtein_less_equal().
*/
if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
return;
dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
candidate, strlen(candidate), 1, 1, 1,
state->max_d, true);
if (dist <= state->max_d &&
dist <= strlen(state->source) / 2 &&
(state->min_d == -1 || dist < state->min_d))
{
state->min_d = dist;
state->match = candidate;
}
}
/*
* Return the closest match. If no suitable candidates were provided via
* updateClosestMatch(), return NULL.
*/
const char *
getClosestMatch(ClosestMatchState *state)
{
Assert(state);
return state->match;
}
/*
* Unicode support
*/
static UnicodeNormalizationForm
unicode_norm_form_from_string(const char *formstr)
{
UnicodeNormalizationForm form = -1;
/*
* Might as well check this while we're here.
*/
if (GetDatabaseEncoding() != PG_UTF8)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
if (pg_strcasecmp(formstr, "NFC") == 0)
form = UNICODE_NFC;
else if (pg_strcasecmp(formstr, "NFD") == 0)
form = UNICODE_NFD;
else if (pg_strcasecmp(formstr, "NFKC") == 0)
form = UNICODE_NFKC;
else if (pg_strcasecmp(formstr, "NFKD") == 0)
form = UNICODE_NFKD;
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid normalization form: %s", formstr)));
return form;
}
Datum
unicode_normalize_func(PG_FUNCTION_ARGS)
{
text *input = PG_GETARG_TEXT_PP(0);
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
pg_wchar *input_chars;
pg_wchar *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
/* convert to pg_wchar */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
input_chars = palloc((size + 1) * sizeof(pg_wchar));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
input_chars[i] = (pg_wchar) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
output_chars = unicode_normalize(form, input_chars);
/* convert back to UTF-8 string */
size = 0;
for (pg_wchar *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
unicode_to_utf8(*wp, buf);
size += pg_utf_mblen(buf);
}
result = palloc(size + VARHDRSZ);
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
for (pg_wchar *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
}
Assert((char *) p == (char *) result + size + VARHDRSZ);
PG_RETURN_TEXT_P(result);
}
/*
* Check whether the string is in the specified Unicode normalization form.
*
* This is done by converting the string to the specified normal form and then
* comparing that to the original string. To speed that up, we also apply the
* "quick check" algorithm specified in UAX #15, which can give a yes or no
* answer for many strings by just scanning the string once.
*
* This function should generally be optimized for the case where the string
* is in fact normalized. In that case, we'll end up looking at the entire
* string, so it's probably not worth doing any incremental conversion etc.
*/
Datum
unicode_is_normalized(PG_FUNCTION_ARGS)
{
text *input = PG_GETARG_TEXT_PP(0);
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
pg_wchar *input_chars;
pg_wchar *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
int output_size;
bool result;
form = unicode_norm_form_from_string(formstr);
/* convert to pg_wchar */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
input_chars = palloc((size + 1) * sizeof(pg_wchar));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
input_chars[i] = (pg_wchar) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
if (quickcheck == UNICODE_NORM_QC_YES)
PG_RETURN_BOOL(true);
else if (quickcheck == UNICODE_NORM_QC_NO)
PG_RETURN_BOOL(false);
/* normalize and compare with original */
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
for (pg_wchar *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
PG_RETURN_BOOL(result);
}
/*
* Check if first n chars are hexadecimal digits
*/
static bool
isxdigits_n(const char *instr, size_t n)
{
for (size_t i = 0; i < n; i++)
if (!isxdigit((unsigned char) instr[i]))
return false;
return true;
}
static unsigned int
hexval(unsigned char c)
{
if (c >= '0' && c <= '9')
return c - '0';
if (c >= 'a' && c <= 'f')
return c - 'a' + 0xA;
if (c >= 'A' && c <= 'F')
return c - 'A' + 0xA;
elog(ERROR, "invalid hexadecimal digit");
return 0; /* not reached */
}
/*
* Translate string with hexadecimal digits to number
*/
static unsigned int
hexval_n(const char *instr, size_t n)
{
unsigned int result = 0;
for (size_t i = 0; i < n; i++)
result += hexval(instr[i]) << (4 * (n - i - 1));
return result;
}
/*
* Replaces Unicode escape sequences by Unicode characters
*/
Datum
unistr(PG_FUNCTION_ARGS)
{
text *input_text = PG_GETARG_TEXT_PP(0);
char *instr;
int len;
StringInfoData str;
text *result;
pg_wchar pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
len = VARSIZE_ANY_EXHDR(input_text);
initStringInfo(&str);
while (len > 0)
{
if (instr[0] == '\\')
{
if (len >= 2 &&
instr[1] == '\\')
{
if (pair_first)
goto invalid_pair;
appendStringInfoChar(&str, '\\');
instr += 2;
len -= 2;
}
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
pg_wchar unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
if (!is_valid_unicode_codepoint(unicode))
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid Unicode code point: %04X", unicode));
if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
goto invalid_pair;
}
else if (is_utf16_surrogate_second(unicode))
goto invalid_pair;
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
appendStringInfoString(&str, cbuf);
}
instr += 4 + offset;
len -= 4 + offset;
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
pg_wchar unicode;
unicode = hexval_n(instr + 2, 6);
if (!is_valid_unicode_codepoint(unicode))
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid Unicode code point: %04X", unicode));
if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
goto invalid_pair;
}
else if (is_utf16_surrogate_second(unicode))
goto invalid_pair;
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
appendStringInfoString(&str, cbuf);
}
instr += 8;
len -= 8;
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
pg_wchar unicode;
unicode = hexval_n(instr + 2, 8);
if (!is_valid_unicode_codepoint(unicode))
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid Unicode code point: %04X", unicode));
if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
goto invalid_pair;
}
else if (is_utf16_surrogate_second(unicode))
goto invalid_pair;
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
pg_unicode_to_server(unicode, (unsigned char *) cbuf);
appendStringInfoString(&str, cbuf);
}
instr += 10;
len -= 10;
}
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("invalid Unicode escape"),
errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
}
else
{
if (pair_first)
goto invalid_pair;
appendStringInfoChar(&str, *instr++);
len--;
}
}
/* unfinished surrogate pair? */
if (pair_first)
goto invalid_pair;
result = cstring_to_text_with_len(str.data, str.len);
pfree(str.data);
PG_RETURN_TEXT_P(result);
invalid_pair:
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("invalid Unicode surrogate pair")));
PG_RETURN_NULL(); /* keep compiler quiet */
}