postgresql/src/backend/utils/adt/varchar.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1235 lines
28 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* varchar.c
* Functions for the built-in types char(n) and varchar(n).
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/utils/adt/varchar.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/detoast.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "libpq/pqformat.h"
#include "mb/pg_wchar.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "utils/array.h"
1999-07-16 07:00:38 +02:00
#include "utils/builtins.h"
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
#include "utils/lsyscache.h"
#include "utils/pg_locale.h"
#include "utils/varlena.h"
/* common code for bpchartypmodin and varchartypmodin */
static int32
anychar_typmodin(ArrayType *ta, const char *typename)
{
int32 typmod;
int32 *tl;
int n;
tl = ArrayGetIntegerTypmods(ta, &n);
/*
* we're not too tense about good error message here because grammar
* shouldn't allow wrong number of modifiers for CHAR
*/
if (n != 1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid type modifier")));
if (*tl < 1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("length for type %s must be at least 1", typename)));
if (*tl > MaxAttrSize)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("length for type %s cannot exceed %d",
typename, MaxAttrSize)));
/*
* For largely historical reasons, the typmod is VARHDRSZ plus the number
* of characters; there is enough client-side code that knows about that
* that we'd better not change it.
*/
typmod = VARHDRSZ + *tl;
return typmod;
}
/* common code for bpchartypmodout and varchartypmodout */
static char *
anychar_typmodout(int32 typmod)
{
char *res = (char *) palloc(64);
if (typmod > VARHDRSZ)
snprintf(res, 64, "(%d)", (int) (typmod - VARHDRSZ));
else
*res = '\0';
return res;
}
/*
* CHAR() and VARCHAR() types are part of the SQL standard. CHAR()
* is for blank-padded string whose length is specified in CREATE TABLE.
* VARCHAR is for storing string whose length is at most the length specified
* at CREATE TABLE time.
*
* It's hard to implement these types because we cannot figure out
* the length of the type from the type itself. I changed (hopefully all) the
* fmgr calls that invoke input functions of a data type to supply the
* length also. (eg. in INSERTs, we have the tupleDescriptor which contains
* the length of the attributes and hence the exact length of the char() or
* varchar(). We pass this to bpcharin() or varcharin().) In the case where
* we cannot determine the length, we pass in -1 instead and the input
* converter does not enforce any length check.
*
* We actually implement this as a varlena so that we don't have to pass in
* the length for the comparison functions. (The difference between these
* types and "text" is that we truncate and possibly blank-pad the string
* at insertion time.)
*
* - ay 6/95
*/
/*****************************************************************************
* bpchar - char() *
*****************************************************************************/
/*
* bpchar_input -- common guts of bpcharin and bpcharrecv
*
* s is the input text of length len (may not be null-terminated)
* atttypmod is the typmod value to apply
*
* Note that atttypmod is measured in characters, which
* is not necessarily the same as the number of bytes.
*
* If the input string is too long, raise an error, unless the extra
* characters are spaces, in which case they're truncated. (per SQL)
*/
static BpChar *
bpchar_input(const char *s, size_t len, int32 atttypmod)
{
BpChar *result;
char *r;
size_t maxlen;
/* If typmod is -1 (or invalid), use the actual string length */
if (atttypmod < (int32) VARHDRSZ)
maxlen = len;
else
{
size_t charlen; /* number of CHARACTERS in the input */
maxlen = atttypmod - VARHDRSZ;
charlen = pg_mbstrlen_with_len(s, len);
if (charlen > maxlen)
{
/* Verify that extra characters are spaces, and clip them off */
size_t mbmaxlen = pg_mbcharcliplen(s, len, maxlen);
size_t j;
/*
* at this point, len is the actual BYTE length of the input
* string, maxlen is the max number of CHARACTERS allowed for this
* bpchar type, mbmaxlen is the length in BYTES of those chars.
*/
for (j = mbmaxlen; j < len; j++)
{
if (s[j] != ' ')
ereport(ERROR,
(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
errmsg("value too long for type character(%d)",
(int) maxlen)));
}
/*
* Now we set maxlen to the necessary byte length, not the number
* of CHARACTERS!
*/
maxlen = len = mbmaxlen;
}
else
{
/*
* Now we set maxlen to the necessary byte length, not the number
* of CHARACTERS!
*/
maxlen = len + (maxlen - charlen);
}
}
result = (BpChar *) palloc(maxlen + VARHDRSZ);
SET_VARSIZE(result, maxlen + VARHDRSZ);
r = VARDATA(result);
memcpy(r, s, len);
/* blank pad the string if necessary */
if (maxlen > len)
memset(r + len, ' ', maxlen - len);
return result;
}
/*
* Convert a C string to CHARACTER internal representation. atttypmod
* is the declared length of the type plus VARHDRSZ.
*/
Datum
bpcharin(PG_FUNCTION_ARGS)
{
char *s = PG_GETARG_CSTRING(0);
2005-10-15 04:49:52 +02:00
#ifdef NOT_USED
Oid typelem = PG_GETARG_OID(1);
#endif
int32 atttypmod = PG_GETARG_INT32(2);
BpChar *result;
result = bpchar_input(s, strlen(s), atttypmod);
PG_RETURN_BPCHAR_P(result);
}
/*
* Convert a CHARACTER value to a C string.
*
* Uses the text conversion functions, which is only appropriate if BpChar
* and text are equivalent types.
*/
Datum
bpcharout(PG_FUNCTION_ARGS)
{
Datum txt = PG_GETARG_DATUM(0);
PG_RETURN_CSTRING(TextDatumGetCString(txt));
}
/*
* bpcharrecv - converts external binary format to bpchar
*/
Datum
bpcharrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
2005-10-15 04:49:52 +02:00
#ifdef NOT_USED
Oid typelem = PG_GETARG_OID(1);
#endif
int32 atttypmod = PG_GETARG_INT32(2);
BpChar *result;
char *str;
int nbytes;
str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
result = bpchar_input(str, nbytes, atttypmod);
pfree(str);
PG_RETURN_BPCHAR_P(result);
}
/*
* bpcharsend - converts bpchar to binary format
*/
Datum
bpcharsend(PG_FUNCTION_ARGS)
{
/* Exactly the same as textsend, so share code */
return textsend(fcinfo);
}
/*
* Converts a CHARACTER type to the specified size.
*
* maxlen is the typmod, ie, declared length plus VARHDRSZ bytes.
* isExplicit is true if this is for an explicit cast to char(N).
*
* Truncation rules: for an explicit cast, silently truncate to the given
* length; for an implicit cast, raise error unless extra characters are
* all spaces. (This is sort-of per SQL: the spec would actually have us
* raise a "completion condition" for the explicit cast case, but Postgres
* hasn't got such a concept.)
*/
Datum
bpchar(PG_FUNCTION_ARGS)
{
BpChar *source = PG_GETARG_BPCHAR_PP(0);
int32 maxlen = PG_GETARG_INT32(1);
bool isExplicit = PG_GETARG_BOOL(2);
BpChar *result;
int32 len;
char *r;
char *s;
int i;
2005-07-29 14:59:15 +02:00
int charlen; /* number of characters in the input string +
* VARHDRSZ */
2002-09-04 22:31:48 +02:00
/* No work if typmod is invalid */
if (maxlen < (int32) VARHDRSZ)
PG_RETURN_BPCHAR_P(source);
maxlen -= VARHDRSZ;
len = VARSIZE_ANY_EXHDR(source);
s = VARDATA_ANY(source);
charlen = pg_mbstrlen_with_len(s, len);
/* No work if supplied data matches typmod already */
if (charlen == maxlen)
PG_RETURN_BPCHAR_P(source);
if (charlen > maxlen)
{
/* Verify that extra characters are spaces, and clip them off */
size_t maxmblen;
maxmblen = pg_mbcharcliplen(s, len, maxlen);
1999-05-25 18:15:34 +02:00
if (!isExplicit)
{
for (i = maxmblen; i < len; i++)
if (s[i] != ' ')
ereport(ERROR,
(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
errmsg("value too long for type character(%d)",
maxlen)));
}
2001-03-22 05:01:46 +01:00
len = maxmblen;
/*
* At this point, maxlen is the necessary byte length, not the number
* of CHARACTERS!
*/
maxlen = len;
}
else
{
/*
* At this point, maxlen is the necessary byte length, not the number
* of CHARACTERS!
*/
maxlen = len + (maxlen - charlen);
}
Assert(maxlen >= len);
result = palloc(maxlen + VARHDRSZ);
SET_VARSIZE(result, maxlen + VARHDRSZ);
r = VARDATA(result);
memcpy(r, s, len);
/* blank pad the string if necessary */
if (maxlen > len)
memset(r + len, ' ', maxlen - len);
PG_RETURN_BPCHAR_P(result);
}
/* char_bpchar()
* Convert char to bpchar(1).
*/
Datum
char_bpchar(PG_FUNCTION_ARGS)
{
char c = PG_GETARG_CHAR(0);
BpChar *result;
result = (BpChar *) palloc(VARHDRSZ + 1);
SET_VARSIZE(result, VARHDRSZ + 1);
*(VARDATA(result)) = c;
PG_RETURN_BPCHAR_P(result);
}
/* bpchar_name()
* Converts a bpchar() type to a NameData type.
*/
Datum
bpchar_name(PG_FUNCTION_ARGS)
{
BpChar *s = PG_GETARG_BPCHAR_PP(0);
char *s_data;
Name result;
int len;
len = VARSIZE_ANY_EXHDR(s);
s_data = VARDATA_ANY(s);
/* Truncate oversize input */
if (len >= NAMEDATALEN)
len = pg_mbcliplen(s_data, len, NAMEDATALEN - 1);
/* Remove trailing blanks */
while (len > 0)
{
if (s_data[len - 1] != ' ')
break;
len--;
}
/* We use palloc0 here to ensure result is zero-padded */
result = (Name) palloc0(NAMEDATALEN);
memcpy(NameStr(*result), s_data, len);
PG_RETURN_NAME(result);
}
/* name_bpchar()
* Converts a NameData type to a bpchar type.
*
* Uses the text conversion functions, which is only appropriate if BpChar
* and text are equivalent types.
*/
Datum
name_bpchar(PG_FUNCTION_ARGS)
{
Name s = PG_GETARG_NAME(0);
BpChar *result;
result = (BpChar *) cstring_to_text(NameStr(*s));
PG_RETURN_BPCHAR_P(result);
}
Datum
bpchartypmodin(PG_FUNCTION_ARGS)
{
ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0);
PG_RETURN_INT32(anychar_typmodin(ta, "char"));
}
Datum
bpchartypmodout(PG_FUNCTION_ARGS)
{
int32 typmod = PG_GETARG_INT32(0);
PG_RETURN_CSTRING(anychar_typmodout(typmod));
}
/*****************************************************************************
* varchar - varchar(n)
*
* Note: varchar piggybacks on type text for most operations, and so has no
* C-coded functions except for I/O and typmod checking.
*****************************************************************************/
/*
* varchar_input -- common guts of varcharin and varcharrecv
*
* s is the input text of length len (may not be null-terminated)
* atttypmod is the typmod value to apply
*
* Note that atttypmod is measured in characters, which
* is not necessarily the same as the number of bytes.
*
* If the input string is too long, raise an error, unless the extra
* characters are spaces, in which case they're truncated. (per SQL)
*
* Uses the C string to text conversion function, which is only appropriate
* if VarChar and text are equivalent types.
*/
static VarChar *
varchar_input(const char *s, size_t len, int32 atttypmod)
{
VarChar *result;
size_t maxlen;
maxlen = atttypmod - VARHDRSZ;
if (atttypmod >= (int32) VARHDRSZ && len > maxlen)
{
/* Verify that extra characters are spaces, and clip them off */
size_t mbmaxlen = pg_mbcharcliplen(s, len, maxlen);
size_t j;
for (j = mbmaxlen; j < len; j++)
{
if (s[j] != ' ')
ereport(ERROR,
(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
errmsg("value too long for type character varying(%d)",
(int) maxlen)));
}
len = mbmaxlen;
}
result = (VarChar *) cstring_to_text_with_len(s, len);
return result;
}
/*
* Convert a C string to VARCHAR internal representation. atttypmod
* is the declared length of the type plus VARHDRSZ.
*/
Datum
varcharin(PG_FUNCTION_ARGS)
{
char *s = PG_GETARG_CSTRING(0);
2005-10-15 04:49:52 +02:00
#ifdef NOT_USED
Oid typelem = PG_GETARG_OID(1);
#endif
int32 atttypmod = PG_GETARG_INT32(2);
VarChar *result;
result = varchar_input(s, strlen(s), atttypmod);
PG_RETURN_VARCHAR_P(result);
}
/*
* Convert a VARCHAR value to a C string.
*
* Uses the text to C string conversion function, which is only appropriate
* if VarChar and text are equivalent types.
*/
Datum
varcharout(PG_FUNCTION_ARGS)
{
Datum txt = PG_GETARG_DATUM(0);
PG_RETURN_CSTRING(TextDatumGetCString(txt));
}
/*
* varcharrecv - converts external binary format to varchar
*/
Datum
varcharrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
2005-10-15 04:49:52 +02:00
#ifdef NOT_USED
Oid typelem = PG_GETARG_OID(1);
#endif
int32 atttypmod = PG_GETARG_INT32(2);
VarChar *result;
char *str;
int nbytes;
str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
result = varchar_input(str, nbytes, atttypmod);
pfree(str);
PG_RETURN_VARCHAR_P(result);
}
/*
* varcharsend - converts varchar to binary format
*/
Datum
varcharsend(PG_FUNCTION_ARGS)
{
/* Exactly the same as textsend, so share code */
return textsend(fcinfo);
}
/*
* varchar_support()
*
* Planner support function for the varchar() length coercion function.
*
* Currently, the only interesting thing we can do is flatten calls that set
* the new maximum length >= the previous maximum length. We can ignore the
* isExplicit argument, since that only affects truncation cases.
*/
Datum
varchar_support(PG_FUNCTION_ARGS)
{
Node *rawreq = (Node *) PG_GETARG_POINTER(0);
Node *ret = NULL;
if (IsA(rawreq, SupportRequestSimplify))
{
SupportRequestSimplify *req = (SupportRequestSimplify *) rawreq;
FuncExpr *expr = req->fcall;
Node *typmod;
Assert(list_length(expr->args) >= 2);
typmod = (Node *) lsecond(expr->args);
if (IsA(typmod, Const) && !((Const *) typmod)->constisnull)
{
Node *source = (Node *) linitial(expr->args);
int32 old_typmod = exprTypmod(source);
int32 new_typmod = DatumGetInt32(((Const *) typmod)->constvalue);
int32 old_max = old_typmod - VARHDRSZ;
int32 new_max = new_typmod - VARHDRSZ;
if (new_typmod < 0 || (old_typmod >= 0 && old_max <= new_max))
ret = relabel_to_typmod(source, new_typmod);
}
}
PG_RETURN_POINTER(ret);
}
/*
* Converts a VARCHAR type to the specified size.
*
* maxlen is the typmod, ie, declared length plus VARHDRSZ bytes.
* isExplicit is true if this is for an explicit cast to varchar(N).
*
* Truncation rules: for an explicit cast, silently truncate to the given
* length; for an implicit cast, raise error unless extra characters are
* all spaces. (This is sort-of per SQL: the spec would actually have us
* raise a "completion condition" for the explicit cast case, but Postgres
* hasn't got such a concept.)
*/
Datum
varchar(PG_FUNCTION_ARGS)
{
VarChar *source = PG_GETARG_VARCHAR_PP(0);
int32 typmod = PG_GETARG_INT32(1);
bool isExplicit = PG_GETARG_BOOL(2);
int32 len,
maxlen;
size_t maxmblen;
int i;
char *s_data;
len = VARSIZE_ANY_EXHDR(source);
s_data = VARDATA_ANY(source);
maxlen = typmod - VARHDRSZ;
/* No work if typmod is invalid or supplied data fits it already */
if (maxlen < 0 || len <= maxlen)
PG_RETURN_VARCHAR_P(source);
/* only reach here if string is too long... */
/* truncate multibyte string preserving multibyte boundary */
maxmblen = pg_mbcharcliplen(s_data, len, maxlen);
if (!isExplicit)
{
for (i = maxmblen; i < len; i++)
if (s_data[i] != ' ')
ereport(ERROR,
(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
errmsg("value too long for type character varying(%d)",
maxlen)));
}
PG_RETURN_VARCHAR_P((VarChar *) cstring_to_text_with_len(s_data,
maxmblen));
}
Datum
varchartypmodin(PG_FUNCTION_ARGS)
{
ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0);
PG_RETURN_INT32(anychar_typmodin(ta, "varchar"));
}
Datum
varchartypmodout(PG_FUNCTION_ARGS)
{
int32 typmod = PG_GETARG_INT32(0);
PG_RETURN_CSTRING(anychar_typmodout(typmod));
}
/*****************************************************************************
* Exported functions
*****************************************************************************/
/* "True" length (not counting trailing blanks) of a BpChar */
static inline int
bcTruelen(BpChar *arg)
{
return bpchartruelen(VARDATA_ANY(arg), VARSIZE_ANY_EXHDR(arg));
}
int
bpchartruelen(char *s, int len)
{
int i;
/*
* Note that we rely on the assumption that ' ' is a singleton unit on
* every supported multibyte server encoding.
*/
for (i = len - 1; i >= 0; i--)
{
if (s[i] != ' ')
break;
}
return i + 1;
}
Datum
bpcharlen(PG_FUNCTION_ARGS)
{
BpChar *arg = PG_GETARG_BPCHAR_PP(0);
int len;
2001-03-22 05:01:46 +01:00
/* get number of bytes, ignoring trailing spaces */
len = bcTruelen(arg);
2004-08-29 07:07:03 +02:00
/* in multibyte encoding, convert to number of characters */
if (pg_database_encoding_max_length() != 1)
len = pg_mbstrlen_with_len(VARDATA_ANY(arg), len);
PG_RETURN_INT32(len);
}
Datum
bpcharoctetlen(PG_FUNCTION_ARGS)
{
Datum arg = PG_GETARG_DATUM(0);
/* We need not detoast the input at all */
PG_RETURN_INT32(toast_raw_datum_size(arg) - VARHDRSZ);
1998-01-08 04:05:01 +01:00
}
/*****************************************************************************
* Comparison Functions used for bpchar
*
* Note: btree indexes need these routines not to leak memory; therefore,
* be careful to free working copies of toasted datums. Most places don't
* need to be so careful.
*****************************************************************************/
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
static void
check_collation_set(Oid collid)
{
if (!OidIsValid(collid))
{
/*
* This typically means that the parser could not resolve a conflict
* of implicit collations, so report it that way.
*/
ereport(ERROR,
(errcode(ERRCODE_INDETERMINATE_COLLATION),
errmsg("could not determine which collation to use for string comparison"),
errhint("Use the COLLATE clause to set the collation explicitly.")));
}
}
Datum
bpchareq(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
bool result;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
Oid collid = PG_GET_COLLATION();
bool locale_is_c = false;
pg_locale_t mylocale = 0;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
check_collation_set(collid);
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
if (lc_collate_is_c(collid))
locale_is_c = true;
else
mylocale = pg_newlocale_from_collation(collid);
if (locale_is_c || !mylocale || mylocale->deterministic)
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
{
/*
* Since we only care about equality or not-equality, we can avoid all
* the expense of strcoll() here, and just do bitwise comparison.
*/
if (len1 != len2)
result = false;
else
result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 0);
}
else
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
{
result = (varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
collid) == 0);
}
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
bpcharne(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
bool result;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
Oid collid = PG_GET_COLLATION();
bool locale_is_c = false;
pg_locale_t mylocale = 0;
check_collation_set(collid);
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
if (lc_collate_is_c(collid))
locale_is_c = true;
else
mylocale = pg_newlocale_from_collation(collid);
if (locale_is_c || !mylocale || mylocale->deterministic)
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
{
/*
* Since we only care about equality or not-equality, we can avoid all
* the expense of strcoll() here, and just do bitwise comparison.
*/
if (len1 != len2)
result = true;
else
result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 0);
}
else
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
{
result = (varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
collid) != 0);
}
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result);
}
Datum
bpcharlt(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(cmp < 0);
}
Datum
bpcharle(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(cmp <= 0);
}
Datum
bpchargt(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(cmp > 0);
}
Datum
bpcharge(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(cmp >= 0);
}
Datum
bpcharcmp(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(cmp);
}
Datum
bpchar_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
Oid collid = ssup->ssup_collation;
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport */
varstr_sortsupport(ssup, BPCHAROID, collid);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}
Datum
bpchar_larger(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_RETURN_BPCHAR_P((cmp >= 0) ? arg1 : arg2);
}
Datum
bpchar_smaller(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int len1,
len2;
int cmp;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
cmp = varstr_cmp(VARDATA_ANY(arg1), len1, VARDATA_ANY(arg2), len2,
PG_GET_COLLATION());
PG_RETURN_BPCHAR_P((cmp <= 0) ? arg1 : arg2);
}
/*
* bpchar needs a specialized hash function because we want to ignore
* trailing blanks in comparisons.
*/
Datum
hashbpchar(PG_FUNCTION_ARGS)
{
BpChar *key = PG_GETARG_BPCHAR_PP(0);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
Oid collid = PG_GET_COLLATION();
char *keydata;
int keylen;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
pg_locale_t mylocale = 0;
Datum result;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
if (!collid)
ereport(ERROR,
(errcode(ERRCODE_INDETERMINATE_COLLATION),
errmsg("could not determine which collation to use for string hashing"),
errhint("Use the COLLATE clause to set the collation explicitly.")));
keydata = VARDATA_ANY(key);
keylen = bcTruelen(key);
if (!lc_collate_is_c(collid))
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
mylocale = pg_newlocale_from_collation(collid);
if (!mylocale || mylocale->deterministic)
{
result = hash_any((unsigned char *) keydata, keylen);
}
else
{
#ifdef USE_ICU
if (mylocale->provider == COLLPROVIDER_ICU)
{
int32_t ulen = -1;
UChar *uchar = NULL;
Size bsize;
uint8_t *buf;
ulen = icu_to_uchar(&uchar, keydata, keylen);
bsize = ucol_getSortKey(mylocale->info.icu.ucol,
uchar, ulen, NULL, 0);
buf = palloc(bsize);
ucol_getSortKey(mylocale->info.icu.ucol,
uchar, ulen, buf, bsize);
result = hash_any(buf, bsize);
pfree(buf);
}
else
#endif
/* shouldn't happen */
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
}
/* Avoid leaking memory for toasted inputs */
PG_FREE_IF_COPY(key, 0);
return result;
}
Datum
hashbpcharextended(PG_FUNCTION_ARGS)
{
BpChar *key = PG_GETARG_BPCHAR_PP(0);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
Oid collid = PG_GET_COLLATION();
char *keydata;
int keylen;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
pg_locale_t mylocale = 0;
Datum result;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
if (!collid)
ereport(ERROR,
(errcode(ERRCODE_INDETERMINATE_COLLATION),
errmsg("could not determine which collation to use for string hashing"),
errhint("Use the COLLATE clause to set the collation explicitly.")));
keydata = VARDATA_ANY(key);
keylen = bcTruelen(key);
if (!lc_collate_is_c(collid))
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
mylocale = pg_newlocale_from_collation(collid);
if (!mylocale || mylocale->deterministic)
{
result = hash_any_extended((unsigned char *) keydata, keylen,
PG_GETARG_INT64(1));
}
else
{
#ifdef USE_ICU
if (mylocale->provider == COLLPROVIDER_ICU)
{
int32_t ulen = -1;
UChar *uchar = NULL;
Size bsize;
uint8_t *buf;
ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
bsize = ucol_getSortKey(mylocale->info.icu.ucol,
uchar, ulen, NULL, 0);
buf = palloc(bsize);
ucol_getSortKey(mylocale->info.icu.ucol,
uchar, ulen, buf, bsize);
result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
pfree(buf);
}
else
#endif
/* shouldn't happen */
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
}
PG_FREE_IF_COPY(key, 0);
return result;
}
/*
* The following operators support character-by-character comparison
* of bpchar datums, to allow building indexes suitable for LIKE clauses.
* Note that the regular bpchareq/bpcharne comparison operators, and
* regular support functions 1 and 2 with "C" collation are assumed to be
* compatible with these!
*/
static int
internal_bpchar_pattern_compare(BpChar *arg1, BpChar *arg2)
{
int result;
int len1,
len2;
len1 = bcTruelen(arg1);
len2 = bcTruelen(arg2);
result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
if (result != 0)
return result;
else if (len1 < len2)
return -1;
else if (len1 > len2)
return 1;
else
return 0;
}
Datum
bpchar_pattern_lt(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int result;
result = internal_bpchar_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result < 0);
}
Datum
bpchar_pattern_le(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int result;
result = internal_bpchar_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result <= 0);
}
Datum
bpchar_pattern_ge(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int result;
result = internal_bpchar_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result >= 0);
}
Datum
bpchar_pattern_gt(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int result;
result = internal_bpchar_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_BOOL(result > 0);
}
Datum
btbpchar_pattern_cmp(PG_FUNCTION_ARGS)
{
BpChar *arg1 = PG_GETARG_BPCHAR_PP(0);
BpChar *arg2 = PG_GETARG_BPCHAR_PP(1);
int result;
result = internal_bpchar_pattern_compare(arg1, arg2);
PG_FREE_IF_COPY(arg1, 0);
PG_FREE_IF_COPY(arg2, 1);
PG_RETURN_INT32(result);
}
Datum
btbpchar_pattern_sortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport, forcing "C" collation */
varstr_sortsupport(ssup, BPCHAROID, C_COLLATION_OID);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}