/*-------------------------------------------------------------------------
 *
 * varlena.c
 *	  Functions for the variable-length built-in types.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/varlena.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>
#include <limits.h>

#include "access/detoast.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "parser/scansup.h"
#include "port/pg_bswap.h"
#include "regex/regex.h"
#include "utils/builtins.h"
#include "utils/bytea.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
#include "utils/varlena.h"

/* GUC variable */
int			bytea_output = BYTEA_OUTPUT_HEX;

typedef struct varlena unknown;
typedef struct varlena VarString;

/*
 * State for text_position_* functions.
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */
	bool		is_multibyte_char_in_char;

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
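
/*
 * Note on the search strategy used by these functions: we run the plain
 * single-byte Boyer-Moore-Horspool algorithm even in multibyte encodings.
 * A byte-level hit can begin in the middle of a multibyte character, so
 * candidate matches must be verified by walking the haystack with
 * pg_mblen(); UTF-8 is the exception, since no UTF-8 character can match
 * in the middle of another.  The payoff is the usual BMH skip: e.g. while
 * searching for a 4-byte needle, a mismatched byte that does not occur in
 * the needle lets the search advance 4 bytes at once, as recorded in
 * skiptable[] above.
 */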

typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;
	int			buflen2;
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;
} VarStringSortSupport;

/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))

static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);


/*****************************************************************************
 *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE						 *
 *****************************************************************************/

/*
 * cstring_to_text
 *
 * Create a text value from a null-terminated C string.
 *
 * The new text value is freshly palloc'd with a full-size VARHDR.
 */
text *
cstring_to_text(const char *s)
{
	return cstring_to_text_with_len(s, strlen(s));
}

/*
 * cstring_to_text_with_len
 *
 * Same as cstring_to_text except the caller specifies the string length;
 * the string need not be null-terminated.
 */
text *
cstring_to_text_with_len(const char *s, int len)
{
	text	   *result = (text *) palloc(len + VARHDRSZ);

	SET_VARSIZE(result, len + VARHDRSZ);
	memcpy(VARDATA(result), s, len);

	return result;
}
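
/*
 * For example, cstring_to_text_with_len("abcdef", 3) builds a text value
 * containing just "abc"; only the first three bytes of the source buffer
 * are read, so no terminating NUL is needed there.
 */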

/*
 * text_to_cstring
 *
 * Create a palloc'd, null-terminated C string from a text value.
 *
 * We support being passed a compressed or toasted text value.
 * This is a bit bogus since such values shouldn't really be referred to as
 * "text *", but it seems useful for robustness.  If we didn't handle that
 * case here, we'd need another routine that did, anyway.
 */
char *
text_to_cstring(const text *t)
{
	/* must cast away the const, unfortunately */
	text	   *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
	int			len = VARSIZE_ANY_EXHDR(tunpacked);
	char	   *result;

	result = (char *) palloc(len + 1);
	memcpy(result, VARDATA_ANY(tunpacked), len);
	result[len] = '\0';

	if (tunpacked != t)
		pfree(tunpacked);

	return result;
}

/*
 * text_to_cstring_buffer
 *
 * Copy a text value into a caller-supplied buffer of size dst_len.
 *
 * The text string is truncated if necessary to fit.  The result is
 * guaranteed null-terminated (unless dst_len == 0).
 *
 * We support being passed a compressed or toasted text value.
 * This is a bit bogus since such values shouldn't really be referred to as
 * "text *", but it seems useful for robustness.  If we didn't handle that
 * case here, we'd need another routine that did, anyway.
 */
void
text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
{
	/* must cast away the const, unfortunately */
	text	   *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);

	if (dst_len > 0)
	{
		dst_len--;
		if (dst_len >= src_len)
			dst_len = src_len;
		else					/* ensure truncation is encoding-safe */
			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
		dst[dst_len] = '\0';
	}

	if (srcunpacked != src)
		pfree(srcunpacked);
}
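
/*
 * For example, copying a six-byte UTF-8 string made of three two-byte
 * characters into a four-byte buffer yields one complete character plus
 * the terminating NUL: pg_mbcliplen() keeps the truncation from splitting
 * a multibyte sequence.
 */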


/*****************************************************************************
 *	 USER I/O ROUTINES														 *
 *****************************************************************************/


#define VAL(CH)			((CH) - '0')
#define DIG(VAL)		((VAL) + '0')

/*
 *		byteain			- converts from printable representation of byte array
 *
 *		Non-printable characters must be passed as '\nnn' (octal) and are
 *		converted to internal form.  '\' must be passed as '\\'.
 *		ereport(ERROR, ...) if bad form.
 *
 *		BUGS:
 *				The input is scanned twice.
 *				The error checking of input is minimal.
 */
Datum
byteain(PG_FUNCTION_ARGS)
{
	char	   *inputText = PG_GETARG_CSTRING(0);
	char	   *tp;
	char	   *rp;
	int			bc;
	bytea	   *result;

	/* Recognize hex input */
	if (inputText[0] == '\\' && inputText[1] == 'x')
	{
		size_t		len = strlen(inputText);

		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
		result = palloc(bc);
		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */

		PG_RETURN_BYTEA_P(result);
	}

	/* Else, it's the traditional escaped style */
	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
	{
		if (tp[0] != '\\')
			tp++;
		else if ((tp[0] == '\\') &&
				 (tp[1] >= '0' && tp[1] <= '3') &&
				 (tp[2] >= '0' && tp[2] <= '7') &&
				 (tp[3] >= '0' && tp[3] <= '7'))
			tp += 4;
		else if ((tp[0] == '\\') &&
				 (tp[1] == '\\'))
			tp += 2;
		else
		{
			/*
			 * one backslash, not followed by another or ### valid octal
			 */
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type %s", "bytea")));
		}
	}

	bc += VARHDRSZ;

	result = (bytea *) palloc(bc);
	SET_VARSIZE(result, bc);

	tp = inputText;
	rp = VARDATA(result);
	while (*tp != '\0')
	{
		if (tp[0] != '\\')
			*rp++ = *tp++;
		else if ((tp[0] == '\\') &&
				 (tp[1] >= '0' && tp[1] <= '3') &&
				 (tp[2] >= '0' && tp[2] <= '7') &&
				 (tp[3] >= '0' && tp[3] <= '7'))
		{
			bc = VAL(tp[1]);
			bc <<= 3;
			bc += VAL(tp[2]);
			bc <<= 3;
			*rp++ = bc + VAL(tp[3]);

			tp += 4;
		}
		else if ((tp[0] == '\\') &&
				 (tp[1] == '\\'))
		{
			*rp++ = '\\';
			tp += 2;
		}
		else
		{
			/*
			 * We should never get here. The first pass should not allow it.
			 */
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type %s", "bytea")));
		}
	}

	PG_RETURN_BYTEA_P(result);
}
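
/*
 * For example, the hex-format input "\x6162" decodes to the two bytes
 * 0x61 0x62, while the escape-format input "ab\000\\" decodes to 'a',
 * 'b', a zero byte, and a single backslash.
 */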

/*
 *		byteaout		- converts to printable representation of byte array
 *
 *		In the traditional escaped format, non-printable characters are
 *		printed as '\nnn' (octal) and '\' as '\\'.
 */
Datum
byteaout(PG_FUNCTION_ARGS)
{
	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
	char	   *result;
	char	   *rp;

	if (bytea_output == BYTEA_OUTPUT_HEX)
	{
		/* Print hex format */
		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
		*rp++ = '\\';
		*rp++ = 'x';
		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
	}
	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
	{
		/* Print traditional escaped format */
		char	   *vp;
		int			len;
		int			i;

		len = 1;				/* empty string has 1 char */
		vp = VARDATA_ANY(vlena);
		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
		{
			if (*vp == '\\')
				len += 2;
			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
				len += 4;
			else
				len++;
		}
		rp = result = (char *) palloc(len);
		vp = VARDATA_ANY(vlena);
		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
		{
			if (*vp == '\\')
			{
				*rp++ = '\\';
				*rp++ = '\\';
			}
			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
			{
				int			val;	/* holds unprintable chars */

				val = *vp;
				rp[0] = '\\';
				rp[3] = DIG(val & 07);
				val >>= 3;
				rp[2] = DIG(val & 07);
				val >>= 3;
				rp[1] = DIG(val & 03);
				rp += 4;
			}
			else
				*rp++ = *vp;
		}
	}
	else
	{
		elog(ERROR, "unrecognized bytea_output setting: %d",
			 bytea_output);
		rp = result = NULL;		/* keep compiler quiet */
	}
	*rp = '\0';
	PG_RETURN_CSTRING(result);
}
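
/*
 * For example, the three-byte value 0x61 0x00 0x5c prints as "\x61005c"
 * with bytea_output = hex, and as "a\000\\" in the escape format.
 */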

/*
 *		bytearecv			- converts external binary format to bytea
 */
Datum
bytearecv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	bytea	   *result;
	int			nbytes;

	nbytes = buf->len - buf->cursor;
	result = (bytea *) palloc(nbytes + VARHDRSZ);
	SET_VARSIZE(result, nbytes + VARHDRSZ);
	pq_copymsgbytes(buf, VARDATA(result), nbytes);
	PG_RETURN_BYTEA_P(result);
}

/*
 *		byteasend			- converts bytea to binary format
 *
 * This is a special case: just copy the input...
 */
Datum
byteasend(PG_FUNCTION_ARGS)
{
	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);

	PG_RETURN_BYTEA_P(vlena);
}

Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)
{
	StringInfo	state;

	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);

	/* Append the value unless null. */
	if (!PG_ARGISNULL(1))
	{
		bytea	   *value = PG_GETARG_BYTEA_PP(1);

		/* On the first time through, we ignore the delimiter. */
		if (state == NULL)
			state = makeStringAggState(fcinfo);
		else if (!PG_ARGISNULL(2))
		{
			bytea	   *delim = PG_GETARG_BYTEA_PP(2);

			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
		}

		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
	}

	/*
	 * The transition type for string_agg() is declared to be "internal",
	 * which is a pass-by-value type the same size as a pointer.
	 */
	PG_RETURN_POINTER(state);
}

Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
{
	StringInfo	state;

	/* cannot be called directly because of internal-type argument */
	Assert(AggCheckCallContext(fcinfo, NULL));

	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);

	if (state != NULL)
	{
		bytea	   *result;

		result = (bytea *) palloc(state->len + VARHDRSZ);
		SET_VARSIZE(result, state->len + VARHDRSZ);
		memcpy(VARDATA(result), state->data, state->len);
		PG_RETURN_BYTEA_P(result);
	}
	else
		PG_RETURN_NULL();
}
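
/*
 * These two functions implement the bytea variant of the string_agg()
 * aggregate, e.g. (with a schematic table tbl and bytea column b):
 *		SELECT string_agg(b, '\x2c'::bytea) FROM tbl;
 * NULL inputs are skipped; the delimiter goes between values but not
 * before the first one; and if every input is NULL the result is NULL.
 */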

/*
 *		textin			- converts "..." to internal representation
 */
Datum
textin(PG_FUNCTION_ARGS)
{
	char	   *inputText = PG_GETARG_CSTRING(0);

	PG_RETURN_TEXT_P(cstring_to_text(inputText));
}

/*
 *		textout			- converts internal representation to "..."
 */
Datum
textout(PG_FUNCTION_ARGS)
{
	Datum		txt = PG_GETARG_DATUM(0);

	PG_RETURN_CSTRING(TextDatumGetCString(txt));
}

/*
 *		textrecv			- converts external binary format to text
 */
Datum
textrecv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	text	   *result;
	char	   *str;
	int			nbytes;

	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);

	result = cstring_to_text_with_len(str, nbytes);
	pfree(str);
	PG_RETURN_TEXT_P(result);
}

/*
 *		textsend			- converts text to binary format
 */
Datum
textsend(PG_FUNCTION_ARGS)
{
	text	   *t = PG_GETARG_TEXT_PP(0);
	StringInfoData buf;

	pq_begintypsend(&buf);
	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}


/*
 *		unknownin			- converts "..." to internal representation
 */
Datum
unknownin(PG_FUNCTION_ARGS)
{
	char	   *str = PG_GETARG_CSTRING(0);

	/* representation is same as cstring */
	PG_RETURN_CSTRING(pstrdup(str));
}

/*
 *		unknownout			- converts internal representation to "..."
 */
Datum
unknownout(PG_FUNCTION_ARGS)
{
	/* representation is same as cstring */
	char	   *str = PG_GETARG_CSTRING(0);

	PG_RETURN_CSTRING(pstrdup(str));
}

/*
 *		unknownrecv			- converts external binary format to unknown
 */
Datum
unknownrecv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	char	   *str;
	int			nbytes;

	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
	/* representation is same as cstring */
	PG_RETURN_CSTRING(str);
}

/*
 *		unknownsend			- converts unknown to binary format
 */
Datum
unknownsend(PG_FUNCTION_ARGS)
{
	/* representation is same as cstring */
	char	   *str = PG_GETARG_CSTRING(0);
	StringInfoData buf;

	pq_begintypsend(&buf);
	pq_sendtext(&buf, str, strlen(str));
	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}


/* ========== PUBLIC ROUTINES ========== */


/*
 * textlen -
 *	  returns the logical length of a text*
 *	  (which is less than the VARSIZE of the text*)
 */
Datum
textlen(PG_FUNCTION_ARGS)
{
	Datum		str = PG_GETARG_DATUM(0);

	/* try to avoid decompressing argument */
	PG_RETURN_INT32(text_length(str));
}

/*
 * text_length -
 *	Does the real work for textlen()
 *
 * This is broken out so it can be called directly by other string processing
 * functions.  Note that the argument is passed as a Datum, to indicate that
 * it may still be in compressed form.  We can avoid decompressing it at all
 * in some cases.
 */
static int32
text_length(Datum str)
{
	/* fastpath when max encoding length is one */
	if (pg_database_encoding_max_length() == 1)
		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
	else
	{
		text	   *t = DatumGetTextPP(str);

		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
											 VARSIZE_ANY_EXHDR(t)));
	}
}

/*
 * textoctetlen -
 *	  returns the physical length of a text*
 *	  (which is less than the VARSIZE of the text*)
 */
Datum
textoctetlen(PG_FUNCTION_ARGS)
{
	Datum		str = PG_GETARG_DATUM(0);

	/* We need not detoast the input at all */
	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
}
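
/*
 * For example, in a UTF-8 database length('déjà') is 4 (characters) while
 * octet_length('déjà') is 6 (bytes), because 'é' and 'à' each occupy two
 * bytes.
 */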

/*
 * textcat -
 *	  takes two text* and returns a text* that is the concatenation of
 *	  the two.
 *
 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
 * Allocate space for output in all cases.
 * XXX - thomas 1997-07-10
 */
Datum
textcat(PG_FUNCTION_ARGS)
{
	text	   *t1 = PG_GETARG_TEXT_PP(0);
	text	   *t2 = PG_GETARG_TEXT_PP(1);

	PG_RETURN_TEXT_P(text_catenate(t1, t2));
}

/*
 * text_catenate
 *	Guts of textcat(), broken out so it can be used by other functions
 *
 * Arguments can be in short-header form, but not compressed or out-of-line
 */
static text *
text_catenate(text *t1, text *t2)
{
	text	   *result;
	int			len1,
				len2,
				len;
	char	   *ptr;

	len1 = VARSIZE_ANY_EXHDR(t1);
	len2 = VARSIZE_ANY_EXHDR(t2);

	/* paranoia ... probably should throw error instead? */
	if (len1 < 0)
		len1 = 0;
	if (len2 < 0)
		len2 = 0;

	len = len1 + len2 + VARHDRSZ;
	result = (text *) palloc(len);

	/* Set size of result string... */
	SET_VARSIZE(result, len);

	/* Fill data field of result string... */
	ptr = VARDATA(result);
	if (len1 > 0)
		memcpy(ptr, VARDATA_ANY(t1), len1);
	if (len2 > 0)
		memcpy(ptr + len1, VARDATA_ANY(t2), len2);

	return result;
}

/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * It is caller's responsibility that there actually are n characters;
 * the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	if (pg_database_encoding_max_length() == 1)
	{
		/* Optimization for single-byte encodings */
		return n;
	}
	else
	{
		const char *s;

		for (s = p; n > 0; n--)
			s += pg_mblen(s);

		return s - p;
	}
}
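
/*
 * For example, with the UTF-8 string "é1é2", charlen_to_bytelen(p, 2)
 * returns 3 (two bytes for 'é' plus one for '1'), found by stepping
 * pg_mblen() over each character in turn.
 */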

/*
 * text_substr()
 * Return a substring starting at the specified position.
 * - thomas 1997-12-31
 *
 * Input:
 *	- string
 *	- starting position (is one-based)
 *	- string length
 *
 * If the starting position is zero or less, then return from the start of the string
 * adjusting the length to be consistent with the "negative start" per SQL.
 * If the length is less than zero, return the remaining string.
 *
 * Added multibyte support.
 * - Tatsuo Ishii 1998-4-21
 * Changed behavior if starting position is less than one to conform to SQL behavior.
 * Formerly returned the entire string; now returns a portion.
 * - Thomas Lockhart 1998-12-10
 * Now uses faster TOAST-slicing interface
 * - John Gray 2002-02-22
 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
 * error; if E < 1, return '', not entire string). Fixed MB related bug when
 * S > LC and < LC + 4 sometimes garbage characters are returned.
 * - Joe Conway 2002-08-10
 */
Datum
text_substr(PG_FUNCTION_ARGS)
{
	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
									PG_GETARG_INT32(1),
									PG_GETARG_INT32(2),
									false));
}

/*
 * text_substr_no_len -
 *	  Wrapper to avoid opr_sanity failure due to
 *	  one function accepting a different number of args.
 */
Datum
text_substr_no_len(PG_FUNCTION_ARGS)
{
	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
									PG_GETARG_INT32(1),
									-1, true));
}
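
/*
 * For example, substring('hello' from 2 for 3) yields 'ell', while the
 * negative start in substring('hello' from -2 for 5) yields 'he': the
 * end position E = S + L = 3 caps the result at the first two characters.
 */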
|
|
|
|
|
|
|
|
/*
|
|
|
|
* text_substring -
|
|
|
|
* Does the real work for text_substr() and text_substr_no_len()
|
2004-01-31 01:45:21 +01:00
|
|
|
*
|
2002-08-22 05:24:01 +02:00
|
|
|
* This is broken out so it can be called directly by other string processing
|
2014-05-06 18:12:18 +02:00
|
|
|
* functions. Note that the argument is passed as a Datum, to indicate that
|
2004-01-31 01:45:21 +01:00
|
|
|
* it may still be in compressed/toasted form. We can avoid detoasting all
|
|
|
|
* of it in some cases.
|
2006-11-08 20:22:25 +01:00
|
|
|
*
|
|
|
|
* The result is always a freshly palloc'd datum.
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
2002-09-04 22:31:48 +02:00
|
|
|
static text *
|
2002-08-22 05:24:01 +02:00
|
|
|
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
|
|
|
{
|
|
|
|
int32 eml = pg_database_encoding_max_length();
|
2002-09-04 22:31:48 +02:00
|
|
|
int32 S = start; /* start position */
|
|
|
|
int32 S1; /* adjusted start position */
|
|
|
|
int32 L1; /* adjusted substring length */
|
2002-08-22 05:24:01 +02:00
|
|
|
|
|
|
|
/* life is easy if the encoding max length is 1 */
|
|
|
|
if (eml == 1)
|
1998-01-01 06:50:50 +01:00
|
|
|
{
|
2002-08-22 05:24:01 +02:00
|
|
|
S1 = Max(S, 1);
|
1998-01-01 06:50:50 +01:00
|
|
|
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
|
|
|
if (length_not_specified) /* special case - get length to end of
|
|
|
|
* string */
|
2002-08-22 05:24:01 +02:00
|
|
|
L1 = -1;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* end position */
|
2002-09-04 22:31:48 +02:00
|
|
|
int E = S + length;
|
2002-03-05 06:33:31 +01:00
|
|
|
|
2002-08-22 05:24:01 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* A negative value for L is the only way for the end position to
|
|
|
|
* be before the start. SQL99 says to throw an error.
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
if (E < S)
|
2003-07-27 06:53:12 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_SUBSTRING_ERROR),
|
2005-10-15 04:49:52 +02:00
|
|
|
errmsg("negative substring length not allowed")));
|
2002-08-22 05:24:01 +02:00
|
|
|
|
2002-09-04 22:31:48 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* A zero or negative value for the end position can happen if the
|
|
|
|
* start was negative or one. SQL99 says to return a zero-length
|
|
|
|
* string.
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
if (E < 1)
|
2008-03-25 23:42:46 +01:00
|
|
|
return cstring_to_text("");
|
2002-03-05 06:33:31 +01:00
|
|
|
|
2002-08-22 05:24:01 +02:00
|
|
|
L1 = E - S1;
|
|
|
|
}
|
|
|
|
|
2002-09-04 22:31:48 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* If the start position is past the end of the string, SQL99 says to
|
|
|
|
* return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
|
|
|
|
* that for us. Convert to zero-based starting position
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
return DatumGetTextPSlice(str, S1 - 1, L1);
|
|
|
|
}
|
|
|
|
else if (eml > 1)
|
2002-03-05 06:33:31 +01:00
|
|
|
{
|
2002-08-22 05:24:01 +02:00
|
|
|
/*
|
|
|
|
* When encoding max length is > 1, we can't get LC without
|
2005-10-15 04:49:52 +02:00
|
|
|
* detoasting, so we'll grab a conservatively large slice now and go
|
|
|
|
* back later to do the right thing
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
int32 slice_start;
|
|
|
|
int32 slice_size;
|
|
|
|
int32 slice_strlen;
|
2002-09-04 22:31:48 +02:00
|
|
|
text *slice;
|
2002-08-22 05:24:01 +02:00
|
|
|
int32 E1;
|
|
|
|
int32 i;
|
|
|
|
char *p;
|
|
|
|
char *s;
|
|
|
|
text *ret;
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* if S is past the end of the string, the tuple toaster will return a
|
|
|
|
* zero-length string to us
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
S1 = Max(S, 1);
|
|
|
|
|
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* We need to start at position zero because there is no way to know
|
|
|
|
* in advance which byte offset corresponds to the supplied start
|
|
|
|
* position.
|
2002-08-22 05:24:01 +02:00
|
|
|
*/
|
|
|
|
slice_start = 0;

        if (length_not_specified)   /* special case - get length to end of
                                     * string */
            slice_size = L1 = -1;
        else
        {
            int         E = S + length;

            /*
             * A negative value for L is the only way for the end position
             * to be before the start.  SQL99 says to throw an error.
             */
            if (E < S)
                ereport(ERROR,
                        (errcode(ERRCODE_SUBSTRING_ERROR),
                         errmsg("negative substring length not allowed")));

            /*
             * A zero or negative value for the end position can happen if
             * the start was negative or one.  SQL99 says to return a
             * zero-length string.
             */
            if (E < 1)
                return cstring_to_text("");

            /*
             * If E is past the end of the string, the tuple toaster will
             * truncate the length for us.
             */
            L1 = E - S1;

            /*
             * Total slice size in bytes can't be any longer than the start
             * position plus substring length times the encoding max
             * length.
             */
            slice_size = (S1 + L1) * eml;
        }
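
        /*
         * For example, in UTF-8 (eml = 4) substring(str from 3 for 5)
         * gives S1 = 3 and L1 = 5, so we fetch a (3 + 5) * 4 = 32-byte
         * slice, enough to cover the first eight characters at any width.
         */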

        /*
         * If we're working with an untoasted source, no need to do an
         * extra copying step.
         */
        if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
            VARATT_IS_EXTERNAL(DatumGetPointer(str)))
            slice = DatumGetTextPSlice(str, slice_start, slice_size);
        else
            slice = (text *) DatumGetPointer(str);

        /* see if we got back an empty string */
        if (VARSIZE_ANY_EXHDR(slice) == 0)
        {
            if (slice != (text *) DatumGetPointer(str))
                pfree(slice);
            return cstring_to_text("");
        }

        /* Now we can get the actual length of the slice in MB characters */
        slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
                                            VARSIZE_ANY_EXHDR(slice));

        /*
         * Check that the start position wasn't > slice_strlen.  If so,
         * SQL99 says to return a zero-length string.
         */
        if (S1 > slice_strlen)
        {
            if (slice != (text *) DatumGetPointer(str))
                pfree(slice);
            return cstring_to_text("");
        }

        /*
         * Adjust L1 and E1 now that we know the slice string length.
         * Again remember that S1 is one based, and slice_start is zero
         * based.
         */
        if (L1 > -1)
            E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
        else
            E1 = slice_start + 1 + slice_strlen;
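
        /*
         * For example, if S1 = 3 and L1 = 10 but the slice holds only six
         * characters, E1 becomes Min(13, 7) = 7, so the loops below touch
         * characters 3 through 6 only.
         */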

        /*
         * Find the start position in the slice; remember S1 is not zero
         * based.
         */
        p = VARDATA_ANY(slice);
        for (i = 0; i < S1 - 1; i++)
            p += pg_mblen(p);

        /* hang onto a pointer to our start position */
        s = p;

        /*
         * Count the actual bytes used by the substring of the requested
         * length.
         */
        for (i = S1; i < E1; i++)
            p += pg_mblen(p);
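
        /*
         * For example, with S1 = 2 and E1 = 4 in a slice whose first three
         * characters are 1, 3, and 2 bytes wide, s ends up just past the
         * first byte and p advances another 3 + 2 bytes, so 5 bytes get
         * copied below.
         */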

        ret = (text *) palloc(VARHDRSZ + (p - s));
        SET_VARSIZE(ret, VARHDRSZ + (p - s));
        memcpy(VARDATA(ret), s, (p - s));

        if (slice != (text *) DatumGetPointer(str))
            pfree(slice);

        return ret;
    }
    else
        elog(ERROR, "invalid backend encoding: encoding max length < 1");

    /* not reached: suppress compiler warning */
    return NULL;
}

/*
 * textoverlay
 *  Replace specified substring of first string with second
 *
 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
 * This code is a direct implementation of what the standard says.
 */
Datum
textoverlay(PG_FUNCTION_ARGS)
{
    text       *t1 = PG_GETARG_TEXT_PP(0);
    text       *t2 = PG_GETARG_TEXT_PP(1);
    int         sp = PG_GETARG_INT32(2);    /* substring start position */
    int         sl = PG_GETARG_INT32(3);    /* substring length */

    PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
}

Datum
textoverlay_no_len(PG_FUNCTION_ARGS)
{
    text       *t1 = PG_GETARG_TEXT_PP(0);
    text       *t2 = PG_GETARG_TEXT_PP(1);
    int         sp = PG_GETARG_INT32(2);    /* substring start position */
    int         sl;

    sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
    PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
}

static text *
text_overlay(text *t1, text *t2, int sp, int sl)
{
    text       *result;
    text       *s1;
    text       *s2;
    int         sp_pl_sl;

    /*
     * Check for possible integer-overflow cases.  For negative sp, throw a
     * "substring length" error because that's what should be expected
     * according to the spec's definition of OVERLAY().
     */
    if (sp <= 0)
        ereport(ERROR,
                (errcode(ERRCODE_SUBSTRING_ERROR),
                 errmsg("negative substring length not allowed")));
    if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
        ereport(ERROR,
                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
                 errmsg("integer out of range")));
    s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
    s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
    result = text_catenate(s1, t2);
    result = text_catenate(result, s2);

    return result;
}

/*
 * textpos -
 *  Return the position of the specified substring.
 *  Implements the SQL POSITION() function.
 *  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
 * - thomas 1997-07-27
 */
Datum
textpos(PG_FUNCTION_ARGS)
{
    text       *str = PG_GETARG_TEXT_PP(0);
    text       *search_str = PG_GETARG_TEXT_PP(1);
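
    /*
     * For example, position('om' in 'Thomas') returns 3; by SQL convention
     * an empty search string matches at position 1 (a special case handled
     * in text_position).
     */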

    PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
}

/*
 * text_position -
 *  Does the real work for textpos()
 *
 * Inputs:
 *      t1 - string to be searched
 *      t2 - pattern to match within t1
 * Result:
 *      Character index of the first matched char, starting from 1,
 *      or 0 if no match.
 *
 * This is broken out so it can be called directly by other string processing
 * functions.
 */
static int
text_position(text *t1, text *t2, Oid collid)
{
    TextPositionState state;
    int         result;

    /* Empty needle always matches at position 1 */
    if (VARSIZE_ANY_EXHDR(t2) < 1)
        return 1;

    /* Otherwise, can't match if haystack is shorter than needle */
    if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
        return 0;

    text_position_setup(t1, t2, collid, &state);
    if (!text_position_next(&state))
        result = 0;
    else
        result = text_position_get_match_pos(&state);
    text_position_cleanup(&state);
    return result;
}

/*
 * text_position_setup, text_position_next, text_position_cleanup -
 *  Component steps of text_position()
 *
 * These are broken out so that a string can be efficiently searched for
 * multiple occurrences of the same pattern.  text_position_next may be
 * called multiple times, and it advances to the next match on each call.
 * text_position_get_match_ptr() and text_position_get_match_pos() return
 * a pointer or 1-based character position of the last match, respectively.
 *
 * The "state" variable is normally just a local variable in the caller.
 *
 * NOTE: text_position_next skips over the matched portion.  For example,
 * searching for "xx" in "xxx" returns only one match, not two.
 */

static void
text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
{
    int         len1 = VARSIZE_ANY_EXHDR(t1);
    int         len2 = VARSIZE_ANY_EXHDR(t2);
    pg_locale_t mylocale = 0;

    check_collation_set(collid);

    if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
        mylocale = pg_newlocale_from_collation(collid);

    if (mylocale && !mylocale->deterministic)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("nondeterministic collations are not supported for substring searches")));

    Assert(len1 > 0);
    Assert(len2 > 0);

    /*
     * Even with a multi-byte encoding, we perform the search using the raw
     * byte sequence, ignoring multibyte issues.  For UTF-8, that works
     * fine, because in UTF-8 the byte sequence of one character cannot
     * contain another character.  For other multi-byte encodings, we do
     * the search initially as a simple byte search, ignoring multibyte
     * issues, but verify afterwards that the match we found is at a
     * character boundary, and continue the search if it was a false match.
     */
    if (pg_database_encoding_max_length() == 1)
    {
        state->is_multibyte = false;
        state->is_multibyte_char_in_char = false;
    }
    else if (GetDatabaseEncoding() == PG_UTF8)
    {
        state->is_multibyte = true;
        state->is_multibyte_char_in_char = false;
    }
    else
    {
        state->is_multibyte = true;
        state->is_multibyte_char_in_char = true;
    }
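
    /*
     * For example, UTF-8 continuation bytes always look like 10xxxxxx and
     * are never valid first bytes, so one character's byte sequence cannot
     * reappear inside or across other characters; in other multibyte
     * server encodings (e.g. the EUC family) a needle's bytes can straddle
     * a character boundary in the haystack, so those matches must be
     * re-verified.
     */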

    state->str1 = VARDATA_ANY(t1);
    state->str2 = VARDATA_ANY(t2);
    state->len1 = len1;
    state->len2 = len2;
    state->last_match = NULL;
    state->refpoint = state->str1;
    state->refpos = 0;

    /*
     * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
     * notes we use the terminology that the "haystack" is the string to be
     * searched (t1) and the "needle" is the pattern being sought (t2).
     *
     * If the needle is empty or bigger than the haystack then there is no
     * point in wasting cycles initializing the table.  We also choose not
     * to use B-M-H for needles of length 1, since the skip table can't
     * possibly save anything in that case.
     */
    if (len1 >= len2 && len2 > 1)
    {
        int         searchlength = len1 - len2;
        int         skiptablemask;
        int         last;
        int         i;
        const char *str2 = state->str2;

        /*
         * First we must determine how much of the skip table to use.  The
         * declaration of TextPositionState allows up to 256 elements, but
         * for short search problems we don't really want to have to
         * initialize so many elements --- it would take too long in
         * comparison to the actual search time.  So we choose a useful
         * skip table size based on the haystack length minus the needle
         * length.  The closer the needle length is to the haystack length
         * the less useful skipping becomes.
         *
         * Note: since we use bit-masking to select table elements, the
         * skip table size MUST be a power of 2, and so the mask must be
         * 2^N-1.
         */
        if (searchlength < 16)
            skiptablemask = 3;
        else if (searchlength < 64)
            skiptablemask = 7;
        else if (searchlength < 128)
            skiptablemask = 15;
        else if (searchlength < 512)
            skiptablemask = 31;
        else if (searchlength < 2048)
            skiptablemask = 63;
        else if (searchlength < 4096)
            skiptablemask = 127;
        else
            skiptablemask = 255;
        state->skiptablemask = skiptablemask;
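
        /*
         * For example, a 100-byte haystack and a 20-byte needle give
         * searchlength = 80, hence skiptablemask = 15 and only 16 table
         * entries to initialize.
         */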

        /*
         * Initialize the skip table.  We set all elements to the needle
         * length, since this is the correct skip distance for any
         * character not found in the needle.
         */
        for (i = 0; i <= skiptablemask; i++)
            state->skiptable[i] = len2;

        /*
         * Now examine the needle.  For each character except the last one,
         * set the corresponding table element to the appropriate skip
         * distance.  Note that when two characters share the same skip
         * table entry, the one later in the needle must determine the skip
         * distance.
         */
        last = len2 - 1;

        for (i = 0; i < last; i++)
            state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
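
        /*
         * For example, with needle "cacao" (len2 = 5) the loop leaves a
         * skip of 1 for 'a' and 2 for 'c' (the later occurrence wins),
         * while every other slot keeps the default skip of 5 -- assuming
         * 'a' and 'c' land in distinct masked slots, as they do for every
         * mask chosen above.
         */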
    }
}

/*
 * Advance to the next match, starting from the end of the previous match
 * (or the beginning of the string, on first call).  Returns true if a
 * match is found.
 *
 * Note that this refuses to match an empty-string needle.  Most callers
 * will have handled that case specially and we'll never see it here.
 */
static bool
text_position_next(TextPositionState *state)
{
    int         needle_len = state->len2;
    char       *start_ptr;
    char       *matchptr;

    if (needle_len <= 0)
        return false;           /* result for empty pattern */

    /* Start from the point right after the previous match. */
    if (state->last_match)
        start_ptr = state->last_match + needle_len;
    else
        start_ptr = state->str1;
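
    /*
     * For example, searching for "xx" in "xxxx" matches first at offset 0;
     * the next call resumes at offset 2 and finds the second (and last)
     * match there.
     */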
|
2008-09-07 06:20:00 +02:00
|
|
|
|
Use single-byte Boyer-Moore-Horspool search even with multibyte encodings.
The old implementation first converted the input strings to arrays of
wchars, and performed the conversion on those. However, the conversion is
expensive, and for a large input string, consumes a lot of memory.
Allocating the large arrays also meant that these functions could not be
used on strings larger 1 GB / pg_encoding_max_length() (256 MB for UTF-8).
Avoid the conversion, and instead use the single-byte algorithm even with
multibyte encodings. That can get fooled, if there is a matching byte
sequence in the middle of a multi-byte character, so to eliminate false
positives like that, we verify any matches by walking the string character
by character with pg_mblen(). Also, if the caller needs the position of
the match, as a character-offset, we also need to walk the string to count
the characters.
Performance testing shows that walking the whole string with pg_mblen() is
somewhat slower than converting the whole string to wchars. It's still
often a win, though, because we don't need to do it if there is no match,
and even when there is, we only need to walk up to the point where the
match is, not the whole string. Even in the worst case, there would be
room for optimization: Much of the CPU time in the current loop with
pg_mblen() is function call overhead, and could be improved by inlining
pg_mblen() and/or the encoding-specific mblen() functions. But I didn't
attempt to do that as part of this patch.
Most of the callers of text_position_setup/next functions were actually
not interested in the position of the match, counted in characters. To
cater for them, refactor the text_position_next() interface into two
parts: searching for the next match (text_position_next()), and returning
the current match's position as a pointer (text_position_get_match_ptr())
or as a character offset (text_position_get_match_pos()). Getting the
pointer to the match is a more convenient API for many callers, and with
UTF-8, it allows skipping the character-walking step altogether, because
UTF-8 can't have false matches even when treated like raw byte strings.
Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/3173d989-bc1c-fc8a-3b69-f24246f73876%40iki.fi
2019-01-25 15:25:05 +01:00
|
|
|
retry:
|
|
|
|
matchptr = text_position_next_internal(start_ptr, state);

    if (!matchptr)
        return false;

    /*
     * Found a match for the byte sequence.  If this is a multibyte encoding,
     * where one character's byte sequence can appear inside a longer
     * multi-byte character, we need to verify that the match was at a
     * character boundary, not in the middle of a multi-byte character.
     */
    if (state->is_multibyte_char_in_char)
    {
        /* Walk one character at a time, until we reach the match. */

        /* the search should never move backwards. */
        Assert(state->refpoint <= matchptr);

        while (state->refpoint < matchptr)
        {
            /* step to next character. */
            state->refpoint += pg_mblen(state->refpoint);
            state->refpos++;

            /*
             * If we stepped over the match's start position, then it was a
             * false positive, where the byte sequence appeared in the middle
             * of a multi-byte character.  Skip it, and continue the search at
             * the next character boundary.
             */
            if (state->refpoint > matchptr)
            {
                start_ptr = state->refpoint;
                goto retry;
            }
        }
    }

    state->last_match = matchptr;
    return true;
}
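
The false-positive case handled above cannot arise in UTF-8, because a valid
needle never begins with a continuation byte; but in encodings whose
multi-byte characters contain bytes from the ASCII range, a raw byte search
can hit mid-character.  A standalone illustration using Shift-JIS byte
patterns (chosen only because the effect is easy to show; it is not a claim
about which server encodings set is_multibyte_char_in_char):

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* Shift-JIS encoding of katakana "a" (0x83 0x41), then ASCII 'B'. */
    const char *sjis = "\x83\x41" "B";

    /* A raw byte search for 'A' (0x41) hits inside the two-byte character. */
    const char *hit = strchr(sjis, 'A');

    printf("raw byte match at offset %ld\n", (long) (hit - sjis));
    return 0;                   /* prints: raw byte match at offset 1 */
}
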

/*
 * Subroutine of text_position_next().  This searches for the raw byte
 * sequence, ignoring any multi-byte encoding issues.  Returns the first
 * match starting at 'start_ptr', or NULL if no match is found.
 */
static char *
text_position_next_internal(char *start_ptr, TextPositionState *state)
{
    int         haystack_len = state->len1;
    int         needle_len = state->len2;
    int         skiptablemask = state->skiptablemask;
    const char *haystack = state->str1;
    const char *needle = state->str2;
    const char *haystack_end = &haystack[haystack_len];
    const char *hptr;

    Assert(start_ptr >= haystack && start_ptr <= haystack_end);

    if (needle_len == 1)
    {
        /* No point in using B-M-H for a one-character needle */
        char        nchar = *needle;

        hptr = start_ptr;
        while (hptr < haystack_end)
        {
            if (*hptr == nchar)
                return (char *) hptr;
            hptr++;
        }
    }
    else
    {
        const char *needle_last = &needle[needle_len - 1];

        /* Start at startpos plus the length of the needle */
        hptr = start_ptr + needle_len - 1;
        while (hptr < haystack_end)
        {
            /* Match the needle scanning *backward* */
            const char *nptr;
            const char *p;

            nptr = needle_last;
            p = hptr;
            while (*nptr == *p)
            {
                /* Matched it all?  If so, return a pointer to the match */
                if (nptr == needle)
                    return (char *) p;
                nptr--, p--;
            }

            /*
             * No match, so use the haystack char at hptr to decide how far
             * to advance.  If the needle had any occurrence of that
             * character (or more precisely, one sharing the same skiptable
             * entry) before its last character, then we advance far enough
             * to align the last such needle character with that haystack
             * position.  Otherwise we can advance by the whole needle
             * length.
             */
            hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
        }
    }

    return NULL;                /* not found */
}
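
The skiptable and skiptablemask consulted above are precomputed by
text_position_setup(), earlier in this file.  The following is a hedged
sketch of how such a masked table can be initialized (illustrative only,
not the actual setup code; it assumes only the TextPositionState fields
referenced by text_position_next_internal() above):

/*
 * Illustrative skip-table initialization for the masked lookup above.
 */
static void
skiptable_init_sketch(TextPositionState *state)
{
    int         len2 = state->len2;
    int         skiptablemask;
    int         i;

    /*
     * Pick a power-of-two table size no larger than 256.  Hashing bytes
     * through the mask can make some skips shorter than optimal (a bucket
     * collision keeps the smallest colliding skip), but never unsafe, and
     * it makes initialization cheap for short needles.
     */
    if (len2 >= 128)
        skiptablemask = 255;
    else
    {
        skiptablemask = 1;
        while (skiptablemask < len2)
            skiptablemask = (skiptablemask << 1) | 1;
    }
    state->skiptablemask = skiptablemask;

    /* Bytes absent from the needle allow skipping its full length. */
    for (i = 0; i <= skiptablemask; i++)
        state->skiptable[i] = len2;

    /* Bytes before the needle's last byte allow a shorter, exact skip. */
    for (i = 0; i < len2 - 1; i++)
        state->skiptable[(unsigned char) state->str2[i] & skiptablemask] =
            len2 - 1 - i;
}
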

/*
 * Return a pointer to the current match.
 *
 * The returned pointer points into the original haystack string.
 */
static char *
text_position_get_match_ptr(TextPositionState *state)
{
    return state->last_match;
}

/*
 * Return the offset of the current match.
 *
 * The offset is in characters, 1-based.
 */
static int
text_position_get_match_pos(TextPositionState *state)
{
    if (!state->is_multibyte)
        return state->last_match - state->str1 + 1;
    else
    {
        /* Convert the byte position to char position. */
        while (state->refpoint < state->last_match)
        {
            state->refpoint += pg_mblen(state->refpoint);
            state->refpos++;
        }
        Assert(state->refpoint == state->last_match);
        return state->refpos + 1;
    }
}
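
Taken together, a caller drives this interface as in the following hedged
sketch (not one of varlena.c's actual callers; it uses only functions shown
or referenced in this excerpt, plus text_position_setup() from earlier in
the file):

/*
 * Count occurrences of 'needle' in 'haystack' (illustration only).
 */
static int
count_occurrences_sketch(text *haystack, text *needle, Oid collid)
{
    TextPositionState state;
    int         count = 0;

    text_position_setup(haystack, needle, collid, &state);
    while (text_position_next(&state))
        count++;
    text_position_cleanup(&state);

    return count;
}
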

static void
text_position_cleanup(TextPositionState *state)
{
    /* no cleanup needed */
}

static void
check_collation_set(Oid collid)
{
    if (!OidIsValid(collid))
    {
        /*
         * This typically means that the parser could not resolve a conflict
         * of implicit collations, so report it that way.
         */
        ereport(ERROR,
                (errcode(ERRCODE_INDETERMINATE_COLLATION),
                 errmsg("could not determine which collation to use for string comparison"),
                 errhint("Use the COLLATE clause to set the collation explicitly.")));
    }
}

Straighten out leakproofness markings on text comparison functions.

Since we introduced the idea of leakproof functions, texteq and textne
were marked leakproof but their sibling text comparison functions were
not.  This inconsistency seemed justified because texteq/textne just
relied on memcmp() and so could easily be seen to be leakproof, while
the other comparison functions are far more complex and indeed can
throw input-dependent errors.

However, that argument crashed and burned with the addition of
nondeterministic collations, because now texteq/textne may invoke
the exact same varstr_cmp() infrastructure as the rest.  It makes no
sense whatever to give them different leakproofness markings.

After a certain amount of angst we've concluded that it's all right
to consider varstr_cmp() to be leakproof, mostly because the other
choice would be disastrous for performance of many queries where
leakproofness matters.  The input-dependent errors should only be
reachable for corrupt input data, or so we hope anyway; certainly,
if they are reachable in practice, we've got problems with requirements
as basic as maintaining a btree index on a text column.

Hence, run around to all the SQL functions that derive from varstr_cmp()
and mark them leakproof.  This should result in a useful gain in
flexibility/performance for queries in which non-leakproofness degrades
the efficiency of the query plan.

Back-patch to v12 where nondeterministic collations were added.
While this isn't an essential bug fix given the determination
that varstr_cmp() is leakproof, we might as well apply it now that
we've been forced into a post-beta4 catversion bump.

Discussion: https://postgr.es/m/31481.1568303470@sss.pgh.pa.us

/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 * to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
|
1998-06-16 08:42:02 +02:00
|
|
|
int
|
2017-10-31 15:34:31 +01:00
|
|
|
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1998-09-01 06:40:42 +02:00
|
|
|
int result;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
check_collation_set(collid);
|
|
|
|
|
2002-04-03 07:39:33 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Unfortunately, there is no strncoll(), so in the non-C locale case we
|
2014-05-06 18:12:18 +02:00
|
|
|
* have to do some memory copying. This turns out to be significantly
|
2005-10-15 04:49:52 +02:00
|
|
|
* slower, so we optimize the case where LC_COLLATE is C. We also try to
|
|
|
|
* optimize relatively-short strings by avoiding palloc/pfree overhead.
|
2002-04-03 07:39:33 +02:00
|
|
|
*/
|
2011-02-08 22:04:18 +01:00
|
|
|
if (lc_collate_is_c(collid))
|
2005-08-26 19:40:36 +02:00
|
|
|
{
|
2010-12-22 04:11:40 +01:00
|
|
|
result = memcmp(arg1, arg2, Min(len1, len2));
|
2005-08-26 19:40:36 +02:00
|
|
|
if ((result == 0) && (len1 != len2))
|
|
|
|
result = (len1 < len2) ? -1 : 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2014-08-14 18:09:52 +02:00
|
|
|
char a1buf[TEXTBUFLEN];
|
|
|
|
char a2buf[TEXTBUFLEN];
|
2003-08-04 02:43:34 +02:00
|
|
|
char *a1p,
|
|
|
|
*a2p;
|
2011-04-10 17:42:00 +02:00
|
|
|
pg_locale_t mylocale = 0;
|
2011-02-08 22:04:18 +01:00
|
|
|
|
|
|
|
if (collid != DEFAULT_COLLATION_OID)
|
|
|
|
mylocale = pg_newlocale_from_collation(collid);
|
2002-11-18 00:01:30 +01:00
|
|
|
|
2014-09-19 18:39:00 +02:00
|
|
|
/*
|
2015-05-24 03:35:49 +02:00
|
|
|
* memcmp() can't tell us which of two unequal strings sorts first,
|
|
|
|
* but it's a cheap way to tell if they're equal. Testing shows that
|
2014-09-19 18:39:00 +02:00
|
|
|
* memcmp() followed by strcoll() is only trivially slower than
|
|
|
|
* strcoll() by itself, so we don't lose much if this doesn't work out
|
|
|
|
* very often, and if it does - for example, because there are many
|
|
|
|
* equal strings in the input - then we win big by avoiding expensive
|
|
|
|
* collation-aware comparisons.
|
|
|
|
*/
|
|
|
|
if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
|
|
|
|
return 0;
|
|
|
|
|
2005-08-26 19:40:36 +02:00
|
|
|
#ifdef WIN32
|
2005-08-24 19:50:00 +02:00
|
|
|
/* Win32 does not have UTF-8, so we need to map to UTF-16 */
|
2017-06-16 16:08:54 +02:00
|
|
|
if (GetDatabaseEncoding() == PG_UTF8
|
|
|
|
&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
|
2005-08-24 19:50:00 +02:00
|
|
|
{
|
2005-10-15 04:49:52 +02:00
|
|
|
int a1len;
|
|
|
|
int a2len;
|
|
|
|
int r;
|
2005-08-24 19:50:00 +02:00
|
|
|
|
2014-08-14 18:09:52 +02:00
|
|
|
if (len1 >= TEXTBUFLEN / 2)
|
2005-08-24 19:50:00 +02:00
|
|
|
{
|
|
|
|
a1len = len1 * 2 + 2;
|
|
|
|
a1p = palloc(a1len);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2014-08-14 18:09:52 +02:00
|
|
|
a1len = TEXTBUFLEN;
|
2005-08-24 19:50:00 +02:00
|
|
|
a1p = a1buf;
|
|
|
|
}
|
2014-08-14 18:09:52 +02:00
|
|
|
if (len2 >= TEXTBUFLEN / 2)
|
2005-08-24 19:50:00 +02:00
|
|
|
{
|
|
|
|
a2len = len2 * 2 + 2;
|
|
|
|
a2p = palloc(a2len);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2014-08-14 18:09:52 +02:00
|
|
|
a2len = TEXTBUFLEN;
|
2005-08-24 19:50:00 +02:00
|
|
|
a2p = a2buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* stupid Microsloth API does not work for zero-length input */
|
|
|
|
if (len1 == 0)
|
|
|
|
r = 0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
|
2005-10-15 04:49:52 +02:00
|
|
|
(LPWSTR) a1p, a1len / 2);
|
2005-08-24 19:50:00 +02:00
|
|
|
if (!r)
|
|
|
|
ereport(ERROR,
|
2012-06-10 21:20:04 +02:00
|
|
|
(errmsg("could not convert string to UTF-16: error code %lu",
|
|
|
|
GetLastError())));
|
2005-08-24 19:50:00 +02:00
|
|
|
}
|
|
|
|
((LPWSTR) a1p)[r] = 0;
|
|
|
|
|
|
|
|
if (len2 == 0)
|
|
|
|
r = 0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
|
2005-10-15 04:49:52 +02:00
|
|
|
(LPWSTR) a2p, a2len / 2);
|
2005-08-24 19:50:00 +02:00
|
|
|
if (!r)
|
|
|
|
ereport(ERROR,
|
2012-06-10 21:20:04 +02:00
|
|
|
(errmsg("could not convert string to UTF-16: error code %lu",
|
|
|
|
GetLastError())));
|
2005-08-24 19:50:00 +02:00
|
|
|
}
|
|
|
|
((LPWSTR) a2p)[r] = 0;
|
|
|
|
|
|
|
|
errno = 0;
|
2011-04-09 23:14:20 +02:00
|
|
|
#ifdef HAVE_LOCALE_T
|
|
|
|
if (mylocale)
|
2017-03-25 05:38:12 +01:00
|
|
|
result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
|
2011-04-09 23:14:20 +02:00
|
|
|
else
|
|
|
|
#endif
|
2011-04-10 17:42:00 +02:00
|
|
|
result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
|
2005-10-15 04:49:52 +02:00
|
|
|
if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
|
|
|
|
* headers */
|
2005-08-24 19:50:00 +02:00
|
|
|
ereport(ERROR,
|
2005-10-29 02:31:52 +02:00
|
|
|
(errmsg("could not compare Unicode strings: %m")));
|
2005-08-24 19:50:00 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
/* Break tie if necessary. */
|
|
|
|
if (result == 0 &&
|
|
|
|
(!mylocale || mylocale->deterministic))
|
2008-03-13 19:31:56 +01:00
|
|
|
{
|
2010-12-22 04:11:40 +01:00
|
|
|
result = memcmp(arg1, arg2, Min(len1, len2));
|
2008-03-13 19:31:56 +01:00
|
|
|
if ((result == 0) && (len1 != len2))
|
|
|
|
result = (len1 < len2) ? -1 : 1;
|
|
|
|
}
|
|
|
|
|
2005-08-24 19:50:00 +02:00
|
|
|
if (a1p != a1buf)
|
|
|
|
pfree(a1p);
|
|
|
|
if (a2p != a2buf)
|
|
|
|
pfree(a2p);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
|
|
|
#endif /* WIN32 */
|
2005-08-24 19:50:00 +02:00
|
|
|
|
2014-08-14 18:09:52 +02:00
|
|
|
if (len1 >= TEXTBUFLEN)
|
2005-08-26 19:40:36 +02:00
|
|
|
a1p = (char *) palloc(len1 + 1);
|
|
|
|
else
|
|
|
|
a1p = a1buf;
|
2014-08-14 18:09:52 +02:00
|
|
|
if (len2 >= TEXTBUFLEN)
|
2005-08-26 19:40:36 +02:00
|
|
|
a2p = (char *) palloc(len2 + 1);
|
|
|
|
else
|
|
|
|
a2p = a2buf;
|
2005-08-24 19:50:00 +02:00
|
|
|
|
2005-08-26 19:40:36 +02:00
|
|
|
memcpy(a1p, arg1, len1);
|
|
|
|
a1p[len1] = '\0';
|
|
|
|
memcpy(a2p, arg2, len2);
|
|
|
|
a2p[len2] = '\0';
|
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
if (mylocale)
|
2017-03-23 20:25:34 +01:00
|
|
|
{
|
|
|
|
if (mylocale->provider == COLLPROVIDER_ICU)
|
|
|
|
{
|
|
|
|
#ifdef USE_ICU
|
|
|
|
#ifdef HAVE_UCOL_STRCOLLUTF8
|
|
|
|
if (GetDatabaseEncoding() == PG_UTF8)
|
|
|
|
{
|
|
|
|
UErrorCode status;
|
|
|
|
|
|
|
|
status = U_ZERO_ERROR;
|
|
|
|
result = ucol_strcollUTF8(mylocale->info.icu.ucol,
|
|
|
|
arg1, len1,
|
|
|
|
arg2, len2,
|
|
|
|
&status);
|
|
|
|
if (U_FAILURE(status))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errmsg("collation failed: %s", u_errorName(status))));
|
|
|
|
}
|
|
|
|
else
|
2011-02-08 22:04:18 +01:00
|
|
|
#endif
|
2017-03-23 20:25:34 +01:00
|
|
|
{
|
2017-05-17 22:31:56 +02:00
|
|
|
int32_t ulen1,
|
|
|
|
ulen2;
|
|
|
|
UChar *uchar1,
|
|
|
|
*uchar2;
|
2017-03-23 20:25:34 +01:00
|
|
|
|
|
|
|
ulen1 = icu_to_uchar(&uchar1, arg1, len1);
|
|
|
|
ulen2 = icu_to_uchar(&uchar2, arg2, len2);
|
|
|
|
|
|
|
|
result = ucol_strcoll(mylocale->info.icu.ucol,
|
|
|
|
uchar1, ulen1,
|
|
|
|
uchar2, ulen2);
|
Fix memory leakage in ICU encoding conversion, and other code review.
Callers of icu_to_uchar() neglected to pfree the result string when done
with it. This results in catastrophic memory leaks in varstr_cmp(),
because of our prevailing assumption that btree comparison functions don't
leak memory. For safety, make all the call sites clean up leaks, though
I suspect that we could get away without it in formatting.c. I audited
callers of icu_from_uchar() as well, but found no places that seemed to
have a comparable issue.
Add function API specifications for icu_to_uchar() and icu_from_uchar();
the lack of any thought-through specification is perhaps not unrelated
to the existence of this bug in the first place. Fix icu_to_uchar()
to guarantee a nul-terminated result; although no existing caller appears
to care, the fact that it would have been nul-terminated except in
extreme corner cases seems ideally designed to bite someone on the rear
someday. Fix ucnv_fromUChars() destCapacity argument --- in the worst
case, that could perhaps have led to a non-nul-terminated result, too.
Fix icu_from_uchar() to have a more reasonable definition of the function
result --- no callers are actually paying attention, so this isn't a live
bug, but it's certainly sloppily designed. Const-ify icu_from_uchar()'s
input string for consistency.
That is not the end of what needs to be done to these functions, but
it's as much as I have the patience for right now.
Discussion: https://postgr.es/m/1955.1498181798@sss.pgh.pa.us
2017-06-23 18:22:06 +02:00
|
|
|
|
|
|
|
pfree(uchar1);
|
|
|
|
pfree(uchar2);
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
2017-05-17 22:31:56 +02:00
|
|
|
#else /* not USE_ICU */
|
2017-03-23 20:25:34 +01:00
|
|
|
/* shouldn't happen */
|
|
|
|
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
|
Phase 2 of pgindent updates.
2017-06-21 21:18:54 +02:00
|
|
|
#endif /* not USE_ICU */
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
#ifdef HAVE_LOCALE_T
|
|
|
|
result = strcoll_l(a1p, a2p, mylocale->info.lt);
|
|
|
|
#else
|
|
|
|
/* shouldn't happen */
|
|
|
|
elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2011-04-10 17:42:00 +02:00
|
|
|
result = strcoll(a1p, a2p);
|
2005-08-26 19:40:36 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
/* Break tie if necessary. */
|
|
|
|
if (result == 0 &&
|
|
|
|
(!mylocale || mylocale->deterministic))
|
2005-12-22 23:50:00 +01:00
|
|
|
result = strcmp(a1p, a2p);
|
|
|
|
|
2005-08-26 19:40:36 +02:00
|
|
|
if (a1p != a1buf)
|
|
|
|
pfree(a1p);
|
|
|
|
if (a2p != a2buf)
|
|
|
|
pfree(a2p);
|
2002-04-03 07:39:33 +02:00
|
|
|
}
|
1997-04-21 06:31:53 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return result;
|
2000-07-06 07:48:31 +02:00
|
|
|
}
|
1997-04-09 10:29:35 +02:00
|
|
|
|
1998-06-16 08:42:02 +02:00
|
|
|
/* text_cmp()
|
2001-05-03 21:00:37 +02:00
|
|
|
* Internal comparison function for text strings.
|
1998-06-16 08:42:02 +02:00
|
|
|
* Returns -1, 0 or 1
|
1997-04-09 10:29:35 +02:00
|
|
|
*/
|
1998-10-08 20:30:52 +02:00
|
|
|
static int
|
2011-02-08 22:04:18 +01:00
|
|
|
text_cmp(text *arg1, text *arg2, Oid collid)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1998-09-01 06:40:42 +02:00
|
|
|
char *a1p,
|
|
|
|
*a2p;
|
|
|
|
int len1,
|
|
|
|
len2;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
a1p = VARDATA_ANY(arg1);
|
|
|
|
a2p = VARDATA_ANY(arg2);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2007-11-15 22:14:46 +01:00
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
return varstr_cmp(a1p, len1, a2p, len2, collid);
|
2000-07-06 07:48:31 +02:00
|
|
|
}
|
1997-04-21 06:31:53 +02:00
|
|
|
|
2000-07-06 07:48:31 +02:00
|
|
|
/*
|
|
|
|
* Comparison functions for text strings.
|
2000-07-12 04:37:39 +02:00
|
|
|
*
|
|
|
|
* Note: btree indexes need these routines not to leak memory; therefore,
|
|
|
|
* be careful to free working copies of toasted datums. Most places don't
|
|
|
|
* need to be so careful.
|
1998-06-16 08:42:02 +02:00
|
|
|
*/
|
2000-07-06 07:48:31 +02:00
|
|
|
|
2001-05-03 21:00:37 +02:00
|
|
|
Datum
|
|
|
|
texteq(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid collid = PG_GET_COLLATION();
|
2001-05-03 21:00:37 +02:00
|
|
|
bool result;
|
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
check_collation_set(collid);
|
|
|
|
|
|
|
|
if (lc_collate_is_c(collid) ||
|
|
|
|
collid == DEFAULT_COLLATION_OID ||
|
|
|
|
pg_newlocale_from_collation(collid)->deterministic)
|
|
|
|
{
|
|
|
|
Datum arg1 = PG_GETARG_DATUM(0);
|
|
|
|
Datum arg2 = PG_GETARG_DATUM(1);
|
|
|
|
Size len1,
|
|
|
|
len2;
|
|
|
|
|
|
|
|
/*
|
2019-05-22 18:55:34 +02:00
|
|
|
* Since we only care about equality or inequality, we can avoid all
|
|
|
|
* the expense of strcoll() here, and just do bitwise comparison. In
|
|
|
|
* fact, we don't even have to do a bitwise comparison if we can show
|
|
|
|
* the lengths of the strings are unequal; which might save us from
|
|
|
|
* having to detoast one or both values.
|
2019-03-22 12:09:32 +01:00
|
|
|
*/
|
|
|
|
len1 = toast_raw_datum_size(arg1);
|
|
|
|
len2 = toast_raw_datum_size(arg2);
|
|
|
|
if (len1 != len2)
|
|
|
|
result = false;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
text *targ1 = DatumGetTextPP(arg1);
|
|
|
|
text *targ2 = DatumGetTextPP(arg2);
|
|
|
|
|
|
|
|
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
|
|
|
|
len1 - VARHDRSZ) == 0);
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(targ1, 0);
|
|
|
|
PG_FREE_IF_COPY(targ2, 1);
|
|
|
|
}
|
|
|
|
}
|
2001-05-03 21:00:37 +02:00
|
|
|
else
|
2011-01-18 20:09:22 +01:00
|
|
|
{
|
2019-03-22 12:09:32 +01:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2001-05-03 21:00:37 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
result = (text_cmp(arg1, arg2, collid) == 0);
|
2011-01-18 20:09:22 +01:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
2011-01-18 20:09:22 +01:00
|
|
|
}
|
2001-05-03 21:00:37 +02:00
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
textne(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid collid = PG_GET_COLLATION();
|
2001-05-03 21:00:37 +02:00
|
|
|
bool result;
|
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
check_collation_set(collid);
|
|
|
|
|
|
|
|
if (lc_collate_is_c(collid) ||
|
|
|
|
collid == DEFAULT_COLLATION_OID ||
|
|
|
|
pg_newlocale_from_collation(collid)->deterministic)
|
|
|
|
{
|
|
|
|
Datum arg1 = PG_GETARG_DATUM(0);
|
|
|
|
Datum arg2 = PG_GETARG_DATUM(1);
|
|
|
|
Size len1,
|
|
|
|
len2;
|
|
|
|
|
|
|
|
/* See comment in texteq() */
|
|
|
|
len1 = toast_raw_datum_size(arg1);
|
|
|
|
len2 = toast_raw_datum_size(arg2);
|
|
|
|
if (len1 != len2)
|
|
|
|
result = true;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
text *targ1 = DatumGetTextPP(arg1);
|
|
|
|
text *targ2 = DatumGetTextPP(arg2);
|
|
|
|
|
|
|
|
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
|
|
|
|
len1 - VARHDRSZ) != 0);
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(targ1, 0);
|
|
|
|
PG_FREE_IF_COPY(targ2, 1);
|
|
|
|
}
|
|
|
|
}
|
2001-05-03 21:00:37 +02:00
|
|
|
else
|
2011-01-18 20:09:22 +01:00
|
|
|
{
|
2019-03-22 12:09:32 +01:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2001-05-03 21:00:37 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
result = (text_cmp(arg1, arg2, collid) != 0);
|
2011-01-18 20:09:22 +01:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
2011-01-18 20:09:22 +01:00
|
|
|
}
|
2001-05-03 21:00:37 +02:00
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
|
|
|
}
|
|
|
|
|
2000-07-06 07:48:31 +02:00
|
|
|
Datum
|
|
|
|
text_lt(PG_FUNCTION_ARGS)
|
1998-06-16 08:42:02 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2000-07-12 04:37:39 +02:00
|
|
|
bool result;
|
1998-06-16 08:42:02 +02:00
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
|
2000-07-12 04:37:39 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
2000-07-06 07:48:31 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
text_le(PG_FUNCTION_ARGS)
|
1998-06-16 08:42:02 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2000-07-12 04:37:39 +02:00
|
|
|
bool result;
|
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
|
2000-07-12 04:37:39 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
1996-07-09 08:22:35 +02:00
|
|
|
|
2000-07-12 04:37:39 +02:00
|
|
|
PG_RETURN_BOOL(result);
|
2000-07-06 07:48:31 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
text_gt(PG_FUNCTION_ARGS)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2000-07-12 04:37:39 +02:00
|
|
|
bool result;
|
2000-07-06 07:48:31 +02:00
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
|
2000-07-12 04:37:39 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2000-07-06 07:48:31 +02:00
|
|
|
Datum
|
|
|
|
text_ge(PG_FUNCTION_ARGS)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2000-07-12 04:37:39 +02:00
|
|
|
bool result;
|
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
|
2000-07-12 04:37:39 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
2000-07-06 07:48:31 +02:00
|
|
|
|
2000-07-12 04:37:39 +02:00
|
|
|
PG_RETURN_BOOL(result);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2018-04-03 18:46:45 +02:00
|
|
|
Datum
|
|
|
|
text_starts_with(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
Datum arg1 = PG_GETARG_DATUM(0);
|
|
|
|
Datum arg2 = PG_GETARG_DATUM(1);
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid collid = PG_GET_COLLATION();
|
2019-05-22 18:55:34 +02:00
|
|
|
pg_locale_t mylocale = 0;
|
2018-04-03 18:46:45 +02:00
|
|
|
bool result;
|
|
|
|
Size len1,
|
|
|
|
len2;
|
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
check_collation_set(collid);
|
|
|
|
|
|
|
|
if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
|
|
|
|
mylocale = pg_newlocale_from_collation(collid);
|
|
|
|
|
|
|
|
if (mylocale && !mylocale->deterministic)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
|
|
errmsg("nondeterministic collations are not supported for substring searches")));
|
|
|
|
|
2018-04-03 18:46:45 +02:00
|
|
|
len1 = toast_raw_datum_size(arg1);
|
|
|
|
len2 = toast_raw_datum_size(arg2);
|
|
|
|
if (len2 > len1)
|
|
|
|
result = false;
|
|
|
|
else
|
|
|
|
{
|
2019-04-02 18:35:32 +02:00
|
|
|
text *targ1 = text_substring(arg1, 1, len2, false);
|
2018-04-03 18:46:45 +02:00
|
|
|
text *targ2 = DatumGetTextPP(arg2);
|
|
|
|
|
|
|
|
result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
|
|
|
|
VARSIZE_ANY_EXHDR(targ2)) == 0);
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(targ1, 0);
|
|
|
|
PG_FREE_IF_COPY(targ2, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
|
|
|
}
|
|
|
|
|
2001-05-03 21:00:37 +02:00
|
|
|
Datum
|
|
|
|
bttextcmp(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
text *arg1 = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *arg2 = PG_GETARG_TEXT_PP(1);
|
2001-05-03 21:00:37 +02:00
|
|
|
int32 result;
|
|
|
|
|
2011-02-08 22:04:18 +01:00
|
|
|
result = text_cmp(arg1, arg2, PG_GET_COLLATION());
|
2001-05-03 21:00:37 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_INT32(result);
|
|
|
|
}
|
|
|
|
|
2014-08-14 18:09:52 +02:00
|
|
|
Datum
|
|
|
|
bttextsortsupport(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2015-05-24 03:35:49 +02:00
|
|
|
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
|
|
|
|
Oid collid = ssup->ssup_collation;
|
|
|
|
MemoryContext oldcontext;
|
2014-08-14 18:09:52 +02:00
|
|
|
|
|
|
|
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
|
|
|
|
|
2016-02-03 20:17:35 +01:00
|
|
|
/* Use generic string SortSupport */
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
varstr_sortsupport(ssup, TEXTOID, collid);
|
2014-08-14 18:09:52 +02:00
|
|
|
|
|
|
|
MemoryContextSwitchTo(oldcontext);
|
|
|
|
|
|
|
|
PG_RETURN_VOID();
|
|
|
|
}
|
|
|
|
|
2016-02-03 20:17:35 +01:00
|
|
|
/*
|
|
|
|
* Generic sortsupport interface for character types' operator classes.
|
|
|
|
* Includes locale support, and support for BpChar semantics (i.e. removing
|
|
|
|
* trailing spaces before comparison).
|
|
|
|
*
|
|
|
|
* Relies on the assumption that text, VarChar, BpChar, and bytea all have the
|
|
|
|
* same representation. Callers that always use the C collation (e.g.
|
|
|
|
* non-collatable type callers like bytea) may have NUL bytes in their strings;
|
|
|
|
* this will not work with any other collation, though.
|
|
|
|
*/
|
|
|
|
void
|
2019-01-10 20:07:01 +01:00
|
|
|
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2015-05-24 03:35:49 +02:00
|
|
|
bool abbreviate = ssup->abbreviate;
|
|
|
|
bool collate_c = false;
|
2016-02-08 21:15:56 +01:00
|
|
|
VarStringSortSupport *sss;
|
2015-05-24 03:35:49 +02:00
|
|
|
pg_locale_t locale = 0;
|
2014-08-14 18:09:52 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
check_collation_set(collid);
|
|
|
|
|
2014-08-14 18:09:52 +02:00
|
|
|
/*
|
2015-01-22 16:46:42 +01:00
|
|
|
* If possible, set ssup->comparator to a function which can be used to
|
|
|
|
* directly compare two datums. If we can do this, we'll avoid the
|
2015-05-24 03:35:49 +02:00
|
|
|
* overhead of a trip through the fmgr layer for every comparison, which
|
|
|
|
* can be substantial.
|
2015-01-19 21:20:31 +01:00
|
|
|
*
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
* Most typically, we'll set the comparator to varlenafastcmp_locale,
|
|
|
|
* which uses strcoll() to perform comparisons. We use that for the
|
|
|
|
* BpChar case too, but type NAME uses namefastcmp_locale. However, if
|
|
|
|
* LC_COLLATE = C, we can make things quite a bit faster with
|
|
|
|
* varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
|
|
|
|
* memcmp() rather than strcoll().
|
2014-08-14 18:09:52 +02:00
|
|
|
*/
|
2015-01-22 16:46:42 +01:00
|
|
|
if (lc_collate_is_c(collid))
|
More fixes for abbreviated keys infrastructure.
First, when LC_COLLATE = C, bttext_abbrev_convert should use memcpy()
rather than strxfrm() to construct the abbreviated key, because the
authoritative comparator uses memcpy(). If we do anything else here,
we might get inconsistent answers, and the buildfarm says this risk
is not theoretical. It should be faster this way, too.
Second, while I'm looking at bttext_abbrev_convert, convert a needless
use of goto into the loop it's trying to implement into an actual
loop.
Both of the above problems date to the original commit of abbreviated
keys, commit 4ea51cdfe85ceef8afabceb03c446574daa0ac23.
Third, fix a bogus assignment to tss->locale before tss is set up.
That's a new goof in commit b529b65d1bf8537ca7fa024760a9782d7c8b66e5.
2015-01-22 17:58:58 +01:00
|
|
|
{
|
2019-01-10 20:07:01 +01:00
|
|
|
if (typid == BPCHAROID)
|
2016-02-03 20:17:35 +01:00
|
|
|
ssup->comparator = bpcharfastcmp_c;
|
2019-01-10 20:07:01 +01:00
|
|
|
else if (typid == NAMEOID)
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
{
|
|
|
|
ssup->comparator = namefastcmp_c;
|
|
|
|
/* Not supporting abbreviation with type NAME, for now */
|
|
|
|
abbreviate = false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
ssup->comparator = varstrfastcmp_c;
|
2016-02-03 20:17:35 +01:00
|
|
|
|
More fixes for abbreviated keys infrastructure.
2015-01-22 17:58:58 +01:00
|
|
|
collate_c = true;
|
|
|
|
}
|
2015-01-22 16:46:42 +01:00
|
|
|
else
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2015-01-22 16:46:42 +01:00
|
|
|
/*
|
|
|
|
* We need a collation-sensitive comparison. To make things faster,
|
|
|
|
* we'll figure out the collation based on the locale id and cache the
|
|
|
|
* result.
|
|
|
|
*/
|
|
|
|
if (collid != DEFAULT_COLLATION_OID)
|
More fixes for abbreviated keys infrastructure.
2015-01-22 17:58:58 +01:00
|
|
|
locale = pg_newlocale_from_collation(collid);
|
2017-09-24 06:56:31 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* There is a further exception on Windows. When the database
|
|
|
|
* encoding is UTF-8 and we are not using the C collation, complex
|
|
|
|
* hacks are required. We don't currently have a comparator that
|
|
|
|
* handles that case, so we fall back on the slow method of having the
|
|
|
|
* sort code invoke bttextcmp() (in the case of text) via the fmgr
|
|
|
|
* trampoline. ICU locales work just the same on Windows, however.
|
|
|
|
*/
|
|
|
|
#ifdef WIN32
|
|
|
|
if (GetDatabaseEncoding() == PG_UTF8 &&
|
|
|
|
!(locale && locale->provider == COLLPROVIDER_ICU))
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
/*
|
|
|
|
* We use varlenafastcmp_locale except for type NAME.
|
|
|
|
*/
|
2019-01-10 20:07:01 +01:00
|
|
|
if (typid == NAMEOID)
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
{
|
|
|
|
ssup->comparator = namefastcmp_locale;
|
|
|
|
/* Not supporting abbreviation with type NAME, for now */
|
|
|
|
abbreviate = false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
ssup->comparator = varlenafastcmp_locale;
|
2014-08-14 18:09:52 +02:00
|
|
|
}
|
|
|
|
|
2015-01-19 21:20:31 +01:00
|
|
|
/*
|
2016-03-23 20:58:34 +01:00
|
|
|
* Unfortunately, it seems that abbreviation for non-C collations is
|
|
|
|
* broken on many common platforms; testing of multiple versions of glibc
|
|
|
|
* reveals that, for many locales, strcoll() and strxfrm() do not return
|
|
|
|
* consistent results, which is fatal to this optimization. While no
|
|
|
|
* libc other than Cygwin has so far been shown to have a problem,
|
|
|
|
* we take the conservative course of action for right now and disable
|
|
|
|
* this categorically. (Users who are certain this isn't a problem on
|
|
|
|
* their system can define TRUST_STRXFRM.)
|
|
|
|
*
|
|
|
|
* Even apart from the risk of broken locales, it's possible that there
|
|
|
|
* are platforms where the use of abbreviated keys should be disabled at
|
|
|
|
* compile time. Having only 4 byte datums could make worst-case
|
Refer to OS X as "macOS", except for the port name which is still "darwin".
We weren't terribly consistent about whether to call Apple's OS "OS X"
or "Mac OS X", and the former is probably confusing to people who aren't
Apple users. Now that Apple has rebranded it "macOS", follow their lead
to establish a consistent naming pattern. Also, avoid the use of the
ancient project name "Darwin", except as the port code name which does not
seem desirable to change. (In short, this patch touches documentation and
comments, but no actual code.)
I didn't touch contrib/start-scripts/osx/, either. I suspect those are
obsolete and due for a rewrite, anyway.
I dithered about whether to apply this edit to old release notes, but
those were responsible for quite a lot of the inconsistencies, so I ended
up changing them too. Anyway, Apple's being ahistorical about this,
so why shouldn't we be?
2016-09-25 21:40:57 +02:00
|
|
|
* performance drastically more likely, for example. Moreover, macOS's
|
|
|
|
* strxfrm() implementation is known to not effectively concentrate a
|
2016-03-23 20:58:34 +01:00
|
|
|
* significant amount of entropy from the original string in earlier
|
|
|
|
* transformed blobs. It's possible that other supported platforms are
|
|
|
|
* similarly encumbered. So, if we ever get past disabling this
|
|
|
|
* categorically, we may still want or need to disable it for particular
|
|
|
|
* platforms.
|
2015-01-19 21:20:31 +01:00
|
|
|
*/
|
2016-03-23 20:58:34 +01:00
|
|
|
#ifndef TRUST_STRXFRM
|
2017-03-23 20:25:34 +01:00
|
|
|
if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
|
2016-03-23 20:58:34 +01:00
|
|
|
abbreviate = false;
|
|
|
|
#endif
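To make "consistent results" concrete: abbreviation is only sound if strcoll()
and a plain byte comparison of the strxfrm() blobs always agree on sign. A
standalone illustrative sketch of that invariant (not part of this file; the
sample strings and buffer sizes are assumptions):

#include <locale.h>
#include <stdio.h>
#include <string.h>

/* Normalize a comparison result to -1, 0, or +1 */
static int
sign_of(int x)
{
	return (x > 0) - (x < 0);
}

int
main(void)
{
	const char *a = "cote";
	const char *b = "cot\xc3\xa9";	/* "coté" in UTF-8 */
	char		xa[1024];
	char		xb[1024];

	setlocale(LC_COLLATE, "");	/* take the collation from the environment */

	/*
	 * A real harness would check strxfrm's return value and enlarge the
	 * buffers if the transformed blobs were truncated.
	 */
	strxfrm(xa, a, sizeof(xa));
	strxfrm(xb, b, sizeof(xb));

	if (sign_of(strcoll(a, b)) != sign_of(strcmp(xa, xb)))
		printf("strcoll()/strxfrm() disagree: abbreviated keys unsafe\n");
	else
		printf("consistent for this pair\n");

	return 0;
}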
|
2015-01-19 21:20:31 +01:00
|
|
|
|
2015-01-22 16:46:42 +01:00
|
|
|
/*
|
|
|
|
* If we're using abbreviated keys, or if we're using a locale-aware
|
2019-08-13 06:53:41 +02:00
|
|
|
* comparison, we need to initialize a VarStringSortSupport object. Both
|
2016-02-03 20:17:35 +01:00
|
|
|
* cases will make use of the temporary buffers we initialize here for
|
|
|
|
* scratch space (and to detect requirement for BpChar semantics from
|
|
|
|
* caller), and the abbreviation case requires additional state.
|
2015-01-22 16:46:42 +01:00
|
|
|
*/
|
More fixes for abbreviated keys infrastructure.
2015-01-22 17:58:58 +01:00
|
|
|
if (abbreviate || !collate_c)
|
2015-01-19 21:20:31 +01:00
|
|
|
{
|
2016-02-08 21:15:56 +01:00
|
|
|
sss = palloc(sizeof(VarStringSortSupport));
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->buf1 = palloc(TEXTBUFLEN);
|
|
|
|
sss->buflen1 = TEXTBUFLEN;
|
|
|
|
sss->buf2 = palloc(TEXTBUFLEN);
|
|
|
|
sss->buflen2 = TEXTBUFLEN;
|
2015-10-10 01:03:44 +02:00
|
|
|
/* Start with invalid values */
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->last_len1 = -1;
|
|
|
|
sss->last_len2 = -1;
|
2015-10-20 15:27:50 +02:00
|
|
|
/* Initialize */
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->last_returned = 0;
|
|
|
|
sss->locale = locale;
|
2016-02-08 21:17:40 +01:00
|
|
|
|
2015-10-10 01:03:44 +02:00
|
|
|
/*
|
2015-10-20 15:27:50 +02:00
|
|
|
* To avoid somehow confusing a strxfrm() blob and an original string,
|
|
|
|
* constantly keep track of the variety of data that buf1 and buf2
|
|
|
|
* currently contain.
|
|
|
|
*
|
|
|
|
* Comparisons may be interleaved with conversion calls. Frequently,
|
|
|
|
* conversions and comparisons are batched into two distinct phases,
|
|
|
|
* but the correctness of caching cannot hinge upon this. For
|
|
|
|
* comparison caching, buffer state is only trusted if cache_blob is
|
|
|
|
* found set to false, whereas strxfrm() caching only trusts the state
|
|
|
|
* when cache_blob is found set to true.
|
|
|
|
*
|
|
|
|
* Arbitrarily initialize cache_blob to true.
|
2015-10-10 01:03:44 +02:00
|
|
|
*/
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->cache_blob = true;
|
|
|
|
sss->collate_c = collate_c;
|
2019-01-10 20:07:01 +01:00
|
|
|
sss->typid = typid;
|
2016-02-03 20:17:35 +01:00
|
|
|
ssup->ssup_extra = sss;
|
2015-01-19 21:20:31 +01:00
|
|
|
|
2015-01-22 16:46:42 +01:00
|
|
|
/*
|
|
|
|
* If possible, plan to use the abbreviated keys optimization. The
|
|
|
|
* core code may switch back to authoritative comparator should
|
|
|
|
* abbreviation be aborted.
|
|
|
|
*/
|
|
|
|
if (abbreviate)
|
|
|
|
{
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->prop_card = 0.20;
|
|
|
|
initHyperLogLog(&sss->abbr_card, 10);
|
|
|
|
initHyperLogLog(&sss->full_card, 10);
|
2015-01-22 16:46:42 +01:00
|
|
|
ssup->abbrev_full_comparator = ssup->comparator;
|
2016-02-03 20:17:35 +01:00
|
|
|
ssup->comparator = varstrcmp_abbrev;
|
|
|
|
ssup->abbrev_converter = varstr_abbrev_convert;
|
|
|
|
ssup->abbrev_abort = varstr_abbrev_abort;
|
2015-01-22 16:46:42 +01:00
|
|
|
}
|
|
|
|
}
|
2014-08-14 18:09:52 +02:00
|
|
|
}
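For the non-collatable case mentioned in the header comment, a bytea entry
point can reuse this machinery by forcing the C collation; a sketch modeled
on bttextsortsupport() above (the real bytea hook may differ in detail):

Datum
bytea_sortsupport(PG_FUNCTION_ARGS)
{
	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
	MemoryContext oldcontext;

	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);

	/* Use generic string SortSupport, forcing "C" collation */
	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);

	MemoryContextSwitchTo(oldcontext);

	PG_RETURN_VOID();
}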
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sortsupport comparison func (for C locale case)
|
|
|
|
*/
|
|
|
|
static int
|
2016-02-03 20:17:35 +01:00
|
|
|
varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2016-02-08 21:15:56 +01:00
|
|
|
VarString *arg1 = DatumGetVarStringPP(x);
|
|
|
|
VarString *arg2 = DatumGetVarStringPP(y);
|
2014-08-14 18:09:52 +02:00
|
|
|
char *a1p,
|
|
|
|
*a2p;
|
|
|
|
int len1,
|
|
|
|
len2,
|
|
|
|
result;
|
|
|
|
|
|
|
|
a1p = VARDATA_ANY(arg1);
|
|
|
|
a2p = VARDATA_ANY(arg2);
|
|
|
|
|
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
|
|
|
|
|
|
|
result = memcmp(a1p, a2p, Min(len1, len2));
|
|
|
|
if ((result == 0) && (len1 != len2))
|
|
|
|
result = (len1 < len2) ? -1 : 1;
|
|
|
|
|
|
|
|
/* We can't afford to leak memory here. */
|
|
|
|
if (PointerGetDatum(arg1) != x)
|
|
|
|
pfree(arg1);
|
|
|
|
if (PointerGetDatum(arg2) != y)
|
|
|
|
pfree(arg2);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-02-03 20:17:35 +01:00
|
|
|
/*
|
|
|
|
* sortsupport comparison func (for BpChar C locale case)
|
|
|
|
*
|
|
|
|
* BpChar outsources its sortsupport to this module. Specialization for the
|
|
|
|
* varstr_sortsupport BpChar case, modeled on
|
|
|
|
* internal_bpchar_pattern_compare().
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
|
|
|
|
{
|
|
|
|
BpChar *arg1 = DatumGetBpCharPP(x);
|
|
|
|
BpChar *arg2 = DatumGetBpCharPP(y);
|
|
|
|
char *a1p,
|
|
|
|
*a2p;
|
|
|
|
int len1,
|
|
|
|
len2,
|
|
|
|
result;
|
|
|
|
|
|
|
|
a1p = VARDATA_ANY(arg1);
|
|
|
|
a2p = VARDATA_ANY(arg2);
|
|
|
|
|
|
|
|
len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
|
|
|
|
len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
|
|
|
|
|
|
|
|
result = memcmp(a1p, a2p, Min(len1, len2));
|
|
|
|
if ((result == 0) && (len1 != len2))
|
|
|
|
result = (len1 < len2) ? -1 : 1;
|
|
|
|
|
|
|
|
/* We can't afford to leak memory here. */
|
|
|
|
if (PointerGetDatum(arg1) != x)
|
|
|
|
pfree(arg1);
|
|
|
|
if (PointerGetDatum(arg2) != y)
|
|
|
|
pfree(arg2);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-08-14 18:09:52 +02:00
|
|
|
/*
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
* sortsupport comparison func (for NAME C locale case)
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
namefastcmp_c(Datum x, Datum y, SortSupport ssup)
|
|
|
|
{
|
|
|
|
Name arg1 = DatumGetName(x);
|
|
|
|
Name arg2 = DatumGetName(y);
|
|
|
|
|
|
|
|
return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sortsupport comparison func (for locale case with all varlena types)
|
2014-08-14 18:09:52 +02:00
|
|
|
*/
|
|
|
|
static int
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2016-02-08 21:17:40 +01:00
|
|
|
VarString *arg1 = DatumGetVarStringPP(x);
|
|
|
|
VarString *arg2 = DatumGetVarStringPP(y);
|
2015-05-24 03:35:49 +02:00
|
|
|
char *a1p,
|
|
|
|
*a2p;
|
|
|
|
int len1,
|
|
|
|
len2,
|
|
|
|
result;
|
2014-08-14 18:09:52 +02:00
|
|
|
|
|
|
|
a1p = VARDATA_ANY(arg1);
|
|
|
|
a2p = VARDATA_ANY(arg2);
|
|
|
|
|
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
|
|
|
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
|
|
|
|
|
|
|
|
/* We can't afford to leak memory here. */
|
|
|
|
if (PointerGetDatum(arg1) != x)
|
|
|
|
pfree(arg1);
|
|
|
|
if (PointerGetDatum(arg2) != y)
|
|
|
|
pfree(arg2);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sortsupport comparison func (for locale case with NAME type)
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
|
|
|
|
{
|
|
|
|
Name arg1 = DatumGetName(x);
|
|
|
|
Name arg2 = DatumGetName(y);
|
|
|
|
|
|
|
|
return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
|
|
|
|
NameStr(*arg2), strlen(NameStr(*arg2)),
|
|
|
|
ssup);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sortsupport comparison func for locale cases
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
|
|
|
|
{
|
|
|
|
VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
|
|
|
|
int result;
|
|
|
|
bool arg1_match;
|
|
|
|
|
2014-09-19 18:39:00 +02:00
|
|
|
/* Fast pre-check for equality, as discussed in varstr_cmp() */
|
|
|
|
if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
|
|
|
|
{
|
2015-10-10 01:03:44 +02:00
|
|
|
/*
|
|
|
|
* No change in buf1 or buf2 contents, so avoid changing last_len1 or
|
2016-02-08 21:17:40 +01:00
|
|
|
* last_len2. Existing contents of buffers might still be used by
|
|
|
|
* next call.
|
2016-02-03 20:17:35 +01:00
|
|
|
*
|
2016-02-08 21:17:40 +01:00
|
|
|
* It's fine to allow the comparison of BpChar padding bytes here,
|
|
|
|
* even though that implies that the memcmp() will usually be
|
|
|
|
* performed for BpChar callers (though multibyte characters could
|
|
|
|
* still prevent that from occurring). The memcmp() is still very
|
|
|
|
* cheap, and BpChar's funny semantics have us remove trailing spaces
|
|
|
|
* (not limited to padding), so we need make no distinction between
|
|
|
|
* padding space characters and "real" space characters.
|
2015-10-10 01:03:44 +02:00
|
|
|
*/
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
return 0;
|
2014-09-19 18:39:00 +02:00
|
|
|
}
|
|
|
|
|
2019-01-10 20:07:01 +01:00
|
|
|
if (sss->typid == BPCHAROID)
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2016-02-03 20:17:35 +01:00
|
|
|
/* Get true number of bytes, ignoring trailing spaces */
|
|
|
|
len1 = bpchartruelen(a1p, len1);
|
|
|
|
len2 = bpchartruelen(a2p, len2);
|
2014-08-14 18:09:52 +02:00
|
|
|
}
|
2016-02-03 20:17:35 +01:00
|
|
|
|
|
|
|
if (len1 >= sss->buflen1)
|
2014-08-14 18:09:52 +02:00
|
|
|
{
|
2016-02-03 20:17:35 +01:00
|
|
|
pfree(sss->buf1);
|
|
|
|
sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
|
|
|
|
sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
|
|
|
|
}
|
|
|
|
if (len2 >= sss->buflen2)
|
|
|
|
{
|
|
|
|
pfree(sss->buf2);
|
|
|
|
sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
|
|
|
|
sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
|
2014-08-14 18:09:52 +02:00
|
|
|
}
|
|
|
|
|
2015-10-10 01:03:44 +02:00
|
|
|
/*
|
|
|
|
* We're likely to be asked to compare the same strings repeatedly, and
|
|
|
|
* memcmp() is so much cheaper than strcoll() that it pays to try to cache
|
|
|
|
* comparisons, even though in general there is no reason to think that
|
2016-02-08 21:17:40 +01:00
|
|
|
* that will work out (every string datum may be unique). Caching does
|
|
|
|
* not slow things down measurably when it doesn't work out, and can speed
|
2015-10-10 01:03:44 +02:00
|
|
|
* things up by rather a lot when it does. In part, this is because the
|
|
|
|
* memcmp() compares data from cachelines that are needed in L1 cache even
|
|
|
|
* when the last comparison's result cannot be reused.
|
|
|
|
*/
|
|
|
|
arg1_match = true;
|
2016-02-03 20:17:35 +01:00
|
|
|
if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
|
2015-10-10 01:03:44 +02:00
|
|
|
{
|
|
|
|
arg1_match = false;
|
2016-02-03 20:17:35 +01:00
|
|
|
memcpy(sss->buf1, a1p, len1);
|
|
|
|
sss->buf1[len1] = '\0';
|
|
|
|
sss->last_len1 = len1;
|
2015-10-10 01:03:44 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're comparing the same two strings as last time, we can return the
|
|
|
|
* same answer without calling strcoll() again. This is more likely than
|
|
|
|
* it seems (at least with moderate to low cardinality sets), because
|
|
|
|
* quicksort compares the same pivot against many values.
|
|
|
|
*/
|
2016-02-03 20:17:35 +01:00
|
|
|
if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
|
2015-10-10 01:03:44 +02:00
|
|
|
{
|
2016-02-03 20:17:35 +01:00
|
|
|
memcpy(sss->buf2, a2p, len2);
|
|
|
|
sss->buf2[len2] = '\0';
|
|
|
|
sss->last_len2 = len2;
|
2015-10-10 01:03:44 +02:00
|
|
|
}
|
2016-02-03 20:17:35 +01:00
|
|
|
else if (arg1_match && !sss->cache_blob)
|
2015-10-10 01:03:44 +02:00
|
|
|
{
|
|
|
|
/* Use result cached following last actual strcoll() call */
|
Make type "name" collation-aware.
The "name" comparison operators now all support collations, making them
functionally equivalent to "text" comparisons, except for the different
physical representation of the datatype. They do, in fact, mostly share
the varstr_cmp and varstr_sortsupport infrastructure, which has been
slightly enlarged to handle the case.
To avoid changes in the default behavior of the datatype, set name's
typcollation to C_COLLATION_OID not DEFAULT_COLLATION_OID, so that
by default comparisons to a name value will continue to use strcmp
semantics. (This would have been the case for system catalog columns
anyway, because of commit 6b0faf723, but doing this makes it true for
user-created name columns as well. In particular, this avoids
locale-dependent changes in our regression test results.)
In consequence, tweak a couple of places that made assumptions about
collatable base types always having typcollation DEFAULT_COLLATION_OID.
I have not, however, attempted to relax the restriction that user-
defined collatable types must have that. Hence, "name" doesn't
behave quite like a user-defined type; it acts more like a domain
with COLLATE "C". (Conceivably, if we ever get rid of the need for
catalog name columns to be fixed-length, "name" could actually become
such a domain over text. But that'd be a pretty massive undertaking,
and I'm not volunteering.)
Discussion: https://postgr.es/m/15938.1544377821@sss.pgh.pa.us
2018-12-19 23:35:12 +01:00
|
|
|
return sss->last_returned;
|
2015-10-10 01:03:44 +02:00
|
|
|
}
|
2014-08-14 18:09:52 +02:00
|
|
|
|
2016-02-03 20:17:35 +01:00
|
|
|
if (sss->locale)
|
2017-03-23 20:25:34 +01:00
|
|
|
{
|
|
|
|
if (sss->locale->provider == COLLPROVIDER_ICU)
|
|
|
|
{
|
|
|
|
#ifdef USE_ICU
|
|
|
|
#ifdef HAVE_UCOL_STRCOLLUTF8
|
|
|
|
if (GetDatabaseEncoding() == PG_UTF8)
|
|
|
|
{
|
|
|
|
UErrorCode status;
|
|
|
|
|
|
|
|
status = U_ZERO_ERROR;
|
|
|
|
result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
|
|
|
|
a1p, len1,
|
|
|
|
a2p, len2,
|
|
|
|
&status);
|
|
|
|
if (U_FAILURE(status))
|
|
|
|
ereport(ERROR,
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:35:54 +02:00
|
|
|
(errmsg("collation failed: %s", u_errorName(status))));
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
2017-05-17 22:31:56 +02:00
|
|
|
int32_t ulen1,
|
|
|
|
ulen2;
|
|
|
|
UChar *uchar1,
|
|
|
|
*uchar2;
|
2017-03-23 20:25:34 +01:00
|
|
|
|
|
|
|
ulen1 = icu_to_uchar(&uchar1, a1p, len1);
|
|
|
|
ulen2 = icu_to_uchar(&uchar2, a2p, len2);
|
|
|
|
|
|
|
|
result = ucol_strcoll(sss->locale->info.icu.ucol,
|
|
|
|
uchar1, ulen1,
|
|
|
|
uchar2, ulen2);
|
Fix memory leakage in ICU encoding conversion, and other code review.
Callers of icu_to_uchar() neglected to pfree the result string when done
with it. This results in catastrophic memory leaks in varstr_cmp(),
because of our prevailing assumption that btree comparison functions don't
leak memory. For safety, make all the call sites clean up leaks, though
I suspect that we could get away without it in formatting.c. I audited
callers of icu_from_uchar() as well, but found no places that seemed to
have a comparable issue.
Add function API specifications for icu_to_uchar() and icu_from_uchar();
the lack of any thought-through specification is perhaps not unrelated
to the existence of this bug in the first place. Fix icu_to_uchar()
to guarantee a nul-terminated result; although no existing caller appears
to care, the fact that it would have been nul-terminated except in
extreme corner cases seems ideally designed to bite someone on the rear
someday. Fix ucnv_fromUChars() destCapacity argument --- in the worst
case, that could perhaps have led to a non-nul-terminated result, too.
Fix icu_from_uchar() to have a more reasonable definition of the function
result --- no callers are actually paying attention, so this isn't a live
bug, but it's certainly sloppily designed. Const-ify icu_from_uchar()'s
input string for consistency.
That is not the end of what needs to be done to these functions, but
it's as much as I have the patience for right now.
Discussion: https://postgr.es/m/1955.1498181798@sss.pgh.pa.us
2017-06-23 18:22:06 +02:00
|
|
|
|
|
|
|
pfree(uchar1);
|
|
|
|
pfree(uchar2);
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
2017-05-17 22:31:56 +02:00
|
|
|
#else /* not USE_ICU */
|
2017-03-23 20:25:34 +01:00
|
|
|
/* shouldn't happen */
|
|
|
|
elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
|
|
|
#endif /* not USE_ICU */
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
#ifdef HAVE_LOCALE_T
|
|
|
|
result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
|
|
|
|
#else
|
|
|
|
/* shouldn't happen */
|
|
|
|
elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
|
2014-08-14 18:09:52 +02:00
|
|
|
#endif
|
2017-03-23 20:25:34 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2016-02-03 20:17:35 +01:00
|
|
|
result = strcoll(sss->buf1, sss->buf2);
|
2014-08-14 18:09:52 +02:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
/* Break tie if necessary. */
|
|
|
|
if (result == 0 &&
|
|
|
|
(!sss->locale || sss->locale->deterministic))
|
2016-02-03 20:17:35 +01:00
|
|
|
result = strcmp(sss->buf1, sss->buf2);
|
2014-08-14 18:09:52 +02:00
|
|
|
|
2015-10-10 01:03:44 +02:00
|
|
|
/* Cache result, perhaps saving an expensive strcoll() call next time */
|
2016-02-03 20:17:35 +01:00
|
|
|
sss->cache_blob = false;
|
|
|
|
sss->last_returned = result;
|
2014-08-14 18:09:52 +02:00
|
|
|
return result;
|
|
|
|
}
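
/*
 * Illustrative sketch, not part of the original file: the caching scheme
 * used by varstrfastcmp_locale() above, reduced to its essentials.  A
 * comparator remembers the last pair of keys it resolved with an expensive
 * comparison, and replays the cached answer when the same pair comes back;
 * that happens often because quicksort compares one pivot against many
 * values.  The fixed 64-byte buffers and the "expensive_cmp" callback are
 * assumptions made for the sketch; the real code grows its buffers on
 * demand and calls strcoll()/ucol_strcoll().
 */
#ifdef NOT_USED
typedef struct CmpCache
{
	char		buf1[64];		/* copy of last left-hand key (< 64 bytes assumed) */
	char		buf2[64];		/* copy of last right-hand key */
	int			len1;
	int			len2;
	bool		valid;			/* is last_result usable? */
	int			last_result;
} CmpCache;

static int
cached_compare(CmpCache *cache, const char *a, int alen,
			   const char *b, int blen,
			   int (*expensive_cmp) (const char *, const char *))
{
	bool		a_match = (alen == cache->len1 &&
						   memcmp(cache->buf1, a, alen) == 0);
	bool		b_match = (blen == cache->len2 &&
						   memcmp(cache->buf2, b, blen) == 0);

	/* Same two keys as last time: replay the cached answer */
	if (a_match && b_match && cache->valid)
		return cache->last_result;

	/* Refresh whichever side changed, NUL-terminating for the comparator */
	if (!a_match)
	{
		memcpy(cache->buf1, a, alen);
		cache->buf1[alen] = '\0';
		cache->len1 = alen;
	}
	if (!b_match)
	{
		memcpy(cache->buf2, b, blen);
		cache->buf2[blen] = '\0';
		cache->len2 = blen;
	}

	cache->last_result = expensive_cmp(cache->buf1, cache->buf2);
	cache->valid = true;
	return cache->last_result;
}
#endif							/* NOT_USED */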

/*
 * Abbreviated key comparison func
 */
static int
varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
{
	/*
	 * When 0 is returned, the core system will call varstrfastcmp_c()
	 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
	 * authoritatively, for the same reason that there is a strcoll()
	 * tie-breaker call to strcmp() in varstr_cmp().
	 */
	if (x > y)
		return 1;
	else if (x == y)
		return 0;
	else
		return -1;
}

/*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
 * locale is used, or in case of bytea, just memcpy() from original instead.
 */
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	VarString  *authoritative = DatumGetVarStringPP(original);
	char	   *authoritative_data = VARDATA_ANY(authoritative);

	/* working state */
	Datum		res;
	char	   *pres;
	int			len;
	uint32		hash;

	pres = (char *) &res;
	/* memset(), so any non-overwritten bytes are NUL */
	memset(pres, 0, sizeof(Datum));
	len = VARSIZE_ANY_EXHDR(authoritative);

	/* Get number of bytes, ignoring trailing spaces */
	if (sss->typid == BPCHAROID)
		len = bpchartruelen(authoritative_data, len);

	/*
	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
	 * abbreviate keys.  The full comparator for the C locale is always
	 * memcmp().  It would be incorrect to allow bytea callers (callers that
	 * always force the C collation -- bytea isn't a collatable type, but this
	 * approach is convenient) to use strxfrm().  This is because bytea
	 * strings may contain NUL bytes.  Besides, this should be faster, too.
	 *
	 * More generally, it's okay that bytea callers can have NUL bytes in
	 * strings because varstrcmp_abbrev() need not make a distinction between
	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
	 * authoritative representation.  Hopefully a comparison at or past one
	 * abbreviated key's terminating NUL byte will resolve the comparison
	 * without consulting the authoritative representation; specifically, some
	 * later non-NUL byte in the longer string can resolve the comparison
	 * against a subsequent terminating NUL in the shorter string.  There will
	 * usually be what is effectively a "length-wise" resolution there and
	 * then.
	 *
	 * If that doesn't work out -- if all bytes in the longer string
	 * positioned at or past the offset of the smaller string's (first)
	 * terminating NUL are actually representative of NUL bytes in the
	 * authoritative binary string (perhaps with some *terminating* NUL bytes
	 * towards the end of the longer string iff it happens to still be small)
	 * -- then an authoritative tie-breaker will happen, and do the right
	 * thing: explicitly consider string length.
	 */
	if (sss->collate_c)
		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
	else
	{
		Size		bsize;
#ifdef USE_ICU
		int32_t		ulen = -1;
		UChar	   *uchar = NULL;
#endif

		/*
		 * We're not using the C collation, so fall back on strxfrm or ICU
		 * analogs.
		 */

		/* By convention, we use buffer 1 to store and NUL-terminate */
		if (len >= sss->buflen1)
		{
			pfree(sss->buf1);
			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
			sss->buf1 = palloc(sss->buflen1);
		}

		/* Might be able to reuse strxfrm() blob from last call */
		if (sss->last_len1 == len && sss->cache_blob &&
			memcmp(sss->buf1, authoritative_data, len) == 0)
		{
			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
			/* No change affecting cardinality, so no hashing required */
			goto done;
		}

		memcpy(sss->buf1, authoritative_data, len);

		/*
		 * Just like strcoll(), strxfrm() expects a NUL-terminated string.  Not
		 * necessary for ICU, but doesn't hurt.
		 */
		sss->buf1[len] = '\0';
		sss->last_len1 = len;

#ifdef USE_ICU
		/* When using ICU and not UTF8, convert string to UChar. */
		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
			GetDatabaseEncoding() != PG_UTF8)
			ulen = icu_to_uchar(&uchar, sss->buf1, len);
#endif

		/*
		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
		 * and try again.  Both of these functions have the result buffer
		 * content undefined if the result did not fit, so we need to retry
		 * until everything fits, even though we only need the first few bytes
		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
		 * ask for as many bytes as we actually need.
		 */
		for (;;)
		{
#ifdef USE_ICU
			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
			{
				/*
				 * When using UTF8, use the iteration interface so we only
				 * need to produce as many bytes as we actually need.
				 */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UCharIterator iter;
					uint32_t	state[2];
					UErrorCode	status;

					uiter_setUTF8(&iter, sss->buf1, len);
					state[0] = state[1] = 0;	/* won't need that again */
					status = U_ZERO_ERROR;
					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
												 &iter,
												 state,
												 (uint8_t *) sss->buf2,
												 Min(sizeof(Datum), sss->buflen2),
												 &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("sort key generation failed: %s",
										u_errorName(status))));
				}
				else
					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
											uchar, ulen,
											(uint8_t *) sss->buf2, sss->buflen2);
			}
			else
#endif
#ifdef HAVE_LOCALE_T
			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
				bsize = strxfrm_l(sss->buf2, sss->buf1,
								  sss->buflen2, sss->locale->info.lt);
			else
#endif
				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);

			sss->last_len2 = bsize;
			if (bsize < sss->buflen2)
				break;

			/*
			 * Grow buffer and retry.
			 */
			pfree(sss->buf2);
			sss->buflen2 = Max(bsize + 1,
							   Min(sss->buflen2 * 2, MaxAllocSize));
			sss->buf2 = palloc(sss->buflen2);
		}

		/*
		 * Every Datum byte is always compared.  This is safe because the
		 * strxfrm() blob is itself NUL terminated, leaving no danger of
		 * misinterpreting any NUL bytes not intended to be interpreted as
		 * logically representing termination.
		 *
		 * (Actually, even if there were NUL bytes in the blob it would be
		 * okay.  See remarks on bytea case above.)
		 */
		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));

#ifdef USE_ICU
		if (uchar)
			pfree(uchar);
#endif
	}

	/*
	 * Maintain approximate cardinality of both abbreviated keys and original,
	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
	 * the worst case, where we do many string transformations for no saving
	 * in full strcoll()-based comparisons.  These statistics are used by
	 * varstr_abbrev_abort().
	 *
	 * First, Hash key proper, or a significant fraction of it.  Mix in length
	 * in order to compensate for cases where differences are past
	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
	 */
	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
								   Min(len, PG_CACHE_LINE_SIZE)));

	if (len > PG_CACHE_LINE_SIZE)
		hash ^= DatumGetUInt32(hash_uint32((uint32) len));

	addHyperLogLog(&sss->full_card, hash);

	/* Hash abbreviated key */
#if SIZEOF_DATUM == 8
	{
		uint32		lohalf,
					hihalf;

		lohalf = (uint32) res;
		hihalf = (uint32) (res >> 32);
		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
	}
#else							/* SIZEOF_DATUM != 8 */
	hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif

	addHyperLogLog(&sss->abbr_card, hash);

	/* Cache result, perhaps saving an expensive strxfrm() call next time */
	sss->cache_blob = true;
done:

	/*
	 * Byteswap on little-endian machines.
	 *
	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
	 * comparator) works correctly on all platforms.  If we didn't do this,
	 * the comparator would have to call memcmp() with a pair of pointers to
	 * the first byte of each abbreviated key, which is slower.
	 */
	res = DatumBigEndianToNative(res);

	/* Don't leak memory here */
	if (PointerGetDatum(authoritative) != original)
		pfree(authoritative);

	return res;
}
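
/*
 * Illustrative sketch, not part of the original file: why the byteswap
 * above makes varstrcmp_abbrev()'s unsigned-integer comparison agree with
 * memcmp() on the key bytes.  Folding the first sizeof(uint64) transformed
 * bytes in most-significant-byte-first order yields an integer whose
 * numeric order matches the bytes' lexicographic order, with short blobs
 * zero-padded just as the memset()/memcpy() above leaves them.  The
 * function name is an assumption made for the sketch.
 */
#ifdef NOT_USED
static uint64
pack_abbrev_key(const unsigned char *blob, Size bsize)
{
	uint64		key = 0;
	Size		n = Min(bsize, sizeof(uint64));
	Size		i;

	/* Fold bytes most-significant-first, so integer order == byte order */
	for (i = 0; i < n; i++)
		key |= (uint64) blob[i] << (8 * (sizeof(uint64) - 1 - i));

	/* e.g. pack_abbrev_key("AB", 2) < pack_abbrev_key("AC", 2) */
	return key;
}
#endif							/* NOT_USED */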

/*
 * Callback for estimating effectiveness of abbreviated key optimization, using
 * heuristic rules.  Returns value indicating if the abbreviation optimization
 * should be aborted, based on its projected effectiveness.
 */
static bool
varstr_abbrev_abort(int memtupcount, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	double		abbrev_distinct,
				key_distinct;

	Assert(ssup->abbreviate);

	/* Have a little patience */
	if (memtupcount < 100)
		return false;

	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
	key_distinct = estimateHyperLogLog(&sss->full_card);

	/*
	 * Clamp cardinality estimates to at least one distinct value.  While
	 * NULLs are generally disregarded, if only NULL values were seen so far,
	 * that might misrepresent costs if we failed to clamp.
	 */
	if (abbrev_distinct <= 1.0)
		abbrev_distinct = 1.0;

	if (key_distinct <= 1.0)
		key_distinct = 1.0;

	/*
	 * In the worst case all abbreviated keys are identical, while at the same
	 * time there are differences within full key strings not captured in
	 * abbreviations.
	 */
#ifdef TRACE_SORT
	if (trace_sort)
	{
		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;

		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
			 sss->prop_card);
	}
#endif

	/*
	 * If the number of distinct abbreviated keys approximately matches the
	 * number of distinct authoritative original keys, that's reason enough to
	 * proceed.  We can win even with a very low cardinality set if most
	 * tie-breakers only memcmp().  This is by far the most important
	 * consideration.
	 *
	 * While comparisons that are resolved at the abbreviated key level are
	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
	 * those two outcomes are so much cheaper than a full strcoll() once
	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
	 * cardinality against the overall size of the set in order to more
	 * accurately model costs.  Assume that an abbreviated comparison, and an
	 * abbreviated comparison with a cheap memcmp()-based authoritative
	 * resolution are equivalent.
	 */
	if (abbrev_distinct > key_distinct * sss->prop_card)
	{
		/*
		 * When we have exceeded 10,000 tuples, decay required cardinality
		 * aggressively for next call.
		 *
		 * This is useful because the number of comparisons required on
		 * average increases at a linearithmic rate, and at roughly 10,000
		 * tuples that factor will start to dominate over the linear costs of
		 * string transformation (this is a conservative estimate).  The decay
		 * rate is chosen to be a little less aggressive than halving -- which
		 * (since we're called at points at which memtupcount has doubled)
		 * would never see the cost model actually abort past the first call
		 * following a decay.  This decay rate is mostly a precaution against
		 * a sudden, violent swing in how well abbreviated cardinality tracks
		 * full key cardinality.  The decay also serves to prevent a marginal
		 * case from being aborted too late, when too much has already been
		 * invested in string transformation.
		 *
		 * It's possible for sets of several million distinct strings with
		 * mere tens of thousands of distinct abbreviated keys to still
		 * benefit very significantly.  This will generally occur provided
		 * each abbreviated key is a proxy for a roughly uniform number of the
		 * set's full keys.  If it isn't so, we hope to catch that early and
		 * abort.  If it isn't caught early, by the time the problem is
		 * apparent it's probably not worth aborting.
		 */
		if (memtupcount > 10000)
			sss->prop_card *= 0.65;

		return false;
	}

	/*
	 * Abort abbreviation strategy.
	 *
	 * The worst case, where all abbreviated keys are identical while all
	 * original strings differ will typically only see a regression of about
	 * 10% in execution time for small to medium sized lists of strings.
	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
	 * often expect very large improvements, particularly with sets of strings
	 * of moderately high to high abbreviated cardinality.  There is little to
	 * lose but much to gain, which our strategy reflects.
	 */
#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
#endif

	return true;
}
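
/*
 * Illustrative sketch, not part of the original file: the abort rule above
 * in isolation.  Abbreviation continues while the estimated number of
 * distinct abbreviated keys exceeds a required fraction (prop_card) of the
 * estimated number of distinct full keys; past 10,000 tuples the required
 * fraction decays by a factor of 0.65 per check, making later aborts
 * progressively less likely.  The free-standing function signature is an
 * assumption made for the sketch; the real code reads these values out of
 * VarStringSortSupport.
 */
#ifdef NOT_USED
static bool
should_abort_abbrev(double abbrev_distinct, double key_distinct,
					double *prop_card, int memtupcount)
{
	if (memtupcount < 100)
		return false;			/* have a little patience */

	if (abbrev_distinct > key_distinct * *prop_card)
	{
		if (memtupcount > 10000)
			*prop_card *= 0.65; /* decay required cardinality for next call */
		return false;
	}

	return true;				/* abbreviated keys too duplicative: abort */
}
#endif							/* NOT_USED */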

/*
 * Generic equalimage support function for character type's operator classes.
 * Disables the use of deduplication with nondeterministic collations.
 */
Datum
btvarstrequalimage(PG_FUNCTION_ARGS)
{
	/* Oid		opcintype = PG_GETARG_OID(0); */
	Oid			collid = PG_GET_COLLATION();

	check_collation_set(collid);

	if (lc_collate_is_c(collid) ||
		collid == DEFAULT_COLLATION_OID ||
		get_collation_isdeterministic(collid))
		PG_RETURN_BOOL(true);
	else
		PG_RETURN_BOOL(false);
}

Datum
text_larger(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	text	   *result;

	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);

	PG_RETURN_TEXT_P(result);
}

Datum
text_smaller(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	text	   *result;

	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);

	PG_RETURN_TEXT_P(result);
}

/*
 * Cross-type comparison functions for types text and name.
 */

Datum
nameeqtext(PG_FUNCTION_ARGS)
{
	Name		arg1 = PG_GETARG_NAME(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	size_t		len1 = strlen(NameStr(*arg1));
	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
	Oid			collid = PG_GET_COLLATION();
	bool		result;

	check_collation_set(collid);

	if (collid == C_COLLATION_OID)
		result = (len1 == len2 &&
				  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
	else
		result = (varstr_cmp(NameStr(*arg1), len1,
							 VARDATA_ANY(arg2), len2,
							 collid) == 0);

	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result);
}

Datum
texteqname(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	Name		arg2 = PG_GETARG_NAME(1);
	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
	size_t		len2 = strlen(NameStr(*arg2));
	Oid			collid = PG_GET_COLLATION();
	bool		result;

	check_collation_set(collid);

	if (collid == C_COLLATION_OID)
		result = (len1 == len2 &&
				  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
	else
		result = (varstr_cmp(VARDATA_ANY(arg1), len1,
							 NameStr(*arg2), len2,
							 collid) == 0);

	PG_FREE_IF_COPY(arg1, 0);

	PG_RETURN_BOOL(result);
}

Datum
namenetext(PG_FUNCTION_ARGS)
{
	Name		arg1 = PG_GETARG_NAME(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	size_t		len1 = strlen(NameStr(*arg1));
	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
	Oid			collid = PG_GET_COLLATION();
	bool		result;

	check_collation_set(collid);

	if (collid == C_COLLATION_OID)
		result = !(len1 == len2 &&
				   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
	else
		result = !(varstr_cmp(NameStr(*arg1), len1,
							  VARDATA_ANY(arg2), len2,
							  collid) == 0);

	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result);
}

Datum
textnename(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	Name		arg2 = PG_GETARG_NAME(1);
	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
	size_t		len2 = strlen(NameStr(*arg2));
	Oid			collid = PG_GET_COLLATION();
	bool		result;

	check_collation_set(collid);

	if (collid == C_COLLATION_OID)
		result = !(len1 == len2 &&
				   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
	else
		result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
							  NameStr(*arg2), len2,
							  collid) == 0);

	PG_FREE_IF_COPY(arg1, 0);

	PG_RETURN_BOOL(result);
}

Datum
btnametextcmp(PG_FUNCTION_ARGS)
{
	Name		arg1 = PG_GETARG_NAME(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int32		result;

	result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
						VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
						PG_GET_COLLATION());

	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_INT32(result);
}

Datum
bttextnamecmp(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	Name		arg2 = PG_GETARG_NAME(1);
	int32		result;

	result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
						NameStr(*arg2), strlen(NameStr(*arg2)),
						PG_GET_COLLATION());

	PG_FREE_IF_COPY(arg1, 0);

	PG_RETURN_INT32(result);
}

#define CmpCall(cmpfunc) \
	DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))

Datum
namelttext(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
}

Datum
nameletext(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
}

Datum
namegttext(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
}

Datum
namegetext(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
}

Datum
textltname(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
}

Datum
textlename(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
}

Datum
textgtname(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
}

Datum
textgename(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
}

#undef CmpCall

2003-05-15 17:50:21 +02:00
|
|
|
/*
|
|
|
|
* The following operators support character-by-character comparison
|
2008-05-27 02:13:09 +02:00
|
|
|
* of text datums, to allow building indexes suitable for LIKE clauses.
|
2016-02-03 20:17:35 +01:00
|
|
|
* Note that the regular texteq/textne comparison operators, and regular
|
|
|
|
* support functions 1 and 2 with "C" collation are assumed to be
|
|
|
|
* compatible with these!
|
2003-05-15 17:50:21 +02:00
|
|
|
*/

static int
internal_text_pattern_compare(text *arg1, text *arg2)
{
	int			result;
	int			len1,
				len2;

	len1 = VARSIZE_ANY_EXHDR(arg1);
	len2 = VARSIZE_ANY_EXHDR(arg2);

	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
	if (result != 0)
		return result;
	else if (len1 < len2)
		return -1;
	else if (len1 > len2)
		return 1;
	else
		return 0;
}
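
/*
 * Worked example (a sketch, not part of the original source): comparing
 * 'abc' against 'abcd', memcmp() over the common 3-byte prefix returns 0,
 * so the shorter string is reported smaller (-1); comparing 'abd' against
 * 'abc', memcmp() returns a positive value at the third byte and that
 * sign is returned directly.
 */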

Datum
text_pattern_lt(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int			result;

	result = internal_text_pattern_compare(arg1, arg2);

	PG_FREE_IF_COPY(arg1, 0);
	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result < 0);
}

Datum
text_pattern_le(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int			result;

	result = internal_text_pattern_compare(arg1, arg2);

	PG_FREE_IF_COPY(arg1, 0);
	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result <= 0);
}

Datum
text_pattern_ge(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int			result;

	result = internal_text_pattern_compare(arg1, arg2);

	PG_FREE_IF_COPY(arg1, 0);
	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result >= 0);
}

Datum
text_pattern_gt(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int			result;

	result = internal_text_pattern_compare(arg1, arg2);

	PG_FREE_IF_COPY(arg1, 0);
	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_BOOL(result > 0);
}

Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int			result;

	result = internal_text_pattern_compare(arg1, arg2);

	PG_FREE_IF_COPY(arg1, 0);
	PG_FREE_IF_COPY(arg2, 1);

	PG_RETURN_INT32(result);
}

Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
{
	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
	MemoryContext oldcontext;

	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);

	/* Use generic string SortSupport, forcing "C" collation */
	varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);

	MemoryContextSwitchTo(oldcontext);

	PG_RETURN_VOID();
}
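
/*
 * Design note (a sketch, not part of the original source): forcing
 * C_COLLATION_OID here makes the generic sort-support path use plain
 * byte-wise comparisons (and abbreviated keys), which is exactly the
 * ordering that bttext_pattern_cmp implements above.
 */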

/*-------------------------------------------------------------
 * byteaoctetlen
 *
 * get the number of bytes contained in an instance of type 'bytea'
 *-------------------------------------------------------------
 */
Datum
byteaoctetlen(PG_FUNCTION_ARGS)
{
	Datum		str = PG_GETARG_DATUM(0);

	/* We need not detoast the input at all */
	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
}
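
/*
 * Usage note (a sketch, not part of the original source): this backs the
 * SQL octet_length() function for bytea, e.g.
 *
 *	SELECT octet_length('\x001122'::bytea);		-- returns 3
 *
 * Because toast_raw_datum_size() reads only the stored size word, the
 * answer comes back without decompressing or fetching an out-of-line
 * value.
 */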

/*
 * byteacat -
 *	  takes two bytea* and returns a bytea* that is the concatenation of
 *	  the two.
 *
 * Cloned from textcat and modified as required.
 */
Datum
byteacat(PG_FUNCTION_ARGS)
{
	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);

	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
}

/*
 * bytea_catenate
 *	Guts of byteacat(), broken out so it can be used by other functions
 *
 * Arguments can be in short-header form, but not compressed or out-of-line
 */
static bytea *
bytea_catenate(bytea *t1, bytea *t2)
{
	bytea	   *result;
	int			len1,
				len2,
				len;
	char	   *ptr;

	len1 = VARSIZE_ANY_EXHDR(t1);
	len2 = VARSIZE_ANY_EXHDR(t2);

	/* paranoia ... probably should throw error instead? */
	if (len1 < 0)
		len1 = 0;
	if (len2 < 0)
		len2 = 0;

	len = len1 + len2 + VARHDRSZ;
	result = (bytea *) palloc(len);

	/* Set size of result string... */
	SET_VARSIZE(result, len);

	/* Fill data field of result string... */
	ptr = VARDATA(result);
	if (len1 > 0)
		memcpy(ptr, VARDATA_ANY(t1), len1);
	if (len2 > 0)
		memcpy(ptr + len1, VARDATA_ANY(t2), len2);

	return result;
}

#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))

/*
 * bytea_substr()
 * Return a substring starting at the specified position.
 * Cloned from text_substr and modified as required.
 *
 * Input:
 *	- string
 *	- starting position (is one-based)
 *	- string length (optional)
 *
 * If the starting position is zero or less, then return from the start of
 * the string, adjusting the length to be consistent with the "negative
 * start" per SQL.  If the length is less than zero, an ERROR is thrown.
 * If no third argument (length) is provided, the length to the end of the
 * string is assumed.
 */
Datum
bytea_substr(PG_FUNCTION_ARGS)
{
	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
									  PG_GETARG_INT32(1),
									  PG_GETARG_INT32(2),
									  false));
}

/*
 * bytea_substr_no_len -
 *	  Wrapper to avoid opr_sanity failure due to
 *	  one function accepting a different number of args.
 */
Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)
{
	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
									  PG_GETARG_INT32(1),
									  -1,
									  true));
}

static bytea *
bytea_substring(Datum str,
				int S,
				int L,
				bool length_not_specified)
{
	int			S1;				/* adjusted start position */
	int			L1;				/* adjusted substring length */

	S1 = Max(S, 1);

	if (length_not_specified)
	{
		/*
		 * Not passed a length - DatumGetByteaPSlice() grabs everything to
		 * the end of the string if we pass it a negative value for length.
		 */
		L1 = -1;
	}
	else
	{
		/* end position */
		int			E = S + L;

		/*
		 * A negative value for L is the only way for the end position to be
		 * before the start.  SQL99 says to throw an error.
		 */
		if (E < S)
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));

		/*
		 * A zero or negative value for the end position can happen if the
		 * start was negative or one.  SQL99 says to return a zero-length
		 * string.
		 */
		if (E < 1)
			return PG_STR_GET_BYTEA("");

		L1 = E - S1;
	}

	/*
	 * If the start position is past the end of the string, SQL99 says to
	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
	 * us.  Convert to zero-based starting position
	 */
	return DatumGetByteaPSlice(str, S1 - 1, L1);
}
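
/*
 * Worked example (a sketch, not part of the original source): for
 * substring('\x1122334455'::bytea from 0 for 3), S = 0 and L = 3, so
 * E = 3, S1 = 1, and L1 = E - S1 = 2; the result is '\x1122' -- the
 * "negative start" consumed one unit of the requested length, per SQL.
 */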

/*
 * byteaoverlay
 *	Replace specified substring of first string with second
 *
 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
 * This code is a direct implementation of what the standard says.
 */
Datum
byteaoverlay(PG_FUNCTION_ARGS)
{
	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
	int			sp = PG_GETARG_INT32(2);	/* substring start position */
	int			sl = PG_GETARG_INT32(3);	/* substring length */

	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
}

Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)
{
	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
	int			sp = PG_GETARG_INT32(2);	/* substring start position */
	int			sl;

	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
}

static bytea *
bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
{
	bytea	   *result;
	bytea	   *s1;
	bytea	   *s2;
	int			sp_pl_sl;

	/*
	 * Check for possible integer-overflow cases.  For negative sp, throw a
	 * "substring length" error because that's what should be expected
	 * according to the spec's definition of OVERLAY().
	 */
	if (sp <= 0)
		ereport(ERROR,
				(errcode(ERRCODE_SUBSTRING_ERROR),
				 errmsg("negative substring length not allowed")));
	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("integer out of range")));

	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
	result = bytea_catenate(s1, t2);
	result = bytea_catenate(result, s2);

	return result;
}
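
/*
 * Worked example (a sketch, not part of the original source): the
 * construction above mirrors the standard's rewrite
 *
 *	overlay(t1 placing t2 from sp for sl)
 *	  = substring(t1 for sp - 1) || t2 || substring(t1 from sp + sl)
 *
 * so overlay('\x1122334455'::bytea placing '\xaabb'::bytea from 2 for 3)
 * yields '\x11aabb55'.
 */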

/*
 * byteapos -
 *	  Return the position of the specified substring.
 *	  Implements the SQL POSITION() function.
 * Cloned from textpos and modified as required.
 */
Datum
byteapos(PG_FUNCTION_ARGS)
{
	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
	int			pos;
	int			px,
				p;
	int			len1,
				len2;
	char	   *p1,
			   *p2;

	len1 = VARSIZE_ANY_EXHDR(t1);
	len2 = VARSIZE_ANY_EXHDR(t2);

	if (len2 <= 0)
		PG_RETURN_INT32(1);		/* result for empty pattern */

	p1 = VARDATA_ANY(t1);
	p2 = VARDATA_ANY(t2);

	pos = 0;
	px = (len1 - len2);
	for (p = 0; p <= px; p++)
	{
		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
		{
			pos = p + 1;
			break;
		}
		p1++;
	}

	PG_RETURN_INT32(pos);
}
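
/*
 * Usage note (a sketch, not part of the original source): this is a naive
 * scan -- at each offset a first-byte check guards a full memcmp() -- so
 *
 *	SELECT position('\x3344'::bytea in '\x1122334455'::bytea);	-- returns 3
 *
 * A pattern that never matches returns 0, while an empty pattern returns 1
 * per the early exit above.
 */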

/*-------------------------------------------------------------
 * byteaGetByte
 *
 * this routine treats "bytea" as an array of bytes.
 * It returns the Nth byte (a number between 0 and 255).
 *-------------------------------------------------------------
 */
Datum
byteaGetByte(PG_FUNCTION_ARGS)
{
	bytea	   *v = PG_GETARG_BYTEA_PP(0);
	int32		n = PG_GETARG_INT32(1);
	int			len;
	int			byte;

	len = VARSIZE_ANY_EXHDR(v);

	if (n < 0 || n >= len)
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("index %d out of valid range, 0..%d",
						n, len - 1)));

	byte = ((unsigned char *) VARDATA_ANY(v))[n];

	PG_RETURN_INT32(byte);
}

/*-------------------------------------------------------------
 * byteaGetBit
 *
 * This routine treats a "bytea" type like an array of bits.
 * It returns the value of the Nth bit (0 or 1).
 *
 *-------------------------------------------------------------
 */
Datum
byteaGetBit(PG_FUNCTION_ARGS)
{
	bytea	   *v = PG_GETARG_BYTEA_PP(0);
	int32		n = PG_GETARG_INT32(1);
	int			byteNo,
				bitNo;
	int			len;
	int			byte;

	len = VARSIZE_ANY_EXHDR(v);

	if (n < 0 || n >= len * 8)
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("index %d out of valid range, 0..%d",
						n, len * 8 - 1)));

	byteNo = n / 8;
	bitNo = n % 8;

	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];

	if (byte & (1 << bitNo))
		PG_RETURN_INT32(1);
	else
		PG_RETURN_INT32(0);
}
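
/*
 * Usage note (a sketch, not part of the original source): bits are
 * numbered least-significant-first within each byte, so for '\x01'::bytea,
 * get_bit(..., 0) returns 1 while get_bit(..., 7) returns 0.
 */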

/*-------------------------------------------------------------
 * byteaSetByte
 *
 * Given an instance of type 'bytea' creates a new one with
 * the Nth byte set to the given value.
 *
 *-------------------------------------------------------------
 */
Datum
byteaSetByte(PG_FUNCTION_ARGS)
{
	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
	int32		n = PG_GETARG_INT32(1);
	int32		newByte = PG_GETARG_INT32(2);
	int			len;

	len = VARSIZE(res) - VARHDRSZ;

	if (n < 0 || n >= len)
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("index %d out of valid range, 0..%d",
						n, len - 1)));

	/*
	 * Now set the byte.
	 */
	((unsigned char *) VARDATA(res))[n] = newByte;

	PG_RETURN_BYTEA_P(res);
}

/*-------------------------------------------------------------
 * byteaSetBit
 *
 * Given an instance of type 'bytea' creates a new one with
 * the Nth bit set to the given value.
 *
 *-------------------------------------------------------------
 */
Datum
byteaSetBit(PG_FUNCTION_ARGS)
{
	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
	int32		n = PG_GETARG_INT32(1);
	int32		newBit = PG_GETARG_INT32(2);
	int			len;
	int			oldByte,
				newByte;
	int			byteNo,
				bitNo;

	len = VARSIZE(res) - VARHDRSZ;

	if (n < 0 || n >= len * 8)
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("index %d out of valid range, 0..%d",
						n, len * 8 - 1)));

	byteNo = n / 8;
	bitNo = n % 8;

	/*
	 * sanity check!
	 */
	if (newBit != 0 && newBit != 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("new bit must be 0 or 1")));

	/*
	 * Update the byte.
	 */
	oldByte = ((unsigned char *) VARDATA(res))[byteNo];

	if (newBit == 0)
		newByte = oldByte & (~(1 << bitNo));
	else
		newByte = oldByte | (1 << bitNo);

	((unsigned char *) VARDATA(res))[byteNo] = newByte;

	PG_RETURN_BYTEA_P(res);
}
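
/*
 * Usage note (a sketch, not part of the original source): the same bit
 * numbering applies on the write side, e.g. set_bit('\x00'::bytea, 0, 1)
 * yields '\x01'.  Both setters operate on a copy (PG_GETARG_BYTEA_P_COPY),
 * so the input datum is never scribbled on.
 */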

/* text_name()
 * Converts a text type to a Name type.
 */
Datum
text_name(PG_FUNCTION_ARGS)
{
	text	   *s = PG_GETARG_TEXT_PP(0);
	Name		result;
	int			len;

	len = VARSIZE_ANY_EXHDR(s);

	/* Truncate oversize input */
	if (len >= NAMEDATALEN)
		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);

	/* We use palloc0 here to ensure result is zero-padded */
	result = (Name) palloc0(NAMEDATALEN);
	memcpy(NameStr(*result), VARDATA_ANY(s), len);

	PG_RETURN_NAME(result);
}

/* name_text()
 * Converts a Name type to a text type.
 */
Datum
name_text(PG_FUNCTION_ARGS)
{
	Name		s = PG_GETARG_NAME(0);

	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
}

/*
 * textToQualifiedNameList - convert a text object to list of names
 *
 * This implements the input parsing needed by nextval() and other
 * functions that take a text parameter representing a qualified name.
 * We split the name at dots, downcase if not double-quoted, and
 * truncate names if they're too long.
 */
List *
textToQualifiedNameList(text *textval)
{
	char	   *rawname;
	List	   *result = NIL;
	List	   *namelist;
	ListCell   *l;

	/* Convert to C string (handles possible detoasting). */
	/* Note we rely on being able to modify rawname below. */
	rawname = text_to_cstring(textval);

	if (!SplitIdentifierString(rawname, '.', &namelist))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_NAME),
				 errmsg("invalid name syntax")));

	if (namelist == NIL)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_NAME),
				 errmsg("invalid name syntax")));

	foreach(l, namelist)
	{
		char	   *curname = (char *) lfirst(l);

		result = lappend(result, makeString(pstrdup(curname)));
	}

	pfree(rawname);
	list_free(namelist);

	return result;
}
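
/*
 * Worked example (a sketch, not part of the original source): the input
 * 'MySchema."My.Seq"' splits at the unquoted dot into two names,
 * "myschema" (downcased, since it was unquoted) and "My.Seq" (kept
 * verbatim, since it was double-quoted -- the embedded dot does not
 * split).
 */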

/*
 * SplitIdentifierString --- parse a string containing identifiers
 *
 * This is the guts of textToQualifiedNameList, and is exported for use in
 * other situations such as parsing GUC variables.  In the GUC case, it's
 * important to avoid memory leaks, so the API is designed to minimize the
 * amount of stuff that needs to be allocated and freed.
 *
 * Inputs:
 *	rawstring: the input string; must be overwritable!  On return, it's
 *			   been modified to contain the separated identifiers.
 *	separator: the separator punctuation expected between identifiers
 *			   (typically '.' or ',').  Whitespace may also appear around
 *			   identifiers.
 * Outputs:
 *	namelist: filled with a palloc'd list of pointers to identifiers within
 *			  rawstring.  Caller should list_free() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 *
 * Note that an empty string is considered okay here, though not in
 * textToQualifiedNameList.
 */
bool
SplitIdentifierString(char *rawstring, char separator,
					  List **namelist)
{
	char	   *nextp = rawstring;
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;
		char	   *endp;

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs, no downcasing */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				memmove(endp, endp + 1, strlen(endp));
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			char	   *downname;
			int			len;

			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */

			/*
			 * Downcase the identifier, using same code as main lexer does.
			 *
			 * XXX because we want to overwrite the input in-place, we cannot
			 * support a downcasing transformation that increases the string
			 * length.  This is not a problem given the current implementation
			 * of downcase_truncate_identifier, but we'll probably have to do
			 * something about this someday.
			 */
			len = endp - curname;
			downname = downcase_truncate_identifier(curname, len, false);
			Assert(strlen(downname) <= len);
			strncpy(curname, downname, len);	/* strncpy is required here */
			pfree(downname);
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		*endp = '\0';

		/* Truncate name if it's overlength */
		truncate_identifier(curname, strlen(curname), false);

		/*
		 * Finished isolating current name --- add it to list
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
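
/*
 * Caller sketch (an illustration, not part of the original source): a
 * typical GUC check hook splits a comma-separated list like this -- note
 * the pstrdup(), since the function scribbles on its input, and the
 * list_free() that is due even when parsing fails:
 *
 *	char	   *rawstring = pstrdup(newval);
 *	List	   *elemlist;
 *
 *	if (!SplitIdentifierString(rawstring, ',', &elemlist))
 *	{
 *		pfree(rawstring);		-- syntax error in list
 *		list_free(elemlist);
 *		return false;
 *	}
 */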

/*
 * SplitDirectoriesString --- parse a string containing file/directory names
 *
 * This works fine on file names too; the function name is historical.
 *
 * This is similar to SplitIdentifierString, except that the parsing
 * rules are meant to handle pathnames instead of identifiers: there is
 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
 * and we apply canonicalize_path() to each extracted string.  Because of the
 * last, the returned strings are separately palloc'd rather than being
 * pointers into rawstring --- but we still scribble on rawstring.
 *
 * Inputs:
 *	rawstring: the input string; must be modifiable!
 *	separator: the separator punctuation expected between directories
 *			   (typically ',' or ';').  Whitespace may also appear around
 *			   directories.
 * Outputs:
 *	namelist: filled with a palloc'd list of directory names.
 *			  Caller should list_free_deep() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 *
 * Note that an empty string is considered okay here.
 */
bool
SplitDirectoriesString(char *rawstring, char separator,
					   List **namelist)
{
	char	   *nextp = rawstring;
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new directory. */
	do
	{
		char	   *curname;
		char	   *endp;

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				memmove(endp, endp + 1, strlen(endp));
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or end of string */
			curname = endp = nextp;
			while (*nextp && *nextp != separator)
			{
				/* trailing whitespace should not be included in name */
				if (!scanner_isspace(*nextp))
					endp = nextp + 1;
				nextp++;
			}
			if (curname == endp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		*endp = '\0';

		/* Truncate path if it's overlength */
		if (strlen(curname) >= MAXPGPATH)
			curname[MAXPGPATH - 1] = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 */
		curname = pstrdup(curname);
		canonicalize_path(curname);
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}

/*
 * SplitGUCList --- parse a string containing identifiers or file names
 *
 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
 * presuming whether the elements will be taken as identifiers or file names.
 * We assume the input has already been through flatten_set_variable_args(),
 * so that we need never downcase (if appropriate, that was done already).
 * Nor do we ever truncate, since we don't know the correct max length.
 * We disallow embedded whitespace for simplicity (it shouldn't matter,
 * because any embedded whitespace should have led to double-quoting).
 * Otherwise the API is identical to SplitIdentifierString.
 *
 * XXX it's annoying to have so many copies of this string-splitting logic.
 * However, it's not clear that having one function with a bunch of option
 * flags would be much better.
 *
 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
 * Be sure to update that if you have to change this.
 *
 * Inputs:
 *	rawstring: the input string; must be overwritable!  On return, it's
 *			   been modified to contain the separated identifiers.
 *	separator: the separator punctuation expected between identifiers
 *			   (typically '.' or ',').  Whitespace may also appear around
 *			   identifiers.
 * Outputs:
 *	namelist: filled with a palloc'd list of pointers to identifiers within
 *			  rawstring.  Caller should list_free() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 */
bool
SplitGUCList(char *rawstring, char separator,
			 List **namelist)
{
	char	   *nextp = rawstring;
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;
		char	   *endp;

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				memmove(endp, endp + 1, strlen(endp));
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		*endp = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}

/*****************************************************************************
 *	Comparison Functions used for bytea
 *
 * Note: btree indexes need these routines not to leak memory; therefore,
 * be careful to free working copies of toasted datums.  Most places don't
 * need to be so careful.
 *****************************************************************************/

Datum
byteaeq(PG_FUNCTION_ARGS)
{
	Datum		arg1 = PG_GETARG_DATUM(0);
	Datum		arg2 = PG_GETARG_DATUM(1);
	bool		result;
	Size		len1,
				len2;

	/*
	 * We can use a fast path for unequal lengths, which might save us from
	 * having to detoast one or both values.
	 */
	len1 = toast_raw_datum_size(arg1);
	len2 = toast_raw_datum_size(arg2);
	if (len1 != len2)
		result = false;
	else
	{
		bytea	   *barg1 = DatumGetByteaPP(arg1);
		bytea	   *barg2 = DatumGetByteaPP(arg2);

		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
						 len1 - VARHDRSZ) == 0);

		PG_FREE_IF_COPY(barg1, 0);
		PG_FREE_IF_COPY(barg2, 1);
	}

	PG_RETURN_BOOL(result);
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
byteane(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2011-01-18 20:09:22 +01:00
|
|
|
Datum arg1 = PG_GETARG_DATUM(0);
|
|
|
|
Datum arg2 = PG_GETARG_DATUM(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
bool result;
|
2011-01-18 20:09:22 +01:00
|
|
|
Size len1,
|
|
|
|
len2;
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2011-01-18 20:09:22 +01:00
|
|
|
/*
|
|
|
|
* We can use a fast path for unequal lengths, which might save us from
|
|
|
|
* having to detoast one or both values.
|
|
|
|
*/
|
|
|
|
len1 = toast_raw_datum_size(arg1);
|
|
|
|
len2 = toast_raw_datum_size(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
if (len1 != len2)
|
|
|
|
result = true;
|
|
|
|
else
|
2011-01-18 20:09:22 +01:00
|
|
|
{
|
|
|
|
bytea *barg1 = DatumGetByteaPP(arg1);
|
|
|
|
bytea *barg2 = DatumGetByteaPP(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2011-01-18 20:09:22 +01:00
|
|
|
result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
|
|
|
|
len1 - VARHDRSZ) != 0);
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(barg1, 0);
|
|
|
|
PG_FREE_IF_COPY(barg2, 1);
|
|
|
|
}
|
2001-08-13 20:45:36 +02:00
|
|
|
|
|
|
|
PG_RETURN_BOOL(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
bytealt(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
|
|
|
|
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
int cmp;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
|
2001-08-13 20:45:36 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
byteale(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
|
|
|
|
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
int cmp;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
|
2001-08-13 20:45:36 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
byteagt(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
|
|
|
|
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
int cmp;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
|
2001-08-13 20:45:36 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
byteage(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
|
|
|
|
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
int cmp;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
|
2001-08-13 20:45:36 +02:00
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
|
|
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
|
|
byteacmp(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-04-06 06:21:44 +02:00
|
|
|
bytea *arg1 = PG_GETARG_BYTEA_PP(0);
|
|
|
|
bytea *arg2 = PG_GETARG_BYTEA_PP(1);
|
2001-08-13 20:45:36 +02:00
|
|
|
int len1,
|
|
|
|
len2;
|
|
|
|
int cmp;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
len1 = VARSIZE_ANY_EXHDR(arg1);
|
|
|
|
len2 = VARSIZE_ANY_EXHDR(arg2);
|
2001-08-13 20:45:36 +02:00
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
|
2001-08-13 20:45:36 +02:00
|
|
|
if ((cmp == 0) && (len1 != len2))
|
|
|
|
cmp = (len1 < len2) ? -1 : 1;
|
|
|
|
|
|
|
|
PG_FREE_IF_COPY(arg1, 0);
|
|
|
|
PG_FREE_IF_COPY(arg2, 1);
|
|
|
|
|
|
|
|
PG_RETURN_INT32(cmp);
|
|
|
|
}
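
/*
 * A worked example of the comparison rule (illustrative sketch): bytes are
 * compared as unsigned chars over the common prefix, with the shorter value
 * winning ties, so
 *
 *		'\x0102' < '\x010203' < '\x02'
 *
 * memcmp() on the shared prefix decides the second pair; equal prefixes,
 * as in the first pair, fall through to the length comparison.
 */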

Datum
bytea_sortsupport(PG_FUNCTION_ARGS)
{
	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
	MemoryContext oldcontext;

	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);

	/* Use generic string SortSupport, forcing "C" collation */
	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);

	MemoryContextSwitchTo(oldcontext);

	PG_RETURN_VOID();
}
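
/*
 * Forcing the "C" collation above should give plain memcmp ordering,
 * consistent with byteacmp(), while still letting varstr_sortsupport
 * provide its abbreviated-key optimizations for sorting.
 */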

/*
 * appendStringInfoText
 *
 * Append a text to str.
 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
 */
static void
appendStringInfoText(StringInfo str, const text *t)
{
	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
}

/*
 * replace_text
 * replace all occurrences of 'old_sub_str' in 'orig_str'
 * with 'new_sub_str' to form 'new_str'
 *
 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
 * otherwise returns 'new_str'
 */
Datum
replace_text(PG_FUNCTION_ARGS)
{
	text	   *src_text = PG_GETARG_TEXT_PP(0);
	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
	int			src_text_len;
	int			from_sub_text_len;
	TextPositionState state;
	text	   *ret_text;
	int			chunk_len;
	char	   *curr_ptr;
	char	   *start_ptr;
	StringInfoData str;
	bool		found;

	src_text_len = VARSIZE_ANY_EXHDR(src_text);
	from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);

	/* Return unmodified source string if empty source or pattern */
	if (src_text_len < 1 || from_sub_text_len < 1)
	{
		PG_RETURN_TEXT_P(src_text);
	}

	text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);

	found = text_position_next(&state);

	/* When the from_sub_text is not found, there is nothing to do. */
	if (!found)
	{
		text_position_cleanup(&state);
		PG_RETURN_TEXT_P(src_text);
	}
	curr_ptr = text_position_get_match_ptr(&state);
	start_ptr = VARDATA_ANY(src_text);

	initStringInfo(&str);

	do
	{
		CHECK_FOR_INTERRUPTS();

		/* copy the data skipped over by last text_position_next() */
		chunk_len = curr_ptr - start_ptr;
		appendBinaryStringInfo(&str, start_ptr, chunk_len);

		appendStringInfoText(&str, to_sub_text);

		start_ptr = curr_ptr + from_sub_text_len;

		found = text_position_next(&state);
		if (found)
			curr_ptr = text_position_get_match_ptr(&state);
	}
	while (found);

	/* copy trailing data */
	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
	appendBinaryStringInfo(&str, start_ptr, chunk_len);

	text_position_cleanup(&state);

	ret_text = cstring_to_text_with_len(str.data, str.len);
	pfree(str.data);

	PG_RETURN_TEXT_P(ret_text);
}
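
/*
 * At the SQL level this is the replace() function; for example
 * (illustrative sketch):
 *
 *		replace('abcdefabcdef', 'cd', 'XX')  =>  'abXXefabXXef'
 *
 * Note that the scan resumes after each match, so inserted replacement
 * text is never itself rescanned for further matches.
 */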

/*
 * check_replace_text_has_escape_char
 *
 * check whether replace_text contains escape char.
 */
static bool
check_replace_text_has_escape_char(const text *replace_text)
{
	const char *p = VARDATA_ANY(replace_text);
	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);

	if (pg_database_encoding_max_length() == 1)
	{
		for (; p < p_end; p++)
		{
			if (*p == '\\')
				return true;
		}
	}
	else
	{
		for (; p < p_end; p += pg_mblen(p))
		{
			if (*p == '\\')
				return true;
		}
	}

	return false;
}
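
/*
 * Stepping with pg_mblen() in the multibyte case means only the first byte
 * of each character is ever tested, so a byte equal to '\\' embedded in a
 * multibyte sequence should never be mistaken for an escape character.
 */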

/*
 * appendStringInfoRegexpSubstr
 *
 * Append replace_text to str, substituting regexp back references for
 * \n escapes.  start_ptr is the start of the match in the source string,
 * at logical character position data_pos.
 */
static void
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
							 regmatch_t *pmatch,
							 char *start_ptr, int data_pos)
{
	const char *p = VARDATA_ANY(replace_text);
	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
	int			eml = pg_database_encoding_max_length();

	for (;;)
	{
		const char *chunk_start = p;
		int			so;
		int			eo;

		/* Find next escape char. */
		if (eml == 1)
		{
			for (; p < p_end && *p != '\\'; p++)
				 /* nothing */ ;
		}
		else
		{
			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
				 /* nothing */ ;
		}

		/* Copy the text we just scanned over, if any. */
		if (p > chunk_start)
			appendBinaryStringInfo(str, chunk_start, p - chunk_start);

		/* Done if at end of string, else advance over escape char. */
		if (p >= p_end)
			break;
		p++;

		if (p >= p_end)
		{
			/* Escape at very end of input.  Treat same as unexpected char */
			appendStringInfoChar(str, '\\');
			break;
		}

		if (*p >= '1' && *p <= '9')
		{
			/* Use the back reference of regexp. */
			int			idx = *p - '0';

			so = pmatch[idx].rm_so;
			eo = pmatch[idx].rm_eo;
			p++;
		}
		else if (*p == '&')
		{
			/* Use the entire matched string. */
			so = pmatch[0].rm_so;
			eo = pmatch[0].rm_eo;
			p++;
		}
		else if (*p == '\\')
		{
			/* \\ means transfer one \ to output. */
			appendStringInfoChar(str, '\\');
			p++;
			continue;
		}
		else
		{
			/*
			 * If escape char is not followed by any expected char, just treat
			 * it as ordinary data to copy.  (XXX would it be better to throw
			 * an error?)
			 */
			appendStringInfoChar(str, '\\');
			continue;
		}

		if (so != -1 && eo != -1)
		{
			/*
			 * Copy the text that is back reference of regexp.  Note so and eo
			 * are counted in characters not bytes.
			 */
			char	   *chunk_start;
			int			chunk_len;

			Assert(so >= data_pos);
			chunk_start = start_ptr;
			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
			appendBinaryStringInfo(str, chunk_start, chunk_len);
		}
	}
}
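
/*
 * Escape handling above, in brief: \1 through \9 insert the corresponding
 * capture group, & inserts the whole match, \\ emits a literal backslash,
 * and any other escaped character is copied through unchanged.  For example
 * (illustrative sketch):
 *
 *		regexp_replace('foobar', '(o+)', '[\1]')  =>  'f[oo]bar'
 */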

#define REGEXP_REPLACE_BACKREF_CNT		10

/*
 * replace_text_regexp
 *
 * replace text that matches to regexp in src_text to replace_text.
 *
 * Note: to avoid having to include regex.h in builtins.h, we declare
 * the regexp argument as void *, but really it's regex_t *.
 */
text *
replace_text_regexp(text *src_text, void *regexp,
					text *replace_text, bool glob)
{
	text	   *ret_text;
	regex_t    *re = (regex_t *) regexp;
	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
	StringInfoData buf;
	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
	pg_wchar   *data;
	size_t		data_len;
	int			search_start;
	int			data_pos;
	char	   *start_ptr;
	bool		have_escape;

	initStringInfo(&buf);

	/* Convert data string to wide characters. */
	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);

	/* Check whether replace_text has escape char. */
	have_escape = check_replace_text_has_escape_char(replace_text);

	/* start_ptr points to the data_pos'th character of src_text */
	start_ptr = (char *) VARDATA_ANY(src_text);
	data_pos = 0;

	search_start = 0;
	while (search_start <= data_len)
	{
		int			regexec_result;

		CHECK_FOR_INTERRUPTS();

		regexec_result = pg_regexec(re,
									data,
									data_len,
									search_start,
									NULL,	/* no details */
									REGEXP_REPLACE_BACKREF_CNT,
									pmatch,
									0);

		if (regexec_result == REG_NOMATCH)
			break;

		if (regexec_result != REG_OKAY)
		{
			char		errMsg[100];

			CHECK_FOR_INTERRUPTS();
			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
					 errmsg("regular expression failed: %s", errMsg)));
		}

		/*
		 * Copy the text to the left of the match position.  Note we are given
		 * character not byte indexes.
		 */
		if (pmatch[0].rm_so - data_pos > 0)
		{
			int			chunk_len;

			chunk_len = charlen_to_bytelen(start_ptr,
										   pmatch[0].rm_so - data_pos);
			appendBinaryStringInfo(&buf, start_ptr, chunk_len);

			/*
			 * Advance start_ptr over that text, to avoid multiple rescans of
			 * it if the replace_text contains multiple back-references.
			 */
			start_ptr += chunk_len;
			data_pos = pmatch[0].rm_so;
		}

		/*
		 * Copy the replace_text.  Process back references when the
		 * replace_text has escape characters.
		 */
		if (have_escape)
			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
										 start_ptr, data_pos);
		else
			appendStringInfoText(&buf, replace_text);

		/* Advance start_ptr and data_pos over the matched text. */
		start_ptr += charlen_to_bytelen(start_ptr,
										pmatch[0].rm_eo - data_pos);
		data_pos = pmatch[0].rm_eo;

		/*
		 * When global option is off, replace the first instance only.
		 */
		if (!glob)
			break;

		/*
		 * Advance search position.  Normally we start the next search at the
		 * end of the previous match; but if the match was of zero length, we
		 * have to advance by one character, or we'd just find the same match
		 * again.
		 */
		search_start = data_pos;
		if (pmatch[0].rm_so == pmatch[0].rm_eo)
			search_start++;
	}

	/*
	 * Copy the text to the right of the last match.
	 */
	if (data_pos < data_len)
	{
		int			chunk_len;

		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
	}

	ret_text = cstring_to_text_with_len(buf.data, buf.len);
	pfree(buf.data);
	pfree(data);

	return ret_text;
}
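
/*
 * Usage sketch: this is the engine behind regexp_replace().  With glob =
 * false only the first match is replaced; with glob = true (the 'g' flag)
 * all matches are, e.g.
 *
 *		regexp_replace('foobarbaz', 'b..', 'X')       =>  'fooXbaz'
 *		regexp_replace('foobarbaz', 'b..', 'X', 'g')  =>  'fooXX'
 */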
|
|
|
|
|
2002-08-22 05:24:01 +02:00
|
|
|
/*
|
|
|
|
* split_text
|
|
|
|
* parse input string
|
|
|
|
* return ord item (1 based)
|
|
|
|
* based on provided field separator
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
split_text(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2007-09-22 02:36:38 +02:00
|
|
|
text *inputstring = PG_GETARG_TEXT_PP(0);
|
|
|
|
text *fldsep = PG_GETARG_TEXT_PP(1);
|
2002-08-22 05:24:01 +02:00
|
|
|
int fldnum = PG_GETARG_INT32(2);
|
2007-07-19 22:34:20 +02:00
|
|
|
int inputstring_len;
|
|
|
|
int fldsep_len;
|
2006-10-07 02:11:53 +02:00
|
|
|
TextPositionState state;
|
Use single-byte Boyer-Moore-Horspool search even with multibyte encodings.
The old implementation first converted the input strings to arrays of
wchars, and performed the conversion on those. However, the conversion is
expensive, and for a large input string, consumes a lot of memory.
Allocating the large arrays also meant that these functions could not be
used on strings larger 1 GB / pg_encoding_max_length() (256 MB for UTF-8).
Avoid the conversion, and instead use the single-byte algorithm even with
multibyte encodings. That can get fooled, if there is a matching byte
sequence in the middle of a multi-byte character, so to eliminate false
positives like that, we verify any matches by walking the string character
by character with pg_mblen(). Also, if the caller needs the position of
the match, as a character-offset, we also need to walk the string to count
the characters.
Performance testing shows that walking the whole string with pg_mblen() is
somewhat slower than converting the whole string to wchars. It's still
often a win, though, because we don't need to do it if there is no match,
and even when there is, we only need to walk up to the point where the
match is, not the whole string. Even in the worst case, there would be
room for optimization: Much of the CPU time in the current loop with
pg_mblen() is function call overhead, and could be improved by inlining
pg_mblen() and/or the encoding-specific mblen() functions. But I didn't
attempt to do that as part of this patch.
Most of the callers of text_position_setup/next functions were actually
not interested in the position of the match, counted in characters. To
cater for them, refactor the text_position_next() interface into two
parts: searching for the next match (text_position_next()), and returning
the current match's position as a pointer (text_position_get_match_ptr())
or as a character offset (text_position_get_match_pos()). Getting the
pointer to the match is a more convenient API for many callers, and with
UTF-8, it allows skipping the character-walking step altogether, because
UTF-8 can't have false matches even when treated like raw byte strings.
Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/3173d989-bc1c-fc8a-3b69-f24246f73876%40iki.fi
2019-01-25 15:25:05 +01:00
|
|
|
char *start_ptr;
|
|
|
|
char *end_ptr;
|
2002-09-04 22:31:48 +02:00
|
|
|
text *result_text;
|
Use single-byte Boyer-Moore-Horspool search even with multibyte encodings.
The old implementation first converted the input strings to arrays of
wchars, and performed the conversion on those. However, the conversion is
expensive, and for a large input string, consumes a lot of memory.
Allocating the large arrays also meant that these functions could not be
used on strings larger 1 GB / pg_encoding_max_length() (256 MB for UTF-8).
Avoid the conversion, and instead use the single-byte algorithm even with
multibyte encodings. That can get fooled, if there is a matching byte
sequence in the middle of a multi-byte character, so to eliminate false
positives like that, we verify any matches by walking the string character
by character with pg_mblen(). Also, if the caller needs the position of
the match, as a character-offset, we also need to walk the string to count
the characters.
Performance testing shows that walking the whole string with pg_mblen() is
somewhat slower than converting the whole string to wchars. It's still
often a win, though, because we don't need to do it if there is no match,
and even when there is, we only need to walk up to the point where the
match is, not the whole string. Even in the worst case, there would be
room for optimization: Much of the CPU time in the current loop with
pg_mblen() is function call overhead, and could be improved by inlining
pg_mblen() and/or the encoding-specific mblen() functions. But I didn't
attempt to do that as part of this patch.
Most of the callers of text_position_setup/next functions were actually
not interested in the position of the match, counted in characters. To
cater for them, refactor the text_position_next() interface into two
parts: searching for the next match (text_position_next()), and returning
the current match's position as a pointer (text_position_get_match_ptr())
or as a character offset (text_position_get_match_pos()). Getting the
pointer to the match is a more convenient API for many callers, and with
UTF-8, it allows skipping the character-walking step altogether, because
UTF-8 can't have false matches even when treated like raw byte strings.
Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/3173d989-bc1c-fc8a-3b69-f24246f73876%40iki.fi
2019-01-25 15:25:05 +01:00
|
|
|
bool found;
|
2002-08-22 05:24:01 +02:00
|
|
|
|
2004-01-31 01:45:21 +01:00
|
|
|
/* field number is 1 based */
|
|
|
|
if (fldnum < 1)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
errmsg("field position must be greater than zero")));
|
|
|
|
|
Use single-byte Boyer-Moore-Horspool search even with multibyte encodings.
The old implementation first converted the input strings to arrays of
wchars, and performed the conversion on those. However, the conversion is
expensive, and for a large input string, consumes a lot of memory.
Allocating the large arrays also meant that these functions could not be
used on strings larger 1 GB / pg_encoding_max_length() (256 MB for UTF-8).
Avoid the conversion, and instead use the single-byte algorithm even with
multibyte encodings. That can get fooled, if there is a matching byte
sequence in the middle of a multi-byte character, so to eliminate false
positives like that, we verify any matches by walking the string character
by character with pg_mblen(). Also, if the caller needs the position of
the match, as a character-offset, we also need to walk the string to count
the characters.
Performance testing shows that walking the whole string with pg_mblen() is
somewhat slower than converting the whole string to wchars. It's still
often a win, though, because we don't need to do it if there is no match,
and even when there is, we only need to walk up to the point where the
match is, not the whole string. Even in the worst case, there would be
room for optimization: Much of the CPU time in the current loop with
pg_mblen() is function call overhead, and could be improved by inlining
pg_mblen() and/or the encoding-specific mblen() functions. But I didn't
attempt to do that as part of this patch.
Most of the callers of text_position_setup/next functions were actually
not interested in the position of the match, counted in characters. To
cater for them, refactor the text_position_next() interface into two
parts: searching for the next match (text_position_next()), and returning
the current match's position as a pointer (text_position_get_match_ptr())
or as a character offset (text_position_get_match_pos()). Getting the
pointer to the match is a more convenient API for many callers, and with
UTF-8, it allows skipping the character-walking step altogether, because
UTF-8 can't have false matches even when treated like raw byte strings.
Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/3173d989-bc1c-fc8a-3b69-f24246f73876%40iki.fi
2019-01-25 15:25:05 +01:00
|
|
|
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
|
|
|
|
fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
|
2007-07-19 22:34:20 +02:00
|
|
|
|
2002-08-22 05:24:01 +02:00
|
|
|
/* return empty string for empty input string */
|
|
|
|
if (inputstring_len < 1)
|
2008-03-25 23:42:46 +01:00
|
|
|
PG_RETURN_TEXT_P(cstring_to_text(""));
|
2002-08-22 05:24:01 +02:00
|
|
|
|
|
|
|
/* empty field separator */
|
|
|
|
if (fldsep_len < 1)
|
|
|
|
{
|
2007-07-19 22:34:20 +02:00
|
|
|
text_position_cleanup(&state);
|
2004-01-31 01:45:21 +01:00
|
|
|
/* if first field, return input string, else empty string */
|
|
|
|
if (fldnum == 1)
|
2002-08-22 05:24:01 +02:00
|
|
|
PG_RETURN_TEXT_P(inputstring);
|
2003-08-04 02:43:34 +02:00
|
|
|
else
|
2008-03-25 23:42:46 +01:00
|
|
|
PG_RETURN_TEXT_P(cstring_to_text(""));
|
2002-08-22 05:24:01 +02:00
|
|
|
}
|
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
|
2019-01-25 15:25:05 +01:00
|
|
|
|
2006-10-07 02:11:53 +02:00
|
|
|
/* identify bounds of first field */
|
2019-01-25 15:25:05 +01:00
|
|
|
start_ptr = VARDATA_ANY(inputstring);
|
|
|
|
found = text_position_next(&state);
|
2002-08-22 05:24:01 +02:00
|
|
|
|
2006-10-07 02:11:53 +02:00
|
|
|
/* special case if fldsep not found at all */
|
2019-01-25 15:25:05 +01:00
|
|
|
if (!found)
|
2002-08-22 05:24:01 +02:00
|
|
|
{
|
2006-10-07 02:11:53 +02:00
|
|
|
text_position_cleanup(&state);
|
|
|
|
/* if field 1 requested, return input string, else empty string */
|
2004-01-31 01:45:21 +01:00
|
|
|
if (fldnum == 1)
|
2002-08-22 05:24:01 +02:00
|
|
|
PG_RETURN_TEXT_P(inputstring);
|
2003-08-04 02:43:34 +02:00
|
|
|
else
|
2008-03-25 23:42:46 +01:00
|
|
|
PG_RETURN_TEXT_P(cstring_to_text(""));
|
2002-08-22 05:24:01 +02:00
|
|
|
}
|
2019-01-25 15:25:05 +01:00
|
|
|
end_ptr = text_position_get_match_ptr(&state);
|
2006-10-07 02:11:53 +02:00
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
while (found && --fldnum > 0)
|
2002-08-22 05:24:01 +02:00
|
|
|
{
|
2006-10-07 02:11:53 +02:00
|
|
|
/* identify bounds of next field */
|
2019-01-25 15:25:05 +01:00
|
|
|
start_ptr = end_ptr + fldsep_len;
|
|
|
|
found = text_position_next(&state);
|
|
|
|
if (found)
|
|
|
|
end_ptr = text_position_get_match_ptr(&state);
|
2002-08-22 05:24:01 +02:00
|
|
|
}
|
2006-10-07 02:11:53 +02:00
|
|
|
|
|
|
|
text_position_cleanup(&state);
|
|
|
|
|
|
|
|
if (fldnum > 0)
|
2002-08-22 05:24:01 +02:00
|
|
|
{
|
2006-10-07 02:11:53 +02:00
|
|
|
/* N'th field separator not found */
|
|
|
|
/* if last field requested, return it, else empty string */
|
|
|
|
if (fldnum == 1)
|
2019-01-25 15:25:05 +01:00
|
|
|
{
|
|
|
|
int last_len = start_ptr - VARDATA_ANY(inputstring);
|
|
|
|
|
|
|
|
result_text = cstring_to_text_with_len(start_ptr,
|
|
|
|
inputstring_len - last_len);
|
|
|
|
}
|
2006-10-07 02:11:53 +02:00
|
|
|
else
|
2008-03-25 23:42:46 +01:00
|
|
|
result_text = cstring_to_text("");
|
2002-08-22 05:24:01 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2006-10-07 02:11:53 +02:00
|
|
|
/* non-last field requested */
|
2019-01-25 15:25:05 +01:00
|
|
|
result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
|
2002-08-22 05:24:01 +02:00
|
|
|
}
|
2006-10-07 02:11:53 +02:00
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(result_text);
|
2002-08-22 05:24:01 +02:00
|
|
|
}
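/*
 * Behavior of split_part() at the SQL level, for reference:
 *
 *   split_part('abc~@~def~@~ghi', '~@~', 2)  =>  'def'
 *   split_part('abc,def', ',', 5)            =>  ''   (fewer fields than requested)
 *   split_part('', ',', 1)                   =>  ''   (empty input string)
 */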
|
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/*
|
|
|
|
* Convenience function to return true when two text params are equal.
|
|
|
|
*/
|
|
|
|
static bool
|
2019-03-22 12:09:32 +01:00
|
|
|
text_isequal(text *txt1, text *txt2, Oid collid)
|
2010-08-10 23:51:00 +02:00
|
|
|
{
|
2019-03-22 12:09:32 +01:00
|
|
|
return DatumGetBool(DirectFunctionCall2Coll(texteq,
|
|
|
|
collid,
|
|
|
|
PointerGetDatum(txt1),
|
|
|
|
PointerGetDatum(txt2)));
|
2010-08-10 23:51:00 +02:00
|
|
|
}
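/*
 * Typical call pattern (as used by text_to_array_internal() below),
 * shown here for illustration:
 *
 *     is_null = null_string ? text_isequal(txt, null_string,
 *                                          PG_GET_COLLATION()) : false;
 */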
|
|
|
|
|
2003-06-27 02:33:26 +02:00
|
|
|
/*
|
|
|
|
* text_to_array
|
2010-08-10 23:51:00 +02:00
|
|
|
* parse input string and return text array of elements,
|
2003-06-27 02:33:26 +02:00
|
|
|
* based on provided field separator
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
text_to_array(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2010-08-10 23:51:00 +02:00
|
|
|
return text_to_array_internal(fcinfo);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* text_to_array_null
|
|
|
|
* parse input string and return text array of elements,
|
|
|
|
* based on provided field separator and null string
|
|
|
|
*
|
|
|
|
* This is a separate entry point only to prevent the regression tests from
|
|
|
|
* complaining about different argument sets for the same internal function.
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
text_to_array_null(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
return text_to_array_internal(fcinfo);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* common code for text_to_array and text_to_array_null functions
|
|
|
|
*
|
|
|
|
* These are not strict so we have to test for null inputs explicitly.
|
|
|
|
*/
|
|
|
|
static Datum
|
|
|
|
text_to_array_internal(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
text *inputstring;
|
|
|
|
text *fldsep;
|
|
|
|
text *null_string;
|
2007-07-19 22:34:20 +02:00
|
|
|
int inputstring_len;
|
|
|
|
int fldsep_len;
|
2006-11-08 20:22:25 +01:00
|
|
|
char *start_ptr;
|
2004-01-31 01:45:21 +01:00
|
|
|
text *result_text;
|
2010-08-10 23:51:00 +02:00
|
|
|
bool is_null;
|
2003-06-27 02:33:26 +02:00
|
|
|
ArrayBuildState *astate = NULL;
|
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* if the input string is NULL, the result is NULL too */
|
|
|
|
if (PG_ARGISNULL(0))
|
2003-06-27 02:33:26 +02:00
|
|
|
PG_RETURN_NULL();
|
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
inputstring = PG_GETARG_TEXT_PP(0);
|
2006-10-07 02:11:53 +02:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* fldsep can be NULL */
|
|
|
|
if (!PG_ARGISNULL(1))
|
|
|
|
fldsep = PG_GETARG_TEXT_PP(1);
|
|
|
|
else
|
|
|
|
fldsep = NULL;
|
|
|
|
|
|
|
|
/* null_string can be NULL or omitted */
|
|
|
|
if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
|
|
|
|
null_string = PG_GETARG_TEXT_PP(2);
|
|
|
|
else
|
|
|
|
null_string = NULL;
|
2006-11-08 20:22:25 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
if (fldsep != NULL)
|
2003-06-27 02:33:26 +02:00
|
|
|
{
|
2010-08-10 23:51:00 +02:00
|
|
|
/*
|
|
|
|
* Normal case with non-null fldsep. Use the text_position machinery
|
|
|
|
* to search for occurrences of fldsep.
|
|
|
|
*/
|
|
|
|
TextPositionState state;
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
|
|
|
|
fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* return empty array for empty input string */
|
|
|
|
if (inputstring_len < 1)
|
|
|
|
PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* empty field separator: return the input string as a one-element
|
|
|
|
* array
|
|
|
|
*/
|
|
|
|
if (fldsep_len < 1)
|
2003-06-27 02:33:26 +02:00
|
|
|
{
|
Remove create_singleton_array(), hard-coding the case in its sole caller.
create_singleton_array() was not really as useful as we perhaps thought
when we added it. It had never accreted more than one call site, and is
only saving a dozen lines of code at that one, which is considerably less
bulk than the function itself. Moreover, because of its insistence on
using the caller's fn_extra cache space, it's arguably a coding hazard.
text_to_array_internal() does not currently use fn_extra in any other way,
but if it did it would be subtly broken, since the conflicting fn_extra
uses could be needed within a single query, in the seldom-tested case that
the field separator varies during the query. The same objection seems
likely to apply to any other potential caller.
The replacement code is a bit uglier, because it hardwires knowledge of
the storage parameters of type TEXT, but it's not like we haven't got
dozens or hundreds of other places that do the same. Uglier seems like
a good tradeoff for smaller, faster, and safer.
Per discussion with Neha Khatri.
Discussion: https://postgr.es/m/CAFO0U+_fS5SRhzq6uPG+4fbERhoA9N2+nPrtvaC9mmeWivxbsA@mail.gmail.com
2017-05-03 02:41:37 +02:00
|
|
|
Datum elems[1];
|
|
|
|
bool nulls[1];
|
|
|
|
int dims[1];
|
|
|
|
int lbs[1];
|
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* single element can be a NULL too */
|
2019-03-22 12:09:32 +01:00
|
|
|
is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
|
2017-05-03 02:41:37 +02:00
|
|
|
|
|
|
|
elems[0] = PointerGetDatum(inputstring);
|
|
|
|
nulls[0] = is_null;
|
|
|
|
dims[0] = 1;
|
|
|
|
lbs[0] = 1;
|
|
|
|
/* XXX: this hardcodes assumptions about the text type */
|
|
|
|
PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
|
|
|
|
1, dims, lbs,
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:35:54 +02:00
|
|
|
TEXTOID, -1, false, 'i'));
|
2003-06-27 02:33:26 +02:00
|
|
|
}
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2019-03-22 12:09:32 +01:00
|
|
|
text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
|
2019-01-25 15:25:05 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
start_ptr = VARDATA_ANY(inputstring);
|
|
|
|
|
2019-01-25 17:27:44 +01:00
|
|
|
for (;;)
|
2010-08-10 23:51:00 +02:00
|
|
|
{
|
2019-01-25 15:25:05 +01:00
|
|
|
bool found;
|
|
|
|
char *end_ptr;
|
2019-01-25 17:27:44 +01:00
|
|
|
int chunk_len;
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
2006-11-08 20:22:25 +01:00
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
found = text_position_next(&state);
|
|
|
|
if (!found)
|
2010-08-10 23:51:00 +02:00
|
|
|
{
|
|
|
|
/* fetch last field */
|
|
|
|
chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
|
2019-01-25 17:27:44 +01:00
|
|
|
end_ptr = NULL; /* not used, but some compilers complain */
|
2010-08-10 23:51:00 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* fetch non-last field */
|
2019-01-25 15:25:05 +01:00
|
|
|
end_ptr = text_position_get_match_ptr(&state);
|
|
|
|
chunk_len = end_ptr - start_ptr;
|
2010-08-10 23:51:00 +02:00
|
|
|
}
|
2006-10-07 02:11:53 +02:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* must build a temp text datum to pass to accumArrayResult */
|
|
|
|
result_text = cstring_to_text_with_len(start_ptr, chunk_len);
|
2019-03-22 12:09:32 +01:00
|
|
|
is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* stash away this field */
|
|
|
|
astate = accumArrayResult(astate,
|
|
|
|
PointerGetDatum(result_text),
|
|
|
|
is_null,
|
|
|
|
TEXTOID,
|
|
|
|
CurrentMemoryContext);
|
2006-11-08 20:22:25 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
pfree(result_text);
|
2006-11-08 20:22:25 +01:00
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
if (!found)
|
2010-08-10 23:51:00 +02:00
|
|
|
break;
|
|
|
|
|
2019-01-25 15:25:05 +01:00
|
|
|
start_ptr = end_ptr + fldsep_len;
|
2010-08-10 23:51:00 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
text_position_cleanup(&state);
|
2003-06-27 02:33:26 +02:00
|
|
|
}
|
2010-08-10 23:51:00 +02:00
|
|
|
else
|
|
|
|
{
|
2010-11-23 21:27:50 +01:00
|
|
|
/*
|
2010-08-10 23:51:00 +02:00
|
|
|
* When fldsep is NULL, each character in the inputstring becomes an
|
2011-04-10 17:42:00 +02:00
|
|
|
* element in the result array. The separator is effectively the
|
|
|
|
* space between characters.
|
2010-08-10 23:51:00 +02:00
|
|
|
*/
|
|
|
|
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* return empty array for empty input string */
|
|
|
|
if (inputstring_len < 1)
|
|
|
|
PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
start_ptr = VARDATA_ANY(inputstring);
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
while (inputstring_len > 0)
|
|
|
|
{
|
2011-04-10 17:42:00 +02:00
|
|
|
int chunk_len = pg_mblen(start_ptr);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
|
|
|
|
/* must build a temp text datum to pass to accumArrayResult */
|
|
|
|
result_text = cstring_to_text_with_len(start_ptr, chunk_len);
|
2019-03-22 12:09:32 +01:00
|
|
|
is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
/* stash away this field */
|
|
|
|
astate = accumArrayResult(astate,
|
|
|
|
PointerGetDatum(result_text),
|
|
|
|
is_null,
|
|
|
|
TEXTOID,
|
|
|
|
CurrentMemoryContext);
|
|
|
|
|
|
|
|
pfree(result_text);
|
|
|
|
|
|
|
|
start_ptr += chunk_len;
|
|
|
|
inputstring_len -= chunk_len;
|
|
|
|
}
|
|
|
|
}
|
2006-10-07 02:11:53 +02:00
|
|
|
|
|
|
|
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
|
|
|
|
CurrentMemoryContext));
|
2003-06-27 02:33:26 +02:00
|
|
|
}
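/*
 * SQL-level behavior, for reference (text_to_array is exposed as
 * string_to_array()):
 *
 *   string_to_array('xx~^~yy~^~zz', '~^~', 'yy')  =>  {xx,NULL,zz}
 *   string_to_array('abc', NULL)                  =>  {a,b,c}
 *   string_to_array('', ',')                      =>  {}
 */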
|
|
|
|
|
|
|
|
/*
|
|
|
|
* array_to_text
|
|
|
|
* concatenate C-string representations of input array elements
|
|
|
|
* using provided field separator
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
array_to_text(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
|
2008-03-25 23:42:46 +01:00
|
|
|
char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
2010-08-10 23:51:00 +02:00
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* array_to_text_null
|
|
|
|
* concatenate C-string representations of input array elements
|
|
|
|
* using provided field separator and null string
|
|
|
|
*
|
|
|
|
* This version is not strict so we have to test for null inputs explicitly.
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
array_to_text_null(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
ArrayType *v;
|
|
|
|
char *fldsep;
|
|
|
|
char *null_string;
|
|
|
|
|
|
|
|
/* returns NULL when first or second parameter is NULL */
|
|
|
|
if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
|
|
|
|
PG_RETURN_NULL();
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
v = PG_GETARG_ARRAYTYPE_P(0);
|
|
|
|
fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
|
|
|
|
|
|
|
/* NULL null string is passed through as a null pointer */
|
|
|
|
if (!PG_ARGISNULL(2))
|
|
|
|
null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
|
|
|
|
else
|
|
|
|
null_string = NULL;
|
|
|
|
|
|
|
|
PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* common code for array_to_text and array_to_text_null functions
|
|
|
|
*/
|
|
|
|
static text *
|
|
|
|
array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
|
2013-01-25 06:19:18 +01:00
|
|
|
const char *fldsep, const char *null_string)
|
2010-08-10 23:51:00 +02:00
|
|
|
{
|
|
|
|
text *result;
|
2003-08-04 02:43:34 +02:00
|
|
|
int nitems,
|
|
|
|
*dims,
|
|
|
|
ndims;
|
2003-06-27 02:33:26 +02:00
|
|
|
Oid element_type;
|
|
|
|
int typlen;
|
|
|
|
bool typbyval;
|
|
|
|
char typalign;
|
2006-10-04 02:30:14 +02:00
|
|
|
StringInfoData buf;
|
2005-11-18 03:38:24 +01:00
|
|
|
bool printed = false;
|
|
|
|
char *p;
|
|
|
|
bits8 *bitmap;
|
|
|
|
int bitmask;
|
2003-06-27 02:33:26 +02:00
|
|
|
int i;
|
|
|
|
ArrayMetaState *my_extra;
|
|
|
|
|
|
|
|
ndims = ARR_NDIM(v);
|
|
|
|
dims = ARR_DIMS(v);
|
|
|
|
nitems = ArrayGetNItems(ndims, dims);
|
|
|
|
|
|
|
|
/* if there are no elements, return an empty string */
|
|
|
|
if (nitems == 0)
|
2010-08-10 23:51:00 +02:00
|
|
|
return cstring_to_text_with_len("", 0);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
|
|
|
element_type = ARR_ELEMTYPE(v);
|
2006-03-01 07:51:01 +01:00
|
|
|
initStringInfo(&buf);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We arrange to look up info about element type, including its output
|
2005-10-15 04:49:52 +02:00
|
|
|
* conversion proc, only once per series of calls, assuming the element
|
|
|
|
* type doesn't change underneath us.
|
2003-06-27 02:33:26 +02:00
|
|
|
*/
|
|
|
|
my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
|
|
|
|
if (my_extra == NULL)
|
|
|
|
{
|
|
|
|
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
|
2005-10-15 04:49:52 +02:00
|
|
|
sizeof(ArrayMetaState));
|
2003-06-27 02:33:26 +02:00
|
|
|
my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
|
2005-11-18 03:38:24 +01:00
|
|
|
my_extra->element_type = ~element_type;
|
2003-06-27 02:33:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (my_extra->element_type != element_type)
|
|
|
|
{
|
2003-08-04 02:43:34 +02:00
|
|
|
/*
|
2005-10-15 04:49:52 +02:00
|
|
|
* Get info about element type, including its output conversion proc
|
2003-08-04 02:43:34 +02:00
|
|
|
*/
|
2003-06-27 02:33:26 +02:00
|
|
|
get_type_io_data(element_type, IOFunc_output,
|
|
|
|
&my_extra->typlen, &my_extra->typbyval,
|
|
|
|
&my_extra->typalign, &my_extra->typdelim,
|
2004-06-06 02:41:28 +02:00
|
|
|
&my_extra->typioparam, &my_extra->typiofunc);
|
2003-06-27 02:33:26 +02:00
|
|
|
fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
|
|
|
|
fcinfo->flinfo->fn_mcxt);
|
|
|
|
my_extra->element_type = element_type;
|
|
|
|
}
|
|
|
|
typlen = my_extra->typlen;
|
|
|
|
typbyval = my_extra->typbyval;
|
|
|
|
typalign = my_extra->typalign;
|
|
|
|
|
2005-11-18 03:38:24 +01:00
|
|
|
p = ARR_DATA_PTR(v);
|
|
|
|
bitmap = ARR_NULLBITMAP(v);
|
|
|
|
bitmask = 1;
|
|
|
|
|
2003-06-27 02:33:26 +02:00
|
|
|
for (i = 0; i < nitems; i++)
|
|
|
|
{
|
|
|
|
Datum itemvalue;
|
|
|
|
char *value;
|
|
|
|
|
2005-11-18 03:38:24 +01:00
|
|
|
/* Get source element, checking for NULL */
|
|
|
|
if (bitmap && (*bitmap & bitmask) == 0)
|
|
|
|
{
|
2010-08-10 23:51:00 +02:00
|
|
|
/* if null_string is NULL, we just ignore null elements */
|
|
|
|
if (null_string != NULL)
|
|
|
|
{
|
|
|
|
if (printed)
|
|
|
|
appendStringInfo(&buf, "%s%s", fldsep, null_string);
|
|
|
|
else
|
|
|
|
appendStringInfoString(&buf, null_string);
|
|
|
|
printed = true;
|
|
|
|
}
|
2005-11-18 03:38:24 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
itemvalue = fetch_att(p, typbyval, typlen);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2006-04-04 21:35:37 +02:00
|
|
|
value = OutputFunctionCall(&my_extra->proc, itemvalue);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2005-11-18 03:38:24 +01:00
|
|
|
if (printed)
|
2006-03-01 07:51:01 +01:00
|
|
|
appendStringInfo(&buf, "%s%s", fldsep, value);
|
2005-11-18 03:38:24 +01:00
|
|
|
else
|
2006-03-01 07:51:01 +01:00
|
|
|
appendStringInfoString(&buf, value);
|
2005-11-18 03:38:24 +01:00
|
|
|
printed = true;
|
|
|
|
|
2007-04-06 06:21:44 +02:00
|
|
|
p = att_addlength_pointer(p, typlen, p);
|
|
|
|
p = (char *) att_align_nominal(p, typalign);
|
2005-11-18 03:38:24 +01:00
|
|
|
}
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2005-11-18 03:38:24 +01:00
|
|
|
/* advance bitmap pointer if any */
|
|
|
|
if (bitmap)
|
|
|
|
{
|
|
|
|
bitmask <<= 1;
|
|
|
|
if (bitmask == 0x100)
|
|
|
|
{
|
|
|
|
bitmap++;
|
|
|
|
bitmask = 1;
|
|
|
|
}
|
|
|
|
}
|
2003-06-27 02:33:26 +02:00
|
|
|
}
|
2010-11-23 21:27:50 +01:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
result = cstring_to_text_with_len(buf.data, buf.len);
|
|
|
|
pfree(buf.data);
|
2003-06-27 02:33:26 +02:00
|
|
|
|
2010-08-10 23:51:00 +02:00
|
|
|
return result;
|
2003-06-27 02:33:26 +02:00
|
|
|
}
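/*
 * SQL-level behavior, for reference (array_to_text is exposed as
 * array_to_string()):
 *
 *   array_to_string(ARRAY[1, 2, 3, NULL, 5], ',', '*')  =>  1,2,3,*,5
 *   array_to_string(ARRAY[1, NULL, 2], ',')             =>  1,2   (NULLs skipped)
 */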
|
|
|
|
|
2002-08-22 05:24:01 +02:00
|
|
|
#define HEXBASE 16
|
|
|
|
/*
|
2015-05-20 15:18:11 +02:00
|
|
|
* Convert an int32 to a string containing a base 16 (hex) representation of
|
2002-08-22 05:24:01 +02:00
|
|
|
* the number.
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
to_hex32(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2003-12-19 05:56:41 +01:00
|
|
|
uint32 value = (uint32) PG_GETARG_INT32(0);
|
|
|
|
char *ptr;
|
|
|
|
const char *digits = "0123456789abcdef";
|
|
|
|
char buf[32]; /* bigger than needed, but reasonable */
|
2002-08-22 05:24:01 +02:00
|
|
|
|
2002-08-28 22:46:24 +02:00
|
|
|
ptr = buf + sizeof(buf) - 1;
|
2002-08-22 05:24:01 +02:00
|
|
|
*ptr = '\0';
|
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
*--ptr = digits[value % HEXBASE];
|
|
|
|
value /= HEXBASE;
|
|
|
|
} while (ptr > buf && value);
|
|
|
|
|
2008-03-25 23:42:46 +01:00
|
|
|
PG_RETURN_TEXT_P(cstring_to_text(ptr));
|
2002-08-22 05:24:01 +02:00
|
|
|
}

/*
 * Convert an int64 to a string containing a base 16 (hex) representation of
 * the number.
 */
Datum
to_hex64(PG_FUNCTION_ARGS)
{
    uint64      value = (uint64) PG_GETARG_INT64(0);
    char       *ptr;
    const char *digits = "0123456789abcdef";
    char        buf[32];        /* bigger than needed, but reasonable */

    ptr = buf + sizeof(buf) - 1;
    *ptr = '\0';

    do
    {
        *--ptr = digits[value % HEXBASE];
        value /= HEXBASE;
    } while (ptr > buf && value);

    PG_RETURN_TEXT_P(cstring_to_text(ptr));
}
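
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source). Both variants share the digit-peeling loop above; because the
 * input is first cast to unsigned, a negative input is rendered as its
 * two's-complement bit pattern.
 *
 *   SELECT to_hex(255);           -- 'ff'
 *   SELECT to_hex(-1);            -- 'ffffffff'
 *   SELECT to_hex(-1::bigint);    -- 'ffffffffffffffff'
 */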

/*
 * Return the size of a datum, possibly compressed
 *
 * Works on any data type
 */
Datum
pg_column_size(PG_FUNCTION_ARGS)
{
    Datum       value = PG_GETARG_DATUM(0);
    int32       result;
    int         typlen;

    /* On first call, get the input type's typlen, and save at *fn_extra */
    if (fcinfo->flinfo->fn_extra == NULL)
    {
        /* Lookup the datatype of the supplied argument */
        Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);

        typlen = get_typlen(argtypeid);
        if (typlen == 0)        /* should not happen */
            elog(ERROR, "cache lookup failed for type %u", argtypeid);

        fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
                                                      sizeof(int));
        *((int *) fcinfo->flinfo->fn_extra) = typlen;
    }
    else
        typlen = *((int *) fcinfo->flinfo->fn_extra);

    if (typlen == -1)
    {
        /* varlena type, possibly toasted */
        result = toast_datum_size(value);
    }
    else if (typlen == -2)
    {
        /* cstring */
        result = strlen(DatumGetCString(value)) + 1;
    }
    else
    {
        /* ordinary fixed-width type */
        result = typlen;
    }

    PG_RETURN_INT32(result);
}
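
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source). For a varlena value read from a table, toast_datum_size()
 * reports the stored, possibly compressed, size, which can be smaller than
 * octet_length() of the same value.
 *
 *   SELECT pg_column_size(0);              -- 4, the fixed width of int4
 *   SELECT pg_column_size(t.wide_col) ...  -- stored (toasted) size
 */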

/*
 * string_agg - Concatenates values and returns a string.
 *
 * Syntax: string_agg(value text, delimiter text) RETURNS text
 *
 * Note: Any NULL values are ignored. The first-call delimiter isn't
 * actually used at all, and on subsequent calls the delimiter precedes
 * the associated value.
 */

/* subroutine to initialize state */
static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)
{
    StringInfo  state;
    MemoryContext aggcontext;
    MemoryContext oldcontext;

    if (!AggCheckCallContext(fcinfo, &aggcontext))
    {
        /* cannot be called directly because of internal-type argument */
        elog(ERROR, "string_agg_transfn called in non-aggregate context");
    }

    /*
     * Create state in aggregate context.  It'll stay there across subsequent
     * calls.
     */
    oldcontext = MemoryContextSwitchTo(aggcontext);
    state = makeStringInfo();
    MemoryContextSwitchTo(oldcontext);

    return state;
}

Datum
string_agg_transfn(PG_FUNCTION_ARGS)
{
    StringInfo  state;

    state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);

    /* Append the value unless null. */
    if (!PG_ARGISNULL(1))
    {
        /* On the first time through, we ignore the delimiter. */
        if (state == NULL)
            state = makeStringAggState(fcinfo);
        else if (!PG_ARGISNULL(2))
            appendStringInfoText(state, PG_GETARG_TEXT_PP(2));  /* delimiter */

        appendStringInfoText(state, PG_GETARG_TEXT_PP(1));  /* value */
    }

    /*
     * The transition type for string_agg() is declared to be "internal",
     * which is a pass-by-value type the same size as a pointer.
     */
    PG_RETURN_POINTER(state);
}

Datum
string_agg_finalfn(PG_FUNCTION_ARGS)
{
    StringInfo  state;

    /* cannot be called directly because of internal-type argument */
    Assert(AggCheckCallContext(fcinfo, NULL));

    state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);

    if (state != NULL)
        PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
    else
        PG_RETURN_NULL();
}
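
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source):
 *
 *   SELECT string_agg(x, ',')
 *   FROM (VALUES ('a'), (NULL), ('b')) AS t(x);
 *   -- 'a,b': NULL inputs are skipped by the transition function, and if
 *   -- no non-NULL input ever arrives the final function returns NULL,
 *   -- not an empty string.
 */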

/*
 * Prepare cache with fmgr info for the output functions of the datatypes of
 * the arguments of a concat-like function, beginning with argument "argidx".
 * (Arguments before that will have corresponding slots in the resulting
 * FmgrInfo array, but we don't fill those slots.)
 */
static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
{
    FmgrInfo   *foutcache;
    int         i;

    /* We keep the info in fn_mcxt so it survives across calls */
    foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
                                                PG_NARGS() * sizeof(FmgrInfo));

    for (i = argidx; i < PG_NARGS(); i++)
    {
        Oid         valtype;
        Oid         typOutput;
        bool        typIsVarlena;

        valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
        if (!OidIsValid(valtype))
            elog(ERROR, "could not determine data type of concat() input");

        getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
        fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    }

    fcinfo->flinfo->fn_extra = foutcache;

    return foutcache;
}

/*
 * Implementation of both concat() and concat_ws().
 *
 * sepstr is the separator string to place between values.
 * argidx identifies the first argument to concatenate (counting from zero);
 * note that this must be constant across any one series of calls.
 *
 * Returns NULL if result should be NULL, else text value.
 */
static text *
concat_internal(const char *sepstr, int argidx,
                FunctionCallInfo fcinfo)
{
    text       *result;
    StringInfoData str;
    FmgrInfo   *foutcache;
    bool        first_arg = true;
    int         i;

    /*
     * concat(VARIADIC some-array) is essentially equivalent to
     * array_to_text(), ie concat the array elements with the given separator.
     * So we just pass the case off to that code.
     */
    if (get_fn_expr_variadic(fcinfo->flinfo))
    {
        ArrayType  *arr;

        /* Should have just the one argument */
        Assert(argidx == PG_NARGS() - 1);

        /* concat(VARIADIC NULL) is defined as NULL */
        if (PG_ARGISNULL(argidx))
            return NULL;

        /*
         * Non-null argument had better be an array.  We assume that any call
         * context that could let get_fn_expr_variadic return true will have
         * checked that a VARIADIC-labeled parameter actually is an array.  So
         * it should be okay to just Assert that it's an array rather than
         * doing a full-fledged error check.
         */
        Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));

        /* OK, safe to fetch the array value */
        arr = PG_GETARG_ARRAYTYPE_P(argidx);

        /*
         * And serialize the array.  We tell array_to_text to ignore null
         * elements, which matches the behavior of the loop below.
         */
        return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    }

    /* Normal case without explicit VARIADIC marker */
    initStringInfo(&str);

    /* Get output function info, building it if first time through */
    foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    if (foutcache == NULL)
        foutcache = build_concat_foutcache(fcinfo, argidx);

    for (i = argidx; i < PG_NARGS(); i++)
    {
        if (!PG_ARGISNULL(i))
        {
            Datum       value = PG_GETARG_DATUM(i);

            /* add separator if appropriate */
            if (first_arg)
                first_arg = false;
            else
                appendStringInfoString(&str, sepstr);

            /* call the appropriate type output function, append the result */
            appendStringInfoString(&str,
                                   OutputFunctionCall(&foutcache[i], value));
        }
    }

    result = cstring_to_text_with_len(str.data, str.len);
    pfree(str.data);

    return result;
}

/*
 * Concatenate all arguments. NULL arguments are ignored.
 */
Datum
text_concat(PG_FUNCTION_ARGS)
{
    text       *result;

    result = concat_internal("", 0, fcinfo);
    if (result == NULL)
        PG_RETURN_NULL();
    PG_RETURN_TEXT_P(result);
}

/*
 * Concatenate all but first argument value with separators. The first
 * parameter is used as the separator. NULL arguments are ignored.
 */
Datum
text_concat_ws(PG_FUNCTION_ARGS)
{
    char       *sep;
    text       *result;

    /* return NULL when separator is NULL */
    if (PG_ARGISNULL(0))
        PG_RETURN_NULL();
    sep = text_to_cstring(PG_GETARG_TEXT_PP(0));

    result = concat_internal(sep, 1, fcinfo);
    if (result == NULL)
        PG_RETURN_NULL();
    PG_RETURN_TEXT_P(result);
}
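
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source):
 *
 *   SELECT concat('a', NULL, 2);              -- 'a2' (NULLs skipped)
 *   SELECT concat_ws(',', 'a', NULL, 'b');    -- 'a,b' (no doubled separator)
 *   SELECT concat_ws(NULL, 'a', 'b');         -- NULL (NULL separator)
 *   SELECT concat(VARIADIC ARRAY['x', 'y']);  -- 'xy' (array case above)
 */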

/*
 * Return first n characters in the string. When n is negative,
 * return all but last |n| characters.
 */
Datum
text_left(PG_FUNCTION_ARGS)
{
    int         n = PG_GETARG_INT32(1);

    if (n < 0)
    {
        text       *str = PG_GETARG_TEXT_PP(0);
        const char *p = VARDATA_ANY(str);
        int         len = VARSIZE_ANY_EXHDR(str);
        int         rlen;

        n = pg_mbstrlen_with_len(p, len) + n;
        rlen = pg_mbcharcliplen(p, len, n);
        PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    }
    else
        PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
}

/*
 * Return last n characters in the string. When n is negative,
 * return all but first |n| characters.
 */
Datum
text_right(PG_FUNCTION_ARGS)
{
    text       *str = PG_GETARG_TEXT_PP(0);
    const char *p = VARDATA_ANY(str);
    int         len = VARSIZE_ANY_EXHDR(str);
    int         n = PG_GETARG_INT32(1);
    int         off;

    if (n < 0)
        n = -n;
    else
        n = pg_mbstrlen_with_len(p, len) - n;
    off = pg_mbcharcliplen(p, len, n);

    PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
}
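
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source); note that n counts characters, not bytes, in multibyte encodings:
 *
 *   SELECT left('abcde', 2);     -- 'ab'
 *   SELECT left('abcde', -2);    -- 'abc' (all but the last two)
 *   SELECT right('abcde', 2);    -- 'de'
 *   SELECT right('abcde', -2);   -- 'cde' (all but the first two)
 */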

/*
 * Return reversed string
 */
Datum
text_reverse(PG_FUNCTION_ARGS)
{
    text       *str = PG_GETARG_TEXT_PP(0);
    const char *p = VARDATA_ANY(str);
    int         len = VARSIZE_ANY_EXHDR(str);
    const char *endp = p + len;
    text       *result;
    char       *dst;

    result = palloc(len + VARHDRSZ);
    dst = (char *) VARDATA(result) + len;
    SET_VARSIZE(result, len + VARHDRSZ);

    if (pg_database_encoding_max_length() > 1)
    {
        /* multibyte version */
        while (p < endp)
        {
            int         sz;

            sz = pg_mblen(p);
            dst -= sz;
            memcpy(dst, p, sz);
            p += sz;
        }
    }
    else
    {
        /* single byte version */
        while (p < endp)
            *(--dst) = *p++;
    }

    PG_RETURN_TEXT_P(result);
}
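
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source). Reversal is per character, not per byte, so the multibyte branch
 * above copies each whole character intact:
 *
 *   SELECT reverse('abcde');     -- 'edcba'
 */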

/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */

#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    do { \
        if (++(ptr) >= (end_ptr)) \
            ereport(ERROR, \
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                     errmsg("unterminated format() type specifier"), \
                     errhint("For a single \"%%\" use \"%%%%\"."))); \
    } while (0)

/*
 * Returns a formatted string
 */
Datum
text_format(PG_FUNCTION_ARGS)
{
    text       *fmt;
    StringInfoData str;
    const char *cp;
    const char *start_ptr;
    const char *end_ptr;
    text       *result;
    int         arg;
    bool        funcvariadic;
    int         nargs;
    Datum      *elements = NULL;
    bool       *nulls = NULL;
    Oid         element_type = InvalidOid;
    Oid         prev_type = InvalidOid;
    Oid         prev_width_type = InvalidOid;
    FmgrInfo    typoutputfinfo;
    FmgrInfo    typoutputinfo_width;

    /* When format string is null, immediately return null */
    if (PG_ARGISNULL(0))
        PG_RETURN_NULL();

    /* If argument is marked VARIADIC, expand array into elements */
    if (get_fn_expr_variadic(fcinfo->flinfo))
    {
        ArrayType  *arr;
        int16       elmlen;
        bool        elmbyval;
        char        elmalign;
        int         nitems;

        /* Should have just the one argument */
        Assert(PG_NARGS() == 2);

        /* If argument is NULL, we treat it as zero-length array */
        if (PG_ARGISNULL(1))
            nitems = 0;
        else
        {
            /*
             * Non-null argument had better be an array.  We assume that any
             * call context that could let get_fn_expr_variadic return true
             * will have checked that a VARIADIC-labeled parameter actually is
             * an array.  So it should be okay to just Assert that it's an
             * array rather than doing a full-fledged error check.
             */
            Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));

            /* OK, safe to fetch the array value */
            arr = PG_GETARG_ARRAYTYPE_P(1);

            /* Get info about array element type */
            element_type = ARR_ELEMTYPE(arr);
            get_typlenbyvalalign(element_type,
                                 &elmlen, &elmbyval, &elmalign);

            /* Extract all array elements */
            deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
                              &elements, &nulls, &nitems);
        }

        nargs = nitems + 1;
        funcvariadic = true;
    }
    else
    {
        /* Non-variadic case, we'll process the arguments individually */
        nargs = PG_NARGS();
        funcvariadic = false;
    }

    /* Setup for main loop. */
    fmt = PG_GETARG_TEXT_PP(0);
    start_ptr = VARDATA_ANY(fmt);
    end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    initStringInfo(&str);
    arg = 1;                    /* next argument position to print */

    /* Scan format string, looking for conversion specifiers. */
    for (cp = start_ptr; cp < end_ptr; cp++)
    {
        int         argpos;
        int         widthpos;
        int         flags;
        int         width;
        Datum       value;
        bool        isNull;
        Oid         typid;

        /*
         * If it's not the start of a conversion specifier, just copy it to
         * the output buffer.
         */
        if (*cp != '%')
        {
            appendStringInfoCharMacro(&str, *cp);
            continue;
        }

        ADVANCE_PARSE_POINTER(cp, end_ptr);

        /* Easy case: %% outputs a single % */
        if (*cp == '%')
        {
            appendStringInfoCharMacro(&str, *cp);
            continue;
        }

        /* Parse the optional portions of the format specifier */
        cp = text_format_parse_format(cp, end_ptr,
                                      &argpos, &widthpos,
                                      &flags, &width);

        /*
         * Next we should see the main conversion specifier.  Whether or not
         * an argument position was present, it's known that at least one
         * character remains in the string at this point.  Experience suggests
         * that it's worth checking that that character is one of the expected
         * ones before we try to fetch arguments, so as to produce the least
         * confusing response to a mis-formatted specifier.
         */
        if (strchr("sIL", *cp) == NULL)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("unrecognized format() type specifier \"%c\"",
                            *cp),
                     errhint("For a single \"%%\" use \"%%%%\".")));

        /* If indirect width was specified, get its value */
        if (widthpos >= 0)
        {
            /* Collect the specified or next argument position */
            if (widthpos > 0)
                arg = widthpos;
            if (arg >= nargs)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("too few arguments for format()")));

            /* Get the value and type of the selected argument */
            if (!funcvariadic)
            {
                value = PG_GETARG_DATUM(arg);
                isNull = PG_ARGISNULL(arg);
                typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
            }
            else
            {
                value = elements[arg - 1];
                isNull = nulls[arg - 1];
                typid = element_type;
            }
            if (!OidIsValid(typid))
                elog(ERROR, "could not determine data type of format() input");

            arg++;

            /* We can treat NULL width the same as zero */
            if (isNull)
                width = 0;
            else if (typid == INT4OID)
                width = DatumGetInt32(value);
            else if (typid == INT2OID)
                width = DatumGetInt16(value);
            else
            {
                /* For less-usual datatypes, convert to text then to int */
                char       *str;

                if (typid != prev_width_type)
                {
                    Oid         typoutputfunc;
                    bool        typIsVarlena;

                    getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
                    fmgr_info(typoutputfunc, &typoutputinfo_width);
                    prev_width_type = typid;
                }

                str = OutputFunctionCall(&typoutputinfo_width, value);

                /* pg_strtoint32 will complain about bad data or overflow */
                width = pg_strtoint32(str);

                pfree(str);
            }
        }

        /* Collect the specified or next argument position */
        if (argpos > 0)
            arg = argpos;
        if (arg >= nargs)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("too few arguments for format()")));

        /* Get the value and type of the selected argument */
        if (!funcvariadic)
        {
            value = PG_GETARG_DATUM(arg);
            isNull = PG_ARGISNULL(arg);
            typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
        }
        else
        {
            value = elements[arg - 1];
            isNull = nulls[arg - 1];
            typid = element_type;
        }
        if (!OidIsValid(typid))
            elog(ERROR, "could not determine data type of format() input");

        arg++;

        /*
         * Get the appropriate typOutput function, reusing previous one if
         * same type as previous argument.  That's particularly useful in the
         * variadic-array case, but often saves work even for ordinary calls.
         */
        if (typid != prev_type)
        {
            Oid         typoutputfunc;
            bool        typIsVarlena;

            getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
            fmgr_info(typoutputfunc, &typoutputfinfo);
            prev_type = typid;
        }

        /*
         * And now we can format the value.
         */
        switch (*cp)
        {
            case 's':
            case 'I':
            case 'L':
                text_format_string_conversion(&str, *cp, &typoutputfinfo,
                                              value, isNull,
                                              flags, width);
                break;
            default:
                /* should not get here, because of previous check */
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("unrecognized format() type specifier \"%c\"",
                                *cp),
                         errhint("For a single \"%%\" use \"%%%%\".")));
                break;
        }
    }

    /* Don't need deconstruct_array results anymore. */
    if (elements != NULL)
        pfree(elements);
    if (nulls != NULL)
        pfree(nulls);

    /* Generate results. */
    result = cstring_to_text_with_len(str.data, str.len);
    pfree(str.data);

    PG_RETURN_TEXT_P(result);
}
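
/*
 * Illustrative SQL usage (an editor's sketch, not part of the original
 * source):
 *
 *   SELECT format('Hello %s', 'world');    -- 'Hello world'
 *   SELECT format('%1$s %1$s', 'dup');     -- 'dup dup' (explicit position)
 *   SELECT format('|%10s|', 'foo');        -- '|       foo|' (width 10)
 *   SELECT format('%I', 'my tab');         -- '"my tab"' (quoted identifier)
 *   SELECT format('%L', 'O''Reilly');      -- quoted literal 'O''Reilly'
 */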

/*
 * Parse contiguous digits as a decimal number.
 *
 * Returns true if some digits could be parsed.
 * The value is returned into *value, and *ptr is advanced to the next
 * character to be parsed.
 *
 * Note parsing invariant: at least one character is known available before
 * string end (end_ptr) at entry, and this is still true at exit.
 */
static bool
text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
{
    bool        found = false;
    const char *cp = *ptr;
    int         val = 0;

    while (*cp >= '0' && *cp <= '9')
    {
        int8        digit = (*cp - '0');

        if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
            unlikely(pg_add_s32_overflow(val, digit, &val)))
            ereport(ERROR,
                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
                     errmsg("number is out of range")));
        ADVANCE_PARSE_POINTER(cp, end_ptr);
        found = true;
    }

    *ptr = cp;
    *value = val;

    return found;
}

/*
 * Parse a format specifier (generally following the SUS printf spec).
 *
 * We have already advanced over the initial '%', and we are looking for
 * [argpos][flags][width]type (but the type character is not consumed here).
 *
 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
 * Output parameters:
 *  argpos: argument position for value to be printed.  -1 means unspecified.
 *  widthpos: argument position for width.  Zero means the argument position
 *      was unspecified (ie, take the next arg) and -1 means no width
 *      argument (width was omitted or specified as a constant).
 *  flags: bitmask of flags.
 *  width: directly-specified width value.  Zero means the width was omitted
 *      (note it's not necessary to distinguish this case from an explicit
 *      zero width value).
 *
 * The function result is the next character position to be parsed, ie, the
 * location where the type character is/should be.
 *
 * Note parsing invariant: at least one character is known available before
 * string end (end_ptr) at entry, and this is still true at exit.
 */
static const char *
text_format_parse_format(const char *start_ptr, const char *end_ptr,
                         int *argpos, int *widthpos,
                         int *flags, int *width)
{
    const char *cp = start_ptr;
    int         n;

    /* set defaults for output parameters */
    *argpos = -1;
    *widthpos = -1;
    *flags = 0;
    *width = 0;

    /* try to identify first number */
    if (text_format_parse_digits(&cp, end_ptr, &n))
    {
        if (*cp != '$')
        {
            /* Must be just a width and a type, so we're done */
            *width = n;
            return cp;
        }
        /* The number was argument position */
        *argpos = n;
        /* Explicit 0 for argument index is immediately refused */
        if (n == 0)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("format specifies argument 0, but arguments are numbered from 1")));
        ADVANCE_PARSE_POINTER(cp, end_ptr);
    }

    /* Handle flags (only minus is supported now) */
    while (*cp == '-')
    {
        *flags |= TEXT_FORMAT_FLAG_MINUS;
        ADVANCE_PARSE_POINTER(cp, end_ptr);
    }

    if (*cp == '*')
    {
        /* Handle indirect width */
        ADVANCE_PARSE_POINTER(cp, end_ptr);
        if (text_format_parse_digits(&cp, end_ptr, &n))
        {
            /* number in this position must be closed by $ */
            if (*cp != '$')
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("width argument position must be ended by \"$\"")));
            /* The number was width argument position */
            *widthpos = n;
            /* Explicit 0 for argument index is immediately refused */
            if (n == 0)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("format specifies argument 0, but arguments are numbered from 1")));
            ADVANCE_PARSE_POINTER(cp, end_ptr);
        }
        else
            *widthpos = 0;      /* width's argument position is unspecified */
    }
    else
    {
        /* Check for direct width specification */
        if (text_format_parse_digits(&cp, end_ptr, &n))
            *width = n;
    }

    /* cp should now be pointing at type character */
    return cp;
}
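
/*
 * Worked examples for the parser above (an editor's sketch, not part of the
 * original source), showing the output parameters for a few specifiers:
 *
 *   "%s"    -> argpos -1, widthpos -1, flags 0, width 0
 *   "%3$s"  -> argpos  3, widthpos -1, flags 0, width 0
 *   "%-8s"  -> argpos -1, widthpos -1, flags TEXT_FORMAT_FLAG_MINUS, width 8
 *   "%*2$s" -> argpos -1, widthpos  2, flags 0, width 0
 *   "%*s"   -> argpos -1, widthpos  0, flags 0, width 0 (take next arg)
 */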

/*
 * Format a %s, %I, or %L conversion
 */
static void
text_format_string_conversion(StringInfo buf, char conversion,
                              FmgrInfo *typOutputInfo,
                              Datum value, bool isNull,
                              int flags, int width)
{
    char       *str;

    /* Handle NULL arguments before trying to stringify the value. */
    if (isNull)
    {
        if (conversion == 's')
            text_format_append_string(buf, "", flags, width);
        else if (conversion == 'L')
            text_format_append_string(buf, "NULL", flags, width);
        else if (conversion == 'I')
            ereport(ERROR,
                    (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                     errmsg("null values cannot be formatted as an SQL identifier")));
        return;
    }

    /* Stringify. */
    str = OutputFunctionCall(typOutputInfo, value);

    /* Escape. */
    if (conversion == 'I')
    {
        /* quote_identifier may or may not allocate a new string. */
        text_format_append_string(buf, quote_identifier(str), flags, width);
    }
    else if (conversion == 'L')
    {
        char       *qstr = quote_literal_cstr(str);

        text_format_append_string(buf, qstr, flags, width);
        /* quote_literal_cstr() always allocates a new string */
        pfree(qstr);
    }
    else
        text_format_append_string(buf, str, flags, width);

    /* Cleanup. */
    pfree(str);
}

/*
 * Append str to buf, padding as directed by flags/width
 */
static void
text_format_append_string(StringInfo buf, const char *str,
                          int flags, int width)
{
    bool        align_to_left = false;
    int         len;

    /* fast path for typical easy case */
    if (width == 0)
    {
        appendStringInfoString(buf, str);
        return;
    }

    if (width < 0)
    {
        /* Negative width: implicit '-' flag, then take absolute value */
        align_to_left = true;
        /* -INT_MIN is undefined */
        if (width <= INT_MIN)
            ereport(ERROR,
                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
                     errmsg("number is out of range")));
        width = -width;
    }
    else if (flags & TEXT_FORMAT_FLAG_MINUS)
        align_to_left = true;

    len = pg_mbstrlen(str);
    if (align_to_left)
    {
        /* left justify */
        appendStringInfoString(buf, str);
        if (len < width)
            appendStringInfoSpaces(buf, width - len);
    }
    else
    {
        /* right justify */
        if (len < width)
            appendStringInfoSpaces(buf, width - len);
        appendStringInfoString(buf, str);
    }
}
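
/*
 * Illustrative padding behavior (an editor's sketch, not part of the
 * original source); the width reaches here from format(), and a negative
 * indirect width implies left-justification as coded above:
 *
 *   SELECT format('[%6s]', 'ab');       -- '[    ab]' (right justified)
 *   SELECT format('[%-6s]', 'ab');      -- '[ab    ]' (minus flag)
 *   SELECT format('[%*s]', -6, 'ab');   -- '[ab    ]' (negative width)
 */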

/*
 * text_format_nv - nonvariadic wrapper for text_format function.
 *
 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
 * which checks that all built-in functions that share the implementing C
 * function take the same number of arguments.
 */
Datum
text_format_nv(PG_FUNCTION_ARGS)
{
    return text_format(fcinfo);
}

/*
 * Helper function for Levenshtein distance functions. Faster than memcmp(),
 * for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
    while (len > 0)
    {
        len--;
        if (s1[len] != s2[len])
            return false;
    }
    return true;
}

/* Expand each Levenshtein distance variant */
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"