From cad764f349a535459e60926ed11e31c867d2ccd3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 23 Mar 2000 00:55:42 +0000 Subject: [PATCH] Improve selectivity estimation involving string constants: pay attention to more than one character, and try to do the right thing in non-ASCII locales. --- src/backend/utils/adt/selfuncs.c | 124 ++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 61ff43b3e9..af7a449f69 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.60 2000/03/20 15:42:46 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.61 2000/03/23 00:55:42 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,6 +48,8 @@ /* default selectivity estimate for inequalities such as "A < b" */ #define DEFAULT_INEQ_SEL (1.0 / 3.0) +static bool convert_string_to_scalar(char *str, int strlength, + double *scaleval); static void getattproperties(Oid relid, AttrNumber attnum, Oid *typid, int *typlen, @@ -472,9 +474,8 @@ scalargtjoinsel(Oid opid, * All numeric datatypes are simply converted to their equivalent * "double" values. * - * String datatypes are converted to a crude scale using their first character - * (only if it is in the ASCII range, to try to avoid problems with non-ASCII - * collating sequences). + * String datatypes are converted by convert_string_to_scalar(), + * which is explained below. 
* * The several datatypes representing absolute times are all converted * to Timestamp, which is actually a double, and then we just use that @@ -525,40 +526,25 @@ convert_to_scalar(Datum value, Oid typid, */ case CHAROID: { - char ch = DatumGetChar(value); + char ch = DatumGetChar(value); - if (ch >= 0 && ch < 127) - { - *scaleval = (double) ch; - return true; - } - break; + return convert_string_to_scalar(&ch, 1, scaleval); } case BPCHAROID: case VARCHAROID: case TEXTOID: - if (VARSIZE(DatumGetPointer(value)) > VARHDRSZ) - { - char ch = * (char *) VARDATA(DatumGetPointer(value)); + { + char *str = (char *) VARDATA(DatumGetPointer(value)); + int strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ; - if (ch >= 0 && ch < 127) - { - *scaleval = (double) ch; - return true; - } - } - break; + return convert_string_to_scalar(str, strlength, scaleval); + } case NAMEOID: { NameData *nm = (NameData *) DatumGetPointer(value); - char ch = NameStr(*nm)[0]; - if (ch >= 0 && ch < 127) - { - *scaleval = (double) ch; - return true; - } - break; + return convert_string_to_scalar(NameStr(*nm), strlen(NameStr(*nm)), + scaleval); } /* @@ -644,6 +630,88 @@ convert_to_scalar(Datum value, Oid typid, return false; } +/* + * Do convert_to_scalar()'s work for any character-string data type. + * + * String datatypes are converted to a scale that ranges from 0 to 1, where + * we visualize the bytes of the string as fractional base-256 digits. + * It's sufficient to consider the first few bytes, since double has only + * limited precision (and we can't expect huge accuracy in our selectivity + * predictions anyway!) + * + * If USE_LOCALE is defined, we must pass the string through strxfrm() + * before doing the computation, so as to generate correct locale-specific + * results. 
/*
 * convert_string_to_scalar
 *		Do convert_to_scalar()'s work for any character-string data type.
 *
 * The string is mapped onto a scale that ranges from 0 to 1 by treating its
 * leading bytes as the digits of a base-256 fraction.  Only the first few
 * bytes are examined, since double has only limited precision (and we can't
 * expect huge accuracy in our selectivity predictions anyway!)
 *
 * If USE_LOCALE is defined, the string is first passed through strxfrm(),
 * so that the bytes examined reflect the locale-specific ordering.
 *
 * str need not be null-terminated; at most strlength bytes of it are read.
 * The result is stored into *scaleval.  Every return path returns true;
 * a zero or negative strlength produces a scalar value of 0.
 */
static bool
convert_string_to_scalar(char *str, int strlength,
						 double *scaleval)
{
	/* No need to consider more than about 8 bytes (sizeof double) */
	enum
	{
		MAX_SCALAR_BYTES = 8
	};

	unsigned char *sptr;
	size_t		slen;
	size_t		i;
#ifdef USE_LOCALE
	char	   *rawstr;
	char	   *xfrmstr;
	size_t		xfrmsize;
	size_t		xfrmlen;
#endif
	double		num,
				denom;

	if (strlength <= 0)
	{
		*scaleval = 0;			/* empty string has scalar value 0 */
		return true;
	}

#ifdef USE_LOCALE
	/* Need a null-terminated string to pass to strxfrm() */
	rawstr = (char *) palloc(strlength + 1);
	memcpy(rawstr, str, strlength);
	rawstr[strlength] = '\0';

	/* Guess that transformed string is not much bigger */
	xfrmsize = strlength + 32;	/* arbitrary pad value here... */
	xfrmstr = (char *) palloc(xfrmsize);
	xfrmlen = strxfrm(xfrmstr, rawstr, xfrmsize);
	if (xfrmlen >= xfrmsize)
	{
		/* Oops, didn't make it; retry with the size strxfrm() asked for */
		pfree(xfrmstr);
		xfrmstr = (char *) palloc(xfrmlen + 1);
		xfrmlen = strxfrm(xfrmstr, rawstr, xfrmlen + 1);
	}
	pfree(rawstr);

	sptr = (unsigned char *) xfrmstr;
	slen = xfrmlen;
#else
	sptr = (unsigned char *) str;
	slen = (size_t) strlength;	/* known to be positive at this point */
#endif

	/*
	 * Clamp while the length is still a size_t.  Storing a very large
	 * strxfrm() result into an int first could produce a negative count,
	 * which would skip the loop below and report 0 for a nonempty string.
	 */
	if (slen > (size_t) MAX_SCALAR_BYTES)
		slen = (size_t) MAX_SCALAR_BYTES;

	/* Convert initial characters to fraction: byte i has weight 256^-(i+1) */
	num = 0.0;
	denom = 256.0;
	for (i = 0; i < slen; i++)
	{
		num += ((double) sptr[i]) / denom;
		denom *= 256.0;
	}

#ifdef USE_LOCALE
	pfree(xfrmstr);
#endif

	*scaleval = num;
	return true;
}


/*
 * getattproperties
 *		Retrieve pg_attribute properties for an attribute,
 *		(this comment continues beyond the visible part of the patch)
 */