From cb4ea994c628b6b63812397bd97d96270670724c Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 12 Dec 2005 11:10:12 +0000 Subject: [PATCH] Improve support of multibyte encoding: - tsvector_(in|out) - tsquery_(in|out) - to_tsvector - to_tsquery, plainto_tsquery - 'simple' dictionary --- contrib/tsearch2/dict.h | 1 - contrib/tsearch2/dict_ex.c | 1 + contrib/tsearch2/dict_ispell.c | 1 + contrib/tsearch2/dict_snowball.c | 1 + contrib/tsearch2/dict_syn.c | 1 + contrib/tsearch2/gendict/dict_snowball.c.IN | 1 + contrib/tsearch2/gendict/dict_tmpl.c.IN | 1 + contrib/tsearch2/ispell/spell.c | 19 +---- contrib/tsearch2/prs_dcfg.c | 67 ++++++++--------- contrib/tsearch2/query.c | 48 +++++++----- contrib/tsearch2/query.h | 4 +- contrib/tsearch2/stopword.c | 22 ++---- contrib/tsearch2/ts_locale.c | 59 ++++++++++++++- contrib/tsearch2/ts_locale.h | 48 +++++++++++- contrib/tsearch2/ts_stat.c | 41 ++++++----- contrib/tsearch2/tsvector.c | 82 +++++++++++---------- contrib/tsearch2/tsvector_op.c | 7 +- contrib/tsearch2/wordparser/parser.c | 3 + contrib/tsearch2/wordparser/parser.h | 2 + 19 files changed, 263 insertions(+), 146 deletions(-) diff --git a/contrib/tsearch2/dict.h b/contrib/tsearch2/dict.h index 0227bb4845..8aef0b0cb7 100644 --- a/contrib/tsearch2/dict.h +++ b/contrib/tsearch2/dict.h @@ -14,7 +14,6 @@ void sortstoplist(StopList * s); void freestoplist(StopList * s); void readstoplist(text *in, StopList * s); bool searchstoplist(StopList * s, char *key); -char *lowerstr(char *str); typedef struct { diff --git a/contrib/tsearch2/dict_ex.c b/contrib/tsearch2/dict_ex.c index 8ec3950f9f..334bb5248d 100644 --- a/contrib/tsearch2/dict_ex.c +++ b/contrib/tsearch2/dict_ex.c @@ -6,6 +6,7 @@ #include "dict.h" #include "common.h" +#include "ts_locale.h" typedef struct { diff --git a/contrib/tsearch2/dict_ispell.c b/contrib/tsearch2/dict_ispell.c index 28ce70a285..0e887da584 100644 --- a/contrib/tsearch2/dict_ispell.c +++ b/contrib/tsearch2/dict_ispell.c @@ -9,6 +9,7 @@ #include "dict.h" #include "common.h" #include "ispell/spell.h" +#include "ts_locale.h" typedef struct { diff --git a/contrib/tsearch2/dict_snowball.c b/contrib/tsearch2/dict_snowball.c index 0c08c293d3..bbd44246b8 100644 --- a/contrib/tsearch2/dict_snowball.c +++ b/contrib/tsearch2/dict_snowball.c @@ -10,6 +10,7 @@ #include "snowball/header.h" #include "snowball/english_stem.h" #include "snowball/russian_stem.h" +#include "ts_locale.h" typedef struct { diff --git a/contrib/tsearch2/dict_syn.c b/contrib/tsearch2/dict_syn.c index f328152080..b0c50334ea 100644 --- a/contrib/tsearch2/dict_syn.c +++ b/contrib/tsearch2/dict_syn.c @@ -8,6 +8,7 @@ #include "dict.h" #include "common.h" +#include "ts_locale.h" #define SYNBUFLEN 4096 typedef struct diff --git a/contrib/tsearch2/gendict/dict_snowball.c.IN b/contrib/tsearch2/gendict/dict_snowball.c.IN index ec25edc0ff..818fd6b157 100644 --- a/contrib/tsearch2/gendict/dict_snowball.c.IN +++ b/contrib/tsearch2/gendict/dict_snowball.c.IN @@ -12,6 +12,7 @@ #include "common.h" #include "snowball/header.h" #include "subinclude.h" +#include "ts_locale.h" typedef struct { struct SN_env *z; diff --git a/contrib/tsearch2/gendict/dict_tmpl.c.IN b/contrib/tsearch2/gendict/dict_tmpl.c.IN index e534ed30a7..9d90df712b 100644 --- a/contrib/tsearch2/gendict/dict_tmpl.c.IN +++ b/contrib/tsearch2/gendict/dict_tmpl.c.IN @@ -12,6 +12,7 @@ #include "common.h" #include "subinclude.h" +#include "ts_locale.h" HASINIT typedef struct { HASINIT StopList stoplist; diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index 9999983cc8..baa36f31f1 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -6,6 +6,7 @@ #include "postgres.h" #include "spell.h" +#include "ts_locale.h" #define MAX_NORM 1024 #define MAXNORMLEN 256 @@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2) return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag)); } -static void -strlower(char *str) -{ - unsigned char *ptr = (unsigned char *) str; - - while (*ptr) - { - *ptr = tolower(*ptr); - ptr++; - } -} - static char * strnduplicate(char *s, int len) { @@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename) } else flag = ""; - strlower(str); + lowerstr(str); /* Dont load words if first letter is not required */ /* It allows to optimize loading at search time */ s = str; @@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename) *s = 0; if (!*str) continue; - strlower(str); + lowerstr(str); strcpy(mask, ""); strcpy(find, ""); strcpy(repl, ""); @@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) if (wrdlen > MAXNORMLEN) return NULL; - strlower(word); + lowerstr(word); cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); *cur = NULL; diff --git a/contrib/tsearch2/prs_dcfg.c b/contrib/tsearch2/prs_dcfg.c index 240aaa4497..c54ca11803 100644 --- a/contrib/tsearch2/prs_dcfg.c +++ b/contrib/tsearch2/prs_dcfg.c @@ -8,6 +8,7 @@ #include "dict.h" #include "common.h" +#include "ts_locale.h" #define CS_WAITKEY 0 #define CS_INKEY 1 @@ -30,11 +31,11 @@ nstrdup(char *ptr, int len) cptr = ptr = res; while (*ptr) { - if (*ptr == '\\') + if (t_iseq(ptr, '\\')) ptr++; - *cptr = *ptr; - ptr++; - cptr++; + COPYCHAR( cptr, ptr ); + cptr+=pg_mblen(ptr); + ptr+=pg_mblen(ptr); } *cptr = '\0'; @@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m) while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ) { - if (*ptr == ',') + if ( t_iseq(ptr, ',') ) num++; - ptr++; + ptr+=pg_mblen(ptr); } *m = mptr = (Map *) palloc(sizeof(Map) * (num + 2)); @@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m) { if (state == CS_WAITKEY) { - if (isalpha((unsigned char) *ptr)) + if (t_isalpha(ptr)) { begin = ptr; state = CS_INKEY; } - else if (!isspace((unsigned char) *ptr)) + else if (!t_isspace(ptr)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), - errdetail("Syntax error in position %d near \"%c\"", - (int) (ptr - VARDATA(in)), *ptr))); + errdetail("Syntax error in position %d", + (int) (ptr - VARDATA(in))))); } else if (state == CS_INKEY) { - if (isspace((unsigned char) *ptr)) + if (t_isspace(ptr)) { mptr->key = nstrdup(begin, ptr - begin); state = CS_WAITEQ; } - else if (*ptr == '=') + else if (t_iseq(ptr,'=')) { mptr->key = nstrdup(begin, ptr - begin); state = CS_WAITVALUE; } - else if (!isalpha((unsigned char) *ptr)) + else if (!t_isalpha(ptr)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), - errdetail("Syntax error in position %d near \"%c\"", - (int) (ptr - VARDATA(in)), *ptr))); + errdetail("Syntax error in position %d", + (int) (ptr - VARDATA(in))))); } else if (state == CS_WAITEQ) { - if (*ptr == '=') + if (t_iseq(ptr, '=')) state = CS_WAITVALUE; - else if (!isspace((unsigned char) *ptr)) + else if (!t_isspace(ptr)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), - errdetail("Syntax error in position %d near \"%c\"", - (int) (ptr - VARDATA(in)), *ptr))); + errdetail("Syntax error in position %d", + (int) (ptr - VARDATA(in))))); } else if (state == CS_WAITVALUE) { - if (*ptr == '"') + if (t_iseq(ptr, '"')) { begin = ptr + 1; state = CS_INVALUE; } - else if (!isspace((unsigned char) *ptr)) + else if (!t_isspace(ptr)) { begin = ptr; state = CS_IN2VALUE; @@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m) } else if (state == CS_INVALUE) { - if (*ptr == '"') + if (t_iseq(ptr, '"')) { mptr->value = nstrdup(begin, ptr - begin); mptr++; state = CS_WAITDELIM; } - else if (*ptr == '\\') + else if (t_iseq(ptr, '\\')) state = CS_INESC; } else if (state == CS_IN2VALUE) { - if (isspace((unsigned char) *ptr) || *ptr == ',') + if (t_isspace(ptr) || t_iseq(ptr, ',')) { mptr->value = nstrdup(begin, ptr - begin); mptr++; - state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM; + state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM; } - else if (*ptr == '\\') + else if (t_iseq(ptr, '\\')) state = CS_INESC; } else if (state == CS_WAITDELIM) { - if (*ptr == ',') + if (t_iseq(ptr, ',')) state = CS_WAITKEY; - else if (!isspace((unsigned char) *ptr)) + else if (!t_isspace(ptr)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), - errdetail("Syntax error in position %d near \"%c\"", - (int) (ptr - VARDATA(in)), *ptr))); + errdetail("Syntax error in position %d", + (int) (ptr - VARDATA(in))))); } else if (state == CS_INESC) state = CS_INVALUE; @@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("bad parser state"), - errdetail("%d at position %d near \"%c\"", - state, (int) (ptr - VARDATA(in)), *ptr))); - ptr++; + errdetail("%d at position %d", + state, (int) (ptr - VARDATA(in))))); + ptr+=pg_mblen(ptr); } if (state == CS_IN2VALUE) diff --git a/contrib/tsearch2/query.c b/contrib/tsearch2/query.c index de6d96ed52..e6285fd9d2 100644 --- a/contrib/tsearch2/query.c +++ b/contrib/tsearch2/query.c @@ -25,7 +25,7 @@ #include "query.h" #include "query_cleanup.h" #include "common.h" - +#include "ts_locale.h" PG_FUNCTION_INFO_V1(tsquery_in); Datum tsquery_in(PG_FUNCTION_ARGS); @@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight) { *weight = 0; - if (*buf != ':') + if ( !t_iseq(buf, ':') ) return buf; buf++; - while (*buf) + while ( *buf && pg_mblen(buf) == 1 ) { - switch (tolower(*buf)) + switch (*buf) { case 'a': + case 'A': *weight |= 1 << 3; break; case 'b': + case 'B': *weight |= 1 << 2; break; case 'c': + case 'C': *weight |= 1 << 1; break; case 'd': + case 'D': *weight |= 1; break; default: @@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 { case WAITFIRSTOPERAND: case WAITOPERAND: - if (*(state->buf) == '!') + if ( t_iseq(state->buf, '!') ) { - (state->buf)++; + (state->buf)++; /* can safely ++, t_iseq guarantee that pg_mblen()==1 */ *val = (int4) '!'; return OPR; } - else if (*(state->buf) == '(') + else if ( t_iseq(state->buf, '(') ) { state->count++; (state->buf)++; return OPEN; } - else if (*(state->buf) == ':') + else if ( t_iseq(state->buf, ':') ) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("error at start of operand"))); } - else if (*(state->buf) != ' ') + else if ( !t_isspace(state->buf) ) { state->valstate.prsbuf = state->buf; if (gettoken_tsvector(&(state->valstate))) @@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 } break; case WAITOPERATOR: - if (*(state->buf) == '&' || *(state->buf) == '|') + if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') ) { state->state = WAITOPERAND; *val = (int4) *(state->buf); (state->buf)++; return OPR; } - else if (*(state->buf) == ')') + else if ( t_iseq(state->buf, ')') ) { (state->buf)++; state->count--; @@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 } else if (*(state->buf) == '\0') return (state->count) ? ERR : END; - else if (*(state->buf) != ' ') + else if ( !t_isspace(state->buf) ) return ERR; break; case WAITSINGLEOPERAND: @@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 return ERR; break; } - (state->buf)++; + state->buf+=pg_mblen(state->buf); } return END; } @@ -697,8 +701,11 @@ static QUERYTYPE * Datum tsquery_in(PG_FUNCTION_ARGS) { + char * in = (char*)PG_GETARG_POINTER(0); + pg_verifymbstr( in, strlen(in), false); + SET_FUNCOID(); - PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false)); + PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false)); } /* @@ -732,20 +739,23 @@ infix(INFIX * in, bool first) if (in->curpol->type == VAL) { char *op = in->op + in->curpol->distance; + int clen; - RESIZEBUF(in, in->curpol->length * 2 + 2 + 5); + RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5); *(in->cur) = '\''; in->cur++; while (*op) { - if (*op == '\'') + if ( t_iseq(op, '\'') ) { *(in->cur) = '\\'; in->cur++; } - *(in->cur) = *op; - op++; - in->cur++; + COPYCHAR(in->cur,op); + + clen = pg_mblen(op); + op+=clen; + in->cur+=clen; } *(in->cur) = '\''; in->cur++; diff --git a/contrib/tsearch2/query.h b/contrib/tsearch2/query.h index 9eff69cc71..b4d586a684 100644 --- a/contrib/tsearch2/query.h +++ b/contrib/tsearch2/query.h @@ -4,7 +4,7 @@ #define BS_DEBUG */ - +#include "ts_locale.h" /* * item in polish notation with back link * to left operand @@ -38,7 +38,7 @@ typedef struct #define GETQUERY(x) (ITEM*)( (char*)(x)+HDRSIZEQT ) #define GETOPERAND(x) ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) ) -#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' ) +#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) #define END 0 #define ERR 1 diff --git a/contrib/tsearch2/stopword.c b/contrib/tsearch2/stopword.c index b8789f9e64..2a9a464596 100644 --- a/contrib/tsearch2/stopword.c +++ b/contrib/tsearch2/stopword.c @@ -10,22 +10,10 @@ #include "common.h" #include "dict.h" +#include "ts_locale.h" #define STOPBUFLEN 4096 -char * -lowerstr(char *str) -{ - char *ptr = str; - - while (*ptr) - { - *ptr = tolower(*(unsigned char *) ptr); - ptr++; - } - return str; -} - void freestoplist(StopList * s) { @@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s) { char sharepath[MAXPGPATH]; char *absfn; +#ifdef WIN32 + char delim = '\\'; +#else + char delim = '/'; +#endif get_share_path(my_exec_path, sharepath); absfn = palloc(strlen(sharepath) + strlen(filename) + 2); - sprintf(absfn, "%s/%s", sharepath, filename); + sprintf(absfn, "%s%c%s", sharepath, delim, filename); + pfree(filename); filename = absfn; } diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c index 5dc67abc8d..29c07c0eab 100644 --- a/contrib/tsearch2/ts_locale.c +++ b/contrib/tsearch2/ts_locale.c @@ -5,7 +5,9 @@ #include "mb/pg_wchar.h" -#if defined(TS_USE_WIDE) && defined(WIN32) +#ifdef TS_USE_WIDE + +#ifdef WIN32 size_t wchar2char(char *to, const wchar_t *from, size_t len) @@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len) return mbstowcs(to, from, len); } +#endif /* WIN32 */ + +int +_t_isalpha( char *ptr ) { + wchar_t character; + + char2wchar(&character, ptr, 1); + + return iswalpha( (wint_t)character ); +} + +int +_t_isprint( char *ptr ) { + wchar_t character; + + char2wchar(&character, ptr, 1); + + return iswprint( (wint_t)character ); +} + +#endif /* TS_USE_WIDE */ + +char * +lowerstr(char *str) +{ + char *ptr = str; + +#ifdef TS_USE_WIDE + /* + * Use wide char code only when max encoding length > 1 and ctype != C. + * Some operating systems fail with multi-byte encodings and a C locale. + * Also, for a C locale there is no need to process as multibyte. From + * backend/utils/adt/oracle_compat.c Teodor + */ + if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) { + wchar_t *wstr, *wptr; + int len = strlen(str); + + wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1)); + char2wchar(wstr, str, len+1); + while (*wptr) { + *wptr = towlower((wint_t) *wptr); + wptr++; + } + wchar2char(str, wstr, len); + pfree( wstr ); + } else #endif + while (*ptr) + { + *ptr = tolower(*(unsigned char *) ptr); + ptr++; + } + return str; +} + diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h index 905eb94af0..2d5bc17a96 100644 --- a/contrib/tsearch2/ts_locale.h +++ b/contrib/tsearch2/ts_locale.h @@ -2,6 +2,8 @@ #define __TSLOCALE_H__ #include "postgres.h" +#include "utils/pg_locale.h" +#include "mb/pg_wchar.h" #include #include @@ -19,18 +21,58 @@ #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) #define TS_USE_WIDE +#endif + +#ifdef TS_USE_WIDE +#endif /* TS_USE_WIDE */ + + +#define TOUCHAR(x) (*((unsigned char*)(x))) + +#ifdef TS_USE_WIDE #ifdef WIN32 size_t wchar2char(char *to, const wchar_t *from, size_t len); size_t char2wchar(wchar_t *to, const char *from, size_t len); -#else /* WIN32 */ +#else /* WIN32 */ /* correct mbstowcs */ #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ -#endif /* defined(HAVE_WCSTOMBS) && - * defined(HAVE_TOWLOWER) */ + +#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) +#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) ) +int _t_isalpha( char *ptr ); +#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) ) +int _t_isprint( char *ptr ); +#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) ) +/* + * t_iseq() should be called only for ASCII symbols + */ +#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) + +#define COPYCHAR(d,s) do { \ + int lll = pg_mblen( s ); \ + \ + while( lll-- ) \ + TOUCHAR(d+lll) = TOUCHAR(s+lll); \ +} while(0) + + +#else /* not def TS_USE_WIDE */ + +#define t_isdigit(x) isdigit( TOUCHAR(x) ) +#define t_isspace(x) isspace( TOUCHAR(x) ) +#define t_isalpha(x) isalpha( TOUCHAR(x) ) +#define t_isprint(x) isprint( TOUCHAR(x) ) +#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) ) + +#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s) + +#endif + +char* lowerstr(char *str); #endif /* __TSLOCALE_H__ */ diff --git a/contrib/tsearch2/ts_stat.c b/contrib/tsearch2/ts_stat.c index b8ecf96e6d..ae9575b353 100644 --- a/contrib/tsearch2/ts_stat.c +++ b/contrib/tsearch2/ts_stat.c @@ -8,6 +8,7 @@ #include "catalog/pg_type.h" #include "executor/spi.h" #include "common.h" +#include "ts_locale.h" PG_FUNCTION_INFO_V1(tsstat_in); Datum tsstat_in(PG_FUNCTION_ARGS); @@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws) buf = VARDATA(ws); while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ) { - switch (tolower(*buf)) - { - case 'a': - stat->weight |= 1 << 3; - break; - case 'b': - stat->weight |= 1 << 2; - break; - case 'c': - stat->weight |= 1 << 1; - break; - case 'd': - stat->weight |= 1; - break; - default: - stat->weight |= 0; + if ( pg_mblen(buf) == 1 ) { + switch (*buf) + { + case 'A': + case 'a': + stat->weight |= 1 << 3; + break; + case 'B': + case 'b': + stat->weight |= 1 << 2; + break; + case 'C': + case 'c': + stat->weight |= 1 << 1; + break; + case 'D': + case 'd': + stat->weight |= 1; + break; + default: + stat->weight |= 0; + } } - buf++; + buf+=pg_mblen(buf); } } diff --git a/contrib/tsearch2/tsvector.c b/contrib/tsearch2/tsvector.c index cfed6e428a..dd895ff38a 100644 --- a/contrib/tsearch2/tsvector.c +++ b/contrib/tsearch2/tsvector.c @@ -16,8 +16,9 @@ #include "catalog/namespace.h" #include "utils/pg_locale.h" +#include "mb/pg_wchar.h" -#include /* tolower */ +#include #include "tsvector.h" #include "query.h" #include "ts_cfg.h" @@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) #define RESIZEPRSBUF \ do { \ - if ( state->curpos - state->word + 1 >= state->len ) \ + if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ { \ int4 clen = state->curpos - state->word; \ state->len *= 2; \ @@ -182,6 +183,7 @@ do { \ } \ } while (0) + int4 gettoken_tsvector(TI_IN_STATE * state) { @@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state) { if (*(state->prsbuf) == '\0') return 0; - else if (*(state->prsbuf) == '\'') + else if ( t_iseq(state->prsbuf, '\'') ) state->state = WAITENDCMPLX; - else if (*(state->prsbuf) == '\\') + else if ( t_iseq(state->prsbuf, '\\') ) { state->state = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf))) + else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); - else if (*(state->prsbuf) != ' ') + else if (!t_isspace(state->prsbuf)) { - *(state->curpos) = *(state->prsbuf); - state->curpos++; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos+=pg_mblen(state->prsbuf); state->state = WAITENDWORD; } } @@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state) else { RESIZEPRSBUF; - *(state->curpos) = *(state->prsbuf); - state->curpos++; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos+=pg_mblen(state->prsbuf); state->state = oldstate; } } else if (state->state == WAITENDWORD) { - if (*(state->prsbuf) == '\\') + if ( t_iseq(state->prsbuf, '\\') ) { state->state = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' || - (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))) + else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf))) { RESIZEPRSBUF; if (state->curpos == state->word) @@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state) *(state->curpos) = '\0'; return 1; } - else if (*(state->prsbuf) == ':') + else if ( t_iseq(state->prsbuf,':') ) { if (state->curpos == state->word) ereport(ERROR, @@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state) else { RESIZEPRSBUF; - *(state->curpos) = *(state->prsbuf); - state->curpos++; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos+=pg_mblen(state->prsbuf); } } else if (state->state == WAITENDCMPLX) { - if (*(state->prsbuf) == '\'') + if ( t_iseq(state->prsbuf, '\'') ) { RESIZEPRSBUF; *(state->curpos) = '\0'; @@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state) errmsg("syntax error"))); if (state->oprisdelim) { - state->prsbuf++; + state->prsbuf+=pg_mblen(state->prsbuf); return 1; } else state->state = WAITPOSINFO; } - else if (*(state->prsbuf) == '\\') + else if ( t_iseq(state->prsbuf, '\\') ) { state->state = WAITNEXTCHAR; oldstate = WAITENDCMPLX; @@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state) else { RESIZEPRSBUF; - *(state->curpos) = *(state->prsbuf); - state->curpos++; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos+=pg_mblen(state->prsbuf); } } else if (state->state == WAITPOSINFO) { - if (*(state->prsbuf) == ':') + if ( t_iseq(state->prsbuf, ':') ) state->state = INPOSINFO; else return 1; } else if (state->state == INPOSINFO) { - if (isdigit((unsigned char) *(state->prsbuf))) + if (t_isdigit(state->prsbuf)) { if (state->alen == 0) { @@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state) } else if (state->state == WAITPOSDELIM) { - if (*(state->prsbuf) == ',') + if ( t_iseq(state->prsbuf, ',') ) state->state = INPOSINFO; - else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*') + else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') ) { if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) ereport(ERROR, @@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state) errmsg("syntax error"))); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); } - else if (tolower(*(state->prsbuf)) == 'b') + else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') ) { if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) ereport(ERROR, @@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state) errmsg("syntax error"))); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); } - else if (tolower(*(state->prsbuf)) == 'c') + else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') ) { if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) ereport(ERROR, @@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state) errmsg("syntax error"))); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); } - else if (tolower(*(state->prsbuf)) == 'd') + else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') ) { if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) ereport(ERROR, @@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state) errmsg("syntax error"))); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); } - else if (isspace((unsigned char) *(state->prsbuf)) || + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0') return 1; - else if (!isdigit((unsigned char) *(state->prsbuf))) + else if (!t_isdigit(state->prsbuf)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); @@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state) else /* internal error */ elog(ERROR, "internal error"); - state->prsbuf++; + state->prsbuf+=pg_mblen(state->prsbuf); } return 0; @@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS) buflen = 256; SET_FUNCOID(); + + pg_verifymbstr( buf, strlen(buf), false ); state.prsbuf = buf; state.len = 32; state.word = (char *) palloc(state.len); @@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS) tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); char *outbuf; int4 i, - j, lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); - char *curin, + char *curbegin, *curin, *curout; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) { - lenbuf += ptr[i].len * 2 /* for escape */ ; + lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ; if (ptr[i].haspos) lenbuf += 7 * POSDATALEN(out, &(ptr[i])); } @@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS) curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { - curin = STRPTR(out) + ptr->pos; + curbegin = curin = STRPTR(out) + ptr->pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; - j = ptr->len; - while (j--) + while ( curin-curbegin < ptr->len ) { - if (*curin == '\'') + int len = pg_mblen(curin); + if ( t_iseq(curin, '\'') ) { int4 pos = curout - outbuf; @@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS) curout = outbuf + pos; *curout++ = '\\'; } - *curout++ = *curin++; + while(len--) + *curout++ = *curin++; } *curout++ = '\''; if ((pp = POSDATALEN(out, ptr)) != 0) diff --git a/contrib/tsearch2/tsvector_op.c b/contrib/tsearch2/tsvector_op.c index b2562e8984..c911975394 100644 --- a/contrib/tsearch2/tsvector_op.c +++ b/contrib/tsearch2/tsvector_op.c @@ -15,7 +15,6 @@ #include "utils/pg_locale.h" -#include /* tolower */ #include "tsvector.h" #include "query.h" #include "ts_cfg.h" @@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS) WordEntryPos *p; int w = 0; - switch (tolower(cw)) + switch (cw) { + case 'A': case 'a': w = 3; break; + case 'B': case 'b': w = 2; break; + case 'C': case 'c': w = 1; break; + case 'D': case 'd': w = 0; break; diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c index 23b031be79..8a5fcdabe6 100644 --- a/contrib/tsearch2/wordparser/parser.c +++ b/contrib/tsearch2/wordparser/parser.c @@ -71,8 +71,11 @@ TParserClose(TParser * prs) prs->state = ptr; } +#ifdef TS_USE_WIDE if (prs->wstr) pfree(prs->wstr); +#endif + pfree(prs); } diff --git a/contrib/tsearch2/wordparser/parser.h b/contrib/tsearch2/wordparser/parser.h index 923edea589..baeabf72cd 100644 --- a/contrib/tsearch2/wordparser/parser.h +++ b/contrib/tsearch2/wordparser/parser.h @@ -134,8 +134,10 @@ typedef struct TParser /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ +#ifdef TS_USE_WIDE wchar_t *wstr; /* wide character string */ int lenwstr; /* length of wsting */ +#endif /* State of parse */ int charmaxlen;