Improve support for multibyte encodings:

- tsvector_(in|out)
- tsquery_(in|out)
- to_tsvector
- to_tsquery, plainto_tsquery
- 'simple' dictionary
This commit is contained in:
Teodor Sigaev 2005-12-12 11:10:12 +00:00
parent ec0baf949e
commit cb4ea994c6
19 changed files with 263 additions and 146 deletions

View File

@ -14,7 +14,6 @@ void sortstoplist(StopList * s);
void freestoplist(StopList * s); void freestoplist(StopList * s);
void readstoplist(text *in, StopList * s); void readstoplist(text *in, StopList * s);
bool searchstoplist(StopList * s, char *key); bool searchstoplist(StopList * s, char *key);
char *lowerstr(char *str);
typedef struct typedef struct
{ {

View File

@ -6,6 +6,7 @@
#include "dict.h" #include "dict.h"
#include "common.h" #include "common.h"
#include "ts_locale.h"
typedef struct typedef struct
{ {

View File

@ -9,6 +9,7 @@
#include "dict.h" #include "dict.h"
#include "common.h" #include "common.h"
#include "ispell/spell.h" #include "ispell/spell.h"
#include "ts_locale.h"
typedef struct typedef struct
{ {

View File

@ -10,6 +10,7 @@
#include "snowball/header.h" #include "snowball/header.h"
#include "snowball/english_stem.h" #include "snowball/english_stem.h"
#include "snowball/russian_stem.h" #include "snowball/russian_stem.h"
#include "ts_locale.h"
typedef struct typedef struct
{ {

View File

@ -8,6 +8,7 @@
#include "dict.h" #include "dict.h"
#include "common.h" #include "common.h"
#include "ts_locale.h"
#define SYNBUFLEN 4096 #define SYNBUFLEN 4096
typedef struct typedef struct

View File

@ -12,6 +12,7 @@
#include "common.h" #include "common.h"
#include "snowball/header.h" #include "snowball/header.h"
#include "subinclude.h" #include "subinclude.h"
#include "ts_locale.h"
typedef struct { typedef struct {
struct SN_env *z; struct SN_env *z;

View File

@ -12,6 +12,7 @@
#include "common.h" #include "common.h"
#include "subinclude.h" #include "subinclude.h"
#include "ts_locale.h"
HASINIT typedef struct { HASINIT typedef struct {
HASINIT StopList stoplist; HASINIT StopList stoplist;

View File

@ -6,6 +6,7 @@
#include "postgres.h" #include "postgres.h"
#include "spell.h" #include "spell.h"
#include "ts_locale.h"
#define MAX_NORM 1024 #define MAX_NORM 1024
#define MAXNORMLEN 256 #define MAXNORMLEN 256
@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2)
return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag)); return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
} }
static void
strlower(char *str)
{
unsigned char *ptr = (unsigned char *) str;
while (*ptr)
{
*ptr = tolower(*ptr);
ptr++;
}
}
static char * static char *
strnduplicate(char *s, int len) strnduplicate(char *s, int len)
{ {
@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
} }
else else
flag = ""; flag = "";
strlower(str); lowerstr(str);
/* Dont load words if first letter is not required */ /* Dont load words if first letter is not required */
/* It allows to optimize loading at search time */ /* It allows to optimize loading at search time */
s = str; s = str;
@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
*s = 0; *s = 0;
if (!*str) if (!*str)
continue; continue;
strlower(str); lowerstr(str);
strcpy(mask, ""); strcpy(mask, "");
strcpy(find, ""); strcpy(find, "");
strcpy(repl, ""); strcpy(repl, "");
@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
if (wrdlen > MAXNORMLEN) if (wrdlen > MAXNORMLEN)
return NULL; return NULL;
strlower(word); lowerstr(word);
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
*cur = NULL; *cur = NULL;

View File

@ -8,6 +8,7 @@
#include "dict.h" #include "dict.h"
#include "common.h" #include "common.h"
#include "ts_locale.h"
#define CS_WAITKEY 0 #define CS_WAITKEY 0
#define CS_INKEY 1 #define CS_INKEY 1
@ -30,11 +31,11 @@ nstrdup(char *ptr, int len)
cptr = ptr = res; cptr = ptr = res;
while (*ptr) while (*ptr)
{ {
if (*ptr == '\\') if (t_iseq(ptr, '\\'))
ptr++; ptr++;
*cptr = *ptr; COPYCHAR( cptr, ptr );
ptr++; cptr+=pg_mblen(ptr);
cptr++; ptr+=pg_mblen(ptr);
} }
*cptr = '\0'; *cptr = '\0';
@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m)
while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ) while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
{ {
if (*ptr == ',') if ( t_iseq(ptr, ',') )
num++; num++;
ptr++; ptr+=pg_mblen(ptr);
} }
*m = mptr = (Map *) palloc(sizeof(Map) * (num + 2)); *m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m)
{ {
if (state == CS_WAITKEY) if (state == CS_WAITKEY)
{ {
if (isalpha((unsigned char) *ptr)) if (t_isalpha(ptr))
{ {
begin = ptr; begin = ptr;
state = CS_INKEY; state = CS_INKEY;
} }
else if (!isspace((unsigned char) *ptr)) else if (!t_isspace(ptr))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"), errmsg("syntax error"),
errdetail("Syntax error in position %d near \"%c\"", errdetail("Syntax error in position %d",
(int) (ptr - VARDATA(in)), *ptr))); (int) (ptr - VARDATA(in)))));
} }
else if (state == CS_INKEY) else if (state == CS_INKEY)
{ {
if (isspace((unsigned char) *ptr)) if (t_isspace(ptr))
{ {
mptr->key = nstrdup(begin, ptr - begin); mptr->key = nstrdup(begin, ptr - begin);
state = CS_WAITEQ; state = CS_WAITEQ;
} }
else if (*ptr == '=') else if (t_iseq(ptr,'='))
{ {
mptr->key = nstrdup(begin, ptr - begin); mptr->key = nstrdup(begin, ptr - begin);
state = CS_WAITVALUE; state = CS_WAITVALUE;
} }
else if (!isalpha((unsigned char) *ptr)) else if (!t_isalpha(ptr))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"), errmsg("syntax error"),
errdetail("Syntax error in position %d near \"%c\"", errdetail("Syntax error in position %d",
(int) (ptr - VARDATA(in)), *ptr))); (int) (ptr - VARDATA(in)))));
} }
else if (state == CS_WAITEQ) else if (state == CS_WAITEQ)
{ {
if (*ptr == '=') if (t_iseq(ptr, '='))
state = CS_WAITVALUE; state = CS_WAITVALUE;
else if (!isspace((unsigned char) *ptr)) else if (!t_isspace(ptr))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"), errmsg("syntax error"),
errdetail("Syntax error in position %d near \"%c\"", errdetail("Syntax error in position %d",
(int) (ptr - VARDATA(in)), *ptr))); (int) (ptr - VARDATA(in)))));
} }
else if (state == CS_WAITVALUE) else if (state == CS_WAITVALUE)
{ {
if (*ptr == '"') if (t_iseq(ptr, '"'))
{ {
begin = ptr + 1; begin = ptr + 1;
state = CS_INVALUE; state = CS_INVALUE;
} }
else if (!isspace((unsigned char) *ptr)) else if (!t_isspace(ptr))
{ {
begin = ptr; begin = ptr;
state = CS_IN2VALUE; state = CS_IN2VALUE;
@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m)
} }
else if (state == CS_INVALUE) else if (state == CS_INVALUE)
{ {
if (*ptr == '"') if (t_iseq(ptr, '"'))
{ {
mptr->value = nstrdup(begin, ptr - begin); mptr->value = nstrdup(begin, ptr - begin);
mptr++; mptr++;
state = CS_WAITDELIM; state = CS_WAITDELIM;
} }
else if (*ptr == '\\') else if (t_iseq(ptr, '\\'))
state = CS_INESC; state = CS_INESC;
} }
else if (state == CS_IN2VALUE) else if (state == CS_IN2VALUE)
{ {
if (isspace((unsigned char) *ptr) || *ptr == ',') if (t_isspace(ptr) || t_iseq(ptr, ','))
{ {
mptr->value = nstrdup(begin, ptr - begin); mptr->value = nstrdup(begin, ptr - begin);
mptr++; mptr++;
state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM; state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
} }
else if (*ptr == '\\') else if (t_iseq(ptr, '\\'))
state = CS_INESC; state = CS_INESC;
} }
else if (state == CS_WAITDELIM) else if (state == CS_WAITDELIM)
{ {
if (*ptr == ',') if (t_iseq(ptr, ','))
state = CS_WAITKEY; state = CS_WAITKEY;
else if (!isspace((unsigned char) *ptr)) else if (!t_isspace(ptr))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"), errmsg("syntax error"),
errdetail("Syntax error in position %d near \"%c\"", errdetail("Syntax error in position %d",
(int) (ptr - VARDATA(in)), *ptr))); (int) (ptr - VARDATA(in)))));
} }
else if (state == CS_INESC) else if (state == CS_INESC)
state = CS_INVALUE; state = CS_INVALUE;
@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("bad parser state"), errmsg("bad parser state"),
errdetail("%d at position %d near \"%c\"", errdetail("%d at position %d",
state, (int) (ptr - VARDATA(in)), *ptr))); state, (int) (ptr - VARDATA(in)))));
ptr++; ptr+=pg_mblen(ptr);
} }
if (state == CS_IN2VALUE) if (state == CS_IN2VALUE)

View File

@ -25,7 +25,7 @@
#include "query.h" #include "query.h"
#include "query_cleanup.h" #include "query_cleanup.h"
#include "common.h" #include "common.h"
#include "ts_locale.h"
PG_FUNCTION_INFO_V1(tsquery_in); PG_FUNCTION_INFO_V1(tsquery_in);
Datum tsquery_in(PG_FUNCTION_ARGS); Datum tsquery_in(PG_FUNCTION_ARGS);
@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight)
{ {
*weight = 0; *weight = 0;
if (*buf != ':') if ( !t_iseq(buf, ':') )
return buf; return buf;
buf++; buf++;
while (*buf) while ( *buf && pg_mblen(buf) == 1 )
{ {
switch (tolower(*buf)) switch (*buf)
{ {
case 'a': case 'a':
case 'A':
*weight |= 1 << 3; *weight |= 1 << 3;
break; break;
case 'b': case 'b':
case 'B':
*weight |= 1 << 2; *weight |= 1 << 2;
break; break;
case 'c': case 'c':
case 'C':
*weight |= 1 << 1; *weight |= 1 << 1;
break; break;
case 'd': case 'd':
case 'D':
*weight |= 1; *weight |= 1;
break; break;
default: default:
@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
{ {
case WAITFIRSTOPERAND: case WAITFIRSTOPERAND:
case WAITOPERAND: case WAITOPERAND:
if (*(state->buf) == '!') if ( t_iseq(state->buf, '!') )
{ {
(state->buf)++; (state->buf)++; /* can safely ++, t_iseq guarantee that pg_mblen()==1 */
*val = (int4) '!'; *val = (int4) '!';
return OPR; return OPR;
} }
else if (*(state->buf) == '(') else if ( t_iseq(state->buf, '(') )
{ {
state->count++; state->count++;
(state->buf)++; (state->buf)++;
return OPEN; return OPEN;
} }
else if (*(state->buf) == ':') else if ( t_iseq(state->buf, ':') )
{ {
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("error at start of operand"))); errmsg("error at start of operand")));
} }
else if (*(state->buf) != ' ') else if ( !t_isspace(state->buf) )
{ {
state->valstate.prsbuf = state->buf; state->valstate.prsbuf = state->buf;
if (gettoken_tsvector(&(state->valstate))) if (gettoken_tsvector(&(state->valstate)))
@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
} }
break; break;
case WAITOPERATOR: case WAITOPERATOR:
if (*(state->buf) == '&' || *(state->buf) == '|') if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') )
{ {
state->state = WAITOPERAND; state->state = WAITOPERAND;
*val = (int4) *(state->buf); *val = (int4) *(state->buf);
(state->buf)++; (state->buf)++;
return OPR; return OPR;
} }
else if (*(state->buf) == ')') else if ( t_iseq(state->buf, ')') )
{ {
(state->buf)++; (state->buf)++;
state->count--; state->count--;
@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
} }
else if (*(state->buf) == '\0') else if (*(state->buf) == '\0')
return (state->count) ? ERR : END; return (state->count) ? ERR : END;
else if (*(state->buf) != ' ') else if ( !t_isspace(state->buf) )
return ERR; return ERR;
break; break;
case WAITSINGLEOPERAND: case WAITSINGLEOPERAND:
@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
return ERR; return ERR;
break; break;
} }
(state->buf)++; state->buf+=pg_mblen(state->buf);
} }
return END; return END;
} }
@ -697,8 +701,11 @@ static QUERYTYPE *
Datum Datum
tsquery_in(PG_FUNCTION_ARGS) tsquery_in(PG_FUNCTION_ARGS)
{ {
char * in = (char*)PG_GETARG_POINTER(0);
pg_verifymbstr( in, strlen(in), false);
SET_FUNCOID(); SET_FUNCOID();
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false)); PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false));
} }
/* /*
@ -732,20 +739,23 @@ infix(INFIX * in, bool first)
if (in->curpol->type == VAL) if (in->curpol->type == VAL)
{ {
char *op = in->op + in->curpol->distance; char *op = in->op + in->curpol->distance;
int clen;
RESIZEBUF(in, in->curpol->length * 2 + 2 + 5); RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5);
*(in->cur) = '\''; *(in->cur) = '\'';
in->cur++; in->cur++;
while (*op) while (*op)
{ {
if (*op == '\'') if ( t_iseq(op, '\'') )
{ {
*(in->cur) = '\\'; *(in->cur) = '\\';
in->cur++; in->cur++;
} }
*(in->cur) = *op; COPYCHAR(in->cur,op);
op++;
in->cur++; clen = pg_mblen(op);
op+=clen;
in->cur+=clen;
} }
*(in->cur) = '\''; *(in->cur) = '\'';
in->cur++; in->cur++;

View File

@ -4,7 +4,7 @@
#define BS_DEBUG #define BS_DEBUG
*/ */
#include "ts_locale.h"
/* /*
* item in polish notation with back link * item in polish notation with back link
* to left operand * to left operand
@ -38,7 +38,7 @@ typedef struct
#define GETQUERY(x) (ITEM*)( (char*)(x)+HDRSIZEQT ) #define GETQUERY(x) (ITEM*)( (char*)(x)+HDRSIZEQT )
#define GETOPERAND(x) ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) ) #define GETOPERAND(x) ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )
#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' ) #define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
#define END 0 #define END 0
#define ERR 1 #define ERR 1

View File

@ -10,22 +10,10 @@
#include "common.h" #include "common.h"
#include "dict.h" #include "dict.h"
#include "ts_locale.h"
#define STOPBUFLEN 4096 #define STOPBUFLEN 4096
char *
lowerstr(char *str)
{
char *ptr = str;
while (*ptr)
{
*ptr = tolower(*(unsigned char *) ptr);
ptr++;
}
return str;
}
void void
freestoplist(StopList * s) freestoplist(StopList * s)
{ {
@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s)
{ {
char sharepath[MAXPGPATH]; char sharepath[MAXPGPATH];
char *absfn; char *absfn;
#ifdef WIN32
char delim = '\\';
#else
char delim = '/';
#endif
get_share_path(my_exec_path, sharepath); get_share_path(my_exec_path, sharepath);
absfn = palloc(strlen(sharepath) + strlen(filename) + 2); absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
sprintf(absfn, "%s/%s", sharepath, filename); sprintf(absfn, "%s%c%s", sharepath, delim, filename);
pfree(filename); pfree(filename);
filename = absfn; filename = absfn;
} }

View File

@ -5,7 +5,9 @@
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
#if defined(TS_USE_WIDE) && defined(WIN32) #ifdef TS_USE_WIDE
#ifdef WIN32
size_t size_t
wchar2char(char *to, const wchar_t *from, size_t len) wchar2char(char *to, const wchar_t *from, size_t len)
@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len)
return mbstowcs(to, from, len); return mbstowcs(to, from, len);
} }
#endif /* WIN32 */
/*
 * _t_isalpha - test whether the multibyte character at ptr is alphabetic.
 *
 * The leading character of ptr is converted to a wide character with
 * char2wchar() and classified with iswalpha().  Returns non-zero when the
 * character is alphabetic and 0 otherwise.  The t_isalpha() macro calls
 * this only for characters whose pg_mblen() is not 1.
 */
int
_t_isalpha( char *ptr ) {
	wchar_t character = 0;

	/*
	 * mbstowcs() (which char2wchar maps to outside WIN32) reports an
	 * invalid sequence as (size_t) -1 and does not store a result.  Treat
	 * that as "not alphabetic" rather than classifying an indeterminate
	 * value.
	 */
	if (char2wchar(&character, ptr, 1) == (size_t) -1)
		return 0;

	return iswalpha( (wint_t)character );
}
/*
 * _t_isprint - test whether the multibyte character at ptr is printable.
 *
 * The leading character of ptr is converted to a wide character with
 * char2wchar() and classified with iswprint().  Returns non-zero when the
 * character is printable and 0 otherwise.  The t_isprint() macro calls
 * this only for characters whose pg_mblen() is not 1.
 */
int
_t_isprint( char *ptr ) {
	wchar_t character = 0;

	/*
	 * mbstowcs() (which char2wchar maps to outside WIN32) reports an
	 * invalid sequence as (size_t) -1 and does not store a result.  Treat
	 * that as "not printable" rather than classifying an indeterminate
	 * value.
	 */
	if (char2wchar(&character, ptr, 1) == (size_t) -1)
		return 0;

	return iswprint( (wint_t)character );
}
#endif /* TS_USE_WIDE */
/*
 * lowerstr - lower-case a NUL-terminated string in place.
 *
 * str is modified in place and is also the return value.
 *
 * With TS_USE_WIDE, a multibyte database encoding and a non-C ctype
 * locale, the string is converted to wide characters, lowered with
 * towlower() and converted back.  Otherwise each byte is lowered with
 * tolower().
 *
 * The result never occupies more bytes than the input.  If the lowered
 * form would be longer, it is cut at a whole-character boundary.  If
 * either conversion fails, str is left untouched.
 */
char *
lowerstr(char *str)
{
	char	   *ptr = str;

#ifdef TS_USE_WIDE
	/*
	 * Use wide char code only when max encoding length > 1 and ctype != C.
	 * Some operating systems fail with multi-byte encodings and a C locale.
	 * Also, for a C locale there is no need to process as multibyte. From
	 * backend/utils/adt/oracle_compat.c Teodor
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
	{
		size_t		len = strlen(str);
		wchar_t    *wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
		size_t		wlen = char2wchar(wstr, str, len + 1);

		/*
		 * (size_t) -1 signals an invalid multibyte sequence; wstr is then
		 * not a usable string and must not be scanned.  The wlen <= len
		 * test keeps the explicit terminator below inside the allocation.
		 */
		if (wlen != (size_t) -1 && wlen <= len)
		{
			char	   *lowered = (char *) palloc(len + 1);
			size_t		nbytes;
			size_t		i;

			wstr[wlen] = 0;
			for (i = 0; i < wlen; i++)
				wstr[i] = towlower((wint_t) wstr[i]);

			/*
			 * Convert into scratch space first so that str is only
			 * touched on success.  wcstombs() writes no terminator once
			 * the byte limit is reached, so terminate explicitly at the
			 * number of bytes actually produced.
			 */
			nbytes = wchar2char(lowered, wstr, len);
			if (nbytes != (size_t) -1 && nbytes <= len)
			{
				memcpy(str, lowered, nbytes);
				str[nbytes] = '\0';
			}
			pfree(lowered);
		}
		pfree(wstr);
		return str;
	}
#endif

	/* Single-byte encoding or C locale: lower byte by byte. */
	while (*ptr)
	{
		*ptr = tolower(*(unsigned char *) ptr);
		ptr++;
	}
	return str;
}

View File

@ -2,6 +2,8 @@
#define __TSLOCALE_H__ #define __TSLOCALE_H__
#include "postgres.h" #include "postgres.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#include <ctype.h> #include <ctype.h>
#include <limits.h> #include <limits.h>
@ -19,18 +21,58 @@
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
#define TS_USE_WIDE #define TS_USE_WIDE
#endif
#ifdef TS_USE_WIDE
#endif /* TS_USE_WIDE */
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
#ifdef WIN32 #ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len); size_t wchar2char(char *to, const wchar_t *from, size_t len);
size_t char2wchar(wchar_t *to, const char *from, size_t len); size_t char2wchar(wchar_t *to, const char *from, size_t len);
#else /* WIN32 */ #else /* WIN32 */
/* correct mbstowcs */ /* correct mbstowcs */
#define char2wchar mbstowcs #define char2wchar mbstowcs
#define wchar2char wcstombs #define wchar2char wcstombs
#endif /* WIN32 */ #endif /* WIN32 */
#endif /* defined(HAVE_WCSTOMBS) &&
* defined(HAVE_TOWLOWER) */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
int _t_isalpha( char *ptr );
#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
int _t_isprint( char *ptr );
#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
/*
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
/*
 * COPYCHAR(d, s) - copy the single (possibly multibyte) character at s to d.
 *
 * The number of bytes copied is pg_mblen(s); bytes are copied from last to
 * first.  Neither pointer is advanced, so a caller that wants to move on
 * must step d and s itself.  Both arguments are evaluated more than once,
 * so they must not have side effects.
 */
#define COPYCHAR(d,s) do { \
int lll = pg_mblen( s ); \
\
while( lll-- ) \
TOUCHAR(d+lll) = TOUCHAR(s+lll); \
} while(0)
#else /* not def TS_USE_WIDE */
#define t_isdigit(x) isdigit( TOUCHAR(x) )
#define t_isspace(x) isspace( TOUCHAR(x) )
#define t_isalpha(x) isalpha( TOUCHAR(x) )
#define t_isprint(x) isprint( TOUCHAR(x) )
#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) )
#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s)
#endif
char* lowerstr(char *str);
#endif /* __TSLOCALE_H__ */ #endif /* __TSLOCALE_H__ */

View File

@ -8,6 +8,7 @@
#include "catalog/pg_type.h" #include "catalog/pg_type.h"
#include "executor/spi.h" #include "executor/spi.h"
#include "common.h" #include "common.h"
#include "ts_locale.h"
PG_FUNCTION_INFO_V1(tsstat_in); PG_FUNCTION_INFO_V1(tsstat_in);
Datum tsstat_in(PG_FUNCTION_ARGS); Datum tsstat_in(PG_FUNCTION_ARGS);
@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws)
buf = VARDATA(ws); buf = VARDATA(ws);
while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ) while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
{ {
switch (tolower(*buf)) if ( pg_mblen(buf) == 1 ) {
{ switch (*buf)
case 'a': {
stat->weight |= 1 << 3; case 'A':
break; case 'a':
case 'b': stat->weight |= 1 << 3;
stat->weight |= 1 << 2; break;
break; case 'B':
case 'c': case 'b':
stat->weight |= 1 << 1; stat->weight |= 1 << 2;
break; break;
case 'd': case 'C':
stat->weight |= 1; case 'c':
break; stat->weight |= 1 << 1;
default: break;
stat->weight |= 0; case 'D':
case 'd':
stat->weight |= 1;
break;
default:
stat->weight |= 0;
}
} }
buf++; buf+=pg_mblen(buf);
} }
} }

View File

@ -16,8 +16,9 @@
#include "catalog/namespace.h" #include "catalog/namespace.h"
#include "utils/pg_locale.h" #include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#include <ctype.h> /* tolower */ #include <ctype.h>
#include "tsvector.h" #include "tsvector.h"
#include "query.h" #include "query.h"
#include "ts_cfg.h" #include "ts_cfg.h"
@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
#define RESIZEPRSBUF \ #define RESIZEPRSBUF \
do { \ do { \
if ( state->curpos - state->word + 1 >= state->len ) \ if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
{ \ { \
int4 clen = state->curpos - state->word; \ int4 clen = state->curpos - state->word; \
state->len *= 2; \ state->len *= 2; \
@ -182,6 +183,7 @@ do { \
} \ } \
} while (0) } while (0)
int4 int4
gettoken_tsvector(TI_IN_STATE * state) gettoken_tsvector(TI_IN_STATE * state)
{ {
@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state)
{ {
if (*(state->prsbuf) == '\0') if (*(state->prsbuf) == '\0')
return 0; return 0;
else if (*(state->prsbuf) == '\'') else if ( t_iseq(state->prsbuf, '\'') )
state->state = WAITENDCMPLX; state->state = WAITENDCMPLX;
else if (*(state->prsbuf) == '\\') else if ( t_iseq(state->prsbuf, '\\') )
{ {
state->state = WAITNEXTCHAR; state->state = WAITNEXTCHAR;
oldstate = WAITENDWORD; oldstate = WAITENDWORD;
} }
else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf))) else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"))); errmsg("syntax error")));
else if (*(state->prsbuf) != ' ') else if (!t_isspace(state->prsbuf))
{ {
*(state->curpos) = *(state->prsbuf); COPYCHAR(state->curpos, state->prsbuf);
state->curpos++; state->curpos+=pg_mblen(state->prsbuf);
state->state = WAITENDWORD; state->state = WAITENDWORD;
} }
} }
@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state)
else else
{ {
RESIZEPRSBUF; RESIZEPRSBUF;
*(state->curpos) = *(state->prsbuf); COPYCHAR(state->curpos, state->prsbuf);
state->curpos++; state->curpos+=pg_mblen(state->prsbuf);
state->state = oldstate; state->state = oldstate;
} }
} }
else if (state->state == WAITENDWORD) else if (state->state == WAITENDWORD)
{ {
if (*(state->prsbuf) == '\\') if ( t_iseq(state->prsbuf, '\\') )
{ {
state->state = WAITNEXTCHAR; state->state = WAITNEXTCHAR;
oldstate = WAITENDWORD; oldstate = WAITENDWORD;
} }
else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' || else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
(state->oprisdelim && ISOPERATOR(*(state->prsbuf)))) (state->oprisdelim && ISOPERATOR(state->prsbuf)))
{ {
RESIZEPRSBUF; RESIZEPRSBUF;
if (state->curpos == state->word) if (state->curpos == state->word)
@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state)
*(state->curpos) = '\0'; *(state->curpos) = '\0';
return 1; return 1;
} }
else if (*(state->prsbuf) == ':') else if ( t_iseq(state->prsbuf,':') )
{ {
if (state->curpos == state->word) if (state->curpos == state->word)
ereport(ERROR, ereport(ERROR,
@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state)
else else
{ {
RESIZEPRSBUF; RESIZEPRSBUF;
*(state->curpos) = *(state->prsbuf); COPYCHAR(state->curpos, state->prsbuf);
state->curpos++; state->curpos+=pg_mblen(state->prsbuf);
} }
} }
else if (state->state == WAITENDCMPLX) else if (state->state == WAITENDCMPLX)
{ {
if (*(state->prsbuf) == '\'') if ( t_iseq(state->prsbuf, '\'') )
{ {
RESIZEPRSBUF; RESIZEPRSBUF;
*(state->curpos) = '\0'; *(state->curpos) = '\0';
@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state)
errmsg("syntax error"))); errmsg("syntax error")));
if (state->oprisdelim) if (state->oprisdelim)
{ {
state->prsbuf++; state->prsbuf+=pg_mblen(state->prsbuf);
return 1; return 1;
} }
else else
state->state = WAITPOSINFO; state->state = WAITPOSINFO;
} }
else if (*(state->prsbuf) == '\\') else if ( t_iseq(state->prsbuf, '\\') )
{ {
state->state = WAITNEXTCHAR; state->state = WAITNEXTCHAR;
oldstate = WAITENDCMPLX; oldstate = WAITENDCMPLX;
@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state)
else else
{ {
RESIZEPRSBUF; RESIZEPRSBUF;
*(state->curpos) = *(state->prsbuf); COPYCHAR(state->curpos, state->prsbuf);
state->curpos++; state->curpos+=pg_mblen(state->prsbuf);
} }
} }
else if (state->state == WAITPOSINFO) else if (state->state == WAITPOSINFO)
{ {
if (*(state->prsbuf) == ':') if ( t_iseq(state->prsbuf, ':') )
state->state = INPOSINFO; state->state = INPOSINFO;
else else
return 1; return 1;
} }
else if (state->state == INPOSINFO) else if (state->state == INPOSINFO)
{ {
if (isdigit((unsigned char) *(state->prsbuf))) if (t_isdigit(state->prsbuf))
{ {
if (state->alen == 0) if (state->alen == 0)
{ {
@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state)
} }
else if (state->state == WAITPOSDELIM) else if (state->state == WAITPOSDELIM)
{ {
if (*(state->prsbuf) == ',') if ( t_iseq(state->prsbuf, ',') )
state->state = INPOSINFO; state->state = INPOSINFO;
else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*') else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') )
{ {
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR, ereport(ERROR,
@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state)
errmsg("syntax error"))); errmsg("syntax error")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
} }
else if (tolower(*(state->prsbuf)) == 'b') else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') )
{ {
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR, ereport(ERROR,
@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state)
errmsg("syntax error"))); errmsg("syntax error")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
} }
else if (tolower(*(state->prsbuf)) == 'c') else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') )
{ {
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR, ereport(ERROR,
@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state)
errmsg("syntax error"))); errmsg("syntax error")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
} }
else if (tolower(*(state->prsbuf)) == 'd') else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') )
{ {
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
ereport(ERROR, ereport(ERROR,
@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state)
errmsg("syntax error"))); errmsg("syntax error")));
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
} }
else if (isspace((unsigned char) *(state->prsbuf)) || else if (t_isspace(state->prsbuf) ||
*(state->prsbuf) == '\0') *(state->prsbuf) == '\0')
return 1; return 1;
else if (!isdigit((unsigned char) *(state->prsbuf))) else if (!t_isdigit(state->prsbuf))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR), (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error"))); errmsg("syntax error")));
@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state)
else else
/* internal error */ /* internal error */
elog(ERROR, "internal error"); elog(ERROR, "internal error");
state->prsbuf++; state->prsbuf+=pg_mblen(state->prsbuf);
} }
return 0; return 0;
@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS)
buflen = 256; buflen = 256;
SET_FUNCOID(); SET_FUNCOID();
pg_verifymbstr( buf, strlen(buf), false );
state.prsbuf = buf; state.prsbuf = buf;
state.len = 32; state.len = 32;
state.word = (char *) palloc(state.len); state.word = (char *) palloc(state.len);
@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS)
tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
char *outbuf; char *outbuf;
int4 i, int4 i,
j,
lenbuf = 0, lenbuf = 0,
pp; pp;
WordEntry *ptr = ARRPTR(out); WordEntry *ptr = ARRPTR(out);
char *curin, char *curbegin, *curin,
*curout; *curout;
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++) for (i = 0; i < out->size; i++)
{ {
lenbuf += ptr[i].len * 2 /* for escape */ ; lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ;
if (ptr[i].haspos) if (ptr[i].haspos)
lenbuf += 7 * POSDATALEN(out, &(ptr[i])); lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
} }
@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS)
curout = outbuf = (char *) palloc(lenbuf); curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++) for (i = 0; i < out->size; i++)
{ {
curin = STRPTR(out) + ptr->pos; curbegin = curin = STRPTR(out) + ptr->pos;
if (i != 0) if (i != 0)
*curout++ = ' '; *curout++ = ' ';
*curout++ = '\''; *curout++ = '\'';
j = ptr->len; while ( curin-curbegin < ptr->len )
while (j--)
{ {
if (*curin == '\'') int len = pg_mblen(curin);
if ( t_iseq(curin, '\'') )
{ {
int4 pos = curout - outbuf; int4 pos = curout - outbuf;
@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS)
curout = outbuf + pos; curout = outbuf + pos;
*curout++ = '\\'; *curout++ = '\\';
} }
*curout++ = *curin++; while(len--)
*curout++ = *curin++;
} }
*curout++ = '\''; *curout++ = '\'';
if ((pp = POSDATALEN(out, ptr)) != 0) if ((pp = POSDATALEN(out, ptr)) != 0)

View File

@ -15,7 +15,6 @@
#include "utils/pg_locale.h" #include "utils/pg_locale.h"
#include <ctype.h> /* tolower */
#include "tsvector.h" #include "tsvector.h"
#include "query.h" #include "query.h"
#include "ts_cfg.h" #include "ts_cfg.h"
@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS)
WordEntryPos *p; WordEntryPos *p;
int w = 0; int w = 0;
switch (tolower(cw)) switch (cw)
{ {
case 'A':
case 'a': case 'a':
w = 3; w = 3;
break; break;
case 'B':
case 'b': case 'b':
w = 2; w = 2;
break; break;
case 'C':
case 'c': case 'c':
w = 1; w = 1;
break; break;
case 'D':
case 'd': case 'd':
w = 0; w = 0;
break; break;

View File

@ -71,8 +71,11 @@ TParserClose(TParser * prs)
prs->state = ptr; prs->state = ptr;
} }
#ifdef TS_USE_WIDE
if (prs->wstr) if (prs->wstr)
pfree(prs->wstr); pfree(prs->wstr);
#endif
pfree(prs); pfree(prs);
} }

View File

@ -134,8 +134,10 @@ typedef struct TParser
/* string and position information */ /* string and position information */
char *str; /* multibyte string */ char *str; /* multibyte string */
int lenstr; /* length of mbstring */ int lenstr; /* length of mbstring */
#ifdef TS_USE_WIDE
wchar_t *wstr; /* wide character string */ wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wsting */ int lenwstr; /* length of wsting */
#endif
/* State of parse */ /* State of parse */
int charmaxlen; int charmaxlen;