Improve support for multibyte encodings in:

- tsvector_(in|out)
- tsquery_(in|out)
- to_tsvector
- to_tsquery, plainto_tsquery
- 'simple' dictionary
2005-12-12 11:10:12 +00:00 · 2005-12-12 11:10:12 +00:00 · cb4ea994c6
parent ec0baf949e
commit cb4ea994c6
19 changed files with 263 additions and 146 deletions
--- a/contrib/tsearch2/dict.h
+++ b/contrib/tsearch2/dict.h
@ -14,7 +14,6 @@ void		sortstoplist(StopList * s);
 void		freestoplist(StopList * s);
 void		readstoplist(text *in, StopList * s);
 bool		searchstoplist(StopList * s, char *key);
 char	   *lowerstr(char *str);
 typedef struct
 {
--- a/contrib/tsearch2/dict_ex.c
+++ b/contrib/tsearch2/dict_ex.c
@ -6,6 +6,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ts_locale.h"
 typedef struct
 {
--- a/contrib/tsearch2/dict_ispell.c
+++ b/contrib/tsearch2/dict_ispell.c
@ -9,6 +9,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ispell/spell.h"
 #include "ts_locale.h"
 typedef struct
 {
--- a/contrib/tsearch2/dict_snowball.c
+++ b/contrib/tsearch2/dict_snowball.c
@ -10,6 +10,7 @@
 #include "snowball/header.h"
 #include "snowball/english_stem.h"
 #include "snowball/russian_stem.h"
 #include "ts_locale.h"
 typedef struct
 {
--- a/contrib/tsearch2/dict_syn.c
+++ b/contrib/tsearch2/dict_syn.c
@ -8,6 +8,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ts_locale.h"
 #define SYNBUFLEN	4096
 typedef struct
--- a/contrib/tsearch2/gendict/dict_snowball.c.IN
+++ b/contrib/tsearch2/gendict/dict_snowball.c.IN
@ -12,6 +12,7 @@
 #include "common.h"
 #include "snowball/header.h"
 #include "subinclude.h"
 #include "ts_locale.h"
 typedef struct {
 	struct SN_env *z;
--- a/contrib/tsearch2/gendict/dict_tmpl.c.IN
+++ b/contrib/tsearch2/gendict/dict_tmpl.c.IN
@ -12,6 +12,7 @@
 #include "common.h"
 #include "subinclude.h"
 #include "ts_locale.h"
 HASINIT typedef struct {
 HASINIT 	StopList	stoplist;
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@ -6,6 +6,7 @@
 #include "postgres.h"
 #include "spell.h"
 #include "ts_locale.h"
 #define MAX_NORM 1024
 #define MAXNORMLEN 256
@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2)
 	return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
 }
 /*
  * strlower: lower-case a NUL-terminated string in place.
  *
  * Each byte is passed through tolower() individually, viewed as an
  * unsigned char so the argument is always in tolower()'s valid range.
  */
 static void
 strlower(char *str)
 {
 	unsigned char *cursor;
 	for (cursor = (unsigned char *) str; *cursor != '\0'; cursor++)
 		*cursor = tolower(*cursor);
 }
 static char *
 strnduplicate(char *s, int len)
 {
@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
 		}
 		else
 			flag = "";
-		strlower(str);
+		lowerstr(str);
 		/* Don't load words if first letter is not required */
 		/* It allows optimizing loading at search time */
 		s = str;
@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 			*s = 0;
 		if (!*str)
 			continue;
-		strlower(str);
+		lowerstr(str);
 		strcpy(mask, "");
 		strcpy(find, "");
 		strcpy(repl, "");
@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
 	if (wrdlen > MAXNORMLEN)
 		return NULL;
-	strlower(word);
+	lowerstr(word);
 	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
 	*cur = NULL;
--- a/contrib/tsearch2/prs_dcfg.c
+++ b/contrib/tsearch2/prs_dcfg.c
@ -8,6 +8,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ts_locale.h"
 #define CS_WAITKEY	0
 #define CS_INKEY	1
@ -30,11 +31,11 @@ nstrdup(char *ptr, int len)
 	cptr = ptr = res;
 	while (*ptr)
 	{
-		if (*ptr == '\\')
+		if (t_iseq(ptr, '\\'))
 			ptr++;
-		*cptr = *ptr;
+		COPYCHAR( cptr, ptr );
-		ptr++;
+		cptr+=pg_mblen(ptr);
-		cptr++;
+		ptr+=pg_mblen(ptr);
 	}
 	*cptr = '\0';
@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m)
 	while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
 	{
-		if (*ptr == ',')
+		if ( t_iseq(ptr, ',') )
 			num++;
-		ptr++;
+		ptr+=pg_mblen(ptr);
 	}
 	*m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m)
 	{
 		if (state == CS_WAITKEY)
 		{
-			if (isalpha((unsigned char) *ptr))
+			if (t_isalpha(ptr))
 			{
 				begin = ptr;
 				state = CS_INKEY;
 			}
-			else if (!isspace((unsigned char) *ptr))
+			else if (!t_isspace(ptr))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error"),
-						 errdetail("Syntax error in position %d near \"%c\"",
+						 errdetail("Syntax error in position %d",
-								   (int) (ptr - VARDATA(in)), *ptr)));
+								   (int) (ptr - VARDATA(in)))));
 		}
 		else if (state == CS_INKEY)
 		{
-			if (isspace((unsigned char) *ptr))
+			if (t_isspace(ptr))
 			{
 				mptr->key = nstrdup(begin, ptr - begin);
 				state = CS_WAITEQ;
 			}
-			else if (*ptr == '=')
+			else if (t_iseq(ptr,'='))
 			{
 				mptr->key = nstrdup(begin, ptr - begin);
 				state = CS_WAITVALUE;
 			}
-			else if (!isalpha((unsigned char) *ptr))
+			else if (!t_isalpha(ptr))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error"),
-						 errdetail("Syntax error in position %d near \"%c\"",
+						 errdetail("Syntax error in position %d",
-								   (int) (ptr - VARDATA(in)), *ptr)));
+								   (int) (ptr - VARDATA(in)))));
 		}
 		else if (state == CS_WAITEQ)
 		{
-			if (*ptr == '=')
+			if (t_iseq(ptr, '='))
 				state = CS_WAITVALUE;
-			else if (!isspace((unsigned char) *ptr))
+			else if (!t_isspace(ptr))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error"),
-						 errdetail("Syntax error in position %d near \"%c\"",
+						 errdetail("Syntax error in position %d",
-								   (int) (ptr - VARDATA(in)), *ptr)));
+								   (int) (ptr - VARDATA(in)))));
 		}
 		else if (state == CS_WAITVALUE)
 		{
-			if (*ptr == '"')
+			if (t_iseq(ptr, '"'))
 			{
 				begin = ptr + 1;
 				state = CS_INVALUE;
 			}
-			else if (!isspace((unsigned char) *ptr))
+			else if (!t_isspace(ptr))
 			{
 				begin = ptr;
 				state = CS_IN2VALUE;
@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m)
 		}
 		else if (state == CS_INVALUE)
 		{
-			if (*ptr == '"')
+			if (t_iseq(ptr, '"'))
 			{
 				mptr->value = nstrdup(begin, ptr - begin);
 				mptr++;
 				state = CS_WAITDELIM;
 			}
-			else if (*ptr == '\\')
+			else if (t_iseq(ptr, '\\'))
 				state = CS_INESC;
 		}
 		else if (state == CS_IN2VALUE)
 		{
-			if (isspace((unsigned char) *ptr) || *ptr == ',')
+			if (t_isspace(ptr) || t_iseq(ptr, ','))
 			{
 				mptr->value = nstrdup(begin, ptr - begin);
 				mptr++;
-				state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM;
+				state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
 			}
-			else if (*ptr == '\\')
+			else if (t_iseq(ptr, '\\'))
 				state = CS_INESC;
 		}
 		else if (state == CS_WAITDELIM)
 		{
-			if (*ptr == ',')
+			if (t_iseq(ptr, ','))
 				state = CS_WAITKEY;
-			else if (!isspace((unsigned char) *ptr))
+			else if (!t_isspace(ptr))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error"),
-						 errdetail("Syntax error in position %d near \"%c\"",
+						 errdetail("Syntax error in position %d",
-								   (int) (ptr - VARDATA(in)), *ptr)));
+								   (int) (ptr - VARDATA(in)))));
 		}
 		else if (state == CS_INESC)
 			state = CS_INVALUE;
@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("bad parser state"),
-					 errdetail("%d at position %d near \"%c\"",
+					 errdetail("%d at position %d",
-							   state, (int) (ptr - VARDATA(in)), *ptr)));
+							   state, (int) (ptr - VARDATA(in)))));
-		ptr++;
+		ptr+=pg_mblen(ptr);
 	}
 	if (state == CS_IN2VALUE)
--- a/contrib/tsearch2/query.c
+++ b/contrib/tsearch2/query.c
@ -25,7 +25,7 @@
 #include "query.h"
 #include "query_cleanup.h"
 #include "common.h"
-
+#include "ts_locale.h"
 PG_FUNCTION_INFO_V1(tsquery_in);
 Datum		tsquery_in(PG_FUNCTION_ARGS);
@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight)
 {
 	*weight = 0;
-	if (*buf != ':')
+	if ( !t_iseq(buf, ':') )
 		return buf;
 	buf++;
-	while (*buf)
+	while ( *buf && pg_mblen(buf) == 1 )
 	{
-		switch (tolower(*buf))
+		switch (*buf)
 		{
 			case 'a':
 			case 'A':
 				*weight |= 1 << 3;
 				break;
 			case 'b':
 			case 'B':
 				*weight |= 1 << 2;
 				break;
 			case 'c':
 			case 'C':
 				*weight |= 1 << 1;
 				break;
 			case 'd':
 			case 'D':
 				*weight |= 1;
 				break;
 			default:
@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
 		{
 			case WAITFIRSTOPERAND:
 			case WAITOPERAND:
-				if (*(state->buf) == '!')
+				if ( t_iseq(state->buf, '!') )
 				{
-					(state->buf)++;
+					(state->buf)++; /* can safely ++, t_iseq guarantee that pg_mblen()==1 */
 					*val = (int4) '!';
 					return OPR;
 				}
-				else if (*(state->buf) == '(')
+				else if ( t_iseq(state->buf, '(') )
 				{
 					state->count++;
 					(state->buf)++;
 					return OPEN;
 				}
-				else if (*(state->buf) == ':')
+				else if ( t_iseq(state->buf, ':') )
 				{
 					ereport(ERROR,
 							(errcode(ERRCODE_SYNTAX_ERROR),
 							 errmsg("error at start of operand")));
 				}
-				else if (*(state->buf) != ' ')
+				else if ( !t_isspace(state->buf) )
 				{
 					state->valstate.prsbuf = state->buf;
 					if (gettoken_tsvector(&(state->valstate)))
@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
 				}
 				break;
 			case WAITOPERATOR:
-				if (*(state->buf) == '&' || *(state->buf) == '|')
+				if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') )
 				{
 					state->state = WAITOPERAND;
 					*val = (int4) *(state->buf);
 					(state->buf)++;
 					return OPR;
 				}
-				else if (*(state->buf) == ')')
+				else if ( t_iseq(state->buf, ')') )
 				{
 					(state->buf)++;
 					state->count--;
@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
 				}
 				else if (*(state->buf) == '\0')
 					return (state->count) ? ERR : END;
-				else if (*(state->buf) != ' ')
+				else if ( !t_isspace(state->buf) )
 					return ERR;
 				break;
 			case WAITSINGLEOPERAND:
@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
 				return ERR;
 				break;
 		}
-		(state->buf)++;
+		state->buf+=pg_mblen(state->buf);
 	}
 	return END;
 }
@ -697,8 +701,11 @@ static QUERYTYPE *
 Datum
 tsquery_in(PG_FUNCTION_ARGS)
 {
 	char * in = (char*)PG_GETARG_POINTER(0);
 	pg_verifymbstr( in, strlen(in), false);
 	SET_FUNCOID();
-	PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
+	PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false));
 }
 /*
@ -732,20 +739,23 @@ infix(INFIX * in, bool first)
 	if (in->curpol->type == VAL)
 	{
 		char	   *op = in->op + in->curpol->distance;
 		int		clen;
-		RESIZEBUF(in, in->curpol->length * 2 + 2 + 5);
+		RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5);
 		*(in->cur) = '\'';
 		in->cur++;
 		while (*op)
 		{
-			if (*op == '\'')
+			if ( t_iseq(op, '\'') )
 			{
 				*(in->cur) = '\\';
 				in->cur++;
 			}
-			*(in->cur) = *op;
+			COPYCHAR(in->cur,op);
-			op++;
+
-			in->cur++;
+			clen = pg_mblen(op);
 			op+=clen;
 			in->cur+=clen;
 		}
 		*(in->cur) = '\'';
 		in->cur++;
--- a/contrib/tsearch2/query.h
+++ b/contrib/tsearch2/query.h
@ -4,7 +4,7 @@
 #define BS_DEBUG
 */
-
+#include "ts_locale.h"
 /*
 * item in polish notation with back link
 * to left operand
@ -38,7 +38,7 @@ typedef struct
 #define GETQUERY(x)  (ITEM*)( (char*)(x)+HDRSIZEQT )
 #define GETOPERAND(x)	( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )
-#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' )
+#define ISOPERATOR(x) (  pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
 #define END				0
 #define ERR				1
--- a/contrib/tsearch2/stopword.c
+++ b/contrib/tsearch2/stopword.c
@ -10,22 +10,10 @@
 #include "common.h"
 #include "dict.h"
 #include "ts_locale.h"
 #define STOPBUFLEN	4096
 /*
  * lowerstr: lower-case a NUL-terminated string in place, one byte at a
  * time via tolower(), and return the same pointer that was passed in.
  */
 char *
 lowerstr(char *str)
 {
 	size_t		i;
 	for (i = 0; str[i] != '\0'; i++)
 		str[i] = tolower((unsigned char) str[i]);
 	return str;
 }
 void
 freestoplist(StopList * s)
 {
@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s)
 		{
 			char		sharepath[MAXPGPATH];
 			char	   *absfn;
 #ifdef	WIN32
 			char	delim = '\\';
 #else
 			char 	delim = '/';
 #endif
 			get_share_path(my_exec_path, sharepath);
 			absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
-			sprintf(absfn, "%s/%s", sharepath, filename);
+			sprintf(absfn, "%s%c%s", sharepath, delim, filename);
 			pfree(filename);
 			filename = absfn;
 		}
--- a/contrib/tsearch2/ts_locale.c
+++ b/contrib/tsearch2/ts_locale.c
@ -5,7 +5,9 @@
 #include "mb/pg_wchar.h"
-#if defined(TS_USE_WIDE) && defined(WIN32)
+#ifdef TS_USE_WIDE
 #ifdef WIN32
 size_t
 wchar2char(char *to, const wchar_t *from, size_t len)
@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len)
 	return mbstowcs(to, from, len);
 }
 #endif /* WIN32 */
 /*
  * _t_isalpha: classify the character at ptr with iswalpha() after
  * converting it to a wide character via char2wchar().
  *
  * Returns the iswalpha() result, or 0 when char2wchar() reports a
  * conversion error ((size_t) -1).  The wide character is initialized so
  * that iswalpha() is never handed an indeterminate value if the
  * conversion writes nothing.
  */
 int
 _t_isalpha( char *ptr ) {
 	wchar_t	character = 0;
 	if ( char2wchar(&character, ptr, 1) == (size_t) -1 )
 		return 0;
 	return iswalpha( (wint_t)character );
 }
 /*
  * _t_isprint: classify the character at ptr with iswprint() after
  * converting it to a wide character via char2wchar().
  *
  * Returns the iswprint() result, or 0 when char2wchar() reports a
  * conversion error ((size_t) -1).  The wide character is initialized so
  * that iswprint() is never handed an indeterminate value if the
  * conversion writes nothing.
  */
 int
 _t_isprint( char *ptr ) {
 	wchar_t	character = 0;
 	if ( char2wchar(&character, ptr, 1) == (size_t) -1 )
 		return 0;
 	return iswprint( (wint_t)character );
 }
 #endif /* TS_USE_WIDE */
 /*
  * lowerstr: lower-case a NUL-terminated string in place and return it.
  *
  * When TS_USE_WIDE is defined, the database encoding is multibyte and
  * LC_CTYPE is not "C", the string is converted to wide characters,
  * folded with towlower(), and converted back into the caller's buffer.
  * Otherwise each byte is folded with tolower().
  */
 char *
 lowerstr(char *str)
 {
 	char       *cursor;
 #ifdef TS_USE_WIDE
 	/*
 	 * Take the wide-character path only when max encoding length > 1 and
 	 * ctype != C.  Some operating systems fail with multi-byte encodings
 	 * and a C locale, and for a C locale there is no need to process the
 	 * string as multibyte.  (Approach taken from
 	 * backend/utils/adt/oracle_compat.c.)
 	 */
 	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 	{
 		int			nbytes = strlen(str);
 		wchar_t    *wide = (wchar_t *) palloc(sizeof(wchar_t) * (nbytes + 1));
 		wchar_t    *wcur;
 		char2wchar(wide, str, nbytes + 1);
 		for (wcur = wide; *wcur; wcur++)
 			*wcur = towlower((wint_t) *wcur);
 		/* write back into the original buffer, limited to its byte length */
 		wchar2char(str, wide, nbytes);
 		pfree(wide);
 		return str;
 	}
 #endif
 	for (cursor = str; *cursor; cursor++)
 		*cursor = tolower(*(unsigned char *) cursor);
 	return str;
 }
--- a/contrib/tsearch2/ts_locale.h
+++ b/contrib/tsearch2/ts_locale.h
@ -2,6 +2,8 @@
 #define __TSLOCALE_H__
 #include "postgres.h"
 #include "utils/pg_locale.h"
 #include "mb/pg_wchar.h"
 #include <ctype.h>
 #include <limits.h>
@ -19,18 +21,58 @@
 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
 #define TS_USE_WIDE
 #endif
 #ifdef TS_USE_WIDE
 #endif   /* TS_USE_WIDE */
 #define TOUCHAR(x)	(*((unsigned char*)(x)))
 #ifdef TS_USE_WIDE
 #ifdef WIN32
 size_t		wchar2char(char *to, const wchar_t *from, size_t len);
 size_t		char2wchar(wchar_t *to, const char *from, size_t len);
-#else							/* WIN32 */
+#else    /* WIN32 */
 /* correct mbstowcs */
 #define char2wchar mbstowcs
 #define wchar2char wcstombs
 #endif   /* WIN32 */
-#endif   /* defined(HAVE_WCSTOMBS) &&
+
-								 * defined(HAVE_TOWLOWER) */
+#define	t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
 #define	t_isspace(x)	( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
 int _t_isalpha( char *ptr );
 #define	t_isalpha(x)	( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
 int _t_isprint( char *ptr );
 #define	t_isprint(x)	( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
 /*
 * t_iseq() should be called only for ASCII symbols 
 */
 #define t_iseq(x,c)	( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) 
 #define COPYCHAR(d,s)	do {				\
 	int lll = pg_mblen( s );			\
 							\
 	while( lll-- ) 					\
 		TOUCHAR(d+lll) = TOUCHAR(s+lll);	\
 } while(0)
 #else /* not def TS_USE_WIDE */
 #define t_isdigit(x) 	isdigit( TOUCHAR(x) )
 #define t_isspace(x) 	isspace( TOUCHAR(x) )
 #define t_isalpha(x) 	isalpha( TOUCHAR(x) )
 #define t_isprint(x) 	isprint( TOUCHAR(x) )
 #define t_iseq(x,c)	( TOUCHAR(x) == ((unsigned char)(c)) )
 #define COPYCHAR(d,s)	TOUCHAR(d) = TOUCHAR(s) 
 #endif
 char* lowerstr(char *str);
 #endif   /* __TSLOCALE_H__ */
--- a/contrib/tsearch2/ts_stat.c
+++ b/contrib/tsearch2/ts_stat.c
@ -8,6 +8,7 @@
 #include "catalog/pg_type.h"
 #include "executor/spi.h"
 #include "common.h"
 #include "ts_locale.h"
 PG_FUNCTION_INFO_V1(tsstat_in);
 Datum		tsstat_in(PG_FUNCTION_ARGS);
@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws)
 		buf = VARDATA(ws);
 		while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
 		{
-			switch (tolower(*buf))
+			if ( pg_mblen(buf) == 1 ) {
-			{
+				switch (*buf)
-				case 'a':
+				{
-					stat->weight |= 1 << 3;
+					case 'A':
-					break;
+					case 'a':
-				case 'b':
+						stat->weight |= 1 << 3;
-					stat->weight |= 1 << 2;
+						break;
-					break;
+					case 'B':
-				case 'c':
+					case 'b':
-					stat->weight |= 1 << 1;
+						stat->weight |= 1 << 2;
-					break;
+						break;
-				case 'd':
+					case 'C':
-					stat->weight |= 1;
+					case 'c':
-					break;
+						stat->weight |= 1 << 1;
-				default:
+						break;
-					stat->weight |= 0;
+					case 'D':
 					case 'd':
 						stat->weight |= 1;
 						break;
 					default:
 						stat->weight |= 0;
 				}
 			}
-			buf++;
+			buf+=pg_mblen(buf);
 		}
 	}
--- a/contrib/tsearch2/tsvector.c
+++ b/contrib/tsearch2/tsvector.c
@ -16,8 +16,9 @@
 #include "catalog/namespace.h"
 #include "utils/pg_locale.h"
 #include "mb/pg_wchar.h"
-#include <ctype.h>				/* tolower */
+#include <ctype.h>
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 #define RESIZEPRSBUF \
 do { \
-	if ( state->curpos - state->word + 1 >= state->len ) \
+	if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
 	{ \
 		int4 clen = state->curpos - state->word; \
 		state->len *= 2; \
@ -182,6 +183,7 @@ do { \
 	} \
 } while (0)
 int4
 gettoken_tsvector(TI_IN_STATE * state)
 {
@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state)
 		{
 			if (*(state->prsbuf) == '\0')
 				return 0;
-			else if (*(state->prsbuf) == '\'')
+			else if ( t_iseq(state->prsbuf, '\'') )
 				state->state = WAITENDCMPLX;
-			else if (*(state->prsbuf) == '\\')
+			else if ( t_iseq(state->prsbuf, '\\') )
 			{
 				state->state = WAITNEXTCHAR;
 				oldstate = WAITENDWORD;
 			}
-			else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
+			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error")));
-			else if (*(state->prsbuf) != ' ')
+			else if (!t_isspace(state->prsbuf))
 			{
-				*(state->curpos) = *(state->prsbuf);
+				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos++;
+				state->curpos+=pg_mblen(state->prsbuf);
 				state->state = WAITENDWORD;
 			}
 		}
@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state)
 			else
 			{
 				RESIZEPRSBUF;
-				*(state->curpos) = *(state->prsbuf);
+				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos++;
+				state->curpos+=pg_mblen(state->prsbuf);
 				state->state = oldstate;
 			}
 		}
 		else if (state->state == WAITENDWORD)
 		{
-			if (*(state->prsbuf) == '\\')
+			if ( t_iseq(state->prsbuf, '\\') )
 			{
 				state->state = WAITNEXTCHAR;
 				oldstate = WAITENDWORD;
 			}
-			else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
+			else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
-					 (state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
+					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
 			{
 				RESIZEPRSBUF;
 				if (state->curpos == state->word)
@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state)
 				*(state->curpos) = '\0';
 				return 1;
 			}
-			else if (*(state->prsbuf) == ':')
+			else if ( t_iseq(state->prsbuf,':') )
 			{
 				if (state->curpos == state->word)
 					ereport(ERROR,
@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state)
 			else
 			{
 				RESIZEPRSBUF;
-				*(state->curpos) = *(state->prsbuf);
+				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos++;
+				state->curpos+=pg_mblen(state->prsbuf);
 			}
 		}
 		else if (state->state == WAITENDCMPLX)
 		{
-			if (*(state->prsbuf) == '\'')
+			if ( t_iseq(state->prsbuf, '\'') )
 			{
 				RESIZEPRSBUF;
 				*(state->curpos) = '\0';
@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state)
 							 errmsg("syntax error")));
 				if (state->oprisdelim)
 				{
-					state->prsbuf++;
+					state->prsbuf+=pg_mblen(state->prsbuf);
 					return 1;
 				}
 				else
 					state->state = WAITPOSINFO;
 			}
-			else if (*(state->prsbuf) == '\\')
+			else if ( t_iseq(state->prsbuf, '\\') )
 			{
 				state->state = WAITNEXTCHAR;
 				oldstate = WAITENDCMPLX;
@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state)
 			else
 			{
 				RESIZEPRSBUF;
-				*(state->curpos) = *(state->prsbuf);
+				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos++;
+				state->curpos+=pg_mblen(state->prsbuf);
 			}
 		}
 		else if (state->state == WAITPOSINFO)
 		{
-			if (*(state->prsbuf) == ':')
+			if ( t_iseq(state->prsbuf, ':') )
 				state->state = INPOSINFO;
 			else
 				return 1;
 		}
 		else if (state->state == INPOSINFO)
 		{
-			if (isdigit((unsigned char) *(state->prsbuf)))
+			if (t_isdigit(state->prsbuf))
 			{
 				if (state->alen == 0)
 				{
@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state)
 		}
 		else if (state->state == WAITPOSDELIM)
 		{
-			if (*(state->prsbuf) == ',')
+			if ( t_iseq(state->prsbuf, ',') )
 				state->state = INPOSINFO;
-			else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*')
+			else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') )
 			{
 				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
 					ereport(ERROR,
@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state)
 							 errmsg("syntax error")));
 				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
 			}
-			else if (tolower(*(state->prsbuf)) == 'b')
+			else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') )
 			{
 				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
 					ereport(ERROR,
@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state)
 							 errmsg("syntax error")));
 				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
 			}
-			else if (tolower(*(state->prsbuf)) == 'c')
+			else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') )
 			{
 				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
 					ereport(ERROR,
@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state)
 							 errmsg("syntax error")));
 				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
 			}
-			else if (tolower(*(state->prsbuf)) == 'd')
+			else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') )
 			{
 				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
 					ereport(ERROR,
@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state)
 							 errmsg("syntax error")));
 				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
 			}
-			else if (isspace((unsigned char) *(state->prsbuf)) ||
+			else if (t_isspace(state->prsbuf) ||
 					 *(state->prsbuf) == '\0')
 				return 1;
-			else if (!isdigit((unsigned char) *(state->prsbuf)))
+			else if (!t_isdigit(state->prsbuf))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("syntax error")));
@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state)
 		else
 			/* internal error */
 			elog(ERROR, "internal error");
-		state->prsbuf++;
+		state->prsbuf+=pg_mblen(state->prsbuf);
 	}
 	return 0;
@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS)
 				buflen = 256;
 	SET_FUNCOID();
 	pg_verifymbstr( buf, strlen(buf), false );
 	state.prsbuf = buf;
 	state.len = 32;
 	state.word = (char *) palloc(state.len);
@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS)
 	tsvector   *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
 	char	   *outbuf;
 	int4		i,
 				j,
 				lenbuf = 0,
 				pp;
 	WordEntry  *ptr = ARRPTR(out);
-	char	   *curin,
+	char	   *curbegin, *curin,
 			   *curout;
 	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
 	for (i = 0; i < out->size; i++)
 	{
-		lenbuf += ptr[i].len * 2 /* for escape */ ;
+		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ;
 		if (ptr[i].haspos)
 			lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
 	}
@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS)
 	curout = outbuf = (char *) palloc(lenbuf);
 	for (i = 0; i < out->size; i++)
 	{
-		curin = STRPTR(out) + ptr->pos;
+		curbegin = curin = STRPTR(out) + ptr->pos;
 		if (i != 0)
 			*curout++ = ' ';
 		*curout++ = '\'';
-		j = ptr->len;
+		while ( curin-curbegin < ptr->len )
 		while (j--)
 		{
-			if (*curin == '\'')
+			int len = pg_mblen(curin);
 			if ( t_iseq(curin, '\'') )
 			{
 				int4		pos = curout - outbuf;
@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS)
 				curout = outbuf + pos;
 				*curout++ = '\\';
 			}
-			*curout++ = *curin++;
+			while(len--)
 				*curout++ = *curin++;
 		}
 		*curout++ = '\'';
 		if ((pp = POSDATALEN(out, ptr)) != 0)
--- a/contrib/tsearch2/tsvector_op.c
+++ b/contrib/tsearch2/tsvector_op.c
@ -15,7 +15,6 @@
 #include "utils/pg_locale.h"
 #include <ctype.h>				/* tolower */
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS)
 	WordEntryPos *p;
 	int			w = 0;
-	switch (tolower(cw))
+	switch (cw)
 	{
 		case 'A':
 		case 'a':
 			w = 3;
 			break;
 		case 'B':
 		case 'b':
 			w = 2;
 			break;
 		case 'C':
 		case 'c':
 			w = 1;
 			break;
 		case 'D':
 		case 'd':
 			w = 0;
 			break;
--- a/contrib/tsearch2/wordparser/parser.c
+++ b/contrib/tsearch2/wordparser/parser.c
@ -71,8 +71,11 @@ TParserClose(TParser * prs)
 		prs->state = ptr;
 	}
 #ifdef TS_USE_WIDE
 	if (prs->wstr)
 		pfree(prs->wstr);
 #endif
 	pfree(prs);
 }
--- a/contrib/tsearch2/wordparser/parser.h
+++ b/contrib/tsearch2/wordparser/parser.h
@ -134,8 +134,10 @@ typedef struct TParser
 	/* string and position information */
 	char	   *str;			/* multibyte string */
 	int			lenstr;			/* length of mbstring */
 #ifdef TS_USE_WIDE
 	wchar_t    *wstr;			/* wide character string */
 	int			lenwstr;		/* length of wsting */
 #endif
 	/* State of parse */
 	int			charmaxlen;