Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes.

It required some changes to the lexize algorithm, but the interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
2006-05-31 14:05:31 +00:00 · 2006-05-31 14:05:31 +00:00 · 22505f4703
parent 3b7ed9ba9c
commit 22505f4703
13 changed files with 1260 additions and 132 deletions
--- a/contrib/tsearch2/Makefile
+++ b/contrib/tsearch2/Makefile
@ -1,13 +1,13 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $

 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
-       dict_snowball.o dict_ispell.o dict_syn.o \
+       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
       wparser.o wparser_def.o \
       ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
       tsvector_op.o rank.o ts_stat.o \
       query_util.o query_support.o query_rewrite.o query_gist.o \
-       ts_locale.o ginidx.o
+       ts_locale.o ts_lexize.o ginidx.o

 SUBDIRS     := snowball ispell wordparser
 SUBDIROBJS  := $(SUBDIRS:%=%/SUBSYS.o)
@ -16,7 +16,7 @@ OBJS	+= $(SUBDIROBJS)

 PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser

-DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
+DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
 DATA_built = tsearch2.sql untsearch2.sql
 DOCS = README.tsearch2
 REGRESS = tsearch2
--- a/contrib/tsearch2/common.c
+++ b/contrib/tsearch2/common.c
@ -5,6 +5,7 @@
 #include "catalog/pg_proc.h"
 #include "catalog/pg_namespace.h"
 #include "utils/syscache.h"
+#include "miscadmin.h"

 #include "ts_cfg.h"
 #include "dict.h"
@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)

 	return nspoid;
 }
+
+/*
+ * Resolve a file name against the installation's share directory.
+ *
+ * An absolute path is handed back untouched (the very same pointer). A
+ * relative one is appended to the share path and returned in a freshly
+ * palloc'd buffer; the string passed in is not freed here.
+ */
+char *
+to_absfilename(char *filename) {
+#ifdef  WIN32
+	char        separator = '\\';
+#else
+	char        separator = '/';
+#endif
+	char        sharedir[MAXPGPATH];
+	char       *joined;
+
+	if (is_absolute_path(filename))
+		return filename;
+
+	get_share_path(my_exec_path, sharedir);
+
+	/* room for both parts, the separator and the terminating NUL */
+	joined = palloc(strlen(sharedir) + strlen(filename) + 2);
+	sprintf(joined, "%s%c%s", sharedir, separator, filename);
+
+	return joined;
+}
--- a/contrib/tsearch2/common.h
+++ b/contrib/tsearch2/common.h
@ -16,6 +16,8 @@ text	   *mtextdup(text *in);

 int			text_cmp(text *a, text *b);

+char * to_absfilename(char *filename);
+
 #define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
 #define ARRNELEMS(x)  ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))

--- a/contrib/tsearch2/dict.c
+++ b/contrib/tsearch2/dict.c
@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */

 /*
 * interface functions to dictionary
@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
 		Datum		opt;
 		Oid			oid = InvalidOid;

+		/* setup dictlexize method */
+		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
+		if (isnull || oid == InvalidOid)
+			ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
+		fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
+
+		/* setup and call dictinit method, optionally */
 		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
 		if (!(isnull || oid == InvalidOid))
 		{
 			opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
 			dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
 		}
-		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
-		if (isnull || oid == InvalidOid)
-			ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
-		fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
 		dict->dict_id = id;
 	}
 	else
@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
 	return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
 }

+/*
+ * Load dictionary `id` and add it to the DList cache.
+ *
+ * The backing array is managed with realloc and doubles (starting at 16
+ * slots) whenever it is full. The dictionary is initialized into a local
+ * copy first and only then stored; afterwards the cache is re-sorted with
+ * comparedict, so element positions may change across this call.
+ */
+static void
+insertdict(Oid id) {
+	DictInfo	fresh;
+
+	if (DList.len == DList.reallen)
+	{
+		int			newcap = 16;
+		DictInfo   *grown;
+
+		if (DList.reallen)
+			newcap = 2 * DList.reallen;
+
+		grown = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * newcap);
+		if (!grown)
+			ts_error(ERROR, "No memory");
+		DList.list = grown;
+		DList.reallen = newcap;
+	}
+
+	init_dict(id, &fresh);
+
+	DList.list[DList.len] = fresh;
+	DList.len += 1;
+
+	qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+}
+
 DictInfo *
 finddict(Oid id)
 {
@ -117,23 +143,8 @@ finddict(Oid id)
 			return DList.last_dict;
 	}

-	/* last chance */
-	if (DList.len == DList.reallen)
-	{
-		DictInfo   *tmp;
-		int			reallen = (DList.reallen) ? 2 * DList.reallen : 16;
-
-		tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
-		if (!tmp)
-			ts_error(ERROR, "No memory");
-		DList.reallen = reallen;
-		DList.list = tmp;
-	}
-	DList.last_dict = &(DList.list[DList.len]);
-	init_dict(id, DList.last_dict);
-
-	DList.len++;
-	qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
+	/* insert new dictionary */ 
+	insertdict(id);
 	return finddict(id); /* qsort changed order!! */ ;
 }

@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
 			   *ptr;
 	Datum	   *da;
 	ArrayType  *a;
+	DictSubState	dstate = { false, false, NULL };

 	SET_FUNCOID();
 	dict = finddict(PG_GETARG_OID(0));

 	ptr = res = (TSLexeme *) DatumGetPointer(
-										  FunctionCall3(&(dict->lexize_info),
-										   PointerGetDatum(dict->dictionary),
-												PointerGetDatum(VARDATA(in)),
-										Int32GetDatum(VARSIZE(in) - VARHDRSZ)
+										FunctionCall4(&(dict->lexize_info),
+										PointerGetDatum(dict->dictionary),
+										PointerGetDatum(VARDATA(in)),
+										Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+										PointerGetDatum(&dstate)
 														)
 		);
+
+	if (dstate.getnext)  {
+		dstate.isend = true;	
+		ptr = res = (TSLexeme *) DatumGetPointer(
+										FunctionCall4(&(dict->lexize_info),
+										   PointerGetDatum(dict->dictionary),
+												PointerGetDatum(VARDATA(in)),
+										Int32GetDatum(VARSIZE(in) - VARHDRSZ),
+										PointerGetDatum(&dstate)
+														)
+		);
+	}
+
 	PG_FREE_IF_COPY(in, 1);
 	if (!res)
 	{
--- a/contrib/tsearch2/dict.h
+++ b/contrib/tsearch2/dict.h
@ -1,9 +1,10 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */

 #ifndef __DICT_H__
 #define __DICT_H__
 #include "postgres.h"
 #include "fmgr.h"
+#include "ts_cfg.h"

 typedef struct
 {
@ -29,6 +30,11 @@ DictInfo   *finddict(Oid id);
 Oid			name2id_dict(text *name);
 void		reset_dict(void);

+/*
+ * State exchanged between a caller and a dictionary's lexize function,
+ * passed as the fourth lexize argument. It lets a dictionary ask for
+ * further lexemes before it produces a result.
+ */
+typedef struct {
+	bool isend; /* in: set by the caller to tell the lexize function that the end of the text is reached */
+	bool getnext; /* out: set by the dictionary when it wants the next lexeme */
+	void	*private;  /* internal dict state kept between calls with getnext == true */
+} DictSubState;

 /* simple parser of cfg string */
 typedef struct
@ -45,17 +51,61 @@ typedef struct
 	/*
 	 * number of variant of split word , for example Word 'fotballklubber'
 	 * (norwegian) has two varian to split: ( fotball, klubb ) and ( fot,
-	 * ball, klubb ). So, dictionary should return: nvariant	lexeme 1
-	 * fotball 1	   klubb 2		 fot 2		 ball 2		  klubb
-	 *
+	 * ball, klubb ). So, dictionary should return: 
+	 * nvariant	lexeme 
+	 *   1 		fotball 
+	 *   1	   	klubb 
+	 *	 2		fot 
+	 *	 2		ball 
+	 *   2		klubb
 	 */
 	uint16		nvariant;

-	/* currently unused */
 	uint16		flags;

 	/* C-string */
 	char	   *lexeme;
 }	TSLexeme;

+#define TSL_ADDPOS		0x01
+
+
+/*
+ * Lexize subsystem
+ */
+
+/*
+ * One parsed lexeme kept in a singly linked list. The type/lemm/lenlemm
+ * fields mirror the arguments of LexizeAddLemm().
+ */
+typedef struct ParsedLex {
+    int     	type;
+    char    	*lemm;
+    int     	lenlemm;
+	bool		resfollow;	/* NOTE(review): meaning not visible in this file; presumably "a result follows this lexeme" -- confirm in ts_lexize.c */
+    struct ParsedLex *next;	/* next node of the list, NULL at the end */
+} ParsedLex;
+
+/* Singly linked list of ParsedLex nodes, tracked by its first and last node */
+typedef struct ListParsedLex {
+	ParsedLex	*head;
+	ParsedLex	*tail;
+} ListParsedLex;
+
+/*
+ * Working state of the lexize subsystem; the same object is passed to
+ * LexizeInit(), LexizeAddLemm() and LexizeExec().
+ */
+typedef struct {
+    TSCfgInfo       *cfg;
+    Oid             curDictId;
+    int             posDict;
+    DictSubState    dictState;
+    ParsedLex       *curSub;
+	ListParsedLex	towork;   /* current list to work */
+	ListParsedLex	waste;    /* list of lexemes that already lexized */
+
+	/* fields to store the last variant to lexize (basically for a thesaurus
+	   or similar dictionary, which wants several lexemes) */	
+	   
+	ParsedLex		*lastRes;
+	TSLexeme		*tmpRes;
+} LexizeData;
+
+
+void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
+void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
+TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
+
 #endif
--- a/contrib/tsearch2/dict_thesaurus.c
+++ b/contrib/tsearch2/dict_thesaurus.c
@ -0,0 +1,743 @@
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */
+
+/*
+ * thesaurus
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+#include "executor/spi.h"
+
+#include <ctype.h>
+
+#include "dict.h"
+#include "common.h"
+#include "ts_locale.h"
+
+/*
+ * One occurrence of a source lexeme in the thesaurus: which entry it
+ * belongs to and at which position inside that entry.
+ */
+typedef struct LexemeInfo {
+	uint16	idsubst; /* entry's number in DictThesaurus->subst */
+	uint16	posinsubst; /* pos info in entry */
+	uint16	tnvariant;  /* total num lexemes in one variant */
+	struct LexemeInfo *nextentry; /* next occurrence of the same lexeme (chained in compileTheLexeme) */
+	struct LexemeInfo *nextvariant; /* next candidate in the match list built by findVariant */
+} LexemeInfo;
+
+/*
+ * A source lexeme together with the chain of its occurrences. After
+ * compilation an element with lexeme == NULL stands for words the
+ * subdictionary did not recognize.
+ */
+typedef struct {
+	char 		*lexeme;
+	LexemeInfo	*entries;
+} TheLexeme; 
+
+/* Right-hand side of one thesaurus entry */
+typedef struct {
+	uint16	lastlexeme; /* zero-based position of the last lexeme to substitute (addWrd stores posinsubst-1) */
+	uint16	reslen; /* number of elements in res, set by compileTheSubstitute */
+	TSLexeme	*res;   /* prepared substituted result */ 
+} TheSubstitute;
+
+/* In-memory form of a thesaurus dictionary, built by thesaurus_init */
+typedef struct
+{
+	/* subdictionary to normalize lexemes */	
+	DictInfo	subdict;
+
+	/* Array to search lexeme by exact match */
+	TheLexeme	*wrds;
+	int			nwrds;	/* number of used elements in wrds */
+	int			ntwrds;	/* number of allocated elements in wrds */
+
+	/* Storage of substituted result, n-th element is for
+	   n-th expression */
+	TheSubstitute	*subst;
+	int				nsubst;	/* allocated size while the file is read, number of entries afterwards */
+}	DictThesaurus;
+
+PG_FUNCTION_INFO_V1(thesaurus_init);
+Datum		thesaurus_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(thesaurus_lexize);
+Datum		thesaurus_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * Release the dictionary structure itself.
+ * NOTE(review): only the top-level struct is freed; storage reachable
+ * through d->wrds and d->subst is not released here.
+ */
+static void
+freeDictThesaurus(DictThesaurus * d)
+{
+	free(d);
+}
+
+/*
+ * Append one source-side word, given as the byte range [b, e), to d->wrds
+ * together with a single LexemeInfo recording its entry number and position.
+ *
+ * All storage comes from malloc/realloc; d->wrds starts at 16 slots and
+ * doubles when full. Of the new LexemeInfo only nextentry, idsubst and
+ * posinsubst are filled in here.
+ */
+static void
+newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) {
+	TheLexeme	*slot;
+	size_t		len = e - b;
+
+	if ( d->nwrds >= d->ntwrds ) {
+		if ( d->ntwrds != 0 ) {
+			d->ntwrds *= 2;
+			d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
+		} else {
+			d->ntwrds = 16;
+			d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds);
+		}
+		if (!d->wrds)
+			elog(ERROR,"Out of memory");
+	}
+
+	slot = &d->wrds[ d->nwrds ];
+	d->nwrds += 1;
+
+	/* private, NUL-terminated copy of the word */
+	slot->lexeme = malloc(len + 1);
+	if ( slot->lexeme == NULL )
+		elog(ERROR,"Out of memory");
+	memcpy(slot->lexeme, b, len);
+	slot->lexeme[len] = '\0';
+
+	slot->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
+	if ( slot->entries == NULL )
+		elog(ERROR,"Out of memory");
+
+	slot->entries->idsubst = idsubst;
+	slot->entries->posinsubst = posinsubst;
+	slot->entries->nextentry = NULL;
+}
+
+/*
+ * Append one word of the substitution (right-hand) side, given as the byte
+ * range [b, e), to entry `idsubst` of d->subst.
+ *
+ *	nwrd       - index of this word within the substitution; 0 starts a new
+ *	             entry and resets the per-entry counters
+ *	posinsubst - number of source lexemes read for this entry; the entry's
+ *	             lastlexeme becomes posinsubst-1
+ *
+ * The result array is always kept terminated by an element whose lexeme is
+ * NULL. While the file is being read, d->nsubst holds the allocated size of
+ * d->subst (thesaurusRead overwrites it with the entry count afterwards).
+ *
+ * nres/ntres are static because they describe the entry under construction
+ * across successive calls; the function is therefore not reentrant.
+ */
+static void
+addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
+	static	int nres=0;
+	static  int ntres = 0;
+	TheSubstitute	*ptr;
+
+	if ( nwrd == 0 ) {
+		nres = ntres = 0;
+
+		/*
+		 * Grow only when the new entry does not fit any more. The previous
+		 * test (idsubst <= d->nsubst) was true for every entry and doubled
+		 * the array each time.
+		 */
+		if ( idsubst >= d->nsubst ) {
+			if ( d->nsubst == 0 ) {
+				d->nsubst = 16;
+				d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst);
+			} else {
+				d->nsubst *= 2;
+				d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
+			}
+			if (!d->subst)
+				elog(ERROR,"Out of memory");
+		}
+	}
+
+	ptr = d->subst + idsubst;
+
+	ptr->lastlexeme = posinsubst-1;
+
+	/* keep room for the new word plus the NULL terminator */
+	if ( nres+1 >= ntres ) {
+		if ( ntres == 0 ) {
+			ntres = 2;
+			ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres );
+		} else {
+			ntres *= 2;
+			ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres );
+		}
+
+		if ( !ptr->res ) 
+				elog(ERROR,"Out of memory");
+	}
+
+	if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 ) 
+		elog(ERROR,"Out of memory");
+	memcpy(ptr->res[ nres ].lexeme, b, e-b);
+	ptr->res[ nres ].lexeme[e-b] = '\0';
+
+	ptr->res[ nres ].nvariant = nwrd;
+	ptr->res[ nres ].flags = TSL_ADDPOS;
+
+	ptr->res[ ++nres ].lexeme = NULL;
+}
+
+#define TR_WAITLEX	1
+#define TR_INLEX	2
+#define	TR_WAITSUBS	3
+#define TR_INSUBS	4
+
+/*
+ * Read the thesaurus file into d->wrds (source words) and d->subst
+ * (substitutions).
+ *
+ * Each meaningful line has the form
+ *		word [word ...] : substitute [substitute ...]
+ * and becomes one entry, numbered consecutively from 0. Lines that are
+ * blank, or whose first non-blank character is '#', are skipped. The file
+ * name is resolved through to_absfilename().
+ *
+ * Raises an error when the file cannot be opened, when ':' appears before
+ * any word, or when a line lacks either side. On return d->nsubst is the
+ * number of entries read.
+ */
+static void
+thesaurusRead( char *filename, DictThesaurus *d ) {
+	FILE *fh;
+	char str[BUFSIZ];
+	int lineno=0;
+	uint16	idsubst = 0;
+
+	fh = fopen(to_absfilename(filename), "r");
+	if (!fh)
+		elog(ERROR,"Thesaurus: can't open '%s' file", filename);
+
+	while( fgets(str, sizeof(str), fh)) {
+		char *ptr = str;
+		int state = TR_WAITLEX;
+		char	*beginwrd = NULL;
+		uint16	posinsubst=0;
+		uint16	nwrd=0;
+
+		lineno++;
+
+		/*
+		 * is it comment ?  The test must look at the first non-blank
+		 * character (ptr), not at the start of the buffer, so that indented
+		 * comments and whitespace-only lines are skipped too.
+		 */
+		while( *ptr && t_isspace(ptr) )
+			ptr += pg_mblen(ptr);
+		if ( t_iseq(ptr, '#') || *ptr=='\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r') )
+			continue;
+
+		pg_verifymbstr(ptr, strlen(ptr), false);
+		while(*ptr) {
+			if ( state == TR_WAITLEX ) {
+				if ( t_iseq(ptr, ':' ) ) {
+					if ( posinsubst == 0 ) {
+						fclose(fh);
+						elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
+					}
+					state = TR_WAITSUBS;
+				} else if ( !t_isspace(ptr) ) {
+					beginwrd = ptr;
+					state = TR_INLEX;
+				}
+			} else if ( state == TR_INLEX ) {
+				if ( t_iseq(ptr, ':') ) {
+					newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
+					state = TR_WAITSUBS;
+				} else if ( t_isspace(ptr) ) {
+					newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
+					state = TR_WAITLEX;
+				}
+			} else if ( state == TR_WAITSUBS ) {
+				if ( !t_isspace(ptr) ) { 
+					beginwrd = ptr;
+					state = TR_INSUBS;
+				}
+			} else if ( state == TR_INSUBS ) {
+				if ( t_isspace(ptr) ) { 
+					addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
+					state = TR_WAITSUBS;
+				}
+			} else
+				elog(ERROR,"Thesaurus: Unknown state: %d", state);
+				
+			ptr += pg_mblen(ptr);
+		}
+
+		/* last substitute word ran up to the end of the buffer */
+		if ( state == TR_INSUBS )
+			addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
+
+		idsubst++;
+
+		/* both sides of the entry must contain at least one word */
+		if ( !(nwrd && posinsubst) ) {
+			fclose(fh);
+			elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
+		}
+			
+	}
+
+	d->nsubst = idsubst;
+
+	fclose(fh);
+}
+
+/*
+ * Append one normalized lexeme to the growing array `newwrds`.
+ *
+ * *nnw is the number of used elements and *tnm the allocated size; the
+ * array is doubled with realloc when full, so the (possibly moved) array
+ * is returned and must replace the caller's pointer.
+ *
+ * When `lexeme` is NULL or carries no text, the element gets a NULL lexeme
+ * and tnvariant 1; otherwise the text is strdup'ed and `tnvariant` is
+ * stored. The entry number and position are copied from `src`.
+ */
+static TheLexeme*
+addCompiledLexeme(TheLexeme   *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) {
+	TheLexeme	*slot;
+
+	if ( *nnw >= *tnm ) {
+		*tnm *= 2;
+		newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm);
+		if (!newwrds)
+			elog(ERROR,"Out of memory");
+	}
+
+	slot = newwrds + *nnw;
+
+	slot->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
+	if ( slot->entries == NULL )
+		elog(ERROR,"Out of memory");
+
+	if ( lexeme == NULL || lexeme->lexeme == NULL ) {
+		slot->lexeme = NULL;
+		slot->entries->tnvariant = 1;
+	} else {
+		slot->lexeme = strdup( lexeme->lexeme );
+		if ( slot->lexeme == NULL )
+			elog(ERROR,"Out of memory");
+
+		slot->entries->tnvariant = tnvariant;
+	}
+
+	slot->entries->idsubst = src->idsubst;
+	slot->entries->posinsubst = src->posinsubst;
+	slot->entries->nextentry = NULL;
+
+	*nnw += 1;
+	return newwrds;
+}
+
+/*
+ * Order two occurrences by entry number, then position in the entry, then
+ * variant size. Returns -1, 0 or 1; a NULL on either side compares equal.
+ */
+static int
+cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) {
+	if ( a==NULL || b==NULL )
+		return 0;
+
+	if ( a->idsubst != b->idsubst )
+		return ( a->idsubst > b->idsubst ) ? 1 : -1;
+
+	if ( a->posinsubst != b->posinsubst )
+		return ( a->posinsubst > b->posinsubst ) ? 1 : -1;
+
+	if ( a->tnvariant != b->tnvariant )
+		return ( a->tnvariant > b->tnvariant ) ? 1 : -1;
+
+	return 0;
+}
+
+/*
+ * Compare two elements by lexeme text with strcmp. An element without
+ * text (lexeme == NULL) sorts after every element that has one; two such
+ * elements are equal.
+ */
+static int
+cmpLexeme(TheLexeme *a, TheLexeme* b) {
+	if ( a->lexeme != NULL && b->lexeme != NULL )
+		return strcmp( a->lexeme, b->lexeme );
+
+	if ( a->lexeme == NULL && b->lexeme == NULL )
+		return 0;
+
+	return ( a->lexeme == NULL ) ? 1 : -1;
+}
+
+/* Adapter giving cmpLexeme the signature bsearch expects */
+static int
+cmpLexemeQ(const void *a, const void *b) {
+	TheLexeme	*left = (TheLexeme*)a;
+	TheLexeme	*right = (TheLexeme*)b;
+
+	return cmpLexeme( left, right );
+}
+
+/*
+ * qsort comparator: by lexeme first; elements with the same lexeme are put
+ * in reverse cmpLexemeInfo order.
+ */
+static int cmpTheLexeme(const void *a, const void *b) {
+	TheLexeme *left  = (TheLexeme*)a;
+	TheLexeme *right  = (TheLexeme*)b;
+	int bylexeme = cmpLexeme(left, right);
+
+	if ( bylexeme == 0 )
+		return -cmpLexemeInfo(left->entries, right->entries);
+
+	return bylexeme;
+}
+
+/*
+ * Replace the raw source words read from the file (d->wrds) by their
+ * normalized form as produced by the subdictionary, then sort the result
+ * and merge duplicates so that each distinct lexeme appears once and
+ * carries a chain of all its occurrences.
+ */
+static void
+compileTheLexeme(DictThesaurus *d) {
+	int			i,nnw=0, tnm=16;
+	TheLexeme	*newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;
+
+	if (!newwrds) 
+		elog(ERROR,"Out of memory");
+
+	for(i=0;i<d->nwrds;i++) {
+		/* normalize the word; no DictSubState is passed to the subdictionary */
+		TSLexeme *ptr = (TSLexeme*) DatumGetPointer( 
+				FunctionCall4(
+					&(d->subdict.lexize_info),
+					PointerGetDatum(d->subdict.dictionary),
+					PointerGetDatum(d->wrds[i].lexeme),
+					Int32GetDatum(strlen(d->wrds[i].lexeme)),
+					PointerGetDatum(NULL)
+				)
+			);
+
+		if ( !(ptr && ptr->lexeme) ) {
+			/* nothing returned: store an element with a NULL lexeme instead */
+			newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
+			elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized word", d->wrds[i].lexeme);
+		} else {
+			/* walk the result one variant (run of equal nvariant) at a time */
+			while( ptr->lexeme ) {
+				TSLexeme	*remptr = ptr+1;
+				int tnvar = 1;
+				int	curvar = ptr->nvariant;
+
+				/* compute n words in one variant */
+				while( remptr->lexeme ) {
+					if ( remptr->nvariant != (remptr-1)->nvariant )
+						break;
+					tnvar++;
+					remptr++;
+				}
+
+				/* store every lexeme of this variant, tagged with the variant size */
+				remptr = ptr;
+				while( remptr->lexeme && remptr->nvariant == curvar ) {
+					newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); 
+					remptr++;
+				}
+
+				ptr = remptr;
+			}
+		}
+
+		/* the raw word and its occurrence record are not needed any more */
+		free( d->wrds[i].lexeme );
+		free( d->wrds[i].entries );
+	}
+
+	free( d->wrds );
+	d->wrds = newwrds;
+	d->nwrds = nnw;
+	d->ntwrds = tnm;
+
+	if ( d->nwrds > 1 ) {
+		qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme ); 
+
+		/*
+		 * uniq: newwrds points at the last kept element, ptrwrds scans the
+		 * rest. An element with the same lexeme as the kept one either has
+		 * its occurrence pushed onto the front of the kept chain or, when
+		 * the occurrence is an exact duplicate, is dropped.
+		 */
+		newwrds = d->wrds;
+		ptrwrds = d->wrds + 1;
+		while( ptrwrds - d->wrds < d->nwrds ) {
+			if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
+				if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
+					ptrwrds->entries->nextentry = newwrds->entries;
+					newwrds->entries = ptrwrds->entries;
+				} else
+					free( ptrwrds->entries );
+
+				if ( ptrwrds->lexeme )
+					free( ptrwrds->lexeme );
+			} else {
+				newwrds++;
+				*newwrds = *ptrwrds;
+			}
+
+			ptrwrds++;
+		}
+
+		/* shrink the array to the number of distinct lexemes */
+		/* NOTE(review): the result of this realloc is not checked */
+		d->nwrds = newwrds - d->wrds + 1;
+		d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
+	}
+}
+
+/*
+ * Normalize the substitution side of every entry through the
+ * subdictionary: the raw words stored by addWrd are replaced by a new
+ * malloc'ed array of the lexemes the subdictionary returns for them, and
+ * reslen is set to the number of lexemes produced.
+ */
+static void
+compileTheSubstitute(DictThesaurus *d) {
+	int i;
+
+	for(i=0;i<d->nsubst;i++) {
+		/* rem keeps the raw array so it can be walked and freed at the end */
+		TSLexeme	*rem = d->subst[i].res, *outptr, *inptr;
+		int			n=2;
+
+		outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
+		if ( d->subst[i].res == NULL )
+			elog(ERROR,"Out of Memory");
+		outptr->lexeme = NULL;
+		inptr = rem;
+
+		while( inptr && inptr->lexeme ) { 
+			TSLexeme	*reml, *lexized = (TSLexeme*) DatumGetPointer( 
+				FunctionCall4(
+					&(d->subdict.lexize_info),
+					PointerGetDatum(d->subdict.dictionary),
+					PointerGetDatum(inptr->lexeme),
+					Int32GetDatum(strlen(inptr->lexeme)),
+					PointerGetDatum(NULL)
+				)
+			);
+
+			/* NOTE(review): reml is assigned but never used afterwards */
+			reml = lexized;
+			if ( lexized ) {
+				/*
+				 * Output index of the first lexeme produced for this word,
+				 * or -1 when it produces nothing or is the very first
+				 * output lexeme.
+				 */
+				int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res)  : -1;
+
+				while( lexized->lexeme ) {
+					/* double the array when full; realloc may move it, so re-derive outptr */
+					if ( outptr - d->subst[i].res + 1 >= n ) {
+						int diff = outptr - d->subst[i].res;
+						n *= 2;
+						d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
+						if ( d->subst[i].res == NULL )
+							elog(ERROR,"Out of Memory");
+						outptr = d->subst[i].res + diff;
+					}
+
+					/* copy nvariant and flags as returned, with a private copy of the text */
+					*outptr = *lexized;
+					if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
+						elog(ERROR,"Out of Memory");
+
+					outptr++;
+					lexized++;
+				}
+
+				/* flag the first lexeme of every word but the first with TSL_ADDPOS */
+				if ( toset > 0)
+					d->subst[i].res[toset].flags |= TSL_ADDPOS;
+			}
+
+			if ( inptr->lexeme )
+				free( inptr->lexeme );
+			inptr++;
+		}
+
+		/* no NULL terminator is written after the last lexeme; reslen gives the length */
+		d->subst[i].reslen = outptr - d->subst[i].res;
+
+		free(rem);
+	}
+}
+
+/*
+ * Dictionary init method: build a DictThesaurus from the option string in
+ * argument 0.
+ *
+ * Recognized options (case-insensitive keys):
+ *		DictFile   - thesaurus file to load (read with thesaurusRead)
+ *		Dictionary - name of the subdictionary used to normalize lexemes
+ * Both are required and each may be given only once; any other key is an
+ * error. After the options are processed the source words and the
+ * substitutions are compiled through the subdictionary.
+ *
+ * Returns a pointer to the malloc'ed DictThesaurus. The structure is freed
+ * before an error is raised on every error path of this function.
+ */
+Datum
+thesaurus_init(PG_FUNCTION_ARGS)
+{
+	DictThesaurus *d;
+	Map		   *cfg,
+			   *pcfg;
+	text	   *in, *subdictname=NULL;
+	bool 		fileloaded = false;
+
+	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("Thesaurus confguration error")));
+
+	d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
+	if (!d)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+	memset(d, 0, sizeof(DictThesaurus));
+
+	in = PG_GETARG_TEXT_P(0);
+	parse_cfgdict(in, &cfg);
+	PG_FREE_IF_COPY(in, 0);
+	pcfg = cfg;
+	while (pcfg->key)
+	{
+		if (pg_strcasecmp("DictFile", pcfg->key) == 0)
+		{
+			if (fileloaded)
+			{
+				freeDictThesaurus(d);
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("Thesaurus file is already loaded")));
+			}
+			fileloaded = true;
+			thesaurusRead( pcfg->value, d );
+		}
+		else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
+		{
+			if (subdictname)
+			{
+				freeDictThesaurus(d);
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("Thesaurus: SubDictionary is already defined")));
+			}
+			subdictname = char2text( pcfg->value );
+		}
+		else
+		{
+			freeDictThesaurus(d);
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("unrecognized option: %s => %s",
+							pcfg->key, pcfg->value)));
+		}
+		pfree(pcfg->key);
+		pfree(pcfg->value);
+		pcfg++;
+	}
+	pfree(cfg);
+
+	if (!fileloaded)
+	{
+		/* d is malloc'ed, so it must be released by hand before erroring out */
+		freeDictThesaurus(d);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("Thesaurus file  isn't defined")));
+	}
+
+	if ( subdictname ) {
+		DictInfo	*subdictptr;
+		/* 
+		 * we already in SPI, but name2id_dict()/finddict()
+		 * invoke SPI_connect()
+		 */
+		SPI_push(); 
+
+		subdictptr = finddict( name2id_dict( subdictname ) );
+
+		SPI_pop();
+
+		/* keep a private copy of the subdictionary descriptor */
+		d->subdict = *subdictptr;
+	} else {
+		freeDictThesaurus(d);
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("Thesaurus: SubDictionary isn't defined")));
+	}
+
+	compileTheLexeme( d );
+	compileTheSubstitute(d);
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * Look up `lexeme` (which may be NULL) in the sorted d->wrds array by
+ * binary search. Returns the chain of its occurrences, or NULL when the
+ * dictionary is empty or the lexeme is not present.
+ */
+static LexemeInfo*
+findTheLexeme(DictThesaurus *d, char * lexeme) {
+	TheLexeme	probe;
+	TheLexeme	*hit;
+
+	if ( d->nwrds == 0 )
+		return NULL;
+
+	probe.lexeme = lexeme;
+	probe.entries = NULL;
+
+	hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
+
+	return ( hit != NULL ) ? hit->entries : NULL;
+}
+
+/*
+ * Tell whether entry `idsubst` occurs in the candidate list `stored`
+ * (linked through nextvariant). An empty list matches everything.
+ */
+static bool
+matchIdSubst(LexemeInfo *stored, uint16 idsubst) {
+	if ( stored == NULL )
+		return true;
+
+	while ( stored != NULL ) {
+		if ( stored->idsubst == idsubst )
+			return true;
+		stored = stored->nextvariant;
+	}
+
+	return false;
+}
+
+/*
+ * Find thesaurus entries that match the current variant and push them onto
+ * the candidate list.
+ *
+ *	in     - candidates collected so far (linked through nextvariant)
+ *	stored - candidates left by the previous call, or NULL
+ *	curpos - position inside the entry the current lexeme must have
+ *	newin  - newn occurrence chains (linked through nextentry), one per
+ *	         lexeme of the variant; the array elements are advanced here
+ *	newn   - number of lexemes in the variant
+ *
+ * An entry is accepted when every chain holds an occurrence of it with
+ * posinsubst == curpos and tnvariant == newn, it passes matchIdSubst
+ * against `stored`, and it is not already in `in`. Returns the (possibly
+ * extended) list as soon as any chain is exhausted.
+ *
+ * NOTE(review): the scan appears to rely on each chain being ordered by
+ * ascending idsubst -- confirm against compileTheLexeme.
+ */
+static LexemeInfo*
+findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
+	for(;;) {
+		int i;
+		/* ptr holds the entry currently being tried in all chains */
+		LexemeInfo *ptr = newin[0];
+
+		for(i=0; i<newn; i++) {
+			/* skip occurrences that belong to earlier entries */
+			while(newin[i] && newin[i]->idsubst < ptr->idsubst) 
+				newin[i] = newin[i]->nextentry;
+
+			if ( newin[i] == NULL )
+				return in;
+
+			/* this chain is already past ptr's entry: restart the scan with the later entry */
+			if ( newin[i]->idsubst > ptr->idsubst ) {
+				ptr = newin[i];
+				i=-1;
+				continue;
+			}
+
+			/* within the same entry, look for the right position and variant size */
+			while(newin[i]->idsubst == ptr->idsubst) {
+				if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
+					ptr = newin[i];
+					break;
+				}
+
+				newin[i] = newin[i]->nextentry;
+				if ( newin[i] == NULL )
+					return in;
+			}
+
+			/* ran out of this entry without a hit: restart with the next entry */
+			if ( newin[i]->idsubst != ptr->idsubst ) {
+				ptr = newin[i];
+				i=-1;
+				continue;
+			}
+		}
+
+		if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
+
+			ptr->nextvariant = in;
+			in = ptr;
+		}
+
+		/* step forward */
+		for(i=0; i<newn; i++)
+			newin[i] = newin[i]->nextentry;
+	}
+
+	/* not reached: the loop above only leaves through return */
+	return NULL;
+}
+
+/*
+ * Make a palloc'ed copy of a prepared substitution: reslen elements, each
+ * with its own pstrdup'ed lexeme text, followed by a terminating element
+ * whose lexeme is NULL.
+ */
+static TSLexeme*
+copyTSLexeme( TheSubstitute *ts ) {
+	TSLexeme	*copy = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) );
+	TSLexeme	*dst = copy;
+	TSLexeme	*src = ts->res;
+	uint16		left;
+
+	for(left = ts->reslen; left > 0; left--, src++, dst++) {
+		*dst = *src;
+		dst->lexeme = pstrdup( src->lexeme );
+	}
+
+	/* dst now addresses element reslen */
+	dst->lexeme = NULL;
+
+	return copy;
+}
+
+/*
+ * Scan the candidate list for an entry whose last source lexeme is at
+ * `curpos` and return a fresh copy of its substitution; NULL when no
+ * candidate is complete yet.
+ *
+ * *moreres is set to true when a candidate visited by the scan has a
+ * successor in the list, otherwise it stays false.
+ */
+static TSLexeme*
+checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) {
+	LexemeInfo	*cand;
+
+	*moreres = false;
+
+	for(cand = info; cand != NULL; cand = cand->nextvariant) {
+		Assert( cand->idsubst < d->nsubst );
+
+		if ( cand->nextvariant != NULL )
+			*moreres = true;
+
+		if ( d->subst[ cand->idsubst ].lastlexeme == curpos ) 
+			return copyTSLexeme( &d->subst[ cand->idsubst ] );
+	}
+
+	return NULL;
+}
+
+/*
+ * Dictionary lexize method.
+ *
+ * Arguments: 0 - DictThesaurus, 1 - lexeme text, 2 - its length,
+ * 3 - DictSubState (required). The candidate list reached so far is kept
+ * in dstate->private between calls.
+ *
+ * Returns the substitution when an entry is matched completely, otherwise
+ * NULL. dstate->getnext tells the caller whether the dictionary wants the
+ * next lexeme.
+ */
+Datum
+thesaurus_lexize(PG_FUNCTION_ARGS)
+{
+	DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
+	DictSubState	*dstate = (DictSubState*)PG_GETARG_POINTER(3);
+	TSLexeme	*res=NULL;
+	LexemeInfo *stored, *info = NULL;
+	uint16	curpos = 0;
+	bool	moreres = false;
+
+	/* NOTE(review): argument 3 is fetched above, before PG_NARGS() is checked here */
+	if ( dstate == NULL || PG_NARGS() < 4 )
+		elog(ERROR,"Forbidden call of thesaurus or nested call");
+
+	/* nothing is produced at the end of the text */
+	if ( dstate->isend ) 
+		PG_RETURN_POINTER(NULL);
+	stored = (LexemeInfo*) dstate->private;
+
+	/* continuing a match: the new lexeme must sit one position further in the entry */
+	if (stored) 
+		curpos = stored->posinsubst+1;
+
+	/* normalize the incoming lexeme with the subdictionary */
+	res =(TSLexeme*) DatumGetPointer (
+		FunctionCall4(
+			&(d->subdict.lexize_info),
+			PointerGetDatum(d->subdict.dictionary),
+			PG_GETARG_DATUM(1),
+			PG_GETARG_INT32(2),
+			PointerGetDatum(NULL)
+		)
+	);
+
+	if ( res && res->lexeme ) {
+		TSLexeme	*ptr = res , *basevar;
+
+		/* handle the result one variant (run of equal nvariant) at a time */
+		while( ptr->lexeme ) {
+			uint16		nv = ptr->nvariant;
+			uint16		i,nlex = 0;
+			LexemeInfo	**infos;
+
+			basevar = ptr;
+			while( ptr->lexeme && nv == ptr->nvariant ) {
+				nlex++;
+				ptr++;
+			}
+
+			/* every lexeme of the variant must be known to the thesaurus */
+			infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
+			for(i=0;i<nlex;i++) 
+				if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
+					break;
+
+			if ( i<nlex ) { 
+				/* no chance to find */
+				pfree( infos );
+				continue;
+			}
+
+			info = findVariant( info, stored, curpos, infos, nlex);
+		}
+
+	} else {
+		/* not recognized by the subdictionary: look up the NULL-lexeme element */
+		LexemeInfo	*infos = findTheLexeme(d, NULL);
+		info = findVariant( NULL, stored, curpos, &infos, 1);
+	}
+
+	/* remember the candidates for the next call */
+	dstate->private = (void*)info;
+
+	if ( !info ) {
+		dstate->getnext = false;
+		PG_RETURN_POINTER(NULL);
+	}
+			
+	/* a complete match: return it, and ask for more if checkMatch saw further candidates */
+	if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
+		dstate->getnext = moreres;
+		PG_RETURN_POINTER(res);
+	}
+
+	/* only partial matches so far: ask for the next lexeme */
+	dstate->getnext = true;
+
+	PG_RETURN_POINTER(NULL);	
+}
--- a/contrib/tsearch2/expected/tsearch2.out
+++ b/contrib/tsearch2/expected/tsearch2.out
@ -4,21 +4,21 @@
 --
 \set ECHO none
 psql:tsearch2.sql:13: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
-psql:tsearch2.sql:158: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
-psql:tsearch2.sql:257: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
-psql:tsearch2.sql:264: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
-psql:tsearch2.sql:370: NOTICE:  type "tsvector" is not yet defined
+psql:tsearch2.sql:177: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
+psql:tsearch2.sql:276: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
+psql:tsearch2.sql:283: NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
+psql:tsearch2.sql:389: NOTICE:  type "tsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:375: NOTICE:  argument type tsvector is only a shell
-psql:tsearch2.sql:429: NOTICE:  type "tsquery" is not yet defined
+psql:tsearch2.sql:394: NOTICE:  argument type tsvector is only a shell
+psql:tsearch2.sql:448: NOTICE:  type "tsquery" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:434: NOTICE:  argument type tsquery is only a shell
-psql:tsearch2.sql:592: NOTICE:  type "gtsvector" is not yet defined
+psql:tsearch2.sql:453: NOTICE:  argument type tsquery is only a shell
+psql:tsearch2.sql:611: NOTICE:  type "gtsvector" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:597: NOTICE:  argument type gtsvector is only a shell
-psql:tsearch2.sql:1087: NOTICE:  type "gtsq" is not yet defined
+psql:tsearch2.sql:616: NOTICE:  argument type gtsvector is only a shell
+psql:tsearch2.sql:1106: NOTICE:  type "gtsq" is not yet defined
 DETAIL:  Creating a shell type definition.
-psql:tsearch2.sql:1092: NOTICE:  argument type gtsq is only a shell
+psql:tsearch2.sql:1111: NOTICE:  argument type gtsq is only a shell
 --tsvector
 SELECT '1'::tsvector;
 tsvector 
--- a/contrib/tsearch2/stopword.c
+++ b/contrib/tsearch2/stopword.c
@ -4,8 +4,6 @@
 */
 #include "postgres.h"

-#include "miscadmin.h"
-
 #include "common.h"
 #include "dict.h"
 #include "ts_locale.h"
@ -36,30 +34,11 @@ readstoplist(text *in, StopList * s)
 	s->len = 0;
 	if (in && VARSIZE(in) - VARHDRSZ > 0)
 	{
-		char	   *filename = text2char(in);
+		char	   *filename = to_absfilename(text2char(in));
 		FILE	   *hin;
 		char		buf[STOPBUFLEN];
 		int			reallen = 0;

-		/* if path is relative, take it as relative to share dir */
-		if (!is_absolute_path(filename))
-		{
-			char		sharepath[MAXPGPATH];
-			char	   *absfn;
-#ifdef	WIN32
-			char	delim = '\\';
-#else
-			char 	delim = '/';
-#endif
-
-			get_share_path(my_exec_path, sharepath);
-			absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
-			sprintf(absfn, "%s%c%s", sharepath, delim, filename);
-
-			pfree(filename);
-			filename = absfn;
-		}
-
 		if ((hin = fopen(filename, "r")) == NULL)
 			ereport(ERROR,
 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
--- a/contrib/tsearch2/thesaurus
+++ b/contrib/tsearch2/thesaurus
@ -0,0 +1,19 @@
+#
+# Thesaurus config file. The character ':' splits
+# each line into two parts:
+#     the string to be substituted
+#     the substituting string
+#
+
+#one two three : 123
+#one two : 12
+#one : 1
+#two : 2
+
+#foo bar : blah blah
+#f   bar : fbar
+#e   bar : ebar
+#g   bar bar : gbarbar
+#asd:sdffff
+#qwerty:qwer wert erty
+
--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@ -281,15 +281,15 @@ name2id_cfg(text *name)
 	return id;
 }

-
 void
 parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 {
 	int			type,
-				lenlemm,
-				i;
+				lenlemm;
 	char	   *lemm = NULL;
 	WParserInfo *prsobj = findprs(cfg->prs_id);
+	LexizeData	ldata;
+	TSLexeme   *norms;

 	prsobj->prs = (void *) DatumGetPointer(
 										   FunctionCall2(
@ -299,14 +299,16 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 														 )
 		);

-	while ((type = DatumGetInt32(FunctionCall3(
+	LexizeInit(&ldata, cfg);
+
+	do {
+		type = DatumGetInt32(FunctionCall3(
 											   &(prsobj->getlexeme_info),
 											   PointerGetDatum(prsobj->prs),
 											   PointerGetDatum(&lemm),
-										   PointerGetDatum(&lenlemm)))) != 0)
-	{
+										   PointerGetDatum(&lenlemm)));

-		if (lenlemm >= MAXSTRLEN)
+		if (type>0 && lenlemm >= MAXSTRLEN)
 		{
 #ifdef IGNORE_LONGLEXEME
 			ereport(NOTICE,
@ -320,25 +322,11 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 #endif
 		}

-		if (type >= cfg->len)	/* skip this type of lexeme */
-			continue;
+		LexizeAddLemm(&ldata, type, lemm, lenlemm);

-		for (i = 0; i < cfg->map[type].len; i++)
+		while(  (norms = LexizeExec(&ldata, NULL)) != NULL )
 		{
-			DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-			TSLexeme   *norms,
-					   *ptr;
-
-			norms = ptr = (TSLexeme *) DatumGetPointer(
-													   FunctionCall3(
-														&(dict->lexize_info),
-										   PointerGetDatum(dict->dictionary),
-													   PointerGetDatum(lemm),
-													 PointerGetDatum(lenlemm)
-																	 )
-				);
-			if (!norms)			/* dictionary doesn't know this lexeme */
-				continue;
+			TSLexeme *ptr = norms;

 			prs->pos++;			/* set pos */

@ -350,6 +338,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 					prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
 				}

+				if ( ptr->flags & TSL_ADDPOS )
+					prs->pos++;
 				prs->words[prs->curwords].len = strlen(ptr->lexeme);
 				prs->words[prs->curwords].word = ptr->lexeme;
 				prs->words[prs->curwords].nvariant = ptr->nvariant;
@ -359,9 +349,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 				prs->curwords++;
 			}
 			pfree(norms);
-			break;				/* lexeme already normalized or is stop word */
-		}
 	}
+	} while(type>0);

 	FunctionCall1(
 				  &(prsobj->end_info),
@ -417,14 +406,47 @@ hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen)
 	}
 }

+/*
+ * Feed one LexizeExec() result to the headline: add each real token of
+ * 'lexs' (type > 0), pass every entry of 'norms' to hlfinditem() once per
+ * token, then free the token list and the norms array.
+ */
+static void
+addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms) {
+	ParsedLex	*following;
+	TSLexeme	*cur;
+
+	for( ; lexs; lexs = following ) {
+		if ( lexs->type > 0 )
+			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
+
+		if ( norms )
+			for( cur = norms; cur->lexeme; cur++ )
+				hlfinditem(prs, query, cur->lexeme, strlen(cur->lexeme));
+
+		following = lexs->next;		/* fetch before the node is freed */
+		pfree( lexs );
+	}
+
+	if ( norms == NULL )
+		return;
+
+	/* the strings are freed one by one, then the array that held them */
+	for( cur = norms; cur->lexeme; cur++ )
+		pfree( cur->lexeme );
+	pfree(norms);
+}
+
 void
 hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
 {
 	int			type,
-				lenlemm,
-				i;
+				lenlemm;
 	char	   *lemm = NULL;
 	WParserInfo *prsobj = findprs(cfg->prs_id);
+	LexizeData	ldata;
+	TSLexeme	*norms;
+	ParsedLex	*lexs;

 	prsobj->prs = (void *) DatumGetPointer(
 										   FunctionCall2(
@ -434,14 +456,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
 														 )
 		);

-	while ((type = DatumGetInt32(FunctionCall3(
+	LexizeInit(&ldata, cfg);
+
+	do {
+		type = DatumGetInt32(FunctionCall3(
 											   &(prsobj->getlexeme_info),
 											   PointerGetDatum(prsobj->prs),
 											   PointerGetDatum(&lemm),
-										   PointerGetDatum(&lenlemm)))) != 0)
-	{
+									PointerGetDatum(&lenlemm)));

-		if (lenlemm >= MAXSTRLEN)
+		if (type>0 && lenlemm >= MAXSTRLEN)
 		{
 #ifdef IGNORE_LONGLEXEME
 			ereport(NOTICE,
@ -455,38 +479,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
 #endif
 		}

-		hladdword(prs, lemm, lenlemm, type);
+		LexizeAddLemm(&ldata, type, lemm, lenlemm);

-		if (type >= cfg->len)
-			continue;
+		do {
+			if ( (norms = LexizeExec(&ldata,&lexs)) != NULL ) 
+				addHLParsedLex(prs, query, lexs, norms);
+			else 
+				addHLParsedLex(prs, query, lexs, NULL);
+		} while( norms );

-		for (i = 0; i < cfg->map[type].len; i++)
-		{
-			DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-			TSLexeme   *norms,
-					   *ptr;
-
-			norms = ptr = (TSLexeme *) DatumGetPointer(
-													   FunctionCall3(
-														&(dict->lexize_info),
-										   PointerGetDatum(dict->dictionary),
-													   PointerGetDatum(lemm),
-													 PointerGetDatum(lenlemm)
-																	 )
-				);
-			if (!norms)			/* dictionary doesn't know this lexeme */
-				continue;
-
-			while (ptr->lexeme)
-			{
-				hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
-				pfree(ptr->lexeme);
-				ptr++;
-			}
-			pfree(norms);
-			break;				/* lexeme already normalized or is stop word */
-		}
-	}
+	} while( type>0 );

 	FunctionCall1(
 				  &(prsobj->end_info),
--- a/contrib/tsearch2/ts_lexize.c
+++ b/contrib/tsearch2/ts_lexize.c
@ -0,0 +1,261 @@
+/*
+ * lexize stream of lexemes 
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+#include <locale.h>
+
+#include "ts_cfg.h"
+#include "dict.h"
+
+/* Bind 'ld' to 'cfg'; reset its queues, dictionary position and pending result. */
+void
+LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
+	ld->cfg = cfg;
+	ld->posDict = 0;
+	ld->curDictId = InvalidOid;
+	ld->tmpRes = NULL;
+	ld->waste.tail = ld->waste.head = NULL;
+	ld->curSub = ld->lastRes = ld->towork.tail = ld->towork.head = NULL;
+}
+
+static void
+LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
+	if ( list->tail == NULL )
+		list->head = newpl;			/* first node of an empty list */
+	else
+		list->tail->next = newpl;	/* link behind the old tail */
+	list->tail = newpl;
+	newpl->next = NULL;				/* the appended node ends the list */
+}
+
+/* Detach the first node of 'list' and return it; NULL when the list is empty. */
+static ParsedLex*
+LPLRemoveHead(ListParsedLex *list) {
+	ParsedLex *first = list->head;
+
+	if ( first != NULL )
+		list->head = first->next;
+	/* nothing left (or nothing there to begin with): clear the tail too */
+	if ( list->head == NULL )
+		list->tail = NULL;
+	return first;
+}
+
+
+/* Queue one parser token at the tail of ld->towork; the node is allocated once. */
+void
+LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
+	ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
+
+	newpl->type = type;
+	newpl->lemm = lemm;				/* pointer is stored, text is not copied */
+	newpl->lenlemm = lenlemm;
+	LPLAddTail(&ld->towork, newpl);
+	ld->curSub = ld->towork.tail;	/* curSub now refers to the newest token */
+}
+
+/* Retire the head of towork onto waste and restart dictionary scanning at 0. */
+static void
+RemoveHead(LexizeData *ld) {
+	ld->posDict = 0;
+	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
+}
+
+/* Pass the waste list to the caller through *correspondLexem, or free it if NULL. */
+static void
+setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
+	ParsedLex	*node = ld->waste.head;
+
+	if ( correspondLexem != NULL )
+		*correspondLexem = node;	/* caller receives the whole chain */
+	else
+		while( node != NULL ) {
+			ParsedLex	*following = node->next;
+			pfree(node);
+			node = following;
+		}
+	ld->waste.tail = ld->waste.head = NULL;
+}
+
+/* Shift towork nodes to waste up to and including 'stop'; curSub resumes after it. */
+static void
+moveToWaste(LexizeData *ld, ParsedLex *stop) {
+	while( ld->towork.head != NULL ) {
+		bool	reached = ( ld->towork.head == stop );
+		if ( reached )
+			ld->curSub = stop->next;	/* read before RemoveHead() relinks the node */
+		RemoveHead(ld);
+		if ( reached )
+			break;
+	}
+}
+
+/* Remember 'res', produced at 'lex', as the pending result; free the previous one. */
+static void
+setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
+	TSLexeme	*old;
+	for( old = ld->tmpRes; old && old->lexeme; old++ )
+		pfree( old->lexeme );
+	if ( ld->tmpRes )
+		pfree( ld->tmpRes );
+	ld->lastRes = lex;
+	ld->tmpRes = res;
+}
+
+/*
+ * Run the dictionaries of ld->cfg over the lexemes queued in ld->towork.
+ * Returns the first non-NULL dictionary result found, or NULL when the
+ * queue is exhausted or the current dictionary still waits for further
+ * lexemes. Consumed lexemes are collected in ld->waste; setCorrLex() hands
+ * that list to the caller through correspondLexem, or frees it when
+ * correspondLexem is NULL.
+ */
+TSLexeme*
+LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
+	int i;
+	ListDictionary	*map;
+	DictInfo *dict;
+	TSLexeme	*res;
+
+	if ( ld->curDictId == InvalidOid ) {
+		/*
+		 * usual mode: a dictionary wants only one word, but we still
+		 * have to go through the whole queue of pending lexemes
+		 */
+		while( ld->towork.head ) {
+			ParsedLex	*curVal = ld->towork.head;
+
+			map = ld->cfg->map + curVal->type;
+
+			if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
+				/* skip this type of lexeme */
+				RemoveHead(ld);
+				continue;
+			}
+
+			for (i = ld->posDict; i < map->len; i++) {
+				dict = finddict(DatumGetObjectId(map->dict_id[i]));
+
+				ld->dictState.isend = ld->dictState.getnext = false;
+				ld->dictState.private = NULL;
+				res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+													&(dict->lexize_info),
+									   				PointerGetDatum(dict->dictionary),
+												   	PointerGetDatum(curVal->lemm),
+												 	Int32GetDatum(curVal->lenlemm),
+													PointerGetDatum(&ld->dictState)
+										 ));
+
+				if ( ld->dictState.getnext ) {
+					/*
+					 * dictionary wants the next word, so store the
+					 * current position and go to multiword mode
+					 */
+					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
+					ld->posDict = i+1;
+					ld->curSub = curVal->next;
+					if ( res )
+						setNewTmpRes(ld, curVal, res);
+					return LexizeExec(ld, correspondLexem);
+				}
+
+				if (!res)			/* dictionary doesn't know this lexeme */
+					continue;
+				/* first dictionary with a result wins: the lexeme is consumed */
+				RemoveHead(ld);
+				setCorrLex(ld, correspondLexem);
+				return res;
+			}
+			/* none of the tried dictionaries produced a result: drop the lexeme */
+			RemoveHead(ld);
+		}
+	} else { /* curDictId is valid */
+		dict = finddict(ld->curDictId);
+		/* Dictionary ld->curDictId asked us for the following words */
+
+		while( ld->curSub ) {
+			ParsedLex	*curVal = ld->curSub;
+
+			map = ld->cfg->map + curVal->type;
+
+			if (curVal->type != 0) {
+				bool dictExists = false;
+
+				if (curVal->type >= ld->cfg->len || map->len == 0 ) {
+					/* skip this type of lexeme */
+					ld->curSub = curVal->next;
+					continue;
+				}
+
+				/*
+				 * Make sure the current type of lexeme is handled by our
+				 * dictionary: check that it is in this type's dictionary list
+				 */
+				for(i=0;i < map->len && !dictExists; i++) 
+					if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
+						dictExists = true;
+
+				if ( !dictExists ) {
+					/*
+					 * Dictionary can't work with current type of lexeme,
+					 * return to basic mode and redo all stored lexemes
+					 */
+					ld->curDictId = InvalidOid;
+					return LexizeExec(ld, correspondLexem);
+				}
+			}
+			/* a type 0 entry is reported to the dictionary as end of input */
+			ld->dictState.isend = (curVal->type==0) ? true : false;
+			ld->dictState.getnext = false;
+
+			res = (TSLexeme *) DatumGetPointer( FunctionCall4(
+												&(dict->lexize_info),
+								   				PointerGetDatum(dict->dictionary),
+											   	PointerGetDatum(curVal->lemm),
+											 	Int32GetDatum(curVal->lenlemm),
+												PointerGetDatum(&ld->dictState)
+										 ));
+
+			if ( ld->dictState.getnext ) {
+				/* Dictionary wants one more */
+				ld->curSub = curVal->next;
+				if ( res )
+					setNewTmpRes(ld, curVal, res);
+				continue;
+			}
+
+			if ( res || ld->tmpRes ) {
+				/*
+				 * Dictionary normalized the lexemes: remove all used lexemes
+				 * from the queue, return to basic mode and redo the rest (if any)
+				 */
+				if ( res ) {
+					moveToWaste( ld, ld->curSub );
+				} else {
+					res = ld->tmpRes;
+					moveToWaste( ld, ld->lastRes );
+				}
+				/* NOTE(review): when res is set, an older ld->tmpRes is dropped without pfree - confirm */
+				/* reset to initial state */
+				ld->curDictId = InvalidOid;
+				ld->posDict = 0;
+				ld->lastRes = NULL;
+				ld->tmpRes = NULL;
+				setCorrLex(ld, correspondLexem);
+				return res;
+			}
+
+			/* Dict doesn't want the next lexeme and didn't recognize anything,
+			   redo from ld->towork.head */
+			ld->curDictId = InvalidOid;
+			return LexizeExec(ld, correspondLexem);
+		}
+	}
+
+	setCorrLex(ld, correspondLexem);
+	return NULL;
+}
+
--- a/contrib/tsearch2/tsearch.sql.in
+++ b/contrib/tsearch2/tsearch.sql.in
@ -146,6 +146,25 @@ insert into pg_ts_dict select
 	'Example of synonym dictionary'
 ;

+CREATE FUNCTION thesaurus_init(internal)	-- registered below as the template's init function
+	RETURNS internal
+	as 'MODULE_PATHNAME'
+	LANGUAGE C;
+
+CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal)	-- 4th arg: LexizeExec() passes &ld->dictState to every lexize call
+	RETURNS internal
+	as 'MODULE_PATHNAME'
+	LANGUAGE C
+	RETURNS NULL ON NULL INPUT;
+
+insert into pg_ts_dict select 
+	'thesaurus_template', 		-- dict_name (used by the example update near the end of this script)
+	'thesaurus_init(internal)',
+	null,						-- presumably dict_initoption, left unset until configured; verify column order
+	'thesaurus_lexize(internal,internal,int4,internal)',	-- must match the CREATE FUNCTION signature above
+	'Thesaurus template, must be pointed Dictionary and DictFile'
+;
+
 --dict conf
 CREATE TABLE pg_ts_parser (
 	prs_name	text not null primary key,
@ -1193,7 +1212,11 @@ AS

 --example of ISpell dictionary
 --update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;

+--example of synonym dict
+--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym';
+
+--example of thesaurus dict
+--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
+--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';
 END;
--- a/contrib/tsearch2/untsearch.sql.in
+++ b/contrib/tsearch2/untsearch.sql.in
@ -41,6 +41,8 @@ DROP FUNCTION snb_lexize(internal,internal,int4);
 DROP FUNCTION snb_ru_init(internal);
 DROP FUNCTION spell_init(internal);
 DROP FUNCTION spell_lexize(internal,internal,int4);
+DROP FUNCTION thesaurus_init(internal);
+DROP FUNCTION thesaurus_lexize(internal,internal,int4,internal);	-- argument list must match the 4-argument CREATE FUNCTION
 DROP FUNCTION syn_init(internal);
 DROP FUNCTION syn_lexize(internal,internal,int4);
 DROP FUNCTION set_curprs(int);