From 22505f4703abafeba216f889b7bb7bc065925096 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Wed, 31 May 2006 14:05:31 +0000 Subject: [PATCH] Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes. It required some changes in lexize algorithm, but interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc. --- contrib/tsearch2/Makefile | 8 +- contrib/tsearch2/common.c | 21 + contrib/tsearch2/common.h | 2 + contrib/tsearch2/dict.c | 78 ++- contrib/tsearch2/dict.h | 60 +- contrib/tsearch2/dict_thesaurus.c | 743 +++++++++++++++++++++++++ contrib/tsearch2/expected/tsearch2.out | 22 +- contrib/tsearch2/stopword.c | 23 +- contrib/tsearch2/thesaurus | 19 + contrib/tsearch2/ts_cfg.c | 126 ++--- contrib/tsearch2/ts_lexize.c | 261 +++++++++ contrib/tsearch2/tsearch.sql.in | 27 +- contrib/tsearch2/untsearch.sql.in | 2 + 13 files changed, 1260 insertions(+), 132 deletions(-) create mode 100644 contrib/tsearch2/dict_thesaurus.c create mode 100644 contrib/tsearch2/thesaurus create mode 100644 contrib/tsearch2/ts_lexize.c diff --git a/contrib/tsearch2/Makefile b/contrib/tsearch2/Makefile index 3e322bb85c..393e3fa902 100644 --- a/contrib/tsearch2/Makefile +++ b/contrib/tsearch2/Makefile @@ -1,13 +1,13 @@ -# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $ +# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $ MODULE_big = tsearch2 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ - dict_snowball.o dict_ispell.o dict_syn.o \ + dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \ wparser.o wparser_def.o \ ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \ tsvector_op.o rank.o ts_stat.o \ query_util.o query_support.o query_rewrite.o query_gist.o \ - ts_locale.o ginidx.o + ts_locale.o ts_lexize.o ginidx.o SUBDIRS := snowball ispell wordparser SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o) @@ -16,7 +16,7 @@ 
OBJS += $(SUBDIROBJS) PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser -DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 +DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus DATA_built = tsearch2.sql untsearch2.sql DOCS = README.tsearch2 REGRESS = tsearch2 diff --git a/contrib/tsearch2/common.c b/contrib/tsearch2/common.c index 4984c3d256..c7b9cd3c35 100644 --- a/contrib/tsearch2/common.c +++ b/contrib/tsearch2/common.c @@ -5,6 +5,7 @@ #include "catalog/pg_proc.h" #include "catalog/pg_namespace.h" #include "utils/syscache.h" +#include "miscadmin.h" #include "ts_cfg.h" #include "dict.h" @@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid) return nspoid; } + + /* if path is relative, take it as relative to share dir */ +char * +to_absfilename(char *filename) { + if (!is_absolute_path(filename)) { + char sharepath[MAXPGPATH]; + char *absfn; +#ifdef WIN32 + char delim = '\\'; +#else + char delim = '/'; +#endif + get_share_path(my_exec_path, sharepath); + absfn = palloc(strlen(sharepath) + strlen(filename) + 2); + sprintf(absfn, "%s%c%s", sharepath, delim, filename); + filename = absfn; + } + + return filename; +} diff --git a/contrib/tsearch2/common.h b/contrib/tsearch2/common.h index c84e841e15..d2f4cd66a1 100644 --- a/contrib/tsearch2/common.h +++ b/contrib/tsearch2/common.h @@ -16,6 +16,8 @@ text *mtextdup(text *in); int text_cmp(text *a, text *b); +char * to_absfilename(char *filename); + #define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) ) #define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x)) diff --git a/contrib/tsearch2/dict.c b/contrib/tsearch2/dict.c index 9d912353e9..2c37a2696b 100644 --- a/contrib/tsearch2/dict.c +++ b/contrib/tsearch2/dict.c @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */ /* * 
interface functions to dictionary @@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict) Datum opt; Oid oid = InvalidOid; + /* setup dictlexize method */ + oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull)); + if (isnull || oid == InvalidOid) + ts_error(ERROR, "Null dict_lexize for dictonary %d", id); + fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext); + + /* setup and call dictinit method, optinally */ oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull)); if (!(isnull || oid == InvalidOid)) { opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull); dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt)); } - oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull)); - if (isnull || oid == InvalidOid) - ts_error(ERROR, "Null dict_lexize for dictonary %d", id); - fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext); dict->dict_id = id; } else @@ -98,6 +101,29 @@ comparedict(const void *a, const void *b) return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1; } +static void +insertdict(Oid id) { + DictInfo newdict; + + if (DList.len == DList.reallen) + { + DictInfo *tmp; + int reallen = (DList.reallen) ? 2 * DList.reallen : 16; + + tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen); + if (!tmp) + ts_error(ERROR, "No memory"); + DList.reallen = reallen; + DList.list = tmp; + } + init_dict(id, &newdict); + + DList.list[DList.len] = newdict; + DList.len++; + + qsort(DList.list, DList.len, sizeof(DictInfo), comparedict); +} + DictInfo * finddict(Oid id) { @@ -117,23 +143,8 @@ finddict(Oid id) return DList.last_dict; } - /* last chance */ - if (DList.len == DList.reallen) - { - DictInfo *tmp; - int reallen = (DList.reallen) ? 
2 * DList.reallen : 16; - - tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen); - if (!tmp) - ts_error(ERROR, "No memory"); - DList.reallen = reallen; - DList.list = tmp; - } - DList.last_dict = &(DList.list[DList.len]); - init_dict(id, DList.last_dict); - - DList.len++; - qsort(DList.list, DList.len, sizeof(DictInfo), comparedict); + /* insert new dictionary */ + insertdict(id); return finddict(id); /* qsort changed order!! */ ; } @@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS) *ptr; Datum *da; ArrayType *a; + DictSubState dstate = { false, false, NULL }; SET_FUNCOID(); dict = finddict(PG_GETARG_OID(0)); ptr = res = (TSLexeme *) DatumGetPointer( - FunctionCall3(&(dict->lexize_info), - PointerGetDatum(dict->dictionary), - PointerGetDatum(VARDATA(in)), - Int32GetDatum(VARSIZE(in) - VARHDRSZ) + FunctionCall4(&(dict->lexize_info), + PointerGetDatum(dict->dictionary), + PointerGetDatum(VARDATA(in)), + Int32GetDatum(VARSIZE(in) - VARHDRSZ), + PointerGetDatum(&dstate) ) ); + + if (dstate.getnext) { + dstate.isend = true; + ptr = res = (TSLexeme *) DatumGetPointer( + FunctionCall4(&(dict->lexize_info), + PointerGetDatum(dict->dictionary), + PointerGetDatum(VARDATA(in)), + Int32GetDatum(VARSIZE(in) - VARHDRSZ), + PointerGetDatum(&dstate) + ) + ); + } + PG_FREE_IF_COPY(in, 1); if (!res) { diff --git a/contrib/tsearch2/dict.h b/contrib/tsearch2/dict.h index 7a6153c453..a0e9fe6fac 100644 --- a/contrib/tsearch2/dict.h +++ b/contrib/tsearch2/dict.h @@ -1,9 +1,10 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */ #ifndef __DICT_H__ #define __DICT_H__ #include "postgres.h" #include "fmgr.h" +#include "ts_cfg.h" typedef struct { @@ -29,6 +30,11 @@ DictInfo *finddict(Oid id); Oid name2id_dict(text *name); void reset_dict(void); +typedef struct { + bool isend; /* in: marks for lexize_info about text end is reached */ + bool getnext; /* 
out: dict wants next lexeme */ + void *private; /* internal dict state between calls with getnext == true */ +} DictSubState; /* simple parser of cfg string */ typedef struct @@ -45,17 +51,61 @@ typedef struct /* * number of variant of split word , for example Word 'fotballklubber' * (norwegian) has two varian to split: ( fotball, klubb ) and ( fot, - * ball, klubb ). So, dictionary should return: nvariant lexeme 1 - * fotball 1 klubb 2 fot 2 ball 2 klubb - * + * ball, klubb ). So, dictionary should return: + * nvariant lexeme + * 1 fotball + * 1 klubb + * 2 fot + * 2 ball + * 2 klubb */ uint16 nvariant; - /* currently unused */ uint16 flags; /* C-string */ char *lexeme; } TSLexeme; +#define TSL_ADDPOS 0x01 + + +/* + * Lexize subsystem + */ + +typedef struct ParsedLex { + int type; + char *lemm; + int lenlemm; + bool resfollow; + struct ParsedLex *next; +} ParsedLex; + +typedef struct ListParsedLex { + ParsedLex *head; + ParsedLex *tail; +} ListParsedLex; + +typedef struct { + TSCfgInfo *cfg; + Oid curDictId; + int posDict; + DictSubState dictState; + ParsedLex *curSub; + ListParsedLex towork; /* current list to work */ + ListParsedLex waste; /* list of lexemes that already lexized */ + + /* fields to store last variant to lexize (basically, thesaurus + or similar to, which wants several lexemes */ + + ParsedLex *lastRes; + TSLexeme *tmpRes; +} LexizeData; + + +void LexizeInit(LexizeData *ld, TSCfgInfo *cfg); +void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm); +TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem); + #endif diff --git a/contrib/tsearch2/dict_thesaurus.c b/contrib/tsearch2/dict_thesaurus.c new file mode 100644 index 0000000000..8e543a4db7 --- /dev/null +++ b/contrib/tsearch2/dict_thesaurus.c @@ -0,0 +1,743 @@ +/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */ + +/* + * thesaurus + * Teodor Sigaev + */ +#include "postgres.h" +#include "executor/spi.h" + +#include + 
+#include "dict.h" +#include "common.h" +#include "ts_locale.h" + +typedef struct LexemeInfo { + uint16 idsubst; /* entry's number in DictThesaurus->subst */ + uint16 posinsubst; /* pos info in entry */ + uint16 tnvariant; /* total num lexemes in one variant */ + struct LexemeInfo *nextentry; + struct LexemeInfo *nextvariant; +} LexemeInfo; + +typedef struct { + char *lexeme; + LexemeInfo *entries; +} TheLexeme; + +typedef struct { + uint16 lastlexeme; /* number lexemes to substitute */ + uint16 reslen; + TSLexeme *res; /* prepared substituted result */ +} TheSubstitute; + +typedef struct +{ + /* subdictionary to normalize lexemes */ + DictInfo subdict; + + /* Array to search lexeme by exact match */ + TheLexeme *wrds; + int nwrds; + int ntwrds; + + /* Storage of substituted result, n-th element is for + n-th expression */ + TheSubstitute *subst; + int nsubst; +} DictThesaurus; + +PG_FUNCTION_INFO_V1(thesaurus_init); +Datum thesaurus_init(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(thesaurus_lexize); +Datum thesaurus_lexize(PG_FUNCTION_ARGS); + +static void +freeDictThesaurus(DictThesaurus * d) +{ + free(d); +} + +static void +newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) { + TheLexeme *ptr; + + if ( d->nwrds >= d->ntwrds ) { + if ( d->ntwrds == 0 ) { + d->ntwrds = 16; + d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds); + } else { + d->ntwrds *= 2; + d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds); + } + if (!d->wrds) + elog(ERROR,"Out of memory"); + } + + ptr = d->wrds + d->nwrds; + d->nwrds++; + + if ( (ptr->lexeme = malloc(e-b+1)) == NULL ) + elog(ERROR,"Out of memory"); + + memcpy(ptr->lexeme, b, e-b); + ptr->lexeme[e-b] = '\0'; + + if ( (ptr->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ))==NULL ) + elog(ERROR,"Out of memory"); + + ptr->entries->nextentry=NULL; + ptr->entries->idsubst = idsubst; + ptr->entries->posinsubst = posinsubst; +} + +static void +addWrd( DictThesaurus *d, char *b, 
char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) { + static int nres=0; + static int ntres = 0; + TheSubstitute *ptr; + + if ( nwrd == 0 ) { + nres = ntres = 0; + + if ( idsubst <= d->nsubst ) { + if ( d->nsubst == 0 ) { + d->nsubst = 16; + d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst); + } else { + d->nsubst *= 2; + d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst); + } + if (!d->subst) + elog(ERROR,"Out of memory"); + } + } + + ptr = d->subst + idsubst; + + ptr->lastlexeme = posinsubst-1; + + if ( nres+1 >= ntres ) { + if ( ntres == 0 ) { + ntres = 2; + ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres ); + } else { + ntres *= 2; + ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres ); + } + + if ( !ptr->res ) + elog(ERROR,"Out of memory"); + } + + if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 ) + elog(ERROR,"Out of memory"); + memcpy(ptr->res[ nres ].lexeme, b, e-b); + ptr->res[ nres ].lexeme[e-b] = '\0'; + + ptr->res[ nres ].nvariant = nwrd; + ptr->res[ nres ].flags = TSL_ADDPOS; + + ptr->res[ ++nres ].lexeme = NULL; +} + +#define TR_WAITLEX 1 +#define TR_INLEX 2 +#define TR_WAITSUBS 3 +#define TR_INSUBS 4 + +static void +thesaurusRead( char *filename, DictThesaurus *d ) { + FILE *fh; + char str[BUFSIZ]; + int lineno=0; + uint16 idsubst = 0; + + fh = fopen(to_absfilename(filename), "r"); + if (!fh) + elog(ERROR,"Thesaurus: can't open '%s' file", filename); + + while( fgets(str, sizeof(str), fh)) { + char *ptr = str; + int state = TR_WAITLEX; + char *beginwrd = NULL; + uint16 posinsubst=0; + uint16 nwrd=0; + + lineno++; + + /* is it comment ? 
*/ + while( t_isspace(ptr) ) + ptr += pg_mblen(ptr); + if ( t_iseq(str, '#') || *str=='\0' || t_iseq(str, '\n') || t_iseq(str, '\r') ) + continue; + + pg_verifymbstr(ptr, strlen(ptr), false); + while(*ptr) { + if ( state == TR_WAITLEX ) { + if ( t_iseq(ptr, ':' ) ) { + if ( posinsubst == 0 ) { + fclose(fh); + elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno); + } + state = TR_WAITSUBS; + } else if ( !t_isspace(ptr) ) { + beginwrd = ptr; + state = TR_INLEX; + } + } else if ( state == TR_INLEX ) { + if ( t_iseq(ptr, ':') ) { + newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ ); + state = TR_WAITSUBS; + } else if ( t_isspace(ptr) ) { + newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ ); + state = TR_WAITLEX; + } + } else if ( state == TR_WAITSUBS ) { + if ( !t_isspace(ptr) ) { + beginwrd = ptr; + state = TR_INSUBS; + } + } else if ( state == TR_INSUBS ) { + if ( t_isspace(ptr) ) { + addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); + state = TR_WAITSUBS; + } + } else + elog(ERROR,"Thesaurus: Unknown state: %d", state); + + ptr += pg_mblen(ptr); + } + + if ( state == TR_INSUBS ) + addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); + + idsubst++; + + if ( !(nwrd && posinsubst) ) { + fclose(fh); + elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno); + } + + } + + d->nsubst = idsubst; + + fclose(fh); +} + +static TheLexeme* +addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) { + + if ( *nnw >= *tnm ) { + *tnm *= 2; + newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm); + if (!newwrds) + elog(ERROR,"Out of memory"); + } + + newwrds[ *nnw ].entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ); + if (!newwrds[ *nnw ].entries) + elog(ERROR,"Out of memory"); + + if ( lexeme && lexeme->lexeme ) { + newwrds[ *nnw ].lexeme = strdup( lexeme->lexeme ); + if ( !newwrds[ *nnw ].lexeme ) + elog(ERROR,"Out of memory"); + + newwrds[ *nnw ].entries->tnvariant = 
tnvariant; + } else { + newwrds[ *nnw ].lexeme = NULL; + newwrds[ *nnw ].entries->tnvariant = 1; + } + + newwrds[ *nnw ].entries->idsubst = src->idsubst; + newwrds[ *nnw ].entries->posinsubst = src->posinsubst; + + newwrds[ *nnw ].entries->nextentry = NULL; + + (*nnw)++; + return newwrds; +} + +static int +cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) { + if ( a==NULL || b==NULL ) + return 0; + + if ( a->idsubst == b->idsubst ) { + if ( a->posinsubst == b->posinsubst ) { + if ( a->tnvariant == b->tnvariant ) + return 0; + + return ( a->tnvariant > b->tnvariant ) ? 1 : -1; + } + + return ( a->posinsubst > b->posinsubst ) ? 1 : -1; + } + + return ( a->idsubst > b->idsubst ) ? 1 : -1; +} + +static int +cmpLexeme(TheLexeme *a, TheLexeme* b) { + if ( a->lexeme == NULL ) { + if ( b->lexeme == NULL ) + return 0; + else + return 1; + } else if ( b->lexeme == NULL ) + return -1; + + return strcmp( a->lexeme, b->lexeme ); +} + +static int +cmpLexemeQ(const void *a, const void *b) { + return cmpLexeme( (TheLexeme*)a, (TheLexeme*)b ); +} + +static int cmpTheLexeme(const void *a, const void *b) { + TheLexeme *la = (TheLexeme*)a; + TheLexeme *lb = (TheLexeme*)b; + int res; + + if ( (res=cmpLexeme(la, lb)) != 0 ) + return res; + + return -cmpLexemeInfo(la->entries, lb->entries); +} + +static void +compileTheLexeme(DictThesaurus *d) { + int i,nnw=0, tnm=16; + TheLexeme *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds; + + if (!newwrds) + elog(ERROR,"Out of memory"); + + for(i=0;inwrds;i++) { + TSLexeme *ptr = (TSLexeme*) DatumGetPointer( + FunctionCall4( + &(d->subdict.lexize_info), + PointerGetDatum(d->subdict.dictionary), + PointerGetDatum(d->wrds[i].lexeme), + Int32GetDatum(strlen(d->wrds[i].lexeme)), + PointerGetDatum(NULL) + ) + ); + + if ( !(ptr && ptr->lexeme) ) { + newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); + elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized 
word", d->wrds[i].lexeme); + } else { + while( ptr->lexeme ) { + TSLexeme *remptr = ptr+1; + int tnvar = 1; + int curvar = ptr->nvariant; + + /* compute n words in one variant */ + while( remptr->lexeme ) { + if ( remptr->nvariant != (remptr-1)->nvariant ) + break; + tnvar++; + remptr++; + } + + remptr = ptr; + while( remptr->lexeme && remptr->nvariant == curvar ) { + newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); + remptr++; + } + + ptr = remptr; + } + } + + free( d->wrds[i].lexeme ); + free( d->wrds[i].entries ); + } + + free( d->wrds ); + d->wrds = newwrds; + d->nwrds = nnw; + d->ntwrds = tnm; + + if ( d->nwrds > 1 ) { + qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme ); + + /* uniq */ + newwrds = d->wrds; + ptrwrds = d->wrds + 1; + while( ptrwrds - d->wrds < d->nwrds ) { + if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) { + if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) { + ptrwrds->entries->nextentry = newwrds->entries; + newwrds->entries = ptrwrds->entries; + } else + free( ptrwrds->entries ); + + if ( ptrwrds->lexeme ) + free( ptrwrds->lexeme ); + } else { + newwrds++; + *newwrds = *ptrwrds; + } + + ptrwrds++; + } + + d->nwrds = newwrds - d->wrds + 1; + d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds ); + } +} + +static void +compileTheSubstitute(DictThesaurus *d) { + int i; + + for(i=0;insubst;i++) { + TSLexeme *rem = d->subst[i].res, *outptr, *inptr; + int n=2; + + outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n ); + if ( d->subst[i].res == NULL ) + elog(ERROR,"Out of Memory"); + outptr->lexeme = NULL; + inptr = rem; + + while( inptr && inptr->lexeme ) { + TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer( + FunctionCall4( + &(d->subdict.lexize_info), + PointerGetDatum(d->subdict.dictionary), + PointerGetDatum(inptr->lexeme), + Int32GetDatum(strlen(inptr->lexeme)), + PointerGetDatum(NULL) + ) + ); + + reml = lexized; + if ( lexized ) { + int toset = 
(lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1; + + while( lexized->lexeme ) { + if ( outptr - d->subst[i].res + 1 >= n ) { + int diff = outptr - d->subst[i].res; + n *= 2; + d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n ); + if ( d->subst[i].res == NULL ) + elog(ERROR,"Out of Memory"); + outptr = d->subst[i].res + diff; + } + + *outptr = *lexized; + if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL ) + elog(ERROR,"Out of Memory"); + + outptr++; + lexized++; + } + + if ( toset > 0) + d->subst[i].res[toset].flags |= TSL_ADDPOS; + } + + if ( inptr->lexeme ) + free( inptr->lexeme ); + inptr++; + } + + d->subst[i].reslen = outptr - d->subst[i].res; + + free(rem); + } +} + +Datum +thesaurus_init(PG_FUNCTION_ARGS) +{ + DictThesaurus *d; + Map *cfg, + *pcfg; + text *in, *subdictname=NULL; + bool fileloaded = false; + + if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("Thesaurus confguration error"))); + + d = (DictThesaurus *) malloc(sizeof(DictThesaurus)); + if (!d) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + memset(d, 0, sizeof(DictThesaurus)); + + in = PG_GETARG_TEXT_P(0); + parse_cfgdict(in, &cfg); + PG_FREE_IF_COPY(in, 0); + pcfg = cfg; + while (pcfg->key) + { + if (pg_strcasecmp("DictFile", pcfg->key) == 0) + { + if (fileloaded) + { + freeDictThesaurus(d); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("Thesaurus file is already loaded"))); + } + fileloaded = true; + thesaurusRead( pcfg->value, d ); + } + else if (pg_strcasecmp("Dictionary", pcfg->key) == 0) + { + if (subdictname) + { + freeDictThesaurus(d); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("Thesaurus: SubDictionary is already defined"))); + } + subdictname = char2text( pcfg->value ); + } + else + { + freeDictThesaurus(d); + ereport(ERROR, + 
(errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized option: %s => %s", + pcfg->key, pcfg->value))); + } + pfree(pcfg->key); + pfree(pcfg->value); + pcfg++; + } + pfree(cfg); + + if (!fileloaded) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("Thesaurus file isn't defined"))); + + if ( subdictname ) { + DictInfo *subdictptr; + /* + * we already in SPI, but name2id_dict()/finddict() + * invoke SPI_connect() + */ + SPI_push(); + + subdictptr = finddict( name2id_dict( subdictname ) ); + + SPI_pop(); + + d->subdict = *subdictptr; + } else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("Thesaurus: SubDictionary isn't defined"))); + + compileTheLexeme( d ); + compileTheSubstitute(d); + + PG_RETURN_POINTER(d); +} + +static LexemeInfo* +findTheLexeme(DictThesaurus *d, char * lexeme) { + TheLexeme key = { lexeme, NULL }, *res; + + if ( d->nwrds == 0 ) + return NULL; + + res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ); + + if ( res == NULL ) + return NULL; + return res->entries; +} + +static bool +matchIdSubst(LexemeInfo *stored, uint16 idsubst) { + bool res = true; + + if (stored) { + res = false; + + for(; stored; stored=stored->nextvariant) + if ( stored->idsubst == idsubst ) { + res = true; + break; + } + } + + return res; +} + +static LexemeInfo* +findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) { + for(;;) { + int i; + LexemeInfo *ptr = newin[0]; + + for(i=0; iidsubst < ptr->idsubst) + newin[i] = newin[i]->nextentry; + + if ( newin[i] == NULL ) + return in; + + if ( newin[i]->idsubst > ptr->idsubst ) { + ptr = newin[i]; + i=-1; + continue; + } + + while(newin[i]->idsubst == ptr->idsubst) { + if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) { + ptr = newin[i]; + break; + } + + newin[i] = newin[i]->nextentry; + if ( newin[i] == NULL ) + return in; + } + + if ( newin[i]->idsubst != ptr->idsubst ) { + ptr = newin[i]; + 
i=-1; + continue; + } + } + + if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */ + + ptr->nextvariant = in; + in = ptr; + } + + /* step forward */ + for(i=0; inextentry; + } + + return NULL; +} + +static TSLexeme* +copyTSLexeme( TheSubstitute *ts ) { + TSLexeme *res; + uint16 i; + + res = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) ); + for(i=0;ireslen;i++) { + res[i] = ts->res[i]; + res[i].lexeme = pstrdup( ts->res[i].lexeme ); + } + + res[ts->reslen].lexeme = NULL; + + return res; +} + +static TSLexeme* +checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) { + *moreres = false; + while(info) { + Assert( info->idsubst < d->nsubst ); + if ( info->nextvariant ) + *moreres = true; + if ( d->subst[ info->idsubst ].lastlexeme == curpos ) + return copyTSLexeme( d->subst + info->idsubst ); + info = info->nextvariant; + } + + return NULL; +} + +Datum +thesaurus_lexize(PG_FUNCTION_ARGS) +{ + DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0); + DictSubState *dstate = (DictSubState*)PG_GETARG_POINTER(3); + TSLexeme *res=NULL; + LexemeInfo *stored, *info = NULL; + uint16 curpos = 0; + bool moreres = false; + + if ( dstate == NULL || PG_NARGS() < 4 ) + elog(ERROR,"Forbidden call of thesaurus or nested call"); + + if ( dstate->isend ) + PG_RETURN_POINTER(NULL); + stored = (LexemeInfo*) dstate->private; + + if (stored) + curpos = stored->posinsubst+1; + + res =(TSLexeme*) DatumGetPointer ( + FunctionCall4( + &(d->subdict.lexize_info), + PointerGetDatum(d->subdict.dictionary), + PG_GETARG_DATUM(1), + PG_GETARG_INT32(2), + PointerGetDatum(NULL) + ) + ); + + if ( res && res->lexeme ) { + TSLexeme *ptr = res , *basevar; + + while( ptr->lexeme ) { + uint16 nv = ptr->nvariant; + uint16 i,nlex = 0; + LexemeInfo **infos; + + basevar = ptr; + while( ptr->lexeme && nv == ptr->nvariant ) { + nlex++; + ptr++; + } + + infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex); + 
for(i=0;iprivate = (void*)info; + + if ( !info ) { + dstate->getnext = false; + PG_RETURN_POINTER(NULL); + } + + if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) { + dstate->getnext = moreres; + PG_RETURN_POINTER(res); + } + + dstate->getnext = true; + + PG_RETURN_POINTER(NULL); +} diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index 39a95b2f70..35c97c9b01 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -4,21 +4,21 @@ -- \set ECHO none psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict" -psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser" -psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg" -psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap" -psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined +psql:tsearch2.sql:177: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser" +psql:tsearch2.sql:276: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg" +psql:tsearch2.sql:283: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap" +psql:tsearch2.sql:389: NOTICE: type "tsvector" is not yet defined DETAIL: Creating a shell type definition. -psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell -psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined +psql:tsearch2.sql:394: NOTICE: argument type tsvector is only a shell +psql:tsearch2.sql:448: NOTICE: type "tsquery" is not yet defined DETAIL: Creating a shell type definition. 
-psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell -psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined +psql:tsearch2.sql:453: NOTICE: argument type tsquery is only a shell +psql:tsearch2.sql:611: NOTICE: type "gtsvector" is not yet defined DETAIL: Creating a shell type definition. -psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell -psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined +psql:tsearch2.sql:616: NOTICE: argument type gtsvector is only a shell +psql:tsearch2.sql:1106: NOTICE: type "gtsq" is not yet defined DETAIL: Creating a shell type definition. -psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell +psql:tsearch2.sql:1111: NOTICE: argument type gtsq is only a shell --tsvector SELECT '1'::tsvector; tsvector diff --git a/contrib/tsearch2/stopword.c b/contrib/tsearch2/stopword.c index f3894714d2..e6141f8390 100644 --- a/contrib/tsearch2/stopword.c +++ b/contrib/tsearch2/stopword.c @@ -4,8 +4,6 @@ */ #include "postgres.h" -#include "miscadmin.h" - #include "common.h" #include "dict.h" #include "ts_locale.h" @@ -36,30 +34,11 @@ readstoplist(text *in, StopList * s) s->len = 0; if (in && VARSIZE(in) - VARHDRSZ > 0) { - char *filename = text2char(in); + char *filename = to_absfilename(text2char(in)); FILE *hin; char buf[STOPBUFLEN]; int reallen = 0; - /* if path is relative, take it as relative to share dir */ - if (!is_absolute_path(filename)) - { - char sharepath[MAXPGPATH]; - char *absfn; -#ifdef WIN32 - char delim = '\\'; -#else - char delim = '/'; -#endif - - get_share_path(my_exec_path, sharepath); - absfn = palloc(strlen(sharepath) + strlen(filename) + 2); - sprintf(absfn, "%s%c%s", sharepath, delim, filename); - - pfree(filename); - filename = absfn; - } - if ((hin = fopen(filename, "r")) == NULL) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), diff --git a/contrib/tsearch2/thesaurus b/contrib/tsearch2/thesaurus new file mode 100644 index 0000000000..559164604f --- 
/dev/null +++ b/contrib/tsearch2/thesaurus @@ -0,0 +1,19 @@ +# +# Theasurus config file. Character ':' splits +# string to part: +# to be substituted string +# substituting string +# + +#one two three : 123 +#one two : 12 +#one : 1 +#two : 2 + +#foo bar : blah blah +#f bar : fbar +#e bar : ebar +#g bar bar : gbarbar +#asd:sdffff +#qwerty:qwer wert erty + diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c index a71cf97e59..5a662b7dbb 100644 --- a/contrib/tsearch2/ts_cfg.c +++ b/contrib/tsearch2/ts_cfg.c @@ -281,15 +281,15 @@ name2id_cfg(text *name) return id; } - void parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) { int type, - lenlemm, - i; + lenlemm; char *lemm = NULL; WParserInfo *prsobj = findprs(cfg->prs_id); + LexizeData ldata; + TSLexeme *norms; prsobj->prs = (void *) DatumGetPointer( FunctionCall2( @@ -299,14 +299,16 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) ) ); - while ((type = DatumGetInt32(FunctionCall3( + LexizeInit(&ldata, cfg); + + do { + type = DatumGetInt32(FunctionCall3( &(prsobj->getlexeme_info), PointerGetDatum(prsobj->prs), PointerGetDatum(&lemm), - PointerGetDatum(&lenlemm)))) != 0) - { + PointerGetDatum(&lenlemm))); - if (lenlemm >= MAXSTRLEN) + if (type>0 && lenlemm >= MAXSTRLEN) { #ifdef IGNORE_LONGLEXEME ereport(NOTICE, @@ -320,25 +322,11 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) #endif } - if (type >= cfg->len) /* skip this type of lexeme */ - continue; + LexizeAddLemm(&ldata, type, lemm, lenlemm); - for (i = 0; i < cfg->map[type].len; i++) + while( (norms = LexizeExec(&ldata, NULL)) != NULL ) { - DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i])); - TSLexeme *norms, - *ptr; - - norms = ptr = (TSLexeme *) DatumGetPointer( - FunctionCall3( - &(dict->lexize_info), - PointerGetDatum(dict->dictionary), - PointerGetDatum(lemm), - PointerGetDatum(lenlemm) - ) - ); - if (!norms) /* dictionary doesn't know this lexeme */ - 
continue; + TSLexeme *ptr = norms; prs->pos++; /* set pos */ @@ -350,6 +338,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD)); } + if ( ptr->flags & TSL_ADDPOS ) + prs->pos++; prs->words[prs->curwords].len = strlen(ptr->lexeme); prs->words[prs->curwords].word = ptr->lexeme; prs->words[prs->curwords].nvariant = ptr->nvariant; @@ -359,9 +349,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) prs->curwords++; } pfree(norms); - break; /* lexeme already normalized or is stop word */ - } } + } while(type>0); FunctionCall1( &(prsobj->end_info), @@ -417,14 +406,47 @@ hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen) } } +static void +addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms) { + ParsedLex *tmplexs; + TSLexeme *ptr; + + while( lexs ) { + + if ( lexs->type > 0 ) + hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type); + + ptr = norms; + while( ptr && ptr->lexeme ) { + hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme)); + ptr++; + } + + tmplexs = lexs->next; + pfree( lexs ); + lexs = tmplexs; + } + + if ( norms ) { + ptr = norms; + while( ptr->lexeme ) { + pfree( ptr->lexeme ); + ptr++; + } + pfree(norms); + } +} + void hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen) { int type, - lenlemm, - i; + lenlemm; char *lemm = NULL; WParserInfo *prsobj = findprs(cfg->prs_id); + LexizeData ldata; + TSLexeme *norms; + ParsedLex *lexs; prsobj->prs = (void *) DatumGetPointer( FunctionCall2( @@ -434,14 +456,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 ) ); - while ((type = DatumGetInt32(FunctionCall3( + LexizeInit(&ldata, cfg); + + do { + type = DatumGetInt32(FunctionCall3( &(prsobj->getlexeme_info), PointerGetDatum(prsobj->prs), PointerGetDatum(&lemm), - PointerGetDatum(&lenlemm)))) != 0) - { + 
PointerGetDatum(&lenlemm))); - if (lenlemm >= MAXSTRLEN) + if (type>0 && lenlemm >= MAXSTRLEN) { #ifdef IGNORE_LONGLEXEME ereport(NOTICE, @@ -455,38 +479,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 #endif } - hladdword(prs, lemm, lenlemm, type); + LexizeAddLemm(&ldata, type, lemm, lenlemm); - if (type >= cfg->len) - continue; + do { + if ( (norms = LexizeExec(&ldata,&lexs)) != NULL ) + addHLParsedLex(prs, query, lexs, norms); + else + addHLParsedLex(prs, query, lexs, NULL); + } while( norms ); - for (i = 0; i < cfg->map[type].len; i++) - { - DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i])); - TSLexeme *norms, - *ptr; - - norms = ptr = (TSLexeme *) DatumGetPointer( - FunctionCall3( - &(dict->lexize_info), - PointerGetDatum(dict->dictionary), - PointerGetDatum(lemm), - PointerGetDatum(lenlemm) - ) - ); - if (!norms) /* dictionary doesn't know this lexeme */ - continue; - - while (ptr->lexeme) - { - hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme)); - pfree(ptr->lexeme); - ptr++; - } - pfree(norms); - break; /* lexeme already normalized or is stop word */ - } - } + } while( type>0 ); FunctionCall1( &(prsobj->end_info), diff --git a/contrib/tsearch2/ts_lexize.c b/contrib/tsearch2/ts_lexize.c new file mode 100644 index 0000000000..c90848c6fd --- /dev/null +++ b/contrib/tsearch2/ts_lexize.c @@ -0,0 +1,261 @@ +/* + * lexize stream of lexemes + * Teodor Sigaev + */ +#include "postgres.h" + +#include +#include + +#include "ts_cfg.h" +#include "dict.h" + +void +LexizeInit(LexizeData *ld, TSCfgInfo *cfg) { + ld->cfg = cfg; + ld->curDictId = InvalidOid; + ld->posDict = 0; + ld->towork.head = ld->towork.tail = ld->curSub = NULL; + ld->waste.head = ld->waste.tail = NULL; + ld->lastRes=NULL; + ld->tmpRes=NULL; +} + +static void +LPLAddTail(ListParsedLex *list, ParsedLex *newpl) { + if ( list->tail ) { + list->tail->next = newpl; + list->tail = newpl; + } else + list->head = list->tail = newpl; + newpl->next = 
NULL; +} + +static ParsedLex* +LPLRemoveHead(ListParsedLex *list) { + ParsedLex *res = list->head; + + if ( list->head ) + list->head = list->head->next; + + if ( list->head == NULL ) + list->tail = NULL; + + return res; +} + + +void +LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) { + ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) ); + + newpl = (ParsedLex*)palloc( sizeof(ParsedLex) ); + newpl->type = type; + newpl->lemm = lemm; + newpl->lenlemm = lenlemm; + LPLAddTail(&ld->towork, newpl); + ld->curSub = ld->towork.tail; +} + +static void +RemoveHead(LexizeData *ld) { + LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork)); + + ld->posDict = 0; +} + +static void +setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) { + if ( correspondLexem ) { + *correspondLexem = ld->waste.head; + } else { + ParsedLex *tmp, *ptr = ld->waste.head; + + while(ptr) { + tmp = ptr->next; + pfree(ptr); + ptr = tmp; + } + } + ld->waste.head = ld->waste.tail = NULL; +} + +static void +moveToWaste(LexizeData *ld, ParsedLex *stop) { + bool go = true; + + while( ld->towork.head && go) { + if (ld->towork.head == stop) { + ld->curSub = stop->next; + go = false; + } + RemoveHead(ld); + } +} + +static void +setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) { + if ( ld->tmpRes ) { + TSLexeme *ptr; + for( ptr=ld->tmpRes; ptr->lexeme; ptr++ ) + pfree( ptr->lexeme ); + pfree( ld->tmpRes ); + } + ld->tmpRes = res; + ld->lastRes = lex; +} + +TSLexeme* +LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) { + int i; + ListDictionary *map; + DictInfo *dict; + TSLexeme *res; + + if ( ld->curDictId == InvalidOid ) { + /* + * usial mode: dictionary wants only one word, + * but we should keep in mind that we should go through + * all stack + */ + + while( ld->towork.head ) { + ParsedLex *curVal = ld->towork.head; + + map = ld->cfg->map + curVal->type; + + if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) { + /* skip this type of lexeme */ + 
RemoveHead(ld); + continue; + } + + for (i = ld->posDict; i < map->len; i++) { + dict = finddict(DatumGetObjectId(map->dict_id[i])); + + ld->dictState.isend = ld->dictState.getnext = false; + ld->dictState.private = NULL; + res = (TSLexeme *) DatumGetPointer( FunctionCall4( + &(dict->lexize_info), + PointerGetDatum(dict->dictionary), + PointerGetDatum(curVal->lemm), + Int32GetDatum(curVal->lenlemm), + PointerGetDatum(&ld->dictState) + )); + + if ( ld->dictState.getnext ) { + /* + * dictinary wants next word, so setup and store + * current position and go to multiword mode + */ + + ld->curDictId = DatumGetObjectId(map->dict_id[i]); + ld->posDict = i+1; + ld->curSub = curVal->next; + if ( res ) + setNewTmpRes(ld, curVal, res); + return LexizeExec(ld, correspondLexem); + } + + if (!res) /* dictionary doesn't know this lexeme */ + continue; + + RemoveHead(ld); + setCorrLex(ld, correspondLexem); + return res; + } + + RemoveHead(ld); + } + } else { /* curDictId is valid */ + dict = finddict(ld->curDictId); + + /* + * Dictionary ld->curDictId asks us about following words + */ + + while( ld->curSub ) { + ParsedLex *curVal = ld->curSub; + + map = ld->cfg->map + curVal->type; + + if (curVal->type != 0) { + bool dictExists = false; + + if (curVal->type >= ld->cfg->len || map->len == 0 ) { + /* skip this type of lexeme */ + ld->curSub = curVal->next; + continue; + } + + /* + * We should be sure that current type of lexeme is recognized by + * our dictinonary: we just check is it exist in + * list of dictionaries ? + */ + for(i=0;i < map->len && !dictExists; i++) + if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) ) + dictExists = true; + + if ( !dictExists ) { + /* + * Dictionary can't work with current tpe of lexeme, + * return to basic mode and redo all stored lexemes + */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + ld->dictState.isend = (curVal->type==0) ? 
true : false; + ld->dictState.getnext = false; + + res = (TSLexeme *) DatumGetPointer( FunctionCall4( + &(dict->lexize_info), + PointerGetDatum(dict->dictionary), + PointerGetDatum(curVal->lemm), + Int32GetDatum(curVal->lenlemm), + PointerGetDatum(&ld->dictState) + )); + + if ( ld->dictState.getnext ) { + /* Dictionary wants one more */ + ld->curSub = curVal->next; + if ( res ) + setNewTmpRes(ld, curVal, res); + continue; + } + + if ( res || ld->tmpRes ) { + /* + * Dictionary normalizes lexemes, + * so we remove from stack all used lexemes , + * return to basic mode and redo end of stack (if it exists) + */ + if ( res ) { + moveToWaste( ld, ld->curSub ); + } else { + res = ld->tmpRes; + moveToWaste( ld, ld->lastRes ); + } + + /* reset to initial state */ + ld->curDictId = InvalidOid; + ld->posDict = 0; + ld->lastRes = NULL; + ld->tmpRes = NULL; + setCorrLex(ld, correspondLexem); + return res; + } + + /* Dict don't want next lexem and didn't recognize anything, + redo from ld->towork.head */ + ld->curDictId = InvalidOid; + return LexizeExec(ld, correspondLexem); + } + } + + setCorrLex(ld, correspondLexem); + return NULL; +} + diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in index 76b4c5bb9e..39c6bf9b42 100644 --- a/contrib/tsearch2/tsearch.sql.in +++ b/contrib/tsearch2/tsearch.sql.in @@ -146,6 +146,25 @@ insert into pg_ts_dict select 'Example of synonym dictionary' ; +CREATE FUNCTION thesaurus_init(internal) + RETURNS internal + as 'MODULE_PATHNAME' + LANGUAGE C; + +CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal) + RETURNS internal + as 'MODULE_PATHNAME' + LANGUAGE C + RETURNS NULL ON NULL INPUT; + +insert into pg_ts_dict select + 'thesaurus_template', + 'thesaurus_init(internal)', + null, + 'thesaurus_lexize(internal,internal,int4,internal)', + 'Thesaurus template, must be pointed Dictionary and DictFile' +; + --dict conf CREATE TABLE pg_ts_parser ( prs_name text not null primary key, @@ -1193,7 +1212,11 @@ AS 
--example of ISpell dictionary --update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template'; ---example of synonym dict ---update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5; +--example of synonym dict +--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym'; + +--example of thesaurus dict +--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template'; +--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}'; END; diff --git a/contrib/tsearch2/untsearch.sql.in b/contrib/tsearch2/untsearch.sql.in index 2a658dfd93..f344f86b36 100644 --- a/contrib/tsearch2/untsearch.sql.in +++ b/contrib/tsearch2/untsearch.sql.in @@ -41,6 +41,8 @@ DROP FUNCTION snb_lexize(internal,internal,int4); DROP FUNCTION snb_ru_init(internal); DROP FUNCTION spell_init(internal); DROP FUNCTION spell_lexize(internal,internal,int4); +DROP FUNCTION thesaurus_init(internal); +DROP FUNCTION thesaurus_lexize(internal,internal,int4,internal); DROP FUNCTION syn_init(internal); DROP FUNCTION syn_lexize(internal,internal,int4); DROP FUNCTION set_curprs(int);