Allow words in a substitution to be marked so that they are not lexized.

Docs will be submitted later; for now they are at
 http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
This commit is contained in:
Teodor Sigaev 2006-06-06 16:25:55 +00:00
parent 63e464a5e6
commit 92bcb5abe0
2 changed files with 69 additions and 30 deletions

View File

@ -1,4 +1,4 @@
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
/* /*
* thesaurus * thesaurus
@ -13,6 +13,11 @@
#include "common.h" #include "common.h"
#include "ts_locale.h" #include "ts_locale.h"
/*
* Temporarily we use TSLexeme.flags for internal use...
*/
#define DT_USEASIS 0x1000
typedef struct LexemeInfo { typedef struct LexemeInfo {
uint16 idsubst; /* entry's number in DictThesaurus->subst */ uint16 idsubst; /* entry's number in DictThesaurus->subst */
uint16 posinsubst; /* pos info in entry */ uint16 posinsubst; /* pos info in entry */
@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
} }
static void static void
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) { addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) {
static int nres=0; static int nres=0;
static int ntres = 0; static int ntres = 0;
TheSubstitute *ptr; TheSubstitute *ptr;
@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
ptr->res[ nres ].lexeme[e-b] = '\0'; ptr->res[ nres ].lexeme[e-b] = '\0';
ptr->res[ nres ].nvariant = nwrd; ptr->res[ nres ].nvariant = nwrd;
ptr->res[ nres ].flags = TSL_ADDPOS; if ( useasis )
ptr->res[ nres ].flags = DT_USEASIS;
else
ptr->res[ nres ].flags = 0;
ptr->res[ ++nres ].lexeme = NULL; ptr->res[ ++nres ].lexeme = NULL;
} }
@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
char str[BUFSIZ]; char str[BUFSIZ];
int lineno=0; int lineno=0;
uint16 idsubst = 0; uint16 idsubst = 0;
bool useasis=false;
fh = fopen(to_absfilename(filename), "r"); fh = fopen(to_absfilename(filename), "r");
if (!fh) if (!fh)
@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
state = TR_WAITLEX; state = TR_WAITLEX;
} }
} else if ( state == TR_WAITSUBS ) { } else if ( state == TR_WAITSUBS ) {
if ( !t_isspace(ptr) ) { if ( t_iseq(ptr, '*') ) {
useasis = true;
state = TR_INSUBS;
beginwrd = ptr + pg_mblen(ptr);
} else if ( t_iseq(ptr, '\\') ) {
useasis = false;
state = TR_INSUBS;
beginwrd = ptr + pg_mblen(ptr);
} else if ( !t_isspace(ptr) ) {
useasis = false;
beginwrd = ptr; beginwrd = ptr;
state = TR_INSUBS; state = TR_INSUBS;
} }
} else if ( state == TR_INSUBS ) { } else if ( state == TR_INSUBS ) {
if ( t_isspace(ptr) ) { if ( t_isspace(ptr) ) {
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); if ( ptr == beginwrd )
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
state = TR_WAITSUBS; state = TR_WAITSUBS;
} }
} else } else
@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
ptr += pg_mblen(ptr); ptr += pg_mblen(ptr);
} }
if ( state == TR_INSUBS ) if ( state == TR_INSUBS ) {
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst ); if ( ptr == beginwrd )
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
}
idsubst++; idsubst++;
@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
elog(ERROR,"Out of memory"); elog(ERROR,"Out of memory");
for(i=0;i<d->nwrds;i++) { for(i=0;i<d->nwrds;i++) {
TSLexeme *ptr = (TSLexeme*) DatumGetPointer( TSLexeme *ptr;
ptr = (TSLexeme*) DatumGetPointer(
FunctionCall4( FunctionCall4(
&(d->subdict.lexize_info), &(d->subdict.lexize_info),
PointerGetDatum(d->subdict.dictionary), PointerGetDatum(d->subdict.dictionary),
@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
if ( !(ptr && ptr->lexeme) ) { if ( !(ptr && ptr->lexeme) ) {
if ( !ptr ) if ( !ptr )
elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary", d->wrds[i].lexeme); elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1 );
else else
elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word", d->wrds[i].lexeme); elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1);
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
} else { } else {
@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
inptr = rem; inptr = rem;
while( inptr && inptr->lexeme ) { while( inptr && inptr->lexeme ) {
TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer( TSLexeme *lexized, tmplex[2];
FunctionCall4(
&(d->subdict.lexize_info), if ( inptr->flags & DT_USEASIS ) { /* do not lexize */
PointerGetDatum(d->subdict.dictionary), tmplex[0] = *inptr;
PointerGetDatum(inptr->lexeme), tmplex[0].flags = 0;
Int32GetDatum(strlen(inptr->lexeme)), tmplex[1].lexeme = NULL;
PointerGetDatum(NULL) lexized = tmplex;
) } else {
); lexized = (TSLexeme*) DatumGetPointer(
FunctionCall4(
&(d->subdict.lexize_info),
PointerGetDatum(d->subdict.dictionary),
PointerGetDatum(inptr->lexeme),
Int32GetDatum(strlen(inptr->lexeme)),
PointerGetDatum(NULL)
)
);
}
reml = lexized;
if ( lexized && lexized->lexeme ) { if ( lexized && lexized->lexeme ) {
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1; int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
if ( toset > 0) if ( toset > 0)
d->subst[i].res[toset].flags |= TSL_ADDPOS; d->subst[i].res[toset].flags |= TSL_ADDPOS;
} else if ( lexized ) {
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1);
} else { } else {
elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored", inptr->lexeme); elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1);
} }
if ( inptr->lexeme ) if ( inptr->lexeme )
@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
} }
if ( outptr == d->subst[i].res ) if ( outptr == d->subst[i].res )
elog(ERROR,"Thesaurus: all words in subsitution aren't recognized by subdictionary"); elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1);
d->subst[i].reslen = outptr - d->subst[i].res; d->subst[i].reslen = outptr - d->subst[i].res;

View File

@ -1,14 +1,16 @@
# #
# Theasurus config file. Character ':' splits # Theasurus config file. Character ':' splits
# string to part: # string to part, example:
# to be substituted string # sample-words : substitute-words
# substituting string
# #
# Any substitute-word can be marked with a preceding '*' character,
# which means that this word is not lexized
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
#one two three : 123 #one two three : *123
#one two : 12 #one two : *12
#one : 1 #one : *1
#two : 2 #two : *2
#foo bar : blah blah #foo bar : blah blah
#f bar : fbar #f bar : fbar