1 Fix affixes with void replacement (AFAIK, it's only russian)

2 Optimize regex execution
This commit is contained in:
Teodor Sigaev 2004-06-23 11:06:11 +00:00
parent 153d5d31eb
commit de55c0cef6
6 changed files with 338 additions and 75 deletions

View File

@ -1,4 +1,4 @@
# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $
subdir = contrib/tsearch2/ispell
top_builddir = ../../..
@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global
PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS)
override CFLAGS += $(CFLAGS_SL)
SUBOBJS = spell.o
SUBOBJS = spell.o regis.o
all: SUBSYS.o

View File

@ -0,0 +1,151 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "regis.h"
#include "common.h"
int
RS_isRegis(const char *str) {
unsigned char *ptr=(unsigned char *)str;
while(ptr && *ptr)
if ( isalpha(*ptr) || *ptr=='[' || *ptr==']' || *ptr=='^')
ptr++;
else
return 0;
return 1;
}
#define RS_IN_ONEOF 1
#define RS_IN_ONEOF_IN 2
#define RS_IN_NONEOF 3
#define RS_IN_WAIT 4
static RegisNode*
newRegisNode(RegisNode *prev, int len) {
RegisNode *ptr;
ptr = (RegisNode*)malloc(RNHDRSZ+len+1);
if (!ptr)
ts_error(ERROR, "No memory");
memset(ptr,0,RNHDRSZ+len+1);
if (prev)
prev->next=ptr;
return ptr;
}
int
RS_compile(Regis *r, int issuffix, const char *str) {
int i,len = strlen(str);
int state = RS_IN_WAIT;
RegisNode *ptr=NULL;
memset(r,0,sizeof(Regis));
r->issuffix = (issuffix) ? 1 : 0;
for(i=0;i<len;i++) {
unsigned char c = *( ( (unsigned char*)str ) + i );
if ( state == RS_IN_WAIT ) {
if ( isalpha(c) ) {
if ( ptr )
ptr = newRegisNode(ptr,len);
else
ptr = r->node = newRegisNode(NULL,len);
ptr->data[ 0 ] = c;
ptr->type = RSF_ONEOF;
ptr->len=1;
} else if ( c=='[' ) {
if ( ptr )
ptr = newRegisNode(ptr,len);
else
ptr = r->node = newRegisNode(NULL,len);
ptr->type = RSF_ONEOF;
state=RS_IN_ONEOF;
} else
ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
} else if ( state == RS_IN_ONEOF ) {
if ( c=='^' ) {
ptr->type = RSF_NONEOF;
state=RS_IN_NONEOF;
} else if ( isalpha(c) ) {
ptr->data[ 0 ] = c;
ptr->len=1;
state=RS_IN_ONEOF_IN;
} else
ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
} else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) {
if ( isalpha(c) ) {
ptr->data[ ptr->len ] = c;
ptr->len++;
} else if ( c==']' ) {
state=RS_IN_WAIT;
} else
ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
} else
ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
}
ptr = r->node;
while(ptr) {
r->nchar++;
ptr=ptr->next;
}
return 0;
}
void
RS_free(Regis *r) {
RegisNode *ptr=r->node,*tmp;
while(ptr) {
tmp=ptr->next;
free(ptr);
ptr = tmp;
}
r->node = NULL;
}
int
RS_execute(Regis *r, const char *str, int len) {
RegisNode *ptr=r->node;
unsigned char *c;
if (len<0)
len=strlen(str);
if (len<r->nchar)
return 0;
if ( r->issuffix )
c = ((unsigned char*)str) + len - r->nchar;
else
c = (unsigned char*)str;
while(ptr) {
switch(ptr->type) {
case RSF_ONEOF:
if ( ptr->len==0 ) {
if ( *c != *(ptr->data) )
return 0;
} else if ( strchr((char*)ptr->data, *c) == NULL )
return 0;
break;
case RSF_NONEOF:
if ( ptr->len==0 ) {
if ( *c == *(ptr->data) )
return 0;
} else if ( strchr((char*)ptr->data, *c) != NULL )
return 0;
break;
default:
ts_error(ERROR,"RS_execute: Unknown type node: %d\n", ptr->type);
}
ptr=ptr->next;
c++;
}
return 1;
}

View File

@ -0,0 +1,34 @@
#ifndef __REGIS_H__
#define __REGIS_H__
#include "postgres.h"
typedef struct RegisNode {
uint32
type:2,
len:16,
unused:14;
struct RegisNode *next;
unsigned char data[1];
} RegisNode;
#define RNHDRSZ (sizeof(uint32)+sizeof(void*))
#define RSF_ONEOF 1
#define RSF_NONEOF 2
typedef struct Regis {
RegisNode *node;
uint32
issuffix:1,
nchar:16,
unused:15;
} Regis;
int RS_isRegis(const char *str);
int RS_compile(Regis *r, int issuffix, const char *str);
void RS_free(Regis *r);
/*×ÏÚ×ÒÁÝÁÅÔ 1 ÅÓÌÉ ÍÁÔÞÉÔÓÑ */
int RS_execute(Regis *r, const char *str, int len);
#endif

View File

@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
{
SPNode *node = Conf->Dictionary;
SPNodeData *StopLow, *StopHigh, *StopMiddle;
int level=0, wrdlen=strlen(word);
uint8 *ptr =(uint8*)word;
while( node && level<wrdlen) {
while( node && *ptr) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
if ( StopMiddle->val == ((uint8*)(word))[level] ) {
if ( wrdlen==level+1 && StopMiddle->isword ) {
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if ( StopMiddle->val == *ptr ) {
if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
if ( compoundonly && !StopMiddle->compoundallow )
return 0;
if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
return 1;
}
node=StopMiddle->node;
level++;
ptr++;
break;
} else if ( StopMiddle->val < ((uint8*)(word))[level] ) {
} else if ( StopMiddle->val < *ptr ) {
StopLow = StopMiddle + 1;
} else {
StopHigh = StopMiddle;
@ -236,19 +236,32 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
}
MEMOUT(Conf->Affix);
}
if (type == 's')
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
Conf->Affix[Conf->naffixes].compile = 1;
Conf->Affix[Conf->naffixes].flagflags = flagflags;
Conf->Affix[Conf->naffixes].flag = flag;
Conf->Affix[Conf->naffixes].type = type;
strcpy(Conf->Affix[Conf->naffixes].find, find);
strcpy(Conf->Affix[Conf->naffixes].repl, repl);
Conf->Affix[Conf->naffixes].replen = strlen(repl);
Conf->naffixes++;
if ( strcmp(mask,".")==0 ) {
Conf->Affix[Conf->naffixes].issimple=1;
Conf->Affix[Conf->naffixes].isregis=0;
*( Conf->Affix[Conf->naffixes].mask )='\0';
} else if ( RS_isRegis(mask) ) {
Conf->Affix[Conf->naffixes].issimple=0;
Conf->Affix[Conf->naffixes].isregis=1;
strcpy(Conf->Affix[Conf->naffixes].mask, mask);
} else {
Conf->Affix[Conf->naffixes].issimple=0;
Conf->Affix[Conf->naffixes].isregis=0;
if (type == FF_SUFFIX)
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
}
Conf->Affix[Conf->naffixes].compile = 1;
Conf->Affix[Conf->naffixes].flagflags = flagflags;
Conf->Affix[Conf->naffixes].flag = flag;
Conf->Affix[Conf->naffixes].type = type;
strcpy(Conf->Affix[Conf->naffixes].find, find);
strcpy(Conf->Affix[Conf->naffixes].repl, repl);
Conf->Affix[Conf->naffixes].replen = strlen(repl);
Conf->naffixes++;
return (0);
}
@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
continue;
}
NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
}
fclose(affix);
@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) {
return rs;
}
static void
mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
int i,cnt=0;
int start = (issuffix) ? startsuffix : 0;
int end = (issuffix) ? Conf->naffixes : startsuffix;
AffixNode *Affix = (AffixNode*)malloc( ANHRDSZ + sizeof(AffixNodeData));
MEMOUT(Affix);
memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData) );
Affix->length=1;
Affix->isvoid=1;
if (issuffix) {
Affix->data->node=Conf->Suffix;
Conf->Suffix = Affix;
} else {
Affix->data->node=Conf->Prefix;
Conf->Prefix = Affix;
}
for(i=start;i<end;i++)
if (Conf->Affix[i].replen==0)
cnt++;
if ( cnt==0 )
return;
Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt );
MEMOUT(Affix->data->aff);
Affix->data->naff = (uint32)cnt;
cnt=0;
for(i=start;i<end;i++)
if (Conf->Affix[i].replen==0) {
Affix->data->aff[cnt] = Conf->Affix + i;
cnt++;
}
}
void
NISortAffixes(IspellDict * Conf)
{
@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf)
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
mkVoidAffix(Conf, 1, firstsuffix);
mkVoidAffix(Conf, 0, firstsuffix);
}
static AffixNodeData*
@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
AffixNodeData *StopLow, *StopHigh, *StopMiddle;
uint8 symbol;
if ( node->isvoid ) { /* search void affixes */
if (node->data->naff)
return node->data;
node = node->data->node;
}
while( node && *level<wrdlen) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
symbol = GETWCHAR(word,wrdlen,*level,type);
if ( StopMiddle->val == symbol ) {
(*level)++;
if ( StopMiddle->naff )
return StopMiddle;
node=StopMiddle->node;
(*level)++;
break;
} else if ( StopMiddle->val < symbol ) {
StopLow = StopMiddle + 1;
@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
static char *
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
int dat_len;
if ( flagflags & FF_COMPOUNDONLYAFX ) {
if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
return NULL;
}
if ( Affix->type=='s' ) {
if ( Affix->type==FF_SUFFIX ) {
strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find);
} else {
@ -639,34 +695,50 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
strcat(newword, word + Affix->replen);
}
if (Affix->compile)
{
int wmasklen,masklen = strlen(Affix->mask);
pg_wchar *mask;
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
pfree(mask);
if (err)
if ( Affix->issimple ) {
return newword;
} else if ( Affix->isregis ) {
if (Affix->compile) {
RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
Affix->compile = 0;
}
if ( RS_execute(&(Affix->reg.regis), newword, -1) )
return newword;
} else {
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
int dat_len;
if (Affix->compile)
{
/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
pg_regfree(&(Affix->reg));
return (NULL);
int wmasklen,masklen = strlen(Affix->mask);
pg_wchar *mask;
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
pfree(mask);
if (err)
{
/* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
pg_regfree(&(Affix->reg.regex));
return (NULL);
}
Affix->compile = 0;
}
Affix->compile = 0;
}
/* Convert data string to wide characters */
dat_len = strlen(newword);
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len);
/* Convert data string to wide characters */
dat_len = strlen(newword);
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len);
if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) {
pfree(data);
return newword;
if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
pfree(data);
return newword;
}
pfree(data);
}
pfree(data);
return NULL;
}
@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
}
}
pnode = prefix->node;
plevel++;
}
/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
}
}
pnode = prefix->node;
plevel++;
}
}
}
snode=suffix->node;
slevel++;
}
if (cur == forms) {
@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf)
for (i = 0; i < Conf->naffixes; i++)
{
if (Affix[i].compile == 0)
pg_regfree(&(Affix[i].reg));
if (Affix[i].compile == 0) {
if ( Affix[i].isregis )
RS_free(&(Affix[i].reg.regis));
else
pg_regfree(&(Affix[i].reg.regex));
}
}
if (Conf->Spell) {
for (i = 0; i < Conf->nspell; i++)

View File

@ -3,6 +3,7 @@
#include <sys/types.h>
#include "regex/regex.h"
#include "regis.h"
#include "c.h"
@ -40,20 +41,29 @@ typedef struct spell_struct
typedef struct aff_struct
{
char flag;
char flagflags;
char type;
char mask[33];
char find[16];
char repl[16];
regex_t reg;
size_t replen;
char compile;
uint32
flag:8,
type:2,
compile:1,
flagflags:3,
issimple:1,
isregis:1,
unused:1,
replen:16;
char mask[32];
char find[16];
char repl[16];
union {
regex_t regex;
Regis regis;
} reg;
} AFFIX;
#define FF_CROSSPRODUCT 0x01
#define FF_COMPOUNDWORD 0x02
#define FF_COMPOUNDONLYAFX 0x04
#define FF_SUFFIX 2
#define FF_PREFIX 1
struct AffixNode;
@ -66,18 +76,13 @@ typedef struct {
} AffixNodeData;
typedef struct AffixNode {
uint32 length;
uint32 isvoid:1,
length:31;
AffixNodeData data[1];
} AffixNode;
#define ANHRDSZ (sizeof(uint32))
typedef struct Tree_struct
{
int Left[256],
Right[256];
} Tree_struct;
typedef struct {
char *affix;
int len;

View File

@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops
FUNCTION 1 tsvector_cmp(tsvector, tsvector);
--example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4;
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
END;