postgresql/contrib/tsearch2/ispell/spell.c

527 lines
12 KiB
C
Raw Normal View History

2003-07-21 12:27:44 +02:00
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "postgres.h"
#include "spell.h"
/*
 * Longest word (in bytes) that NormalizeWord() will process; the scratch
 * buffers used while applying affixes are sized as 2*MAXNORMLEN.
 */
#define MAXNORMLEN 56
/*
 * Case-insensitive check of whether x starts with y; evaluates to 0 on a
 * match.  Note that y is evaluated twice.
 */
#define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y)))
static int cmpspell(const void *s1,const void *s2){
return(strcmp(((const SPELL*)s1)->word,((const SPELL*)s2)->word));
}
/*
 * Lower-case a NUL-terminated string in place.  Every byte is handed to
 * tolower() as an unsigned char value.
 */
static void
strlower(char *str)
{
    unsigned char *cursor;

    for (cursor = (unsigned char *) str; *cursor != '\0'; cursor++)
        *cursor = tolower(*cursor);
}
/*
 * Backward string comparison for suffix tree operations.
 *
 * Compares s1 and s2 starting at their last characters and walking towards
 * the front.  Returns -1, 0 or 1.  If one string is a proper tail of the
 * other, the shorter one compares as smaller.
 */
static int
strbcmp(const char *s1, const char *s2)
{
    int i1 = strlen(s1) - 1;
    int i2 = strlen(s2) - 1;

    for (; i1 >= 0 && i2 >= 0; i1--, i2--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* At least one string is exhausted; the one with characters left wins. */
    if (i1 != i2)
        return (i1 < i2) ? -1 : 1;
    return 0;
}
/*
 * Backward comparison of at most 'count' trailing characters.
 *
 * Works like strbcmp() but stops after 'count' characters have been
 * compared from the end; if all of them are equal the result is 0.
 * Returns -1, 0 or 1.
 */
static int
strbncmp(const char *s1, const char *s2, size_t count)
{
    int i1 = strlen(s1) - 1;
    int i2 = strlen(s2) - 1;
    int remaining = count;

    for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* The requested number of characters matched. */
    if (remaining == 0)
        return 0;

    /* A string ran out before 'count' characters were compared. */
    if (i1 != i2)
        return (i1 < i2) ? -1 : 1;
    return 0;
}
static int
cmpaffix(const void *s1,const void *s2){
if (((const AFFIX*)s1)->type < ((const AFFIX*)s2)->type) return -1;
if (((const AFFIX*)s1)->type > ((const AFFIX*)s2)->type) return 1;
if (((const AFFIX*)s1)->type == 'p')
return(strcmp(((const AFFIX*)s1)->repl,((const AFFIX*)s2)->repl));
else
return(strbcmp(((const AFFIX*)s1)->repl,((const AFFIX*)s2)->repl));
}
/*
 * Append one dictionary entry (a word and its affix flags) to Conf->Spell.
 *
 * The array grows in steps of 20*1024 entries.  The word is copied with
 * strdup().  At most 9 flag characters are kept and the stored flag string
 * is always NUL-terminated, so later strchr() lookups on it stay in bounds.
 *
 * Raises ERROR (out of memory) if an allocation fails; Conf->Spell and
 * Conf->mspell are left untouched in that case.  Returns 0 on success.
 */
int
AddSpell(IspellDict * Conf, const char *word, const char *flag)
{
    SPELL *entry;

    if (Conf->nspell >= Conf->mspell)
    {
        const int step = 1024 * 20;
        SPELL *grown;

        if (Conf->mspell)
            grown = (SPELL *) realloc(Conf->Spell,
                                      (Conf->mspell + step) * sizeof(SPELL));
        else
            grown = (SPELL *) malloc(step * sizeof(SPELL));

        if (grown == NULL)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));

        /* Commit the new array only once the allocation has succeeded. */
        Conf->Spell = grown;
        Conf->mspell += step;
    }

    entry = &Conf->Spell[Conf->nspell];

    entry->word = strdup(word);
    if (entry->word == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    /*
     * strncpy() does not terminate the destination when the source has 10
     * or more characters, so terminate the 10-byte window explicitly.
     */
    strncpy(entry->flag, flag, 10);
    entry->flag[9] = '\0';

    Conf->nspell++;
    return 0;
}
/*
 * Load an ispell word list from 'filename' into Conf.
 *
 * Each line is either "word" or "word/FLAGS".  FLAGS is cut at the first
 * character that is not an ASCII letter.  The word part is lower-cased and
 * cut at the first CR or LF, then passed to AddSpell() together with the
 * flags (an empty string when the line has no '/').
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
ImportDictionary(IspellDict * Conf, const char *filename)
{
    char str[BUFSIZ];
    FILE *dict;

    dict = fopen(filename, "r");
    if (dict == NULL)
        return 1;

    while (fgets(str, sizeof(str), dict))
    {
        const char *flag = "";
        char *s = strchr(str, '/');

        if (s != NULL)
        {
            /* Split the line at '/': str keeps the word, flag the rest. */
            *s++ = '\0';
            flag = s;
            while ((*s >= 'A' && *s <= 'Z') || (*s >= 'a' && *s <= 'z'))
                s++;
            *s = '\0';
        }

        /* Only the word part is lower-cased; flags keep their case. */
        strlower(str);

        /* Drop the line ending from a word that had no flags. */
        for (s = str; *s; s++)
        {
            if (*s == '\r' || *s == '\n')
            {
                *s = '\0';
                break;
            }
        }

        AddSpell(Conf, str, flag);
    }

    fclose(dict);
    return 0;
}
/*
 * Look up 'word' in the sorted dictionary.
 *
 * The search is confined to the slice of Conf->Spell that SpellTree records
 * for the first byte of 'word'.  An entry matches when its text equals
 * 'word' and, if affixflag is non-zero, its flag string contains affixflag.
 *
 * Each round probes the middle, the left end and the right end of the
 * current range (in that order), then narrows the range using the result
 * of the middle comparison while also stepping the opposite end inwards.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static SPELL *
FindWord(IspellDict * Conf, const char *word, int affixflag)
{
    int first = (int) (*word) & 255;
    int lo = Conf->SpellTree.Left[first];
    int hi = Conf->SpellTree.Right[first];

    /* No dictionary word starts with this byte. */
    if (lo == -1)
        return NULL;

    while (lo <= hi)
    {
        int mid = (lo + hi) >> 1;
        int midcmp = 0;
        int probe[3];
        int k;

        probe[0] = mid;
        probe[1] = lo;
        probe[2] = hi;

        for (k = 0; k < 3; k++)
        {
            SPELL *cand = &Conf->Spell[probe[k]];
            int cmp = strcmp(cand->word, word);

            if (k == 0)
                midcmp = cmp;

            if (cmp == 0 &&
                (affixflag == 0 || strchr(cand->flag, affixflag) != NULL))
                return cand;
        }

        /* Narrow towards the side the middle comparison points at. */
        lo = (midcmp < 0) ? mid + 1 : lo + 1;
        hi = (midcmp > 0) ? mid - 1 : hi - 1;
    }

    return NULL;
}
/*
 * Append one affix rule to Conf->Affix.
 *
 *  flag - affix flag character the rule belongs to
 *  mask - regular expression the candidate word must match; it is stored
 *         with a trailing '$' when type is 's' and a leading '^' otherwise
 *  find - text stored in the rule's 'find' field
 *  repl - text stored in the rule's 'repl' field (its length goes to replen)
 *  type - 's' for a suffix rule; other values get the '^' anchored mask
 *
 * The array grows in steps of 16 entries.  The rule is marked as needing
 * regex compilation (compile = 1).
 *
 * Returns 0 on success.  Returns 1 without adding anything when mask, find
 * or repl would not fit into the corresponding AFFIX field.  Raises ERROR
 * (out of memory) if the array cannot be grown.
 */
int
AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type)
{
    AFFIX *slot;

    if (Conf->naffixes >= Conf->maffixes)
    {
        const int step = 16;
        AFFIX *grown;

        if (Conf->maffixes)
            grown = (AFFIX *) realloc((void *) Conf->Affix,
                                      (Conf->maffixes + step) * sizeof(AFFIX));
        else
            grown = (AFFIX *) malloc(step * sizeof(AFFIX));

        if (grown == NULL)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));

        /* Commit the new array only once the allocation has succeeded. */
        Conf->Affix = grown;
        Conf->maffixes += step;
    }

    slot = &((AFFIX *) Conf->Affix)[Conf->naffixes];

    /*
     * Refuse input that would overflow the fixed-size fields.  The mask
     * needs room for one anchor character plus the terminating NUL.
     */
    if (strlen(mask) + 2 > sizeof(slot->mask) ||
        strlen(find) + 1 > sizeof(slot->find) ||
        strlen(repl) + 1 > sizeof(slot->repl))
        return 1;

    if (type == 's')
        sprintf(slot->mask, "%s$", mask);
    else
        sprintf(slot->mask, "^%s", mask);

    slot->compile = 1;
    slot->flag = flag;
    slot->type = type;
    strcpy(slot->find, find);
    strcpy(slot->repl, repl);
    slot->replen = strlen(repl);

    Conf->naffixes++;
    return 0;
}
/*
 * Copy 'src' into 'dist', leaving out every space, tab and '-' character.
 * The result is NUL-terminated.  Returns 'dist'.
 */
static char *
remove_spaces(char *dist, char *src)
{
    char *out = dist;
    char *in;

    for (in = src; *in != '\0'; in++)
    {
        switch (*in)
        {
            case ' ':
            case '-':
            case '\t':
                /* skipped */
                break;
            default:
                *out++ = *in;
                break;
        }
    }

    *out = '\0';
    return dist;
}
/*
 * Load affix rules from 'filename' into Conf.
 *
 * Recognised lines (keywords are matched case-insensitively at line start):
 *   "suffixes"  - following rules are suffix rules
 *   "prefixes"  - following rules are prefix rules
 *   "flag X"    - following rules belong to flag X; any '*' and spaces
 *                 before X are skipped
 * Rule lines are ignored until a suffixes/prefixes line has been seen.
 * Text after '#' is dropped.  A rule line is lower-cased and parsed as
 * "mask > find , repl" or "mask > repl"; spaces, tabs and '-' are removed
 * from each field before it is passed to AddAffix().
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int
ImportAffixes(IspellDict * Conf, const char *filename)
{
    char str[BUFSIZ];
    char mask[BUFSIZ] = "";
    char find[BUFSIZ] = "";
    char repl[BUFSIZ] = "";
    unsigned char flag = 0;
    char *s;
    int nfields;
    int suffixes = 0;
    int prefixes = 0;
    FILE *affix;

    affix = fopen(filename, "r");
    if (affix == NULL)
        return 1;

    while (fgets(str, sizeof(str), affix))
    {
        if (!STRNCASECMP(str, "suffixes"))
        {
            suffixes = 1;
            prefixes = 0;
            continue;
        }
        if (!STRNCASECMP(str, "prefixes"))
        {
            suffixes = 0;
            prefixes = 1;
            continue;
        }
        if (!STRNCASECMP(str, "flag "))
        {
            s = str + 5;

            /*
             * strchr() also matches the terminating NUL, so test for the
             * end of the line explicitly to avoid running past it.
             */
            while (*s != '\0' && strchr("* ", *s) != NULL)
                s++;
            flag = (unsigned char) *s;
            continue;
        }

        if (!suffixes && !prefixes)
            continue;

        /* Strip a trailing comment. */
        s = strchr(str, '#');
        if (s != NULL)
            *s = '\0';
        if (!*str)
            continue;

        strlower(str);
        strcpy(mask, "");
        strcpy(find, "");
        strcpy(repl, "");
        nfields = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);

        /* str has been parsed already, so reuse it as scratch space. */
        remove_spaces(str, repl);
        strcpy(repl, str);
        remove_spaces(str, find);
        strcpy(find, str);
        remove_spaces(str, mask);
        strcpy(mask, str);

        switch (nfields)
        {
            case 3:
                break;
            case 2:
                /* "mask > repl": the single field after '>' is the repl. */
                if (*find != '\0')
                {
                    strcpy(repl, find);
                    strcpy(find, "");
                }
                break;
            default:
                /* Not a rule line. */
                continue;
        }

        AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p');
    }

    fclose(affix);
    return 0;
}
/*
 * Sort the dictionary and build the first-byte index used by FindWord().
 *
 * Conf->Spell is sorted by word.  Afterwards SpellTree.Left[b] and
 * SpellTree.Right[b] hold the first and last index of the words whose first
 * byte is b, or -1 when no word starts with b.
 */
void
SortDictionary(IspellDict * Conf)
{
    int CurLet = -1;
    int Let;
    size_t i;

    /* Nothing to sort with fewer than two entries (Spell may be NULL). */
    if (Conf->nspell > 1)
        qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);

    for (i = 0; i < 256; i++)
        Conf->SpellTree.Left[i] = Conf->SpellTree.Right[i] = -1;

    for (i = 0; i < Conf->nspell; i++)
    {
        Let = (int) (*(Conf->Spell[i].word)) & 255;
        if (CurLet != Let)
        {
            /* First word of a new first-byte group. */
            Conf->SpellTree.Left[Let] = i;
            CurLet = Let;
        }
        Conf->SpellTree.Right[Let] = i;
    }
}
/*
 * Sort the affix rules and build the prefix and suffix indexes.
 *
 * Rules are sorted with cmpaffix().  For rules of type 'p' the index key is
 * the first byte of repl; for all other rules it is the last byte of repl,
 * or 0 when repl is empty.  Left[key]/Right[key] of the matching tree
 * receive the first and last rule index for that key; unused keys are -1.
 */
void
SortAffixes(IspellDict * Conf)
{
    int lastPrefixKey = -1;
    int lastSuffixKey = -1;
    size_t i;

    if (Conf->naffixes > 1)
        qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);

    for (i = 0; i < 256; i++)
    {
        Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1;
        Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1;
    }

    for (i = 0; i < Conf->naffixes; i++)
    {
        AFFIX *rule = &(((AFFIX *) Conf->Affix)[i]);
        int key;

        if (rule->type == 'p')
        {
            key = (int) (*(rule->repl)) & 255;
            if (key != lastPrefixKey)
            {
                Conf->PrefixTree.Left[key] = i;
                lastPrefixKey = key;
            }
            Conf->PrefixTree.Right[key] = i;
            continue;
        }

        key = 0;
        if (rule->replen)
            key = (int) (rule->repl[rule->replen - 1]) & 255;

        if (key != lastSuffixKey)
        {
            Conf->SuffixTree.Left[key] = i;
            lastSuffixKey = key;
        }
        Conf->SuffixTree.Right[key] = i;
    }
}
/*
 * Try to undo one suffix rule on 'word'.
 *
 * *res receives the backward comparison of the word's tail with
 * Affix->repl (over Affix->replen characters).  If it is non-zero the rule
 * does not apply and NULL is returned.  Otherwise the tail is replaced by
 * Affix->find, the rule's mask is compiled on first use, and the candidate
 * is accepted when it matches the mask and FindWord() finds it with the
 * rule's flag.
 *
 * Returns a pstrdup'ed copy of the candidate, or NULL.
 */
static char *
CheckSuffix(const char *word, size_t len, AFFIX *Affix, int *res, IspellDict *Conf)
{
    regmatch_t subs[2]; /* workaround for apache&linux */
    char newword[2 * MAXNORMLEN] = "";
    int err;

    *res = strbncmp(word, Affix->repl, Affix->replen);
    if (*res != 0)
        return NULL;

    /* Build the candidate: word with its repl tail swapped for find. */
    strcpy(newword, word);
    strcpy(newword + len - Affix->replen, Affix->find);

    if (Affix->compile)
    {
        err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB);
        if (err)
        {
            /* NOTE(review): regfree() after a failed regcomp() - confirm this is intended. */
            regfree(&(Affix->reg));
            return NULL;
        }
        Affix->compile = 0;
    }

    err = regexec(&(Affix->reg), newword, 1, subs, 0);
    if (err != 0)
        return NULL;

    if (FindWord(Conf, newword, Affix->flag) == NULL)
        return NULL;

    return pstrdup(newword);
}
/* Multiplier used to size the regmatch_t array in CheckPrefix() (NS*2 slots). */
#define NS 1
/*
 * Number of pointer slots in the forms array built by NormalizeWord(); one
 * slot is always kept for the terminating NULL.
 */
#define MAX_NORM 512
/*
 * Try to undo one prefix rule on 'word' and record the resulting forms.
 *
 * If 'word' does not start with Affix->repl, the strncmp() result is
 * returned so the caller can steer its search.  Otherwise the prefix is
 * replaced by Affix->find and the rule's mask is compiled on first use.
 * When the candidate matches the mask:
 *   - it is appended to the forms list if FindWord() finds it with the
 *     rule's flag, and
 *   - the first suffix rule indexed under 'pi' in the suffix tree (if any)
 *     is tried on the candidate via CheckSuffix(), appending that result
 *     too.
 *
 * 'forms' is the start of the result array and '*cur' the next free slot;
 * nothing is appended once MAX_NORM-1 entries are in use.  Returns 0 in
 * every case other than the prefix mismatch described above.
 */
static int
CheckPrefix(const char *word, size_t len, AFFIX *Affix, IspellDict *Conf, int pi,
            char **forms, char ***cur)
{
    regmatch_t subs[NS * 2];
    char newword[2 * MAXNORMLEN] = "";
    int err;
    int prefixcmp;
    int suffixres;
    int firstSuffix;
    size_t newlen;
    AFFIX *CAffix = Conf->Affix;

    prefixcmp = strncmp(word, Affix->repl, Affix->replen);
    if (prefixcmp != 0)
        return prefixcmp;

    /* Build the candidate: find followed by the word without its prefix. */
    strcpy(newword, Affix->find);
    strcat(newword, word + Affix->replen);

    if (Affix->compile)
    {
        err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB);
        if (err)
        {
            /* NOTE(review): regfree() after a failed regcomp() - confirm this is intended. */
            regfree(&(Affix->reg));
            return 0;
        }
        Affix->compile = 0;
    }

    err = regexec(&(Affix->reg), newword, 1, subs, 0);
    if (err != 0)
        return 0;

    if (FindWord(Conf, newword, Affix->flag) != NULL &&
        (*cur - forms) < (MAX_NORM - 1))
    {
        **cur = pstrdup(newword);
        (*cur)++;
        **cur = NULL;
    }

    newlen = strlen(newword);
    firstSuffix = Conf->SuffixTree.Left[pi];
    if (firstSuffix >= 0 && (*cur - forms) < (MAX_NORM - 1))
    {
        **cur = CheckSuffix(newword, newlen, &CAffix[firstSuffix], &suffixres, Conf);
        if (**cur)
        {
            (*cur)++;
            **cur = NULL;
        }
    }

    return 0;
}
/*
 * Collect the normal (dictionary) forms of 'word'.
 *
 * 'word' is lower-cased in place.  The word itself is reported if
 * FindWord() finds it; further forms are produced by undoing prefix rules
 * (CheckPrefix) and suffix rules (CheckSuffix).
 *
 * Returns a palloc'ed, NULL-terminated array of pstrdup'ed strings holding
 * at most MAX_NORM-1 forms, or NULL when the word is empty, longer than
 * MAXNORMLEN, or has no normal form.
 */
char **
NormalizeWord(IspellDict * Conf, char *word)
{
    size_t len;
    char **forms;
    char **cur;
    AFFIX *Affix;
    int ri, pi, ipi, lp, rp, cp, ls, rs;
    int lres, rres, cres;

    len = strlen(word);

    /*
     * An empty word has no last character to index the suffix tree with,
     * and a zero 'pi' would keep the "ipi += pi" loop below from advancing.
     */
    if (len == 0 || len > MAXNORMLEN)
        return NULL;

    strlower(word);

    forms = (char **) palloc(MAX_NORM * sizeof(char *));
    cur = forms;
    *cur = NULL;

    ri = (int) (*word) & 255;           /* first byte: prefix tree key */
    pi = (int) (word[len - 1]) & 255;   /* last byte: suffix tree key */
    Affix = (AFFIX *) Conf->Affix;

    /* Check that the word itself is normal form */
    if (FindWord(Conf, word, 0))
    {
        *cur = pstrdup(word);
        cur++;
        *cur = NULL;
    }

    /*
     * Find all other NORMAL forms of the 'word'.  Two passes: ipi == 0
     * (suffix rules indexed under key 0) and ipi == pi (suffix rules
     * indexed under the word's last byte).
     */
    for (ipi = 0; ipi <= pi; ipi += pi)
    {
        /* check prefix */
        lp = Conf->PrefixTree.Left[ri];
        rp = Conf->PrefixTree.Right[ri];
        while (lp >= 0 && lp <= rp)
        {
            cp = (lp + rp) >> 1;
            cres = 0;
            if ((cur - forms) < (MAX_NORM - 1))
                cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur);
            if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1)))
                (void) CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur);
            if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1)))
                (void) CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur);

            /* Narrow by the middle comparison, stepping both ends inwards. */
            if (cres < 0)
            {
                rp = cp - 1;
                lp++;
            }
            else if (cres > 0)
            {
                lp = cp + 1;
                rp--;
            }
            else
            {
                lp++;
                rp--;
            }
        }

        /* check suffix: walk the whole range from both ends */
        ls = Conf->SuffixTree.Left[ipi];
        rs = Conf->SuffixTree.Right[ipi];
        while (ls >= 0 && ls <= rs)
        {
            if ((cur - forms) < (MAX_NORM - 1))
            {
                *cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf);
                if (*cur)
                {
                    cur++;
                    *cur = NULL;
                }
            }
            if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1)))
            {
                *cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf);
                if (*cur)
                {
                    cur++;
                    *cur = NULL;
                }
            }
            ls++;
            rs--;
        }
    }

    if (cur == forms)
    {
        pfree(forms);
        return NULL;
    }
    return forms;
}
/*
 * Release everything owned by an IspellDict and zero the structure.
 *
 * Frees the compiled regex of every affix rule that has been compiled
 * (compile == 0), every dictionary word, and both arrays.
 */
void
FreeIspell(IspellDict *Conf)
{
    int i;
    AFFIX *Affix = (AFFIX *) Conf->Affix;

    for (i = 0; i < Conf->naffixes; i++)
    {
        if (Affix[i].compile == 0)
            regfree(&(Affix[i].reg));
    }

    /* Words belong to the Spell array, so its own count bounds this loop. */
    for (i = 0; i < Conf->nspell; i++)
        free(Conf->Spell[i].word);

    free(Conf->Affix);
    free(Conf->Spell);
    memset((void *) Conf, 0, sizeof(IspellDict));
}