#include #include #include #include #include "postgres.h" #include "spell.h" #define MAXNORMLEN 56 #define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y))) static int cmpspell(const void *s1,const void *s2){ return(strcmp(((const SPELL*)s1)->word,((const SPELL*)s2)->word)); } static void strlower( char * str ) { unsigned char *ptr = (unsigned char *)str; while ( *ptr ) { *ptr = tolower( *ptr ); ptr++; } } /* backward string compaire for suffix tree operations */ static int strbcmp(const char *s1, const char *s2) { int l1 = strlen(s1)-1, l2 = strlen(s2)-1; while (l1 >= 0 && l2 >= 0) { if (s1[l1] < s2[l2]) return -1; if (s1[l1] > s2[l2]) return 1; l1--; l2--; } if (l1 < l2) return -1; if (l1 > l2) return 1; return 0; } static int strbncmp(const char *s1, const char *s2, size_t count) { int l1 = strlen(s1) - 1, l2 = strlen(s2) - 1, l = count; while (l1 >= 0 && l2 >= 0 && l > 0) { if (s1[l1] < s2[l2]) return -1; if (s1[l1] > s2[l2]) return 1; l1--; l2--; l--; } if (l == 0) return 0; if (l1 < l2) return -1; if (l1 > l2) return 1; return 0; } static int cmpaffix(const void *s1,const void *s2){ if (((const AFFIX*)s1)->type < ((const AFFIX*)s2)->type) return -1; if (((const AFFIX*)s1)->type > ((const AFFIX*)s2)->type) return 1; if (((const AFFIX*)s1)->type == 'p') return(strcmp(((const AFFIX*)s1)->repl,((const AFFIX*)s2)->repl)); else return(strbcmp(((const AFFIX*)s1)->repl,((const AFFIX*)s2)->repl)); } int AddSpell(IspellDict * Conf,const char * word,const char *flag){ if(Conf->nspell>=Conf->mspell){ if(Conf->mspell){ Conf->mspell+=1024*20; Conf->Spell=(SPELL *)realloc(Conf->Spell,Conf->mspell*sizeof(SPELL)); }else{ Conf->mspell=1024*20; Conf->Spell=(SPELL *)malloc(Conf->mspell*sizeof(SPELL)); } if ( Conf->Spell == NULL ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } Conf->Spell[Conf->nspell].word=strdup(word); if ( !Conf->Spell[Conf->nspell].word ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); strncpy(Conf->Spell[Conf->nspell].flag,flag,10); Conf->nspell++; return(0); } int ImportDictionary(IspellDict * Conf,const char *filename){ unsigned char str[BUFSIZ]; FILE *dict; if(!(dict=fopen(filename,"r")))return(1); while(fgets(str,sizeof(str),dict)){ unsigned char *s; const unsigned char *flag; flag = NULL; if((s=strchr(str,'/'))){ *s=0; s++;flag=s; while(*s){ if (((*s>='A')&&(*s<='Z'))||((*s>='a')&&(*s<='z'))) s++; else { *s=0; break; } } }else{ flag=""; } strlower(str); /* Dont load words if first letter is not required */ /* It allows to optimize loading at search time */ s=str; while(*s){ if(*s=='\r')*s=0; if(*s=='\n')*s=0; s++; } AddSpell(Conf,str,flag); } fclose(dict); return(0); } static SPELL * FindWord(IspellDict * Conf, const char *word, int affixflag) { int l,c,r,resc,resl,resr, i; i = (int)(*word) & 255; l = Conf->SpellTree.Left[i]; r = Conf->SpellTree.Right[i]; if (l == -1) return (NULL); while(l<=r){ c = (l + r) >> 1; resc = strcmp(Conf->Spell[c].word, word); if( (resc == 0) && ((affixflag == 0) || (strchr(Conf->Spell[c].flag, affixflag) != NULL)) ) { return(&Conf->Spell[c]); } resl = strcmp(Conf->Spell[l].word, word); if( (resl == 0) && ((affixflag == 0) || (strchr(Conf->Spell[l].flag, affixflag) != NULL)) ) { return(&Conf->Spell[l]); } resr = strcmp(Conf->Spell[r].word, word); if( (resr == 0) && ((affixflag == 0) || (strchr(Conf->Spell[r].flag, affixflag) != NULL)) ) { return(&Conf->Spell[r]); } if(resc < 0){ l = c + 1; r--; } else if(resc > 0){ r = c - 1; l++; } else { l++; r--; } } return(NULL); } int AddAffix(IspellDict * Conf,int flag,const char *mask,const char *find,const char *repl,int type) { if(Conf->naffixes>=Conf->maffixes){ if(Conf->maffixes){ Conf->maffixes+=16; Conf->Affix = (AFFIX*)realloc((void*)Conf->Affix,Conf->maffixes*sizeof(AFFIX)); }else{ Conf->maffixes=16; Conf->Affix = (AFFIX*)malloc(Conf->maffixes * sizeof(AFFIX)); } if ( Conf->Affix == NULL ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } if (type=='s') { sprintf(Conf->Affix[Conf->naffixes].mask,"%s$",mask); } else { sprintf(Conf->Affix[Conf->naffixes].mask,"^%s",mask); } Conf->Affix[Conf->naffixes].compile = 1; Conf->Affix[Conf->naffixes].flag=flag; Conf->Affix[Conf->naffixes].type=type; strcpy(Conf->Affix[Conf->naffixes].find,find); strcpy(Conf->Affix[Conf->naffixes].repl,repl); Conf->Affix[Conf->naffixes].replen=strlen(repl); Conf->naffixes++; return(0); } static char * remove_spaces(char *dist,char *src){ char *d,*s; d=dist; s=src; while(*s){ if(*s!=' '&&*s!='-'&&*s!='\t'){ *d=*s; d++; } s++; } *d=0; return(dist); } int ImportAffixes(IspellDict * Conf,const char *filename){ unsigned char str[BUFSIZ]; unsigned char flag=0; unsigned char mask[BUFSIZ]=""; unsigned char find[BUFSIZ]=""; unsigned char repl[BUFSIZ]=""; unsigned char *s; int i; int suffixes=0; int prefixes=0; FILE *affix; if(!(affix=fopen(filename,"r"))) return(1); while(fgets(str,sizeof(str),affix)){ if(!STRNCASECMP(str,"suffixes")){ suffixes=1; prefixes=0; continue; } if(!STRNCASECMP(str,"prefixes")){ suffixes=0; prefixes=1; continue; } if(!STRNCASECMP(str,"flag ")){ s=str+5; while(strchr("* ",*s)) s++; flag=*s; continue; } if((!suffixes)&&(!prefixes))continue; if((s=strchr(str,'#')))*s=0; if(!*str)continue; strlower(str); strcpy(mask,""); strcpy(find,""); strcpy(repl,""); i=sscanf(str,"%[^>\n]>%[^,\n],%[^\n]",mask,find,repl); remove_spaces(str,repl);strcpy(repl,str); remove_spaces(str,find);strcpy(find,str); remove_spaces(str,mask);strcpy(mask,str); switch(i){ case 3: break; case 2: if(*find != '\0'){ strcpy(repl,find); strcpy(find,""); } break; default: continue; } AddAffix(Conf,(int)flag,mask,find,repl,suffixes?'s':'p'); } fclose(affix); return(0); } void SortDictionary(IspellDict * Conf){ int CurLet = -1, Let;size_t i; qsort((void*)Conf->Spell,Conf->nspell,sizeof(SPELL),cmpspell); for(i = 0; i < 256 ; i++ ) Conf->SpellTree.Left[i] = -1; for(i = 0; i < Conf->nspell; i++) { Let = (int)(*(Conf->Spell[i].word)) & 255; if (CurLet != Let) { Conf->SpellTree.Left[Let] = i; CurLet = Let; } Conf->SpellTree.Right[Let] = i; } } void SortAffixes(IspellDict * Conf) { int CurLetP = -1, CurLetS = -1, Let; AFFIX *Affix; size_t i; if (Conf->naffixes > 1) qsort((void*)Conf->Affix,Conf->naffixes,sizeof(AFFIX),cmpaffix); for(i = 0; i < 256; i++) { Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1; Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1; } for(i = 0; i < Conf->naffixes; i++) { Affix = &(((AFFIX*)Conf->Affix)[i]); if(Affix->type == 'p') { Let = (int)(*(Affix->repl)) & 255; if (CurLetP != Let) { Conf->PrefixTree.Left[Let] = i; CurLetP = Let; } Conf->PrefixTree.Right[Let] = i; } else { Let = (Affix->replen) ? (int)(Affix->repl[Affix->replen-1]) & 255 : 0; if (CurLetS != Let) { Conf->SuffixTree.Left[Let] = i; CurLetS = Let; } Conf->SuffixTree.Right[Let] = i; } } } static char * CheckSuffix(const char *word, size_t len, AFFIX *Affix, int *res, IspellDict *Conf) { regmatch_t subs[2]; /* workaround for apache&linux */ char newword[2*MAXNORMLEN] = ""; int err; *res = strbncmp(word, Affix->repl, Affix->replen); if (*res < 0) { return NULL; } if (*res > 0) { return NULL; } strcpy(newword, word); strcpy(newword+len-Affix->replen, Affix->find); if (Affix->compile) { err = regcomp(&(Affix->reg),Affix->mask,REG_EXTENDED|REG_ICASE|REG_NOSUB); if(err){ /*regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE);*/ regfree(&(Affix->reg)); return(NULL); } Affix->compile = 0; } if(!(err=regexec(&(Affix->reg),newword,1,subs,0))){ if(FindWord(Conf, newword, Affix->flag)) return pstrdup(newword); } return NULL; } #define NS 1 #define MAX_NORM 512 static int CheckPrefix(const char *word, size_t len, AFFIX *Affix, IspellDict *Conf, int pi, char **forms, char ***cur ) { regmatch_t subs[NS*2]; char newword[2*MAXNORMLEN] = ""; int err, ls, res, lres; size_t newlen; AFFIX *CAffix = Conf->Affix; res = strncmp(word, Affix->repl, Affix->replen); if (res != 0) { return res; } strcpy(newword, Affix->find); strcat(newword, word+Affix->replen); if (Affix->compile) { err = regcomp(&(Affix->reg),Affix->mask,REG_EXTENDED|REG_ICASE|REG_NOSUB); if(err){ /*regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE);*/ regfree(&(Affix->reg)); return (0); } Affix->compile = 0; } if(!(err=regexec(&(Affix->reg),newword,1,subs,0))){ SPELL * curspell; if((curspell=FindWord(Conf, newword, Affix->flag))){ if ((*cur - forms) < (MAX_NORM-1)) { **cur = pstrdup(newword); (*cur)++; **cur = NULL; } } newlen = strlen(newword); ls = Conf->SuffixTree.Left[pi]; if ( ls>=0 && ((*cur - forms) < (MAX_NORM-1)) ) { **cur = CheckSuffix(newword, newlen, &CAffix[ls], &lres, Conf); if (**cur) { (*cur)++; **cur = NULL; } } } return 0; } char ** NormalizeWord(IspellDict * Conf,char *word){ /*regmatch_t subs[NS];*/ size_t len; char ** forms; char **cur; AFFIX * Affix; int ri, pi, ipi, lp, rp, cp, ls, rs; int lres, rres, cres = 0; SPELL *spell; len=strlen(word); if (len > MAXNORMLEN) return(NULL); strlower(word); forms=(char **) palloc(MAX_NORM*sizeof(char **)); cur=forms;*cur=NULL; ri = (int)(*word) & 255; pi = (int)(word[strlen(word)-1]) & 255; Affix=(AFFIX*)Conf->Affix; /* Check that the word itself is normal form */ if((spell = FindWord(Conf, word, 0))){ *cur=pstrdup(word); cur++;*cur=NULL; } /* Find all other NORMAL forms of the 'word' */ for (ipi = 0; ipi <= pi; ipi += pi) { /* check prefix */ lp = Conf->PrefixTree.Left[ri]; rp = Conf->PrefixTree.Right[ri]; while (lp >= 0 && lp <= rp) { cp = (lp + rp) >> 1; cres = 0; if ((cur - forms) < (MAX_NORM-1)) { cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur); } if ((lp < cp) && ((cur - forms) < (MAX_NORM-1)) ) { lres = CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur); } if ( (rp > cp) && ((cur - forms) < (MAX_NORM-1)) ) { rres = CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur); } if (cres < 0) { rp = cp - 1; lp++; } else if (cres > 0) { lp = cp + 1; rp--; } else { lp++; rp--; } } /* check suffix */ ls = Conf->SuffixTree.Left[ipi]; rs = Conf->SuffixTree.Right[ipi]; while (ls >= 0 && ls <= rs) { if ( ((cur - forms) < (MAX_NORM-1)) ) { *cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf); if (*cur) { cur++; *cur = NULL; } } if ( (rs > ls) && ((cur - forms) < (MAX_NORM-1)) ) { *cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf); if (*cur) { cur++; *cur = NULL; } } ls++; rs--; } /* end while */ } /* for ipi */ if(cur==forms){ pfree(forms); return(NULL); } return(forms); } void FreeIspell (IspellDict *Conf) { int i; AFFIX *Affix = (AFFIX *)Conf->Affix; for (i = 0; i < Conf->naffixes; i++) { if (Affix[i].compile == 0) { regfree(&(Affix[i].reg)); } } for (i = 0; i < Conf->naffixes; i++) { free( Conf->Spell[i].word ); } free(Conf->Affix); free(Conf->Spell); memset( (void*)Conf, 0, sizeof(IspellDict) ); return; }