#include #include #include #include #include "postgres.h" #include "spell.h" #define MAXNORMLEN 56 #define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y))) static int cmpspell(const void *s1, const void *s2) { return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word)); } static void strlower(char *str) { unsigned char *ptr = (unsigned char *) str; while (*ptr) { *ptr = tolower(*ptr); ptr++; } } /* backward string compaire for suffix tree operations */ static int strbcmp(const char *s1, const char *s2) { int l1 = strlen(s1) - 1, l2 = strlen(s2) - 1; while (l1 >= 0 && l2 >= 0) { if (s1[l1] < s2[l2]) return -1; if (s1[l1] > s2[l2]) return 1; l1--; l2--; } if (l1 < l2) return -1; if (l1 > l2) return 1; return 0; } static int strbncmp(const char *s1, const char *s2, size_t count) { int l1 = strlen(s1) - 1, l2 = strlen(s2) - 1, l = count; while (l1 >= 0 && l2 >= 0 && l > 0) { if (s1[l1] < s2[l2]) return -1; if (s1[l1] > s2[l2]) return 1; l1--; l2--; l--; } if (l == 0) return 0; if (l1 < l2) return -1; if (l1 > l2) return 1; return 0; } static int cmpaffix(const void *s1, const void *s2) { if (((const AFFIX *) s1)->type < ((const AFFIX *) s2)->type) return -1; if (((const AFFIX *) s1)->type > ((const AFFIX *) s2)->type) return 1; if (((const AFFIX *) s1)->type == 'p') return (strcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl)); else return (strbcmp(((const AFFIX *) s1)->repl, ((const AFFIX *) s2)->repl)); } int AddSpell(IspellDict * Conf, const char *word, const char *flag) { if (Conf->nspell >= Conf->mspell) { if (Conf->mspell) { Conf->mspell += 1024 * 20; Conf->Spell = (SPELL *) realloc(Conf->Spell, Conf->mspell * sizeof(SPELL)); } else { Conf->mspell = 1024 * 20; Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL)); } if (Conf->Spell == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } Conf->Spell[Conf->nspell].word = strdup(word); if (!Conf->Spell[Conf->nspell].word) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); strncpy(Conf->Spell[Conf->nspell].flag, flag, 10); Conf->nspell++; return (0); } int ImportDictionary(IspellDict * Conf, const char *filename) { unsigned char str[BUFSIZ]; FILE *dict; if (!(dict = fopen(filename, "r"))) return (1); while (fgets(str, sizeof(str), dict)) { unsigned char *s; const unsigned char *flag; flag = NULL; if ((s = strchr(str, '/'))) { *s = 0; s++; flag = s; while (*s) { if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) s++; else { *s = 0; break; } } } else flag = ""; strlower(str); /* Dont load words if first letter is not required */ /* It allows to optimize loading at search time */ s = str; while (*s) { if (*s == '\r') *s = 0; if (*s == '\n') *s = 0; s++; } AddSpell(Conf, str, flag); } fclose(dict); return (0); } static SPELL * FindWord(IspellDict * Conf, const char *word, int affixflag) { int l, c, r, resc, resl, resr, i; i = (int) (*word) & 255; l = Conf->SpellTree.Left[i]; r = Conf->SpellTree.Right[i]; if (l == -1) return (NULL); while (l <= r) { c = (l + r) >> 1; resc = strcmp(Conf->Spell[c].word, word); if ((resc == 0) && ((affixflag == 0) || (strchr(Conf->Spell[c].flag, affixflag) != NULL))) return (&Conf->Spell[c]); resl = strcmp(Conf->Spell[l].word, word); if ((resl == 0) && ((affixflag == 0) || (strchr(Conf->Spell[l].flag, affixflag) != NULL))) return (&Conf->Spell[l]); resr = strcmp(Conf->Spell[r].word, word); if ((resr == 0) && ((affixflag == 0) || (strchr(Conf->Spell[r].flag, affixflag) != NULL))) return (&Conf->Spell[r]); if (resc < 0) { l = c + 1; r--; } else if (resc > 0) { r = c - 1; l++; } else { l++; r--; } } return (NULL); } int AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type) { if (Conf->naffixes >= Conf->maffixes) { if (Conf->maffixes) { Conf->maffixes += 16; Conf->Affix = (AFFIX *) realloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX)); } else { Conf->maffixes = 16; Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX)); } if (Conf->Affix == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } if (type == 's') sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); else sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); Conf->Affix[Conf->naffixes].compile = 1; Conf->Affix[Conf->naffixes].flag = flag; Conf->Affix[Conf->naffixes].type = type; strcpy(Conf->Affix[Conf->naffixes].find, find); strcpy(Conf->Affix[Conf->naffixes].repl, repl); Conf->Affix[Conf->naffixes].replen = strlen(repl); Conf->naffixes++; return (0); } static char * remove_spaces(char *dist, char *src) { char *d, *s; d = dist; s = src; while (*s) { if (*s != ' ' && *s != '-' && *s != '\t') { *d = *s; d++; } s++; } *d = 0; return (dist); } int ImportAffixes(IspellDict * Conf, const char *filename) { unsigned char str[BUFSIZ]; unsigned char flag = 0; unsigned char mask[BUFSIZ] = ""; unsigned char find[BUFSIZ] = ""; unsigned char repl[BUFSIZ] = ""; unsigned char *s; int i; int suffixes = 0; int prefixes = 0; FILE *affix; if (!(affix = fopen(filename, "r"))) return (1); while (fgets(str, sizeof(str), affix)) { if (!STRNCASECMP(str, "suffixes")) { suffixes = 1; prefixes = 0; continue; } if (!STRNCASECMP(str, "prefixes")) { suffixes = 0; prefixes = 1; continue; } if (!STRNCASECMP(str, "flag ")) { s = str + 5; while (strchr("* ", *s)) s++; flag = *s; continue; } if ((!suffixes) && (!prefixes)) continue; if ((s = strchr(str, '#'))) *s = 0; if (!*str) continue; strlower(str); strcpy(mask, ""); strcpy(find, ""); strcpy(repl, ""); i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl); remove_spaces(str, repl); strcpy(repl, str); remove_spaces(str, find); strcpy(find, str); remove_spaces(str, mask); strcpy(mask, str); switch (i) { case 3: break; case 2: if (*find != '\0') { strcpy(repl, find); strcpy(find, ""); } break; default: continue; } AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p'); } fclose(affix); return (0); } void SortDictionary(IspellDict * Conf) { int CurLet = -1, Let; size_t i; qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell); for (i = 0; i < 256; i++) Conf->SpellTree.Left[i] = -1; for (i = 0; i < Conf->nspell; i++) { Let = (int) (*(Conf->Spell[i].word)) & 255; if (CurLet != Let) { Conf->SpellTree.Left[Let] = i; CurLet = Let; } Conf->SpellTree.Right[Let] = i; } } void SortAffixes(IspellDict * Conf) { int CurLetP = -1, CurLetS = -1, Let; AFFIX *Affix; size_t i; if (Conf->naffixes > 1) qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); for (i = 0; i < 256; i++) { Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1; Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1; } for (i = 0; i < Conf->naffixes; i++) { Affix = &(((AFFIX *) Conf->Affix)[i]); if (Affix->type == 'p') { Let = (int) (*(Affix->repl)) & 255; if (CurLetP != Let) { Conf->PrefixTree.Left[Let] = i; CurLetP = Let; } Conf->PrefixTree.Right[Let] = i; } else { Let = (Affix->replen) ? (int) (Affix->repl[Affix->replen - 1]) & 255 : 0; if (CurLetS != Let) { Conf->SuffixTree.Left[Let] = i; CurLetS = Let; } Conf->SuffixTree.Right[Let] = i; } } } static char * CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * Conf) { regmatch_t subs[2]; /* workaround for apache&linux */ char newword[2 * MAXNORMLEN] = ""; int err; *res = strbncmp(word, Affix->repl, Affix->replen); if (*res < 0) return NULL; if (*res > 0) return NULL; strcpy(newword, word); strcpy(newword + len - Affix->replen, Affix->find); if (Affix->compile) { err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB); if (err) { /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */ regfree(&(Affix->reg)); return (NULL); } Affix->compile = 0; } if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) { if (FindWord(Conf, newword, Affix->flag)) return pstrdup(newword); } return NULL; } #define NS 1 #define MAX_NORM 512 static int CheckPrefix(const char *word, size_t len, AFFIX * Affix, IspellDict * Conf, int pi, char **forms, char ***cur) { regmatch_t subs[NS * 2]; char newword[2 * MAXNORMLEN] = ""; int err, ls, res, lres; size_t newlen; AFFIX *CAffix = Conf->Affix; res = strncmp(word, Affix->repl, Affix->replen); if (res != 0) return res; strcpy(newword, Affix->find); strcat(newword, word + Affix->replen); if (Affix->compile) { err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB); if (err) { /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */ regfree(&(Affix->reg)); return (0); } Affix->compile = 0; } if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) { SPELL *curspell; if ((curspell = FindWord(Conf, newword, Affix->flag))) { if ((*cur - forms) < (MAX_NORM - 1)) { **cur = pstrdup(newword); (*cur)++; **cur = NULL; } } newlen = strlen(newword); ls = Conf->SuffixTree.Left[pi]; if (ls >= 0 && ((*cur - forms) < (MAX_NORM - 1))) { **cur = CheckSuffix(newword, newlen, &CAffix[ls], &lres, Conf); if (**cur) { (*cur)++; **cur = NULL; } } } return 0; } char ** NormalizeWord(IspellDict * Conf, char *word) { /*regmatch_t subs[NS];*/ size_t len; char **forms; char **cur; AFFIX *Affix; int ri, pi, ipi, lp, rp, cp, ls, rs; int lres, rres, cres = 0; SPELL *spell; len = strlen(word); if (len > MAXNORMLEN) return (NULL); strlower(word); forms = (char **) palloc(MAX_NORM * sizeof(char **)); cur = forms; *cur = NULL; ri = (int) (*word) & 255; pi = (int) (word[strlen(word) - 1]) & 255; Affix = (AFFIX *) Conf->Affix; /* Check that the word itself is normal form */ if ((spell = FindWord(Conf, word, 0))) { *cur = pstrdup(word); cur++; *cur = NULL; } /* Find all other NORMAL forms of the 'word' */ for (ipi = 0; ipi <= pi; ipi += pi) { /* check prefix */ lp = Conf->PrefixTree.Left[ri]; rp = Conf->PrefixTree.Right[ri]; while (lp >= 0 && lp <= rp) { cp = (lp + rp) >> 1; cres = 0; if ((cur - forms) < (MAX_NORM - 1)) cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur); if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1))) lres = CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur); if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1))) rres = CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur); if (cres < 0) { rp = cp - 1; lp++; } else if (cres > 0) { lp = cp + 1; rp--; } else { lp++; rp--; } } /* check suffix */ ls = Conf->SuffixTree.Left[ipi]; rs = Conf->SuffixTree.Right[ipi]; while (ls >= 0 && ls <= rs) { if (((cur - forms) < (MAX_NORM - 1))) { *cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf); if (*cur) { cur++; *cur = NULL; } } if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1))) { *cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf); if (*cur) { cur++; *cur = NULL; } } ls++; rs--; } /* end while */ } /* for ipi */ if (cur == forms) { pfree(forms); return (NULL); } return (forms); } void FreeIspell(IspellDict * Conf) { int i; AFFIX *Affix = (AFFIX *) Conf->Affix; for (i = 0; i < Conf->naffixes; i++) { if (Affix[i].compile == 0) regfree(&(Affix[i].reg)); } for (i = 0; i < Conf->naffixes; i++) free(Conf->Spell[i].word); free(Conf->Affix); free(Conf->Spell); memset((void *) Conf, 0, sizeof(IspellDict)); return; }