postgresql/contrib/tsearch/morph.c

189 lines
4.4 KiB
C
Raw Normal View History

/*
* morphology module
* New dictionaries are included through dict.h. For languages which
* use the Latin charset it may be necessary to modify the mapdict table.
* Teodor Sigaev <teodor@stack.net>
*/
#include "postgres.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/builtins.h"
#include "catalog/pg_control.h"
#include "utils/pg_locale.h"
#include "morph.h"
#include "deflex.h"
/*
 * Method table describing one dictionary.
 *
 * Every method is optional; a dictionary whose methods are all NULL
 * simply does nothing.
 *
 * lemmatize must return either the pointer it was given or palloc'ed
 * memory.  init must return malloc'ed memory; otherwise the object
 * would be freed at the end of the transaction.
 */
typedef struct
{
	/* locale name the dictionary is listed under */
	char		localename[LOCALE_NAME_BUFLEN];

	/* init dictionary; the result is handed back to the other methods */
	void	   *(*init) (void);

	/* close dictionary */
	void		(*close) (void *obj);

	/* find word in dictionary; may update *len */
	char	   *(*lemmatize) (void *obj, char *word, int *len);

	/* stop-word test, applied by lemmatize() in this file before lemmatizing */
	int			(*is_stoplemm) (void *obj, char *word, int len);

	/* stop-word test, applied by lemmatize() in this file after lemmatizing */
	int			(*is_stemstoplemm) (void *obj, char *word, int len);
} DICT;
/*
 * Insert all dictionaries: dict.h is included here with DICT_BODY defined.
 * NOTE(review): dict.h is not visible from this file; presumably this pass
 * pulls in the dictionaries' code -- confirm in dict.h.
 */
#define DICT_BODY
#include "dict.h"
#undef DICT_BODY
/*
 * Fill the dictionary table: dict.h is included a second time, now with
 * DICT_TABLE defined and from inside the initializer of dicts[], so whatever
 * it expands to becomes the entries following the first one.  No comma
 * follows the first entry, so entries supplied by dict.h have to bring
 * their own leading comma.
 */
#define DICT_TABLE
DICT dicts[] = {
{
"C",NULL,NULL,NULL,NULL,NULL /* fake dictionary: locale "C", no methods, occupies index 0 */
}
#include "dict.h"
};
#undef DICT_TABLE
/*
 * Array for storing the dictionaries' objects (if needed): initmorph()
 * stores the result of dicts[i].init in slot i, and lemmatize() passes
 * that slot back as the first argument of the dictionary's methods.
 */
void* dictobjs[ lengthof(dicts) ];
/*
 * Selector values stored in the rows of mapdict.  Positive values are
 * indexes into dicts[]; the others are markers:
 *   STOPLEXEM   - lemmatize() returns NULL for the lexeme (stop word)
 *   BYLOCALE    - placeholder that initmorph() replaces with the index of
 *                 the dictionary matching the current locale, or drops
 *   NODICT      - no dictionary; ends the list of selectors in a row
 *   DEFAULTDICT - index 1 of dicts[]
 * Negative values are parenthesized so that they stay a single operand
 * wherever the macro is expanded.
 */
#define STOPLEXEM	(-2)
#define BYLOCALE	(-1)
#define NODICT		0
#define DEFAULTDICT	1
/* number of selector slots per lexeme type */
#define MAXNDICT	2
typedef int2 MAPDICT[MAXNDICT];
/*
 * i-th selector of the MAPDICT that x points to.  The whole expansion is
 * parenthesized so that it behaves as one lvalue expression regardless of
 * the operators surrounding the macro call.
 */
#define GETDICT(x,i)	(*(((int2 *) (x)) + (i)))
/*
 * Map dictionaries for lexeme type.
 *
 * lemmatize() and is_stoptype() index this table by lexeme type; each row
 * holds MAXNDICT selectors that lemmatize() consults from left to right,
 * stopping at the first NODICT.  The table is not const because
 * initmorph() rewrites the rows in place (it skips row 0).
 * NOTE(review): the row order presumably follows the lexeme type numbers
 * from deflex.h, as named in the per-row comments -- confirm there.
 */
static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* not used */
{DEFAULTDICT, NODICT}, /* LATWORD */
{BYLOCALE, NODICT}, /* NONLATINWORD */
{BYLOCALE, DEFAULTDICT}, /* UWORD */
{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
{NODICT, NODICT}, /* FLOAT */
{NODICT, NODICT}, /* FINT */
{BYLOCALE, DEFAULTDICT}, /* PARTWORD */
{BYLOCALE, NODICT}, /* NONLATINPARTWORD */
{DEFAULTDICT, NODICT}, /* LATPARTWORD */
{STOPLEXEM, NODICT}, /* SPACE */
{STOPLEXEM, NODICT}, /* SYMTAG */
{STOPLEXEM, NODICT}, /* HTTP */
{BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
{DEFAULTDICT, NODICT}, /* DEFISLATWORD */
{BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
{NODICT, NODICT}, /* URI */
{NODICT, NODICT} /* FILEPATH */
};
/* true once initmorph() has run to completion; later calls return at once */
static bool inited = false;

/*
 * Prepare the dictionary tables; only the first call does any work.
 *
 * Each mapdict row (row 0 is skipped) is compacted in place:
 *   - BYLOCALE is replaced by the index of the dictionary whose localename
 *     equals the current locale's lang; this happens only when built with
 *     USE_LOCALE and such a dictionary exists, otherwise the slot is dropped;
 *   - selectors that are not a valid dicts[] index are dropped;
 *   - STOPLEXEM is kept as it is;
 *   - slots left over after compaction are set to NODICT.
 * Afterwards init is called for every dictionary that is still referenced
 * by some row, and its result is stored in dictobjs[].
 */
void initmorph(void) {
	int			i,
				j,
				k;
	MAPDICT    *md;
	bool		needinit[lengthof(dicts)];
#ifdef USE_LOCALE
	PG_LocaleCategories lc;
	int			bylocaledict = NODICT;
#endif

	if (inited)
		return;

	for (i = 0; i < (int) lengthof(dicts); i++)
		needinit[i] = false;

#ifdef USE_LOCALE
	/* look up the dictionary registered for the current locale, if any */
	PGLC_current(&lc);
	for (i = 1; i < (int) lengthof(dicts); i++)
		if (strcmp(dicts[i].localename, lc.lang) == 0) {
			bylocaledict = i;
			break;
		}
	PGLC_free_categories(&lc);
#endif

	for (i = 1; i < (int) lengthof(mapdict); i++) {
		/* j reads the original slots, k is the next slot to be written */
		k = 0;
		md = &mapdict[i];
		for (j = 0; j < MAXNDICT; j++) {
			GETDICT(md, k) = GETDICT(md, j);
			if (GETDICT(md, k) == NODICT)
				break;
			if (GETDICT(md, k) == STOPLEXEM) {
				/*
				 * A marker, not an index into dicts[]: keep it, but do not
				 * touch needinit[], which would be written out of bounds.
				 */
				k++;
				continue;
			}
			if (GETDICT(md, k) == BYLOCALE) {
#ifdef USE_LOCALE
				if (bylocaledict == NODICT)
					continue;
				GETDICT(md, k) = bylocaledict;
#else
				continue;
#endif
			}
			/* drop anything that is not a valid index into dicts[] */
			if (GETDICT(md, k) < 0 ||
				GETDICT(md, k) >= (int2) lengthof(dicts))
				continue;
			needinit[GETDICT(md, k)] = true;
			k++;
		}
		/* clear the slots that compaction left unused */
		for (; k < MAXNDICT; k++)
			if (GETDICT(md, k) != STOPLEXEM)
				GETDICT(md, k) = NODICT;
	}

	for (i = 1; i < (int) lengthof(dicts); i++)
		if (needinit[i] && dicts[i].init)
			dictobjs[i] = (*(dicts[i].init)) ();

	inited = true;
}
char* lemmatize( char* word, int *len, int type ) {
int2 nd;
int i;
DICT *dict;
for(i=0;i<MAXNDICT;i++) {
nd = GETDICT( &mapdict[type], i );
if ( nd == NODICT ) {
/* there is no dictionary */
return word;
} else if ( nd == STOPLEXEM ) {
/* word is stopword */
return NULL;
} else {
dict = &dicts[ nd ];
if ( dict->is_stoplemm && (*(dict->is_stoplemm))(dictobjs[nd], word, *len) )
return NULL;
if ( dict->lemmatize ) {
int oldlen = *len;
char *newword = (*(dict->lemmatize))(dictobjs[nd], word, len);
/* word is recognized by distionary */
if ( newword != word || *len != oldlen ) {
if ( dict->is_stemstoplemm &&
(*(dict->is_stemstoplemm))(dictobjs[nd], word, *len) ) {
if ( newword != word && newword)
pfree(newword);
return NULL;
}
return newword;
}
}
}
}
return word;
}
/*
 * Tell whether every lexeme of the given type is a stop word, i.e. whether
 * the first selector of its mapdict row is STOPLEXEM.
 * A type that has no row in mapdict is reported as not being a stop type
 * instead of reading outside the table.
 */
bool is_stoptype(int type) {
	if (type < 0 || type >= (int) lengthof(mapdict))
		return false;
	return GETDICT(&mapdict[type], 0) == STOPLEXEM;
}