/*
 * lexize stream of lexemes
 * Teodor Sigaev
 */
#include "postgres.h"

/*
 * NOTE(review): the names of the two system headers below were missing from
 * the reviewed copy of this file; <ctype.h> and <locale.h> are a best guess.
 * Confirm against the upstream source.
 */
#include <ctype.h>
#include <locale.h>

#include "ts_cfg.h"
#include "dict.h"

/*
 * Put a LexizeData into its initial state: bind it to the configuration
 * "cfg", select the one-word ("basic") mode, and empty both the work queue
 * and the waste list.
 */
void
LexizeInit(LexizeData *ld, TSCfgInfo *cfg)
{
	ld->cfg = cfg;
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
	ld->waste.head = ld->waste.tail = NULL;
	ld->lastRes = NULL;
	ld->tmpRes = NULL;
}

/*
 * Append "newpl" to the tail of the singly linked list "list".
 * newpl must not be NULL; its next pointer is overwritten.
 */
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
	if (list->tail)
	{
		list->tail->next = newpl;
		list->tail = newpl;
	}
	else
		list->head = list->tail = newpl;
	newpl->next = NULL;
}

/*
 * Detach and return the head of "list", or NULL if the list is empty.
 * The tail pointer is cleared when the list becomes empty.
 */
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
	ParsedLex  *res = list->head;

	if (list->head)
		list->head = list->head->next;

	if (list->head == NULL)
		list->tail = NULL;

	return res;
}

/*
 * Queue one parsed lexeme (type, text pointer, text length) for processing.
 *
 * The text pointer is stored as-is, not copied.  The new entry also becomes
 * the current multiword position (ld->curSub).
 */
void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
	/*
	 * Allocate exactly once.  The previous code allocated the node twice and
	 * overwrote the first pointer, so the first chunk was never used or
	 * freed.
	 */
	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));

	newpl->type = type;
	newpl->lemm = lemm;
	newpl->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, newpl);
	ld->curSub = ld->towork.tail;
}

/*
 * Move the head of the work queue to the waste list and restart the
 * dictionary scan position.  The work queue must not be empty: LPLAddTail
 * dereferences the removed node.
 */
static void
RemoveHead(LexizeData *ld)
{
	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

	ld->posDict = 0;
}

/*
 * Dispose of the waste list.  If the caller supplied "correspondLexem", hand
 * the list over through it (the caller then holds the only reference);
 * otherwise pfree every node.  In both cases the waste list ends up empty.
 */
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
	if (correspondLexem)
	{
		*correspondLexem = ld->waste.head;
	}
	else
	{
		ParsedLex  *tmp,
				   *ptr = ld->waste.head;

		while (ptr)
		{
			tmp = ptr->next;
			pfree(ptr);
			ptr = tmp;
		}
	}
	ld->waste.head = ld->waste.tail = NULL;
}

/*
 * Move entries from the head of the work queue to the waste list, up to and
 * including "stop".  When "stop" is reached, ld->curSub is set to the entry
 * that followed it.  If "stop" is not in the queue, the whole queue is moved.
 */
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
	bool		go = true;

	while (ld->towork.head && go)
	{
		if (ld->towork.head == stop)
		{
			/* read stop->next before RemoveHead relinks the node */
			ld->curSub = stop->next;
			go = false;
		}
		RemoveHead(ld);
	}
}

/*
 * Remember "res" as the latest intermediate multiword result and "lex" as
 * the lexeme that produced it.  Any previously stored result is freed first;
 * the stored array is walked until an entry with a NULL lexeme pointer.
 */
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
	if (ld->tmpRes)
	{
		TSLexeme   *ptr;

		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
			pfree(ptr->lexeme);
		pfree(ld->tmpRes);
	}
	ld->tmpRes = res;
	ld->lastRes = lex;
}

/*
 * Run the queued lexemes through the configured dictionaries.
 *
 * Returns the TSLexeme array produced by a dictionary, or NULL when nothing
 * more can be produced from the lexemes queued so far.  Before returning,
 * the list of consumed lexemes is either stored in *correspondLexem (when
 * correspondLexem is not NULL) or freed; see setCorrLex().
 *
 * There are two modes, selected by ld->curDictId:
 *	- InvalidOid: basic mode, one lexeme is offered to each dictionary mapped
 *	  to its type, in order, until one returns a result;
 *	- otherwise: multiword mode, the dictionary ld->curDictId asked for
 *	  following words (it set dictState.getnext) and is fed lexemes starting
 *	  at ld->curSub.
 */
TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	DictInfo   *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */
		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;

			if (curVal->type == 0 || curVal->type >= ld->cfg->len ||
				ld->cfg->map[curVal->type].len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* type is known to be in range: safe to form the pointer now */
			map = ld->cfg->map + curVal->type;

			/*
			 * Start at posDict rather than 0: after a failed multiword
			 * attempt we resume with the dictionary following the one that
			 * asked for more words.
			 */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = finddict(DatumGetObjectId(map->dict_id[i]));

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private = NULL;
				res = (TSLexeme *) DatumGetPointer(
									FunctionCall4(
												  &(dict->lexize_info),
									   PointerGetDatum(dict->dictionary),
										 PointerGetDatum(curVal->lemm),
										Int32GetDatum(curVal->lenlemm),
									  PointerGetDatum(&ld->dictState)
												  ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */
					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no mapped dictionary recognized the lexeme: drop it */
			RemoveHead(ld);
		}
	}
	else
	{
		/* curDictId is valid */
		dict = finddict(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */
		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->len ||
					ld->cfg->map[curVal->type].len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/* type is known to be in range: safe to form the pointer */
				map = ld->cfg->map + curVal->type;

				/*
				 * We should be sure that current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dict_id[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* a lexeme of type 0 is passed on with the isend flag set */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(
								FunctionCall4(
											  &(dict->lexize_info),
								   PointerGetDatum(dict->dictionary),
									 PointerGetDatum(curVal->lemm),
									Int32GetDatum(curVal->lenlemm),
								  PointerGetDatum(&ld->dictState)
											  ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last stored intermediate result */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want next lexeme and didn't recognize anything,
			 * redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	setCorrLex(ld, correspondLexem);
	return NULL;
}