/*
 * lexize stream of lexemes
 * Teodor Sigaev
 */
#include "postgres.h"

/*
 * NOTE(review): the names of the two system headers included here were
 * missing from the text under review (only bare "#include" tokens were
 * left).  <ctype.h> and <locale.h> are a best guess -- confirm against the
 * repository copy.
 */
#include <ctype.h>
#include <locale.h>

#include "ts_cfg.h"
#include "dict.h"

/*
 * Reset a LexizeData to its initial state: remember the configuration,
 * leave multiword mode (curDictId = InvalidOid), and empty both the
 * "towork" and "waste" lists as well as the saved temporary result.
 *
 * ld->dictState is not touched here; LexizeExec() sets its fields before
 * every dictionary call.
 */
void
LexizeInit(LexizeData * ld, TSCfgInfo * cfg)
{
	ld->cfg = cfg;
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
	ld->waste.head = ld->waste.tail = NULL;
	ld->lastRes = NULL;
	ld->tmpRes = NULL;
}

/*
 * Append newpl to the end of list.  newpl must not be NULL: its next
 * pointer is unconditionally cleared.
 */
static void
LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
{
	if (list->tail)
	{
		list->tail->next = newpl;
		list->tail = newpl;
	}
	else
		list->head = list->tail = newpl;
	newpl->next = NULL;
}

/*
 * Detach and return the first element of list, or NULL if the list is
 * empty.  The returned element's next pointer is left as it was.
 */
static ParsedLex *
LPLRemoveHead(ListParsedLex * list)
{
	ParsedLex  *res = list->head;

	if (list->head)
		list->head = list->head->next;

	/* the list became (or already was) empty: keep tail consistent */
	if (list->head == NULL)
		list->tail = NULL;

	return res;
}

/*
 * Queue one parsed lexeme for processing.
 *
 * A new ParsedLex is palloc'ed, filled with the given type and the
 * (lemm, lenlemm) pair, and appended to ld->towork.  The lemm pointer is
 * stored as is, not copied.  ld->curSub is pointed at the new element.
 *
 * The node is allocated exactly once; a previous revision allocated it
 * twice and dropped the first chunk without pfree'ing it.
 */
void
LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
{
	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));

	newpl->type = type;
	newpl->lemm = lemm;
	newpl->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, newpl);
	ld->curSub = ld->towork.tail;
}

/*
 * Move the head of ld->towork to the tail of ld->waste and restart the
 * dictionary position at 0.  Must only be called while towork is non-empty,
 * because LPLAddTail() dereferences the element it is given.
 */
static void
RemoveHead(LexizeData * ld)
{
	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

	ld->posDict = 0;
}

/*
 * Dispose of the waste list.
 *
 * If correspondLexem is not NULL the list head is handed over through it;
 * otherwise every element of the list is pfree'd here.  In both cases
 * ld->waste is left empty.
 */
static void
setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
{
	if (correspondLexem)
	{
		*correspondLexem = ld->waste.head;
	}
	else
	{
		ParsedLex  *tmp,
				   *ptr = ld->waste.head;

		while (ptr)
		{
			tmp = ptr->next;
			pfree(ptr);
			ptr = tmp;
		}
	}
	ld->waste.head = ld->waste.tail = NULL;
}

/*
 * Move elements from the head of ld->towork to ld->waste, up to and
 * including stop (or the whole list if stop is never met).  When stop is
 * reached, ld->curSub is set to the element that followed it.
 */
static void
moveToWaste(LexizeData * ld, ParsedLex * stop)
{
	bool		go = true;

	while (ld->towork.head && go)
	{
		if (ld->towork.head == stop)
		{
			/*
			 * Read stop->next before RemoveHead(): appending the element to
			 * the waste list overwrites its next pointer.
			 */
			ld->curSub = stop->next;
			go = false;
		}
		RemoveHead(ld);
	}
}

/*
 * Remember res as the temporary result and lex as the lexeme it belongs
 * to.  A previously stored temporary result is freed first: each lexeme
 * string of the array (which ends at the first entry whose lexeme is NULL)
 * and then the array itself.
 */
static void
setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
{
	if (ld->tmpRes)
	{
		TSLexeme   *ptr;

		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
			pfree(ptr->lexeme);
		pfree(ld->tmpRes);
	}
	ld->tmpRes = res;
	ld->lastRes = lex;
}

/*
 * Run the queued lexemes through the dictionaries of the configuration.
 *
 * Returns the TSLexeme array produced by a dictionary, or NULL when there
 * is nothing to return with the lexemes queued so far.  Before every return
 * the waste list is passed to setCorrLex(): it is handed over through
 * correspondLexem when that is not NULL, and freed otherwise.
 *
 * The function works in two modes, selected by ld->curDictId:
 *
 *	- InvalidOid (usual mode): the head of ld->towork is offered to the
 *	  dictionaries mapped to its type, starting at index ld->posDict.
 *
 *	- a valid Oid (multiword mode): that dictionary asked for more input
 *	  (dictState.getnext) and is fed the following lexemes, starting at
 *	  ld->curSub.
 *
 * Switching between the modes is done by recursive calls.
 */
TSLexeme *
LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
{
	int			i;
	ListDictionary *map;
	DictInfo   *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;

			/* map is only dereferenced after the range check just below */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/*
			 * posDict is non-zero only after a dictionary asked for more
			 * words; the scan then resumes after that dictionary.
			 */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = finddict(DatumGetObjectId(map->dict_id[i]));

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
													&(dict->lexize_info),
										   PointerGetDatum(dict->dictionary),
											   PointerGetDatum(curVal->lemm),
											  Int32GetDatum(curVal->lenlemm),
											  PointerGetDatum(&ld->dictState)
																  ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary produced a result for this lexeme */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = finddict(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			/* map is only used below when curVal->type is non-zero */
			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->len || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dict_id[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/*
			 * A lexeme of type 0 is passed to the dictionary with isend set
			 * (presumably type 0 marks the end of input -- confirm against
			 * the callers).  dictState.private is deliberately not reset
			 * here, unlike in usual mode.
			 */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
													&(dict->lexize_info),
										   PointerGetDatum(dict->dictionary),
											   PointerGetDatum(curVal->lemm),
											  Int32GetDatum(curVal->lenlemm),
											  PointerGetDatum(&ld->dictState)
																  ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the last stored partial result */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want next lexeme and didn't recognize anything,
			 * redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing to return with the lexemes queued so far */
	setCorrLex(ld, correspondLexem);
	return NULL;
}