/*
 * In/Out definitions for txtidx type
 * Internal structure:
 *	string of values, array of positions of the lexemes in that string
 *	and their lengths
 * Teodor Sigaev
 */
#include "postgres.h"

#include "access/gist.h"
#include "access/itup.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
#include "executor/spi.h"
#include "commands/trigger.h"

#include "utils/pg_locale.h"

#include <ctype.h>				/* tolower */
#include "txtidx.h"
#include "query.h"

#include "deflex.h"
#include "parser.h"

#include "morph.h"

PG_FUNCTION_INFO_V1(txtidx_in);
Datum		txtidx_in(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(txtidx_out);
Datum		txtidx_out(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(txt2txtidx);
Datum		txt2txtidx(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(tsearch);
Datum		tsearch(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(txtidxsize);
Datum		txtidxsize(PG_FUNCTION_ARGS);

/*
 * in/out text index type
 */

/*
 * Character buffer that the WordEntry positions refer to while qsort() runs.
 * qsort() has no user-data argument, so uniqueentry() sets this file-scope
 * pointer just before sorting and compareentry() reads it.
 */
static char *BufferStr;

/*
 * qsort() comparator for WordEntry items: shorter lexemes sort first,
 * lexemes of equal length are ordered by strncmp() on their bytes in
 * BufferStr.
 */
static int
compareentry(const void *a, const void *b)
{
	const WordEntry *ea = (const WordEntry *) a;
	const WordEntry *eb = (const WordEntry *) b;

	if (ea->len == eb->len)
		return strncmp(&BufferStr[ea->pos], &BufferStr[eb->pos], eb->len);

	return (ea->len > eb->len) ? 1 : -1;
}

/*
 * Sort the l entries of a[] (positions are offsets into buf) and squeeze out
 * duplicates in place.
 *
 * Returns the number of distinct entries left at the front of a[];
 * *outbuflen receives the sum of their lengths.  The caller must pass
 * l >= 1: a[0] is read unconditionally.
 */
static int
uniqueentry(WordEntry *a, int4 l, char *buf, int4 *outbuflen)
{
	WordEntry  *ptr,
			   *res;

	res = a;
	*outbuflen = res->len;
	if (l == 1)
		return l;

	ptr = a + 1;
	BufferStr = buf;
	/* the element size must be that of the array element, not of int4 */
	qsort((void *) a, l, sizeof(WordEntry), compareentry);

	/* a[0] may be a different entry after sorting, so restart the total */
	*outbuflen = res->len;

	while (ptr - a < l)
	{
		if (!(ptr->len == res->len &&
			  strncmp(&buf[ptr->pos], &buf[res->pos], res->len) == 0))
		{
			/* new distinct lexeme: append it after the last kept one */
			res++;
			res->len = ptr->len;
			res->pos = ptr->pos;
			*outbuflen += res->len;
		}
		ptr++;
	}

	return res + 1 - a;
}

/* states of the tokenizer in gettoken_txtidx() */
#define WAITWORD		1		/* skipping blanks, looking for a token start */
#define WAITENDWORD		2		/* inside an unquoted token */
#define WAITNEXTCHAR	3		/* previous char was '\', take next one as is */
#define WAITENDCMPLX	4		/* inside a '...' quoted token */

/*
 * Double state->word when it is full, keeping state->curpos at the same
 * offset inside the (possibly moved) buffer.
 */
#define RESIZEPRSBUF \
do { \
	if ( state->curpos - state->word == state->len ) \
	{ \
		int4 clen = state->curpos - state->word; \
		state->len *= 2; \
		state->word = (char*)repalloc( (void*)state->word, state->len ); \
		state->curpos = state->word + clen; \
	} \
} while (0)

/*
 * Extract the next token from state->prsbuf into state->word.
 *
 * Tokens are either blank-delimited words or '...'-quoted strings; a
 * backslash makes the following character literal in both forms.  When
 * state->oprisdelim is set, characters matching ISOPERATOR() also end an
 * unquoted word (and are a syntax error where a word is expected).
 *
 * Returns 1 with a NUL-terminated token in state->word (state->curpos points
 * at the terminator), or 0 when the input is exhausted.  Malformed input is
 * reported through elog(ERROR).
 */
int4
gettoken_txtidx(TI_IN_STATE *state)
{
	int4		oldstate = 0;

	state->curpos = state->word;
	state->state = WAITWORD;

	while (1)
	{
		if (state->state == WAITWORD)
		{
			if (*(state->prsbuf) == '\0')
			{
				return 0;
			}
			else if (*(state->prsbuf) == '\'')
			{
				state->state = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
			{
				elog(ERROR, "Syntax error");
			}
			else if (*(state->prsbuf) != ' ')
			{
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
				state->state = WAITENDWORD;
			}
		}
		else if (state->state == WAITNEXTCHAR)
		{
			if (*(state->prsbuf) == '\0')
			{
				elog(ERROR, "There is no escaped character");
			}
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
				/* go back to the state the backslash was seen in */
				state->state = oldstate;
			}
		}
		else if (state->state == WAITENDWORD)
		{
			if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
					 (state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
			{
				/* the delimiter is not consumed; the next call sees it */
				RESIZEPRSBUF;
				if (state->curpos == state->word)
					elog(ERROR, "Syntax error");
				*(state->curpos) = '\0';
				return 1;
			}
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
			}
		}
		else if (state->state == WAITENDCMPLX)
		{
			if (*(state->prsbuf) == '\'')
			{
				RESIZEPRSBUF;
				*(state->curpos) = '\0';
				if (state->curpos == state->word)
					elog(ERROR, "Syntax error");
				/* step over the closing quote before returning */
				state->prsbuf++;
				return 1;
			}
			else if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\0')
			{
				elog(ERROR, "Syntax error");
			}
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
			}
		}
		else
		{
			elog(ERROR, "Inner bug :(");
		}
		state->prsbuf++;
	}

	return 0;
}

/*
 * Input function: parse the textual form (argument 0) into a txtidx value.
 *
 * Tokens are collected with gettoken_txtidx(), sorted and de-duplicated by
 * uniqueentry(), then packed into a freshly palloc'd txtidx.  Raises
 * elog(ERROR) on empty input, on a token longer than 0xffff bytes, or when a
 * token would start beyond offset 0xffff.
 */
Datum
txtidx_in(PG_FUNCTION_ARGS)
{
	char	   *buf = (char *) PG_GETARG_POINTER(0);
	TI_IN_STATE state;
	WordEntry  *arr;
	int4		len = 0,
				totallen = 64;
	txtidx	   *in;
	char	   *tmpbuf,
			   *cur;
	int4		i,
				buflen = 256;

	state.prsbuf = buf;
	state.len = 32;
	state.word = (char *) palloc(state.len);
	state.oprisdelim = false;

	arr = (WordEntry *) palloc(sizeof(WordEntry) * totallen);
	cur = tmpbuf = (char *) palloc(buflen);

	while (gettoken_txtidx(&state))
	{
		if (len == totallen)
		{
			totallen *= 2;
			/* size the array by its element type, not by int4 */
			arr = (WordEntry *) repalloc((void *) arr,
										 sizeof(WordEntry) * totallen);
		}

		/* grow the scratch string buffer until the new token fits */
		while (cur - tmpbuf + state.curpos - state.word >= buflen)
		{
			int4		dist = cur - tmpbuf;

			buflen *= 2;
			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
			cur = tmpbuf + dist;
		}

		if (state.curpos - state.word > 0xffff)
			elog(ERROR, "Word is too long");
		arr[len].len = state.curpos - state.word;
		if (cur - tmpbuf > 0xffff)
			elog(ERROR, "Too long value");
		arr[len].pos = cur - tmpbuf;
		memcpy((void *) cur, (void *) state.word, arr[len].len);
		cur += arr[len].len;
		len++;
	}
	pfree(state.word);

	if (!len)
		elog(ERROR, "Void value");

	len = uniqueentry(arr, len, tmpbuf, &buflen);
	totallen = CALCDATASIZE(len, buflen);
	in = (txtidx *) palloc(totallen);
	in->len = totallen;
	in->size = len;
	cur = STRPTR(in);

	/*
	 * NOTE(review): the text from this loop condition down to "in->size" in
	 * txtidxsize() was lost from the reviewed copy (a "<...>" span had been
	 * stripped).  It is reconstructed here; confirm against the pristine
	 * source.
	 */
	for (i = 0; i < len; i++)
	{
		/* copy each surviving lexeme and re-base its position */
		memcpy((void *) cur, (void *) &tmpbuf[arr[i].pos], arr[i].len);
		arr[i].pos = cur - STRPTR(in);
		cur += arr[i].len;
	}
	pfree(tmpbuf);
	memcpy((void *) ARRPTR(in), (void *) arr, sizeof(WordEntry) * len);
	pfree(arr);

	PG_RETURN_POINTER(in);
}

/*
 * Return the number of entries (the size field) of the txtidx argument.
 */
Datum
txtidxsize(PG_FUNCTION_ARGS)
{
	txtidx	   *in = (txtidx *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
	int4		ret = in->size;

	PG_FREE_IF_COPY(in, 0);
	PG_RETURN_INT32(ret);
}

/*
 * Output function: render a txtidx as blank-separated, single-quoted
 * lexemes.
 *
 * Quotes and backslashes inside a lexeme are prefixed with a backslash so
 * that the result reads back through gettoken_txtidx() unchanged (the
 * tokenizer treats '\' as an escape in quoted tokens).
 */
Datum
txtidx_out(PG_FUNCTION_ARGS)
{
	txtidx	   *out = (txtidx *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
	char	   *outbuf;
	int4		i,
				j,
				lenbuf;
	WordEntry  *ptr = ARRPTR(out);
	char	   *curin,
			   *curout;

	/* string data + \0 + two quotes per entry + separating blanks */
	lenbuf = STRSIZE(out) + 1 + out->size * 2;
	if (out->size > 0)
		lenbuf += out->size - 1;	/* no separators for an empty value */

	curout = outbuf = (char *) palloc(lenbuf);
	for (i = 0; i < out->size; i++)
	{
		curin = STRPTR(out) + ptr->pos;
		if (i != 0)
			*curout++ = ' ';
		*curout++ = '\'';
		j = ptr->len;
		while (j--)
		{
			if (*curin == '\'' || *curin == '\\')
			{
				/* one more byte is needed for the escaping backslash */
				int4		pos = curout - outbuf;

				outbuf = (char *) repalloc((void *) outbuf, ++lenbuf);
				curout = outbuf + pos;
				*curout++ = '\\';
			}
			*curout++ = *curin++;
		}
		*curout++ = '\'';
		ptr++;
	}
	/* terminate right after what was written */
	*curout = '\0';

	PG_FREE_IF_COPY(out, 0);
	PG_RETURN_POINTER(outbuf);
}

/* one lexeme produced by parsetext(); word is not NUL-terminated here */
typedef struct
{
	uint16		len;
	char	   *word;
} WORD;

/* growable array of WORDs: lenwords slots allocated, curwords in use */
typedef struct
{
	WORD	   *words;
	int4		lenwords;
	int4		curwords;
} PRSTEXT;

/*
 * Parse text to lexems
 *
 * Runs the tokenizer (start_parse_str / tsearch_yylex / end_parse) over
 * buf[0..buflen) and appends every lexeme that lemmatize() does not reject
 * to prs.  When lemmatize() hands back the token itself, a lower-cased copy
 * is stored; otherwise the pointer it returned is stored as is.
 * Raises elog(ERROR) for a token longer than 0xffff bytes.
 */
static void
parsetext(PRSTEXT *prs, char *buf, int4 buflen)
{
	int			type,
				lenlemm;
	char	   *ptr,
			   *ptrw;
	char	   *lemm;

	start_parse_str(buf, buflen);
	while ((type = tsearch_yylex()) != 0)
	{
		if (prs->curwords == prs->lenwords)
		{
			prs->lenwords *= 2;
			prs->words = (WORD *) repalloc((void *) prs->words,
										   prs->lenwords * sizeof(WORD));
		}

		if (tokenlen > 0xffff)
		{
			end_parse();
			elog(ERROR, "Word is too long");
		}

		lenlemm = tokenlen;
		lemm = lemmatize(token, &lenlemm, type);

		if (!lemm)
			continue;			/* nothing to index for this token */

		if (lemm != token)
		{
			prs->words[prs->curwords].len = lenlemm;
			prs->words[prs->curwords].word = lemm;
		}
		else
		{
			prs->words[prs->curwords].len = lenlemm;
			ptrw = prs->words[prs->curwords].word = (char *) palloc(lenlemm);
			ptr = token;
			while (ptr - token < lenlemm)
			{
				*ptrw = tolower((unsigned char) *ptr);
				ptr++;
				ptrw++;
			}
		}
		prs->curwords++;
	}
	end_parse();
}

/*
 * qsort() comparator for WORD items: shorter words first, equal-length
 * words ordered by strncmp().
 */
static int
compareWORD(const void *a, const void *b)
{
	const WORD *wa = (const WORD *) a;
	const WORD *wb = (const WORD *) b;

	if (wa->len == wb->len)
		return strncmp(wa->word, wb->word, wb->len);

	return (wa->len > wb->len) ? 1 : -1;
}

/*
 * Sort the l entries of a[] and remove duplicates in place, pfree'ing the
 * word buffer of every duplicate dropped.  Returns the number of distinct
 * entries left at the front of a[].  Callers in this file pass l >= 1.
 */
static int
uniqueWORD(WORD *a, int4 l)
{
	WORD	   *ptr,
			   *res;

	if (l == 1)
		return l;

	res = a;
	ptr = a + 1;

	qsort((void *) a, l, sizeof(WORD), compareWORD);

	while (ptr - a < l)
	{
		if (!(ptr->len == res->len &&
			  strncmp(ptr->word, res->word, res->len) == 0))
		{
			res++;
			res->len = ptr->len;
			res->word = ptr->word;
		}
		else
		{
			pfree(ptr->word);
		}
		ptr++;
	}

	return res + 1 - a;
}

/*
 * make value of txtidx
 *
 * De-duplicates prs->words, packs the result into a new palloc'd txtidx and
 * releases every word buffer as well as the prs->words array itself.
 * Raises elog(ERROR) when a lexeme would start beyond offset 0xffff.
 */
static txtidx *
makevalue(PRSTEXT *prs)
{
	int4		i,
				lenstr = 0,
				totallen;
	txtidx	   *in;
	WordEntry  *ptr;
	char	   *str,
			   *cur;

	prs->curwords = uniqueWORD(prs->words, prs->curwords);
	for (i = 0; i < prs->curwords; i++)
		lenstr += prs->words[i].len;

	totallen = CALCDATASIZE(prs->curwords, lenstr);
	in = (txtidx *) palloc(totallen);
	in->len = totallen;
	in->size = prs->curwords;

	ptr = ARRPTR(in);
	cur = str = STRPTR(in);
	for (i = 0; i < prs->curwords; i++)
	{
		ptr->len = prs->words[i].len;
		if (cur - str > 0xffff)
			elog(ERROR, "Value is too big");
		ptr->pos = cur - str;
		ptr++;
		memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len);
		pfree(prs->words[i].word);
		cur += prs->words[i].len;
	}
	pfree(prs->words);
	return in;
}

/*
 * Convert a text value (argument 0) to txtidx.  Returns SQL NULL when
 * parsing yields no lexemes.
 */
Datum
txt2txtidx(PG_FUNCTION_ARGS)
{
	text	   *in = (text *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
	PRSTEXT		prs;
	txtidx	   *out = NULL;

	prs.lenwords = 32;
	prs.curwords = 0;
	prs.words = (WORD *) palloc(sizeof(WORD) * prs.lenwords);

	initmorph();
	parsetext(&prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
	PG_FREE_IF_COPY(in, 0);

	if (prs.curwords)
	{
		/* makevalue() frees prs.words */
		out = makevalue(&prs);
		PG_RETURN_POINTER(out);
	}
	pfree(prs.words);
	PG_RETURN_NULL();
}

/*
 * Trigger
 *
 * Row-level BEFORE INSERT/UPDATE trigger.  Arguments:
 *	 tsearch(txtidx_field, text_field1, ...)
 * The lexemes of all named text/varchar columns are collected and stored in
 * txtidx_field of the returned tuple; the field is set to NULL when no
 * lexemes were found.  Named columns that are missing or of another type
 * are reported with a NOTICE and skipped.
 */
Datum
tsearch(PG_FUNCTION_ARGS)
{
	TriggerData *trigdata;
	Trigger    *trigger;
	Relation	rel;
	HeapTuple	rettuple = NULL;
	int			numidxattr,
				i;
	PRSTEXT		prs;
	Datum		datum = (Datum) 0;

	if (!CALLED_AS_TRIGGER(fcinfo))
		elog(ERROR, "TSearch: Not fired by trigger manager");

	trigdata = (TriggerData *) fcinfo->context;
	if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
		elog(ERROR, "TSearch: Can't process STATEMENT events");
	if (TRIGGER_FIRED_AFTER(trigdata->tg_event))
		elog(ERROR, "TSearch: Must be fired BEFORE event");

	if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
		rettuple = trigdata->tg_trigtuple;
	else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
		rettuple = trigdata->tg_newtuple;
	else
		elog(ERROR, "TSearch: Unknown event");

	trigger = trigdata->tg_trigger;
	rel = trigdata->tg_relation;

	if (trigger->tgnargs < 2)
		elog(ERROR, "TSearch: format tsearch(txtidx_field, text_field1,...)");

	numidxattr = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
	if (numidxattr < 0)
		elog(ERROR, "TSearch: Can not find txtidx_field");

	prs.lenwords = 32;
	prs.curwords = 0;
	prs.words = (WORD *) palloc(sizeof(WORD) * prs.lenwords);

	initmorph();

	/* find all words in indexable column */
	for (i = 1; i < trigger->tgnargs; i++)
	{
		int4		numattr;
		text	   *txt_toasted,
				   *txt;
		bool		isnull;
		Oid			oidtype;

		numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
		oidtype = SPI_gettypeid(rel->rd_att, numattr);
		if (numattr < 0 || (!(oidtype == TEXTOID || oidtype == VARCHAROID)))
		{
			elog(NOTICE, "TSearch: can not find field '%s'", trigger->tgargs[i]);
			continue;
		}

		txt_toasted = (text *) DatumGetPointer(SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull));
		if (isnull)
			continue;

		txt = (text *) DatumGetPointer(PG_DETOAST_DATUM(PointerGetDatum(txt_toasted)));

		parsetext(&prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ);

		/* free the detoasted copy, if one was made */
		if (txt != txt_toasted)
			pfree(txt);
	}

	/* make txtidx value */
	if (prs.curwords)
	{
		datum = PointerGetDatum(makevalue(&prs));
		rettuple = SPI_modifytuple(rel, rettuple, 1, &numidxattr,
								   &datum, NULL);
		pfree(DatumGetPointer(datum));
	}
	else
	{
		char		nulls = 'n';

		pfree(prs.words);
		rettuple = SPI_modifytuple(rel, rettuple, 1, &numidxattr,
								   &datum, &nulls);
	}

	if (rettuple == NULL)
		elog(ERROR, "TSearch: %d returned by SPI_modifytuple", SPI_result);

	return PointerGetDatum(rettuple);
}