From 6943a946c7e5eb72d53c0ce71f08a81a133503bd Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Fri, 11 Mar 2016 19:22:36 +0300 Subject: [PATCH] Tsvector editing functions Adds several tsvector editing functions: convert tsvector to/from text array, set weight for given lexemes, delete lexeme(s), unnest, filter lexemes with given weights Author: Stas Kelvich with some editorization by me Reviewers: Tomas Vondram, Teodor Sigaev --- doc/src/sgml/func.sgml | 91 ++++- doc/src/sgml/textsearch.sgml | 4 + src/backend/utils/adt/tsvector_op.c | 563 ++++++++++++++++++++++++++ src/include/catalog/pg_proc.h | 18 +- src/include/tsearch/ts_type.h | 7 + src/test/regress/expected/tstypes.out | 221 +++++++++- src/test/regress/sql/tstypes.sql | 47 ++- 7 files changed, 933 insertions(+), 18 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 4b5ee8135f..000489d961 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9211,13 +9211,26 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple setweight - setweight(tsvector, "char") + setweight(vector tsvector, weight "char") tsvector - assign weight to each element of tsvector + assign weight to each element of vector setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A') 'cat':3A 'fat':2A,4A 'rat':5A + + + + setweight + setweight by filter + + setweight(vector tsvector, weight "char", lexemes "text"[]) + + tsvector + assign weight to elements of vector that are listed in lexemes array + setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}') + 'cat':3A 'fat':2,4 'rat':5A + @@ -9230,6 +9243,80 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple strip('fat:2,4 cat:3 rat:5A'::tsvector) 'cat' 'fat' 'rat' + + + + delete + delete lexeme + + delete(vector tsvector, lexeme text) + + tsvector + remove given lexeme from vector + delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat') + 'cat':3 'rat':5A + + + + + delete + delete lexemes array + + 
delete(vector tsvector, lexemes text[]) + + tsvector + remove any occurrence of lexemes in lexemes array from vector + delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat']) + 'cat':3 + + + + + unnest + + unnest(tsvector, OUT lexeme text, OUT positions smallint[], OUT weights text[]) + + setof record + expand a tsvector to a set of rows + unnest('fat:2,4 cat:3 rat:5A'::tsvector) + (cat,{3},{D}) ... + + + + + tsvector_to_array + + tsvector_to_array(tsvector) + + text[] + convert tsvector to array of lexemes + tsvector_to_array('fat:2,4 cat:3 rat:5A'::tsvector) + {cat,fat,rat} + + + + + array_to_tsvector + + array_to_tsvector(text[]) + + tsvector + convert array of lexemes to tsvector + array_to_tsvector('{fat,cat,rat}'::text[]) + 'fat' 'cat' 'rat' + + + + + filter + + filter(vector tsvector, weights "char"[]) + + tsvector + select only elements with given weights from vector + filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}') + 'cat':3B 'rat':5A + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index ff99976068..ea3abc9e15 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank + + Full list of tsvector-related functions available in . 
+ + diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index a3f1c36187..6a01276ca2 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" #include "commands/trigger.h" @@ -65,6 +66,7 @@ typedef struct #define STATHDRSIZE (offsetof(TSVectorStat, data)) static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); +static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len); /* * Order: haspos, len, word, for all positions (pos, weight) @@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS) PG_RETURN_POINTER(out); }
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight char_weight to every position of those lexemes of tsin
+ * that are listed in lexemes.  Array elements that do not occur in tsin,
+ * and lexemes stored without positions, are skipped.  A NULL array element
+ * raises an error.  The result is a modified copy of tsin.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0);
+	char		char_weight = PG_GETARG_CHAR(1);
+	ArrayType  *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+	TSVector	tsout;
+	int			i,
+				j,
+				nlexemes,
+				weight;
+	WordEntry  *entry;
+	Datum	   *dlexemes;
+	bool	   *nulls;
+
+	/* Map the weight letter (either case) onto its numeric code. */
+	switch (char_weight)
+	{
+		case 'A': case 'a':
+			weight = 3;
+			break;
+		case 'B': case 'b':
+			weight = 2;
+			break;
+		case 'C': case 'c':
+			weight = 1;
+			break;
+		case 'D': case 'd':
+			weight = 0;
+			break;
+		default:
+			/* internal error */
+			elog(ERROR, "unrecognized weight: %c", char_weight);
+	}
+
+	/* Work on a private copy so the input value is never modified. */
+	tsout = (TSVector) palloc(VARSIZE(tsin));
+	memcpy(tsout, tsin, VARSIZE(tsin));
+	entry = ARRPTR(tsout);
+
+	deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+					  &dlexemes, &nulls, &nlexemes);
+
+	/*
+	 * Assuming that lexemes array is significantly shorter than tsvector
+	 * we can iterate through lexemes performing binary search
+	 * of each lexeme from lexemes in tsvector.
+	 */
+	for (i = 0; i < nlexemes; i++)
+	{
+		char	   *lex;
+		int			lex_len,
+					lex_pos;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+
+		/*
+		 * Fetch data pointer and length with the matching *_ANY macros so
+		 * that both agree whatever varlena header form the element has.
+		 */
+		lex = VARDATA_ANY(dlexemes[i]);
+		lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+		lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+		/* Re-weight each stored position of a lexeme that was found. */
+		if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+		{
+			WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+
+			while (j--)
+			{
+				WEP_SETWEIGHT(*p, weight);
+				p++;
+			}
+		}
+	}
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(lexemes, 2);
+
+	PG_RETURN_POINTER(tsout);
+}
+
 #define compareEntry(pa, a, pb, b) \ tsCompareString((pa) + (a)->pos, (a)->len, \ (pb) + (b)->pos, (b)->len, \ @@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr, return *clen - startlen; }
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ *
+ * Lexemes are compared with tsCompareString() (prefix matching off), so
+ * the entry array has to be ordered accordingly for the search to work.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+	WordEntry  *arrin = ARRPTR(tsv);
+	int			StopLow = 0,
+				StopHigh = tsv->size,
+				StopMiddle,
+				cmp;
+
+	/* Invariant: a match, if any, lies in [StopLow, StopHigh). */
+	while (StopLow < StopHigh)
+	{
+		StopMiddle = (StopLow + StopHigh)/2;
+
+		cmp = tsCompareString(lexeme, lexeme_len,
+							  STRPTR(tsv) + arrin[StopMiddle].pos,
+							  arrin[StopMiddle].len,
+							  false);
+
+		if (cmp < 0)
+			StopHigh = StopMiddle;
+		else if (cmp > 0)
+			StopLow = StopMiddle + 1;
+		else	/* found it */
+			return StopMiddle;
+	}
+
+	return -1;
+}
+
+/*
+ * qsort comparator: orders int32 values ascending.
+ */
+static int
+compareint(const void *va, const void *vb)
+{
+	int32		a = *((const int32 *) va);
+	int32		b = *((const int32 *) vb);
+
+	if (a == b)
+		return 0;
+	return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+						   int indices_count)
+{
+	TSVector	tsout;
+	WordEntry  *arrin = ARRPTR(tsv),
+			   *arrout;
+	char	   *data = STRPTR(tsv),
+			   *dataout;
+	int			i, j, k,
+				curoff;
+
+	/*
+	 * Sort the filter array to simplify the membership check below, and
+	 * squeeze out duplicate entries so that it is strictly increasing.
+	 * Without that, a lexeme listed twice by the caller would be counted
+	 * twice in the output size while only one entry is actually skipped.
+	 * Note that indices_to_delete is reordered and compacted in place.
+	 */
+	if (indices_count > 1)
+	{
+		qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+		for (j = 0, k = 1; k < indices_count; k++)
+		{
+			if (indices_to_delete[k] != indices_to_delete[j])
+				indices_to_delete[++j] = indices_to_delete[k];
+		}
+		indices_count = j + 1;
+	}
+
+	/*
+	 * Here we overestimate tsout size, since we don't know exact size
+	 * occupied by positions and weights. We will set exact size later
+	 * after a pass through TSVector.
+	 */
+	tsout = (TSVector) palloc0(VARSIZE(tsv));
+	arrout = ARRPTR(tsout);
+	/* The entry count must be final before STRPTR(tsout) is taken below. */
+	tsout->size = tsv->size - indices_count;
+
+	/*
+	 * Copy tsv to tsout skipping lexemes that are listed in
+	 * indices_to_delete.
+	 */
+	curoff = 0;
+	dataout = STRPTR(tsout);
+	for (i = j = k = 0; i < tsv->size; i++)
+	{
+		/*
+		 * Here we should check whether current i is present in
+		 * indices_to_delete or not. Since indices_to_delete is already
+		 * sorted we can advance its index only when we have a match.
+		 */
+		if (k < indices_count && i == indices_to_delete[k])
+		{
+			k++;
+			continue;
+		}
+
+		/* Copy lexeme, its positions and weights */
+		memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+		arrout[j].haspos = arrin[i].haspos;
+		arrout[j].len = arrin[i].len;
+		arrout[j].pos = curoff;
+		curoff += arrin[i].len;
+		if (arrin[i].haspos)
+		{
+			/* position count (uint16) followed by the positions */
+			int			len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
+							  sizeof(uint16);
+
+			curoff = SHORTALIGN(curoff);
+			memcpy(dataout + curoff,
+				   data + SHORTALIGN(arrin[i].pos + arrin[i].len),
+				   len);
+			curoff += len;
+		}
+
+		j++;
+	}
+
+	/*
+	 * After the pass through TSVector k should be exactly equal to
+	 * indices_count. If it isn't then the caller provided us with indices
+	 * outside of [0, tsv->size) range and estimation of tsout's size is
+	 * wrong.
+	 */
+	Assert(k == indices_count);
+
+	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+	return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ *
+ * If the lexeme does not occur in the tsvector, the input is returned as is.
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	text	   *tlexeme = PG_GETARG_TEXT_P(1);
+	char	   *lexeme = VARDATA(tlexeme);
+	int			lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+				skip_index;
+
+	if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+		PG_RETURN_POINTER(tsin);
+
+	tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(tlexeme, 1);
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ *
+ * Array elements that do not occur in the tsvector are ignored; a NULL
+ * element raises an error.
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	ArrayType  *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+	int			i, nlex,
+				skip_count,
+			   *skip_indices;
+	Datum	   *dlexemes;
+	bool	   *nulls;
+
+	deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+					  &dlexemes, &nulls, &nlex);
+
+	/*
+	 * In typical use case array of lexemes to delete is relatively small.
+	 * So here we are optimizing things for that scenario: iterate through
+	 * lexarr performing binary search of each lexeme from lexarr in tsvector.
+	 */
+	skip_indices = palloc0(nlex * sizeof(int));
+	for (i = skip_count = 0; i < nlex; i++)
+	{
+		char	   *lex;
+		int			lex_len,
+					lex_pos;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+
+		lex = VARDATA(dlexemes[i]);
+		lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+		lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+		/* Remember only lexemes that are actually present. */
+		if (lex_pos >= 0)
+			skip_indices[skip_count++] = lex_pos;
+	}
+
+	tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+	pfree(skip_indices);
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_FREE_IF_COPY(lexemes, 1);
+
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ *	   lexeme: lexeme text
+ *	   positions: smallint array of lexeme positions
+ *	   weights: text array of weights corresponding to positions
+ *
+ * positions and weights are NULL for a lexeme stored without positions.
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	TSVector	tsin;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		MemoryContext oldcontext;
+		TupleDesc	tupdesc;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		tupdesc = CreateTemplateTupleDesc(3, false);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+						   TEXTOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+						   INT2ARRAYOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+						   TEXTARRAYOID, -1, 0);
+		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+		/* Keep a copy of the input for use across all calls. */
+		funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+	tsin = (TSVector) funcctx->user_fctx;
+
+	/* One result row per lexeme; call_cntr is the lexeme index. */
+	if (funcctx->call_cntr < tsin->size)
+	{
+		WordEntry  *arrin = ARRPTR(tsin);
+		char	   *data = STRPTR(tsin);
+		HeapTuple	tuple;
+		int			j,
+					i = funcctx->call_cntr;
+		bool		nulls[] = {false, false, false};
+		Datum		values[3];
+
+		values[0] = PointerGetDatum(
+				cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
+		);
+
+		if (arrin[i].haspos)
+		{
+			WordEntryPosVector *posv;
+			Datum	   *positions;
+			Datum	   *weights;
+			char		weight;
+
+			/*
+			 * Internally tsvector stores position and weight in the same
+			 * uint16 (2 bits for weight, 14 for position). Here we extract
+			 * that in two separate arrays.
+			 */
+			posv = _POSVECPTR(tsin, arrin + i);
+			positions = palloc(posv->npos * sizeof(Datum));
+			weights = palloc(posv->npos * sizeof(Datum));
+			for (j = 0; j < posv->npos; j++)
+			{
+				positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+				/* weight codes 3..0 map to the letters 'A'..'D' */
+				weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+				weights[j] = PointerGetDatum(
+									cstring_to_text_with_len(&weight, 1)
+				);
+			}
+
+			values[1] = PointerGetDatum(
+				construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+			values[2] = PointerGetDatum(
+				construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+		}
+		else
+		{
+			nulls[1] = nulls[2] = true;
+		}
+
+		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+	}
+	else
+	{
+		pfree(tsin);
+		SRF_RETURN_DONE(funcctx);
+	}
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ *
+ * Positions and weights are dropped; lexemes are returned in the order in
+ * which they are stored in the tsvector.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0);
+	WordEntry  *arrin = ARRPTR(tsin);
+	Datum	   *elements;
+	int			i;
+	ArrayType  *array;
+
+	/*
+	 * Allocate the work array with palloc rather than on the stack: its
+	 * length is controlled by the input value.
+	 */
+	elements = palloc(tsin->size * sizeof(Datum));
+
+	for (i = 0; i < tsin->size; i++)
+	{
+		elements[i] = PointerGetDatum(
+			cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
+		);
+	}
+
+	array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+
+	pfree(elements);
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+static int	compare_text_lexemes(const void *va, const void *vb);
+
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
+	TSVector	tsout;
+	Datum	   *dlexemes;
+	WordEntry  *arrout;
+	bool	   *nulls;
+	int			nitems,
+				i,
+				j,
+				tslen,
+				datalen = 0;
+	char	   *cur;
+
+	deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+	/* Reject NULL elements before looking at any element's data. */
+	for (i = 0; i < nitems; i++)
+	{
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("lexeme array may not contain nulls")));
+	}
+
+	/*
+	 * Sort and de-duplicate the lexemes.  tsvector_bsearch() binary-searches
+	 * the entry array using tsCompareString(), so a tsvector whose entries
+	 * are left in input order (or contain repeats) cannot be searched
+	 * correctly.  The same comparison is used here.
+	 */
+	if (nitems > 1)
+	{
+		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
+		for (i = 1, j = 0; i < nitems; i++)
+		{
+			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
+				dlexemes[++j] = dlexemes[i];
+		}
+		nitems = j + 1;
+	}
+
+	/* Space needed for the text of the surviving lexemes. */
+	for (i = 0; i < nitems; i++)
+		datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+	tslen = CALCDATASIZE(nitems, datalen);
+	tsout = (TSVector) palloc0(tslen);
+	SET_VARSIZE(tsout, tslen);
+	tsout->size = nitems;
+	arrout = ARRPTR(tsout);
+	cur = STRPTR(tsout);
+
+	/* Lexemes are stored back to back, without positions. */
+	for (i = 0; i < nitems; i++)
+	{
+		char	   *lex = VARDATA_ANY(dlexemes[i]);
+		int			lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+		memcpy(cur, lex, lex_len);
+		arrout[i].haspos = 0;
+		arrout[i].len = lex_len;
+		arrout[i].pos = cur - STRPTR(tsout);
+		cur += lex_len;
+	}
+
+	PG_FREE_IF_COPY(v, 0);
+	PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * qsort comparator for an array of text Datums holding lexemes; orders
+ * them the way tsCompareString() does with prefix matching off.
+ */
+static int
+compare_text_lexemes(const void *va, const void *vb)
+{
+	Datum		a = *((const Datum *) va);
+	Datum		b = *((const Datum *) vb);
+
+	return tsCompareString(VARDATA_ANY(a), VARSIZE_ANY_EXHDR(a),
+						   VARDATA_ANY(b), VARSIZE_ANY_EXHDR(b),
+						   false);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+	TSVector	tsin = PG_GETARG_TSVECTOR(0),
+				tsout;
+	ArrayType  *weights = PG_GETARG_ARRAYTYPE_P(1);
+	WordEntry  *arrin = ARRPTR(tsin),
+			   *arrout;
+	char	   *datain = STRPTR(tsin),
+			   *dataout;
+	Datum	   *dweights;
+	bool	   *nulls;
+	int			nweights;
+	int			i, j;
+	int			cur_pos = 0;	/* byte offset into dataout; must not be a
+								 * char, the data area exceeds 127 bytes for
+								 * all but tiny inputs */
+	char		mask = 0;		/* bit (1 << weight code) set per wanted
+								 * weight */
+
+	deconstruct_array(weights, CHAROID, 1, true, 'c',
+					  &dweights, &nulls, &nweights);
+
+	/* Build the mask of requested weights; letters match in either case. */
+	for (i = 0; i < nweights; i++)
+	{
+		char		char_weight;
+
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("weight array may not contain nulls")));
+
+		char_weight = DatumGetChar(dweights[i]);
+		switch (char_weight)
+		{
+			case 'A': case 'a':
+				mask = mask | 8;
+				break;
+			case 'B': case 'b':
+				mask = mask | 4;
+				break;
+			case 'C': case 'c':
+				mask = mask | 2;
+				break;
+			case 'D': case 'd':
+				mask = mask | 1;
+				break;
+			default:
+				/* internal error */
+				elog(ERROR, "unrecognized weight: %c", char_weight);
+		}
+	}
+
+	/*
+	 * The output can't be larger than the input.  Lay the data out as if all
+	 * entries survive; it is moved down once the final count is known.
+	 */
+	tsout = (TSVector) palloc0(VARSIZE(tsin));
+	tsout->size = tsin->size;
+	arrout = ARRPTR(tsout);
+	dataout = STRPTR(tsout);
+
+	for (i = j = 0; i < tsin->size; i++)
+	{
+		WordEntryPosVector *posvin,
+				   *posvout;
+		int			npos = 0;
+		int			k;
+
+		/* Lexemes without positions carry no weight, so they never match. */
+		if (!arrin[i].haspos)
+			continue;
+
+		posvin = _POSVECPTR(tsin, arrin + i);
+		posvout = (WordEntryPosVector *)
+			(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+		/* Copy over just the positions carrying a requested weight. */
+		for (k = 0; k < posvin->npos; k++)
+		{
+			if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+				posvout->pos[npos++] = posvin->pos[k];
+		}
+
+		if (!npos) /* no satisfactory positions found, so skip that lexeme */
+			continue;
+
+		arrout[j].haspos = true;
+		arrout[j].len = arrin[i].len;
+		arrout[j].pos = cur_pos;
+
+		memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+		posvout->npos = npos;
+		cur_pos += SHORTALIGN(arrin[i].len);
+		cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
+			sizeof(uint16);
+		j++;
+	}
+
+	/*
+	 * Fewer entries mean the data area starts earlier; shift the collected
+	 * data down to where STRPTR() now points.
+	 */
+	tsout->size = j;
+	if (dataout != STRPTR(tsout))
+		memmove(STRPTR(tsout), dataout, cur_pos);
+
+	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+	PG_FREE_IF_COPY(tsin, 0);
+	PG_RETURN_POINTER(tsout);
+}
 Datum tsvector_concat(PG_FUNCTION_ARGS) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 451bad7b4e..5c71bce07a 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4498,8 +4498,22 @@ DESCR("number of lexemes"); DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ )); DESCR("strip position information"); DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ )); -DESCR("set weight of lexeme's entries"); -DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ )); +DESCR("set given weight for whole tsvector"); +DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ )); +DESCR("set given weight for given lexemes"); +DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ )); +DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ )); +DESCR("delete lexeme"); +DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ )); +DESCR("delete given lexemes"); +DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" 
"{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ )); +DESCR("expand tsvector to set of rows"); +DATA(insert OID = 3326 ( tsvector_to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ )); +DESCR("convert to lexeme's array"); +DATA(insert OID = 3327 ( array_to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ )); +DESCR("build tsvector from lexeme's array"); +DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ )); +DESCR("returns tsvector that contain only postings with given weights"); DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ )); DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ )); diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index dc6067a932..bc99524dc0 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -141,7 +141,14 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS); extern Datum tsvector_length(PG_FUNCTION_ARGS); extern Datum tsvector_strip(PG_FUNCTION_ARGS); extern Datum tsvector_setweight(PG_FUNCTION_ARGS); +extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS); extern Datum tsvector_concat(PG_FUNCTION_ARGS); +extern Datum tsvector_delete_str(PG_FUNCTION_ARGS); +extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS); +extern Datum tsvector_unnest(PG_FUNCTION_ARGS); +extern Datum tsvector_to_array(PG_FUNCTION_ARGS); +extern Datum array_to_tsvector(PG_FUNCTION_ARGS); +extern Datum tsvector_filter(PG_FUNCTION_ARGS); extern Datum 
tsvector_update_trigger_byid(PG_FUNCTION_ARGS); extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS); diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 6284fb6181..a386a46361 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; 'a':3A,4B 'b':2A 'ba':1237 (1 row) -SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); - setweight ----------------------------------------------------------- - 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C -(1 row) - -SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); - strip ---------------- - 'a' 'asd' 'w' -(1 row) - --Base tsquery test SELECT '1'::tsquery; tsquery @@ -625,3 +613,212 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s'); 0.1 (1 row) +-- tsvector editing operations +SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); + strip +--------------- + 'a' 'asd' 'w' +(1 row) + +SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + strip +---------------------------------------------- + 'base' 'hidden' 'rebel' 'spaceship' 'strike' +(1 row) + +SELECT strip('base hidden rebel spaceship strike'::tsvector); + strip +---------------------------------------------- + 'base' 'hidden' 'rebel' 'spaceship' 'strike' +(1 row) + +SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship'); + delete +------------------------------------------ + 'base':7 'hidden':6 'rebel':1 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base'); + delete +-------------------------------------------------------------- + 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D 
strike:3'::tsvector, 'bas'); + delete +----------------------------------------------------------------------- + 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases'); + delete +----------------------------------------------------------------------- + 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship'); + delete +------------------------------------------ + 'base':7 'hidden':6 'rebel':1 'strike':3 +(1 row) + +SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship'); + delete +---------------------------------- + 'base' 'hidden' 'rebel' 'strike' +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']); + delete +-------------------------------- + 'base':7 'hidden':6 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']); + delete +------------------------------------------------------------- + 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']); + delete +------------------------------------------------------------- + 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3 +(1 row) + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); + delete +-------------------------------- + 'base':7 'hidden':6 'strike':3 +(1 row) + +SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); + delete +-------------------------- + 'base' 'hidden' 'strike' +(1 row) + +SELECT delete('base hidden rebel spaceship 
strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); +ERROR: lexeme array may not contain nulls +SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + unnest +--------------------------------------------- + (base,{7},{D}) + (hidden,{6},{D}) + (rebel,{1},{D}) + (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}") + (strike,{3},{D}) +(5 rows) + +SELECT unnest('base hidden rebel spaceship strike'::tsvector); + unnest +--------------- + (base,,) + (hidden,,) + (rebel,,) + (spaceship,,) + (strike,,) +(5 rows) + +SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + lexeme | positions | weights +-----------+-----------------+------------- + base | {7} | {D} + hidden | {6} | {D} + rebel | {1} | {D} + spaceship | {2,33,34,35,36} | {D,A,B,C,D} + strike | {3} | {D} +(5 rows) + +SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector); + lexeme | positions | weights +-----------+-----------+--------- + base | | + hidden | | + rebel | | + spaceship | | + strike | | +(5 rows) + +SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + lexeme | positions +-----------+----------- + base | 7 + hidden | 6 + rebel | 1 + spaceship | 2 + strike | 3 +(5 rows) + +SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + tsvector_to_array +-------------------------------------- + {base,hidden,rebel,spaceship,strike} +(1 row) + +SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector); + tsvector_to_array +-------------------------------------- + {base,hidden,rebel,spaceship,strike} +(1 row) + +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); + array_to_tsvector +---------------------------------------------- + 'base' 'hidden' 'rebel' 'spaceship' 'strike' +(1 row) + +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); +ERROR: 
lexeme array may not contain nulls +SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); + setweight +---------------------------------------------------------- + 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C +(1 row) + +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c'); + setweight +---------------------------------------------------------- + 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C +(1 row) + +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); + setweight +------------------------------------------------------ + 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567 +(1 row) + +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); + setweight +------------------------------------------------------ + 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567 +(1 row) + +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}'); + setweight +-------------------------------------------------------- + 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C +(1 row) + +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); + setweight +--------------------------------- + 'a' 'asd' 'w':5,6,12B,13A 'zxc' +(1 row) + +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); +ERROR: lexeme array may not contain nulls +SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}'); + filter +------------------------------------------------------------- + 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A +(1 row) + +SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}'); + filter +-------- + +(1 row) + +SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}'); +ERROR: weight array may not contain nulls diff --git 
a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index fd7c7024f5..db62c5460d 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector; SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector)); SELECT '''w'':4A,3B,2C,1D,5 a:8'; SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; -SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); -SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); --Base tsquery test SELECT '1'::tsquery; @@ -115,3 +113,48 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s'); SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s'); SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s'); SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s'); + +-- tsvector editing operations + +SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector); +SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); +SELECT strip('base hidden rebel spaceship strike'::tsvector); + +SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship'); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base'); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas'); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases'); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship'); +SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship'); + +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']); +SELECT delete('base:7 hidden:6 rebel:1 
spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']); +SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); +SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); +SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); + +SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); +SELECT unnest('base hidden rebel spaceship strike'::tsvector); +SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); +SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector); +SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); + +SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); +SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector); + +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); + +SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c'); +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c'); +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); +SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}'); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); + +SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}'); +SELECT filter('base hidden rebel spaceship strike'::tsvector, 
'{a}'); +SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}'); +