Fix ts_delete(tsvector, text[]) to cope with duplicate array entries.

Such cases either failed an Assert, or produced a corrupt tsvector in
non-Assert builds, as reported by Andreas Seltenreich.  The reason is
that tsvector_delete_by_indices() just assumed that its input array had
no duplicates.  Fix by explicitly de-duping.

In passing, improve some comments, and fix a number of tests for null
values to use ERRCODE_NULL_VALUE_NOT_ALLOWED not
ERRCODE_INVALID_PARAMETER_VALUE.

Discussion: <87invhoj6e.fsf@credativ.de>
This commit is contained in:
Tom Lane 2016-08-05 15:14:08 -04:00
parent 33fe7360af
commit c50d192ce3
3 changed files with 53 additions and 31 deletions

View File

@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
if (nulls[i]) if (nulls[i])
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls"))); errmsg("lexeme array may not contain nulls")));
lex = VARDATA(dlexemes[i]); lex = VARDATA(dlexemes[i]);
@ -430,7 +430,7 @@ compareint(const void *va, const void *vb)
/* /*
* Internal routine to delete lexemes from TSVector by array of offsets. * Internal routine to delete lexemes from TSVector by array of offsets.
* *
* int *indices_to_delete -- array of lexeme offsets to delete * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
* int indices_count -- size of that array * int indices_count -- size of that array
* *
* Returns new TSVector without given lexemes along with their positions * Returns new TSVector without given lexemes along with their positions
@ -445,35 +445,51 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
*arrout; *arrout;
char *data = STRPTR(tsv), char *data = STRPTR(tsv),
*dataout; *dataout;
int i, int i, /* index in arrin */
j, j, /* index in arrout */
k, k, /* index in indices_to_delete */
curoff; curoff; /* index in dataout area */
/* /*
* Here we overestimates tsout size, since we don't know exact size * Sort the filter array to simplify membership checks below. Also, get
* occupied by positions and weights. We will set exact size later after a * rid of any duplicate entries, so that we can assume that indices_count
* pass through TSVector. * is exactly equal to the number of lexemes that will be removed.
*/
if (indices_count > 1)
{
int kp;
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
kp = 0;
for (k = 1; k < indices_count; k++)
{
if (indices_to_delete[k] != indices_to_delete[kp])
indices_to_delete[++kp] = indices_to_delete[k];
}
indices_count = ++kp;
}
/*
* Here we overestimate tsout size, since we don't know how much space is
* used by the deleted lexeme(s). We will set exact size below.
*/ */
tsout = (TSVector) palloc0(VARSIZE(tsv)); tsout = (TSVector) palloc0(VARSIZE(tsv));
arrout = ARRPTR(tsout);
/* This count must be correct because STRPTR(tsout) relies on it. */
tsout->size = tsv->size - indices_count; tsout->size = tsv->size - indices_count;
/* Sort our filter array to simplify membership check later. */
if (indices_count > 1)
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
/* /*
* Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete. * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
*/ */
curoff = 0; arrout = ARRPTR(tsout);
dataout = STRPTR(tsout); dataout = STRPTR(tsout);
curoff = 0;
for (i = j = k = 0; i < tsv->size; i++) for (i = j = k = 0; i < tsv->size; i++)
{ {
/* /*
* Here we should check whether current i is present in * If current i is present in indices_to_delete, skip this lexeme.
* indices_to_delete or not. Since indices_to_delete is already sorted * Since indices_to_delete is already sorted, we only need to check
* we can advance it index only when we have match. * the current (k'th) entry.
*/ */
if (k < indices_count && i == indices_to_delete[k]) if (k < indices_count && i == indices_to_delete[k])
{ {
@ -481,7 +497,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
continue; continue;
} }
/* Copy lexeme, it's positions and weights */ /* Copy lexeme and its positions and weights */
memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
arrout[j].haspos = arrin[i].haspos; arrout[j].haspos = arrin[i].haspos;
arrout[j].len = arrin[i].len; arrout[j].len = arrin[i].len;
@ -489,8 +505,8 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
curoff += arrin[i].len; curoff += arrin[i].len;
if (arrin[i].haspos) if (arrin[i].haspos)
{ {
int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) + int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
sizeof(uint16); + sizeof(uint16);
curoff = SHORTALIGN(curoff); curoff = SHORTALIGN(curoff);
memcpy(dataout + curoff, memcpy(dataout + curoff,
@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
} }
/* /*
* After the pass through TSVector k should equals exactly to * k should now be exactly equal to indices_count. If it isn't then the
* indices_count. If it isn't then the caller provided us with indices * caller provided us with indices outside of [0, tsv->size) range and
* outside of [0, tsv->size) range and estimation of tsout's size is * estimation of tsout's size is wrong.
* wrong.
*/ */
Assert(k == indices_count); Assert(k == indices_count);
@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
/* /*
* In typical use case array of lexemes to delete is relatively small. So * In typical use case array of lexemes to delete is relatively small. So
* here we optimizing things for that scenario: iterate through lexarr * here we optimize things for that scenario: iterate through lexarr
* performing binary search of each lexeme from lexarr in tsvector. * performing binary search of each lexeme from lexarr in tsvector.
*/ */
skip_indices = palloc0(nlex * sizeof(int)); skip_indices = palloc0(nlex * sizeof(int));
@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
if (nulls[i]) if (nulls[i])
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls"))); errmsg("lexeme array may not contain nulls")));
lex = VARDATA(dlexemes[i]); lex = VARDATA_ANY(dlexemes[i]);
lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
lex_pos = tsvector_bsearch(tsin, lex, lex_len); lex_pos = tsvector_bsearch(tsin, lex, lex_len);
@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS)
{ {
if (nulls[i]) if (nulls[i])
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls"))); errmsg("lexeme array may not contain nulls")));
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]); datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS)
if (nulls[i]) if (nulls[i])
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("weight array may not contain nulls"))); errmsg("weight array may not contain nulls")));
char_weight = DatumGetChar(dweights[i]); char_weight = DatumGetChar(dweights[i]);

View File

@ -1087,6 +1087,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
'base' 'hidden' 'strike' 'base' 'hidden' 'strike'
(1 row) (1 row)
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
ts_delete
--------------------------
'base' 'hidden' 'strike'
(1 row)
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
ERROR: lexeme array may not contain nulls ERROR: lexeme array may not contain nulls
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);

View File

@ -212,6 +212,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']); SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);