The stat function can now show statistics per lexeme weight

This commit is contained in:
Teodor Sigaev 2004-05-28 15:36:49 +00:00
parent 1b9ef0025d
commit a6ea6457fa
6 changed files with 160 additions and 37 deletions

View File

@ -782,6 +782,7 @@ select rank(' a:1 s:2 d g'::tsvector, 'a & s');
(1 row)
insert into test_tsvector (t) values ('foo bar foo the over foo qq bar');
drop trigger tsvectorupdate on test_tsvector;
select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
-----------+------+--------
@ -1933,6 +1934,55 @@ select * from stat('select a from test_tsvector') order by ndoc desc, nentry des
qwerti | 1 | 1
(1146 rows)
insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8');
insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b');
select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
------+------+--------
b | 2 | 3
a | 2 | 2
(2 rows)
select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
------+------+--------
b | 1 | 2
a | 1 | 1
(2 rows)
select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
------+------+--------
b | 2 | 2
a | 1 | 1
(2 rows)
select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
-----------+------+--------
a | 2 | 2
copyright | 2 | 2
foo | 1 | 3
bar | 1 | 2
345 | 1 | 1
b | 1 | 1
qq | 1 | 1
qwerti | 1 | 1
(8 rows)
select * from stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word;
word | ndoc | nentry
-----------+------+--------
a | 2 | 4
b | 2 | 4
copyright | 2 | 2
foo | 1 | 3
bar | 1 | 2
345 | 1 | 1
qq | 1 | 1
qwerti | 1 | 1
(8 rows)
select reset_tsearch();
NOTICE: TSearch cache cleaned
reset_tsearch
@ -2092,7 +2142,6 @@ select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
(5 rows)
--check ordering
drop trigger tsvectorupdate on test_tsvector;
insert into test_tsvector values (null, null);
select a is null, a from test_tsvector order by a;
?column? | a
@ -2108,6 +2157,8 @@ select a is null, a from test_tsvector order by a;
f |
f | '345':1 'qwerti':2 'copyright':3
f | 'qq':7 'bar':2,8 'foo':1,3,6 'copyright':9
f | 'a':1A,2,3C 'b':5A,6B,7C,8B
f | 'a':1A,2,3B 'b':5A,6A,7C,8
f | '7w' 'ch' 'd7' 'eo' 'gw' 'i4' 'lq' 'o6' 'qt' 'y0'
f | 'ar' 'ei' 'kq' 'ma' 'qa' 'qh' 'qq' 'qz' 'rx' 'st'
f | 'gs' 'i6' 'i9' 'j2' 'l0' 'oq' 'qx' 'sc' 'xe' 'yu'
@ -2609,5 +2660,5 @@ select a is null, a from test_tsvector order by a;
f | '1b' '42' 'a7' 'ab' 'ak' 'ap' 'at' 'av' 'ay' 'b0' 'b9' 'bb' 'bp' 'bu' 'bz' 'cq' 'da' 'de' 'dn' 'e0' 'eb' 'ef' 'eg' 'ek' 'eq' 'er' 'eu' 'ey' 'fn' 'ft' 'gg' 'h4' 'hk' 'hl' 'i7' 'ig' 'ik' 'ip' 'ir' 'iu' 'iw' 'jr' 'jw' 'jx' 'kg' 'lc' 'lg' 'm0' 'na' 'np' 'om' 'on' 'oz' 'pg' 'pn' 'ps' 'pt' 'pz' 'q3' 'q6' 'qa' 'qb' 'ql' 'qq' 'qt' 'qv' 'qw' 'qy' 'r8' 'rf' 'ri' 'rk' 'rl' 'rw' 'sg' 'si' 'sp' 'sw' 'ta' 'th' 'ua' 'uj' 'uu' 'uv' 'uz' 'vj' 'vk' 'vm' 'wc' 'wf' 'wh' 'wn' 'wo' 'ww' 'xb' 'xk' 'xt' 'xw' 'y7' 'ye' 'yl' 'yt' 'yw' 'z4' 'z7' 'zc' 'zw'
f | '1h' '3s' 'ab' 'ae' 'ax' 'b1' 'bz' 'cy' 'dk' 'dq' 'ds' 'du' 'e8' 'ef' 'ej' 'ek' 'ex' 'f1' 'fe' 'ff' 'fn' 'fo' 'ft' 'fx' 'ge' 'go' 'gz' 'h6' 'hz' 'i2' 'iv' 'iy' 'j5' 'j6' 'ke' 'kf' 'lh' 'lr' 'mc' 'mj' 'na' 'ng' 'oh' 'om' 'oy' 'p2' 'pi' 'pk' 'py' 'q3' 'qb' 'qc' 'qg' 'qn' 'qo' 'qq' 'qu' 'qw' 'qx' 'qy' 'qz' 'r1' 'rk' 'rl' 'rq' 'rs' 'rt' 'ry' 'rz' 'sk' 'sl' 'so' 't9' 'td' 'te' 'tn' 'tw' 'tz' 'ud' 'uk' 'uo' 'uq' 'uw' 'ux' 'uy' 'v1' 'vg' 'vq' 'w4' 'w9' 'wa' 'wg' 'wj' 'wm' 'wo' 'wr' 'ww' 'wy' 'xf' 'xg' 'y9' 'yh' 'yi' 'yk' 'ym' 'yq' 'yv' 'zm'
t |
(512 rows)
(514 rows)

View File

@ -150,7 +150,15 @@ select rank(' a:1 s:2B d g'::tsvector, 'a & s');
select rank(' a:1 s:2 d g'::tsvector, 'a & s');
insert into test_tsvector (t) values ('foo bar foo the over foo qq bar');
drop trigger tsvectorupdate on test_tsvector;
select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word;
insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8');
insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b');
select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word;
select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word;
select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word;
select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word;
select * from stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word;
select reset_tsearch();
select to_tsquery('default', 'skies & books');
@ -249,7 +257,6 @@ Upon a woman s face. E. J. Pratt (1882 1964)
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
--check ordering
drop trigger tsvectorupdate on test_tsvector;
insert into test_tsvector values (null, null);
select a is null, a from test_tsvector order by a;

View File

@ -15,9 +15,10 @@ Datum
tsstat_in(PG_FUNCTION_ARGS)
{
	/*
	 * The input argument is never read: every call produces the same
	 * header-only value with no entries and no weight mask.
	 */
	tsstat *empty = palloc(STATHDRSIZE);

	empty->weight = 0;			/* zero mask: no per-weight filtering */
	empty->size = 0;			/* no StatEntry records yet */
	empty->len = STATHDRSIZE;	/* total length is just the header */
	PG_RETURN_POINTER(empty);
}
@ -32,6 +33,20 @@ tsstat_out(PG_FUNCTION_ARGS)
PG_RETURN_NULL();
}
/*
 * Count the positions of one lexeme whose weight is selected by a mask.
 *
 * txt    - tsvector that owns the lexeme
 * wptr   - the lexeme's WordEntry inside txt
 * weight - bitmask; a position with weight w is counted when bit
 *          (1 << w) is set
 *
 * Returns the number of matching positions (0 when none match or the
 * position list is empty).
 */
static int
check_weight(tsvector *txt, WordEntry *wptr, int8 weight) {
	int remaining = POSDATALEN(txt, wptr);
	int matched = 0;
	WordEntryPos *pos = POSDATAPTR(txt, wptr);

	/* walk the position array once, testing each weight against the mask */
	for (; remaining != 0; remaining--, pos++) {
		if (weight & (1 << pos->weight))
			matched++;
	}
	return matched;
}
static WordEntry **
SEI_realloc(WordEntry ** in, uint32 *len)
{
@ -83,6 +98,7 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
totallen = CALCSTATSIZE(nentry, slen);
newstat = palloc(totallen);
newstat->len = totallen;
newstat->weight = stat->weight;
newstat->size = nentry;
memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
@ -107,8 +123,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
}
nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
nptr->nentry = POSDATALEN(txt, *ptr);
if (nptr->nentry == 0)
if ( (*ptr)->haspos ) {
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
} else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
@ -127,8 +144,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
}
else
{
nptr->nentry = POSDATALEN(txt, *ptr);
if (nptr->nentry == 0)
if ( (*ptr)->haspos ) {
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
} else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
@ -144,8 +162,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
while (ptr - entry < len)
{
nptr->nentry = POSDATALEN(txt, *ptr);
if (nptr->nentry == 0)
if ( (*ptr)->haspos ) {
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
} else
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
@ -173,12 +192,14 @@ ts_accum(PG_FUNCTION_ARGS)
cur = 0;
StatEntry *sptr;
WordEntry *wptr;
int n=0;
if (stat == NULL || PG_ARGISNULL(0))
{ /* Init in first */
stat = palloc(STATHDRSIZE);
stat->len = STATHDRSIZE;
stat->size = 0;
stat->weight = 0;
}
/* simple check of correctness */
@ -201,32 +222,37 @@ ts_accum(PG_FUNCTION_ARGS)
sptr++;
else if (cmp == 0)
{
int n = POSDATALEN(txt, wptr);
if (n == 0)
n = 1;
sptr->ndoc++;
sptr->nentry += n;
if ( stat->weight == 0 ) {
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
} else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) {
sptr->ndoc++;
sptr->nentry += n;
}
sptr++;
wptr++;
}
else
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
wptr++;
cur++;
}
}
while (wptr - ARRPTR(txt) < txt->size)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
wptr++;
cur++;
}
}
else
@ -243,12 +269,13 @@ ts_accum(PG_FUNCTION_ARGS)
cmp = compareStatWord(sptr, wptr, stat, txt);
if (cmp == 0)
{
int n = POSDATALEN(txt, wptr);
if (n == 0)
n = 1;
sptr->ndoc++;
sptr->nentry += n;
if ( stat->weight == 0 ) {
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
} else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) {
sptr->ndoc++;
sptr->nentry += n;
}
break;
}
else if (cmp < 0)
@ -259,10 +286,12 @@ ts_accum(PG_FUNCTION_ARGS)
if (StopLow >= StopHigh)
{ /* not found */
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
}
wptr++;
}
@ -389,7 +418,7 @@ get_ti_Oid(void)
}
static tsstat *
ts_stat_sql(text *txt)
ts_stat_sql(text *txt, text *ws)
{
char *query = text2char(txt);
int i;
@ -423,6 +452,31 @@ ts_stat_sql(text *txt)
stat = palloc(STATHDRSIZE);
stat->len = STATHDRSIZE;
stat->size = 0;
stat->weight = 0;
if ( ws ) {
	/*
	 * Translate the optional weight string into the bitmask kept in
	 * stat->weight: 'a' -> bit 3, 'b' -> bit 2, 'c' -> bit 1, 'd' -> bit 0.
	 * Letters are matched case-insensitively; any other character is
	 * ignored.  An empty or unrecognised string leaves the mask at 0.
	 */
	char *buf = VARDATA(ws);
	/*
	 * The payload length must come from the varlena header of ws itself;
	 * taking VARSIZE() of a pointer into the data would read text bytes
	 * as a length word.
	 */
	char *end = buf + (VARSIZE(ws) - VARHDRSZ);

	for (; buf < end; buf++) {
		/* cast: passing a negative char to tolower() is undefined */
		switch (tolower((unsigned char) *buf)) {
			case 'a':
				stat->weight |= 1 << 3;
				break;
			case 'b':
				stat->weight |= 1 << 2;
				break;
			case 'c':
				stat->weight |= 1 << 1;
				break;
			case 'd':
				stat->weight |= 1;
				break;
			default:
				/* not a weight letter: nothing to add to the mask */
				break;
		}
	}
}
while (SPI_processed > 0)
{
@ -467,11 +521,13 @@ ts_stat(PG_FUNCTION_ARGS)
{
tsstat *stat;
text *txt = PG_GETARG_TEXT_P(0);
text *ws = (PG_NARGS() > 1) ? PG_GETARG_TEXT_P(1) : NULL;
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
stat = ts_stat_sql(txt);
stat = ts_stat_sql(txt,ws);
PG_FREE_IF_COPY(txt, 0);
if (PG_NARGS() > 1 ) PG_FREE_IF_COPY(ws, 1);
ts_setup_firstcall(funcctx, stat);
SPI_finish();
}

View File

@ -20,10 +20,11 @@ typedef struct
{
/* total byte length of the value; STATHDRSIZE when empty, CALCSTATSIZE(...) otherwise */
int4 len;
/* number of StatEntry records stored after the header */
int4 size;
/*
 * bitmask of position weights to count: bit 3 = 'a', bit 2 = 'b',
 * bit 1 = 'c', bit 0 = 'd'; 0 means no per-weight filtering
 */
int4 weight;
/*
 * NOTE(review): placeholder for the variable-length payload; the STATPTR /
 * STATSTRPTR accessors compute offsets from STATHDRSIZE, not from this member
 */
char data[1];
} tsstat;
#define STATHDRSIZE (sizeof(int4)*2)
#define STATHDRSIZE (sizeof(int4)*4)
#define CALCSTATSIZE(x, lenstr) ( x * sizeof(StatEntry) + STATHDRSIZE + lenstr )
#define STATPTR(x) ( (StatEntry*) ( (char*)x + STATHDRSIZE ) )
#define STATSTRPTR(x) ( (char*)x + STATHDRSIZE + ( sizeof(StatEntry) * ((tsvector*)x)->size ) )

View File

@ -652,6 +652,12 @@ CREATE FUNCTION stat(text)
language 'C'
with (isstrict);
-- stat(query, weights): two-argument form handled by the C function ts_stat,
-- which inspects its argument count. The second argument is a string of
-- weight letters (a, b, c, d; case-insensitive) selecting which positions
-- are counted, e.g. stat('select a from test_tsvector', 'ad').
CREATE FUNCTION stat(text,text)
returns setof statinfo
as 'MODULE_PATHNAME', 'ts_stat'
language 'C'
with (isstrict);
--reset - just for debugging
CREATE FUNCTION reset_tsearch()
returns void

View File

@ -59,6 +59,8 @@ DROP FUNCTION gtsvector_penalty(internal,internal,internal);
DROP FUNCTION gtsvector_picksplit(internal, internal);
DROP FUNCTION gtsvector_union(internal, internal);
DROP FUNCTION reset_tsearch();
DROP FUNCTION stat(text);
-- two-argument form: stat(query text, weights text)
DROP FUNCTION stat(text,text);
DROP FUNCTION tsearch2() CASCADE;
DROP FUNCTION _get_parser_from_curcfg();