postgresql/src/backend/commands/analyze.c

686 lines
20 KiB
C
Raw Normal View History

2000-05-29 19:44:17 +02:00
/*-------------------------------------------------------------------------
 *
 * analyze.c
 *	  the postgres optimizer analyzer
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.16 2001/03/22 06:16:11 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <sys/types.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "access/heapam.h"
#include "catalog/catname.h"
#include "catalog/indexing.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "parser/parse_oper.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/syscache.h"
/*
 * Swap helpers.  Each expands to a single statement (do { } while (0)), so
 * "swapX(a, b);" is safe even as the unbraced body of an if/else.  Both
 * arguments must be modifiable lvalues of the named type; each is evaluated
 * twice, so do not pass expressions with side effects.
 */
#define swapLong(a,b)	do { long swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)
#define swapInt(a,b)	do { int swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)
#define swapDatum(a,b)	do { Datum swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)

/* True if an "=" comparison function was found for the column's type */
#define VacAttrStatsEqValid(stats) ( (stats)->f_cmpeq.fn_addr != NULL )

/*
 * True if "<" and ">" comparison functions were found for the column's type
 * and the type has a valid output function.
 */
#define VacAttrStatsLtGtValid(stats) ( (stats)->f_cmplt.fn_addr != NULL && \
								  (stats)->f_cmpgt.fn_addr != NULL && \
								  RegProcedureIsValid((stats)->outfunc) )
/* Forward declarations for the file-local helpers defined later in this file */
static void attr_stats(Relation onerel, int attr_cnt, VacAttrStats *vacattrstats, HeapTuple tuple);
static void bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len);
static void update_attstats(Oid relid, int natts, VacAttrStats *vacattrstats);
static void del_stats(Oid relid, int attcnt, int *attnums);
/*
 *	analyze_rel() -- analyze relation
 *
 * Gathers per-column statistics for the relation identified by relid and
 * stores them via update_attstats().  The whole operation runs in its own
 * transaction, which is started and committed here.
 *
 *	relid			OID of the relation to analyze
 *	anal_cols2		list of column names (C strings) to analyze, or NIL to
 *					analyze every column of the relation
 *	MESSAGE_LEVEL	elog level used for the progress message
 *
 * Returns quietly, without gathering anything, if the relation's pg_class
 * row has vanished, if the relation is pg_statistic itself, or if the
 * ownership check on the relation fails.
 */
void
analyze_rel(Oid relid, List *anal_cols2, int MESSAGE_LEVEL)
{
    HeapTuple   tuple;
    Relation    onerel;
    int32       i;
    int         attr_cnt,
               *attnums = NULL;
    Form_pg_attribute *attr;
    VacAttrStats *vacattrstats;
    HeapScanDesc scan;

    StartTransactionCommand();

    /*
     * Check for user-requested abort.  Note we want this to be inside a
     * transaction, so xact.c doesn't issue useless NOTICE.
     */
    CHECK_FOR_INTERRUPTS();

    /*
     * Race condition -- if the pg_class tuple has gone away since the
     * last time we saw it, we don't need to vacuum it.
     */
    tuple = SearchSysCache(RELOID,
                           ObjectIdGetDatum(relid),
                           0, 0, 0);
    if (!HeapTupleIsValid(tuple))
    {
        CommitTransactionCommand();
        return;
    }

    /*
     * We can VACUUM ANALYZE any table except pg_statistic; see the header
     * comment of update_attstats() for why pg_statistic is skipped.
     */
    if (strcmp(NameStr(((Form_pg_class) GETSTRUCT(tuple))->relname),
               StatisticRelationName) == 0)
    {
        ReleaseSysCache(tuple);
        CommitTransactionCommand();
        return;
    }
    ReleaseSysCache(tuple);

    onerel = heap_open(relid, AccessShareLock);

    if (!pg_ownercheck(GetUserId(), RelationGetRelationName(onerel),
                       RELNAME))
    {
        /*
         * No message here: a NOTICE saying that only the table owner can
         * VACUUM the table was already issued during vacuum.
         */
        heap_close(onerel, NoLock);
        CommitTransactionCommand();
        return;
    }

    elog(MESSAGE_LEVEL, "Analyzing...");

    attr_cnt = onerel->rd_att->natts;
    attr = onerel->rd_att->attrs;

    if (anal_cols2 != NIL)
    {
        int         tcnt = 0;
        List       *le;

        if (length(anal_cols2) > attr_cnt)
            elog(ERROR, "vacuum: too many attributes specified for relation %s",
                 RelationGetRelationName(onerel));

        /*
         * Translate the requested column names into 0-based indexes into
         * the relation's attribute array.
         */
        attnums = (int *) palloc(attr_cnt * sizeof(int));
        foreach(le, anal_cols2)
        {
            char       *col = (char *) lfirst(le);

            for (i = 0; i < attr_cnt; i++)
            {
                if (namestrcmp(&(attr[i]->attname), col) == 0)
                    break;
            }
            if (i < attr_cnt)   /* found */
                attnums[tcnt++] = i;
            else
            {
                elog(ERROR, "vacuum: there is no attribute %s in %s",
                     col, RelationGetRelationName(onerel));
            }
        }
        /* from here on, attr_cnt counts only the requested columns */
        attr_cnt = tcnt;
    }

    vacattrstats = (VacAttrStats *) palloc(attr_cnt * sizeof(VacAttrStats));

    for (i = 0; i < attr_cnt; i++)
    {
        Operator    func_operator;
        VacAttrStats *stats;

        stats = &vacattrstats[i];

        /* keep a private copy of the column's pg_attribute data */
        stats->attr = palloc(ATTRIBUTE_TUPLE_SIZE);
        memcpy(stats->attr, attr[((attnums) ? attnums[i] : i)],
               ATTRIBUTE_TUPLE_SIZE);

        stats->best = stats->guess1 = stats->guess2 = 0;
        stats->max = stats->min = 0;
        stats->best_len = stats->guess1_len = stats->guess2_len = 0;
        stats->max_len = stats->min_len = 0;
        stats->initialized = false;
        stats->best_cnt = stats->guess1_cnt = stats->guess1_hits = stats->guess2_hits = 0;
        stats->max_cnt = stats->min_cnt = stats->null_cnt = stats->nonnull_cnt = 0;

        /*
         * Look up the "=", "<" and ">" operators taking the column's own
         * type on both sides.  A missing operator is recorded by leaving the
         * matching fn_addr NULL, which the VacAttrStats*Valid macros test.
         */
        func_operator = compatible_oper("=",
                                        stats->attr->atttypid,
                                        stats->attr->atttypid,
                                        true);
        if (func_operator != NULL)
        {
            fmgr_info(oprfuncid(func_operator), &(stats->f_cmpeq));
            ReleaseSysCache(func_operator);
        }
        else
            stats->f_cmpeq.fn_addr = NULL;

        func_operator = compatible_oper("<",
                                        stats->attr->atttypid,
                                        stats->attr->atttypid,
                                        true);
        if (func_operator != NULL)
        {
            fmgr_info(oprfuncid(func_operator), &(stats->f_cmplt));
            stats->op_cmplt = oprid(func_operator);
            ReleaseSysCache(func_operator);
        }
        else
        {
            stats->f_cmplt.fn_addr = NULL;
            stats->op_cmplt = InvalidOid;
        }

        func_operator = compatible_oper(">",
                                        stats->attr->atttypid,
                                        stats->attr->atttypid,
                                        true);
        if (func_operator != NULL)
        {
            fmgr_info(oprfuncid(func_operator), &(stats->f_cmpgt));
            ReleaseSysCache(func_operator);
        }
        else
            stats->f_cmpgt.fn_addr = NULL;

        /* fetch the type's output function and element type from pg_type */
        tuple = SearchSysCache(TYPEOID,
                               ObjectIdGetDatum(stats->attr->atttypid),
                               0, 0, 0);
        if (HeapTupleIsValid(tuple))
        {
            stats->outfunc = ((Form_pg_type) GETSTRUCT(tuple))->typoutput;
            stats->typelem = ((Form_pg_type) GETSTRUCT(tuple))->typelem;
            ReleaseSysCache(tuple);
        }
        else
        {
            stats->outfunc = InvalidOid;
            stats->typelem = InvalidOid;
        }
    }

    /*
     * delete existing pg_statistic rows for relation (all of them when no
     * column list was given, since attcnt is then passed as 0)
     */
    del_stats(relid, ((attnums) ? attr_cnt : 0), attnums);

    /* scan relation to gather statistics */
    scan = heap_beginscan(onerel, false, SnapshotNow, 0, NULL);

    while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
        attr_stats(onerel, attr_cnt, vacattrstats, tuple);

    heap_endscan(scan);

    /* close rel, but keep lock so it doesn't go away before commit */
    heap_close(onerel, NoLock);

    /* store the gathered statistics in pg_attribute and pg_statistic */
    update_attstats(relid, attr_cnt, vacattrstats);

    CommitTransactionCommand();
}
/*
 *	attr_stats() -- compute column statistics used by the planner
 *
 *	We compute the column min, max, null and non-null counts.
 *	Plus we attempt to find the count of the value that occurs most
 *	frequently in each column.  These figures are used to compute
 *	the selectivity of the column.
 *
 *	We use a three-bucket cache to get the most frequent item.
 *	The 'guess' buckets count hits.  A cache miss causes guess1
 *	to get the most hit 'guess' item in the most recent cycle, and
 *	the new item goes into guess2.  Whenever the total count of hits
 *	of a 'guess' entry is larger than 'best', 'guess' becomes 'best'.
 *
 *	This method works perfectly for columns with unique values, and columns
 *	with only two unique values, plus nulls.
 *
 *	It becomes less perfect as the number of unique values increases and
 *	their distribution in the table becomes more random.
 *
 *	onerel			relation being analyzed (supplies the tuple descriptor)
 *	attr_cnt		number of entries in vacattrstats
 *	vacattrstats	per-column accumulators, updated in place
 *	tuple			the heap tuple to fold into the statistics
 *
 *	Columns for which no "=" function is available are skipped entirely.
 */
static void
attr_stats(Relation onerel, int attr_cnt, VacAttrStats *vacattrstats, HeapTuple tuple)
{
    int         i;
    TupleDesc   tupDesc = onerel->rd_att;

    for (i = 0; i < attr_cnt; i++)
    {
        VacAttrStats *stats = &vacattrstats[i];
        Datum       origvalue;
        Datum       value;
        bool        isnull;
        bool        value_hit;

        if (!VacAttrStatsEqValid(stats))
            continue;

#ifdef	_DROP_COLUMN_HACK__
        if (COLUMN_IS_DROPPED(stats->attr))
            continue;
#endif	 /* _DROP_COLUMN_HACK__ */

        origvalue = heap_getattr(tuple, stats->attr->attnum,
                                 tupDesc, &isnull);

        if (isnull)
        {
            stats->null_cnt++;
            continue;
        }
        stats->nonnull_cnt++;

        /*
         * If the value is toasted, detoast it to avoid repeated
         * detoastings and resultant memory leakage inside the comparison
         * routines.
         */
        if (!stats->attr->attbyval && stats->attr->attlen == -1)
            value = PointerGetDatum(PG_DETOAST_DATUM(origvalue));
        else
            value = origvalue;

        /* The first non-null value seeds every bucket */
        if (!stats->initialized)
        {
            bucketcpy(stats->attr, value, &stats->best, &stats->best_len);
            /* best_cnt gets incremented below */
            bucketcpy(stats->attr, value, &stats->guess1, &stats->guess1_len);
            stats->guess1_cnt = stats->guess1_hits = 1;
            bucketcpy(stats->attr, value, &stats->guess2, &stats->guess2_len);
            stats->guess2_hits = 1;
            if (VacAttrStatsLtGtValid(stats))
            {
                bucketcpy(stats->attr, value, &stats->max, &stats->max_len);
                bucketcpy(stats->attr, value, &stats->min, &stats->min_len);
                /* min_cnt, max_cnt get incremented below */
            }
            stats->initialized = true;
        }

        /* Track min and max, and how many times each has been seen */
        if (VacAttrStatsLtGtValid(stats))
        {
            if (DatumGetBool(FunctionCall2(&stats->f_cmplt,
                                           value, stats->min)))
            {
                bucketcpy(stats->attr, value, &stats->min, &stats->min_len);
                stats->min_cnt = 1;
            }
            else if (DatumGetBool(FunctionCall2(&stats->f_cmpeq,
                                                value, stats->min)))
                stats->min_cnt++;

            if (DatumGetBool(FunctionCall2(&stats->f_cmpgt,
                                           value, stats->max)))
            {
                bucketcpy(stats->attr, value, &stats->max, &stats->max_len);
                stats->max_cnt = 1;
            }
            else if (DatumGetBool(FunctionCall2(&stats->f_cmpeq,
                                                value, stats->max)))
                stats->max_cnt++;
        }

        /* Credit whichever bucket (if any) already holds this value */
        value_hit = true;
        if (DatumGetBool(FunctionCall2(&stats->f_cmpeq,
                                       value, stats->best)))
            stats->best_cnt++;
        else if (DatumGetBool(FunctionCall2(&stats->f_cmpeq,
                                            value, stats->guess1)))
        {
            stats->guess1_cnt++;
            stats->guess1_hits++;
        }
        else if (DatumGetBool(FunctionCall2(&stats->f_cmpeq,
                                            value, stats->guess2)))
            stats->guess2_hits++;
        else
            value_hit = false;

        /* Promote guess2 over guess1 once it has collected more hits */
        if (stats->guess2_hits > stats->guess1_hits)
        {
            swapDatum(stats->guess1, stats->guess2);
            swapInt(stats->guess1_len, stats->guess2_len);
            swapLong(stats->guess1_hits, stats->guess2_hits);
            stats->guess1_cnt = stats->guess1_hits;
        }

        /* Promote guess1 over best once its count is larger */
        if (stats->guess1_cnt > stats->best_cnt)
        {
            swapDatum(stats->best, stats->guess1);
            swapInt(stats->best_len, stats->guess1_len);
            swapLong(stats->best_cnt, stats->guess1_cnt);
            stats->guess1_hits = 1;
            stats->guess2_hits = 1;
        }

        /* Cache miss: the new value replaces guess2 and a new cycle starts */
        if (!value_hit)
        {
            bucketcpy(stats->attr, value, &stats->guess2, &stats->guess2_len);
            stats->guess1_hits = 1;
            stats->guess2_hits = 1;
        }

        /* Clean up detoasted copy, if any */
        if (value != origvalue)
            pfree(DatumGetPointer(value));
    }
}
/*
 *	bucketcpy() -- copy a new value into one of the statistics buckets
 *
 *	attr		attribute descriptor; attbyval and attlen decide how to copy
 *	value		the value to store
 *	bucket		in/out: the bucket's Datum
 *	bucket_len	in/out: bytes currently allocated for *bucket (0 = none)
 *
 *	Pass-by-value attributes are stored directly in *bucket.  Otherwise the
 *	value's bytes are copied into palloc'd storage belonging to the bucket:
 *	attlen bytes for fixed-length types, VARSIZE(value) bytes when attlen
 *	is -1.  Existing storage is reused when it is already large enough.
 */
static void
bucketcpy(Form_pg_attribute attr, Datum value, Datum *bucket, int *bucket_len)
{
    if (attr->attbyval)
        *bucket = value;
    else
    {
        int         len = (attr->attlen != -1 ? attr->attlen : VARSIZE(value));

        /* Avoid unnecessary palloc() traffic... */
        if (len > *bucket_len)
        {
            if (*bucket_len != 0)
                pfree(DatumGetPointer(*bucket));
            *bucket = PointerGetDatum(palloc(len));
            *bucket_len = len;
        }
        memcpy(DatumGetPointer(*bucket), DatumGetPointer(value), len);
    }
}
/*
 *	update_attstats() -- update attribute statistics for one relation
 *
 *		Statistics are stored in several places: the pg_class row for the
 *		relation has stats about the whole relation, the pg_attribute rows
 *		for each attribute store "dispersion", and there is a pg_statistic
 *		row for each (non-system) attribute.  (Dispersion probably ought to
 *		be moved to pg_statistic, but it's not worth doing unless there's
 *		another reason to have to change pg_attribute.)  The pg_class values
 *		are updated by VACUUM, not here.
 *
 *		We violate no-overwrite semantics here by storing new values for
 *		the dispersion column directly into the pg_attribute tuple that's
 *		already on the page.  The reason for this is that if we updated
 *		these tuples in the usual way, vacuuming pg_attribute itself
 *		wouldn't work very well --- by the time we got done with a vacuum
 *		cycle, most of the tuples in pg_attribute would've been obsoleted.
 *		Updating pg_attribute's own statistics would be especially tricky.
 *		Of course, this only works for fixed-size never-null columns, but
 *		dispersion is.
 *
 *		pg_statistic rows are just added normally.  This means that
 *		pg_statistic will probably contain some deleted rows at the
 *		completion of a vacuum cycle, unless it happens to get vacuumed last.
 *
 *		To keep things simple, we punt for pg_statistic, and don't try
 *		to compute or store rows for pg_statistic itself in pg_statistic.
 *		This could possibly be made to work, but it's not worth the trouble.
 *
 *	relid			OID of the relation whose statistics are being stored
 *	natts			number of entries in vacattrstats
 *	vacattrstats	per-column statistics gathered by attr_stats()
 */
static void
update_attstats(Oid relid, int natts, VacAttrStats *vacattrstats)
{
    Relation    ad,
                sd;
    HeapScanDesc scan;
    HeapTuple   atup,
                stup;
    ScanKeyData askey;
    Form_pg_attribute attp;

    ad = heap_openr(AttributeRelationName, RowExclusiveLock);
    sd = heap_openr(StatisticRelationName, RowExclusiveLock);

    /*
     * Find pg_attribute rows for this relation.
     *
     * NOTE(review): attrelid is compared with F_INT4EQ and relid is passed
     * without ObjectIdGetDatum(), whereas del_stats() uses F_OIDEQ for its
     * analogous key -- consider making the two consistent.
     */
    ScanKeyEntryInitialize(&askey, 0, Anum_pg_attribute_attrelid,
                           F_INT4EQ, relid);

    scan = heap_beginscan(ad, false, SnapshotNow, 1, &askey);

    while (HeapTupleIsValid(atup = heap_getnext(scan, 0)))
    {
        int         i;
        VacAttrStats *stats;

        attp = (Form_pg_attribute) GETSTRUCT(atup);
        if (attp->attnum <= 0)  /* skip system attributes for now */
            continue;

        for (i = 0; i < natts; i++)
        {
            if (attp->attnum == vacattrstats[i].attr->attnum)
                break;
        }
        if (i >= natts)
            continue;           /* skip attr if no stats collected */
        stats = &(vacattrstats[i]);

        if (VacAttrStatsEqValid(stats))
        {
            float4      selratio;   /* average ratio of rows selected
                                     * for a random constant */

            /* Compute dispersion */
            if (stats->nonnull_cnt == 0 && stats->null_cnt == 0)
            {
                /*
                 * empty relation, so put a dummy value in attdispersion
                 */
                selratio = 0;
            }
            else if (stats->null_cnt <= 1 && stats->best_cnt == 1)
            {
                /*
                 * looks like we have a unique-key attribute --- flag this
                 * with special -1.0 flag value.
                 *
                 * The correct dispersion is 1.0/numberOfRows, but since the
                 * relation row count can get updated without recomputing
                 * dispersion, we want to store a "symbolic" value and
                 * figure 1.0/numberOfRows on the fly.
                 */
                selratio = -1;
            }
            else
            {
                if (VacAttrStatsLtGtValid(stats) &&
                    stats->min_cnt + stats->max_cnt == stats->nonnull_cnt)
                {
                    /*
                     * exact result when every non-null row holds either
                     * the min or the max value: sum the squared fractions
                     * of the min, max and null groups.
                     */
                    double      min_cnt_d = stats->min_cnt,
                                max_cnt_d = stats->max_cnt,
                                null_cnt_d = stats->null_cnt;
                    double      total = ((double) stats->nonnull_cnt) + null_cnt_d;

                    selratio = (min_cnt_d * min_cnt_d + max_cnt_d * max_cnt_d + null_cnt_d * null_cnt_d) / (total * total);
                }
                else
                {
                    double      most = (double) (stats->best_cnt > stats->null_cnt ? stats->best_cnt : stats->null_cnt);
                    double      total = ((double) stats->nonnull_cnt) + ((double) stats->null_cnt);

                    /*
                     * we assume count of other values are 20% of best
                     * count in table
                     */
                    selratio = (most * most + 0.20 * most * (total - most)) / (total * total);
                }
                /* Make sure calculated values are in-range */
                if (selratio < 0.0)
                    selratio = 0.0;
                else if (selratio > 1.0)
                    selratio = 1.0;
            }

            /* overwrite the existing statistics in the tuple */
            attp->attdispersion = selratio;

            /* invalidate the tuple in the cache and write the buffer */
            RelationInvalidateHeapTuple(ad, atup);
            WriteNoReleaseBuffer(scan->rs_cbuf);

            /*
             * Create pg_statistic tuples for the relation, if we have
             * gathered the right data.  del_stats() previously deleted
             * all the pg_statistic tuples for the rel, so we just have to
             * insert new ones here.
             *
             * Note analyze_rel() has seen to it that we won't come here when
             * vacuuming pg_statistic itself.
             */
            if (VacAttrStatsLtGtValid(stats) && stats->initialized)
            {
                float4      nullratio;
                float4      bestratio;
                FmgrInfo    out_function;
                char       *out_string;
                double      best_cnt_d = stats->best_cnt,
                            null_cnt_d = stats->null_cnt,
                            nonnull_cnt_d = stats->nonnull_cnt; /* prevent overflow */
                Datum       values[Natts_pg_statistic];
                char        nulls[Natts_pg_statistic];
                Relation    irelations[Num_pg_statistic_indices];

                /*
                 * stats->initialized means at least one non-null value was
                 * seen, so the divisor below cannot be zero.
                 */
                nullratio = null_cnt_d / (nonnull_cnt_d + null_cnt_d);
                bestratio = best_cnt_d / (nonnull_cnt_d + null_cnt_d);

                fmgr_info(stats->outfunc, &out_function);

                /* ' ' marks every column of the new tuple as non-null */
                for (i = 0; i < Natts_pg_statistic; ++i)
                    nulls[i] = ' ';

                /*
                 * initialize values[]
                 *
                 * best, min and max are stored as text, produced by running
                 * the column type's output function and then textin.
                 */
                i = 0;
                values[i++] = ObjectIdGetDatum(relid);              /* starelid */
                values[i++] = Int16GetDatum(attp->attnum);          /* staattnum */
                values[i++] = ObjectIdGetDatum(stats->op_cmplt);    /* staop */
                values[i++] = Float4GetDatum(nullratio);            /* stanullfrac */
                values[i++] = Float4GetDatum(bestratio);            /* stacommonfrac */
                out_string = DatumGetCString(FunctionCall3(&out_function,
                                                           stats->best,
                                                           ObjectIdGetDatum(stats->typelem),
                                                           Int32GetDatum(stats->attr->atttypmod)));
                values[i++] = DirectFunctionCall1(textin,           /* stacommonval */
                                                  CStringGetDatum(out_string));
                pfree(out_string);
                out_string = DatumGetCString(FunctionCall3(&out_function,
                                                           stats->min,
                                                           ObjectIdGetDatum(stats->typelem),
                                                           Int32GetDatum(stats->attr->atttypmod)));
                values[i++] = DirectFunctionCall1(textin,           /* staloval */
                                                  CStringGetDatum(out_string));
                pfree(out_string);
                out_string = DatumGetCString(FunctionCall3(&out_function,
                                                           stats->max,
                                                           ObjectIdGetDatum(stats->typelem),
                                                           Int32GetDatum(stats->attr->atttypmod)));
                values[i++] = DirectFunctionCall1(textin,           /* stahival */
                                                  CStringGetDatum(out_string));
                pfree(out_string);

                stup = heap_formtuple(sd->rd_att, values, nulls);

                /* store tuple and update indexes too */
                heap_insert(sd, stup);

                CatalogOpenIndices(Num_pg_statistic_indices, Name_pg_statistic_indices, irelations);
                CatalogIndexInsert(irelations, Num_pg_statistic_indices, sd, stup);
                CatalogCloseIndices(Num_pg_statistic_indices, irelations);

                /* release allocated space */
                pfree(DatumGetPointer(values[Anum_pg_statistic_stacommonval - 1]));
                pfree(DatumGetPointer(values[Anum_pg_statistic_staloval - 1]));
                pfree(DatumGetPointer(values[Anum_pg_statistic_stahival - 1]));
                heap_freetuple(stup);
            }
        }
    }
    heap_endscan(scan);
    /* close rels, but hold locks till upcoming commit */
    heap_close(ad, NoLock);
    heap_close(sd, NoLock);
}
/*
 *	del_stats() -- delete pg_statistic rows for a relation
 *
 *	If a list of attribute numbers is given, only zap stats for those attrs.
 *
 *	relid		OID of the relation whose pg_statistic rows are removed
 *	attcnt		number of entries in attnums; 0 (or less) deletes the rows
 *				for every attribute of the relation
 *	attnums		0-based attribute indexes, as built by analyze_rel(); only
 *				examined when attcnt > 0
 */
static void
del_stats(Oid relid, int attcnt, int *attnums)
{
    Relation    pgstatistic;
    HeapScanDesc scan;
    HeapTuple   tuple;
    ScanKeyData key;

    pgstatistic = heap_openr(StatisticRelationName, RowExclusiveLock);

    ScanKeyEntryInitialize(&key, 0x0, Anum_pg_statistic_starelid,
                           F_OIDEQ, ObjectIdGetDatum(relid));
    scan = heap_beginscan(pgstatistic, false, SnapshotNow, 1, &key);

    while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
    {
        if (attcnt > 0)
        {
            Form_pg_statistic pgs = (Form_pg_statistic) GETSTRUCT(tuple);
            int         i;

            for (i = 0; i < attcnt; i++)
            {
                /* attnums[] is 0-based, staattnum is 1-based */
                if (pgs->staattnum == attnums[i] + 1)
                    break;
            }
            if (i >= attcnt)
                continue;       /* don't delete it */
        }
        simple_heap_delete(pgstatistic, &tuple->t_self);
    }

    heap_endscan(scan);

    /*
     * Close rel, but *keep* lock; we will need to reacquire it later, so
     * there's a possibility of deadlock against another VACUUM process if
     * we let go now.  Keeping the lock shouldn't delay any common
     * operation other than an attempted VACUUM of pg_statistic itself.
     */
    heap_close(pgstatistic, NoLock);
}