Suggest to the user the column they may have meant to reference.

Error messages informing the user that no such column exists can
sometimes provoke a perplexed response.  This often happens due to
a subtle typo in the column name or, perhaps less likely, in the
alias name.  To speed discovery of what the real issue is in such
cases, we'll now search the range table for approximate matches.
If there are one or two such matches that are good enough to think
that they might be what the user intended to type, and better than
all other approximate matches, we'll issue a hint suggesting that
the user might have intended to reference those columns.

Peter Geoghegan and Robert Haas
This commit is contained in:
Robert Haas 2015-03-11 10:44:04 -04:00
parent bbfd7edae5
commit e529cd4ffa
8 changed files with 334 additions and 38 deletions

View File

@ -556,7 +556,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
colname = strVal(field2);
/* Try to identify as a column of the RTE */
node = scanRTEForColumn(pstate, rte, colname, cref->location);
node = scanRTEForColumn(pstate, rte, colname, cref->location,
0, NULL);
if (node == NULL)
{
/* Try it as a function call on the whole row */
@ -601,7 +602,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
colname = strVal(field3);
/* Try to identify as a column of the RTE */
node = scanRTEForColumn(pstate, rte, colname, cref->location);
node = scanRTEForColumn(pstate, rte, colname, cref->location,
0, NULL);
if (node == NULL)
{
/* Try it as a function call on the whole row */
@ -659,7 +661,8 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
colname = strVal(field4);
/* Try to identify as a column of the RTE */
node = scanRTEForColumn(pstate, rte, colname, cref->location);
node = scanRTEForColumn(pstate, rte, colname, cref->location,
0, NULL);
if (node == NULL)
{
/* Try it as a function call on the whole row */

View File

@ -1779,7 +1779,7 @@ ParseComplexProjection(ParseState *pstate, char *funcname, Node *first_arg,
((Var *) first_arg)->varno,
((Var *) first_arg)->varlevelsup);
/* Return a Var if funcname matches a column, else NULL */
return scanRTEForColumn(pstate, rte, funcname, location);
return scanRTEForColumn(pstate, rte, funcname, location, 0, NULL);
}
/*

View File

@ -33,6 +33,8 @@
#include "utils/syscache.h"
#define MAX_FUZZY_DISTANCE 3
static RangeTblEntry *scanNameSpaceForRefname(ParseState *pstate,
const char *refname, int location);
static RangeTblEntry *scanNameSpaceForRelid(ParseState *pstate, Oid relid,
@ -519,6 +521,101 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup)
return NULL; /* keep compiler quiet */
}
/*
* updateFuzzyAttrMatchState
* Using Levenshtein distance, consider if column is best fuzzy match.
*/
static void
updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte,
const char *actual, const char *match, int attnum)
{
int columndistance;
int matchlen;
/* Bail before computing the Levenshtein distance if there's no hope. */
if (fuzzy_rte_penalty > fuzzystate->distance)
return;
/*
* Outright reject dropped columns, which can appear here with apparent
* empty actual names, per remarks within scanRTEForColumn().
*/
if (actual[0] == '\0')
return;
/* Use Levenshtein to compute match distance. */
matchlen = strlen(match);
columndistance =
varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen,
1, 1, 1,
fuzzystate->distance + 1
- fuzzy_rte_penalty);
/*
* If more than half the characters are different, don't treat it as a
* match, to avoid making ridiculous suggestions.
*/
if (columndistance > matchlen / 2)
return;
/*
* From this point on, we can ignore the distinction between the
* RTE-name distance and the column-name distance.
*/
columndistance += fuzzy_rte_penalty;
/*
* If the new distance is less than or equal to that of the best match
* found so far, update fuzzystate.
*/
if (columndistance < fuzzystate->distance)
{
/* Store new lowest observed distance for RTE */
fuzzystate->distance = columndistance;
fuzzystate->rfirst = rte;
fuzzystate->first = attnum;
fuzzystate->rsecond = NULL;
fuzzystate->second = InvalidAttrNumber;
}
else if (columndistance == fuzzystate->distance)
{
/*
* This match distance may equal a prior match within this same
* range table. When that happens, the prior match may also be
* given, but only if there is no more than two equally distant
* matches from the RTE (in turn, our caller will only accept
* two equally distant matches overall).
*/
if (AttributeNumberIsValid(fuzzystate->second))
{
/* Too many RTE-level matches */
fuzzystate->rfirst = NULL;
fuzzystate->first = InvalidAttrNumber;
fuzzystate->rsecond = NULL;
fuzzystate->second = InvalidAttrNumber;
/* Clearly, distance is too low a bar (for *any* RTE) */
fuzzystate->distance = columndistance - 1;
}
else if (AttributeNumberIsValid(fuzzystate->first))
{
/* Record as provisional second match for RTE */
fuzzystate->rsecond = rte;
fuzzystate->second = attnum;
}
else if (fuzzystate->distance <= MAX_FUZZY_DISTANCE)
{
/*
* Record as provisional first match (this can occasionally
* occur because previous lowest distance was "too low a
* bar", rather than being associated with a real match)
*/
fuzzystate->rfirst = rte;
fuzzystate->first = attnum;
}
}
}
/*
* scanRTEForColumn
* Search the column names of a single RTE for the given name.
@ -527,10 +624,14 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup)
*
* Side effect: if we find a match, mark the RTE as requiring read access
* for the column.
*
* Additional side effect: if fuzzystate is non-NULL, check non-system columns
* for an approximate match and update fuzzystate accordingly.
*/
Node *
scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname,
int location)
int location, int fuzzy_rte_penalty,
FuzzyAttrMatchState *fuzzystate)
{
Node *result = NULL;
int attnum = 0;
@ -548,12 +649,16 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname,
* Should this somehow go wrong and we try to access a dropped column,
* we'll still catch it by virtue of the checks in
* get_rte_attribute_type(), which is called by make_var(). That routine
* has to do a cache lookup anyway, so the check there is cheap.
* has to do a cache lookup anyway, so the check there is cheap. Callers
* interested in finding match with shortest distance need to defend
* against this directly, though.
*/
foreach(c, rte->eref->colnames)
{
const char *attcolname = strVal(lfirst(c));
attnum++;
if (strcmp(strVal(lfirst(c)), colname) == 0)
if (strcmp(attcolname, colname) == 0)
{
if (result)
ereport(ERROR,
@ -566,6 +671,11 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname,
markVarForSelectPriv(pstate, var, rte);
result = (Node *) var;
}
/* Updating fuzzy match state, if provided. */
if (fuzzystate != NULL)
updateFuzzyAttrMatchState(fuzzy_rte_penalty, fuzzystate,
rte, attcolname, colname, attnum);
}
/*
@ -642,7 +752,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly,
continue;
/* use orig_pstate here to get the right sublevels_up */
newresult = scanRTEForColumn(orig_pstate, rte, colname, location);
newresult = scanRTEForColumn(orig_pstate, rte, colname, location,
0, NULL);
if (newresult)
{
@ -668,8 +779,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly,
/*
* searchRangeTableForCol
* See if any RangeTblEntry could possibly provide the given column name.
* If so, return a pointer to the RangeTblEntry; else return NULL.
* See if any RangeTblEntry could possibly provide the given column name (or
* find the best match available). Returns state with relevant details.
*
* This is different from colNameToVar in that it considers every entry in
* the ParseState's rangetable(s), not only those that are currently visible
@ -677,11 +788,31 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly,
* and it may give ambiguous results (there might be multiple equally valid
* matches, but only one will be returned). This must be used ONLY as a
* heuristic in giving suitable error messages. See errorMissingColumn.
*
* This function is also different in that it will consider approximate
* matches -- if the user entered an alias/column pair that is only slightly
* different from a valid pair, we may be able to infer what they meant to
* type and provide a reasonable hint.
*
* The FuzzyAttrMatchState will have 'rfirst' pointing to the best RTE
* containing the most promising match for the alias and column name. If
* the alias and column names match exactly, 'first' will be InvalidAttrNumber;
* otherwise, it will be the attribute number for the match. In the latter
* case, 'rsecond' may point to a second, equally close approximate match,
* and 'second' will contain the attribute number for the second match.
*/
static RangeTblEntry *
searchRangeTableForCol(ParseState *pstate, char *colname, int location)
static FuzzyAttrMatchState *
searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname,
int location)
{
ParseState *orig_pstate = pstate;
FuzzyAttrMatchState *fuzzystate = palloc(sizeof(FuzzyAttrMatchState));
fuzzystate->distance = MAX_FUZZY_DISTANCE + 1;
fuzzystate->rfirst = NULL;
fuzzystate->rsecond = NULL;
fuzzystate->first = InvalidAttrNumber;
fuzzystate->second = InvalidAttrNumber;
while (pstate != NULL)
{
@ -689,15 +820,51 @@ searchRangeTableForCol(ParseState *pstate, char *colname, int location)
foreach(l, pstate->p_rtable)
{
RangeTblEntry *rte = (RangeTblEntry *) lfirst(l);
RangeTblEntry *rte = (RangeTblEntry *) lfirst(l);
int fuzzy_rte_penalty = 0;
if (scanRTEForColumn(orig_pstate, rte, colname, location))
return rte;
/*
* Typically, it is not useful to look for matches within join
* RTEs; they effectively duplicate other RTEs for our purposes,
* and if a match is chosen from a join RTE, an unhelpful alias is
* displayed in the final diagnostic message.
*/
if (rte->rtekind == RTE_JOIN)
continue;
/*
* If the user didn't specify an alias, then matches against one
* RTE are as good as another. But if the user did specify an
* alias, then we want at least a fuzzy - and preferably an exact
* - match for the range table entry.
*/
if (alias != NULL)
fuzzy_rte_penalty =
varstr_levenshtein(alias, strlen(alias),
rte->eref->aliasname,
strlen(rte->eref->aliasname),
1, 1, 1);
/*
* Scan for a matching column; if we find an exact match, we're
* done. Otherwise, update fuzzystate.
*/
if (scanRTEForColumn(orig_pstate, rte, colname, location,
fuzzy_rte_penalty, fuzzystate)
&& fuzzy_rte_penalty == 0)
{
fuzzystate->rfirst = rte;
fuzzystate->first = InvalidAttrNumber;
fuzzystate->rsecond = NULL;
fuzzystate->second = InvalidAttrNumber;
return fuzzystate;
}
}
pstate = pstate->parentParseState;
}
return NULL;
return fuzzystate;
}
/*
@ -2860,34 +3027,67 @@ void
errorMissingColumn(ParseState *pstate,
char *relname, char *colname, int location)
{
RangeTblEntry *rte;
FuzzyAttrMatchState *state;
char *closestfirst = NULL;
/*
* If relname was given, just play dumb and report it. (In practice, a
* bad qualification name should end up at errorMissingRTE, not here, so
* no need to work hard on this case.)
*/
if (relname)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column %s.%s does not exist", relname, colname),
parser_errposition(pstate, location)));
/*
* Otherwise, search the entire rtable looking for possible matches. If
* we find one, emit a hint about it.
* Search the entire rtable looking for possible matches. If we find one,
* emit a hint about it.
*
* TODO: improve this code (and also errorMissingRTE) to mention using
* LATERAL if appropriate.
*/
rte = searchRangeTableForCol(pstate, colname, location);
state = searchRangeTableForCol(pstate, relname, colname, location);
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" does not exist", colname),
rte ? errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.",
colname, rte->eref->aliasname) : 0,
parser_errposition(pstate, location)));
/*
* Extract closest col string for best match, if any.
*
* Infer an exact match referenced despite not being visible from the fact
* that an attribute number was not present in state passed back -- this is
* what is reported when !closestfirst. There might also be an exact match
* that was qualified with an incorrect alias, in which case closestfirst
* will be set (so hint is the same as generic fuzzy case).
*/
if (state->rfirst && AttributeNumberIsValid(state->first))
closestfirst = strVal(list_nth(state->rfirst->eref->colnames,
state->first - 1));
if (!state->rsecond)
{
/*
* Handle case where there is zero or one column suggestions to hint,
* including exact matches referenced but not visible.
*/
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
relname ?
errmsg("column %s.%s does not exist", relname, colname):
errmsg("column \"%s\" does not exist", colname),
state->rfirst ? closestfirst ?
errhint("Perhaps you meant to reference the column \"%s\".\"%s\".",
state->rfirst->eref->aliasname, closestfirst):
errhint("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query.",
colname, state->rfirst->eref->aliasname): 0,
parser_errposition(pstate, location)));
}
else
{
/* Handle case where there are two equally useful column hints */
char *closestsecond;
closestsecond = strVal(list_nth(state->rsecond->eref->colnames,
state->second - 1));
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
relname ?
errmsg("column %s.%s does not exist", relname, colname):
errmsg("column \"%s\" does not exist", colname),
errhint("Perhaps you meant to reference the column \"%s\".\"%s\" or the column \"%s\".\"%s\".",
state->rfirst->eref->aliasname, closestfirst,
state->rsecond->eref->aliasname, closestsecond),
parser_errposition(pstate, location)));
}
}

View File

@ -95,6 +95,15 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
#define STOP_COLUMN m
#endif
/*
* A common use for Levenshtein distance is to match attributes when building
* diagnostic, user-visible messages. Restrict the size of
* MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to
* work.
*/
StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN,
"Levenshtein hinting mechanism restricts NAMEDATALEN");
m = pg_mbstrlen_with_len(source, slen);
n = pg_mbstrlen_with_len(target, tlen);

View File

@ -16,6 +16,24 @@
#include "parser/parse_node.h"
/*
* Support for fuzzily matching column.
*
* This is for building diagnostic messages, where non-exact matching
* attributes are suggested to the user. The struct's fields may be facets of
* a particular RTE, or of an entire range table, depending on context.
*/
typedef struct
{
int distance; /* Weighted distance (lowest so far) */
RangeTblEntry *rfirst; /* RTE of first */
AttrNumber first; /* Closest attribute so far */
RangeTblEntry *rsecond; /* RTE of second */
AttrNumber second; /* Second closest attribute so far */
} FuzzyAttrMatchState;
extern RangeTblEntry *refnameRangeTblEntry(ParseState *pstate,
const char *schemaname,
const char *refname,
@ -35,7 +53,8 @@ extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate,
extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte,
int rtelevelsup);
extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte,
char *colname, int location);
char *colname, int location,
int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate);
extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly,
int location);
extern void markVarForSelectPriv(ParseState *pstate, Var *var,

View File

@ -536,6 +536,7 @@ create table atacc1 ( test int );
-- add a check constraint (fails)
alter table atacc1 add constraint atacc_test1 check (test1>3);
ERROR: column "test1" does not exist
HINT: Perhaps you meant to reference the column "atacc1"."test".
drop table atacc1;
-- something a little more complicated
create table atacc1 ( test int, test2 int, test3 int);
@ -1342,6 +1343,7 @@ select f1 from c1;
ERROR: column "f1" does not exist
LINE 1: select f1 from c1;
^
HINT: Perhaps you meant to reference the column "c1"."f2".
drop table p1 cascade;
NOTICE: drop cascades to table c1
create table p1 (f1 int, f2 int);
@ -1355,6 +1357,7 @@ select f1 from c1;
ERROR: column "f1" does not exist
LINE 1: select f1 from c1;
^
HINT: Perhaps you meant to reference the column "c1"."f2".
drop table p1 cascade;
NOTICE: drop cascades to table c1
create table p1 (f1 int, f2 int);

View File

@ -2222,6 +2222,12 @@ select * from t1 left join t2 on (t1.a = t2.a);
200 | 1000 | 200 | 2001
(5 rows)
-- Test matching of column name with wrong alias
select t1.x from t1 join t3 on (t1.a = t3.x);
ERROR: column t1.x does not exist
LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x);
^
HINT: Perhaps you meant to reference the column "t3"."x".
--
-- regression test for 8.1 merge right join bug
--
@ -3433,6 +3439,38 @@ select * from
----+----+----+----
(0 rows)
--
-- Test hints given on incorrect column references are useful
--
select t1.uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn
ERROR: column t1.uunique1 does not exist
LINE 1: select t1.uunique1 from
^
HINT: Perhaps you meant to reference the column "t1"."unique1".
select t2.uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion
ERROR: column t2.uunique1 does not exist
LINE 1: select t2.uunique1 from
^
HINT: Perhaps you meant to reference the column "t2"."unique1".
select uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once
ERROR: column "uunique1" does not exist
LINE 1: select uunique1 from
^
HINT: Perhaps you meant to reference the column "t1"."unique1" or the column "t2"."unique1".
--
-- Take care to reference the correct RTE
--
select atts.relid::regclass, s.* from pg_stats s join
pg_attribute a on s.attname = a.attname and s.tablename =
a.attrelid::regclass::text join (select unnest(indkey) attnum,
indexrelid from pg_index i) atts on atts.attnum = a.attnum where
schemaname != 'pg_catalog';
ERROR: column atts.relid does not exist
LINE 1: select atts.relid::regclass, s.* from pg_stats s join
^
--
-- Test LATERAL
--

View File

@ -397,6 +397,10 @@ insert into t2a values (200, 2001);
select * from t1 left join t2 on (t1.a = t2.a);
-- Test matching of column name with wrong alias
select t1.x from t1 join t3 on (t1.a = t3.x);
--
-- regression test for 8.1 merge right join bug
--
@ -1060,6 +1064,26 @@ select * from
int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok
--
-- Test hints given on incorrect column references are useful
--
select t1.uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestipn
select t2.uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion
select uunique1 from
tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once
--
-- Take care to reference the correct RTE
--
select atts.relid::regclass, s.* from pg_stats s join
pg_attribute a on s.attname = a.attname and s.tablename =
a.attrelid::regclass::text join (select unnest(indkey) attnum,
indexrelid from pg_index i) atts on atts.attnum = a.attnum where
schemaname != 'pg_catalog';
--
-- Test LATERAL
--