postgresql/src/backend/regex/regprefix.c

269 lines
7.2 KiB
C

/*-------------------------------------------------------------------------
*
* regprefix.c
* Extract a common prefix, if any, from a compiled regex.
*
*
* Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1998, 1999 Henry Spencer
*
* IDENTIFICATION
* src/backend/regex/regprefix.c
*
*-------------------------------------------------------------------------
*/
#include "regex/regguts.h"
/*
 * Forward declaration of the file-local worker routine.  pg_regprefix()
 * calls findprefix() before its definition appears further down, so the
 * prototype has to be visible here.
 */
static int findprefix(struct cnfa *cnfa, struct colormap *cm,
chr *string, size_t *slength);
/*
 * pg_regprefix - report the prefix shared by every string a regex can match
 *
 * Result codes:
 *	REG_NOMATCH: matching strings have no common prefix
 *	REG_PREFIX: matching strings all begin with the returned prefix
 *	REG_EXACT: only one string can match, and it is the returned string
 *	otherwise a REG_XXX error code (REG_INVARG for bad arguments, REG_MIXED
 *	for a chr-size mismatch, REG_ESPACE if the output buffer can't be
 *	allocated)
 *
 * On REG_PREFIX and REG_EXACT, *string points at a palloc'd array holding
 * the prefix or exact string, and *slength is its length counted in chrs
 * (not bytes!).  In every other case *string is NULL and *slength is zero.
 *
 * The analysis is deliberately approximate for complicated patterns (for
 * instance lookaround constraints).  The approximation is one-sided: a
 * string that fits the reported prefix or exact value might still fail the
 * regex, but a string that satisfies the regex always fits what we report.
 */
int
pg_regprefix(regex_t *re,
			 chr **string,
			 size_t *slength)
{
	struct guts *internals;
	struct cnfa *search;
	int			rc;

	/* Without both output pointers there is nowhere to put a result */
	if (string == NULL || slength == NULL)
		return REG_INVARG;

	/* Give the outputs their failure-case values right away */
	*string = NULL;
	*slength = 0;

	/* Now validate the compiled regex itself */
	if (re == NULL || re->re_magic != REMAGIC)
		return REG_INVARG;
	if (re->re_csize != sizeof(chr))
		return REG_MIXED;

	/* Set up locale-dependent support for this regex's collation */
	pg_set_regex_collation(re->re_collation);

	/* A regex flagged as impossible to match has no prefix to offer */
	internals = (struct guts *) re->re_guts;
	if (internals->info & REG_UIMPOSSIBLE)
		return REG_NOMATCH;

	/*
	 * Only the search NFA of the top-level regex tree node is examined.
	 * Constraints such as backrefs are therefore not fully enforced; the
	 * API contract above explicitly permits that imprecision.
	 */
	assert(internals->tree != NULL);
	search = &internals->tree->cnfa;

	/* A matchall NFA can never pin down a fixed prefix */
	if (search->flags & MATCHALL)
		return REG_NOMATCH;

	/*
	 * A well-formed NFA has no loops without an exit, so the walk done by
	 * findprefix() should never revisit a state.  One chr per NFA state is
	 * thus an upper bound on the output length.
	 */
	*string = (chr *) MALLOC(search->nstates * sizeof(chr));
	if (*string == NULL)
		return REG_ESPACE;

	/* Run the actual extraction */
	rc = findprefix(search, &internals->cmap, *string, slength);

	assert(*slength <= search->nstates);

	/* On success the caller keeps the buffer */
	if (rc == REG_PREFIX || rc == REG_EXACT)
		return rc;

	/* Otherwise release it and restore the failure-case outputs */
	FREE(*string);
	*string = NULL;
	*slength = 0;

	return rc;
}
/*
 * findprefix - walk a cNFA and collect the chrs every match must begin with
 *
 * Output chrs are stored into the caller-provided array string[], and
 * *slength is bumped once per stored chr; the caller must have set *slength
 * to zero beforehand.
 */
static int						/* regprefix return code */
findprefix(struct cnfa *cnfa,
		   struct colormap *cm,
		   chr *string,
		   size_t *slength)
{
	struct carc *arc;
	int			cur;
	int			succ;
	color		only;
	chr			member;

	/*
	 * Phase 1: check left-anchoring.  Every out-arc of the "pre" state has
	 * to be a BOS or BOL arc, and if both kinds are present they must agree
	 * on the destination state.  Anything else means the pattern is not
	 * anchored at the left, so there is no prefix.
	 */
	cur = cnfa->pre;
	succ = -1;
	for (arc = cnfa->states[cur]; arc->co != COLORLESS; arc++)
	{
		if (arc->co != cnfa->bos[0] && arc->co != cnfa->bos[1])
			return REG_NOMATCH;
		if (succ != -1 && succ != arc->to)
			return REG_NOMATCH;
		succ = arc->to;
	}
	if (succ == -1)
		return REG_NOMATCH;

	/*
	 * Phase 2: follow the chain of states for as long as each one admits
	 * exactly one transition character.  We stop at the first state that
	 * offers a choice, whether through out-arcs of different colors or
	 * through a color that has more than one member chr.
	 *
	 * A state may carry several out-arcs that all bear the same singleton
	 * color, as happens for "^ab(cde|cxy)".  Then "c" is still appended,
	 * but succ is left at -1 so the walk ends there.  For a pattern such as
	 * "^ab(cde|cdy)" this misses the fact that "d" also belongs to the
	 * prefix; tracking several state chains in parallel to catch that is
	 * not considered worth the effort.
	 */
	for (;;)
	{
		cur = succ;
		succ = -1;
		only = COLORLESS;

		for (arc = cnfa->states[cur]; arc->co != COLORLESS; arc++)
		{
			/* BOS/BOL arcs have no bearing on the prefix */
			if (arc->co == cnfa->bos[0] || arc->co == cnfa->bos[1])
				continue;

			/*
			 * The search is over if we meet an EOS/EOL arc, a RAINBOW arc,
			 * a LACON, or a plain arc whose color differs from the one
			 * already seen on this state.
			 */
			if (arc->co == cnfa->eos[0] || arc->co == cnfa->eos[1] ||
				arc->co == RAINBOW || arc->co >= cnfa->ncolors ||
				(only != COLORLESS && only != arc->co))
			{
				only = COLORLESS;
				break;
			}

			if (only == COLORLESS)
			{
				/* first plain out-arc: remember its color and target */
				only = arc->co;
				succ = arc->to;
			}
			else
			{
				/* same color again: the next state is no longer unique */
				succ = -1;
			}
		}

		/* Stop unless the plain out-arcs carried exactly one color */
		if (only == COLORLESS)
			break;

		/*
		 * That color has to be a singleton, with no entries in the high
		 * color map either.
		 */
		if (cm->cd[only].nschrs != 1 || cm->cd[only].nuchrs != 0)
			break;

		/*
		 * Find the color's single member chr.  The colormap has no general
		 * way to enumerate a color's members short of calling GETCOLOR() on
		 * every possible chr, which is out of the question.  For the cases
		 * of interest it is enough to try "firstchr", the first chr ever
		 * assigned to the color.  That chr may have left the color since
		 * (hence the test), though this is unlikely for a character that is
		 * part of a common prefix.  In such a corner case we simply stop
		 * without appending anything.
		 */
		member = cm->cd[only].firstchr;
		if (GETCOLOR(cm, member) != only)
			break;

		string[*slength] = member;
		(*slength)++;

		/* Keep going only while the successor state is unambiguous */
		if (succ == -1)
			break;
	}

	/*
	 * Phase 3: decide between exact match and mere prefix.  If the state we
	 * stopped at has nothing but EOS/EOL out-arcs, all leading to the
	 * "post" state, the collected string is the only possible match.  That
	 * holds even when the string is empty.
	 */
	succ = -1;
	for (arc = cnfa->states[cur]; arc->co != COLORLESS; arc++)
	{
		if ((arc->co != cnfa->eos[0] && arc->co != cnfa->eos[1]) ||
			(succ != -1 && succ != arc->to))
		{
			succ = -1;
			break;
		}
		succ = arc->to;
	}
	if (succ == cnfa->post)
		return REG_EXACT;

	/*
	 * Not exact.  With at least one chr collected we have a usable prefix;
	 * with none, the pattern is left-anchored but does not fix any
	 * particular first character, which we report as NOMATCH.
	 */
	return (*slength > 0) ? REG_PREFIX : REG_NOMATCH;
}