293 lines
7.6 KiB
C
293 lines
7.6 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* regexport.c
|
|
* Functions for exporting info about a regex's NFA
|
|
*
|
|
* In this implementation, the NFA defines a necessary but not sufficient
|
|
* condition for a string to match the regex: that is, there can be strings
|
|
* that match the NFA but don't match the full regex, but not vice versa.
|
|
* Thus, for example, it is okay for the functions below to treat lookaround
|
|
* constraints as no-ops, since they merely constrain the string some more.
|
|
*
|
|
* Notice that these functions return info into caller-provided arrays
|
|
* rather than doing their own malloc's. This simplifies the APIs by
|
|
* eliminating a class of error conditions, and in the case of colors
|
|
* allows the caller to decide how big is too big to bother with.
|
|
*
|
|
*
|
|
* Portions Copyright (c) 2013-2020, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1998, 1999 Henry Spencer
|
|
*
|
|
* IDENTIFICATION
|
|
* src/backend/regex/regexport.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "regex/regguts.h"
|
|
|
|
#include "regex/regexport.h"
|
|
|
|
|
|
/*
|
|
* Get total number of NFA states.
|
|
*/
|
|
int
|
|
pg_reg_getnumstates(const regex_t *regex)
|
|
{
|
|
struct cnfa *cnfa;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
return cnfa->nstates;
|
|
}
|
|
|
|
/*
|
|
* Get initial state of NFA.
|
|
*/
|
|
int
|
|
pg_reg_getinitialstate(const regex_t *regex)
|
|
{
|
|
struct cnfa *cnfa;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
return cnfa->pre;
|
|
}
|
|
|
|
/*
|
|
* Get final state of NFA.
|
|
*/
|
|
int
|
|
pg_reg_getfinalstate(const regex_t *regex)
|
|
{
|
|
struct cnfa *cnfa;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
return cnfa->post;
|
|
}
|
|
|
|
/*
|
|
* pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
|
|
* arcs from the caller, treating any LACON as being automatically satisfied.
|
|
* Since the output representation does not support arcs that consume no
|
|
* character when traversed, we have to recursively traverse LACON arcs here,
|
|
* and report whatever normal arcs are reachable by traversing LACON arcs.
|
|
* Note that this wouldn't work if it were possible to reach the final state
|
|
* via LACON traversal, but the regex library never builds NFAs that have
|
|
* LACON arcs leading directly to the final state. (This is because the
|
|
* regex executor is designed to consume one character beyond the nominal
|
|
* match end --- possibly an EOS indicator --- so there is always a set of
|
|
* ordinary arcs leading to the final state.)
|
|
*
|
|
* traverse_lacons is a recursive subroutine used by both exported functions
|
|
* to count and then emit the reachable regular arcs. *arcs_count is
|
|
* incremented by the number of reachable arcs, and as many as will fit in
|
|
* arcs_len (possibly 0) are emitted into arcs[].
|
|
*/
|
|
static void
|
|
traverse_lacons(struct cnfa *cnfa, int st,
|
|
int *arcs_count,
|
|
regex_arc_t *arcs, int arcs_len)
|
|
{
|
|
struct carc *ca;
|
|
|
|
/*
|
|
* Since this function recurses, it could theoretically be driven to stack
|
|
* overflow. In practice, this is mostly useful to backstop against a
|
|
* failure of the regex compiler to remove a loop of LACON arcs.
|
|
*/
|
|
check_stack_depth();
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
{
|
|
if (ca->co < cnfa->ncolors)
|
|
{
|
|
/* Ordinary arc, so count and possibly emit it */
|
|
int ndx = (*arcs_count)++;
|
|
|
|
if (ndx < arcs_len)
|
|
{
|
|
arcs[ndx].co = ca->co;
|
|
arcs[ndx].to = ca->to;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* LACON arc --- assume it's satisfied and recurse... */
|
|
/* ... but first, assert it doesn't lead directly to post state */
|
|
Assert(ca->to != cnfa->post);
|
|
|
|
traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Get number of outgoing NFA arcs of state number "st".
|
|
*/
|
|
int
|
|
pg_reg_getnumoutarcs(const regex_t *regex, int st)
|
|
{
|
|
struct cnfa *cnfa;
|
|
int arcs_count;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
if (st < 0 || st >= cnfa->nstates)
|
|
return 0;
|
|
arcs_count = 0;
|
|
traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
|
|
return arcs_count;
|
|
}
|
|
|
|
/*
|
|
* Write array of outgoing NFA arcs of state number "st" into arcs[],
|
|
* whose length arcs_len must be at least as long as indicated by
|
|
* pg_reg_getnumoutarcs(), else not all arcs will be returned.
|
|
*/
|
|
void
|
|
pg_reg_getoutarcs(const regex_t *regex, int st,
|
|
regex_arc_t *arcs, int arcs_len)
|
|
{
|
|
struct cnfa *cnfa;
|
|
int arcs_count;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
|
|
return;
|
|
arcs_count = 0;
|
|
traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
|
|
}
|
|
|
|
/*
|
|
* Get total number of colors.
|
|
*/
|
|
int
|
|
pg_reg_getnumcolors(const regex_t *regex)
|
|
{
|
|
struct colormap *cm;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
return cm->max + 1;
|
|
}
|
|
|
|
/*
|
|
* Check if color is beginning of line/string.
|
|
*
|
|
* (We might at some point need to offer more refined handling of pseudocolors,
|
|
* but this will do for now.)
|
|
*/
|
|
int
|
|
pg_reg_colorisbegin(const regex_t *regex, int co)
|
|
{
|
|
struct cnfa *cnfa;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
if (co == cnfa->bos[0] || co == cnfa->bos[1])
|
|
return true;
|
|
else
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Check if color is end of line/string.
|
|
*/
|
|
int
|
|
pg_reg_colorisend(const regex_t *regex, int co)
|
|
{
|
|
struct cnfa *cnfa;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
if (co == cnfa->eos[0] || co == cnfa->eos[1])
|
|
return true;
|
|
else
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Get number of member chrs of color number "co".
|
|
*
|
|
* Note: we return -1 if the color number is invalid, or if it is a special
|
|
* color (WHITE or a pseudocolor), or if the number of members is uncertain.
|
|
* Callers should not try to extract the members if -1 is returned.
|
|
*/
|
|
int
|
|
pg_reg_getnumcharacters(const regex_t *regex, int co)
|
|
{
|
|
struct colormap *cm;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */
|
|
return -1;
|
|
if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
|
|
return -1;
|
|
|
|
/*
|
|
* If the color appears anywhere in the high colormap, treat its number of
|
|
* members as uncertain. In principle we could determine all the specific
|
|
* chrs corresponding to each such entry, but it would be expensive
|
|
* (particularly if character class tests are required) and it doesn't
|
|
* seem worth it.
|
|
*/
|
|
if (cm->cd[co].nuchrs != 0)
|
|
return -1;
|
|
|
|
/* OK, return the known number of member chrs */
|
|
return cm->cd[co].nschrs;
|
|
}
|
|
|
|
/*
|
|
* Write array of member chrs of color number "co" into chars[],
|
|
* whose length chars_len must be at least as long as indicated by
|
|
* pg_reg_getnumcharacters(), else not all chars will be returned.
|
|
*
|
|
* Fetching the members of WHITE or a pseudocolor is not supported.
|
|
*
|
|
* Caution: this is a relatively expensive operation.
|
|
*/
|
|
void
|
|
pg_reg_getcharacters(const regex_t *regex, int co,
|
|
pg_wchar *chars, int chars_len)
|
|
{
|
|
struct colormap *cm;
|
|
chr c;
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
if (co <= 0 || co > cm->max || chars_len <= 0)
|
|
return;
|
|
if (cm->cd[co].flags & PSEUDO)
|
|
return;
|
|
|
|
/*
|
|
* We need only examine the low character map; there should not be any
|
|
* matching entries in the high map.
|
|
*/
|
|
for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
|
|
{
|
|
if (cm->locolormap[c - CHR_MIN] == co)
|
|
{
|
|
*chars++ = c;
|
|
if (--chars_len == 0)
|
|
break;
|
|
}
|
|
}
|
|
}
|