2013-04-09 07:05:55 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* regexport.c
|
|
|
|
* Functions for exporting info about a regex's NFA
|
|
|
|
*
|
|
|
|
* In this implementation, the NFA defines a necessary but not sufficient
|
|
|
|
* condition for a string to match the regex: that is, there can be strings
|
|
|
|
* that match the NFA but don't match the full regex, but not vice versa.
|
Implement lookbehind constraints in our regular-expression engine.
A lookbehind constraint is like a lookahead constraint in that it consumes
no text; but it checks for existence (or nonexistence) of a match *ending*
at the current point in the string, rather than one *starting* at the
current point. This is a long-requested feature since it exists in many
other regex libraries, but Henry Spencer had never got around to
implementing it in the code we use.
Just making it work is actually pretty trivial; but naive copying of the
logic for lookahead constraints leads to code that often spends O(N^2) time
to scan an N-character string, because we have to run the match engine
from string start to the current probe point each time the constraint is
checked. In typical use-cases a lookbehind constraint will be written at
the start of the regex and hence will need to be checked at every character
--- so O(N^2) work overall. To fix that, I introduced a third copy of the
core DFA matching loop, paralleling the existing longest() and shortest()
loops. This version, matchuntil(), can suspend and resume matching given
a couple of pointers' worth of storage space. So we need only run it
across the string once, stopping at each interesting probe point and then
resuming to advance to the next one.
I also put in an optimization that simplifies one-character lookahead and
lookbehind constraints, such as "(?=x)" or "(?<!\w)", into AHEAD and BEHIND
constraints, which already existed in the engine. This avoids the overhead
of the LACON machinery entirely for these rather common cases.
The net result is that lookbehind constraints run a factor of three or so
slower than Perl's for multi-character constraints, but faster than Perl's
for one-character constraints ... and they work fine for variable-length
constraints, which Perl gives up on entirely. So that's not bad from a
competitive perspective, and there's room for further optimization if
anyone cares. (In reality, raw scan rate across a large input string is
probably not that big a deal for Postgres usage anyway; so I'm happy if
it's linear.)
2015-10-31 00:14:19 +01:00
|
|
|
* Thus, for example, it is okay for the functions below to ignore lookaround
|
2013-04-09 07:05:55 +02:00
|
|
|
* constraints, which merely constrain the string some more.
|
|
|
|
*
|
|
|
|
* Notice that these functions return info into caller-provided arrays
|
|
|
|
* rather than doing their own malloc's. This simplifies the APIs by
|
|
|
|
* eliminating a class of error conditions, and in the case of colors
|
|
|
|
* allows the caller to decide how big is too big to bother with.
|
|
|
|
*
|
|
|
|
*
|
2017-01-03 19:48:53 +01:00
|
|
|
* Portions Copyright (c) 2013-2017, PostgreSQL Global Development Group
|
2013-04-09 07:05:55 +02:00
|
|
|
* Portions Copyright (c) 1998, 1999 Henry Spencer
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/backend/regex/regexport.c
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "regex/regguts.h"
|
|
|
|
|
|
|
|
#include "regex/regexport.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get total number of NFA states.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getnumstates(const regex_t *regex)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
return cnfa->nstates;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get initial state of NFA.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getinitialstate(const regex_t *regex)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
return cnfa->pre;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get final state of NFA.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getfinalstate(const regex_t *regex)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
return cnfa->post;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get number of outgoing NFA arcs of state number "st".
|
|
|
|
*
|
|
|
|
* Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getnumoutarcs(const regex_t *regex, int st)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
struct carc *ca;
|
|
|
|
int count;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
if (st < 0 || st >= cnfa->nstates)
|
|
|
|
return 0;
|
|
|
|
count = 0;
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
|
|
{
|
|
|
|
if (ca->co < cnfa->ncolors)
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write array of outgoing NFA arcs of state number "st" into arcs[],
|
|
|
|
* whose length arcs_len must be at least as long as indicated by
|
|
|
|
* pg_reg_getnumoutarcs(), else not all arcs will be returned.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
pg_reg_getoutarcs(const regex_t *regex, int st,
|
|
|
|
regex_arc_t *arcs, int arcs_len)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
struct carc *ca;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
|
|
|
|
return;
|
|
|
|
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
|
|
|
|
{
|
|
|
|
if (ca->co < cnfa->ncolors)
|
|
|
|
{
|
|
|
|
arcs->co = ca->co;
|
|
|
|
arcs->to = ca->to;
|
|
|
|
arcs++;
|
|
|
|
if (--arcs_len == 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get total number of colors.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getnumcolors(const regex_t *regex)
|
|
|
|
{
|
|
|
|
struct colormap *cm;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
|
|
|
|
return cm->max + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if color is beginning of line/string.
|
|
|
|
*
|
|
|
|
* (We might at some point need to offer more refined handling of pseudocolors,
|
|
|
|
* but this will do for now.)
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_colorisbegin(const regex_t *regex, int co)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
if (co == cnfa->bos[0] || co == cnfa->bos[1])
|
|
|
|
return true;
|
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if color is end of line/string.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_colorisend(const regex_t *regex, int co)
|
|
|
|
{
|
|
|
|
struct cnfa *cnfa;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cnfa = &((struct guts *) regex->re_guts)->search;
|
|
|
|
|
|
|
|
if (co == cnfa->eos[0] || co == cnfa->eos[1])
|
|
|
|
return true;
|
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get number of member chrs of color number "co".
|
|
|
|
*
|
|
|
|
* Note: we return -1 if the color number is invalid, or if it is a special
|
|
|
|
* color (WHITE or a pseudocolor), or if the number of members is uncertain.
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
* Callers should not try to extract the members if -1 is returned.
|
2013-04-09 07:05:55 +02:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
pg_reg_getnumcharacters(const regex_t *regex, int co)
|
|
|
|
{
|
|
|
|
struct colormap *cm;
|
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
|
|
|
|
if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */
|
|
|
|
return -1;
|
|
|
|
if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
|
|
|
|
return -1;
|
|
|
|
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
/*
|
|
|
|
* If the color appears anywhere in the high colormap, treat its number of
|
|
|
|
* members as uncertain. In principle we could determine all the specific
|
|
|
|
* chrs corresponding to each such entry, but it would be expensive
|
|
|
|
* (particularly if character class tests are required) and it doesn't
|
|
|
|
* seem worth it.
|
|
|
|
*/
|
|
|
|
if (cm->cd[co].nuchrs != 0)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* OK, return the known number of member chrs */
|
|
|
|
return cm->cd[co].nschrs;
|
2013-04-09 07:05:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write array of member chrs of color number "co" into chars[],
|
|
|
|
* whose length chars_len must be at least as long as indicated by
|
|
|
|
* pg_reg_getnumcharacters(), else not all chars will be returned.
|
|
|
|
*
|
|
|
|
* Fetching the members of WHITE or a pseudocolor is not supported.
|
|
|
|
*
|
|
|
|
* Caution: this is a relatively expensive operation.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
pg_reg_getcharacters(const regex_t *regex, int co,
|
|
|
|
pg_wchar *chars, int chars_len)
|
|
|
|
{
|
|
|
|
struct colormap *cm;
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
chr c;
|
2013-04-09 07:05:55 +02:00
|
|
|
|
|
|
|
assert(regex != NULL && regex->re_magic == REMAGIC);
|
|
|
|
cm = &((struct guts *) regex->re_guts)->cmap;
|
|
|
|
|
|
|
|
if (co <= 0 || co > cm->max || chars_len <= 0)
|
|
|
|
return;
|
|
|
|
if (cm->cd[co].flags & PSEUDO)
|
|
|
|
return;
|
|
|
|
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
/*
|
|
|
|
* We need only examine the low character map; there should not be any
|
|
|
|
* matching entries in the high map.
|
|
|
|
*/
|
|
|
|
for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
|
2013-04-09 07:05:55 +02:00
|
|
|
{
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
if (cm->locolormap[c - CHR_MIN] == co)
|
2013-04-09 07:05:55 +02:00
|
|
|
{
|
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
2016-09-05 23:06:29 +02:00
|
|
|
*chars++ = c;
|
|
|
|
if (--chars_len == 0)
|
|
|
|
break;
|
2013-04-09 07:05:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|