/*-------------------------------------------------------------------------
 *
 * regexport.c
 *    Functions for exporting info about a regex's NFA
 *
 * In this implementation, the NFA defines a necessary but not sufficient
 * condition for a string to match the regex: that is, there can be strings
 * that match the NFA but don't match the full regex, but not vice versa.
 * Thus, for example, it is okay for the functions below to ignore lookaround
 * constraints, which merely constrain the string some more.
 *
 * Notice that these functions return info into caller-provided arrays
 * rather than doing their own malloc's.  This simplifies the APIs by
 * eliminating a class of error conditions, and in the case of colors
 * allows the caller to decide how big is too big to bother with.
 *
 * Portions Copyright (c) 2013-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1998, 1999 Henry Spencer
 *
 * IDENTIFICATION
 *    src/backend/regex/regexport.c
 *
 *-------------------------------------------------------------------------
 */
#include "regex/regguts.h"
#include "regex/regexport.h"

/*
 * Get total number of NFA states.
 */
int
pg_reg_getnumstates(const regex_t *regex)
{
    struct cnfa *cnfa;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    return cnfa->nstates;
}

/*
 * Get initial state of NFA.
 */
int
pg_reg_getinitialstate(const regex_t *regex)
{
    struct cnfa *cnfa;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    return cnfa->pre;
}

/*
 * Get final state of NFA.
 */
int
pg_reg_getfinalstate(const regex_t *regex)
{
    struct cnfa *cnfa;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    return cnfa->post;
}
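
/*
 * Illustrative sketch (hypothetical caller logic, not part of this file):
 * a graph-walking caller would start at pg_reg_getinitialstate() and treat
 * reaching pg_reg_getfinalstate() as satisfying the NFA's
 * necessary-but-not-sufficient match condition, e.g.:
 *
 *     int     start = pg_reg_getinitialstate(regex);
 *     int     final = pg_reg_getfinalstate(regex);
 *
 *     // ... breadth-first search over pg_reg_getoutarcs() results,
 *     // succeeding if "final" becomes reachable from "start" ...
 */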

/*
 * Get number of outgoing NFA arcs of state number "st".
 *
 * Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
 */
int
pg_reg_getnumoutarcs(const regex_t *regex, int st)
{
    struct cnfa *cnfa;
    struct carc *ca;
    int         count;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    if (st < 0 || st >= cnfa->nstates)
        return 0;
    count = 0;
    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    {
        /* color numbers >= ncolors denote LACON arcs; skip them */
        if (ca->co < cnfa->ncolors)
            count++;
    }
    return count;
}

/*
 * Write array of outgoing NFA arcs of state number "st" into arcs[],
 * whose length arcs_len must be at least as long as indicated by
 * pg_reg_getnumoutarcs(), else not all arcs will be returned.
 */
void
pg_reg_getoutarcs(const regex_t *regex, int st,
                  regex_arc_t *arcs, int arcs_len)
{
    struct cnfa *cnfa;
    struct carc *ca;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
        return;
    for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    {
        /* color numbers >= ncolors denote LACON arcs; skip them */
        if (ca->co < cnfa->ncolors)
        {
            arcs->co = ca->co;
            arcs->to = ca->to;
            arcs++;
            if (--arcs_len == 0)
                break;
        }
    }
}
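
/*
 * Illustrative sketch (not part of this file's API): the two functions above
 * are meant to be used as a pair, sizing the caller-provided array first,
 * e.g.:
 *
 *     int          narcs = pg_reg_getnumoutarcs(regex, st);
 *     regex_arc_t *arcs = (regex_arc_t *) palloc(narcs * sizeof(regex_arc_t));
 *
 *     pg_reg_getoutarcs(regex, st, arcs, narcs);
 *     // each arcs[i] holds a color arcs[i].co and a target state arcs[i].to
 */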

/*
 * Get total number of colors.
 */
int
pg_reg_getnumcolors(const regex_t *regex)
{
    struct colormap *cm;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cm = &((struct guts *) regex->re_guts)->cmap;

    return cm->max + 1;
}
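
/*
 * Illustrative sketch: colors are numbered 0 through
 * pg_reg_getnumcolors() - 1, and color 0 is WHITE (which
 * pg_reg_getnumcharacters() rejects), so a caller classifying all colors
 * would typically start at 1:
 *
 *     int     nc = pg_reg_getnumcolors(regex);
 *     int     co;
 *
 *     for (co = 1; co < nc; co++)
 *     {
 *         // ... classify or fetch the members of color "co" ...
 *     }
 */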

/*
 * Check if color is beginning of line/string.
 *
 * (We might at some point need to offer more refined handling of
 * pseudocolors, but this will do for now.)
 */
int
pg_reg_colorisbegin(const regex_t *regex, int co)
{
    struct cnfa *cnfa;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    /* bos[] holds the colors assigned to the BOS and BOL pseudo-characters */
    if (co == cnfa->bos[0] || co == cnfa->bos[1])
        return true;
    else
        return false;
}

/*
 * Check if color is end of line/string.
 */
int
pg_reg_colorisend(const regex_t *regex, int co)
{
    struct cnfa *cnfa;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cnfa = &((struct guts *) regex->re_guts)->search;

    /* eos[] holds the colors assigned to the EOS and EOL pseudo-characters */
    if (co == cnfa->eos[0] || co == cnfa->eos[1])
        return true;
    else
        return false;
}
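
/*
 * Illustrative sketch (hypothetical caller logic): begin/end pseudocolors
 * have no member characters, so they must be special-cased before trying to
 * extract members, e.g.:
 *
 *     if (pg_reg_colorisbegin(regex, co))
 *         ;   // treat like a ^ anchor
 *     else if (pg_reg_colorisend(regex, co))
 *         ;   // treat like a $ anchor
 *     else if ((nchars = pg_reg_getnumcharacters(regex, co)) > 0)
 *         ;   // a fetchable set of member characters
 */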

/*
 * Get number of member chrs of color number "co".
 *
 * Note: we return -1 if the color number is invalid, or if it is a special
 * color (WHITE or a pseudocolor), or if the number of members is uncertain.
 * Callers should not try to extract the members if -1 is returned.
 */
int
pg_reg_getnumcharacters(const regex_t *regex, int co)
{
    struct colormap *cm;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cm = &((struct guts *) regex->re_guts)->cmap;

    if (co <= 0 || co > cm->max)    /* we reject 0 which is WHITE */
        return -1;
    if (cm->cd[co].flags & PSEUDO)  /* also pseudocolors (BOS etc) */
        return -1;

    /*
     * If the color appears anywhere in the high colormap, treat its number
     * of members as uncertain.  In principle we could determine all the
     * specific chrs corresponding to each such entry, but it would be
     * expensive (particularly if character class tests are required) and it
     * doesn't seem worth it.
     */
    if (cm->cd[co].nuchrs != 0)
        return -1;

    /* OK, return the known number of member chrs */
    return cm->cd[co].nschrs;
}

/*
 * Write array of member chrs of color number "co" into chars[],
 * whose length chars_len must be at least as long as indicated by
 * pg_reg_getnumcharacters(), else not all chars will be returned.
 *
 * Fetching the members of WHITE or a pseudocolor is not supported.
 *
 * Caution: this is a relatively expensive operation.
 */
void
pg_reg_getcharacters(const regex_t *regex, int co,
                     pg_wchar *chars, int chars_len)
{
    struct colormap *cm;
    chr         c;

    assert(regex != NULL && regex->re_magic == REMAGIC);
    cm = &((struct guts *) regex->re_guts)->cmap;

    if (co <= 0 || co > cm->max || chars_len <= 0)
        return;
    if (cm->cd[co].flags & PSEUDO)
        return;

    /*
     * We need only examine the low character map; there should not be any
     * matching entries in the high map.
     */
    for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
    {
        if (cm->locolormap[c - CHR_MIN] == co)
        {
            *chars++ = c;
            if (--chars_len == 0)
                break;
        }
    }
}
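
/*
 * Illustrative sketch (not part of this file's API): extraction should be
 * attempted only when pg_reg_getnumcharacters() reports a known, positive
 * member count, e.g.:
 *
 *     int     nchars = pg_reg_getnumcharacters(regex, co);
 *
 *     if (nchars > 0)
 *     {
 *         pg_wchar *chars = (pg_wchar *) palloc(nchars * sizeof(pg_wchar));
 *
 *         pg_reg_getcharacters(regex, co, chars, nchars);
 *         // ... use chars[0..nchars-1], then pfree(chars) ...
 *     }
 */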