postgresql/contrib/pg_trgm/trgm_regexp.c
Bruce Momjian 27b77ecf9f Update copyright for 2022
Backpatch-through: 10
2022-01-07 19:04:57 -05:00

2365 lines
69 KiB
C

/*-------------------------------------------------------------------------
*
* trgm_regexp.c
* Regular expression matching using trigrams.
*
* The general idea of trigram index support for a regular expression (regex)
* search is to transform the regex into a logical expression on trigrams.
* For example:
*
* (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
*
* If a string matches the regex, then it must match the logical expression on
* trigrams. The opposite is not necessarily true, however: a string that
* matches the logical expression might not match the original regex. Such
* false positives are removed via recheck, by running the regular regex match
* operator on the retrieved heap tuple.
*
* Since the trigram expression involves both AND and OR operators, we can't
* expect the core index machinery to evaluate it completely. Instead, the
* result of regex analysis is a list of trigrams to be sought in the index,
* plus a simplified graph that is used by trigramsMatchGraph() to determine
* whether a particular indexed value matches the expression.
*
* Converting a regex to a trigram expression is based on analysis of an
* automaton corresponding to the regex. The algorithm consists of four
* stages:
*
* 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
* regexp library, which provides accessors for its opaque regex_t struct
* to expose the NFA state graph and the "colors" (sets of equivalent
* characters) used as state transition labels.
*
* 2) Transform the original NFA into an expanded graph, where arcs
* are labeled with trigrams that must be present in order to move from
* one state to another via the arcs. The trigrams used in this stage
* consist of colors, not characters, as in the original NFA.
*
* 3) Expand the color trigrams into regular trigrams consisting of
* characters. If too many distinct trigrams are produced, trigrams are
* eliminated and the graph is simplified until it's simple enough.
*
* 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
* and returned to the caller.
*
* 1) Compile the regexp to NFA form
* ---------------------------------
* The automaton returned by the regexp compiler is a graph where vertices
* are "states" and arcs are labeled with colors. Each color represents
* a set of characters, so that all characters assigned to the same color
* are interchangeable, so far as matching the regexp is concerned. There
* are two special states: "initial" and "final". A state can have multiple
* outgoing arcs labeled with the same color, which makes the automaton
* non-deterministic, because it can be in many states simultaneously.
*
* Note that this NFA is already lossy compared to the original regexp,
* since it ignores some regex features such as lookahead constraints and
* backref matching. This is OK for our purposes since it's still the case
* that only strings matching the NFA can possibly satisfy the regexp.
*
* 2) Transform the original NFA into an expanded graph
* ----------------------------------------------------
* In the 2nd stage, the automaton is transformed into a graph based on the
* original NFA. Each state in the expanded graph represents a state from
* the original NFA, plus a prefix identifying the last two characters
* (colors, to be precise) seen before entering the state. There can be
* multiple states in the expanded graph for each state in the original NFA,
* depending on what characters can precede it. A prefix position can be
* "unknown" if it's uncertain what the preceding character was, or "blank"
* if the character was a non-word character (we don't need to distinguish
* which non-word character it was, so just think of all of them as blanks).
*
* For convenience in description, call an expanded-state identifier
* (two prefix colors plus a state number from the original NFA) an
* "enter key".
*
* Each arc of the expanded graph is labeled with a trigram that must be
* present in the string to match. We can construct this from an out-arc of
* the underlying NFA state by combining the expanded state's prefix with the
* color label of the underlying out-arc, if neither prefix position is
* "unknown". But note that some of the colors in the trigram might be
* "blank". This is OK since we want to generate word-boundary trigrams as
* the regular trigram machinery would, if we know that some word characters
* must be adjacent to a word boundary in all strings matching the NFA.
*
* The expanded graph can also have fewer states than the original NFA,
* because we don't bother to make a separate state entry unless the state
* is reachable by a valid arc. When an enter key is reachable from a state
* of the expanded graph, but we do not know a complete trigram associated
* with that transition, we cannot make a valid arc; instead we insert the
* enter key into the enterKeys list of the source state. This effectively
* means that the two expanded states are not reliably distinguishable based
* on examining trigrams.
*
* So the expanded graph resembles the original NFA, but the arcs are
* labeled with trigrams instead of individual characters, and there may be
* more or fewer states. It is a lossy representation of the original NFA:
* any string that matches the original regexp must match the expanded graph,
* but the reverse is not true.
*
* We build the expanded graph through a breadth-first traversal of states
* reachable from the initial state. At each reachable state, we identify the
* states reachable from it without traversing a predictable trigram, and add
* those states' enter keys to the current state. Then we generate all
* out-arcs leading out of this collection of states that have predictable
* trigrams, adding their target states to the queue of states to examine.
*
* When building the graph, if the number of states or arcs exceed pre-defined
* limits, we give up and simply mark any states not yet processed as final
* states. Roughly speaking, that means that we make use of some portion from
* the beginning of the regexp. Also, any colors that have too many member
* characters are treated as "unknown", so that we can't derive trigrams
* from them.
*
* 3) Expand the color trigrams into regular trigrams
* --------------------------------------------------
* The trigrams in the expanded graph are "color trigrams", consisting
* of three consecutive colors that must be present in the string. But for
* search, we need regular trigrams consisting of characters. In the 3rd
* stage, the color trigrams are expanded into regular trigrams. Since each
* color can represent many characters, the total number of regular trigrams
* after expansion could be very large. Because searching the index for
* thousands of trigrams would be slow, and would likely produce so many
* false positives that we would have to traverse a large fraction of the
* index, the graph is simplified further in a lossy fashion by removing
* color trigrams. When a color trigram is removed, the states connected by
* any arcs labeled with that trigram are merged.
*
* Trigrams do not all have equivalent value for searching: some of them are
* more frequent and some of them are less frequent. Ideally, we would like
* to know the distribution of trigrams, but we don't. But because of padding
* we know for sure that the empty character is more frequent than others,
* so we can penalize trigrams according to presence of whitespace. The
* penalty assigned to each color trigram is the number of simple trigrams
* it would produce, times the penalties[] multiplier associated with its
* whitespace content. (The penalties[] constants were calculated by analysis
* of some real-life text.) We eliminate color trigrams starting with the
* highest-penalty one, until we get to a total penalty of no more than
* WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
* lead to merging the initial and final states, so we may not be able to
* reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
* MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
*
* 4) Pack the graph into a compact representation
* -----------------------------------------------
* The 2nd and 3rd stages might have eliminated or merged many of the states
* and trigrams created earlier, so in this final stage, the graph is
* compacted and packed into a simpler struct that contains only the
* information needed to evaluate it.
*
* ALGORITHM EXAMPLE:
*
* Consider the example regex "ab[cd]". This regex is transformed into the
* following NFA (for simplicity we show colors as their single members):
*
* 4#
* c/
* a b /
* 1* --- 2 ---- 3
* \
* d\
* 5#
*
* We use * to mark initial state and # to mark final state. It's not depicted,
* but states 1, 4, 5 have self-referencing arcs for all possible characters,
* because this pattern can match to any part of a string.
*
* As the result of stage 2 we will have the following graph:
*
* abc abd
* 2# <---- 1* ----> 3#
*
* The process for generating this graph is:
* 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
* 2) Add key (UNKNOWN, "a", 2) to state 1.
* 3) Add key ("a", "b", 3) to state 1.
* 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
* from state 1 to state 2 with label trigram "abc".
* 5) Mark state 2 final because state 4 of source NFA is marked as final.
* 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
* from state 1 to state 3 with label trigram "abd".
* 7) Mark state 3 final because state 5 of source NFA is marked as final.
*
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* contrib/pg_trgm/trgm_regexp.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "regex/regexport.h"
#include "trgm.h"
#include "tsearch/ts_locale.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/*
* Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
* for exploring and debugging the algorithm implementation.
* This produces three graph files in /tmp, in Graphviz .gv format.
* Some progress information is also printed to postmaster stderr.
*/
/* #define TRGM_REGEXP_DEBUG */
/*
* These parameters are used to limit the amount of work done.
* Otherwise regex processing could be too slow and memory-consuming.
*
* MAX_EXPANDED_STATES - How many states we allow in expanded graph
* MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
* MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
* WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
* COLOR_COUNT_LIMIT - Maximum number of characters per color
*/
#define MAX_EXPANDED_STATES 128
#define MAX_EXPANDED_ARCS 1024
#define MAX_TRGM_COUNT 256
#define WISH_TRGM_PENALTY 16
#define COLOR_COUNT_LIMIT 256
/*
* Penalty multipliers for trigram counts depending on whitespace contents.
* Numbers based on analysis of real-life texts.
*/
static const float4 penalties[8] = {
1.0f, /* "aaa" */
3.5f, /* "aa " */
0.0f, /* "a a" (impossible) */
0.0f, /* "a " (impossible) */
4.2f, /* " aa" */
2.1f, /* " a " */
25.0f, /* " a" */
0.0f /* " " (impossible) */
};
/* Struct representing a single pg_wchar, converted back to multibyte form */
typedef struct
{
char bytes[MAX_MULTIBYTE_CHAR_LEN];
} trgm_mb_char;
/*
* Attributes of NFA colors:
*
* expandable - we know the character expansion of this color
* containsNonWord - color contains non-word characters
* (which will not be extracted into trigrams)
* wordCharsCount - count of word characters in color
* wordChars - array of this color's word characters
* (which can be extracted into trigrams)
*
* When expandable is false, the other attributes don't matter; we just
* assume this color represents unknown character(s).
*/
typedef struct
{
bool expandable;
bool containsNonWord;
int wordCharsCount;
trgm_mb_char *wordChars;
} TrgmColorInfo;
/*
* A "prefix" is information about the colors of the last two characters read
* before reaching a specific NFA state. These colors can have special values
* COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
* information, for example because we read some character of an unexpandable
* color. COLOR_BLANK means that we read a non-word character.
*
* We call a prefix ambiguous if at least one of its colors is unknown. It's
* fully ambiguous if both are unknown, partially ambiguous if only the first
* is unknown. (The case of first color known, second unknown is not valid.)
*
* Wholly- or partly-blank prefixes are mostly handled the same as regular
* color prefixes. This allows us to generate appropriate partly-blank
* trigrams when the NFA requires word character(s) to appear adjacent to
* non-word character(s).
*/
typedef int TrgmColor;
/* We assume that colors returned by the regexp engine cannot be these: */
#define COLOR_UNKNOWN (-3)
#define COLOR_BLANK (-4)
typedef struct
{
TrgmColor colors[2];
} TrgmPrefix;
/*
* Color-trigram data type. Note that some elements of the trigram can be
* COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
*/
typedef struct
{
TrgmColor colors[3];
} ColorTrgm;
/*
* Key identifying a state of our expanded graph: color prefix, and number
* of the corresponding state in the underlying regex NFA. The color prefix
* shows how we reached the regex state (to the extent that we know it).
*/
typedef struct
{
TrgmPrefix prefix;
int nstate;
} TrgmStateKey;
/*
* One state of the expanded graph.
*
* stateKey - ID of this state
* arcs - outgoing arcs of this state (List of TrgmArc)
* enterKeys - enter keys reachable from this state without reading any
* predictable trigram (List of TrgmStateKey)
* flags - flag bits
* snumber - number of this state (initially assigned as -1, -2, etc,
* for debugging purposes only; then at the packaging stage,
* surviving states are renumbered with positive numbers)
* parent - parent state, if this state has been merged into another
* tentFlags - flags this state would acquire via planned merges
* tentParent - planned parent state, if considering a merge
*/
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
typedef struct TrgmState
{
TrgmStateKey stateKey; /* hashtable key: must be first field */
List *arcs;
List *enterKeys;
int flags;
int snumber;
struct TrgmState *parent;
int tentFlags;
struct TrgmState *tentParent;
} TrgmState;
/*
* One arc in the expanded graph.
*/
typedef struct
{
ColorTrgm ctrgm; /* trigram needed to traverse arc */
TrgmState *target; /* next state */
} TrgmArc;
/*
* Information about arc of specific color trigram (used in stage 3)
*
* Contains pointers to the source and target states.
*/
typedef struct
{
TrgmState *source;
TrgmState *target;
} TrgmArcInfo;
/*
* Information about color trigram (used in stage 3)
*
* ctrgm - trigram itself
* cnumber - number of this trigram (used in the packaging stage)
* count - number of simple trigrams created from this color trigram
* expanded - indicates this color trigram is expanded into simple trigrams
* arcs - list of all arcs labeled with this color trigram.
*/
typedef struct
{
ColorTrgm ctrgm;
int cnumber;
int count;
float4 penalty;
bool expanded;
List *arcs;
} ColorTrgmInfo;
/*
* Data structure representing all the data we need during regex processing.
*
* regex - compiled regex
* colorInfo - extracted information about regex's colors
* ncolors - number of colors in colorInfo[]
* states - hashtable of TrgmStates (states of expanded graph)
* initState - pointer to initial state of expanded graph
* queue - queue of to-be-processed TrgmStates
* keysQueue - queue of to-be-processed TrgmStateKeys
* arcsCount - total number of arcs of expanded graph (for resource
* limiting)
* overflowed - we have exceeded resource limit for transformation
* colorTrgms - array of all color trigrams present in graph
* colorTrgmsCount - count of those color trigrams
* totalTrgmCount - total count of extracted simple trigrams
*/
typedef struct
{
/* Source regexp, and color information extracted from it (stage 1) */
regex_t *regex;
TrgmColorInfo *colorInfo;
int ncolors;
/* Expanded graph (stage 2) */
HTAB *states;
TrgmState *initState;
int nstates;
/* Workspace for stage 2 */
List *queue;
List *keysQueue;
int arcsCount;
bool overflowed;
/* Information about distinct color trigrams in the graph (stage 3) */
ColorTrgmInfo *colorTrgms;
int colorTrgmsCount;
int totalTrgmCount;
} TrgmNFA;
/*
* Final, compact representation of expanded graph.
*/
typedef struct
{
int targetState; /* index of target state (zero-based) */
int colorTrgm; /* index of color trigram for transition */
} TrgmPackedArc;
typedef struct
{
int arcsCount; /* number of out-arcs for this state */
TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
} TrgmPackedState;
/* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
struct TrgmPackedGraph
{
/*
* colorTrigramsCount and colorTrigramGroups contain information about how
* trigrams are grouped into color trigrams. "colorTrigramsCount" is the
* count of color trigrams and "colorTrigramGroups" contains number of
* simple trigrams for each color trigram. The array of simple trigrams
* (stored separately from this struct) is ordered so that the simple
* trigrams for each color trigram are consecutive, and they're in order
* by color trigram number.
*/
int colorTrigramsCount;
int *colorTrigramGroups; /* array of size colorTrigramsCount */
/*
* The states of the simplified NFA. State number 0 is always initial
* state and state number 1 is always final state.
*/
int statesCount;
TrgmPackedState *states; /* array of size statesCount */
/* Temporary work space for trigramsMatchGraph() */
bool *colorTrigramsActive; /* array of size colorTrigramsCount */
bool *statesActive; /* array of size statesCount */
int *statesQueue; /* array of size statesCount */
};
/*
* Temporary structure for representing an arc during packaging.
*/
typedef struct
{
int sourceState;
int targetState;
int colorTrgm;
} TrgmPackArcInfo;
/* prototypes for private functions */
static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
MemoryContext rcontext);
static void RE_compile(regex_t *regex, text *text_re,
int cflags, Oid collation);
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
static void transformGraph(TrgmNFA *trgmNFA);
static void processState(TrgmNFA *trgmNFA, TrgmState *state);
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
TrgmColor co, TrgmStateKey *destKey);
static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
static bool selectColorTrigrams(TrgmNFA *trgmNFA);
static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
static void mergeStates(TrgmState *state1, TrgmState *state2);
static int colorTrgmInfoCmp(const void *p1, const void *p2);
static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
static int packArcInfoCmp(const void *a1, const void *a2);
#ifdef TRGM_REGEXP_DEBUG
static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
static void printTrgmNFA(TrgmNFA *trgmNFA);
static void printTrgmColor(StringInfo buf, TrgmColor co);
static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
#endif
/*
* Main entry point to process a regular expression.
*
* Returns an array of trigrams required by the regular expression, or NULL if
* the regular expression was too complex to analyze. In addition, a packed
* graph representation of the regex is returned into *graph. The results
* must be allocated in rcontext (which might or might not be the current
* context).
*/
TRGM *
createTrgmNFA(text *text_re, Oid collation,
TrgmPackedGraph **graph, MemoryContext rcontext)
{
TRGM *trg;
regex_t regex;
MemoryContext tmpcontext;
MemoryContext oldcontext;
/*
* This processing generates a great deal of cruft, which we'd like to
* clean up before returning (since this function may be called in a
* query-lifespan memory context). Make a temp context we can work in so
* that cleanup is easy.
*/
tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
"createTrgmNFA temporary context",
ALLOCSET_DEFAULT_SIZES);
oldcontext = MemoryContextSwitchTo(tmpcontext);
/*
* Stage 1: Compile the regexp into a NFA, using the regexp library.
*/
#ifdef IGNORECASE
RE_compile(&regex, text_re,
REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
#else
RE_compile(&regex, text_re,
REG_ADVANCED | REG_NOSUB, collation);
#endif
/*
* Since the regexp library allocates its internal data structures with
* malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets
* done even if there's an error.
*/
PG_TRY();
{
trg = createTrgmNFAInternal(&regex, graph, rcontext);
}
PG_FINALLY();
{
pg_regfree(&regex);
}
PG_END_TRY();
/* Clean up all the cruft we created */
MemoryContextSwitchTo(oldcontext);
MemoryContextDelete(tmpcontext);
return trg;
}
/*
* Body of createTrgmNFA, exclusive of regex compilation/freeing.
*/
static TRGM *
createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
MemoryContext rcontext)
{
TRGM *trg;
TrgmNFA trgmNFA;
trgmNFA.regex = regex;
/* Collect color information from the regex */
getColorInfo(regex, &trgmNFA);
#ifdef TRGM_REGEXP_DEBUG
printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
#endif
/*
* Stage 2: Create an expanded graph from the source NFA.
*/
transformGraph(&trgmNFA);
#ifdef TRGM_REGEXP_DEBUG
printTrgmNFA(&trgmNFA);
#endif
/*
* Fail if we were unable to make a nontrivial graph, ie it is possible to
* get from the initial state to the final state without reading any
* predictable trigram.
*/
if (trgmNFA.initState->flags & TSTATE_FIN)
return NULL;
/*
* Stage 3: Select color trigrams to expand. Fail if too many trigrams.
*/
if (!selectColorTrigrams(&trgmNFA))
return NULL;
/*
* Stage 4: Expand color trigrams and pack graph into final
* representation.
*/
trg = expandColorTrigrams(&trgmNFA, rcontext);
*graph = packGraph(&trgmNFA, rcontext);
#ifdef TRGM_REGEXP_DEBUG
printTrgmPackedGraph(*graph, trg);
#endif
return trg;
}
/*
* Main entry point for evaluating a graph during index scanning.
*
* The check[] array is indexed by trigram number (in the array of simple
* trigrams returned by createTrgmNFA), and holds true for those trigrams
* that are present in the index entry being checked.
*/
bool
trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
{
int i,
j,
k,
queueIn,
queueOut;
/*
* Reset temporary working areas.
*/
memset(graph->colorTrigramsActive, 0,
sizeof(bool) * graph->colorTrigramsCount);
memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
/*
* Check which color trigrams were matched. A match for any simple
* trigram associated with a color trigram counts as a match of the color
* trigram.
*/
j = 0;
for (i = 0; i < graph->colorTrigramsCount; i++)
{
int cnt = graph->colorTrigramGroups[i];
for (k = j; k < j + cnt; k++)
{
if (check[k])
{
/*
* Found one matched trigram in the group. Can skip the rest
* of them and go to the next group.
*/
graph->colorTrigramsActive[i] = true;
break;
}
}
j = j + cnt;
}
/*
* Initialize the statesQueue to hold just the initial state. Note:
* statesQueue has room for statesCount entries, which is certainly enough
* since no state will be put in the queue more than once. The
* statesActive array marks which states have been queued.
*/
graph->statesActive[0] = true;
graph->statesQueue[0] = 0;
queueIn = 0;
queueOut = 1;
/* Process queued states as long as there are any. */
while (queueIn < queueOut)
{
int stateno = graph->statesQueue[queueIn++];
TrgmPackedState *state = &graph->states[stateno];
int cnt = state->arcsCount;
/* Loop over state's out-arcs */
for (i = 0; i < cnt; i++)
{
TrgmPackedArc *arc = &state->arcs[i];
/*
* If corresponding color trigram is present then activate the
* corresponding state. We're done if that's the final state,
* otherwise queue the state if it's not been queued already.
*/
if (graph->colorTrigramsActive[arc->colorTrgm])
{
int nextstate = arc->targetState;
if (nextstate == 1)
return true; /* success: final state is reachable */
if (!graph->statesActive[nextstate])
{
graph->statesActive[nextstate] = true;
graph->statesQueue[queueOut++] = nextstate;
}
}
}
}
/* Queue is empty, so match fails. */
return false;
}
/*
* Compile regex string into struct at *regex.
* NB: pg_regfree must be applied to regex if this completes successfully.
*/
static void
RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
{
int text_re_len = VARSIZE_ANY_EXHDR(text_re);
char *text_re_val = VARDATA_ANY(text_re);
pg_wchar *pattern;
int pattern_len;
int regcomp_result;
char errMsg[100];
/* Convert pattern string to wide characters */
pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
pattern_len = pg_mb2wchar_with_len(text_re_val,
pattern,
text_re_len);
/* Compile regex */
regcomp_result = pg_regcomp(regex,
pattern,
pattern_len,
cflags,
collation);
pfree(pattern);
if (regcomp_result != REG_OKAY)
{
/* re didn't compile (no need for pg_regfree, if so) */
pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
ereport(ERROR,
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
errmsg("invalid regular expression: %s", errMsg)));
}
}
/*---------------------
* Subroutines for pre-processing the color map (stage 1).
*---------------------
*/
/*
* Fill TrgmColorInfo structure for each color using regex export functions.
*/
static void
getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
{
int colorsCount = pg_reg_getnumcolors(regex);
int i;
trgmNFA->ncolors = colorsCount;
trgmNFA->colorInfo = (TrgmColorInfo *)
palloc0(colorsCount * sizeof(TrgmColorInfo));
/*
* Loop over colors, filling TrgmColorInfo about each. Note we include
* WHITE (0) even though we know it'll be reported as non-expandable.
*/
for (i = 0; i < colorsCount; i++)
{
TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
int charsCount = pg_reg_getnumcharacters(regex, i);
pg_wchar *chars;
int j;
if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
{
/* Non expandable, or too large to work with */
colorInfo->expandable = false;
continue;
}
colorInfo->expandable = true;
colorInfo->containsNonWord = false;
colorInfo->wordChars = (trgm_mb_char *)
palloc(sizeof(trgm_mb_char) * charsCount);
colorInfo->wordCharsCount = 0;
/* Extract all the chars in this color */
chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount);
pg_reg_getcharacters(regex, i, chars, charsCount);
/*
* Convert characters back to multibyte form, and save only those that
* are word characters. Set "containsNonWord" if any non-word
* character. (Note: it'd probably be nicer to keep the chars in
* pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
*/
for (j = 0; j < charsCount; j++)
{
trgm_mb_char c;
if (!convertPgWchar(chars[j], &c))
continue; /* ok to ignore it altogether */
if (ISWORDCHR(c.bytes))
colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
else
colorInfo->containsNonWord = true;
}
pfree(chars);
}
}
/*
* Convert pg_wchar to multibyte format.
* Returns false if the character should be ignored completely.
*/
static bool
convertPgWchar(pg_wchar c, trgm_mb_char *result)
{
/* "s" has enough space for a multibyte character and a trailing NUL */
char s[MAX_MULTIBYTE_CHAR_LEN + 1];
/*
* We can ignore the NUL character, since it can never appear in a PG text
* string. This avoids the need for various special cases when
* reconstructing trigrams.
*/
if (c == 0)
return false;
/* Do the conversion, making sure the result is NUL-terminated */
memset(s, 0, sizeof(s));
pg_wchar2mb_with_len(&c, s, 1);
/*
* In IGNORECASE mode, we can ignore uppercase characters. We assume that
* the regex engine generated both uppercase and lowercase equivalents
* within each color, since we used the REG_ICASE option; so there's no
* need to process the uppercase version.
*
* XXX this code is dependent on the assumption that lowerstr() works the
* same as the regex engine's internal case folding machinery. Might be
* wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c).
* On the other hand, the trigrams in the index were created using
* lowerstr(), so we're probably screwed if there's any incompatibility
* anyway.
*/
#ifdef IGNORECASE
{
char *lowerCased = lowerstr(s);
if (strcmp(lowerCased, s) != 0)
{
pfree(lowerCased);
return false;
}
pfree(lowerCased);
}
#endif
/* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
return true;
}
/*---------------------
* Subroutines for expanding original NFA graph into a trigram graph (stage 2).
*---------------------
*/
/*
* Transform the graph, given a regex and extracted color information.
*
* We create and process a queue of expanded-graph states until all the states
* are processed.
*
* This algorithm may be stopped due to resource limitation. In this case we
* force every unprocessed branch to immediately finish with matching (this
* can give us false positives but no false negatives) by marking all
* unprocessed states as final.
*/
static void
transformGraph(TrgmNFA *trgmNFA)
{
HASHCTL hashCtl;
TrgmStateKey initkey;
TrgmState *initstate;
ListCell *lc;
/* Initialize this stage's workspace in trgmNFA struct */
trgmNFA->queue = NIL;
trgmNFA->keysQueue = NIL;
trgmNFA->arcsCount = 0;
trgmNFA->overflowed = false;
/* Create hashtable for states */
hashCtl.keysize = sizeof(TrgmStateKey);
hashCtl.entrysize = sizeof(TrgmState);
hashCtl.hcxt = CurrentMemoryContext;
trgmNFA->states = hash_create("Trigram NFA",
1024,
&hashCtl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
trgmNFA->nstates = 0;
/* Create initial state: ambiguous prefix, NFA's initial state */
MemSet(&initkey, 0, sizeof(initkey));
initkey.prefix.colors[0] = COLOR_UNKNOWN;
initkey.prefix.colors[1] = COLOR_UNKNOWN;
initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
initstate = getState(trgmNFA, &initkey);
initstate->flags |= TSTATE_INIT;
trgmNFA->initState = initstate;
/*
* Recursively build the expanded graph by processing queue of states
* (breadth-first search). getState already put initstate in the queue.
* Note that getState will append new states to the queue within the loop,
* too; this works as long as we don't do repeat fetches using the "lc"
* pointer.
*/
foreach(lc, trgmNFA->queue)
{
TrgmState *state = (TrgmState *) lfirst(lc);
/*
* If we overflowed then just mark state as final. Otherwise do
* actual processing.
*/
if (trgmNFA->overflowed)
state->flags |= TSTATE_FIN;
else
processState(trgmNFA, state);
/* Did we overflow? */
if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES)
trgmNFA->overflowed = true;
}
}
/*
* Process one state: add enter keys and then add outgoing arcs.
*/
static void
processState(TrgmNFA *trgmNFA, TrgmState *state)
{
ListCell *lc;
/* keysQueue should be NIL already, but make sure */
trgmNFA->keysQueue = NIL;
/*
* Add state's own key, and then process all keys added to keysQueue until
* queue is finished. But we can quit if the state gets marked final.
*/
addKey(trgmNFA, state, &state->stateKey);
foreach(lc, trgmNFA->keysQueue)
{
TrgmStateKey *key = (TrgmStateKey *) lfirst(lc);
if (state->flags & TSTATE_FIN)
break;
addKey(trgmNFA, state, key);
}
/* Release keysQueue to clean up for next cycle */
list_free(trgmNFA->keysQueue);
trgmNFA->keysQueue = NIL;
/*
* Add outgoing arcs only if state isn't final (we have no interest in
* outgoing arcs if we already match)
*/
if (!(state->flags & TSTATE_FIN))
addArcs(trgmNFA, state);
}
/*
* Add the given enter key into the state's enterKeys list, and determine
* whether this should result in any further enter keys being added.
* If so, add those keys to keysQueue so that processState will handle them.
*
* If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
* This situation means that we can reach the final state from this expanded
* state without reading any predictable trigram, so we must consider this
* state as an accepting one.
*
* The given key could be a duplicate of one already in enterKeys, or be
* redundant with some enterKeys. So we check that before doing anything.
*
* Note that we don't generate any actual arcs here. addArcs will do that
* later, after we have identified all the enter keys for this state.
*/
static void
addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
{
regex_arc_t *arcs;
TrgmStateKey destKey;
ListCell *cell;
int i,
arcsCount;
/*
* Ensure any pad bytes in destKey are zero, since it may get used as a
* hashtable key by getState.
*/
MemSet(&destKey, 0, sizeof(destKey));
/*
* Compare key to each existing enter key of the state to check for
* redundancy. We can drop either old key(s) or the new key if we find
* redundancy.
*/
foreach(cell, state->enterKeys)
{
TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
if (existingKey->nstate == key->nstate)
{
if (prefixContains(&existingKey->prefix, &key->prefix))
{
/* This old key already covers the new key. Nothing to do */
return;
}
if (prefixContains(&key->prefix, &existingKey->prefix))
{
/*
* The new key covers this old key. Remove the old key, it's
* no longer needed once we add this key to the list.
*/
state->enterKeys = foreach_delete_current(state->enterKeys,
cell);
}
}
}
/* No redundancy, so add this key to the state's list */
state->enterKeys = lappend(state->enterKeys, key);
/* If state is now known final, mark it and we're done */
if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
{
state->flags |= TSTATE_FIN;
return;
}
/*
* Loop through all outgoing arcs of the corresponding state in the
* original NFA.
*/
arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
for (i = 0; i < arcsCount; i++)
{
regex_arc_t *arc = &arcs[i];
if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
{
/*
* Start of line/string (^). Trigram extraction treats start of
* line same as start of word: double space prefix is added.
* Hence, make an enter key showing we can reach the arc
* destination with all-blank prefix.
*/
destKey.prefix.colors[0] = COLOR_BLANK;
destKey.prefix.colors[1] = COLOR_BLANK;
destKey.nstate = arc->to;
/* Add enter key to this state */
addKeyToQueue(trgmNFA, &destKey);
}
else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
{
/*
* End of line/string ($). We must consider this arc as a
* transition that doesn't read anything. The reason for adding
* this enter key to the state is that if the arc leads to the
* NFA's final state, we must mark this expanded state as final.
*/
destKey.prefix.colors[0] = COLOR_UNKNOWN;
destKey.prefix.colors[1] = COLOR_UNKNOWN;
destKey.nstate = arc->to;
/* Add enter key to this state */
addKeyToQueue(trgmNFA, &destKey);
}
else if (arc->co >= 0)
{
/* Regular color (including WHITE) */
TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
if (colorInfo->expandable)
{
if (colorInfo->containsNonWord &&
!validArcLabel(key, COLOR_BLANK))
{
/*
* We can reach the arc destination after reading a
* non-word character, but the prefix is not something
* that addArc will accept with COLOR_BLANK, so no trigram
* arc can get made for this transition. We must make an
* enter key to show that the arc destination is
* reachable. Set it up with an all-blank prefix, since
* that corresponds to what the trigram extraction code
* will do at a word starting boundary.
*/
destKey.prefix.colors[0] = COLOR_BLANK;
destKey.prefix.colors[1] = COLOR_BLANK;
destKey.nstate = arc->to;
addKeyToQueue(trgmNFA, &destKey);
}
if (colorInfo->wordCharsCount > 0 &&
!validArcLabel(key, arc->co))
{
/*
* We can reach the arc destination after reading a word
* character, but the prefix is not something that addArc
* will accept, so no trigram arc can get made for this
* transition. We must make an enter key to show that the
* arc destination is reachable. The prefix for the enter
* key should reflect the info we have for this arc.
*/
destKey.prefix.colors[0] = key->prefix.colors[1];
destKey.prefix.colors[1] = arc->co;
destKey.nstate = arc->to;
addKeyToQueue(trgmNFA, &destKey);
}
}
else
{
/*
* Unexpandable color. Add enter key with ambiguous prefix,
* showing we can reach the destination from this state, but
* the preceding colors will be uncertain. (We do not set the
* first prefix color to key->prefix.colors[1], because a
* prefix of known followed by unknown is invalid.)
*/
destKey.prefix.colors[0] = COLOR_UNKNOWN;
destKey.prefix.colors[1] = COLOR_UNKNOWN;
destKey.nstate = arc->to;
addKeyToQueue(trgmNFA, &destKey);
}
}
else
{
/* RAINBOW: treat as unexpandable color */
destKey.prefix.colors[0] = COLOR_UNKNOWN;
destKey.prefix.colors[1] = COLOR_UNKNOWN;
destKey.nstate = arc->to;
addKeyToQueue(trgmNFA, &destKey);
}
}
pfree(arcs);
}
/*
* Add copy of given key to keysQueue for later processing.
*/
static void
addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
{
TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey));
memcpy(keyCopy, key, sizeof(TrgmStateKey));
trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
}
/*
* Add outgoing arcs from given state, whose enter keys are all now known.
*/
static void
addArcs(TrgmNFA *trgmNFA, TrgmState *state)
{
TrgmStateKey destKey;
ListCell *cell;
regex_arc_t *arcs;
int arcsCount,
i;
/*
* Ensure any pad bytes in destKey are zero, since it may get used as a
* hashtable key by getState.
*/
MemSet(&destKey, 0, sizeof(destKey));
/*
* Iterate over enter keys associated with this expanded-graph state. This
* includes both the state's own stateKey, and any enter keys we added to
* it during addKey (which represent expanded-graph states that are not
* distinguishable from this one by means of trigrams). For each such
* enter key, examine all the out-arcs of the key's underlying NFA state,
* and try to make a trigram arc leading to where the out-arc leads.
* (addArc will deal with whether the arc is valid or not.)
*/
foreach(cell, state->enterKeys)
{
TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
for (i = 0; i < arcsCount; i++)
{
regex_arc_t *arc = &arcs[i];
TrgmColorInfo *colorInfo;
/*
* Ignore non-expandable colors; addKey already handled the case.
*
* We need no special check for WHITE or begin/end pseudocolors
* here. We don't need to do any processing for them, and they
* will be marked non-expandable since the regex engine will have
* reported them that way. We do have to watch out for RAINBOW,
* which has a negative color number.
*/
if (arc->co < 0)
continue;
Assert(arc->co < trgmNFA->ncolors);
colorInfo = &trgmNFA->colorInfo[arc->co];
if (!colorInfo->expandable)
continue;
if (colorInfo->containsNonWord)
{
/*
* Color includes non-word character(s).
*
* Generate an arc, treating this transition as occurring on
* BLANK. This allows word-ending trigrams to be manufactured
* if possible.
*/
destKey.prefix.colors[0] = key->prefix.colors[1];
destKey.prefix.colors[1] = COLOR_BLANK;
destKey.nstate = arc->to;
addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
}
if (colorInfo->wordCharsCount > 0)
{
/*
* Color includes word character(s).
*
* Generate an arc. Color is pushed into prefix of target
* state.
*/
destKey.prefix.colors[0] = key->prefix.colors[1];
destKey.prefix.colors[1] = arc->co;
destKey.nstate = arc->to;
addArc(trgmNFA, state, key, arc->co, &destKey);
}
}
pfree(arcs);
}
}
/*
* Generate an out-arc of the expanded graph, if it's valid and not redundant.
*
* state: expanded-graph state we want to add an out-arc to
* key: provides prefix colors (key->nstate is not used)
* co: transition color
* destKey: identifier for destination state of expanded graph
*/
static void
addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
TrgmColor co, TrgmStateKey *destKey)
{
TrgmArc *arc;
ListCell *cell;
/* Do nothing if this wouldn't be a valid arc label trigram */
if (!validArcLabel(key, co))
return;
/*
* Check if we are going to reach key which is covered by a key which is
* already listed in this state. If so arc is useless: the NFA can bypass
* it through a path that doesn't require any predictable trigram, so
* whether the arc's trigram is present or not doesn't really matter.
*/
foreach(cell, state->enterKeys)
{
TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
if (existingKey->nstate == destKey->nstate &&
prefixContains(&existingKey->prefix, &destKey->prefix))
return;
}
/* Checks were successful, add new arc */
arc = (TrgmArc *) palloc(sizeof(TrgmArc));
arc->target = getState(trgmNFA, destKey);
arc->ctrgm.colors[0] = key->prefix.colors[0];
arc->ctrgm.colors[1] = key->prefix.colors[1];
arc->ctrgm.colors[2] = co;
state->arcs = lappend(state->arcs, arc);
trgmNFA->arcsCount++;
}
/*
* Can we make a valid trigram arc label from the given prefix and arc color?
*
* This is split out so that tests in addKey and addArc will stay in sync.
*/
static bool
validArcLabel(TrgmStateKey *key, TrgmColor co)
{
/*
* We have to know full trigram in order to add outgoing arc. So we can't
* do it if prefix is ambiguous.
*/
if (key->prefix.colors[0] == COLOR_UNKNOWN)
return false;
/* If key->prefix.colors[0] isn't unknown, its second color isn't either */
Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
/* And we should not be called with an unknown arc color anytime */
Assert(co != COLOR_UNKNOWN);
/*
* We don't bother with making arcs representing three non-word
* characters, since that's useless for trigram extraction.
*/
if (key->prefix.colors[0] == COLOR_BLANK &&
key->prefix.colors[1] == COLOR_BLANK &&
co == COLOR_BLANK)
return false;
/*
* We also reject nonblank-blank-anything. The nonblank-blank-nonblank
* case doesn't correspond to any trigram the trigram extraction code
* would make. The nonblank-blank-blank case is also not possible with
* RPADDING = 1. (Note that in many cases we'd fail to generate such a
* trigram even if it were valid, for example processing "foo bar" will
* not result in considering the trigram "o ". So if you want to support
* RPADDING = 2, there's more to do than just twiddle this test.)
*/
if (key->prefix.colors[0] != COLOR_BLANK &&
key->prefix.colors[1] == COLOR_BLANK)
return false;
/*
* Other combinations involving blank are valid, in particular we assume
* blank-blank-nonblank is valid, which presumes that LPADDING is 2.
*
* Note: Using again the example "foo bar", we will not consider the
* trigram " b", though this trigram would be found by the trigram
* extraction code. Since we will find " ba", it doesn't seem worth
* trying to hack the algorithm to generate the additional trigram.
*/
/* arc label is valid */
return true;
}
/*
* Get state of expanded graph for given state key,
* and queue the state for processing if it didn't already exist.
*/
static TrgmState *
getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
{
TrgmState *state;
bool found;
state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
&found);
if (!found)
{
/* New state: initialize and queue it */
state->arcs = NIL;
state->enterKeys = NIL;
state->flags = 0;
/* states are initially given negative numbers */
state->snumber = -(++trgmNFA->nstates);
state->parent = NULL;
state->tentFlags = 0;
state->tentParent = NULL;
trgmNFA->queue = lappend(trgmNFA->queue, state);
}
return state;
}
/*
* Check if prefix1 "contains" prefix2.
*
* "contains" means that any exact prefix (with no ambiguity) that satisfies
* prefix2 also satisfies prefix1.
*/
static bool
prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
{
if (prefix1->colors[1] == COLOR_UNKNOWN)
{
/* Fully ambiguous prefix contains everything */
return true;
}
else if (prefix1->colors[0] == COLOR_UNKNOWN)
{
/*
* Prefix with only first unknown color contains every prefix with
* same second color.
*/
if (prefix1->colors[1] == prefix2->colors[1])
return true;
else
return false;
}
else
{
/* Exact prefix contains only the exact same prefix */
if (prefix1->colors[0] == prefix2->colors[0] &&
prefix1->colors[1] == prefix2->colors[1])
return true;
else
return false;
}
}
/*---------------------
* Subroutines for expanding color trigrams into regular trigrams (stage 3).
*---------------------
*/
/*
* Get vector of all color trigrams in graph and select which of them
* to expand into simple trigrams.
*
* Returns true if OK, false if exhausted resource limits.
*/
static bool
selectColorTrigrams(TrgmNFA *trgmNFA)
{
HASH_SEQ_STATUS scan_status;
int arcsCount = trgmNFA->arcsCount,
i;
TrgmState *state;
ColorTrgmInfo *colorTrgms;
int64 totalTrgmCount;
float4 totalTrgmPenalty;
int cnumber;
/* Collect color trigrams from all arcs */
colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
trgmNFA->colorTrgms = colorTrgms;
i = 0;
hash_seq_init(&scan_status, trgmNFA->states);
while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
{
ListCell *cell;
foreach(cell, state->arcs)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
arcInfo->source = state;
arcInfo->target = arc->target;
trgmInfo->ctrgm = arc->ctrgm;
trgmInfo->cnumber = -1;
/* count and penalty will be set below */
trgmInfo->expanded = true;
trgmInfo->arcs = list_make1(arcInfo);
i++;
}
}
Assert(i == arcsCount);
/* Remove duplicates, merging their arcs lists */
if (arcsCount >= 2)
{
ColorTrgmInfo *p1,
*p2;
/* Sort trigrams to ease duplicate detection */
qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
/* p1 is probe point, p2 is last known non-duplicate. */
p2 = colorTrgms;
for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
{
if (colorTrgmInfoCmp(p1, p2) > 0)
{
p2++;
*p2 = *p1;
}
else
{
p2->arcs = list_concat(p2->arcs, p1->arcs);
}
}
trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
}
else
{
trgmNFA->colorTrgmsCount = arcsCount;
}
/*
* Count number of simple trigrams generated by each color trigram, and
* also compute a penalty value, which is the number of simple trigrams
* times a multiplier that depends on its whitespace content.
*
* Note: per-color-trigram counts cannot overflow an int so long as
* COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
* 1290. However, the grand total totalTrgmCount might conceivably
* overflow an int, so we use int64 for that within this routine. Also,
* penalties are calculated in float4 arithmetic to avoid any overflow
* worries.
*/
totalTrgmCount = 0;
totalTrgmPenalty = 0.0f;
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
int j,
count = 1,
typeIndex = 0;
for (j = 0; j < 3; j++)
{
TrgmColor c = trgmInfo->ctrgm.colors[j];
typeIndex *= 2;
if (c == COLOR_BLANK)
typeIndex++;
else
count *= trgmNFA->colorInfo[c].wordCharsCount;
}
trgmInfo->count = count;
totalTrgmCount += count;
trgmInfo->penalty = penalties[typeIndex] * (float4) count;
totalTrgmPenalty += trgmInfo->penalty;
}
/* Sort color trigrams in descending order of their penalties */
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
colorTrgmInfoPenaltyCmp);
/*
* Remove color trigrams from the graph so long as total penalty of color
* trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
* WISH_TRGM_PENALTY, it's OK so long as total count is no more than
* MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
* penalty, since those are the most promising for reducing the total
* penalty. When removing a color trigram we have to merge states
* connected by arcs labeled with that trigram. It's necessary to not
* merge initial and final states, because our graph becomes useless if
* that happens; so we cannot always remove the trigram we'd prefer to.
*/
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
ColorTrgmInfo *trgmInfo = &colorTrgms[i];
bool canRemove = true;
ListCell *cell;
/* Done if we've reached the target */
if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
break;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
trgmInfo->ctrgm.colors[0],
trgmInfo->ctrgm.colors[1],
trgmInfo->ctrgm.colors[2],
trgmInfo->penalty,
list_length(trgmInfo->arcs));
#endif
/*
* Does any arc of this color trigram connect initial and final
* states? If so we can't remove it.
*/
foreach(cell, trgmInfo->arcs)
{
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
TrgmState *source = arcInfo->source,
*target = arcInfo->target;
int source_flags,
target_flags;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
-target->snumber, target->flags,
-source->snumber, source->flags);
#endif
/* examine parent states, if any merging has already happened */
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
-target->snumber, target->flags,
-source->snumber, source->flags);
#endif
/* we must also consider merges we are planning right now */
source_flags = source->flags | source->tentFlags;
while (source->tentParent)
{
source = source->tentParent;
source_flags |= source->flags | source->tentFlags;
}
target_flags = target->flags | target->tentFlags;
while (target->tentParent)
{
target = target->tentParent;
target_flags |= target->flags | target->tentFlags;
}
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
-target->snumber, target_flags,
-source->snumber, source_flags);
#endif
/* would fully-merged state have both INIT and FIN set? */
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
(TSTATE_INIT | TSTATE_FIN))
{
canRemove = false;
break;
}
/* ok so far, so remember planned merge */
if (source != target)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... tentatively merging s%d into s%d\n",
-target->snumber, -source->snumber);
#endif
target->tentParent = source;
source->tentFlags |= target_flags;
}
}
/*
* We must reset all the tentFlags/tentParent fields before
* continuing. tentFlags could only have become set in states that
* are the source or parent or tentative parent of one of the current
* arcs; likewise tentParent could only have become set in states that
* are the target or parent or tentative parent of one of the current
* arcs. There might be some overlap between those sets, but if we
* clear tentFlags in target states as well as source states, we
* should be okay even if we visit a state as target before visiting
* it as a source.
*/
foreach(cell, trgmInfo->arcs)
{
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
TrgmState *source = arcInfo->source,
*target = arcInfo->target;
TrgmState *ttarget;
/* no need to touch previously-merged states */
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
while (source)
{
source->tentFlags = 0;
source = source->tentParent;
}
while ((ttarget = target->tentParent) != NULL)
{
target->tentParent = NULL;
target->tentFlags = 0; /* in case it was also a source */
target = ttarget;
}
}
/* Now, move on if we can't drop this trigram */
if (!canRemove)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, " ... not ok to merge\n");
#endif
continue;
}
/* OK, merge states linked by each arc labeled by the trigram */
foreach(cell, trgmInfo->arcs)
{
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
TrgmState *source = arcInfo->source,
*target = arcInfo->target;
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
if (source != target)
{
#ifdef TRGM_REGEXP_DEBUG
fprintf(stderr, "merging s%d into s%d\n",
-target->snumber, -source->snumber);
#endif
mergeStates(source, target);
/* Assert we didn't merge initial and final states */
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
(TSTATE_INIT | TSTATE_FIN));
}
}
/* Mark trigram unexpanded, and update totals */
trgmInfo->expanded = false;
totalTrgmCount -= trgmInfo->count;
totalTrgmPenalty -= trgmInfo->penalty;
}
/* Did we succeed in fitting into MAX_TRGM_COUNT? */
if (totalTrgmCount > MAX_TRGM_COUNT)
return false;
trgmNFA->totalTrgmCount = (int) totalTrgmCount;
/*
* Sort color trigrams by colors (will be useful for bsearch in packGraph)
* and enumerate the color trigrams that are expanded.
*/
cnumber = 0;
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
colorTrgmInfoCmp);
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
if (colorTrgms[i].expanded)
{
colorTrgms[i].cnumber = cnumber;
cnumber++;
}
}
return true;
}
/*
* Expand selected color trigrams into regular trigrams.
*
* Returns the TRGM array to be passed to the index machinery.
* The array must be allocated in rcontext.
*/
static TRGM *
expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
{
TRGM *trg;
trgm *p;
int i;
TrgmColorInfo blankColor;
trgm_mb_char blankChar;
/* Set up "blank" color structure containing a single zero character */
memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
blankColor.wordCharsCount = 1;
blankColor.wordChars = &blankChar;
/* Construct the trgm array */
trg = (TRGM *)
MemoryContextAllocZero(rcontext,
TRGMHDRSIZE +
trgmNFA->totalTrgmCount * sizeof(trgm));
trg->flag = ARRKEY;
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
p = GETARR(trg);
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
TrgmColorInfo *c[3];
trgm_mb_char s[3];
int j,
i1,
i2,
i3;
/* Ignore any unexpanded trigrams ... */
if (!colorTrgm->expanded)
continue;
/* Get colors, substituting the dummy struct for COLOR_BLANK */
for (j = 0; j < 3; j++)
{
if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
else
c[j] = &blankColor;
}
/* Iterate over all possible combinations of colors' characters */
for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
{
s[0] = c[0]->wordChars[i1];
for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
{
s[1] = c[1]->wordChars[i2];
for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
{
s[2] = c[2]->wordChars[i3];
fillTrgm(p, s);
p++;
}
}
}
}
return trg;
}
/*
* Convert trigram into trgm datatype.
*/
static void
fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
{
char str[3 * MAX_MULTIBYTE_CHAR_LEN],
*p;
int i,
j;
/* Write multibyte string into "str" (we don't need null termination) */
p = str;
for (i = 0; i < 3; i++)
{
if (s[i].bytes[0] != 0)
{
for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
*p++ = s[i].bytes[j];
}
else
{
/* Emit a space in place of COLOR_BLANK */
*p++ = ' ';
}
}
/* Convert "str" to a standard trigram (possibly hashing it) */
compact_trigram(ptrgm, str, p - str);
}
/*
* Merge two states of graph.
*/
static void
mergeStates(TrgmState *state1, TrgmState *state2)
{
Assert(state1 != state2);
Assert(!state1->parent);
Assert(!state2->parent);
/* state1 absorbs state2's flags */
state1->flags |= state2->flags;
/* state2, and indirectly all its children, become children of state1 */
state2->parent = state1;
}
/*
* Compare function for sorting of color trigrams by their colors.
*/
static int
colorTrgmInfoCmp(const void *p1, const void *p2)
{
const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
}
/*
* Compare function for sorting color trigrams in descending order of
* their penalty fields.
*/
static int
colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
{
float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
if (penalty1 < penalty2)
return 1;
else if (penalty1 == penalty2)
return 0;
else
return -1;
}
/*---------------------
* Subroutines for packing the graph into final representation (stage 4).
*---------------------
*/
/*
* Pack expanded graph into final representation.
*
* The result data must be allocated in rcontext.
*/
static TrgmPackedGraph *
packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
{
int snumber = 2,
arcIndex,
arcsCount;
HASH_SEQ_STATUS scan_status;
TrgmState *state;
TrgmPackArcInfo *arcs,
*p1,
*p2;
TrgmPackedArc *packedArcs;
TrgmPackedGraph *result;
int i,
j;
/* Enumerate surviving states, giving init and fin reserved numbers */
hash_seq_init(&scan_status, trgmNFA->states);
while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
{
while (state->parent)
state = state->parent;
if (state->snumber < 0)
{
if (state->flags & TSTATE_INIT)
state->snumber = 0;
else if (state->flags & TSTATE_FIN)
state->snumber = 1;
else
{
state->snumber = snumber;
snumber++;
}
}
}
/* Collect array of all arcs */
arcs = (TrgmPackArcInfo *)
palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount);
arcIndex = 0;
hash_seq_init(&scan_status, trgmNFA->states);
while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
{
TrgmState *source = state;
ListCell *cell;
while (source->parent)
source = source->parent;
foreach(cell, state->arcs)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
TrgmState *target = arc->target;
while (target->parent)
target = target->parent;
if (source->snumber != target->snumber)
{
ColorTrgmInfo *ctrgm;
ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
trgmNFA->colorTrgms,
trgmNFA->colorTrgmsCount,
sizeof(ColorTrgmInfo),
colorTrgmInfoCmp);
Assert(ctrgm != NULL);
Assert(ctrgm->expanded);
arcs[arcIndex].sourceState = source->snumber;
arcs[arcIndex].targetState = target->snumber;
arcs[arcIndex].colorTrgm = ctrgm->cnumber;
arcIndex++;
}
}
}
/* Sort arcs to ease duplicate detection */
qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
/* We could have duplicates because states were merged. Remove them. */
/* p1 is probe point, p2 is last known non-duplicate. */
p2 = arcs;
for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
{
if (packArcInfoCmp(p1, p2) > 0)
{
p2++;
*p2 = *p1;
}
}
arcsCount = (p2 - arcs) + 1;
/* Create packed representation */
result = (TrgmPackedGraph *)
MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
/* Pack color trigrams information */
result->colorTrigramsCount = 0;
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
if (trgmNFA->colorTrgms[i].expanded)
result->colorTrigramsCount++;
}
result->colorTrigramGroups = (int *)
MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
j = 0;
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
if (trgmNFA->colorTrgms[i].expanded)
{
result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
j++;
}
}
/* Pack states and arcs information */
result->statesCount = snumber;
result->states = (TrgmPackedState *)
MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
packedArcs = (TrgmPackedArc *)
MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
j = 0;
for (i = 0; i < snumber; i++)
{
int cnt = 0;
result->states[i].arcs = &packedArcs[j];
while (j < arcsCount && arcs[j].sourceState == i)
{
packedArcs[j].targetState = arcs[j].targetState;
packedArcs[j].colorTrgm = arcs[j].colorTrgm;
cnt++;
j++;
}
result->states[i].arcsCount = cnt;
}
/* Allocate working memory for trigramsMatchGraph() */
result->colorTrigramsActive = (bool *)
MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
result->statesActive = (bool *)
MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
result->statesQueue = (int *)
MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
return result;
}
/*
* Comparison function for sorting TrgmPackArcInfos.
*
* Compares arcs in following order: sourceState, colorTrgm, targetState.
*/
static int
packArcInfoCmp(const void *a1, const void *a2)
{
const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
if (p1->sourceState < p2->sourceState)
return -1;
if (p1->sourceState > p2->sourceState)
return 1;
if (p1->colorTrgm < p2->colorTrgm)
return -1;
if (p1->colorTrgm > p2->colorTrgm)
return 1;
if (p1->targetState < p2->targetState)
return -1;
if (p1->targetState > p2->targetState)
return 1;
return 0;
}
/*---------------------
* Debugging functions
*
* These are designed to emit GraphViz files.
*---------------------
*/
#ifdef TRGM_REGEXP_DEBUG
/*
* Print initial NFA, in regexp library's representation
*/
static void
printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
{
StringInfoData buf;
int nstates = pg_reg_getnumstates(regex);
int state;
int i;
initStringInfo(&buf);
appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
for (state = 0; state < nstates; state++)
{
regex_arc_t *arcs;
int i,
arcsCount;
appendStringInfo(&buf, "s%d", state);
if (pg_reg_getfinalstate(regex) == state)
appendStringInfoString(&buf, " [shape = doublecircle]");
appendStringInfoString(&buf, ";\n");
arcsCount = pg_reg_getnumoutarcs(regex, state);
arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount);
pg_reg_getoutarcs(regex, state, arcs, arcsCount);
for (i = 0; i < arcsCount; i++)
{
appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
state, arcs[i].to, arcs[i].co);
}
pfree(arcs);
}
appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
appendStringInfo(&buf, " initial -> s%d;\n",
pg_reg_getinitialstate(regex));
/* Print colors */
appendStringInfoString(&buf, " { rank = sink;\n");
appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
for (i = 0; i < ncolors; i++)
{
TrgmColorInfo *color = &colors[i];
int j;
appendStringInfo(&buf, "<br/>Color %d: ", i);
if (color->expandable)
{
for (j = 0; j < color->wordCharsCount; j++)
{
char s[MAX_MULTIBYTE_CHAR_LEN + 1];
memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
appendStringInfoString(&buf, s);
}
}
else
appendStringInfoString(&buf, "not expandable");
appendStringInfoChar(&buf, '\n');
}
appendStringInfoString(&buf, " >];\n");
appendStringInfoString(&buf, " }\n");
appendStringInfoString(&buf, "}\n");
{
/* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
FILE *fp = fopen("/tmp/source.gv", "w");
fprintf(fp, "%s", buf.data);
fclose(fp);
}
pfree(buf.data);
}
/*
* Print expanded graph.
*/
static void
printTrgmNFA(TrgmNFA *trgmNFA)
{
StringInfoData buf;
HASH_SEQ_STATUS scan_status;
TrgmState *state;
TrgmState *initstate = NULL;
initStringInfo(&buf);
appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
hash_seq_init(&scan_status, trgmNFA->states);
while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
{
ListCell *cell;
appendStringInfo(&buf, "s%d", -state->snumber);
if (state->flags & TSTATE_FIN)
appendStringInfoString(&buf, " [shape = doublecircle]");
if (state->flags & TSTATE_INIT)
initstate = state;
appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
appendStringInfoString(&buf, ";\n");
foreach(cell, state->arcs)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
appendStringInfo(&buf, " s%d -> s%d [label = \"",
-state->snumber, -arc->target->snumber);
printTrgmColor(&buf, arc->ctrgm.colors[0]);
appendStringInfoChar(&buf, ' ');
printTrgmColor(&buf, arc->ctrgm.colors[1]);
appendStringInfoChar(&buf, ' ');
printTrgmColor(&buf, arc->ctrgm.colors[2]);
appendStringInfoString(&buf, "\"];\n");
}
}
if (initstate)
{
appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
}
appendStringInfoString(&buf, "}\n");
{
/* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
FILE *fp = fopen("/tmp/transformed.gv", "w");
fprintf(fp, "%s", buf.data);
fclose(fp);
}
pfree(buf.data);
}
/*
* Print a TrgmColor readably.
*/
static void
printTrgmColor(StringInfo buf, TrgmColor co)
{
if (co == COLOR_UNKNOWN)
appendStringInfoChar(buf, 'u');
else if (co == COLOR_BLANK)
appendStringInfoChar(buf, 'b');
else
appendStringInfo(buf, "%d", (int) co);
}
/*
* Print final packed representation of trigram-based expanded graph.
*/
static void
printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
{
StringInfoData buf;
trgm *p;
int i;
initStringInfo(&buf);
appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
for (i = 0; i < packedGraph->statesCount; i++)
{
TrgmPackedState *state = &packedGraph->states[i];
int j;
appendStringInfo(&buf, " s%d", i);
if (i == 1)
appendStringInfoString(&buf, " [shape = doublecircle]");
appendStringInfo(&buf, " [label = <s%d>];\n", i);
for (j = 0; j < state->arcsCount; j++)
{
TrgmPackedArc *arc = &state->arcs[j];
appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
i, arc->targetState, arc->colorTrgm);
}
}
appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
appendStringInfo(&buf, " initial -> s%d;\n", 0);
/* Print trigrams */
appendStringInfoString(&buf, " { rank = sink;\n");
appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
p = GETARR(trigrams);
for (i = 0; i < packedGraph->colorTrigramsCount; i++)
{
int count = packedGraph->colorTrigramGroups[i];
int j;
appendStringInfo(&buf, "<br/>Trigram %d: ", i);
for (j = 0; j < count; j++)
{
if (j > 0)
appendStringInfoString(&buf, ", ");
/*
* XXX This representation is nice only for all-ASCII trigrams.
*/
appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
p++;
}
}
appendStringInfoString(&buf, " >];\n");
appendStringInfoString(&buf, " }\n");
appendStringInfoString(&buf, "}\n");
{
/* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
FILE *fp = fopen("/tmp/packed.gv", "w");
fprintf(fp, "%s", buf.data);
fclose(fp);
}
pfree(buf.data);
}
#endif /* TRGM_REGEXP_DEBUG */