Improve performance of pullback/pushfwd in regular-expression compiler.

The previous coding would create a new intermediate state every time it wanted to interchange the ordering of two constraint arcs. Certain regex features such as \Y can generate large numbers of parallel constraint arcs, and if we needed to reorder the results of that, we created unreasonable numbers of intermediate states. To improve matters, keep a list of already-created intermediate states associated with the state currently being considered by the outer loop; we can re-use such states to place all the new arcs leading to the same destination or source. I also took the trouble to redefine push() and pull() to have a less risky API: they no longer delete any state or arc that the caller might possibly have a pointer to, except for the specifically-passed constraint arc. This reduces the risk of re-introducing the same type of error seen in the failed patch for CVE-2007-4772. Back-patch to all supported branches.
2015-10-16 15:11:49 -04:00 · 2015-10-16 15:11:49 -04:00 · 6a7153661d
parent f5b7d103bc
commit 6a7153661d
2 changed files with 119 additions and 35 deletions
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@ -1513,6 +1513,7 @@ pullback(struct nfa * nfa,
 	struct state *nexts;
 	struct arc *a;
 	struct arc *nexta;
+	struct state *intermediates;
 	int			progress;

 	/* find and pull until there are no more */
@ -1522,14 +1523,25 @@ pullback(struct nfa * nfa,
 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
 		{
 			nexts = s->next;
+			intermediates = NULL;
 			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
 			{
 				nexta = a->outchain;
 				if (a->type == '^' || a->type == BEHIND)
-					if (pull(nfa, a))
+					if (pull(nfa, a, &intermediates))
 						progress = 1;
-				assert(nexta == NULL || s->no != FREESTATE);
 			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
 		}
 		if (progress && f != NULL)
 			dumpnfa(nfa, f);
@ -1557,13 +1569,26 @@ pullback(struct nfa * nfa,

 /*
 * pull - pull a back constraint backward past its source state
- * A significant property of this function is that it deletes at most
- * one state -- the constraint's from state -- and only if the constraint
- * was that state's last outarc.
+ *
+ * Returns 1 if successful (which it always is unless the source is the
+ * start state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no outarcs of the constraint's from state other than the given
+ * constraint arc.  This makes the loops in pullback() safe, at the cost that
+ * we may leave useless states behind.  Therefore, we leave it to pullback()
+ * to delete such states.
+ *
+ * If the from state has multiple back-constraint outarcs, and/or multiple
+ * compatible constraint inarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states.  *intermediates
+ * points to a list of such intermediate states for this from state (chained
+ * through their tmp fields).
 */
-static int						/* 0 couldn't, 1 could */
+static int
 pull(struct nfa * nfa,
-	 struct arc * con)
+	 struct arc * con,
+	 struct state ** intermediates)
 {
 	struct state *from = con->from;
 	struct state *to = con->to;
@ -1580,7 +1605,11 @@ pull(struct nfa * nfa,
 		return 1;
 	}

-	/* first, clone from state if necessary to avoid other outarcs */
+	/*
+	 * First, clone from state if necessary to avoid other outarcs.  This may
+	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
+	 * clone state again at the bottom.
+	 */
 	if (from->nouts > 1)
 	{
 		s = newstate(nfa);
@ -1589,13 +1618,15 @@ pull(struct nfa * nfa,
 		copyins(nfa, from, s, 1);		/* duplicate inarcs */
 		cparc(nfa, con, s, to); /* move constraint arc */
 		freearc(nfa, con);
+		if (NISERR())
+			return 0;
 		from = s;
 		con = from->outs;
 	}
 	assert(from->nouts == 1);

 	/* propagate the constraint into the from state's inarcs */
-	for (a = from->ins; a != NULL; a = nexta)
+	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
 	{
 		nexta = a->inchain;
 		switch (combine(con, a))
@ -1606,13 +1637,23 @@ pull(struct nfa * nfa,
 			case SATISFIED:		/* no action needed */
 				break;
 			case COMPATIBLE:	/* swap the two arcs, more or less */
-				s = newstate(nfa);
-				if (NISERR())
-					return 0;
-				cparc(nfa, a, s, to);	/* anticipate move */
+				/* need an intermediate state, but might have one already */
+				for (s = *intermediates; s != NULL; s = s->tmp)
+				{
+					assert(s->nins > 0 && s->nouts > 0);
+					if (s->ins->from == a->from && s->outs->to == to)
+						break;
+				}
+				if (s == NULL)
+				{
+					s = newstate(nfa);
+					if (NISERR())
+						return 0;
+					s->tmp = *intermediates;
+					*intermediates = s;
+				}
 				cparc(nfa, con, a->from, s);
-				if (NISERR())
-					return 0;
+				cparc(nfa, a, s, to);
 				freearc(nfa, a);
 				break;
 			default:
@ -1623,7 +1664,8 @@ pull(struct nfa * nfa,

 	/* remaining inarcs, if any, incorporate the constraint */
 	moveins(nfa, from, to);
-	dropstate(nfa, from);		/* will free the constraint */
+	freearc(nfa, con);
+	/* from state is now useless, but we leave it to pullback() to clean up */
 	return 1;
 }

@ -1638,6 +1680,7 @@ pushfwd(struct nfa * nfa,
 	struct state *nexts;
 	struct arc *a;
 	struct arc *nexta;
+	struct state *intermediates;
 	int			progress;

 	/* find and push until there are no more */
@ -1647,14 +1690,25 @@ pushfwd(struct nfa * nfa,
 		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
 		{
 			nexts = s->next;
+			intermediates = NULL;
 			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
 			{
 				nexta = a->inchain;
 				if (a->type == '$' || a->type == AHEAD)
-					if (push(nfa, a))
+					if (push(nfa, a, &intermediates))
 						progress = 1;
-				assert(nexta == NULL || s->no != FREESTATE);
 			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
 		}
 		if (progress && f != NULL)
 			dumpnfa(nfa, f);
@ -1682,13 +1736,26 @@ pushfwd(struct nfa * nfa,

 /*
 * push - push a forward constraint forward past its destination state
- * A significant property of this function is that it deletes at most
- * one state -- the constraint's to state -- and only if the constraint
- * was that state's last inarc.
+ *
+ * Returns 1 if successful (which it always is unless the destination is the
+ * post state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no inarcs of the constraint's to state other than the given
+ * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
+ * we may leave useless states behind.  Therefore, we leave it to pushfwd()
+ * to delete such states.
+ *
+ * If the to state has multiple forward-constraint inarcs, and/or multiple
+ * compatible constraint outarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states.  *intermediates
+ * points to a list of such intermediate states for this to state (chained
+ * through their tmp fields).
 */
-static int						/* 0 couldn't, 1 could */
+static int
 push(struct nfa * nfa,
-	 struct arc * con)
+	 struct arc * con,
+	 struct state ** intermediates)
 {
 	struct state *from = con->from;
 	struct state *to = con->to;
@ -1705,22 +1772,28 @@ push(struct nfa * nfa,
 		return 1;
 	}

-	/* first, clone to state if necessary to avoid other inarcs */
+	/*
+	 * First, clone to state if necessary to avoid other inarcs.  This may
+	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
+	 * clone state again at the bottom.
+	 */
 	if (to->nins > 1)
 	{
 		s = newstate(nfa);
 		if (NISERR())
 			return 0;
 		copyouts(nfa, to, s, 1);	/* duplicate outarcs */
-		cparc(nfa, con, from, s);		/* move constraint */
+		cparc(nfa, con, from, s);		/* move constraint arc */
 		freearc(nfa, con);
+		if (NISERR())
+			return 0;
 		to = s;
 		con = to->ins;
 	}
 	assert(to->nins == 1);

 	/* propagate the constraint into the to state's outarcs */
-	for (a = to->outs; a != NULL; a = nexta)
+	for (a = to->outs; a != NULL && !NISERR(); a = nexta)
 	{
 		nexta = a->outchain;
 		switch (combine(con, a))
@ -1731,13 +1804,23 @@ push(struct nfa * nfa,
 			case SATISFIED:		/* no action needed */
 				break;
 			case COMPATIBLE:	/* swap the two arcs, more or less */
-				s = newstate(nfa);
-				if (NISERR())
-					return 0;
-				cparc(nfa, con, s, a->to);		/* anticipate move */
+				/* need an intermediate state, but might have one already */
+				for (s = *intermediates; s != NULL; s = s->tmp)
+				{
+					assert(s->nins > 0 && s->nouts > 0);
+					if (s->ins->from == from && s->outs->to == a->to)
+						break;
+				}
+				if (s == NULL)
+				{
+					s = newstate(nfa);
+					if (NISERR())
+						return 0;
+					s->tmp = *intermediates;
+					*intermediates = s;
+				}
+				cparc(nfa, con, s, a->to);
 				cparc(nfa, a, from, s);
-				if (NISERR())
-					return 0;
 				freearc(nfa, a);
 				break;
 			default:
@ -1748,7 +1831,8 @@ push(struct nfa * nfa,

 	/* remaining outarcs, if any, incorporate the constraint */
 	moveouts(nfa, to, from);
-	dropstate(nfa, to);			/* will free the constraint */
+	freearc(nfa, con);
+	/* to state is now useless, but we leave it to pushfwd() to clean up */
 	return 1;
 }

--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@ -149,9 +149,9 @@ static void cleartraverse(struct nfa *, struct state *);
 static void specialcolors(struct nfa *);
 static long optimize(struct nfa *, FILE *);
 static void pullback(struct nfa *, FILE *);
-static int	pull(struct nfa *, struct arc *);
+static int	pull(struct nfa *, struct arc *, struct state **);
 static void pushfwd(struct nfa *, FILE *);
-static int	push(struct nfa *, struct arc *);
+static int	push(struct nfa *, struct arc *, struct state **);

 #define INCOMPATIBLE	1		/* destroys arc */
 #define SATISFIED	2			/* constraint satisfied */