Reimplement text_position and related functions to use Boyer-Moore-Horspool

searching instead of naive matching. In the worst case this has the same O(M*N) complexity as the naive method (where M and N are the needle and haystack lengths), but the worst case is hard to hit, and the average case is very fast, especially with longer patterns. Credit: David Rowley
2008-09-07 04:20:00 +00:00 · 2008-09-07 04:20:00 +00:00 · e6a310b281
parent 2cf3f6694f
commit e6a310b281
1 changed files with 187 additions and 32 deletions
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.167 2008/05/27 00:13:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.168 2008/09/07 04:20:00 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -39,6 +39,9 @@ typedef struct
 	pg_wchar   *wstr2;			/* note: these are palloc'd */
 	int			len1;			/* string lengths in logical characters */
 	int			len2;
+	/* Skip table for Boyer-Moore-Horspool search algorithm: */
+	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
+	int			skiptable[256];	/* skip distance for given mismatched char */
 } TextPositionState;

 #define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
@ -753,7 +756,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
 		 * If we're working with an untoasted source, no need to do an extra
 		 * copying step.
 		 */
-		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || 
+		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
 			slice = DatumGetTextPSlice(str, slice_start, slice_size);
 		else
@ -866,6 +869,7 @@ text_position(text *t1, text *t2)
 	return result;
 }

+
 /*
 * text_position_setup, text_position_next, text_position_cleanup -
 *	Component steps of text_position()
@ -909,64 +913,215 @@ text_position_setup(text *t1, text *t2, TextPositionState *state)
 		state->len1 = len1;
 		state->len2 = len2;
 	}
+
+	/*
+	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
+	 * notes we use the terminology that the "haystack" is the string to be
+	 * searched (t1) and the "needle" is the pattern being sought (t2).
+	 *
+	 * If the needle is empty or bigger than the haystack then there is no
+	 * point in wasting cycles initializing the table.  We also choose not
+	 * to use B-M-H for needles of length 1, since the skip table can't
+	 * possibly save anything in that case.
+	 */
+	if (len1 >= len2 && len2 > 1)
+	{
+		int		searchlength = len1 - len2;
+		int     skiptablemask;
+		int     last;
+		int     i;
+
+		/*
+		 * First we must determine how much of the skip table to use.  The
+		 * declaration of TextPositionState allows up to 256 elements, but for
+		 * short search problems we don't really want to have to initialize so
+		 * many elements --- it would take too long in comparison to the
+		 * actual search time.  So we choose a useful skip table size based on
+		 * the haystack length minus the needle length.  The closer the needle
+		 * length is to the haystack length the less useful skipping becomes.
+		 *
+		 * Note: since we use bit-masking to select table elements, the skip
+		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
+		 */
+		if (searchlength < 16)
+			skiptablemask = 3;
+		else if (searchlength < 64)
+			skiptablemask = 7;
+		else if (searchlength < 128)
+			skiptablemask = 15;
+		else if (searchlength < 512)
+			skiptablemask = 31;
+		else if (searchlength < 2048)
+			skiptablemask = 63;
+		else if (searchlength < 4096)
+			skiptablemask = 127;
+		else
+			skiptablemask = 255;
+		state->skiptablemask = skiptablemask;
+
+		/*
+		 * Initialize the skip table.  We set all elements to the needle
+		 * length, since this is the correct skip distance for any character
+		 * not found in the needle.
+		 */
+		for (i = 0; i <= skiptablemask; i++)
+			state->skiptable[i] = len2;
+
+		/*
+		 * Now examine the needle.  For each character except the last one,
+		 * set the corresponding table element to the appropriate skip
+		 * distance.  Note that when two characters share the same skip table
+		 * entry, the one later in the needle must determine the skip distance.
+		 */
+		last = len2 - 1;
+
+		if (!state->use_wchar)
+		{
+			const char *str2 = state->str2;
+
+			for (i = 0; i < last; i++)
+				state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
+		}
+		else
+		{
+			const pg_wchar *wstr2 = state->wstr2;
+
+			for (i = 0; i < last; i++)
+				state->skiptable[wstr2[i] & skiptablemask] = last - i;
+		}
+	}
 }

 static int
 text_position_next(int start_pos, TextPositionState *state)
 {
-	int			pos = 0,
-				p,
-				px;
+	int			haystack_len = state->len1;	/* length of the string being searched */
+	int			needle_len = state->len2;	/* length of the pattern being sought */
+	int			skiptablemask = state->skiptablemask;	/* ANDed with a char to pick its skiptable slot */

 	Assert(start_pos > 0);		/* else caller error */

-	if (state->len2 <= 0)
+	if (needle_len <= 0)
 		return start_pos;		/* result for empty pattern */

+	start_pos--;				/* convert 1-based position to 0-based array index */
+
+	/* Done (0 = not found) if the needle can't fit in what remains of the haystack */
+	if (haystack_len < start_pos + needle_len)
+		return 0;
+
 	if (!state->use_wchar)
 	{
 		/* simple case - single byte encoding */
-		char	   *p1 = state->str1;
-		char	   *p2 = state->str2;
+		const char *haystack = state->str1;
+		const char *needle = state->str2;
+		const char *haystack_end = &haystack[haystack_len];
+		const char *hptr;

-		/* no use in searching str past point where search_str will fit */
-		px = (state->len1 - state->len2);
-
-		p1 += start_pos - 1;
-
-		for (p = start_pos - 1; p <= px; p++)
+		if (needle_len == 1)
 		{
-			if ((*p1 == *p2) && (strncmp(p1, p2, state->len2) == 0))
+			/* No point in using B-M-H for a one-character needle; scan linearly */
+			char	nchar = *needle;
+
+			hptr = &haystack[start_pos];
+			while (hptr < haystack_end)
 			{
-				pos = p + 1;
-				break;
+				if (*hptr == nchar)
+					return hptr - haystack + 1;
+				hptr++;
+			}
+		}
+		else
+		{
+			const char *needle_last = &needle[needle_len - 1];
+
+			/* Align the needle's last char with haystack[start_pos + needle_len - 1] */
+			hptr = &haystack[start_pos + needle_len - 1];
+			while (hptr < haystack_end)
+			{
+				/* Match the needle scanning *backward* from its last character */
+				const char *nptr;
+				const char *p;
+
+				nptr = needle_last;
+				p = hptr;
+				while (*nptr == *p)
+				{
+					/* Matched it all?  If so, return 1-based position */
+					if (nptr == needle)
+						return p - haystack + 1;
+					nptr--, p--;
+				}
+				/*
+				 * No match, so use the haystack char at hptr to decide how
+				 * far to advance.  If the needle had any occurrence of that
+				 * character (or more precisely, one sharing the same
+				 * skiptable entry) before its last character, then we advance
+				 * far enough to align the last such needle character with
+				 * that haystack position.  Otherwise we can advance by the
+				 * whole needle length.
+				 */
+				hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
 			}
-			p1++;
 		}
 	}
 	else
 	{
-		/* not as simple - multibyte encoding */
-		pg_wchar   *p1 = state->wstr1;
-		pg_wchar   *p2 = state->wstr2;
+		/* The multibyte char version, operating on pg_wchar arrays. This works exactly the same way. */
+		const pg_wchar *haystack = state->wstr1;
+		const pg_wchar *needle = state->wstr2;
+		const pg_wchar *haystack_end = &haystack[haystack_len];
+		const pg_wchar *hptr;

-		/* no use in searching str past point where search_str will fit */
-		px = (state->len1 - state->len2);
-
-		p1 += start_pos - 1;
-
-		for (p = start_pos - 1; p <= px; p++)
+		if (needle_len == 1)
 		{
-			if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, state->len2) == 0))
+			/* No point in using B-M-H for a one-character needle; scan linearly */
+			pg_wchar	nchar = *needle;
+
+			hptr = &haystack[start_pos];
+			while (hptr < haystack_end)
 			{
-				pos = p + 1;
-				break;
+				if (*hptr == nchar)
+					return hptr - haystack + 1;
+				hptr++;
+			}
+		}
+		else
+		{
+			const pg_wchar *needle_last = &needle[needle_len - 1];
+
+			/* Align the needle's last char with haystack[start_pos + needle_len - 1] */
+			hptr = &haystack[start_pos + needle_len - 1];
+			while (hptr < haystack_end)
+			{
+				/* Match the needle scanning *backward* from its last character */
+				const pg_wchar *nptr;
+				const pg_wchar *p;
+
+				nptr = needle_last;
+				p = hptr;
+				while (*nptr == *p)
+				{
+					/* Matched it all?  If so, return 1-based position */
+					if (nptr == needle)
+						return p - haystack + 1;
+					nptr--, p--;
+				}
+				/*
+				 * No match, so use the haystack char at hptr to decide how
+				 * far to advance.  If the needle had any occurrence of that
+				 * character (or more precisely, one sharing the same
+				 * skiptable entry) before its last character, then we advance
+				 * far enough to align the last such needle character with
+				 * that haystack position.  Otherwise we can advance by the
+				 * whole needle length.
+				 */
+				hptr += state->skiptable[*hptr & skiptablemask];
 			}
-			p1++;
 		}
 	}

-	return pos;
+	return 0;					/* no match at or after start_pos */
 }

 static void