From 73e6f9d3b61995525785b2f4490b465fe860196b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 27 Oct 2007 19:03:45 +0000 Subject: [PATCH] Change text search parsing rules for hyphenated words so that digit strings containing decimal points aren't considered part of a hyphenated word. Sync the hyphenated-word lookahead states with the subsequent part-by-part reparsing states so that we don't get different answers about how much text is part of the hyphenated word. Per my gripe of a few days ago. --- src/backend/tsearch/wparser_def.c | 83 +++++---------------------- src/test/regress/expected/tsearch.out | 20 +++---- 2 files changed, 21 insertions(+), 82 deletions(-) diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 7fa0f435b2..086ac95155 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -181,19 +181,13 @@ typedef enum TPS_InHyphenWord, TPS_InHyphenNumWordFirst, TPS_InHyphenNumWord, - TPS_InHyphenValueFirst, - TPS_InHyphenValue, - TPS_InHyphenValueExact, + TPS_InHyphenDigitLookahead, TPS_InParseHyphen, TPS_InParseHyphenHyphen, TPS_InHyphenWordPart, TPS_InHyphenAsciiWordPart, TPS_InHyphenNumWordPart, TPS_InHyphenUnsignedInt, - TPS_InHDecimalPartFirst, - TPS_InHDecimalPart, - TPS_InHVersionPartFirst, - TPS_InHVersionPart, TPS_Null /* last state (fake value) */ } TParserState; @@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = { static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = { static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = { {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} }; -static const TParserStateActionItem actionTPS_InHyphenValueFirst[] = { +static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL}, - {NULL, 0, A_POP, TPS_Null, 0, NULL} -}; - -static const TParserStateActionItem actionTPS_InHyphenValue[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} -}; - -static const TParserStateActionItem actionTPS_InHyphenValueExact[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} + {NULL, 0, A_POP, TPS_Null, 0, NULL} }; static const TParserStateActionItem actionTPS_InParseHyphen[] = { {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, + {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL}, {NULL, 0, A_RERUN, TPS_Base, 0, NULL} }; @@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = { }; static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, - {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL} -}; - -static const TParserStateActionItem actionTPS_InHDecimalPartFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHDecimalPart[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL} -}; - -static const TParserStateActionItem actionTPS_InHVersionPartFirst[] = { - {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL}, - {NULL, 0, A_POP, TPS_Null, 0, NULL} -}; - -static const TParserStateActionItem actionTPS_InHVersionPart[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL} -}; - /* * main table of per-state parser actions @@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = { TPARSERSTATEACTION(TPS_InHyphenWord), TPARSERSTATEACTION(TPS_InHyphenNumWordFirst), TPARSERSTATEACTION(TPS_InHyphenNumWord), - TPARSERSTATEACTION(TPS_InHyphenValueFirst), - TPARSERSTATEACTION(TPS_InHyphenValue), - TPARSERSTATEACTION(TPS_InHyphenValueExact), + TPARSERSTATEACTION(TPS_InHyphenDigitLookahead), TPARSERSTATEACTION(TPS_InParseHyphen), TPARSERSTATEACTION(TPS_InParseHyphenHyphen), TPARSERSTATEACTION(TPS_InHyphenWordPart), TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart), TPARSERSTATEACTION(TPS_InHyphenNumWordPart), - TPARSERSTATEACTION(TPS_InHyphenUnsignedInt), - TPARSERSTATEACTION(TPS_InHDecimalPartFirst), - TPARSERSTATEACTION(TPS_InHDecimalPart), - TPARSERSTATEACTION(TPS_InHVersionPartFirst), - TPARSERSTATEACTION(TPS_InHVersionPart) + TPARSERSTATEACTION(TPS_InHyphenUnsignedInt) }; diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 9de7959134..b6f8f05d22 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w 12 | . 20 | 4.2 12 | , - 15 | readline-4.2 - 11 | readline - 12 | - - 20 | 4.2 + 1 | readline + 20 | -4.2 12 | - 15 | readline-4.2 - 11 | readline - 12 | - - 20 | 4.2 + 1 | readline + 20 | -4.2 12 | . 22 | 234 12 | @@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w 12 | 12 | <> 1 | qwerty -(135 rows) +(131 rows) SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 wow < jqw <> qwerty'); - to_tsvector ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 + to_tsvector +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 (1 row) SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2