Change text search parsing rules for hyphenated words so that digit strings
containing decimal points aren't considered part of a hyphenated word.
Sync the hyphenated-word lookahead states with the subsequent part-by-part
reparsing states so that we don't get different answers about how much text
is part of the hyphenated word.  Per my gripe of a few days ago.
This commit is contained in:
Tom Lane 2007-10-27 19:03:45 +00:00
parent 1aaf39bd20
commit 73e6f9d3b6
2 changed files with 21 additions and 82 deletions

View File

@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -181,19 +181,13 @@ typedef enum
TPS_InHyphenWord,
TPS_InHyphenNumWordFirst,
TPS_InHyphenNumWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InHyphenDigitLookahead,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenWordPart,
TPS_InHyphenAsciiWordPart,
TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = {
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static const TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InHyphenValue[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static const TParserStateActionItem actionTPS_InHyphenValueExact[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
};
@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
};
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
static const TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InHDecimalPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
};
static const TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InHVersionPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
};
/*
* main table of per-state parser actions
@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = {
TPARSERSTATEACTION(TPS_InHyphenWord),
TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
TPARSERSTATEACTION(TPS_InHyphenNumWord),
TPARSERSTATEACTION(TPS_InHyphenValueFirst),
TPARSERSTATEACTION(TPS_InHyphenValue),
TPARSERSTATEACTION(TPS_InHyphenValueExact),
TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
TPARSERSTATEACTION(TPS_InParseHyphen),
TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
TPARSERSTATEACTION(TPS_InHyphenWordPart),
TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt),
TPARSERSTATEACTION(TPS_InHDecimalPartFirst),
TPARSERSTATEACTION(TPS_InHDecimalPart),
TPARSERSTATEACTION(TPS_InHVersionPartFirst),
TPARSERSTATEACTION(TPS_InHVersionPart)
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
};

View File

@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
12 | .
20 | 4.2
12 | ,
15 | readline-4.2
11 | readline
12 | -
20 | 4.2
1 | readline
20 | -4.2
12 |
15 | readline-4.2
11 | readline
12 | -
20 | 4.2
1 | readline
20 | -4.2
12 | .
22 | 234
12 |
@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
12 |
12 | <>
1 | qwerty
(135 rows)
(131 rows)
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
to_tsvector
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
to_tsvector
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
(1 row)
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">