From e8c81e179e752b1f443b863a200c5c07477e09d3 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 5 Dec 2005 18:13:22 +0000 Subject: [PATCH] Improve word parser. - improve file and path recognition - fix misspelling - improve tag recognition --- contrib/tsearch2/wordparser/parser.c | 78 +++++++++++++++++++++------- contrib/tsearch2/wordparser/parser.h | 9 ++-- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c index 282acf6e36..deccdb284a 100644 --- a/contrib/tsearch2/wordparser/parser.c +++ b/contrib/tsearch2/wordparser/parser.c @@ -327,6 +327,7 @@ static TParserStateActionItem actionTPS_Base[] = { {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InPathFirst, 0, NULL}, {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL} }; @@ -336,6 +337,7 @@ static TParserStateActionItem actionTPS_InUWord[] = { {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, UWORD, NULL} }; @@ -343,8 +345,8 @@ static TParserStateActionItem actionTPS_InUWord[] = { static TParserStateActionItem actionTPS_InLatWord[] = { {p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL}, {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InFileFirst, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, @@ -366,7 +368,7 @@ static TParserStateActionItem actionTPS_InCyrWord[] = { 
static TParserStateActionItem actionTPS_InUnsignedInt[] = { {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}, {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL}, {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL}, {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL}, @@ -500,10 +502,19 @@ static TParserStateActionItem actionTPS_InTagFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL}, {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL}, + {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL}, {p_islatin, 0, A_PUSH, TPS_InTag, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; +static TParserStateActionItem actionTPS_InXMLBegin[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + /* */ + {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL}, + {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -583,30 +602,30 @@ static TParserStateActionItem actionTPS_InCommentEnd[] = { {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL} }; -static TParserStateActionItem actionTPS_InHostFirstDomen[] = { +static TParserStateActionItem actionTPS_InHostFirstDomain[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomenSecond, 0, NULL}, + {p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHostDomenSecond[] = { +static TParserStateActionItem actionTPS_InHostDomainSecond[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL}, + {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL}, + {p_iseqC, '.', A_PUSH, 
TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHostDomen[] = { +static TParserStateActionItem actionTPS_InHostDomain[] = { {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL}, + {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_isdigit, 0, A_POP, TPS_Null, 0, NULL}, {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURIStart, HOST, NULL}, @@ -640,7 +659,7 @@ static TParserStateActionItem actionTPS_InHost[] = { {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, - {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL}, + {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -652,14 +671,32 @@ static TParserStateActionItem actionTPS_InEmail[] = { static TParserStateActionItem actionTPS_InFileFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL}, - {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL}, - {p_iseqC, '.', A_CLEAR, TPS_InFile, 0, NULL}, - {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL}, + {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; +static TParserStateActionItem actionTPS_InPathFirst[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_islatin, 0, 
A_NEXT, TPS_InFile, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, + {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL}, + {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static TParserStateActionItem actionTPS_InPathSecond[] = { + {p_isEOF, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL}, + {p_iseqC, '/', A_NEXT|A_PUSH, TPS_InFileFirst, 0, NULL}, + {p_iseqC, '/', A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL}, + {p_isspace, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + static TParserStateActionItem actionTPS_InFile[] = { {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL}, {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, @@ -894,6 +931,7 @@ static const TParserStateAction Actions[] = { {TPS_InHTMLEntityNum, actionTPS_InHTMLEntityNum}, {TPS_InHTMLEntityEnd, actionTPS_InHTMLEntityEnd}, {TPS_InTagFirst, actionTPS_InTagFirst}, + {TPS_InXMLBegin, actionTPS_InXMLBegin}, {TPS_InTagCloseFirst, actionTPS_InTagCloseFirst}, {TPS_InTag, actionTPS_InTag}, {TPS_InTagEscapeK, actionTPS_InTagEscapeK}, @@ -906,15 +944,17 @@ static const TParserStateAction Actions[] = { {TPS_InCloseCommentFirst, actionTPS_InCloseCommentFirst}, {TPS_InCloseCommentLast, actionTPS_InCloseCommentLast}, {TPS_InCommentEnd, actionTPS_InCommentEnd}, - {TPS_InHostFirstDomen, actionTPS_InHostFirstDomen}, - {TPS_InHostDomenSecond, actionTPS_InHostDomenSecond}, - {TPS_InHostDomen, actionTPS_InHostDomen}, + {TPS_InHostFirstDomain, actionTPS_InHostFirstDomain}, + {TPS_InHostDomainSecond, actionTPS_InHostDomainSecond}, + {TPS_InHostDomain, actionTPS_InHostDomain}, {TPS_InPortFirst, actionTPS_InPortFirst}, {TPS_InPort, actionTPS_InPort}, {TPS_InHostFirstAN, actionTPS_InHostFirstAN}, {TPS_InHost, actionTPS_InHost}, {TPS_InEmail, actionTPS_InEmail}, {TPS_InFileFirst, actionTPS_InFileFirst}, + {TPS_InPathFirst, actionTPS_InPathFirst}, + {TPS_InPathSecond, actionTPS_InPathSecond}, 
{TPS_InFile, actionTPS_InFile}, {TPS_InFileNext, actionTPS_InFileNext}, {TPS_InURIFirst, actionTPS_InURIFirst}, diff --git a/contrib/tsearch2/wordparser/parser.h b/contrib/tsearch2/wordparser/parser.h index 83468d657f..9cdd141efd 100644 --- a/contrib/tsearch2/wordparser/parser.h +++ b/contrib/tsearch2/wordparser/parser.h @@ -30,6 +30,7 @@ typedef enum TPS_InHTMLEntityNum, TPS_InHTMLEntityEnd, TPS_InTagFirst, + TPS_InXMLBegin, TPS_InTagCloseFirst, TPS_InTag, TPS_InTagEscapeK, @@ -42,15 +43,17 @@ typedef enum TPS_InCloseCommentFirst, TPS_InCloseCommentLast, TPS_InCommentEnd, - TPS_InHostFirstDomen, - TPS_InHostDomenSecond, - TPS_InHostDomen, + TPS_InHostFirstDomain, + TPS_InHostDomainSecond, + TPS_InHostDomain, TPS_InPortFirst, TPS_InPort, TPS_InHostFirstAN, TPS_InHost, TPS_InEmail, TPS_InFileFirst, + TPS_InPathFirst, + TPS_InPathSecond, TPS_InFile, TPS_InFileNext, TPS_InURIFirst,