diff --git a/contrib/tsearch/README.tsearch b/contrib/tsearch/README.tsearch index c63ae91edd..a57df55eea 100644 --- a/contrib/tsearch/README.tsearch +++ b/contrib/tsearch/README.tsearch @@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access. All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov (oleg@sai.msu.su). +CHANGES: + +August 13, 2002 + Use parser of OpenFTS v0.33. + IMPORTANT NOTICE: This is a first step of our work on integration of OpenFTS diff --git a/contrib/tsearch/deflex.h b/contrib/tsearch/deflex.h index f9d6847167..17c4fdf1ec 100644 --- a/contrib/tsearch/deflex.h +++ b/contrib/tsearch/deflex.h @@ -2,28 +2,33 @@ #define __DEFLEX_H__ /* rememder !!!! */ -#define LASTNUM 19 +#define LASTNUM 23 #define LATWORD 1 -#define NONLATINWORD 2 +#define CYRWORD 2 #define UWORD 3 #define EMAIL 4 #define FURL 5 #define HOST 6 -#define FLOAT 7 -#define FINT 8 -#define PARTWORD 9 -#define NONLATINPARTWORD 10 -#define LATPARTWORD 11 -#define SPACE 12 -#define SYMTAG 13 -#define HTTP 14 -#define DEFISWORD 15 -#define DEFISLATWORD 16 -#define DEFISNONLATINWORD 17 +#define SCIENTIFIC 7 +#define VERSIONNUMBER 8 +#define PARTHYPHENWORD 9 +#define CYRPARTHYPHENWORD 10 +#define LATPARTHYPHENWORD 11 +#define SPACE 12 +#define TAG 13 +#define HTTP 14 +#define HYPHENWORD 15 +#define LATHYPHENWORD 16 +#define CYRHYPHENWORD 17 #define URI 18 #define FILEPATH 19 +#define DECIMAL 20 +#define SIGNEDINT 21 +#define UNSIGNEDINT 22 +#define HTMLENTITY 23 extern const char *descr[]; #endif + diff --git a/contrib/tsearch/expected/tsearch.out b/contrib/tsearch/expected/tsearch.out index f75b429bcb..0b12765d8f 100644 --- a/contrib/tsearch/expected/tsearch.out +++ b/contrib/tsearch/expected/tsearch.out @@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)'; select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 wow < jqw <> qwerty'); - txt2txtidx ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32' + txt2txtidx +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32' (1 row) select txtidxsize(txt2txtidx('345 qw')); @@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e wow < jqw <> qwerty')); txtidxsize ------------ - 52 + 53 (1 row) insert into test_txtidx (a) values ('345 qwerty'); diff --git a/contrib/tsearch/morph.c b/contrib/tsearch/morph.c index 60797b07e9..b29a3f6779 100644 --- a/contrib/tsearch/morph.c +++ b/contrib/tsearch/morph.c @@ -75,19 +75,23 @@ static MAPDICT mapdict[] = { {NODICT, NODICT}, /* EMAIL */ {NODICT, NODICT}, /* FURL */ {NODICT, NODICT}, /* HOST */ - {NODICT, NODICT}, /* FLOAT */ - {NODICT, NODICT}, /* FINT */ - {BYLOCALE, DEFAULTDICT}, /* PARTWORD */ - {BYLOCALE, NODICT}, /* NONLATINPARTWORD */ - {DEFAULTDICT, NODICT}, /* LATPARTWORD */ + {NODICT, NODICT}, /* SCIENTIFIC */ + {NODICT, NODICT}, /* VERSIONNUMBER */ + {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */ + {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */ + {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */ {STOPLEXEM, NODICT}, /* SPACE */ - {STOPLEXEM, NODICT}, /* SYMTAG */ + {STOPLEXEM, NODICT}, /* TAG */ {STOPLEXEM, NODICT}, /* HTTP */ - {BYLOCALE, DEFAULTDICT}, /* DEFISWORD */ - {DEFAULTDICT, NODICT}, /* DEFISLATWORD */ - {BYLOCALE, NODICT}, /* DEFISNONLATINWORD */ + {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */ + {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */ + {BYLOCALE, NODICT}, /* CYRHYPHENWORD */ {NODICT, NODICT}, /* URI */ - {NODICT, NODICT} /* FILEPATH */ + {NODICT, NODICT}, /* FILEPATH */ + {NODICT, NODICT}, /* DECIMAL */ + {NODICT, NODICT}, /* SIGNEDINT */ + {NODICT, NODICT}, /* UNSIGNEDINT */ + {STOPLEXEM, NODICT} /* HTMLENTITY */ }; static bool inited = false; diff --git a/contrib/tsearch/parser.l b/contrib/tsearch/parser.l index 6081fd4c7b..f30fbcd4f4 100644 --- a/contrib/tsearch/parser.l +++ b/contrib/tsearch/parser.l @@ -5,18 +5,17 @@ /* postgres allocation function */ #include "postgres.h" -#define free pfree -#define malloc palloc +#define free pfree +#define malloc palloc #define realloc repalloc #ifdef strdup #undef strdup #endif -#define strdup pstrdup - +#define strdup pstrdup char *token = NULL; /* pointer to token */ -char *s = NULL; /* for returning full defis-word */ +char *s = NULL; /* to return WHOLE hyphenated-word */ YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ @@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */ %option nounput %option noyywrap - -/* parser's state for parsing defis-word */ +/* parser's state for parsing hyphenated-word */ %x DELIM /* parser's state for parsing URL*/ %x URL %x SERVER -/* parser's state for parsing filepath */ - +/* parser's state for parsing TAGS */ %x INTAG %x QINTAG +%x INCOMMENT +%x INSCRIPT -/* NONLATIN char */ -NONLATINALNUM [0-9\200-\377] -NONLATINALPHA [\200-\377] +/* cyrillic koi8 char */ +CYRALNUM [0-9\200-\377] +CYRALPHA [\200-\377] ALPHA [a-zA-Z\200-\377] ALNUM [0-9a-zA-Z\200-\377] @@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+ %% -"<"[[:alpha:]] { BEGIN INTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } +"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; } -"" { - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; -} - -"<"[^>[:alpha:]] { +"" { + BEGIN INITIAL; + *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; token = tsearch_yytext; tokenlen = tsearch_yyleng; return SPACE; } -"\"" { BEGIN QINTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } +"" { + BEGIN INITIAL; + *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; token = tsearch_yytext; tokenlen = tsearch_yyleng; - return SYMTAG; + return SPACE; } -"\"" { BEGIN INTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } -.|\n { +"<"[\![:alpha:]] { BEGIN INTAG; } + +""\"" { BEGIN QINTAG; } + +"\\\"" ; + +"\"" { BEGIN INTAG; } + +">" { + BEGIN INITIAL; token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; + *tsearch_yytext=' '; + token = tsearch_yytext; + tokenlen = 1; + return TAG; } -">" { BEGIN INITIAL; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } +.|\n ; -.|\n { +\&(quot|amp|nbsp|lt|gt)\; { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return SYMTAG; + return HTMLENTITY; } +\&\#[0-9][0-9]?[0-9]?\; { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return HTMLENTITY; +} [-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ { token = tsearch_yytext; @@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+ return EMAIL; } -[0-9] /* digit's and point (might be a version) */ { +[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FINT; + return SCIENTIFIC; } -[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ { - token = tsearch_yytext; +[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { + token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FINT; + return VERSIONNUMBER; } -[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ { +[+-]?[0-9]+\.[0-9]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return DECIMAL; +} + +[+-][0-9]+ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FLOAT; + return SIGNEDINT; +} + +[0-9]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return UNSIGNEDINT; } http"://" { @@ -208,52 +212,58 @@ ftp"://" { return FILEPATH; } -({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ { +({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } s = strdup( tsearch_yytext ); tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISNONLATINWORD; + return CYRHYPHENWORD; } -([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ { +([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } - tokenlen = tsearch_yyleng; s = strdup( tsearch_yytext ); + tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISLATWORD; + return LATHYPHENWORD; } -({ALNUM}+-)+{ALPHA}+ /* composite-word */ { +({ALNUM}+-)+{ALNUM}+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } s = strdup( tsearch_yytext ); tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISWORD; + return HYPHENWORD; } -{NONLATINALNUM}+ /* one word in composite-word */ { - token = tsearch_yytext; +\+?[0-9]+\.[0-9]+ { + token = tsearch_yytext; tokenlen = tsearch_yyleng; - return NONLATINPARTWORD; + return DECIMAL; } -[[:alnum:]]+ /* one word in composite-word */ { +{CYRALPHA}+ /* one word in composite-word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return LATPARTWORD; + return CYRPARTHYPHENWORD; +} + +[[:alpha:]]+ /* one word in composite-word */ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return LATPARTHYPHENWORD; } {ALNUM}+ /* one word in composite-word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return PARTWORD; + return PARTHYPHENWORD; } - { @@ -264,17 +274,16 @@ ftp"://" { .|\n /* return in basic state */ { BEGIN INITIAL; - tokenlen = tsearch_yyleng; yyless( 0 ); } -{NONLATINALNUM}+ /* normal word */ { +{CYRALPHA}+ /* normal word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return NONLATINWORD; + return CYRWORD; } -[[:alnum:]]+ /* normal word */ { +[[:alpha:]]+ /* normal word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; return LATWORD; @@ -286,7 +295,13 @@ ftp"://" { return UWORD; } -.|\n { +[ \r\n\t]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return SPACE; +} + +. { token = tsearch_yytext; tokenlen = tsearch_yyleng; return SPACE;