August 13, 2002

Use parser of OpenFTS v0.33.

--
Teodor Sigaev
This commit is contained in:
Bruce Momjian 2002-08-15 03:02:08 +00:00
parent 1276356268
commit 2860041bf0
5 changed files with 133 additions and 104 deletions

View File

@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su).
CHANGES:
August 13, 2002
Use parser of OpenFTS v0.33.
IMPORTANT NOTICE:
This is a first step of our work on integration of OpenFTS

View File

@ -2,28 +2,33 @@
#define __DEFLEX_H__
/* remember !!!! */
#define LASTNUM 19
#define LASTNUM 23
#define LATWORD 1
#define NONLATINWORD 2
#define CYRWORD 2
#define UWORD 3
#define EMAIL 4
#define FURL 5
#define HOST 6
#define FLOAT 7
#define FINT 8
#define PARTWORD 9
#define NONLATINPARTWORD 10
#define LATPARTWORD 11
#define SPACE 12
#define SYMTAG 13
#define HTTP 14
#define DEFISWORD 15
#define DEFISLATWORD 16
#define DEFISNONLATINWORD 17
#define SCIENTIFIC 7
#define VERSIONNUMBER 8
#define PARTHYPHENWORD 9
#define CYRPARTHYPHENWORD 10
#define LATPARTHYPHENWORD 11
#define SPACE 12
#define TAG 13
#define HTTP 14
#define HYPHENWORD 15
#define LATHYPHENWORD 16
#define CYRHYPHENWORD 17
#define URI 18
#define FILEPATH 19
#define DECIMAL 20
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define HTMLENTITY 23
extern const char *descr[];
#endif

View File

@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
txt2txtidx

'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
txt2txtidx

'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
(1 row)
select txtidxsize(txt2txtidx('345 qw'));
@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e
<i <b> wow < jqw <> qwerty'));
txtidxsize
------------
52
53
(1 row)
insert into test_txtidx (a) values ('345 qwerty');

View File

@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
{NODICT, NODICT}, /* FLOAT */
{NODICT, NODICT}, /* FINT */
{BYLOCALE, DEFAULTDICT}, /* PARTWORD */
{BYLOCALE, NODICT}, /* NONLATINPARTWORD */
{DEFAULTDICT, NODICT}, /* LATPARTWORD */
{NODICT, NODICT}, /* SCIENTIFIC */
{NODICT, NODICT}, /* VERSIONNUMBER */
{BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
{BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
{DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
{STOPLEXEM, NODICT}, /* SPACE */
{STOPLEXEM, NODICT}, /* SYMTAG */
{STOPLEXEM, NODICT}, /* TAG */
{STOPLEXEM, NODICT}, /* HTTP */
{BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
{DEFAULTDICT, NODICT}, /* DEFISLATWORD */
{BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
{BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
{DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
{BYLOCALE, NODICT}, /* CYRHYPHENWORD */
{NODICT, NODICT}, /* URI */
{NODICT, NODICT} /* FILEPATH */
{NODICT, NODICT}, /* FILEPATH */
{NODICT, NODICT}, /* DECIMAL */
{NODICT, NODICT}, /* SIGNEDINT */
{NODICT, NODICT}, /* UNSIGNEDINT */
{STOPLEXEM, NODICT} /* HTMLENTITY */
};
static bool inited = false;

View File

@ -5,18 +5,17 @@
/* postgres allocation function */
#include "postgres.h"
#define free pfree
#define malloc palloc
#define free pfree
#define malloc palloc
#define realloc repalloc
#ifdef strdup
#undef strdup
#endif
#define strdup pstrdup
#define strdup pstrdup
char *token = NULL; /* pointer to token */
char *s = NULL; /* for returning full defis-word */
char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */
%option nounput
%option noyywrap
/* parser's state for parsing defis-word */
/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL */
%x URL
%x SERVER
/* parser's state for parsing filepath */
/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT
/* NONLATIN char */
NONLATINALNUM [0-9\200-\377]
NONLATINALPHA [\200-\377]
/* cyrillic koi8 char */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[[:alpha:]] { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
"</"[[:alpha:]] { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<>" {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<"[^>[:alpha:]] {
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
<INTAG>"\"" { BEGIN QINTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<!--" { BEGIN INCOMMENT; }
<QINTAG>"\\\"" {
<INCOMMENT>"-->" {
BEGIN INITIAL;
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
return SPACE;
}
<QINTAG>"\"" { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>.|\n {
"<"[\![:alpha:]] { BEGIN INTAG; }
"</"[[:alpha:]] { BEGIN INTAG; }
<INTAG>"\"" { BEGIN QINTAG; }
<QINTAG>"\\\"" ;
<QINTAG>"\"" { BEGIN INTAG; }
<INTAG>">" {
BEGIN INITIAL;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
*tsearch_yytext=' ';
token = tsearch_yytext;
tokenlen = 1;
return TAG;
}
<INTAG>">" { BEGIN INITIAL;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
<INTAG>.|\n {
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
return HTMLENTITY;
}
\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return HTMLENTITY;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch_yytext;
@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+
return EMAIL;
}
<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ {
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FINT;
return SCIENTIFIC;
}
<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ {
token = tsearch_yytext;
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FINT;
return VERSIONNUMBER;
}
[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ {
[+-]?[0-9]+\.[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return DECIMAL;
}
[+-][0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FLOAT;
return SIGNEDINT;
}
<DELIM,INITIAL>[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return UNSIGNEDINT;
}
http"://" {
@ -208,52 +212,58 @@ ftp"://" {
return FILEPATH;
}
({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ {
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return DEFISNONLATINWORD;
return CYRHYPHENWORD;
}
([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ {
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
tokenlen = tsearch_yyleng;
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return DEFISLATWORD;
return LATHYPHENWORD;
}
({ALNUM}+-)+{ALPHA}+ /* composite-word */ {
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return DEFISWORD;
return HYPHENWORD;
}
<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
<DELIM>\+?[0-9]+\.[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return NONLATINPARTWORD;
return DECIMAL;
}
<DELIM>[[:alnum:]]+ /* one word in composite-word */ {
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATPARTWORD;
return CYRPARTHYPHENWORD;
}
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return PARTWORD;
return PARTHYPHENWORD;
}
<DELIM>- {
@ -264,17 +274,16 @@ ftp"://" {
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
tokenlen = tsearch_yyleng;
yyless( 0 );
}
{NONLATINALNUM}+ /* normal word */ {
{CYRALPHA}+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return NONLATINWORD;
return CYRWORD;
}
[[:alnum:]]+ /* normal word */ {
[[:alpha:]]+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATWORD;
@ -286,7 +295,13 @@ ftp"://" {
return UWORD;
}
.|\n {
[ \r\n\t]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
. {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;