%{
#include "postgres.h"

#include "deflex.h"
#include "parser.h"
#include "common.h"

/*
 * Avoid exit() on fatal scanner errors: every three-argument fprintf() call
 * that follows in this translation unit becomes ts_error(ERROR, fmt, msg);
 * the stream argument is dropped.
 */
#undef fprintf
#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)

char *token = NULL;		/* pointer to the text of the token being returned */
int tokenlen;			/* length of that token in bytes */
static char *s = NULL;	/* private copy used to return a WHOLE multi-part
						 * match (hyphenated word, host + URI) */

YY_BUFFER_STATE buf = NULL;	/* buffer being parsed; needed when parsing
								 * from a string */

/*
 * Growable buffer in which the text of one tag is accumulated.
 */
typedef struct {
	int tlen;		/* allocated size of str, in bytes */
	int clen;		/* bytes currently stored, not counting the final '\0' */
	char *str;		/* malloc'ed storage; NULL until the first startTag() */
} TagStorage;

static TagStorage ts={0,0,NULL};

/*
 * Append the current match (tsearch2_yytext, tsearch2_yyleng bytes) to the
 * tag buffer and keep it NUL-terminated.
 *
 * The buffer is grown by doubling until the new text plus terminator fits.
 * The result of realloc() is held in a temporary so that, when allocation
 * fails, ts.str and ts.tlen still describe the old, valid buffer instead of
 * leaking it and recording a size that was never allocated.
 */
static void
addTag(void)
{
	int		need = ts.clen + tsearch2_yyleng + 1;

	if (need > ts.tlen)
	{
		/* starting from at least 1 keeps the doubling loop finite */
		int		newlen = (ts.tlen > 0) ? ts.tlen : 1;
		char   *grown;

		while (newlen < need)
			newlen *= 2;

		grown = realloc(ts.str, newlen);
		if (!grown)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		ts.str = grown;
		ts.tlen = newlen;
	}

	memcpy(ts.str + ts.clen, tsearch2_yytext, tsearch2_yyleng);
	ts.clen += tsearch2_yyleng;
	ts.str[ts.clen] = '\0';
}

/*
 * Begin accumulating a new tag: allocate the buffer on first use, reset it
 * to the empty string, then append the current match.  The buffer is reused
 * from one tag to the next.
 */
static void
startTag(void)
{
	if (ts.str == NULL)
	{
		ts.tlen = tsearch2_yyleng + 1;
		ts.str = malloc(ts.tlen);
		if (!ts.str)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	ts.clen = 0;
	ts.str[0] = '\0';
	addTag();
}

/*
 * Replace the saved copy in `s` with a duplicate of the current match.
 *
 * Must be called before yyless(0), while tsearch2_yytext still holds the
 * whole match.  A failed strdup() is reported like the other allocation
 * failures in this file instead of handing a NULL token to the caller.
 */
static void
saveMatch(void)
{
	free(s);					/* free(NULL) is a no-op */
	s = strdup(tsearch2_yytext);
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
}

%}

%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap

/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL */
%x URL
%x SERVER

/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT

/*
 * NOTE(review): the start conditions above are exclusive and are entered with
 * BEGIN in the rules below, yet no rule visible here carries a state prefix,
 * and two patterns are the empty string.  It looks as if the state prefixes
 * (and parts of some patterns) were lost before the file reached this form.
 * Confirm against a pristine copy before building; the patterns are kept
 * exactly as found.
 */

/* cyrillic koi8 char */
CYRALNUM	[0-9\200-\377]
CYRALPHA	[\200-\377]
ALPHA		[a-zA-Z\200-\377]
ALNUM		[0-9a-zA-Z\200-\377]

HOSTNAME	([-_[:alnum:]]+\.)+[[:alpha:]]+
URI		[-_[:alnum:]/%,\.;=&?#]+

%%

	/* Tags: text is collected in ts and returned as one TAG token. */

"<"[Ss][Cc][Rr][Ii][Pp][Tt] {
	BEGIN INSCRIPT;
	startTag();
}

"" {
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

"" {
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

"<"[\![:alpha:]] {
	BEGIN INTAG;
	startTag();
}

""\"" {
	BEGIN QINTAG;
	addTag();
}

"\\\"" {
	addTag();
}

"\"" {
	BEGIN INTAG;
	addTag();
}

">" {
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

.|\n {
	addTag();
}

	/* Named and numeric (one to three digits) HTML entities. */

\&(quot|amp|nbsp|lt|gt)\; {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

\&\#[0-9][0-9]?[0-9]?\; {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

[-_\.[:alnum:]]+@{HOSTNAME}	/* Emails */	{
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return EMAIL;
}

	/* Numbers. */

[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+	/* float */	{
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return DECIMAL;
}

[+-][0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SIGNEDINT;
}

[0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return UNSIGNEDINT;
}

	/* URLs, hosts and paths.  Both scheme prefixes are returned as HTTP. */

http"://" {
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

ftp"://" {
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

{HOSTNAME}[/:]{URI} {
	BEGIN SERVER;
	saveMatch();					/* copy before yyless() rewinds */
	tokenlen = tsearch2_yyleng;		/* read before yyless() changes it */
	yyless( 0 );					/* push the text back to be rescanned */
	token = s;
	return FURL;
}

{HOSTNAME} {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HOST;
}

[/:]{URI} {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return URI;
}

[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return FILEPATH;
}

	/*
	 * Hyphenated words: the whole word is returned from a saved copy, then
	 * the same text is pushed back with yyless(0) and rescanned after
	 * BEGIN DELIM.
	 */

({CYRALPHA}+-)+{CYRALPHA}+	/* composite-word */	{
	BEGIN DELIM;
	saveMatch();
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return CYRHYPHENWORD;
}

([[:alpha:]]+-)+[[:alpha:]]+	/* composite-word */	{
	BEGIN DELIM;
	saveMatch();
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return LATHYPHENWORD;
}

({ALNUM}+-)+{ALNUM}+	/* composite-word */	{
	BEGIN DELIM;
	saveMatch();
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return HYPHENWORD;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return VERSIONNUMBER; } \+?[0-9]+\.[0-9]+ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return DECIMAL; } {CYRALPHA}+ /* one word in composite-word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return CYRPARTHYPHENWORD; } [[:alpha:]]+ /* one word in composite-word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return LATPARTHYPHENWORD; } {ALNUM}+ /* one word in composite-word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return PARTHYPHENWORD; } - { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return SPACE; } .|\n /* return in basic state */ { BEGIN INITIAL; yyless( 0 ); } {CYRALPHA}+ /* normal word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return CYRWORD; } [[:alpha:]]+ /* normal word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return LATWORD; } {ALNUM}+ /* normal word */ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return UWORD; } [ \r\n\t]+ { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return SPACE; } . { token = tsearch2_yytext; tokenlen = tsearch2_yyleng; return SPACE; } %% /* clearing after parsing from string */ void tsearch2_end_parse(void) { if (s) { free(s); s = NULL; } tsearch2_yy_delete_buffer( buf ); buf = NULL; } /* start parse from string */ void tsearch2_start_parse_str(char* str, int limit) { if (buf) tsearch2_end_parse(); buf = tsearch2_yy_scan_bytes( str, limit ); tsearch2_yy_switch_to_buffer( buf ); BEGIN INITIAL; }