/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.13 2007/03/22 15:58:24 teodor Exp $ */

#include "postgres.h"

#include "utils/builtins.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"

#include "deflex.h"
#include "parser.h"
#include "ts_locale.h"

/*
 * Allocate a parser position record.
 *
 * With a non-NULL prev the new record starts out as a byte copy of it;
 * otherwise it starts out zero-filled.  In both cases the record is then
 * linked back to prev and its pushedAtAction field is reset to NULL.
 */
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
	TParserPosition *pos = (TParserPosition *) palloc(sizeof(TParserPosition));

	if (prev == NULL)
		memset(pos, 0, sizeof(TParserPosition));
	else
		memcpy(pos, prev, sizeof(TParserPosition));

	pos->prev = prev;
	pos->pushedAtAction = NULL;

	return pos;
}

/*
 * Create a parser over the len bytes starting at str.
 *
 * The input pointer is stored, not copied.  When built with TS_USE_WIDE
 * and the database encoding can use more than one byte per character, a
 * wchar_t rendering of the input is produced with char2wchar() and the
 * parser is switched to wide mode.  The returned parser starts with a
 * single position record in state TPS_Base.
 */
TParser *
TParserInit(char *str, int len)
{
	TParser    *parser = (TParser *) palloc0(sizeof(TParser));

	parser->charmaxlen = pg_database_encoding_max_length();
	parser->str = str;
	parser->lenstr = len;

	/* Byte mode unless the wide path below applies. */
	parser->usewide = false;

#ifdef TS_USE_WIDE
	/*
	 * Use wide char code only when max encoding length > 1.
	 */
	if (parser->charmaxlen > 1)
	{
		parser->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (parser->lenstr + 1));
		parser->lenwstr = char2wchar(parser->wstr, parser->str, parser->lenstr);
		parser->usewide = true;
	}
#endif

	parser->state = newTParserPosition(NULL);
	parser->state->state = TPS_Base;

	return parser;
}

/*
 * Release a parser: every position record reachable through the prev
 * links, the wide-character buffer if one was allocated, and finally the
 * parser struct itself.  The input string is not freed.
 */
void
TParserClose(TParser * prs)
{
	TParserPosition *cur = prs->state;

	while (cur != NULL)
	{
		TParserPosition *older = cur->prev;

		pfree(cur);
		cur = older;
	}
	prs->state = NULL;

#ifdef TS_USE_WIDE
	if (prs->wstr)
		pfree(prs->wstr);
#endif

	pfree(prs);
}

/*
 * Define support functions equivalent to the is* macros, but
 * working with any possible encoding and locale. Note that
 * with a multibyte encoding and the C locale the isw* functions may fail
 * or give wrong results. Note 2: a multibyte encoding together with the
 * C locale is often used for Asian languages.
*/ #ifdef TS_USE_WIDE #define p_iswhat(type) \ static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ if ( prs->usewide ) \ { \ if ( lc_ctype_is_c() ) \ return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \ \ return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \ } \ \ return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ } \ \ static int \ p_isnot##type(TParser *prs) { \ return !p_is##type(prs); \ } static int p_isalnum(TParser *prs) { Assert( prs->state ); if (prs->usewide) { if (lc_ctype_is_c()) { unsigned int c = *(prs->wstr + prs->state->poschar); /* * any non-ascii symbol with multibyte encoding * with C-locale is an alpha character */ if ( c > 0x7f ) return 1; return isalnum(0xff & c); } return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); } return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte )); } static int p_isnotalnum(TParser *prs) { return !p_isalnum(prs); } static int p_isalpha(TParser *prs) { Assert( prs->state ); if (prs->usewide) { if (lc_ctype_is_c()) { unsigned int c = *(prs->wstr + prs->state->poschar); /* * any non-ascii symbol with multibyte encoding * with C-locale is an alpha character */ if ( c > 0x7f ) return 1; return isalpha(0xff & c); } return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); } return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte )); } static int p_isnotalpha(TParser *prs) { return !p_isalpha(prs); } /* p_iseq should be used only for ascii symbols */ static int p_iseq(TParser * prs, char c) { Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 
1 : 0; } #else /* TS_USE_WIDE */ #define p_iswhat(type) \ static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ } \ \ static int \ p_isnot##type(TParser *prs) { \ return !p_is##type(prs); \ } static int p_iseq(TParser * prs, char c) { Assert(prs->state); return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; } p_iswhat(alnum) p_iswhat(alpha) #endif /* TS_USE_WIDE */ p_iswhat(digit) p_iswhat(lower) p_iswhat(print) p_iswhat(punct) p_iswhat(space) p_iswhat(upper) p_iswhat(xdigit) static int p_isEOF(TParser * prs) { Assert(prs->state); return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0; } static int p_iseqC(TParser * prs) { return p_iseq(prs, prs->c); } static int p_isneC(TParser * prs) { return !p_iseq(prs, prs->c); } static int p_isascii(TParser * prs) { return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0; } static int p_islatin(TParser * prs) { return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0; } static int p_isnonlatin(TParser * prs) { return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0; } void _make_compiler_happy(void); void _make_compiler_happy(void) { p_isalnum(NULL); p_isnotalnum(NULL); p_isalpha(NULL); p_isnotalpha(NULL); p_isdigit(NULL); p_isnotdigit(NULL); p_islower(NULL); p_isnotlower(NULL); p_isprint(NULL); p_isnotprint(NULL); p_ispunct(NULL); p_isnotpunct(NULL); p_isspace(NULL); p_isnotspace(NULL); p_isupper(NULL); p_isnotupper(NULL); p_isxdigit(NULL); p_isnotxdigit(NULL); p_isEOF(NULL); p_iseqC(NULL); p_isneC(NULL); } static void SpecialTags(TParser * prs) { switch (prs->state->lencharlexeme) { case 8: /* lexeme, "ignore = false; break; case 7: /*