postgresql/contrib/tsearch2/wparser_def.c

/*
 * default word parser
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"

#include "utils/builtins.h"

#include "dict.h"
#include "wparser.h"
#include "common.h"
#include "ts_cfg.h"
#include "wordparser/parser.h"
#include "wordparser/deflex.h"

PG_FUNCTION_INFO_V1(prsd_lextype);
Datum		prsd_lextype(PG_FUNCTION_ARGS);

Datum
prsd_lextype(PG_FUNCTION_ARGS)
{
	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
	int			i;

	for (i = 1; i <= LASTNUM; i++)
	{
		descr[i - 1].lexid = i;
		descr[i - 1].alias = pstrdup(tok_alias[i]);
		descr[i - 1].descr = pstrdup(lex_descr[i]);
	}

	descr[LASTNUM].lexid = 0;

	PG_RETURN_POINTER(descr);
}

PG_FUNCTION_INFO_V1(prsd_start);
Datum		prsd_start(PG_FUNCTION_ARGS);
Datum
prsd_start(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
}

PG_FUNCTION_INFO_V1(prsd_getlexeme);
Datum		prsd_getlexeme(PG_FUNCTION_ARGS);
Datum
prsd_getlexeme(PG_FUNCTION_ARGS)
{
	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);

	if (!TParserGet(p))
		PG_RETURN_INT32(0);

	*t = p->lexeme;
	*tlen = p->lenbytelexeme;

	PG_RETURN_INT32(p->type);
}

PG_FUNCTION_INFO_V1(prsd_end);
Datum		prsd_end(PG_FUNCTION_ARGS);
Datum
prsd_end(PG_FUNCTION_ARGS)
{
	TParser    *p = (TParser *) PG_GETARG_POINTER(0);

	TParserClose(p);
	PG_RETURN_VOID();
}

#define LEAVETOKEN(x)	( (x)==12 )
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define ENDPUNCTOKEN(x) ( (x)==12 )


#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )

typedef struct
{
	HLWORD	   *words;
	int			len;
}	hlCheck;

static bool
checkcondition_HL(void *checkval, ITEM * val)
{
	int			i;

	for (i = 0; i < ((hlCheck *) checkval)->len; i++)
	{
		if (((hlCheck *) checkval)->words[i].item == val)
			return true;
	}
	return false;
}


static bool
hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
{
	int			i,
				j;
	ITEM	   *item = GETQUERY(query);
	int			pos = *p;

	*q = 0;
	*p = 0x7fffffff;

	for (j = 0; j < query->size; j++)
	{
		if (item->type != VAL)
		{
			item++;
			continue;
		}
		for (i = pos; i < prs->curwords; i++)
		{
			if (prs->words[i].item == item)
			{
				if (i > *q)
					*q = i;
				break;
			}
		}
		item++;
	}

	if (*q == 0)
		return false;

	item = GETQUERY(query);
	for (j = 0; j < query->size; j++)
	{
		if (item->type != VAL)
		{
			item++;
			continue;
		}
		for (i = *q; i >= pos; i--)
		{
			if (prs->words[i].item == item)
			{
				if (i < *p)
					*p = i;
				break;
			}
		}
		item++;
	}

	if (*p <= *q)
	{
		hlCheck		ch;

		ch.words = &(prs->words[*p]);
		ch.len = *q - *p + 1;
		if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
			return true;
		else
		{
			(*p)++;
			return hlCover(prs, query, p, q);
		}
	}

	return false;
}

PG_FUNCTION_INFO_V1(prsd_headline);
Datum		prsd_headline(PG_FUNCTION_ARGS);
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
	HLPRSTEXT  *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
	text	   *opt = (text *) PG_GETARG_POINTER(1);	/* can't be toasted */
	QUERYTYPE  *query = (QUERYTYPE *) PG_GETARG_POINTER(2);		/* can't be toasted */

	/* from opt + start and and tag */
	int			min_words = 15;
	int			max_words = 35;
	int			shortword = 3;

	int			p = 0,
				q = 0;
	int			bestb = -1,
				beste = -1;
	int			bestlen = -1;
	int			pose = 0,
				posb,
				poslen,
				curlen;

	int			i;
	int			highlight = 0;

	/* config */
	prs->startsel = NULL;
	prs->stopsel = NULL;
	if (opt)
	{
		Map		   *map,
				   *mptr;

		parse_cfgdict(opt, &map);
		mptr = map;

		while (mptr && mptr->key)
		{
			if (pg_strcasecmp(mptr->key, "MaxWords") == 0)
				max_words = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "MinWords") == 0)
				min_words = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "ShortWord") == 0)
				shortword = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "StartSel") == 0)
				prs->startsel = pstrdup(mptr->value);
			else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
				prs->stopsel = pstrdup(mptr->value);
			else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
				highlight = (
							 pg_strcasecmp(mptr->value, "1") == 0 ||
							 pg_strcasecmp(mptr->value, "on") == 0 ||
							 pg_strcasecmp(mptr->value, "true") == 0 ||
							 pg_strcasecmp(mptr->value, "t") == 0 ||
							 pg_strcasecmp(mptr->value, "y") == 0 ||
							 pg_strcasecmp(mptr->value, "yes") == 0) ?
					1 : 0;

			pfree(mptr->key);
			pfree(mptr->value);

			mptr++;
		}
		pfree(map);

		if (highlight == 0)
		{
			if (min_words >= max_words)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("MinWords should be less than MaxWords")));
			if (min_words <= 0)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("MinWords should be positive")));
			if (shortword < 0)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("ShortWord should be >= 0")));
		}
	}

	if (highlight == 0)
	{
		while (hlCover(prs, query, &p, &q))
		{
			/* find cover len in words */
			curlen = 0;
			poslen = 0;
			for (i = p; i <= q && curlen < max_words; i++)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen++;
				if (prs->words[i].item && !prs->words[i].repeated)
					poslen++;
				pose = i;
			}

			if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
			{
				/* best already finded, so try one more cover */
				p++;
				continue;
			}

			posb = p;
			if (curlen < max_words)
			{					/* find good end */
				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
				{
					if (i != q)
					{
						if (!NONWORDTOKEN(prs->words[i].type))
							curlen++;
						if (prs->words[i].item && !prs->words[i].repeated)
							poslen++;
					}
					pose = i;
					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
						continue;
					if (curlen >= min_words)
						break;
				}
				if (curlen < min_words && i >= prs->curwords)
				{				/* got end of text and our cover is shoter
								 * than min_words */
					for (i = p; i >= 0; i--)
					{
						if (!NONWORDTOKEN(prs->words[i].type))
							curlen++;
						if (prs->words[i].item && !prs->words[i].repeated)
							poslen++;
						if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
							continue;
						if (curlen >= min_words)
							break;
					}
					posb = (i >= 0) ? i : 0;
				}
			}
			else
			{					/* shorter cover :((( */
				for (; curlen > min_words; i--)
				{
					if (!NONWORDTOKEN(prs->words[i].type))
						curlen--;
					if (prs->words[i].item && !prs->words[i].repeated)
						poslen--;
					pose = i;
					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
						continue;
					break;
				}
			}

			if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
				(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
				 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
			{
				bestb = posb;
				beste = pose;
				bestlen = poslen;
			}

			p++;
		}

		if (bestlen < 0)
		{
			curlen = 0;
			for (i = 0; i < prs->curwords && curlen < min_words; i++)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen++;
				pose = i;
			}
			bestb = 0;
			beste = pose;
		}
	}
	else
	{
		bestb = 0;
		beste = prs->curwords - 1;
	}

	for (i = bestb; i <= beste; i++)
	{
		if (prs->words[i].item)
			prs->words[i].selected = 1;
		if (highlight == 0)
		{
			if (HLIDIGNORE(prs->words[i].type))
				prs->words[i].replace = 1;
		}
		else
		{
			if (HTMLHLIDIGNORE(prs->words[i].type))
				prs->words[i].replace = 1;
		}

		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
	}

	if (!prs->startsel)
		prs->startsel = pstrdup("<b>");
	if (!prs->stopsel)
		prs->stopsel = pstrdup("</b>");
	prs->startsellen = strlen(prs->startsel);
	prs->stopsellen = strlen(prs->stopsel);

	PG_RETURN_POINTER(prs);
}