/* postgresql/contrib/tsearch2/wparser_def.c */

/*
 * default word parser
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "postgres.h"
#include "utils/builtins.h"

#include "dict.h"
#include "wparser.h"
#include "common.h"
#include "ts_cfg.h"
#include "wordparser/parser.h"
#include "wordparser/deflex.h"

PG_FUNCTION_INFO_V1(prsd_lextype);
Datum		prsd_lextype(PG_FUNCTION_ARGS);

/*
 * prsd_lextype
 *
 * Returns a palloc'd array of LASTNUM + 1 LexDescr entries.  Entry k
 * (0-based) describes lexeme type id k + 1 and carries pstrdup'd copies of
 * tok_alias[k + 1] and lex_descr[k + 1].  The final entry has lexid 0; its
 * other fields are left unset.
 */
Datum
prsd_lextype(PG_FUNCTION_ARGS)
{
	LexDescr   *result;
	LexDescr   *slot;
	int			lexid;

	result = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));

	for (lexid = 1, slot = result; lexid <= LASTNUM; lexid++, slot++)
	{
		slot->lexid = lexid;
		slot->alias = pstrdup(tok_alias[lexid]);
		slot->descr = pstrdup(lex_descr[lexid]);
	}

	/* slot now addresses the extra last entry: mark it with lexid 0 */
	slot->lexid = 0;

	PG_RETURN_POINTER(result);
}

PG_FUNCTION_INFO_V1(prsd_start);
Datum		prsd_start(PG_FUNCTION_ARGS);

/*
 * prsd_start
 *
 * Passes call argument 0 (taken as a char pointer) and call argument 1
 * (taken as an int32) to tsearch2_start_parse_str(), then returns a NULL
 * pointer.
 */
Datum
prsd_start(PG_FUNCTION_ARGS)
{
	char	   *input = (char *) PG_GETARG_POINTER(0);
	int			inputlen = PG_GETARG_INT32(1);

	tsearch2_start_parse_str(input, inputlen);

	PG_RETURN_POINTER(NULL);
}

PG_FUNCTION_INFO_V1(prsd_getlexeme);
Datum		prsd_getlexeme(PG_FUNCTION_ARGS);

/*
 * prsd_getlexeme
 *
 * Runs tsearch2_yylex() once and reports the result:
 *	 - the lexer's return value is returned as an int32;
 *	 - the current value of `token` is stored through call argument 1
 *	   (a char **);
 *	 - the current value of `tokenlen` is stored through call argument 2
 *	   (an int *).
 * Call argument 0 is not read.
 */
Datum
prsd_getlexeme(PG_FUNCTION_ARGS)
{
	char	  **tokout = (char **) PG_GETARG_POINTER(1);
	int		   *lenout = (int *) PG_GETARG_POINTER(2);
	int			lextype;

	/* the lexer must run before token and tokenlen are read */
	lextype = tsearch2_yylex();

	*tokout = token;
	*lenout = tokenlen;

	PG_RETURN_INT32(lextype);
}

PG_FUNCTION_INFO_V1(prsd_end);
Datum		prsd_end(PG_FUNCTION_ARGS);

/*
 * prsd_end
 *
 * Calls tsearch2_end_parse() and returns void.  No call argument is read.
 */
Datum
prsd_end(PG_FUNCTION_ARGS)
{
	tsearch2_end_parse();

	PG_RETURN_VOID();
}

/*
 * Classes of lexeme type ids used by the headline code below.
 * NOTE(review): the numeric ids are presumably the token types declared in
 * wordparser/deflex.h -- confirm against that header before changing any.
 *
 * Each macro may evaluate its argument more than once; do not pass
 * expressions with side effects.
 */
#define LEAVETOKEN(x)	( (x) == 12 )
#define COMPLEXTOKEN(x) ( (x) == 5 || ((x) >= 15 && (x) <= 17) )
#define ENDPUNCTOKEN(x) ( LEAVETOKEN(x) )

/* ids 12, 13, 14 and 23 */
#define IDIGNORE(x) ( ((x) >= 12 && (x) <= 14) || (x) == 23 )
/* ids 5, 13, 15, 16 and 17: these get the `replace` flag in prsd_headline */
#define HLIDIGNORE(x) ( COMPLEXTOKEN(x) || (x) == 13 )
/* ids that do not count towards the headline's word total */
#define NONWORDTOKEN(x) ( HLIDIGNORE(x) || (x) == 12 )
/* ids on which a headline fragment should not end */
#define NOENDTOKEN(x)	( NONWORDTOKEN(x) || IDIGNORE(x) || (x) == 7 || (x) == 8 || ((x) >= 20 && (x) <= 22) )

/*
 * Check value handed to checkcondition_HL through TS_execute: a run of
 * `len` consecutive headline words starting at `words`.
 */
typedef struct
{
	HLWORD	   *words;			/* first word of the run */
	int			len;			/* number of words in the run */
}	hlCheck;

/*
 * checkcondition_HL
 *
 * Callback given to TS_execute by hlCover.  checkval points to an hlCheck;
 * the result is true when some word in that run has its `item` pointer
 * equal to val, false otherwise (including for an empty run).
 */
static bool
checkcondition_HL(void *checkval, ITEM * val)
{
	hlCheck    *run = (hlCheck *) checkval;
	int			idx = 0;

	while (idx < run->len)
	{
		if (run->words[idx].item == val)
			return true;
		idx++;
	}

	return false;
}


/*
 * hlCover
 *
 * Looks for the next "cover" of the query in the headline words: a word
 * range [*p, *q] (inclusive indexes into prs->words) for which TS_execute
 * reports the query as satisfied when only the words of that range are
 * considered (see checkcondition_HL).
 *
 * prs		headline text; words[0 .. curwords-1] are examined
 * query	query whose VAL items are matched against words[].item
 * p		in: index at which the search starts; out: start of the cover
 * q		out: end of the cover
 *
 * For a given start position the candidate range is built as follows:
 *	 *q = the largest, over all VAL items, of the item's first occurrence
 *		  at or after the start position;
 *	 *p = the smallest, over all VAL items, of the item's last occurrence
 *		  in [start position, *q].
 * If that range does not satisfy the query the search is repeated from
 * *p + 1.
 *
 * Returns true with *p and *q set when a cover is found.  Returns false
 * when no VAL item occurs at or after the start position; *p and *q then
 * hold no meaningful value (0x7fffffff and -1).
 *
 * "No match yet" is tracked with -1 rather than 0, because 0 is a valid
 * word index: a cover ending at the very first word must not be mistaken
 * for "nothing found".  The retry is a loop rather than a recursive call so
 * that stack use does not grow with the number of rejected candidates.
 */
static bool
hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
{
	int			pos = *p;

	for (;;)
	{
		ITEM	   *item;
		hlCheck		ch;
		int			i,
					j;

		*q = -1;
		*p = 0x7fffffff;

		/* forward pass: furthest first-occurrence of any VAL item */
		item = GETQUERY(query);
		for (j = 0; j < query->size; j++, item++)
		{
			if (item->type != VAL)
				continue;
			for (i = pos; i < prs->curwords; i++)
			{
				if (prs->words[i].item == item)
				{
					if (i > *q)
						*q = i;
					break;
				}
			}
		}

		if (*q < 0)
			return false;		/* no query word at or after pos */

		/* backward pass: nearest-to-start last-occurrence of any VAL item */
		item = GETQUERY(query);
		for (j = 0; j < query->size; j++, item++)
		{
			if (item->type != VAL)
				continue;
			for (i = *q; i >= pos; i--)
			{
				if (prs->words[i].item == item)
				{
					if (i < *p)
						*p = i;
					break;
				}
			}
		}

		if (*p > *q)
			return false;

		/* does the query hold when restricted to words[*p .. *q]? */
		ch.words = &(prs->words[*p]);
		ch.len = *q - *p + 1;
		if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
			return true;

		/* candidate rejected: search again just past its start */
		pos = *p + 1;
	}
}

PG_FUNCTION_INFO_V1(prsd_headline);
Datum		prsd_headline(PG_FUNCTION_ARGS);

/*
 * prsd_headline
 *
 * Chooses the fragment of the parsed text to show as a headline and marks
 * its words.
 *
 * Call arguments:
 *	 0	HLPRSTEXT *   parsed text; modified in place and returned
 *	 1	text *		  option string, or NULL for defaults
 *	 2	QUERYTYPE *   query whose words are to be highlighted
 *
 * Options (keys compared case-insensitively):
 *	 MaxWords	upper bound on words in the fragment (default 35)
 *	 MinWords	lower bound on words in the fragment (default 15)
 *	 ShortWord	a word of this length or less is not accepted as a
 *				fragment end (default 3)
 *	 StartSel	string stored in prs->startsel (default "<b>")
 *	 StopSel	string stored in prs->stopsel (default "</b>")
 *
 * When an option string is given, an ERROR is raised if
 * MinWords >= MaxWords, MinWords <= 0 or ShortWord < 0.
 *
 * Every cover reported by hlCover is turned into a candidate fragment
 * [posb, pose]: grown towards min_words when it is short, cut back when it
 * reaches max_words.  The candidate with the most distinct query hits
 * wins, with preference for one ending on a "good" word (not NOENDTOKEN
 * and longer than ShortWord).  If there is no cover at all, the fragment is
 * the leading words of the text, up to min_words of them.
 *
 * For each word of the chosen fragment `in` is set; `selected` is set for
 * words tied to a query item, `skip` for words flagged `repeated`, and
 * `replace` for HLIDIGNORE token types.
 */
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
	HLPRSTEXT  *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
	text	   *opt = (text *) PG_GETARG_POINTER(1);	/* can't be toasted */
	QUERYTYPE  *query = (QUERYTYPE *) PG_GETARG_POINTER(2);		/* can't be toasted */

	/* defaults; may be overridden from opt */
	int			min_words = 15;
	int			max_words = 35;
	int			shortword = 3;

	int			p = 0,
				q = 0;
	int			bestb = -1,
				beste = -1;
	int			bestlen = -1;
	int			pose = 0,
				posb,
				poslen,
				curlen;

	int			i;

	/* config */
	prs->startsel = NULL;
	prs->stopsel = NULL;
	if (opt)
	{
		Map		   *map,
				   *mptr;

		parse_cfgdict(opt, &map);
		mptr = map;

		while (mptr && mptr->key)
		{
			if (pg_strcasecmp(mptr->key, "MaxWords") == 0)
				max_words = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "MinWords") == 0)
				min_words = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "ShortWord") == 0)
				shortword = pg_atoi(mptr->value, 4, 1);
			else if (pg_strcasecmp(mptr->key, "StartSel") == 0)
				prs->startsel = pstrdup(mptr->value);
			else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
				prs->stopsel = pstrdup(mptr->value);

			/* key and value were copied or converted above */
			pfree(mptr->key);
			pfree(mptr->value);

			mptr++;
		}
		pfree(map);

		if (min_words >= max_words)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("MinWords should be less than MaxWords")));
		if (min_words <= 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("MinWords should be positive")));
		if (shortword < 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("ShortWord should be >= 0")));
	}

	while (hlCover(prs, query, &p, &q))
	{
		/*
		 * Measure the cover: curlen counts words, poslen counts distinct
		 * query hits.  Stop early once max_words is reached.
		 */
		curlen = 0;
		poslen = 0;
		for (i = p; i <= q && curlen < max_words; i++)
		{
			if (!NONWORDTOKEN(prs->words[i].type))
				curlen++;
			if (prs->words[i].item && !prs->words[i].repeated)
				poslen++;
			pose = i;
		}

		if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
		{
			/* a better fragment was already found, so try one more cover */
			p++;
			continue;
		}

		posb = p;
		if (curlen < max_words)
		{
			/*
			 * Find a good end.  i is q + 1 here; restart at q, which has
			 * already been counted, so that q itself can be the end.
			 */
			for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
			{
				if (i != q)
				{
					if (!NONWORDTOKEN(prs->words[i].type))
						curlen++;
					if (prs->words[i].item && !prs->words[i].repeated)
						poslen++;
				}
				pose = i;
				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
					continue;
				if (curlen >= min_words)
					break;
			}
			if (curlen < min_words && i >= prs->curwords)
			{
				/*
				 * Reached the end of the text and the fragment is still
				 * shorter than min_words: extend it backwards.  Start
				 * before p, because words[p] is already counted, and never
				 * grow past max_words.
				 */
				for (i = p - 1; i >= 0; i--)
				{
					if (!NONWORDTOKEN(prs->words[i].type))
						curlen++;
					if (prs->words[i].item && !prs->words[i].repeated)
						poslen++;
					if (curlen >= max_words)
						break;
					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
						continue;
					if (curlen >= min_words)
						break;
				}
				posb = (i >= 0) ? i : 0;
			}
		}
		else
		{
			/*
			 * The cover is too long: cut it back to a good end.  When the
			 * whole cover was counted i is q + 1, which is outside the
			 * cover and may be outside the text, so begin at q.
			 */
			if (i > q)
				i = q;
			for (; curlen > min_words; i--)
			{
				if (!NONWORDTOKEN(prs->words[i].type))
					curlen--;
				if (prs->words[i].item && !prs->words[i].repeated)
					poslen--;
				pose = i;
				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
					continue;
				break;
			}
		}

		/*
		 * Keep this candidate if it is the first one, or if it ends on a
		 * good word and either has more query hits than the current best
		 * or the current best ends on a bad word.
		 */
		if (bestlen < 0 ||
			(!(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
			 (poslen > bestlen ||
			  NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
		{
			bestb = posb;
			beste = pose;
			bestlen = poslen;
		}

		p++;
	}

	if (bestlen < 0)
	{
		/*
		 * No cover at all: take the leading words of the text.  pose
		 * starts at -1 so that an empty text yields an empty fragment
		 * instead of marking words[0].
		 */
		curlen = 0;
		pose = -1;
		for (i = 0; i < prs->curwords && curlen < min_words; i++)
		{
			if (!NONWORDTOKEN(prs->words[i].type))
				curlen++;
			pose = i;
		}
		bestb = 0;
		beste = pose;
	}

	/* mark the words of the chosen fragment */
	for (i = bestb; i <= beste; i++)
	{
		if (prs->words[i].item)
			prs->words[i].selected = 1;
		if (prs->words[i].repeated)
			prs->words[i].skip = 1;
		if (HLIDIGNORE(prs->words[i].type))
			prs->words[i].replace = 1;

		prs->words[i].in = 1;
	}

	if (!prs->startsel)
		prs->startsel = pstrdup("<b>");
	if (!prs->stopsel)
		prs->stopsel = pstrdup("</b>");
	prs->startsellen = strlen(prs->startsel);
	prs->stopsellen = strlen(prs->stopsel);

	PG_RETURN_POINTER(prs);
}