postgresql/src/include/tsearch/ts_public.h

/*-------------------------------------------------------------------------
 *
 * ts_public.h
 *	  Public interface to various tsearch modules, such as
 *	  parsers and dictionaries.
 *
 * Copyright (c) 1998-2023, PostgreSQL Global Development Group
 *
 * src/include/tsearch/ts_public.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef _PG_TS_PUBLIC_H_
#define _PG_TS_PUBLIC_H_

#include "tsearch/ts_type.h"

/*
 * Parser's framework
 */

/*
 * LexDescr: the type returned by a parser's prslextype method.
 *
 * NOTE(review): the per-field descriptions below are inferred from the
 * field names; confirm them against a prslextype implementation.
 */
typedef struct LexDescr
{
	int			lexid;			/* numeric id of the token type */
	char	   *alias;			/* short name of the token type */
	char	   *descr;			/* description of the token type */
} LexDescr;

/*
 * Interface to headline generator (tsparser's prsheadline function)
 *
 * HeadlineParsedText describes the text that is to be highlighted.
 * Some fields are passed from the core code to the prsheadline function,
 * while others are output from the prsheadline function.
 *
 * The principal data is words[], an array of HeadlineWordEntry,
 * one entry per token, of length curwords.
 * The fields of HeadlineWordEntry are:
 *
 * in, selected, replace, skip: these flags are initially zero
 * and may be set by the prsheadline function.  A consecutive group
 * of tokens marked "in" form a "fragment" to be output.
 * Such tokens may additionally be marked selected, replace, or skip
 * to modify how they are shown.  (If you set more than one of those
 * bits, you get an unspecified one of those behaviors.)
 *
 * type, len, pos, word: filled by core code to describe the token.
 *
 * item: if the token matches any operand of the tsquery of interest,
 * a pointer to such an operand.  (If there are multiple matching
 * operands, we generate extra copies of the HeadlineWordEntry to hold
 * all the pointers.  The extras are marked with repeated = 1 and should
 * be ignored except for checking the item pointer.)
 */
/*
 * HeadlineWordEntry: one token of the text being headlined.
 *
 * All the flags, plus type and len, are bit-fields of a single uint32
 * declaration; their widths (1+1+1+1+1+3+8+16) add up to exactly 32 bits.
 * Consequently "type" can hold only 8 bits and "len" only 16 bits.
 * Do not reorder or resize these fields casually: the layout is shared
 * between core code and prsheadline functions.
 */
typedef struct
{
	uint32		selected:1,		/* token is to be highlighted */
				in:1,			/* token is part of headline */
				replace:1,		/* token is to be replaced with a space */
				repeated:1,		/* duplicate entry to hold item pointer */
				skip:1,			/* token is to be skipped (not output) */
				unused:3,		/* available bits */
				type:8,			/* parser's token category */
				len:16;			/* length of token */
	WordEntryPos pos;			/* position of token */
	char	   *word;			/* text of token (not null-terminated; its
								 * length is given by len) */
	QueryOperand *item;			/* a matching query operand, or NULL if none */
} HeadlineWordEntry;

/*
 * HeadlineParsedText: the text to be highlighted, as exchanged between the
 * core code and a parser's prsheadline function.
 */
typedef struct HeadlineParsedText
{
	/*
	 * Inputs: core code fills these before calling the prsheadline function.
	 */
	HeadlineWordEntry *words;	/* array of token entries */
	int32		lenwords;		/* number of entries allocated in words[] */
	int32		curwords;		/* number of valid entries in words[] */
	int32		vectorpos;		/* used by ts_parse.c in filling pos fields */

	/*
	 * Outputs: the prsheadline function must fill all of these.  The strings
	 * are palloc'd; they are used for marking selected tokens (startsel,
	 * stopsel) and for separating fragments (fragdelim).
	 */
	char	   *startsel;
	char	   *stopsel;
	char	   *fragdelim;

	/* Lengths of the three strings above, in the same order */
	int16		startsellen;
	int16		stopsellen;
	int16		fragdelimlen;
} HeadlineParsedText;

/*
 * Common useful things for tsearch subsystem
 *
 * get_tsearch_config_filename takes a file base name and a file extension
 * (both read-only strings) and returns a string.
 *
 * NOTE(review): presumably the result is the path of a tsearch
 * configuration file built from the two arguments, and presumably it is
 * freshly allocated; neither is visible in this header -- confirm against
 * the function's definition.
 */
extern char *get_tsearch_config_filename(const char *basename,
										 const char *extension);

/*
 * Often useful stopword list management
 */
/*
 * StopList: a counted array of strings.
 *
 * NOTE(review): presumably len is the number of entries in stop[] and each
 * entry is one stopword; confirm against readstoplist/searchstoplist.
 */
typedef struct StopList
{
	int			len;			/* entry count */
	char	  **stop;			/* array of strings */
} StopList;

/*
 * readstoplist takes a file name, a StopList, and a callback "wordop" that
 * maps a read-only string to a char * string.
 *
 * NOTE(review): presumably it loads the stopwords found in the named file
 * into *s, passing each word through wordop; whether wordop may be NULL is
 * not visible here -- confirm against the definition.
 */
extern void readstoplist(const char *fname, StopList *s,
						 char *(*wordop) (const char *));

/*
 * searchstoplist takes a StopList and a key string and returns a bool.
 *
 * NOTE(review): presumably true means that key is present in the list --
 * confirm against the definition.
 */
extern bool searchstoplist(StopList *s, char *key);

/*
 * Interface with dictionaries
 */

/*
 * TSLexeme: the struct returned by any lexize function.
 */
typedef struct TSLexeme
{
	/*----------
	 * nvariant identifies which split variant of a word this lexeme
	 * belongs to.  For instance, the Norwegian word 'fotballklubber' can be
	 * split in two ways, ( fotball, klubb ) and ( fot, ball, klubb ), so a
	 * dictionary should return:
	 *
	 * nvariant    lexeme
	 *	   1	   fotball
	 *	   1	   klubb
	 *	   2	   fot
	 *	   2	   ball
	 *	   2	   klubb
	 *
	 * A TSLexeme is taken to be part of the same split variant as the one
	 * before it when both carry the same nvariant value.  Only a change of
	 * value between consecutive lexemes is significant; the particular
	 * numbers used are not.
	 *----------
	 */
	uint16		nvariant;

	/* Bitmask of the TSL_xxx flag bits defined below */
	uint16		flags;

	/* The lexeme itself, as a C string */
	char	   *lexeme;
} TSLexeme;

/*
 * Bit values for TSLexeme.flags.  Each constant occupies a distinct single
 * bit, so they can be combined with bitwise OR.  They are written here at
 * 16-bit width to match the uint16 flags field.
 */
#define TSL_ADDPOS		0x0001
#define TSL_PREFIX		0x0002
#define TSL_FILTER		0x0004

/*
 * DictSubState: support for complex dictionaries such as thesaurus.
 * The 4th argument of the dictlexize method is a pointer to one of these.
 */
typedef struct DictSubState
{
	/* in: tells lexize_info that the end of the text has been reached */
	bool		isend;

	/* out: the dictionary wants the next lexeme */
	bool		getnext;

	/* dictionary's internal state, kept between calls with getnext == true */
	void	   *private_state;
} DictSubState;

#endif							/* _PG_TS_PUBLIC_H_ */
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* ts_public.h`
			`* Public interface to various tsearch modules, such as`
			`* parsers and dictionaries.`
			`*`
Update copyright for 2023 Backpatch-through: 11 2023-01-02 21:00:37 +01:00			`* Copyright (c) 1998-2023, PostgreSQL Global Development Group`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`*`
Remove cvs keywords from all files. 2010-09-20 22:08:53 +02:00			`* src/include/tsearch/ts_public.h`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`*`
			`*-------------------------------------------------------------------------`
			`*/`
			`#ifndef _PG_TS_PUBLIC_H_`
			`#define _PG_TS_PUBLIC_H_`

			`#include "tsearch/ts_type.h"`

			`/*`
			`* Parser's framework`
			`*/`

			`/*`
			`* returning type for prslextype method of parser`
			`*/`
			`typedef struct`
			`{`
			`int lexid;`
			`char *alias;`
			`char *descr;`
			`} LexDescr;`

			`/*`
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline. I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c. While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much. Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too. 2022-11-21 23:07:07 +01:00			`* Interface to headline generator (tsparser's prsheadline function)`
			`*`
			`* HeadlineParsedText describes the text that is to be highlighted.`
			`* Some fields are passed from the core code to the prsheadline function,`
			`* while others are output from the prsheadline function.`
			`*`
			`* The principal data is words[], an array of HeadlineWordEntry,`
			`* one entry per token, of length curwords.`
			`* The fields of HeadlineWordEntry are:`
			`*`
			`* in, selected, replace, skip: these flags are initially zero`
			`* and may be set by the prsheadline function. A consecutive group`
			`* of tokens marked "in" form a "fragment" to be output.`
			`* Such tokens may additionally be marked selected, replace, or skip`
			`* to modify how they are shown. (If you set more than one of those`
			`* bits, you get an unspecified one of those behaviors.)`
			`*`
			`* type, len, pos, word: filled by core code to describe the token.`
			`*`
			`* item: if the token matches any operand of the tsquery of interest,`
			`* a pointer to such an operand. (If there are multiple matching`
			`* operands, we generate extra copies of the HeadlineWordEntry to hold`
			`* all the pointers. The extras are marked with repeated = 1 and should`
			`* be ignored except for checking the item pointer.)`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`*/`
			`typedef struct`
			`{`
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline. I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c. While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much. Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too. 2022-11-21 23:07:07 +01:00			`uint32 selected:1, /* token is to be highlighted */`
			`in:1, /* token is part of headline */`
			`replace:1, /* token is to be replaced with a space */`
			`repeated:1, /* duplicate entry to hold item pointer */`
			`skip:1, /* token is to be skipped (not output) */`
			`unused:3, /* available bits */`
			`type:8, /* parser's token category */`
			`len:16; /* length of token */`
			`WordEntryPos pos; /* position of token */`
			`char *word; /* text of token (not null-terminated) */`
			`QueryOperand *item; /* a matching query operand, or NULL if none */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`} HeadlineWordEntry;`

			`typedef struct`
			`{`
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline. I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c. While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much. Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too. 2022-11-21 23:07:07 +01:00			`/* Fields filled by core code before calling prsheadline function: */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`HeadlineWordEntry *words;`
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline. I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c. While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much. Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too. 2022-11-21 23:07:07 +01:00			`int32 lenwords; /* allocated length of words[] */`
			`int32 curwords; /* current number of valid entries */`
			`int32 vectorpos; /* used by ts_parse.c in filling pos fields */`

			`/* The prsheadline function must fill these fields: */`
			`/* Strings for marking selected tokens and separating fragments: */`
			`char *startsel; /* palloc'd strings */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`char *stopsel;`
Improve headeline generation. Now headline can contain several fragments a-la Google. Sushant Sinha <sushant354@gmail.com> 2008-10-17 20:05:19 +02:00			`char *fragdelim;`
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline. I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c. While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much. Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too. 2022-11-21 23:07:07 +01:00			`int16 startsellen; /* lengths of strings */`
Replace int2/int4 in C code with int16/int32 The latter was already the dominant use, and it's preferable because in C the convention is that intXX means XX bits. Therefore, allowing mixed use of int2, int4, int8, int16, int32 is obviously confusing. Remove the typedefs for int2 and int4 for now. They don't seem to be widely used outside of the PostgreSQL source tree, and the few uses can probably be cleaned up by the time this ships. 2012-06-25 00:51:46 +02:00			`int16 stopsellen;`
			`int16 fragdelimlen;`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`} HeadlineParsedText;`

			`/*`
			`* Common useful things for tsearch subsystem`
			`*/`
			`extern char *get_tsearch_config_filename(const char *basename,`
			`const char *extension);`

			`/*`
			`* Often useful stopword list management`
			`*/`
			`typedef struct`
			`{`
			`int len;`
			`char **stop;`
			`} StopList;`

Cleanup for some problems in tsearch patch: - ispell initialization crashed on empty dictionary file - ispell initialization crashed on affix file with prefixes but no suffixes - stop words file was run through pg_verify_mbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files - bunch of comments added, typos fixed, and other cleanup Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstopwords too. Heikki Linnakangas 2007-08-25 02:03:59 +02:00			`extern void readstoplist(const char *fname, StopList *s,`
Clean up ts_locale.h/.c. Fix broken and not-consistent-across-platforms behavior of wchar2char/char2wchar; this should resolve bug #3730. Avoid excess computations of pg_mblen in t_isalpha and friends. Const-ify APIs where possible. 2007-11-09 23:37:35 +01:00			`char *(*wordop) (const char *));`
Cleanup for some problems in tsearch patch: - ispell initialization crashed on empty dictionary file - ispell initialization crashed on affix file with prefixes but no suffixes - stop words file was run through pg_verify_mbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files - bunch of comments added, typos fixed, and other cleanup Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstopwords too. Heikki Linnakangas 2007-08-25 02:03:59 +02:00			`extern bool searchstoplist(StopList *s, char *key);`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00
			`/*`
			`* Interface with dictionaries`
			`*/`

			`/* return struct for any lexize function */`
			`typedef struct`
			`{`
Improve comments for TSLexeme data structure. Mostly, clean up long-ago pgindent damage. 2011-11-03 23:47:28 +01:00			`/*----------`
			`* Number of current variant of split word. For example the Norwegian`
			`* word 'fotballklubber' has two variants to split: ( fotball, klubb )`
			`* and ( fot, ball, klubb ). So, dictionary should return:`
			`*`
			`* nvariant lexeme`
			`* 1 fotball`
			`* 1 klubb`
			`* 2 fot`
			`* 2 ball`
			`* 2 klubb`
			`*`
			`* In general, a TSLexeme will be considered to belong to the same split`
			`* variant as the previous one if they have the same nvariant value.`
			`* The exact values don't matter, only changes from one lexeme to next.`
			`*----------`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`*/`
			`uint16 nvariant;`

Improve comments for TSLexeme data structure. Mostly, clean up long-ago pgindent damage. 2011-11-03 23:47:28 +01:00			`uint16 flags; /* See flag bits below */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00
Improve comments for TSLexeme data structure. Mostly, clean up long-ago pgindent damage. 2011-11-03 23:47:28 +01:00			`char *lexeme; /* C string */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`} TSLexeme;`

Improve comments for TSLexeme data structure. Mostly, clean up long-ago pgindent damage. 2011-11-03 23:47:28 +01:00			`/* Flag bits that can appear in TSLexeme.flags */`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`#define TSL_ADDPOS 0x01`
Extend GIN to support partial-match searches, and extend tsquery to support prefix matching using this facility. Teodor Sigaev and Oleg Bartunov 2008-05-16 18:31:02 +02:00			`#define TSL_PREFIX 0x02`
Introduce filtering dictionary support to tsearch. Propagate --nolocale option to CREATE DATABASE command in pg_regress to allow correct checking of locale-sensitive contrib modules. 2009-08-18 12:30:41 +02:00			`#define TSL_FILTER 0x04`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00
			`/*`
Cleanup for some problems in tsearch patch: - ispell initialization crashed on empty dictionary file - ispell initialization crashed on affix file with prefixes but no suffixes - stop words file was run through pg_verify_mbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files - bunch of comments added, typos fixed, and other cleanup Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstopwords too. Heikki Linnakangas 2007-08-25 02:03:59 +02:00			`* Struct for supporting complex dictionaries like thesaurus.`
			`* 4th argument for dictlexize method is a pointer to this`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`*/`
			`typedef struct`
			`{`
			`bool isend; /* in: marks for lexize_info about text end is`
			`* reached */`
			`bool getnext; /* out: dict wants next lexeme */`
Make backend header files C++ safe This alters various incidental uses of C++ key words to use other similar identifiers, so that a C++ compiler won't choke outright. You still (probably) need extern "C" { }; around the inclusion of backend headers. based on a patch by Kurt Harriman <harriman@acm.org> Also add a script cpluspluscheck to check for C++ compatibility in the future. As of right now, this passes without error for me. 2009-07-16 08:33:46 +02:00			`void *private_state; /* internal dict state between calls with`
Tsearch2 functionality migrates to core. The bulk of this work is by Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done. 2007-08-21 03:11:32 +02:00			`* getnext == true */`
			`} DictSubState;`

			`#endif /* _PG_TS_PUBLIC_H_ */`