Add sample text search dictionary templates and parsers, to replace the

hard-to-maintain textual examples currently in the SGML docs.  From
Sergey Karpov.
This commit is contained in:
Tom Lane 2007-10-15 21:36:50 +00:00
parent fb631dba2a
commit 5fcb079858
24 changed files with 1324 additions and 9 deletions

View File

@ -1,4 +1,4 @@
# $PostgreSQL: pgsql/contrib/Makefile,v 1.80 2007/10/13 22:59:43 tgl Exp $
# $PostgreSQL: pgsql/contrib/Makefile,v 1.81 2007/10/15 21:36:49 tgl Exp $
subdir = contrib
top_builddir = ..
@ -10,6 +10,8 @@ WANTED_DIRS = \
chkpass \
cube \
dblink \
dict_int \
dict_xsyn \
earthdistance \
fuzzystrmatch \
hstore \
@ -31,6 +33,7 @@ WANTED_DIRS = \
seg \
spi \
tablefunc \
test_parser \
vacuumlo
ifeq ($(with_openssl),yes)

View File

@ -1,4 +1,3 @@
The PostgreSQL contrib tree
---------------------------
@ -29,8 +28,8 @@ adminpack -
by Dave Page <dpage@vale-housing.co.uk>
btree_gist -
Support for emulating BTREE indexing in GiST
by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
Support for emulating BTREE indexing in GiST
by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
chkpass -
An auto-encrypted password datatype
@ -44,8 +43,16 @@ dblink -
Allows remote query execution
by Joe Conway <mail@joeconway.com>
dict_int -
Text search dictionary template for integers
by Sergey Karpov <karpov@sao.ru>
dict_xsyn -
Text search dictionary template for extended synonym processing
by Sergey Karpov <karpov@sao.ru>
earthdistance -
Operator for computing earth distance for two points
Operator for computing earth distance between two points
by Hal Snyder <hal@vailsys.com>
fuzzystrmatch -
@ -53,8 +60,8 @@ fuzzystrmatch -
by Joe Conway <mail@joeconway.com>, Joel Burton <jburton@scw.org>
hstore -
Hstore - module for storing (key,value) pairs
by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
Module for storing (key, value) pairs
by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
intagg -
Integer aggregator
@ -92,6 +99,10 @@ pg_freespacemap -
Displays the contents of the free space map (FSM)
by Mark Kirkwood <markir@paradise.net.nz>
pg_standby -
Sample archive_command for warm standby operation
by Simon Riggs <simon@2ndquadrant.com>
pg_trgm -
Functions for determining the similarity of text based on trigram
matching.
@ -110,7 +121,7 @@ pgrowlocks -
by Tatsuo Ishii <ishii@sraoss.co.jp>
pgstattuple -
A function to return statistics about "dead" tuples and free
Functions to return statistics about "dead" tuples and free
space within a table
by Tatsuo Ishii <ishii@sraoss.co.jp>
@ -126,12 +137,16 @@ sslinfo -
by Victor Wagner <vitus@cryptocom.ru>
start-scripts -
Scripts for starting the server at boot time.
Scripts for starting the server at boot time on various platforms.
tablefunc -
Examples of functions returning tables
by Joe Conway <mail@joeconway.com>
test_parser -
Sample text search parser
by Sergey Karpov <karpov@sao.ru>
tsearch2 -
Full-text-index support using GiST
by Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov

19
contrib/dict_int/Makefile Normal file
View File

@ -0,0 +1,19 @@
# $PostgreSQL: pgsql/contrib/dict_int/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
# Build description for the dict_int contrib module: one object file,
# an install script generated from dict_int.sql.in, an uninstall script,
# a README and one regression test.
MODULE_big = dict_int
OBJS = dict_int.o
DATA_built = dict_int.sql
DATA = uninstall_dict_int.sql
DOCS = README.dict_int
REGRESS = dict_int
# With USE_PGXS set, build against an installed server located through
# pg_config; otherwise build inside the source tree under contrib/.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/dict_int
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

View File

@ -0,0 +1,41 @@
Dictionary for integers
=======================
The motivation for this example dictionary is to control the indexing of
integers (signed and unsigned), and, consequently, to minimize the number of
unique words, which greatly affects the performance of searching.
* Configuration
The dictionary accepts two options:
- The MAXLEN parameter specifies the maximum length (number of digits)
allowed in an integer word. The default value is 6.
- The REJECTLONG parameter specifies if an overlength integer should be
truncated or ignored. If REJECTLONG=FALSE (default), the dictionary returns
the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the
dictionary treats an overlength integer as a stop word, so that it will
not be indexed.
* Usage
1. Compile and install
2. Load dictionary
psql mydb < dict_int.sql
3. Test it
mydb# select ts_lexize('intdict', '12345678');
ts_lexize
-----------
{123456}
4. Change its options as you wish
mydb# ALTER TEXT SEARCH DICTIONARY intdict (MAXLEN = 4, REJECTLONG = true);
ALTER TEXT SEARCH DICTIONARY
That's all.

View File

@ -0,0 +1,99 @@
/*-------------------------------------------------------------------------
*
* dict_int.c
* Text search dictionary for integers
*
* Copyright (c) 2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/dict_int/dict_int.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/defrem.h"
#include "fmgr.h"
#include "tsearch/ts_public.h"
PG_MODULE_MAGIC;
/*
 * Settings of one integer dictionary, filled in by dintdict_init and
 * consulted by dintdict_lexize.
 */
typedef struct
{
	int			maxlen;			/* longest integer passed through unchanged */
	bool		rejectlong;		/* true: drop longer integers; false: truncate them */
} DictInt;
/*
 * Entry points of the dictionary template.  Each one is registered with
 * PG_FUNCTION_INFO_V1 and declared before its definition further down.
 */
PG_FUNCTION_INFO_V1(dintdict_init);
Datum dintdict_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(dintdict_lexize);
Datum dintdict_lexize(PG_FUNCTION_ARGS);
/*
 * dintdict_init
 *		Build the DictInt settings for one dictionary from its option list.
 *
 * Argument 0 is a List of DefElem options.  Option names are matched
 * case-insensitively:
 *	MAXLEN		- maximum integer length kept (default 6, must be >= 1)
 *	REJECTLONG	- drop overlength integers instead of truncating them
 *				  (default false)
 * Any other option name raises ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * MAXLEN is validated here because dintdict_lexize uses it as an index into
 * the copied token: zero would turn every integer into an empty lexeme and
 * a negative value would index before the start of the buffer.
 *
 * Returns a pointer to the newly allocated DictInt.
 */
Datum
dintdict_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictInt    *d;
	ListCell   *l;

	d = (DictInt *) palloc0(sizeof(DictInt));
	d->maxlen = 6;
	d->rejectlong = false;

	foreach(l, dictoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);

		if (pg_strcasecmp(defel->defname, "MAXLEN") == 0)
		{
			d->maxlen = atoi(defGetString(defel));

			/* atoi yields 0 for non-numeric input, so this rejects that too */
			if (d->maxlen < 1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("maxlen value has to be >= 1")));
		}
		else if (pg_strcasecmp(defel->defname, "REJECTLONG") == 0)
		{
			d->rejectlong = defGetBoolean(defel);
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized intdict parameter: \"%s\"",
							defel->defname)));
		}
	}

	PG_RETURN_POINTER(d);
}
/*
 * dintdict_lexize
 *		Normalize one integer token according to the dictionary settings.
 *
 * Arguments: 0 = DictInt built by dintdict_init, 1 = token text,
 * 2 = token length in bytes.
 *
 * Returns a two-element TSLexeme array terminated by an entry whose lexeme
 * is NULL.  A token no longer than maxlen is returned as is.  A longer one
 * is either cut down to its first maxlen bytes or, when rejectlong is set,
 * reported as an empty array (both lexemes NULL).
 *
 * The array is allocated zeroed so that every TSLexeme field other than
 * the lexeme pointer has a defined value when the caller inspects it.
 */
Datum
dintdict_lexize(PG_FUNCTION_ARGS)
{
	DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int			len = PG_GETARG_INT32(2);
	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

	if (len > d->maxlen)
	{
		/* reject by returning the still-empty array */
		if (d->rejectlong)
			PG_RETURN_POINTER(res);

		/* trim integer; never let a non-positive maxlen become an index */
		len = (d->maxlen > 0) ? d->maxlen : 0;
	}

	res[0].lexeme = pnstrdup(in, len);

	PG_RETURN_POINTER(res);
}

View File

@ -0,0 +1,29 @@
-- $PostgreSQL: pgsql/contrib/dict_int/dict_int.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
-- Adjust this setting to control where the objects get created.
SET search_path = public;
-- Install script for the integer dictionary: two C functions, the template
-- that ties them together, and a ready-to-use dictionary named intdict.
BEGIN;
-- Builds the dictionary settings from its options.
CREATE FUNCTION dintdict_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Normalizes a single token.
CREATE FUNCTION dintdict_lexize(internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
CREATE TEXT SEARCH TEMPLATE intdict_template (
LEXIZE = dintdict_lexize,
INIT = dintdict_init
);
-- Created without options, so the defaults of the init function apply.
CREATE TEXT SEARCH DICTIONARY intdict (
TEMPLATE = intdict_template
);
COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'dictionary for integers';
END;

View File

@ -0,0 +1,308 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
--lexize
select ts_lexize('intdict', '511673');
ts_lexize
-----------
{511673}
(1 row)
select ts_lexize('intdict', '129');
ts_lexize
-----------
{129}
(1 row)
select ts_lexize('intdict', '40865854');
ts_lexize
-----------
{408658}
(1 row)
select ts_lexize('intdict', '952');
ts_lexize
-----------
{952}
(1 row)
select ts_lexize('intdict', '654980341');
ts_lexize
-----------
{654980}
(1 row)
select ts_lexize('intdict', '09810106');
ts_lexize
-----------
{098101}
(1 row)
select ts_lexize('intdict', '14262713');
ts_lexize
-----------
{142627}
(1 row)
select ts_lexize('intdict', '6532082986');
ts_lexize
-----------
{653208}
(1 row)
select ts_lexize('intdict', '0150061');
ts_lexize
-----------
{015006}
(1 row)
select ts_lexize('intdict', '7778');
ts_lexize
-----------
{7778}
(1 row)
select ts_lexize('intdict', '9547');
ts_lexize
-----------
{9547}
(1 row)
select ts_lexize('intdict', '753395478');
ts_lexize
-----------
{753395}
(1 row)
select ts_lexize('intdict', '647652');
ts_lexize
-----------
{647652}
(1 row)
select ts_lexize('intdict', '6988655574');
ts_lexize
-----------
{698865}
(1 row)
select ts_lexize('intdict', '1279');
ts_lexize
-----------
{1279}
(1 row)
select ts_lexize('intdict', '1266645909');
ts_lexize
-----------
{126664}
(1 row)
select ts_lexize('intdict', '7594193969');
ts_lexize
-----------
{759419}
(1 row)
select ts_lexize('intdict', '16928207');
ts_lexize
-----------
{169282}
(1 row)
select ts_lexize('intdict', '196850350328');
ts_lexize
-----------
{196850}
(1 row)
select ts_lexize('intdict', '22026985592');
ts_lexize
-----------
{220269}
(1 row)
select ts_lexize('intdict', '2063765');
ts_lexize
-----------
{206376}
(1 row)
select ts_lexize('intdict', '242387310');
ts_lexize
-----------
{242387}
(1 row)
select ts_lexize('intdict', '93595');
ts_lexize
-----------
{93595}
(1 row)
select ts_lexize('intdict', '9374');
ts_lexize
-----------
{9374}
(1 row)
select ts_lexize('intdict', '996969');
ts_lexize
-----------
{996969}
(1 row)
select ts_lexize('intdict', '353595982');
ts_lexize
-----------
{353595}
(1 row)
select ts_lexize('intdict', '925860');
ts_lexize
-----------
{925860}
(1 row)
select ts_lexize('intdict', '11848378337');
ts_lexize
-----------
{118483}
(1 row)
select ts_lexize('intdict', '333');
ts_lexize
-----------
{333}
(1 row)
select ts_lexize('intdict', '799287416765');
ts_lexize
-----------
{799287}
(1 row)
select ts_lexize('intdict', '745939');
ts_lexize
-----------
{745939}
(1 row)
select ts_lexize('intdict', '67601305734');
ts_lexize
-----------
{676013}
(1 row)
select ts_lexize('intdict', '3361113');
ts_lexize
-----------
{336111}
(1 row)
select ts_lexize('intdict', '9033778607');
ts_lexize
-----------
{903377}
(1 row)
select ts_lexize('intdict', '7507648');
ts_lexize
-----------
{750764}
(1 row)
select ts_lexize('intdict', '1166');
ts_lexize
-----------
{1166}
(1 row)
select ts_lexize('intdict', '9360498');
ts_lexize
-----------
{936049}
(1 row)
select ts_lexize('intdict', '917795');
ts_lexize
-----------
{917795}
(1 row)
select ts_lexize('intdict', '9387894');
ts_lexize
-----------
{938789}
(1 row)
select ts_lexize('intdict', '42764329');
ts_lexize
-----------
{427643}
(1 row)
select ts_lexize('intdict', '564062');
ts_lexize
-----------
{564062}
(1 row)
select ts_lexize('intdict', '5413377');
ts_lexize
-----------
{541337}
(1 row)
select ts_lexize('intdict', '060965');
ts_lexize
-----------
{060965}
(1 row)
select ts_lexize('intdict', '08273593');
ts_lexize
-----------
{082735}
(1 row)
select ts_lexize('intdict', '593556010144');
ts_lexize
-----------
{593556}
(1 row)
select ts_lexize('intdict', '17988843352');
ts_lexize
-----------
{179888}
(1 row)
select ts_lexize('intdict', '252281774');
ts_lexize
-----------
{252281}
(1 row)
select ts_lexize('intdict', '313425');
ts_lexize
-----------
{313425}
(1 row)
select ts_lexize('intdict', '641439323669');
ts_lexize
-----------
{641439}
(1 row)
select ts_lexize('intdict', '314532610153');
ts_lexize
-----------
{314532}
(1 row)

View File

@ -0,0 +1,61 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
\i dict_int.sql
\set ECHO all
RESET client_min_messages;
--lexize
select ts_lexize('intdict', '511673');
select ts_lexize('intdict', '129');
select ts_lexize('intdict', '40865854');
select ts_lexize('intdict', '952');
select ts_lexize('intdict', '654980341');
select ts_lexize('intdict', '09810106');
select ts_lexize('intdict', '14262713');
select ts_lexize('intdict', '6532082986');
select ts_lexize('intdict', '0150061');
select ts_lexize('intdict', '7778');
select ts_lexize('intdict', '9547');
select ts_lexize('intdict', '753395478');
select ts_lexize('intdict', '647652');
select ts_lexize('intdict', '6988655574');
select ts_lexize('intdict', '1279');
select ts_lexize('intdict', '1266645909');
select ts_lexize('intdict', '7594193969');
select ts_lexize('intdict', '16928207');
select ts_lexize('intdict', '196850350328');
select ts_lexize('intdict', '22026985592');
select ts_lexize('intdict', '2063765');
select ts_lexize('intdict', '242387310');
select ts_lexize('intdict', '93595');
select ts_lexize('intdict', '9374');
select ts_lexize('intdict', '996969');
select ts_lexize('intdict', '353595982');
select ts_lexize('intdict', '925860');
select ts_lexize('intdict', '11848378337');
select ts_lexize('intdict', '333');
select ts_lexize('intdict', '799287416765');
select ts_lexize('intdict', '745939');
select ts_lexize('intdict', '67601305734');
select ts_lexize('intdict', '3361113');
select ts_lexize('intdict', '9033778607');
select ts_lexize('intdict', '7507648');
select ts_lexize('intdict', '1166');
select ts_lexize('intdict', '9360498');
select ts_lexize('intdict', '917795');
select ts_lexize('intdict', '9387894');
select ts_lexize('intdict', '42764329');
select ts_lexize('intdict', '564062');
select ts_lexize('intdict', '5413377');
select ts_lexize('intdict', '060965');
select ts_lexize('intdict', '08273593');
select ts_lexize('intdict', '593556010144');
select ts_lexize('intdict', '17988843352');
select ts_lexize('intdict', '252281774');
select ts_lexize('intdict', '313425');
select ts_lexize('intdict', '641439323669');
select ts_lexize('intdict', '314532610153');

View File

@ -0,0 +1,9 @@
-- Removes the objects created by dict_int.sql, dependents first:
-- dictionary, then its template, then the C functions the template uses.
SET search_path = public;
DROP TEXT SEARCH DICTIONARY intdict;
DROP TEXT SEARCH TEMPLATE intdict_template;
DROP FUNCTION dintdict_init(internal);
DROP FUNCTION dintdict_lexize(internal,internal,internal,internal);

View File

@ -0,0 +1,38 @@
# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
# Build description for the dict_xsyn contrib module.
MODULE_big = dict_xsyn
OBJS = dict_xsyn.o
DATA_built = dict_xsyn.sql
DATA = uninstall_dict_xsyn.sql
DOCS = README.dict_xsyn
REGRESS = dict_xsyn
# Sample rules file(s) and the directory below $(datadir) they are copied to.
DICTDIR = tsearch_data
DICTFILES = xsyn_sample.rules
# With USE_PGXS set, build against an installed server located through
# pg_config; otherwise build inside the source tree under contrib/.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/dict_xsyn
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
# Hook the rules files into the standard install target.
install: install-data
.PHONY: install-data
install-data: $(DICTFILES)
for i in $(DICTFILES); \
do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
done
# Hook removal of the rules files into the standard uninstall target.
uninstall: uninstall-data
.PHONY: uninstall-data
uninstall-data:
for i in $(DICTFILES); \
do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
done

View File

@ -0,0 +1,52 @@
Extended Synonym dictionary
===========================
This is a simple synonym dictionary. It replaces words with groups of their
synonyms, and so makes it possible to search for a word using any of its
synonyms.
* Configuration
It accepts the following options:
- KEEPORIG controls whether the original word is included, or only its
synonyms. Default is 'true'.
- RULES is the base name of the file containing the list of synonyms.
This file must be in $(prefix)/share/tsearch_data/, and its name must
end in ".rules" (which is not included in the RULES parameter).
The rules file has the following format:
- Each line represents a group of synonyms for a single word, which is
given first on the line. Synonyms are separated by whitespace:
word syn1 syn2 syn3
- The sharp ('#') sign is a comment delimiter. It may appear wherever a word
could start (at the beginning of the line or after whitespace). The rest of
the line will be skipped.
Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/,
for an example.
* Usage
1. Compile and install
2. Load dictionary
psql mydb < dict_xsyn.sql
3. Test it
mydb=# SELECT ts_lexize('xsyn','word');
ts_lexize
----------------
{word,syn1,syn2,syn3}
4. Change the dictionary options as you wish
mydb# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
ALTER TEXT SEARCH DICTIONARY
That's all.

View File

@ -0,0 +1,235 @@
/*-------------------------------------------------------------------------
*
* dict_xsyn.c
* Extended synonym dictionary
*
* Copyright (c) 2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "commands/defrem.h"
#include "fmgr.h"
#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
PG_MODULE_MAGIC;
/* One line of the rules file. */
typedef struct
{
	char	   *key;			/* the word being looked up */
	char	   *value;			/* the whole unparsed line: the word followed
								 * by its synonyms */
} Syn;
/* State of one extended-synonym dictionary. */
typedef struct
{
	int			len;			/* number of entries in syn */
	Syn		   *syn;			/* entries, sorted by key once loaded */
	bool		keeporig;		/* also return the looked-up word itself? */
} DictSyn;
/*
 * Entry points of the dictionary template.  Each one is registered with
 * PG_FUNCTION_INFO_V1 and declared before its definition further down.
 */
PG_FUNCTION_INFO_V1(dxsyn_init);
Datum dxsyn_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(dxsyn_lexize);
Datum dxsyn_lexize(PG_FUNCTION_ARGS);
/*
 * find_word
 *		Locate the next whitespace-delimited word in a string.
 *
 * Returns a pointer to the first character of the word and stores in *end
 * a pointer just past its last character.  Returns NULL, with *end set to
 * NULL, when only whitespace remains or the next word starts with '#'.
 * The input string is not modified.
 */
static char *
find_word(char *in, char **end)
{
	char	   *cursor = in;
	char	   *word;

	*end = NULL;

	/* step over leading whitespace, advancing by pg_mblen() each time */
	for (; *cursor != '\0' && t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	/* nothing left, or the remainder is a comment */
	if (*cursor == '\0' || *cursor == '#')
		return NULL;

	word = cursor;

	/* run to the next whitespace character or the end of the string */
	for (; *cursor != '\0' && !t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	*end = cursor;
	return word;
}
static int
compare_syn(const void *a, const void *b)
{
return strcmp(((Syn *) a)->key, ((Syn *) b)->key);
}
/*
 * read_dictionary
 *		Load the synonym rules file named by filename into d.
 *
 * Each usable line is lower-cased and kept whole as Syn.value; its first
 * word is copied into Syn.key.  Empty lines, blank lines and lines whose
 * first word starts with '#' are skipped.  On return d->syn holds d->len
 * entries sorted by key, as required by the bsearch() in dxsyn_lexize.
 *
 * Raises ERRCODE_CONFIG_FILE_ERROR if the file cannot be opened.
 */
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	FILE	   *fin;
	char	   *line;
	int			cur = 0;

	if ((fin = AllocateFile(real_filename, "r")) == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = t_readline(fin)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *end = NULL;

		if (*line == '\0')
		{
			/* free skipped lines as well, so they are not leaked */
			pfree(line);
			continue;
		}

		value = lowerstr(line);
		pfree(line);

		key = find_word(value, &end);
		if (!key)
		{
			/* blank or comment-only line */
			pfree(value);
			continue;
		}

		/* while loading, d->len is the allocated capacity of d->syn */
		if (cur == d->len)
		{
			d->len = (d->len > 0) ? 2 * d->len : 16;
			if (d->syn)
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			else
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
		}

		d->syn[cur].key = pnstrdup(key, end - key);
		d->syn[cur].value = value;

		cur++;
	}

	FreeFile(fin);

	/* from here on d->len is the number of entries actually loaded */
	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
/*
 * dxsyn_init
 *		Create the DictSyn state for one dictionary from its option list.
 *
 * Argument 0 is a List of DefElem options, matched case-insensitively:
 * KEEPORIG (boolean, default true) and RULES (base name of the rules file,
 * loaded immediately through read_dictionary).  Any other option raises
 * ERRCODE_INVALID_PARAMETER_VALUE.  Returns a pointer to the new state.
 */
Datum
dxsyn_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictSyn    *dict = (DictSyn *) palloc0(sizeof(DictSyn));
	ListCell   *cell;

	/* defaults: no rules loaded, original word kept */
	dict->len = 0;
	dict->syn = NULL;
	dict->keeporig = true;

	foreach(cell, dictoptions)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		if (pg_strcasecmp(opt->defname, "KEEPORIG") == 0)
			dict->keeporig = defGetBoolean(opt);
		else if (pg_strcasecmp(opt->defname, "RULES") == 0)
			read_dictionary(dict, defGetString(opt));
		else
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized xsyn parameter: \"%s\"",
							opt->defname)));
	}

	PG_RETURN_POINTER(dict);
}
/*
 * dxsyn_lexize
 *		Replace a word by the synonyms listed for it in the rules file.
 *
 * Arguments: 0 = DictSyn built by dxsyn_init, 1 = token text,
 * 2 = token length in bytes.
 *
 * Returns NULL when the token is empty, no rules are loaded, or the
 * lower-cased token is not a key in the rules.  Otherwise returns a
 * TSLexeme array terminated by an entry whose lexeme is NULL, holding the
 * synonyms in file order; the word itself (the first item of the rule) is
 * included only when keeporig is set.
 *
 * Every array entry is zeroed before use so that TSLexeme fields other
 * than the lexeme pointer have defined values.  The stored rule is scanned
 * in place, and scanning resumes exactly where find_word stopped, which is
 * correct regardless of how many bytes the separating whitespace occupies.
 */
Datum
dxsyn_lexize(PG_FUNCTION_ARGS)
{
	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int			length = PG_GETARG_INT32(2);
	Syn			word;
	Syn		   *found;
	TSLexeme   *res;
	char	   *temp;
	char	   *pos;
	int			nsyns = 0;
	bool		is_first = true;

	if (!length || d->len == 0)
		PG_RETURN_POINTER(NULL);

	/* Create search pattern: keys are stored lower-cased */
	temp = pnstrdup(in, length);
	word.key = lowerstr(temp);
	pfree(temp);
	word.value = NULL;

	/* Look for matching syn */
	found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
	pfree(word.key);

	if (!found)
		PG_RETURN_POINTER(NULL);

	/* Start with just the terminating entry, then grow one word at a time */
	res = palloc0(sizeof(TSLexeme));

	pos = found->value;
	while (*pos != '\0')
	{
		char	   *end;
		char	   *syn = find_word(pos, &end);

		/* end of rule, or start of a trailing comment */
		if (!syn)
			break;

		/* first word is added to result only if KEEPORIG flag is set */
		if (d->keeporig || !is_first)
		{
			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
			/* clear the new entry and the new terminator */
			memset(&res[nsyns], 0, sizeof(TSLexeme) * 2);
			res[nsyns].lexeme = pnstrdup(syn, end - syn);
			nsyns++;
		}

		is_first = false;
		pos = end;
	}

	PG_RETURN_POINTER(res);
}

View File

@ -0,0 +1,29 @@
-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
-- Adjust this setting to control where the objects get created.
SET search_path = public;
-- Install script for the extended synonym dictionary: two C functions, the
-- template that ties them together, and a dictionary named xsyn.
BEGIN;
-- Builds the dictionary state from its options.
CREATE FUNCTION dxsyn_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Looks up a single token.
CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
CREATE TEXT SEARCH TEMPLATE xsyn_template (
LEXIZE = dxsyn_lexize,
INIT = dxsyn_init
);
-- Created without a RULES option; set one with ALTER TEXT SEARCH DICTIONARY
-- before use.
CREATE TEXT SEARCH DICTIONARY xsyn (
TEMPLATE = xsyn_template
);
COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
END;

View File

@ -0,0 +1,22 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
--configuration
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
----------------
{sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)

View File

@ -0,0 +1,16 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
\i dict_xsyn.sql
\set ECHO all
RESET client_min_messages;
--configuration
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'grb');

View File

@ -0,0 +1,9 @@
-- Removes the objects created by dict_xsyn.sql, dependents first:
-- dictionary, then its template, then the C functions the template uses.
SET search_path = public;
DROP TEXT SEARCH DICTIONARY xsyn;
DROP TEXT SEARCH TEMPLATE xsyn_template;
DROP FUNCTION dxsyn_init(internal);
DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal);

View File

@ -0,0 +1,6 @@
# Sample rules file for eXtended Synonym (xsyn) dictionary
# format is as follows:
#
# word synonym1 synonym2 ...
#
supernova sn sne 1987a

View File

@ -0,0 +1,19 @@
# $PostgreSQL: pgsql/contrib/test_parser/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
# Build description for the test_parser contrib module: one object file,
# an install script generated from test_parser.sql.in, an uninstall script,
# a README and one regression test.
MODULE_big = test_parser
OBJS = test_parser.o
DATA_built = test_parser.sql
DATA = uninstall_test_parser.sql
DOCS = README.test_parser
REGRESS = test_parser
# With USE_PGXS set, build against an installed server located through
# pg_config; otherwise build inside the source tree under contrib/.
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/test_parser
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

View File

@ -0,0 +1,52 @@
Example parser
==============
This is an example of a custom parser for full text search.
It recognizes space-delimited words and returns only two token types:
- 3, word, Word
- 12, blank, Space symbols
The token numbers have been chosen to keep compatibility with the default
ts_headline() function, since we do not want to implement our own version.
* Configuration
The parser has no user-configurable parameters.
* Usage
1. Compile and install
2. Load parser
psql mydb < test_parser.sql
3. Test it
mydb# SELECT * FROM ts_parse('testparser','That''s my first own parser');
tokid | token
-------+--------
3 | That's
12 |
3 | my
12 |
3 | first
12 |
3 | own
12 |
3 | parser
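mydb# CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
CREATE TEXT SEARCH CONFIGURATION
mydb# ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
ALTER TEXT SEARCH CONFIGURATION
mydb# SELECT to_tsvector('testcfg','That''s my first own parser');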
to_tsvector
-------------------------------------------------
'my':2 'own':4 'first':3 'parser':5 'that''s':1
mydb# SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'stars'));
ts_headline
-----------------------------------------------------------------
Supernovae <b>stars</b> are the brightest phenomena in galaxies
That's all.

View File

@ -0,0 +1,50 @@
--
-- first, define the parser. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
-- make test configuration using parser
CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
-- ts_parse
SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
tokid | token
-------+-----------------------
3 | That's
12 |
3 | simple
12 |
3 | parser
12 |
3 | can't
12 |
3 | parse
12 |
3 | urls
12 |
3 | like
12 |
3 | http://some.url/here/
(15 rows)
SELECT to_tsvector('testcfg','That''s my first own parser');
to_tsvector
-------------------------------------------------
'my':2 'own':4 'first':3 'parser':5 'that''s':1
(1 row)
SELECT to_tsquery('testcfg', 'star');
to_tsquery
------------
'star'
(1 row)
SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies',
to_tsquery('testcfg', 'stars'));
ts_headline
-----------------------------------------------------------------
Supernovae <b>stars</b> are the brightest phenomena in galaxies
(1 row)

View File

@ -0,0 +1,26 @@
--
-- first, define the parser. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
\i test_parser.sql
\set ECHO all
RESET client_min_messages;
-- make test configuration using parser
CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
-- ts_parse
SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
SELECT to_tsvector('testcfg','That''s my first own parser');
SELECT to_tsquery('testcfg', 'star');
SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies',
to_tsquery('testcfg', 'stars'));

View File

@ -0,0 +1,130 @@
/*-------------------------------------------------------------------------
*
* test_parser.c
* Simple example of a text search parser
*
* Copyright (c) 2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/test_parser/test_parser.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
PG_MODULE_MAGIC;
/*
* types
*/
/* Parser state, private to this module; one is created per parsed text. */
typedef struct
{
	char	   *buffer;			/* text being parsed */
	int			len;			/* number of bytes in buffer */
	int			pos;			/* offset of the next unread byte */
} ParserState;
/* Description of one token type; copied from wparser.h of tsearch2. */
typedef struct
{
	int			lexid;			/* token type number */
	char	   *alias;			/* short name of the token type */
	char	   *descr;			/* human-readable description */
} LexDescr;
/*
* prototypes
*/
/*
 * Entry points of the parser.  Each one is registered with
 * PG_FUNCTION_INFO_V1 and declared before its definition further down.
 */
PG_FUNCTION_INFO_V1(testprs_start);
Datum testprs_start(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_getlexeme);
Datum testprs_getlexeme(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_end);
Datum testprs_end(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_lextype);
Datum testprs_lextype(PG_FUNCTION_ARGS);
/*
* functions
*/
/*
 * testprs_start
 *		Begin parsing a text: argument 0 is the text, argument 1 its length.
 *
 * Returns a new ParserState positioned at the start of the text.  The text
 * itself is referenced, not copied.
 */
Datum
testprs_start(PG_FUNCTION_ARGS)
{
	char	   *text = (char *) PG_GETARG_POINTER(0);
	int			textlen = PG_GETARG_INT32(1);
	ParserState *state;

	state = (ParserState *) palloc0(sizeof(ParserState));
	state->buffer = text;
	state->len = textlen;
	state->pos = 0;

	PG_RETURN_POINTER(state);
}
/*
 * testprs_getlexeme
 *		Return the next token of the text being parsed.
 *
 * Arguments: 0 = ParserState from testprs_start, 1 = output pointer that
 * receives the start of the token, 2 = output pointer that receives the
 * token length in bytes.
 *
 * A token is either a run of space characters (type 12) or a run of
 * non-space characters (type 3).  Returns the token type, or 0 once the
 * whole text has been consumed.
 *
 * The position is always compared with the text length before the buffer
 * is read, so no byte beyond the len bytes handed to testprs_start is ever
 * examined.
 */
Datum
testprs_getlexeme(PG_FUNCTION_ARGS)
{
	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);
	int			startpos = pst->pos;
	int			type;

	*t = pst->buffer + pst->pos;

	if (pst->pos < pst->len && pst->buffer[pst->pos] == ' ')
	{
		/* blank type */
		type = 12;
		/* go to the next non-white-space character */
		while (pst->pos < pst->len && pst->buffer[pst->pos] == ' ')
			pst->pos++;
	}
	else
	{
		/* word type */
		type = 3;
		/* go to the next white-space character */
		while (pst->pos < pst->len && pst->buffer[pst->pos] != ' ')
			pst->pos++;
	}

	*tlen = pst->pos - startpos;

	/* an empty token means the end of the text was already reached */
	if (*tlen == 0)
		type = 0;

	PG_RETURN_INT32(type);
}
/*
 * testprs_end
 *		Finish parsing: release the ParserState passed as argument 0.
 */
Datum
testprs_end(PG_FUNCTION_ARGS)
{
	ParserState *state = (ParserState *) PG_GETARG_POINTER(0);

	/* only the state is freed here, not the text it points to */
	pfree(state);

	PG_RETURN_VOID();
}
Datum testprs_lextype(PG_FUNCTION_ARGS)
{
/*
* Remarks:
* - we have to return the blanks for headline reason
* - we use the same lexids like Teodor in the default
* word parser; in this way we can reuse the headline
* function of the default word parser.
*/
LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
/* there are only two types in this parser */
descr[0].lexid = 3;
descr[0].alias = pstrdup("word");
descr[0].descr = pstrdup("Word");
descr[1].lexid = 12;
descr[1].alias = pstrdup("blank");
descr[1].descr = pstrdup("Space symbols");
descr[2].lexid = 0;
PG_RETURN_POINTER(descr);
}

View File

@ -0,0 +1,36 @@
-- $PostgreSQL: pgsql/contrib/test_parser/test_parser.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
-- Adjust this setting to control where the objects get created.
SET search_path = public;
-- Install script for the sample parser: four C functions and the parser
-- definition that ties them together.
BEGIN;
-- Starts parsing a text of the given length.
CREATE FUNCTION testprs_start(internal, int4)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Returns the next token and its type.
CREATE FUNCTION testprs_getlexeme(internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Releases the parser state.
CREATE FUNCTION testprs_end(internal)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- Lists the token types the parser can return.
CREATE FUNCTION testprs_lextype(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- The headline function is taken from pg_catalog rather than defined here.
CREATE TEXT SEARCH PARSER testparser (
START = testprs_start,
GETTOKEN = testprs_getlexeme,
END = testprs_end,
HEADLINE = pg_catalog.prsd_headline,
LEXTYPES = testprs_lextype
);
END;

View File

@ -0,0 +1,11 @@
-- Removes the objects created by test_parser.sql: the parser first, then
-- the C functions it refers to.
SET search_path = public;
DROP TEXT SEARCH PARSER testparser;
DROP FUNCTION testprs_start(internal, int4);
DROP FUNCTION testprs_getlexeme(internal, internal, internal);
DROP FUNCTION testprs_end(internal);
DROP FUNCTION testprs_lextype(internal);