Bring SIMILAR TO and SUBSTRING into some semblance of conformance with

the SQL99 standard.  (I'm not sure that the character-class features are
quite right, but that can be fixed later.)  Document SQL99 and POSIX
regexps as being different features; provide variants of SUBSTRING for
each.
This commit is contained in:
Tom Lane 2002-09-22 17:27:25 +00:00
parent ac355d558e
commit 9946b83ded
7 changed files with 683 additions and 381 deletions

View File

@ -1,5 +1,5 @@
<!--
$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.124 2002/09/21 18:32:53 petere Exp $
$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.125 2002/09/22 17:27:23 tgl Exp $
PostgreSQL documentation
-->
@ -921,18 +921,31 @@ PostgreSQL documentation
</row>
<row>
<entry><function>substring</function>(<parameter>string</parameter> <optional>from <replaceable>pattern</replaceable></optional> <optional>for <replaceable>escape</replaceable></optional>)</entry>
<entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable>)</entry>
<entry><type>text</type></entry>
<entry>
extract regular expression
extract substring matching POSIX regular expression
<indexterm>
<primary>substring</primary>
</indexterm>
</entry>
<entry><literal>substring('Thomas' from 'mas$' for <optional>escape '\\'</optional>)</literal></entry>
<entry><literal>substring('Thomas' from '...$')</literal></entry>
<entry><literal>mas</literal></entry>
</row>
<row>
<entry><function>substring</function>(<parameter>string</parameter> from <replaceable>pattern</replaceable> for <replaceable>escape</replaceable>)</entry>
<entry><type>text</type></entry>
<entry>
extract substring matching SQL99 regular expression
<indexterm>
<primary>substring</primary>
</indexterm>
</entry>
<entry><literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal></entry>
<entry><literal>oma</literal></entry>
</row>
<row>
<entry>
<function>trim</function>(<optional>leading | trailing | both</optional>
@ -960,6 +973,328 @@ PostgreSQL documentation
</tgroup>
</table>
<para>
Additional string manipulation functions are available and are
listed below. Some of them are used internally to implement the
<acronym>SQL</acronym>-standard string functions listed above.
</para>
<table id="functions-string-other">
<title>Other String Functions</title>
<tgroup cols="5">
<thead>
<row>
<entry>Function</entry>
<entry>Return Type</entry>
<entry>Description</entry>
<entry>Example</entry>
<entry>Result</entry>
</row>
</thead>
<tbody>
<row>
<entry><function>ascii</function>(<type>text</type>)</entry>
<entry>integer</entry>
<entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry>
<entry><literal>ascii('x')</literal></entry>
<entry><literal>120</literal></entry>
</row>
<row>
<entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>
Remove (trim) the longest string consisting only of characters
in <parameter>trim</parameter> from the start and end of
<parameter>string</parameter>.
</entry>
<entry><literal>btrim('xyxtrimyyx','xy')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>chr</function>(<type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry>
<entry><literal>chr(65)</literal></entry>
<entry><literal>A</literal></entry>
</row>
<row>
<entry>
<function>convert</function>(<parameter>string</parameter>
<type>text</type>,
<optional><parameter>src_encoding</parameter> <type>name</type>,</optional>
<parameter>dest_encoding</parameter> <type>name</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Converts string using <parameter>dest_encoding</parameter>.
The original encoding is specified by
<parameter>src_encoding</parameter>. If
<parameter>src_encoding</parameter> is omitted, database
encoding is assumed.
</entry>
<entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry>
<entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry>
</row>
<row>
<entry>
<function>decode</function>(<parameter>string</parameter> <type>text</type>,
<parameter>type</parameter> <type>text</type>)
</entry>
<entry><type>bytea</type></entry>
<entry>
Decodes binary data from <parameter>string</parameter> previously
encoded with encode(). Parameter type is same as in encode().
</entry>
<entry><literal>decode('MTIzAAE=', 'base64')</literal></entry>
<entry><literal>123\000\001</literal></entry>
</row>
<row>
<entry>
<function>encode</function>(<parameter>data</parameter> <type>bytea</type>,
<parameter>type</parameter> <type>text</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Encodes binary data to <acronym>ASCII</acronym>-only representation. Supported
types are: 'base64', 'hex', 'escape'.
</entry>
<entry><literal>encode('123\\000\\001', 'base64')</literal></entry>
<entry><literal>MTIzAAE=</literal></entry>
</row>
<row>
<entry><function>initcap</function>(<type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>Converts first letter of each word (whitespace separated) to upper case.</entry>
<entry><literal>initcap('hi thomas')</literal></entry>
<entry><literal>Hi Thomas</literal></entry>
</row>
<row>
<entry><function>length</function>(<parameter>string</parameter>)</entry>
<entry><type>integer</type></entry>
<entry>
length of string
<indexterm>
<primary>character strings</primary>
<secondary>length</secondary>
</indexterm>
<indexterm>
<primary>length</primary>
<secondary>character strings</secondary>
<see>character strings, length</see>
</indexterm>
</entry>
<entry><literal>length('jose')</literal></entry>
<entry><literal>4</literal></entry>
</row>
<row>
<entry>
<function>lpad</function>(<parameter>string</parameter> <type>text</type>,
<parameter>length</parameter> <type>integer</type>
<optional>, <parameter>fill</parameter> <type>text</type></optional>)
</entry>
<entry>text</entry>
<entry>
Fills up the <parameter>string</parameter> to length
<parameter>length</parameter> by prepending the characters
<parameter>fill</parameter> (a space by default). If the
<parameter>string</parameter> is already longer than
<parameter>length</parameter> then it is truncated (on the
right).
</entry>
<entry><literal>lpad('hi', 5, 'xy')</literal></entry>
<entry><literal>xyxhi</literal></entry>
</row>
<row>
<entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>
Removes the longest string containing only characters from
<parameter>trim</parameter> from the start of the string.
</entry>
<entry><literal>ltrim('zzzytrim','xyz')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>pg_client_encoding</function>()</entry>
<entry><type>name</type></entry>
<entry>
Returns current client encoding name.
</entry>
<entry><literal>pg_client_encoding()</literal></entry>
<entry><literal>SQL_ASCII</literal></entry>
</row>
<row>
<entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Returns the given string suitably quoted to be used as an identifier
in an SQL query string.
Quotes are added only if necessary (i.e., if the string contains
non-identifier characters or would be case-folded).
Embedded quotes are properly doubled.
</entry>
<entry><literal>quote_ident('Foo')</literal></entry>
<entry><literal>"Foo"</literal></entry>
</row>
<row>
<entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Returns the given string suitably quoted to be used as a literal
in an SQL query string.
Embedded quotes and backslashes are properly doubled.
</entry>
<entry><literal>quote_literal('O\'Reilly')</literal></entry>
<entry><literal>'O''Reilly'</literal></entry>
</row>
<row>
<entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Repeat text a number of times.</entry>
<entry><literal>repeat('Pg', 4)</literal></entry>
<entry><literal>PgPgPgPg</literal></entry>
</row>
<row>
<entry><function>replace</function>(<parameter>string</parameter> <type>text</type>,
<parameter>from</parameter> <type>text</type>,
<parameter>to</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>Replace all occurrences in <parameter>string</parameter> of substring
<parameter>from</parameter> with substring <parameter>to</parameter>
</entry>
<entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry>
<entry><literal>abXXefabXXef</literal></entry>
</row>
<row>
<entry>
<function>rpad</function>(<parameter>string</parameter> <type>text</type>,
<parameter>length</parameter> <type>integer</type>
<optional>, <parameter>fill</parameter> <type>text</type></optional>)
</entry>
<entry><type>text</type></entry>
<entry>
Fills up the <parameter>string</parameter> to length
<parameter>length</parameter> by appending the characters
<parameter>fill</parameter> (a space by default). If the
<parameter>string</parameter> is already longer than
<parameter>length</parameter> then it is truncated.
</entry>
<entry><literal>rpad('hi', 5, 'xy')</literal></entry>
<entry><literal>hixyx</literal></entry>
</row>
<row>
<entry><function>rtrim</function>(<parameter>string</parameter>
text, <parameter>trim</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Removes the longest string containing only characters from
<parameter>trim</parameter> from the end of the string.
</entry>
<entry><literal>rtrim('trimxxxx','x')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>,
<parameter>delimiter</parameter> <type>text</type>,
<parameter>column</parameter> <type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter>
returning the resulting (one based) <parameter>column</parameter> number.
</entry>
<entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry>
<entry><literal>def</literal></entry>
</row>
<row>
<entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry>
<entry><type>text</type></entry>
<entry>
Locates specified substring. (same as
<literal>position(<parameter>substring</parameter> in
<parameter>string</parameter>)</literal>, but note the reversed
argument order)
</entry>
<entry><literal>strpos('high','ig')</literal></entry>
<entry><literal>2</literal></entry>
</row>
<row>
<entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry>
<entry><type>text</type></entry>
<entry>
Extracts specified substring. (same as
<literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>)
</entry>
<entry><literal>substr('alphabet', 3, 2)</literal></entry>
<entry><literal>ph</literal></entry>
</row>
<row>
<entry><function>to_ascii</function>(<type>text</type>
<optional>, <parameter>encoding</parameter></optional>)</entry>
<entry><type>text</type></entry>
<entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry>
<entry><literal>to_ascii('Karel')</literal></entry>
<entry><literal>Karel</literal></entry>
</row>
<row>
<entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type>
or <type>bigint</type>)</entry>
<entry><type>text</type></entry>
<entry>Convert <parameter>number</parameter> to its equivalent hexadecimal
representation.
</entry>
<entry><literal>to_hex(9223372036854775807::bigint)</literal></entry>
<entry><literal>7fffffffffffffff</literal></entry>
</row>
<row>
<entry>
<function>translate</function>(<parameter>string</parameter>
<type>text</type>,
<parameter>from</parameter> <type>text</type>,
<parameter>to</parameter> <type>text</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Any character in <parameter>string</parameter> that matches a
character in the <parameter>from</parameter> set is replaced by
the corresponding character in the <parameter>to</parameter>
set.
</entry>
<entry><literal>translate('12345', '14', 'ax')</literal></entry>
<entry><literal>a23x5</literal></entry>
</row>
</tbody>
</tgroup>
</table>
<para>
The <function>to_ascii</function> function supports conversion from
LATIN1, LATIN2, WIN1250 (CP1250) only.
</para>
<table id="conversion-names">
<title>Available conversion names</title>
<tgroup cols="3">
@ -1660,326 +1995,6 @@ PostgreSQL documentation
</tgroup>
</table>
<para>
Additional string manipulation functions are available and are
listed below. Some of them are used internally to implement the
<acronym>SQL</acronym>-standard string functions listed above.
</para>
<table id="functions-string-other">
<title>Other String Functions</title>
<tgroup cols="5">
<thead>
<row>
<entry>Function</entry>
<entry>Return Type</entry>
<entry>Description</entry>
<entry>Example</entry>
<entry>Result</entry>
</row>
</thead>
<tbody>
<row>
<entry><function>ascii</function>(<type>text</type>)</entry>
<entry>integer</entry>
<entry>Returns the <acronym>ASCII</acronym> code of the first character of the argument.</entry>
<entry><literal>ascii('x')</literal></entry>
<entry><literal>120</literal></entry>
</row>
<row>
<entry><function>btrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>trim</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>
Remove (trim) the longest string consisting only of characters
in <parameter>trim</parameter> from the start and end of
<parameter>string</parameter>.
</entry>
<entry><literal>btrim('xyxtrimyyx','xy')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>chr</function>(<type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Returns the character with the given <acronym>ASCII</acronym> code.</entry>
<entry><literal>chr(65)</literal></entry>
<entry><literal>A</literal></entry>
</row>
<row>
<entry>
<function>convert</function>(<parameter>string</parameter>
<type>text</type>,
<optional><parameter>src_encoding</parameter> <type>name</type>,</optional>
<parameter>dest_encoding</parameter> <type>name</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Converts string using <parameter>dest_encoding</parameter>.
The original encoding is specified by
<parameter>src_encoding</parameter>. If
<parameter>src_encoding</parameter> is omitted, database
encoding is assumed.
</entry>
<entry><literal>convert('text_in_unicode', 'UNICODE', 'LATIN1')</literal></entry>
<entry><literal>text_in_unicode</literal> represented in ISO 8859-1</entry>
</row>
<row>
<entry>
<function>decode</function>(<parameter>string</parameter> <type>text</type>,
<parameter>type</parameter> <type>text</type>)
</entry>
<entry><type>bytea</type></entry>
<entry>
Decodes binary data from <parameter>string</parameter> previously
encoded with encode(). Parameter type is same as in encode().
</entry>
<entry><literal>decode('MTIzAAE=', 'base64')</literal></entry>
<entry><literal>123\000\001</literal></entry>
</row>
<row>
<entry>
<function>encode</function>(<parameter>data</parameter> <type>bytea</type>,
<parameter>type</parameter> <type>text</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Encodes binary data to <acronym>ASCII</acronym>-only representation. Supported
types are: 'base64', 'hex', 'escape'.
</entry>
<entry><literal>encode('123\\000\\001', 'base64')</literal></entry>
<entry><literal>MTIzAAE=</literal></entry>
</row>
<row>
<entry><function>initcap</function>(<type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>Converts first letter of each word (whitespace separated) to upper case.</entry>
<entry><literal>initcap('hi thomas')</literal></entry>
<entry><literal>Hi Thomas</literal></entry>
</row>
<row>
<entry><function>length</function>(<parameter>string</parameter>)</entry>
<entry><type>integer</type></entry>
<entry>
length of string
<indexterm>
<primary>character strings</primary>
<secondary>length</secondary>
</indexterm>
<indexterm>
<primary>length</primary>
<secondary>character strings</secondary>
<see>character strings, length</see>
</indexterm>
</entry>
<entry><literal>length('jose')</literal></entry>
<entry><literal>4</literal></entry>
</row>
<row>
<entry>
<function>lpad</function>(<parameter>string</parameter> <type>text</type>,
<parameter>length</parameter> <type>integer</type>
<optional>, <parameter>fill</parameter> <type>text</type></optional>)
</entry>
<entry>text</entry>
<entry>
Fills up the <parameter>string</parameter> to length
<parameter>length</parameter> by prepending the characters
<parameter>fill</parameter> (a space by default). If the
<parameter>string</parameter> is already longer than
<parameter>length</parameter> then it is truncated (on the
right).
</entry>
<entry><literal>lpad('hi', 5, 'xy')</literal></entry>
<entry><literal>xyxhi</literal></entry>
</row>
<row>
<entry><function>ltrim</function>(<parameter>string</parameter> <type>text</type>, <parameter>text</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>
Removes the longest string containing only characters from
<parameter>trim</parameter> from the start of the string.
</entry>
<entry><literal>ltrim('zzzytrim','xyz')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>pg_client_encoding</function>()</entry>
<entry><type>name</type></entry>
<entry>
Returns current client encoding name.
</entry>
<entry><literal>pg_client_encoding()</literal></entry>
<entry><literal>SQL_ASCII</literal></entry>
</row>
<row>
<entry><function>quote_ident</function>(<parameter>string</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Returns the given string suitably quoted to be used as an identifier
in an SQL query string.
Quotes are added only if necessary (i.e., if the string contains
non-identifier characters or would be case-folded).
Embedded quotes are properly doubled.
</entry>
<entry><literal>quote_ident('Foo')</literal></entry>
<entry><literal>"Foo"</literal></entry>
</row>
<row>
<entry><function>quote_literal</function>(<parameter>string</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Returns the given string suitably quoted to be used as a literal
in an SQL query string.
Embedded quotes and backslashes are properly doubled.
</entry>
<entry><literal>quote_literal('O\'Reilly')</literal></entry>
<entry><literal>'O''Reilly'</literal></entry>
</row>
<row>
<entry><function>repeat</function>(<type>text</type>, <type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Repeat text a number of times.</entry>
<entry><literal>repeat('Pg', 4)</literal></entry>
<entry><literal>PgPgPgPg</literal></entry>
</row>
<row>
<entry><function>replace</function>(<parameter>string</parameter> <type>text</type>,
<parameter>from</parameter> <type>text</type>,
<parameter>to</parameter> <type>text</type>)</entry>
<entry><type>text</type></entry>
<entry>Replace all occurrences in <parameter>string</parameter> of substring
<parameter>from</parameter> with substring <parameter>to</parameter>
</entry>
<entry><literal>replace('abcdefabcdef', 'cd', 'XX')</literal></entry>
<entry><literal>abXXefabXXef</literal></entry>
</row>
<row>
<entry>
<function>rpad</function>(<parameter>string</parameter> <type>text</type>,
<parameter>length</parameter> <type>integer</type>
<optional>, <parameter>fill</parameter> <type>text</type></optional>)
</entry>
<entry><type>text</type></entry>
<entry>
Fills up the <parameter>string</parameter> to length
<parameter>length</parameter> by appending the characters
<parameter>fill</parameter> (a space by default). If the
<parameter>string</parameter> is already longer than
<parameter>length</parameter> then it is truncated.
</entry>
<entry><literal>rpad('hi', 5, 'xy')</literal></entry>
<entry><literal>hixyx</literal></entry>
</row>
<row>
<entry><function>rtrim</function>(<parameter>string</parameter>
text, <parameter>trim</parameter> text)</entry>
<entry><type>text</type></entry>
<entry>
Removes the longest string containing only characters from
<parameter>trim</parameter> from the end of the string.
</entry>
<entry><literal>rtrim('trimxxxx','x')</literal></entry>
<entry><literal>trim</literal></entry>
</row>
<row>
<entry><function>split_part</function>(<parameter>string</parameter> <type>text</type>,
<parameter>delimiter</parameter> <type>text</type>,
<parameter>column</parameter> <type>integer</type>)</entry>
<entry><type>text</type></entry>
<entry>Split <parameter>string</parameter> on <parameter>delimiter</parameter>
returning the resulting (one based) <parameter>column</parameter> number.
</entry>
<entry><literal>split_part('abc~@~def~@~ghi','~@~',2)</literal></entry>
<entry><literal>def</literal></entry>
</row>
<row>
<entry><function>strpos</function>(<parameter>string</parameter>, <parameter>substring</parameter>)</entry>
<entry><type>text</type></entry>
<entry>
Locates specified substring. (same as
<literal>position(<parameter>substring</parameter> in
<parameter>string</parameter>)</literal>, but note the reversed
argument order)
</entry>
<entry><literal>strpos('high','ig')</literal></entry>
<entry><literal>2</literal></entry>
</row>
<row>
<entry><function>substr</function>(<parameter>string</parameter>, <parameter>from</parameter> <optional>, <parameter>count</parameter></optional>)</entry>
<entry><type>text</type></entry>
<entry>
Extracts specified substring. (same as
<literal>substring(<parameter>string</parameter> from <parameter>from</parameter> for <parameter>count</parameter>)</literal>)
</entry>
<entry><literal>substr('alphabet', 3, 2)</literal></entry>
<entry><literal>ph</literal></entry>
</row>
<row>
<entry><function>to_ascii</function>(<type>text</type>
<optional>, <parameter>encoding</parameter></optional>)</entry>
<entry><type>text</type></entry>
<entry>Converts text from multibyte encoding to <acronym>ASCII</acronym>.</entry>
<entry><literal>to_ascii('Karel')</literal></entry>
<entry><literal>Karel</literal></entry>
</row>
<row>
<entry><function>to_hex</function>(<parameter>number</parameter> <type>integer</type>
or <type>bigint</type>)</entry>
<entry><type>text</type></entry>
<entry>Convert <parameter>number</parameter> to its equivalent hexadecimal
representation.
</entry>
<entry><literal>to_hex(9223372036854775807::bigint)</literal></entry>
<entry><literal>7fffffffffffffff</literal></entry>
</row>
<row>
<entry>
<function>translate</function>(<parameter>string</parameter>
<type>text</type>,
<parameter>from</parameter> <type>text</type>,
<parameter>to</parameter> <type>text</type>)
</entry>
<entry><type>text</type></entry>
<entry>
Any character in <parameter>string</parameter> that matches a
character in the <parameter>from</parameter> set is replaced by
the corresponding character in the <parameter>to</parameter>
set.
</entry>
<entry><literal>translate('12345', '14', 'ax')</literal></entry>
<entry><literal>a23x5</literal></entry>
</row>
</tbody>
</tgroup>
</table>
<para>
The <function>to_ascii</function> function supports conversion from
LATIN1, LATIN2, WIN1250 (CP1250) only.
</para>
</sect1>
@ -2171,16 +2186,16 @@ PostgreSQL documentation
<title>Pattern Matching</title>
<para>
There are two separate approaches to pattern matching provided by
There are three separate approaches to pattern matching provided by
<productname>PostgreSQL</productname>: the traditional
<acronym>SQL</acronym>
<function>LIKE</function> operator and the more recent
<function>LIKE</function> operator, the more recent
<acronym>SQL99</acronym>
<function>SIMILAR TO</function> operator implementing
<function>SIMILAR TO</function> operator, and
<acronym>POSIX</acronym>-style regular expressions.
Additionally, a pattern matching function,
<function>SUBSTRING</function>, is available, as defined in
<acronym>SQL99</acronym>.
<function>SUBSTRING</function>, is available, using either
<acronym>SQL99</acronym>-style or POSIX-style regular expressions.
</para>
<tip>
@ -2190,13 +2205,6 @@ PostgreSQL documentation
</para>
</tip>
<para>
Both <function>LIKE</function> and <function>SIMILAR TO</function>
are SQL-standard operators which are also available in alternate
forms as <productname>PostgreSQL</productname> operators; look at
<literal>~</literal> and <literal>~~</literal> for examples.
</para>
<sect2 id="functions-like">
<title><function>LIKE</function></title>
@ -2296,11 +2304,142 @@ PostgreSQL documentation
</sect2>
<sect2 id="functions-regexp">
<title><function>SIMILAR TO</function> and <acronym>POSIX</acronym>
<sect2 id="functions-sql99-regexp">
<title><function>SIMILAR TO</function> and <acronym>SQL99</acronym>
Regular Expressions</title>
<indexterm zone="functions-regexp">
<indexterm zone="functions-sql99-regexp">
<primary>regular expressions</primary>
<seealso>pattern matching</seealso>
</indexterm>
<indexterm>
<primary>similar to</primary>
</indexterm>
<indexterm>
<primary>substring</primary>
</indexterm>
<synopsis>
<replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
<replaceable>string</replaceable> NOT SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
</synopsis>
<para>
The <function>SIMILAR TO</function> operator returns true or false
depending on whether its pattern matches the given string. It is
much like <function>LIKE</function>, except that it interprets the
pattern using <acronym>SQL99</acronym>'s definition of a regular
expression.
<acronym>SQL99</acronym>'s regular expressions are a curious cross
between <function>LIKE</function> notation and common regular expression
notation.
</para>
<para>
Like <function>LIKE</function>, the <function>SIMILAR TO</function>
operator succeeds only if its pattern matches the entire string;
this is unlike common regular expression practice, wherein the pattern
may match any part of the string.
Also like
<function>LIKE</function>, <function>SIMILAR TO</function> uses
<literal>%</> and <literal>_</> as wildcard characters denoting
any string and any single character, respectively (these are
comparable to <literal>.*</> and <literal>.</> in POSIX regular
expressions).
</para>
<para>
In addition to these facilities borrowed from <function>LIKE</function>,
<function>SIMILAR TO</function> supports these pattern-matching
metacharacters borrowed from POSIX regular expressions:
<itemizedlist>
<listitem>
<para>
<literal>|</literal> denotes alternation (either of two alternatives).
</para>
</listitem>
<listitem>
<para>
<literal>*</literal> denotes repetition of the previous item zero
or more times.
</para>
</listitem>
<listitem>
<para>
<literal>+</literal> denotes repetition of the previous item one
or more times.
</para>
</listitem>
<listitem>
<para>
Parentheses <literal>()</literal> may be used to group items into
a single logical item.
</para>
</listitem>
<listitem>
<para>
A bracket expression <literal>[...]</literal> specifies a character
class, just as in POSIX regular expressions.
</para>
</listitem>
</itemizedlist>
Notice that bounded repetition (<literal>?</> and <literal>{...}</>)
are not provided, though they exist in POSIX. Also, dot (<literal>.</>)
is not a metacharacter.
</para>
<para>
As with <function>LIKE</>, a backslash disables the special meaning
of any of these metacharacters; or a different escape character can
be specified with <literal>ESCAPE</>.
</para>
<informalexample>
<para>
Some examples:
<programlisting>
'abc' SIMILAR TO 'abc' <lineannotation>true</lineannotation>
'abc' SIMILAR TO 'a' <lineannotation>false</lineannotation>
'abc' SIMILAR TO '%(b|d)%' <lineannotation>true</lineannotation>
'abc' SIMILAR TO '(b|c)%' <lineannotation>false</lineannotation>
</programlisting>
</para>
</informalexample>
<para>
The SUBSTRING function with three parameters,
<function>SUBSTRING</function>(<parameter>string</parameter> FROM
<replaceable>pattern</replaceable> FOR
<replaceable>escape</replaceable>), provides extraction of a substring
that matches a SQL99 regular expression pattern. As with SIMILAR TO,
the specified pattern must match to the entire data string, else the
function fails and returns NULL. To indicate the part of the pattern
that should be returned on success, SQL99 specifies that the pattern
must contain two occurrences of the escape character followed by
double quote (<literal>"</>). The text matching the portion of the
pattern between these markers is returned.
</para>
<informalexample>
<para>
Some examples:
<programlisting>
SUBSTRING('foobar' FROM '%#"o_b#"%' FOR '#') <lineannotation>oob</lineannotation>
SUBSTRING('foobar' FROM '#"o_b#"%' FOR '#') <lineannotation>NULL</lineannotation>
</programlisting>
</para>
</informalexample>
</sect2>
<sect2 id="functions-posix-regexp">
<title><acronym>POSIX</acronym> Regular Expressions</title>
<indexterm zone="functions-posix-regexp">
<primary>regular expressions</primary>
<seealso>pattern matching</seealso>
</indexterm>
@ -2341,12 +2480,6 @@ PostgreSQL documentation
<entry>Does not match regular expression, case insensitive</entry>
<entry><literal>'thomas' !~* '.*vadim.*'</literal></entry>
</row>
<row>
<entry> <literal>SIMILAR TO</literal> </entry>
<entry>Matches regular expression, case sensitive</entry>
<entry><literal>'thomas' SIMILAR TO '.*thomas.*'</literal></entry>
</row>
</tbody>
</tgroup>
</table>
@ -2354,7 +2487,8 @@ PostgreSQL documentation
<para>
<acronym>POSIX</acronym> regular expressions provide a more
powerful means for
pattern matching than the <function>LIKE</function> function.
pattern matching than the <function>LIKE</function> and
<function>SIMILAR TO</> operators.
Many Unix tools such as <command>egrep</command>,
<command>sed</command>, or <command>awk</command> use a pattern
matching language that is similar to the one described here.
@ -2379,10 +2513,34 @@ PostgreSQL documentation
<para>
Some examples:
<programlisting>
'abc' SIMILAR TO 'abc' <lineannotation>true</lineannotation>
'abc' SIMILAR TO '^a' <lineannotation>true</lineannotation>
'abc' SIMILAR TO '(b|d)' <lineannotation>true</lineannotation>
'abc' SIMILAR TO '^(b|c)' <lineannotation>false</lineannotation>
'abc' ~ 'abc' <lineannotation>true</lineannotation>
'abc' ~ '^a' <lineannotation>true</lineannotation>
'abc' ~ '(b|d)' <lineannotation>true</lineannotation>
'abc' ~ '^(b|c)' <lineannotation>false</lineannotation>
</programlisting>
</para>
</informalexample>
<para>
The SUBSTRING function with two parameters,
<function>SUBSTRING</function>(<parameter>string</parameter> FROM
<replaceable>pattern</replaceable>), provides extraction of a substring
that matches a POSIX regular expression pattern. It returns NULL if
there is no match, otherwise the portion of the text that matched the
pattern. But if the pattern contains any parentheses, the portion
of the text that matched the first parenthesized subexpression (the
one whose left parenthesis comes first) is
returned. You can always put parentheses around the whole expression
if you want to use parentheses within it without triggering this
exception.
</para>
<informalexample>
<para>
Some examples:
<programlisting>
SUBSTRING('foobar' FROM 'o.b') <lineannotation>oob</lineannotation>
SUBSTRING('foobar' FROM 'o(.)b') <lineannotation>o</lineannotation>
</programlisting>
</para>
</informalexample>

View File

@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.367 2002/09/18 21:35:21 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/parser/gram.y,v 2.368 2002/09/22 17:27:23 tgl Exp $
*
* HISTORY
* AUTHOR DATE MAJOR EVENT
@ -5644,22 +5644,40 @@ a_expr: c_expr { $$ = $1; }
}
| a_expr SIMILAR TO a_expr %prec SIMILAR
{ $$ = (Node *) makeSimpleA_Expr(OP, "~", $1, $4); }
{
A_Const *c = makeNode(A_Const);
FuncCall *n = makeNode(FuncCall);
c->val.type = T_Null;
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($4, (Node *) c);
n->agg_star = FALSE;
n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
}
| a_expr SIMILAR TO a_expr ESCAPE a_expr
{
FuncCall *n = makeNode(FuncCall);
n->funcname = SystemFuncName("like_escape");
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($4, $6);
n->agg_star = FALSE;
n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "~", $1, (Node *) n);
}
| a_expr NOT SIMILAR TO a_expr %prec SIMILAR
{ $$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, $5); }
{
A_Const *c = makeNode(A_Const);
FuncCall *n = makeNode(FuncCall);
c->val.type = T_Null;
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($5, (Node *) c);
n->agg_star = FALSE;
n->agg_distinct = FALSE;
$$ = (Node *) makeSimpleA_Expr(OP, "!~", $1, (Node *) n);
}
| a_expr NOT SIMILAR TO a_expr ESCAPE a_expr
{
FuncCall *n = makeNode(FuncCall);
n->funcname = SystemFuncName("like_escape");
n->funcname = SystemFuncName("similar_escape");
n->args = makeList2($5, $7);
n->agg_star = FALSE;
n->agg_distinct = FALSE;

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.42 2002/09/04 20:31:28 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $
*
* Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance
@ -317,8 +317,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
char *sterm;
int len;
bool match;
int nmatch = 1;
regmatch_t pmatch;
regmatch_t pmatch[2];
/* be sure sterm is null-terminated */
len = VARSIZE(s) - VARHDRSZ;
@ -327,21 +326,131 @@ textregexsubstr(PG_FUNCTION_ARGS)
sterm[len] = '\0';
/*
* We need the match info back from the pattern match to be able to
* actually extract the substring. It seems to be adequate to pass in
* a structure to return only one result.
* We pass two regmatch_t structs to get info about the overall match
* and the match for the first parenthesized subexpression (if any).
* If there is a parenthesized subexpression, we return what it matched;
* else return what the whole regexp matched.
*/
match = RE_compile_and_execute(p, sterm, REG_EXTENDED, nmatch, &pmatch);
match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch);
pfree(sterm);
/* match? then return the substring matching the pattern */
if (match)
{
int so,
eo;
so = pmatch[1].rm_so;
eo = pmatch[1].rm_eo;
if (so < 0 || eo < 0)
{
/* no parenthesized subexpression */
so = pmatch[0].rm_so;
eo = pmatch[0].rm_eo;
}
return (DirectFunctionCall3(text_substr,
PointerGetDatum(s),
Int32GetDatum(pmatch.rm_so + 1),
Int32GetDatum(pmatch.rm_eo - pmatch.rm_so)));
Int32GetDatum(so + 1),
Int32GetDatum(eo - so)));
}
PG_RETURN_NULL();
}
/* similar_escape()
* Convert a SQL99 regexp pattern to POSIX style, so it can be used by
* our regexp engine.
*/
Datum
similar_escape(PG_FUNCTION_ARGS)
{
text *pat_text;
text *esc_text;
text *result;
unsigned char *p,
*e,
*r;
int plen,
elen;
bool afterescape = false;
int nquotes = 0;
/* This function is not strict, so must test explicitly */
if (PG_ARGISNULL(0))
PG_RETURN_NULL();
pat_text = PG_GETARG_TEXT_P(0);
p = VARDATA(pat_text);
plen = (VARSIZE(pat_text) - VARHDRSZ);
if (PG_ARGISNULL(1))
{
/* No ESCAPE clause provided; default to backslash as escape */
e = "\\";
elen = 1;
}
else
{
esc_text = PG_GETARG_TEXT_P(1);
e = VARDATA(esc_text);
elen = (VARSIZE(esc_text) - VARHDRSZ);
if (elen == 0)
e = NULL; /* no escape character */
else if (elen != 1)
elog(ERROR, "ESCAPE string must be empty or one character");
}
/* We need room for ^, $, and up to 2 output bytes per input byte */
result = (text *) palloc(VARHDRSZ + 2 + 2 * plen);
r = VARDATA(result);
*r++ = '^';
while (plen > 0)
{
unsigned char pchar = *p;
if (afterescape)
{
if (pchar == '"') /* for SUBSTRING patterns */
*r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
else
{
*r++ = '\\';
*r++ = pchar;
}
afterescape = false;
}
else if (e && pchar == *e)
{
/* SQL99 escape character; do not send to output */
afterescape = true;
}
else if (pchar == '%')
{
*r++ = '.';
*r++ = '*';
}
else if (pchar == '_')
{
*r++ = '.';
}
else if (pchar == '\\' || pchar == '.' || pchar == '?' ||
pchar == '{')
{
*r++ = '\\';
*r++ = pchar;
}
else
{
*r++ = pchar;
}
p++, plen--;
}
*r++ = '$';
VARATT_SIZEP(result) = r - ((unsigned char *) result);
PG_RETURN_TEXT_P(result);
}

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: pg_proc.h,v 1.272 2002/09/18 21:35:23 tgl Exp $
* $Id: pg_proc.h,v 1.273 2002/09/22 17:27:23 tgl Exp $
*
* NOTES
* The script catalog/genbki.sh reads this file and generates .bki
@ -2076,6 +2076,9 @@ DESCR("convert int4 to char");
DATA(insert OID = 1622 ( repeat PGNSP PGUID 12 f f t f i 2 25 "25 23" repeat - _null_ ));
DESCR("replicate string int4 times");
DATA(insert OID = 1623 ( similar_escape PGNSP PGUID 12 f f f f i 2 25 "25 25" similar_escape - _null_ ));
DESCR("convert SQL99 regexp pattern to POSIX style");
DATA(insert OID = 1624 ( mul_d_interval PGNSP PGUID 12 f f t f i 2 1186 "701 1186" mul_d_interval - _null_ ));
DATA(insert OID = 1633 ( texticlike PGNSP PGUID 12 f f t f i 2 16 "25 25" texticlike - _null_ ));
@ -2087,7 +2090,7 @@ DESCR("matches LIKE expression, case-insensitive");
DATA(insert OID = 1636 ( nameicnlike PGNSP PGUID 12 f f t f i 2 16 "19 25" nameicnlike - _null_ ));
DESCR("does not match LIKE expression, case-insensitive");
DATA(insert OID = 1637 ( like_escape PGNSP PGUID 12 f f t f i 2 25 "25 25" like_escape - _null_ ));
DESCR("convert match pattern to use backslash escapes");
DESCR("convert LIKE pattern to use backslash escapes");
DATA(insert OID = 1689 ( update_pg_pwd_and_pg_group PGNSP PGUID 12 f f t f v 0 2279 "" update_pg_pwd_and_pg_group - _null_ ));
DESCR("update pg_pwd and pg_group files");
@ -2784,7 +2787,7 @@ DESCR("matches LIKE expression");
DATA(insert OID = 2008 ( notlike PGNSP PGUID 12 f f t f i 2 16 "17 17" byteanlike - _null_ ));
DESCR("does not match LIKE expression");
DATA(insert OID = 2009 ( like_escape PGNSP PGUID 12 f f t f i 2 17 "17 17" like_escape_bytea - _null_ ));
DESCR("convert match pattern to use backslash escapes");
DESCR("convert LIKE pattern to use backslash escapes");
DATA(insert OID = 2010 ( length PGNSP PGUID 12 f f t f i 1 23 "17" byteaoctetlen - _null_ ));
DESCR("octet length");
DATA(insert OID = 2011 ( byteacat PGNSP PGUID 12 f f t f i 2 17 "17 17" byteacat - _null_ ));
@ -2889,9 +2892,9 @@ DATA(insert OID = 2072 ( date_mi_interval PGNSP PGUID 14 f f t f i 2 1114 "1082
DESCR("subtract");
DATA(insert OID = 2073 ( substring PGNSP PGUID 12 f f t f i 2 25 "25 25" textregexsubstr - _null_ ));
DESCR("substitutes regular expression");
DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, like_escape($2, $3))" - _null_ ));
DESCR("substitutes regular expression with escape argument");
DESCR("extracts text matching regular expression");
DATA(insert OID = 2074 ( substring PGNSP PGUID 14 f f t f i 3 25 "25 25 25" "select substring($1, similar_escape($2, $3))" - _null_ ));
DESCR("extracts text matching SQL99 regular expression");
DATA(insert OID = 2075 ( bit PGNSP PGUID 12 f f t f i 1 1560 "20" bitfromint8 - _null_ ));
DESCR("int8 to bitstring");

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: builtins.h,v 1.201 2002/09/19 22:48:34 tgl Exp $
* $Id: builtins.h,v 1.202 2002/09/22 17:27:25 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -371,6 +371,7 @@ extern Datum nameicregexne(PG_FUNCTION_ARGS);
extern Datum texticregexeq(PG_FUNCTION_ARGS);
extern Datum texticregexne(PG_FUNCTION_ARGS);
extern Datum textregexsubstr(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS);
/* regproc.c */
extern Datum regprocin(PG_FUNCTION_ARGS);

View File

@ -142,15 +142,15 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
t
(1 row)
-- T581 regular expression substring
SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd";
-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd
-----
bcd
(1 row)
-- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
True
------
t
@ -175,8 +175,16 @@ SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
t
(1 row)
-- PostgreSQL extention to allow omitting the escape character
SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde";
-- PostgreSQL extension to allow omitting the escape character;
-- here the regexp is taken as Posix syntax
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
cde
-----
cde
(1 row)
-- With a parenthesized subexpression, return only what matches the subexpr
SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
cde
-----
cde

View File

@ -62,19 +62,24 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring
SELECT SUBSTRING('abcdefg' FROM '(b|f).*(d)' FOR '#') AS "bcd";
-- T581 regular expression substring (with SQL99's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '(1|2|3)' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
-- Null inputs should return NULL
SELECT SUBSTRING('abcdefg' FROM '(b|c)' FOR NULL) IS NULL AS "True";
SELECT SUBSTRING(NULL FROM '(b|c)' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
-- PostgreSQL extention to allow omitting the escape character
SELECT SUBSTRING('abcdefg' FROM '(c|d).e') AS "cde";
-- PostgreSQL extension to allow omitting the escape character;
-- here the regexp is taken as Posix syntax
SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde";
-- With a parenthesized subexpression, return only what matches the subexpr
SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde";
-- E021-11 position expression
SELECT POSITION('4' IN '1234567890') = '4' AS "4";