1999-07-22 17:09:15 +02:00
|
|
|
<!--
|
2010-05-11 18:07:42 +02:00
|
|
|
$PostgreSQL: pgsql/doc/src/sgml/ref/cluster.sgml,v 1.50 2010/05/11 16:07:42 rhaas Exp $
|
2001-12-08 04:24:40 +01:00
|
|
|
PostgreSQL documentation
|
1999-07-22 17:09:15 +02:00
|
|
|
-->
|
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<refentry id="SQL-CLUSTER">
|
|
|
|
<refmeta>
|
2010-04-03 09:23:02 +02:00
|
|
|
<refentrytitle>CLUSTER</refentrytitle>
|
2008-11-14 11:22:48 +01:00
|
|
|
<manvolnum>7</manvolnum>
|
1999-07-06 19:16:42 +02:00
|
|
|
<refmiscinfo>SQL - Language Statements</refmiscinfo>
|
|
|
|
</refmeta>
|
2003-04-15 15:25:08 +02:00
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<refnamediv>
|
2003-04-15 15:25:08 +02:00
|
|
|
<refname>CLUSTER</refname>
|
|
|
|
<refpurpose>cluster a table according to an index</refpurpose>
|
1998-12-29 03:24:47 +01:00
|
|
|
</refnamediv>
|
2003-04-15 15:25:08 +02:00
|
|
|
|
2003-08-31 19:32:24 +02:00
|
|
|
<indexterm zone="sql-cluster">
|
|
|
|
<primary>CLUSTER</primary>
|
|
|
|
</indexterm>
|
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<refsynopsisdiv>
|
2003-04-15 15:25:08 +02:00
|
|
|
<synopsis>
|
2009-09-19 12:23:27 +02:00
|
|
|
CLUSTER [VERBOSE] <replaceable class="PARAMETER">table_name</replaceable> [ USING <replaceable class="PARAMETER">index_name</replaceable> ]
|
2008-11-24 09:46:04 +01:00
|
|
|
CLUSTER [VERBOSE]
|
2003-04-15 15:25:08 +02:00
|
|
|
</synopsis>
|
1999-07-06 19:16:42 +02:00
|
|
|
</refsynopsisdiv>
|
|
|
|
|
2003-04-15 15:25:08 +02:00
|
|
|
<refsect1>
|
|
|
|
<title>Description</title>
|
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<para>
|
2010-02-07 21:48:13 +01:00
|
|
|
<command>CLUSTER</command> instructs <productname>PostgreSQL</productname>
|
2001-01-14 00:58:55 +01:00
|
|
|
to cluster the table specified
|
2009-09-19 12:23:27 +02:00
|
|
|
by <replaceable class="parameter">table_name</replaceable>
|
1999-07-06 19:16:42 +02:00
|
|
|
based on the index specified by
|
2009-09-19 12:23:27 +02:00
|
|
|
<replaceable class="parameter">index_name</replaceable>. The index must
|
2010-02-07 21:48:13 +01:00
|
|
|
already have been defined on
|
2009-09-19 12:23:27 +02:00
|
|
|
<replaceable class="parameter">table_name</replaceable>.
|
1999-07-06 19:16:42 +02:00
|
|
|
</para>
|
1999-07-22 17:09:15 +02:00
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<para>
|
2001-01-14 00:58:55 +01:00
|
|
|
When a table is clustered, it is physically reordered
|
2002-08-11 04:43:57 +02:00
|
|
|
based on the index information. Clustering is a one-time operation:
|
|
|
|
when the table is subsequently updated, the changes are
|
|
|
|
not clustered. That is, no attempt is made to store new or
|
2007-04-08 04:07:35 +02:00
|
|
|
updated rows according to their index order. (If one wishes, one can
|
|
|
|
periodically recluster by issuing the command again. Also, setting
|
2010-02-07 21:48:13 +01:00
|
|
|
the table's <literal>FILLFACTOR</literal> storage parameter to less than
|
|
|
|
100% can aid in preserving cluster ordering during updates, since updated
|
|
|
|
rows are kept on the same page if enough space is available there.)
|
1999-07-06 19:16:42 +02:00
|
|
|
</para>
|
|
|
|
|
2002-11-15 04:09:39 +01:00
|
|
|
<para>
|
|
|
|
When a table is clustered, <productname>PostgreSQL</productname>
|
2007-04-08 04:07:35 +02:00
|
|
|
remembers which index it was clustered by. The form
|
2009-09-19 12:23:27 +02:00
|
|
|
<command>CLUSTER <replaceable class="parameter">table_name</replaceable></command>
|
2010-05-11 18:07:42 +02:00
|
|
|
reclusters the table using the same index as before. You can also
|
|
|
|
use the <literal>CLUSTER</literal> or <literal>SET WITHOUT CLUSTER</literal>
|
|
|
|
forms of <xref linkend="SQL-ALTERTABLE"> to set the index to be used for
|
|
|
|
future cluster operations, or to clear any previous setting.
|
2002-11-15 04:09:39 +01:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2007-04-08 04:07:35 +02:00
|
|
|
<command>CLUSTER</command> without any parameter reclusters all the
|
|
|
|
previously-clustered tables in the current database that the calling user
|
|
|
|
owns, or all such tables if called by a superuser. This
|
2006-10-31 02:52:31 +01:00
|
|
|
form of <command>CLUSTER</command> cannot be executed inside a transaction
|
|
|
|
block.
|
2002-11-15 04:09:39 +01:00
|
|
|
</para>
|
|
|
|
|
2003-02-19 05:06:28 +01:00
|
|
|
<para>
|
|
|
|
When a table is being clustered, an <literal>ACCESS
|
|
|
|
EXCLUSIVE</literal> lock is acquired on it. This prevents any other
|
2003-11-02 13:59:54 +01:00
|
|
|
database operations (both reads and writes) from operating on the
|
|
|
|
table until the <command>CLUSTER</command> is finished.
|
2003-02-19 05:06:28 +01:00
|
|
|
</para>
|
2003-04-15 15:25:08 +02:00
|
|
|
</refsect1>
|
|
|
|
|
|
|
|
<refsect1>
|
2003-09-12 02:12:47 +02:00
|
|
|
<title>Parameters</title>
|
2003-04-15 15:25:08 +02:00
|
|
|
|
|
|
|
<variablelist>
|
|
|
|
<varlistentry>
|
2009-09-19 12:23:27 +02:00
|
|
|
<term><replaceable class="PARAMETER">table_name</replaceable></term>
|
2003-04-15 15:25:08 +02:00
|
|
|
<listitem>
|
|
|
|
<para>
|
2007-04-08 02:26:34 +02:00
|
|
|
The name (possibly schema-qualified) of a table.
|
2003-04-15 15:25:08 +02:00
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</varlistentry>
|
|
|
|
|
|
|
|
<varlistentry>
|
2009-09-19 12:23:27 +02:00
|
|
|
<term><replaceable class="PARAMETER">index_name</replaceable></term>
|
2003-04-15 15:25:08 +02:00
|
|
|
<listitem>
|
|
|
|
<para>
|
2007-04-08 02:26:34 +02:00
|
|
|
The name of an index.
|
2003-04-15 15:25:08 +02:00
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</varlistentry>
|
2008-11-24 09:46:04 +01:00
|
|
|
|
|
|
|
<varlistentry>
|
|
|
|
<term><literal>VERBOSE</literal></term>
|
|
|
|
<listitem>
|
|
|
|
<para>
|
|
|
|
Prints a progress report as each table is clustered.
|
|
|
|
</para>
|
|
|
|
</listitem>
|
|
|
|
</varlistentry>
|
2003-04-15 15:25:08 +02:00
|
|
|
</variablelist>
|
|
|
|
</refsect1>
|
|
|
|
|
|
|
|
<refsect1>
|
|
|
|
<title>Notes</title>
|
1999-07-06 19:16:42 +02:00
|
|
|
|
|
|
|
<para>
|
|
|
|
In cases where you are accessing single rows randomly
|
2003-04-15 15:25:08 +02:00
|
|
|
within a table, the actual order of the data in the
|
1999-07-06 19:16:42 +02:00
|
|
|
table is unimportant. However, if you tend to access some
|
|
|
|
data more than others, and there is an index that groups
|
|
|
|
them together, you will benefit from using <command>CLUSTER</command>.
|
2003-04-15 15:25:08 +02:00
|
|
|
If you are requesting a range of indexed values from a table, or a
|
1999-07-06 19:16:42 +02:00
|
|
|
single indexed value that has multiple rows that match,
|
|
|
|
<command>CLUSTER</command> will help because once the index identifies the
|
2006-11-04 20:03:51 +01:00
|
|
|
table page for the first row that matches, all other rows
|
|
|
|
that match are probably already on the same table page,
|
2003-04-15 15:25:08 +02:00
|
|
|
and so you save disk accesses and speed up the query.
|
1999-07-06 19:16:42 +02:00
|
|
|
</para>
|
1998-10-30 20:34:40 +01:00
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<para>
|
2002-08-11 04:43:57 +02:00
|
|
|
During the cluster operation, a temporary copy of the table is created
|
|
|
|
that contains the table data in the index order. Temporary copies of
|
|
|
|
each index on the table are created as well. Therefore, you need free
|
|
|
|
space on disk at least equal to the sum of the table size and the index
|
|
|
|
sizes.
|
|
|
|
</para>
|
|
|
|
|
2002-11-15 04:09:39 +01:00
|
|
|
<para>
|
2002-12-30 19:42:17 +01:00
|
|
|
Because <command>CLUSTER</command> remembers the clustering information,
|
|
|
|
one can cluster the tables one wants clustered manually the first time, and
|
|
|
|
set up a timed event similar to <command>VACUUM</command> so that the tables
|
2003-04-15 15:25:08 +02:00
|
|
|
are periodically reclustered.
|
2002-08-11 04:43:57 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
2004-03-23 14:21:41 +01:00
|
|
|
Because the planner records statistics about the ordering of
|
2010-04-03 09:23:02 +02:00
|
|
|
tables, it is advisable to run <xref linkend="sql-analyze">
|
|
|
|
on the newly clustered table.
|
Update reference documentation on may/can/might:
Standard English uses "may", "can", and "might" in different ways:
may - permission, "You may borrow my rake."
can - ability, "I can lift that log."
might - possibility, "It might rain today."
Unfortunately, in conversational English, their use is often mixed, as
in, "You may use this variable to do X", when in fact, "can" is a better
choice. Similarly, "It may crash" is better stated, "It might crash".
2007-02-01 00:26:05 +01:00
|
|
|
Otherwise, the planner might make poor choices of query plans.
|
2002-08-11 04:43:57 +02:00
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
|
|
|
There is another way to cluster data. The
|
2006-11-04 20:03:51 +01:00
|
|
|
<command>CLUSTER</command> command reorders the original table by
|
|
|
|
scanning it using the index you specify. This can be slow
|
|
|
|
on large tables because the rows are fetched from the table
|
|
|
|
in index order, and if the table is disordered, the
|
1999-07-06 19:16:42 +02:00
|
|
|
entries are on random pages, so there is one disk page
|
2006-11-04 20:03:51 +01:00
|
|
|
retrieved for every row moved. (<productname>PostgreSQL</productname> has
|
|
|
|
a cache, but the majority of a big table will not fit in the cache.)
|
2007-02-01 01:28:19 +01:00
|
|
|
The other way to cluster a table is to use:
|
1999-07-22 17:09:15 +02:00
|
|
|
|
2003-04-15 15:25:08 +02:00
|
|
|
<programlisting>
|
|
|
|
CREATE TABLE <replaceable class="parameter">newtable</replaceable> AS
|
2006-11-04 20:03:51 +01:00
|
|
|
SELECT * FROM <replaceable class="parameter">table</replaceable> ORDER BY <replaceable class="parameter">columnlist</replaceable>;
|
2003-04-15 15:25:08 +02:00
|
|
|
</programlisting>
|
1999-07-22 17:09:15 +02:00
|
|
|
|
2006-11-04 20:03:51 +01:00
|
|
|
which uses the <productname>PostgreSQL</productname> sorting code
|
|
|
|
to produce the desired order;
|
|
|
|
this is usually much faster than an index scan for disordered data.
|
|
|
|
Then you drop the old table, use
|
2003-04-15 15:25:08 +02:00
|
|
|
<command>ALTER TABLE ... RENAME</command>
|
2006-11-04 20:03:51 +01:00
|
|
|
to rename <replaceable class="parameter">newtable</replaceable> to the
|
|
|
|
old name, and recreate the table's indexes.
|
|
|
|
The big disadvantage of this approach is that it does not preserve
|
2002-08-11 04:43:57 +02:00
|
|
|
OIDs, constraints, foreign key relationships, granted privileges, and
|
2004-11-15 07:32:15 +01:00
|
|
|
other ancillary properties of the table &mdash; all such items must be
|
2006-11-04 20:03:51 +01:00
|
|
|
manually recreated. Another disadvantage is that this way requires a sort
|
|
|
|
temporary file about the same size as the table itself, so peak disk usage
|
|
|
|
is about three times the table size instead of twice the table size.
|
2002-08-10 22:43:46 +02:00
|
|
|
</para>
|
1998-12-29 03:24:47 +01:00
|
|
|
</refsect1>
|
1999-07-06 19:16:42 +02:00
|
|
|
|
2003-04-15 15:25:08 +02:00
|
|
|
<refsect1>
|
|
|
|
<title>Examples</title>
|
|
|
|
|
1999-07-06 19:16:42 +02:00
|
|
|
<para>
|
2003-04-15 15:25:08 +02:00
|
|
|
Cluster the table <literal>employees</literal> on the basis of
|
2007-04-08 02:26:34 +02:00
|
|
|
its index <literal>employees_ind</literal>:
|
2003-04-15 15:25:08 +02:00
|
|
|
<programlisting>
|
2007-04-08 02:26:34 +02:00
|
|
|
CLUSTER employees USING employees_ind;
|
2003-04-15 15:25:08 +02:00
|
|
|
</programlisting>
|
|
|
|
</para>
|
|
|
|
|
2002-11-15 04:09:39 +01:00
|
|
|
<para>
|
2004-03-23 14:21:41 +01:00
|
|
|
Cluster the <literal>employees</literal> table using the same
|
2003-02-19 05:06:28 +01:00
|
|
|
index that was used before:
|
2003-04-15 15:25:08 +02:00
|
|
|
<programlisting>
|
2007-04-08 02:26:34 +02:00
|
|
|
CLUSTER employees;
|
2003-04-15 15:25:08 +02:00
|
|
|
</programlisting>
|
|
|
|
</para>
|
|
|
|
|
2002-11-15 04:09:39 +01:00
|
|
|
<para>
|
2005-01-04 01:39:53 +01:00
|
|
|
Cluster all tables in the database that have previously been clustered:
|
2003-04-15 15:25:08 +02:00
|
|
|
<programlisting>
|
2002-11-18 18:12:07 +01:00
|
|
|
CLUSTER;
|
2003-04-15 15:25:08 +02:00
|
|
|
</programlisting>
|
|
|
|
</para>
|
1999-07-06 19:16:42 +02:00
|
|
|
</refsect1>
|
|
|
|
|
2003-04-15 15:25:08 +02:00
|
|
|
<refsect1>
|
|
|
|
<title>Compatibility</title>
|
|
|
|
|
|
|
|
<para>
|
2007-04-08 04:07:35 +02:00
|
|
|
There is no <command>CLUSTER</command> statement in the SQL standard.
|
|
|
|
</para>
|
|
|
|
|
|
|
|
<para>
|
|
|
|
The syntax
|
2007-04-08 02:26:34 +02:00
|
|
|
<synopsis>
|
2009-09-19 12:23:27 +02:00
|
|
|
CLUSTER <replaceable class="PARAMETER">index_name</replaceable> ON <replaceable class="PARAMETER">table_name</replaceable>
|
2007-04-08 02:26:34 +02:00
|
|
|
</synopsis>
|
2007-04-08 04:07:35 +02:00
|
|
|
is also supported for compatibility with pre-8.3 <productname>PostgreSQL</productname>
|
|
|
|
versions.
|
2003-04-15 15:25:08 +02:00
|
|
|
</para>
|
1998-05-13 07:34:00 +02:00
|
|
|
</refsect1>
|
2003-02-19 05:06:28 +01:00
|
|
|
|
|
|
|
<refsect1>
|
|
|
|
<title>See Also</title>
|
|
|
|
|
|
|
|
<simplelist type="inline">
|
2010-04-03 09:23:02 +02:00
|
|
|
<member><xref linkend="app-clusterdb"></member>
|
2003-02-19 05:06:28 +01:00
|
|
|
</simplelist>
|
|
|
|
</refsect1>
|
1999-07-06 19:16:42 +02:00
|
|
|
</refentry>
|