Allow using huge TLB pages on Linux (MAP_HUGETLB)

This patch adds an option, huge_tlb_pages, which allows requesting the
shared memory segment to be allocated using huge pages, by using the
MAP_HUGETLB flag in mmap(). This can improve performance.

The default is 'try', which means that we will attempt using huge pages,
and fall back to non-huge pages if it doesn't work. Currently, only Linux
has MAP_HUGETLB. On other platforms, the default 'try' behaves the same as
'off'.

In passing, don't try to round the mmap() size to a multiple of
pagesize. mmap() doesn't require that, and there's no particular reason for
PostgreSQL to do that either. When using MAP_HUGETLB, however, round the
request size up to the nearest 2MB boundary. This is to work around a bug in
some Linux kernel versions, but also to avoid wasting memory, because the
kernel will round the size up anyway.

Many people were involved in writing this patch, including Christian Kruse,
Richard Poole, Abhijit Menon-Sen, reviewed by Peter Geoghegan, Andres Freund
and me.
This commit is contained in:
Heikki Linnakangas 2014-01-29 13:44:45 +02:00
parent b7643b19f0
commit 1a3458b6d8
6 changed files with 183 additions and 39 deletions

View File

@ -1107,6 +1107,43 @@ include 'filename'
</listitem>
</varlistentry>
<varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
<term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
<indexterm>
<primary><varname>huge_tlb_pages</> configuration parameter</primary>
</indexterm>
<listitem>
<para>
Enables/disables the use of huge TLB pages. Valid values are
<literal>try</literal> (the default), <literal>on</literal>,
and <literal>off</literal>.
</para>
<para>
At present, this feature is supported only on Linux. The setting
is ignored on other systems.
</para>
<para>
The use of huge TLB pages results in smaller page tables and
less CPU time spent on memory management, increasing performance. For
more details, see
<ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
Remember that you will need at least shared_buffers / huge page size +
1 huge TLB pages. So, for example, for a system with 6GB shared buffers
and a huge page size of 2MB, you will need at least 3073 huge pages.
</para>
<para>
With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
the server will try to use huge pages, but fall back to using
normal allocation if that fails. With <literal>on</literal>, failure
to use huge pages will prevent the server from starting up. With
<literal>off</literal>, huge pages will not be used.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
<term><varname>temp_buffers</varname> (<type>integer</type>)</term>
<indexterm>

View File

@ -32,6 +32,7 @@
#include "portability/mem.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
#include "utils/guc.h"
typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
@ -41,7 +42,7 @@ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
unsigned long UsedShmemSegID = 0;
void *UsedShmemSegAddr = NULL;
static Size AnonymousShmemSize;
static void *AnonymousShmem;
static void *AnonymousShmem = NULL;
static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
return true;
}
/*
 * Creates an anonymous mmap()ed shared memory segment.
 *
 * Pass the requested size in *size.  This function will modify *size to the
 * actual size of the allocation, if it ends up allocating a segment that is
 * larger than requested.
 *
 * The huge_tlb_pages setting decides how the mapping is attempted:
 *	HUGE_TLB_ON		map with MAP_HUGETLB only; raise an ERROR if the platform
 *					does not define MAP_HUGETLB.
 *	HUGE_TLB_TRY	map with MAP_HUGETLB where available, and retry with a
 *					plain mapping if that fails.
 *	HUGE_TLB_OFF	plain mapping only.
 *
 * Returns the address of the mapping.  If no mapping could be established,
 * the failure is reported with ereport(FATAL).
 */
#ifndef EXEC_BACKEND
static void *
CreateAnonymousSegment(Size *size)
{
	/* Size actually passed to mmap(); starts out as the caller's request. */
	Size		allocsize = *size;
	void	   *ptr = MAP_FAILED;

#ifndef MAP_HUGETLB
	if (huge_tlb_pages == HUGE_TLB_ON)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("huge TLB pages not supported on this platform")));
#else
	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
	{
		/*
		 * Round up the request size to a suitable large value.
		 *
		 * Some Linux kernel versions are known to have a bug, which causes
		 * mmap() with MAP_HUGETLB to fail if the request size is not a
		 * multiple of any supported huge page size. To work around that, we
		 * round up the request size to nearest 2MB. 2MB is the most common
		 * huge page size on affected systems.
		 *
		 * Aside from that bug, even with a kernel that does the allocation
		 * correctly, rounding it up ourselves avoids wasting memory. Without
		 * it, if we for example make an allocation of 2MB + 1 bytes, the
		 * kernel might decide to use two 2MB huge pages for that, and waste 2
		 * MB - 1 of memory. When we do the rounding ourselves, we can use
		 * that space for allocations.
		 */
		Size		hugepagesize = (Size) 2 * 1024 * 1024;

		if (allocsize % hugepagesize != 0)
			allocsize += hugepagesize - (allocsize % hugepagesize);

		/*
		 * Request the rounded-up size, not the caller's original size:
		 * otherwise the rounding above has no effect on the mapping, yet
		 * *size would still be reported back as the larger value.
		 */
		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
	}
#endif

	if (huge_tlb_pages == HUGE_TLB_OFF ||
		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
	{
		/* Plain mapping: no rounding, use exactly what was requested. */
		allocsize = *size;
		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
	}

	if (ptr == MAP_FAILED)
		ereport(FATAL,
				(errmsg("could not map anonymous shared memory: %m"),
				 (errno == ENOMEM) ?
				 errhint("This error usually means that PostgreSQL's request "
					"for a shared memory segment exceeded available memory, "
					  "swap space or huge pages. To reduce the request size "
						 "(currently %zu bytes), reduce PostgreSQL's shared "
					   "memory usage, perhaps by reducing shared_buffers or "
						 "max_connections.",
						 allocsize) : 0));

	/* Tell the caller how much was really mapped. */
	*size = allocsize;
	return ptr;
}
#endif
/*
* PGSharedMemoryCreate
@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
PGShmemHeader *hdr;
IpcMemoryId shmid;
struct stat statbuf;
Size sysvsize = size;
Size sysvsize;
#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
if (huge_tlb_pages == HUGE_TLB_ON)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("huge TLB pages not supported on this platform")));
#endif
/* Room for a header? */
Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
* to run many copies of PostgreSQL without needing to adjust system
* settings.
*
* We assume that no one will attempt to run PostgreSQL 9.3 or later on
* systems that are ancient enough that anonymous shared memory is not
* supported, such as pre-2.4 versions of Linux. If that turns out to be
* false, we might need to add a run-time test here and do this only if
* the running kernel supports it.
*
* However, we disable this logic in the EXEC_BACKEND case, and fall back
* to the old method of allocating the entire segment using System V
* shared memory, because there's no way to attach an mmap'd segment to a
@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
* developer use, this shouldn't be a big problem.
*/
#ifndef EXEC_BACKEND
{
long pagesize = sysconf(_SC_PAGE_SIZE);
AnonymousShmem = CreateAnonymousSegment(&size);
AnonymousShmemSize = size;
/*
* Ensure request size is a multiple of pagesize.
*
* pagesize will, for practical purposes, always be a power of two.
* But just in case it isn't, we do it this way instead of using
* TYPEALIGN().
*/
if (pagesize > 0 && size % pagesize != 0)
size += pagesize - (size % pagesize);
/*
* We assume that no one will attempt to run PostgreSQL 9.3 or later
* on systems that are ancient enough that anonymous shared memory is
* not supported, such as pre-2.4 versions of Linux. If that turns
* out to be false, we might need to add a run-time test here and do
* this only if the running kernel supports it.
*/
AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-1, 0);
if (AnonymousShmem == MAP_FAILED)
ereport(FATAL,
(errmsg("could not map anonymous shared memory: %m"),
(errno == ENOMEM) ?
errhint("This error usually means that PostgreSQL's request "
"for a shared memory segment exceeded available memory "
"or swap space. To reduce the request size (currently "
"%zu bytes), reduce PostgreSQL's shared memory usage, "
"perhaps by reducing shared_buffers or "
"max_connections.",
size) : 0));
AnonymousShmemSize = size;
/* Now we need only allocate a minimal-sized SysV shmem block. */
sysvsize = sizeof(PGShmemHeader);
}
/* Now we need only allocate a minimal-sized SysV shmem block. */
sysvsize = sizeof(PGShmemHeader);
#else
sysvsize = size;
#endif
/* Make sure PGSharedMemoryAttach doesn't fail without need */

View File

@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
DWORD size_high;
DWORD size_low;
if (huge_tlb_pages == HUGE_TLB_ON)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("huge TLB pages not supported on this platform")));
/* Room for a header? */
Assert(size > MAXALIGN(sizeof(PGShmemHeader)));

View File

@ -64,6 +64,7 @@
#include "storage/dsm_impl.h"
#include "storage/standby.h"
#include "storage/fd.h"
#include "storage/pg_shmem.h"
#include "storage/proc.h"
#include "storage/predicate.h"
#include "tcop/tcopprot.h"
@ -387,6 +388,23 @@ static const struct config_enum_entry synchronous_commit_options[] = {
{NULL, 0, false}
};
/*
 * Accepted spellings for the huge_tlb_pages setting, each mapped to one of
 * the HUGE_TLB_* values.
 *
 * Although only "on", "off", "try" are documented, we accept all the likely
 * variants of "on" and "off".  The documented spellings are the entries whose
 * last field is false; the extra variants have it set to true.
 * NOTE(review): that last field is presumably the "hidden" flag of
 * struct config_enum_entry -- confirm against the struct's declaration.
 *
 * The {NULL, 0, false} entry closes the table, as in the other option tables
 * in this file.
 */
static const struct config_enum_entry huge_tlb_options[] = {
{"off", HUGE_TLB_OFF, false},
{"on", HUGE_TLB_ON, false},
{"try", HUGE_TLB_TRY, false},
{"true", HUGE_TLB_ON, true},
{"false", HUGE_TLB_OFF, true},
{"yes", HUGE_TLB_ON, true},
{"no", HUGE_TLB_OFF, true},
{"1", HUGE_TLB_ON, true},
{"0", HUGE_TLB_OFF, true},
{NULL, 0, false}
};
/*
* Options for enum values stored in other modules
*/
@ -447,6 +465,12 @@ int tcp_keepalives_idle;
int tcp_keepalives_interval;
int tcp_keepalives_count;
/*
 * Current value of the huge_tlb_pages configuration parameter, stored as one
 * of the HUGE_TLB_* values.
 *
 * This really belongs in pg_shmem.c, but is defined here so that it doesn't
 * need to be duplicated in all the different implementations of pg_shmem.c.
 */
int huge_tlb_pages;
/*
* These variables are all dummies that don't do anything, except in some
* cases provide the value for SHOW to display. The real state is elsewhere
@ -3430,6 +3454,15 @@ static struct config_enum ConfigureNamesEnum[] =
NULL, NULL, NULL
},
{
{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
gettext_noop("Use of huge TLB pages on Linux"),
NULL
},
&huge_tlb_pages,
HUGE_TLB_TRY, huge_tlb_options,
NULL, NULL, NULL
},
/* End-of-list marker */
{

View File

@ -115,6 +115,8 @@
#shared_buffers = 32MB # min 128kB
# (change requires restart)
#huge_tlb_pages = try # on, off, or try
# (change requires restart)
#temp_buffers = 8MB # min 800kB
#max_prepared_transactions = 0 # zero disables the feature
# (change requires restart)

View File

@ -38,6 +38,16 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
#endif
} PGShmemHeader;
/* GUC variable */
extern int huge_tlb_pages;
/*
 * Settings that the huge_tlb_pages GUC variable can take.
 */
typedef enum
{
	HUGE_TLB_OFF = 0,			/* do not use huge pages */
	HUGE_TLB_ON = 1,			/* huge pages are required */
	HUGE_TLB_TRY = 2			/* use huge pages if possible, else fall back */
} HugeTlbType;
#ifdef EXEC_BACKEND
#ifndef WIN32