From 1a3458b6d8d202715a83c88474a1b63726d0929e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 29 Jan 2014 13:44:45 +0200 Subject: [PATCH] Allow using huge TLB pages on Linux (MAP_HUGETLB) This patch adds an option, huge_tlb_pages, which allows requesting the shared memory segment to be allocated using huge pages, by using the MAP_HUGETLB flag in mmap(). This can improve performance. The default is 'try', which means that we will attempt using huge pages, and fall back to non-huge pages if it doesn't work. Currently, only Linux has MAP_HUGETLB. On other platforms, the default 'try' behaves the same as 'off'. In the passing, don't try to round the mmap() size to a multiple of pagesize. mmap() doesn't require that, and there's no particular reason for PostgreSQL to do that either. When using MAP_HUGETLB, however, round the request size up to nearest 2MB boundary. This is to work around a bug in some Linux kernel versions, but also to avoid wasting memory, because the kernel will round the size up anyway. Many people were involved in writing this patch, including Christian Kruse, Richard Poole, Abhijit Menon-Sen, reviewed by Peter Geoghegan, Andres Freund and me. --- doc/src/sgml/config.sgml | 37 +++++ src/backend/port/sysv_shmem.c | 135 +++++++++++++----- src/backend/port/win32_shmem.c | 5 + src/backend/utils/misc/guc.c | 33 +++++ src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/storage/pg_shmem.h | 10 ++ 6 files changed, 183 insertions(+), 39 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 14ed6c7a53..e7c255987d 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1107,6 +1107,43 @@ include 'filename' + + huge_tlb_pages (enum) + + huge_tlb_pages configuration parameter + + + + Enables/disables the use of huge TLB pages. Valid values are + try (the default), on, + and off. + + + + At present, this feature is supported only on Linux. The setting + is ignored on other systems. 
+ + + + The use of huge TLB pages results in smaller page tables and + less CPU time spent on memory management, increasing performance. For + more details, see + the Debian wiki. + Remember that you will need at least shared_buffers / huge page size + + 1 huge TLB pages. So for example for a system with 6GB shared buffers + and a hugepage size of 2MB, you will need at least 3156 huge pages. + + + + With huge_tlb_pages set to try, + the server will try to use huge pages, but fall back to using + normal allocation if that fails. With on, failure + to use huge pages will prevent the server from starting up. With + off, huge pages will not be used. + + + + temp_buffers (integer) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 0d01617e2f..f7596bf6e0 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -32,6 +32,7 @@ #include "portability/mem.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" +#include "utils/guc.h" typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ @@ -41,7 +42,7 @@ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; static Size AnonymousShmemSize; -static void *AnonymousShmem; +static void *AnonymousShmem = NULL; static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); @@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) return true; } +/* + * Creates an anonymous mmap()ed shared memory segment. + * + * Pass the requested size in *size. This function will modify *size to the + * actual size of the allocation, if it ends up allocating a segment that is + * larger than requested. 
+ */ +#ifndef EXEC_BACKEND +static void * +CreateAnonymousSegment(Size *size) +{ + Size allocsize; + void *ptr = MAP_FAILED; + +#ifndef MAP_HUGETLB + if (huge_tlb_pages == HUGE_TLB_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge TLB pages not supported on this platform"))); +#else + if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY) + { + /* + * Round up the request size to a suitable large value. + * + * Some Linux kernel versions are known to have a bug, which causes + * mmap() with MAP_HUGETLB to fail if the request size is not a + * multiple of any supported huge page size. To work around that, we + * round up the request size to nearest 2MB. 2MB is the most common + * huge page size on affected systems. + * + * Aside from that bug, even with a kernel that does the allocation + * correctly, rounding it up ourselves avoids wasting memory. Without + * it, if we for example make an allocation of 2MB + 1 bytes, the + * kernel might decide to use two 2MB huge pages for that, and waste 2 + * MB - 1 of memory. When we do the rounding ourselves, we can use + * that space for allocations. + */ + int hugepagesize = 2 * 1024 * 1024; + + allocsize = *size; + if (allocsize % hugepagesize != 0) + allocsize += hugepagesize - (allocsize % hugepagesize); + + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0); + if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED) + elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m"); + } +#endif + + if (huge_tlb_pages == HUGE_TLB_OFF || + (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)) + { + allocsize = *size; + ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0); + } + + if (ptr == MAP_FAILED) + ereport(FATAL, + (errmsg("could not map anonymous shared memory: %m"), + (errno == ENOMEM) ? 
+ errhint("This error usually means that PostgreSQL's request " + "for a shared memory segment exceeded available memory, " + "swap space or huge pages. To reduce the request size " + "(currently %zu bytes), reduce PostgreSQL's shared " + "memory usage, perhaps by reducing shared_buffers or " + "max_connections.", + *size) : 0)); + + *size = allocsize; + return ptr; +} +#endif /* * PGSharedMemoryCreate @@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) PGShmemHeader *hdr; IpcMemoryId shmid; struct stat statbuf; - Size sysvsize = size; + Size sysvsize; + +#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB) + if (huge_tlb_pages == HUGE_TLB_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge TLB pages not supported on this platform"))); +#endif /* Room for a header? */ Assert(size > MAXALIGN(sizeof(PGShmemHeader))); @@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * to run many copies of PostgreSQL without needing to adjust system * settings. * + * We assume that no one will attempt to run PostgreSQL 9.3 or later on + * systems that are ancient enough that anonymous shared memory is not + * supported, such as pre-2.4 versions of Linux. If that turns out to be + * false, we might need to add a run-time test here and do this only if + * the running kernel supports it. + * * However, we disable this logic in the EXEC_BACKEND case, and fall back * to the old method of allocating the entire segment using System V * shared memory, because there's no way to attach an mmap'd segment to a @@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) * developer use, this shouldn't be a big problem. */ #ifndef EXEC_BACKEND - { - long pagesize = sysconf(_SC_PAGE_SIZE); + AnonymousShmem = CreateAnonymousSegment(&size); + AnonymousShmemSize = size; - /* - * Ensure request size is a multiple of pagesize. - * - * pagesize will, for practical purposes, always be a power of two. 
- * But just in case it isn't, we do it this way instead of using - * TYPEALIGN(). - */ - if (pagesize > 0 && size % pagesize != 0) - size += pagesize - (size % pagesize); - - /* - * We assume that no one will attempt to run PostgreSQL 9.3 or later - * on systems that are ancient enough that anonymous shared memory is - * not supported, such as pre-2.4 versions of Linux. If that turns - * out to be false, we might need to add a run-time test here and do - * this only if the running kernel supports it. - */ - AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, - -1, 0); - if (AnonymousShmem == MAP_FAILED) - ereport(FATAL, - (errmsg("could not map anonymous shared memory: %m"), - (errno == ENOMEM) ? - errhint("This error usually means that PostgreSQL's request " - "for a shared memory segment exceeded available memory " - "or swap space. To reduce the request size (currently " - "%zu bytes), reduce PostgreSQL's shared memory usage, " - "perhaps by reducing shared_buffers or " - "max_connections.", - size) : 0)); - AnonymousShmemSize = size; - - /* Now we need only allocate a minimal-sized SysV shmem block. */ - sysvsize = sizeof(PGShmemHeader); - } + /* Now we need only allocate a minimal-sized SysV shmem block. */ + sysvsize = sizeof(PGShmemHeader); +#else + sysvsize = size; #endif /* Make sure PGSharedMemoryAttach doesn't fail without need */ diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 80f198277a..9b0cceb530 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port) DWORD size_high; DWORD size_low; + if (huge_tlb_pages == HUGE_TLB_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge TLB pages not supported on this platform"))); + /* Room for a header? 
*/ Assert(size > MAXALIGN(sizeof(PGShmemHeader))); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2cc8f90e6d..a9b9794965 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -64,6 +64,7 @@ #include "storage/dsm_impl.h" #include "storage/standby.h" #include "storage/fd.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/predicate.h" #include "tcop/tcopprot.h" @@ -387,6 +388,23 @@ static const struct config_enum_entry synchronous_commit_options[] = { {NULL, 0, false} }; +/* + * Although only "on", "off", "try" are documented, we accept all the likely + * variants of "on" and "off". + */ +static const struct config_enum_entry huge_tlb_options[] = { + {"off", HUGE_TLB_OFF, false}, + {"on", HUGE_TLB_ON, false}, + {"try", HUGE_TLB_TRY, false}, + {"true", HUGE_TLB_ON, true}, + {"false", HUGE_TLB_OFF, true}, + {"yes", HUGE_TLB_ON, true}, + {"no", HUGE_TLB_OFF, true}, + {"1", HUGE_TLB_ON, true}, + {"0", HUGE_TLB_OFF, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -447,6 +465,12 @@ int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +/* + * This really belongs in pg_shmem.c, but is defined here so that it doesn't + * need to be duplicated in all the different implementations of pg_shmem.c. + */ +int huge_tlb_pages; + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. 
The real state is elsewhere @@ -3430,6 +3454,15 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Use of huge TLB pages on Linux"), + NULL + }, + &huge_tlb_pages, + HUGE_TLB_TRY, huge_tlb_options, + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 7ad6b7cb45..c8673b382d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -115,6 +115,8 @@ #shared_buffers = 32MB # min 128kB # (change requires restart) +#huge_tlb_pages = try # on, off, or try + # (change requires restart) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 22ef901e89..df094e801d 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -38,6 +38,16 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ #endif } PGShmemHeader; +/* GUC variable */ +extern int huge_tlb_pages; + +/* Possible values for huge_tlb_pages */ +typedef enum +{ + HUGE_TLB_OFF, + HUGE_TLB_ON, + HUGE_TLB_TRY +} HugeTlbType; #ifdef EXEC_BACKEND #ifndef WIN32