From d2bddc2500fb74d56e5bc53a1cfa269e2e846510 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 17 Jul 2020 14:33:00 +1200 Subject: [PATCH] Add huge_page_size setting for use on Linux. This allows the huge page size to be set explicitly. The default is 0, meaning it will use the system default, as before. Author: Odin Ugedal Discussion: https://postgr.es/m/20200608154639.20254-1-odin%40ugedal.com --- doc/src/sgml/config.sgml | 27 ++++++++ doc/src/sgml/runtime.sgml | 55 ++++++++++------ src/backend/port/sysv_shmem.c | 62 ++++++++++++++----- src/backend/utils/misc/guc.c | 32 +++++++++- src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/storage/pg_shmem.h | 1 + 6 files changed, 141 insertions(+), 38 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index b353c61683..e0ea397ed4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1582,6 +1582,33 @@ include_dir 'conf.d' + + huge_page_size (integer) + + huge_page_size configuration parameter + + + + + Controls the size of huge pages, when they are enabled with + . + The default is zero (0). + When set to 0, the default huge page size on the + system will be used. + + + Some commonly available page sizes on modern 64 bit server architectures include: + 2MB and 1GB (Intel and AMD), 16MB and + 16GB (IBM POWER), and 64kB, 2MB, + 32MB and 1GB (ARM). For more information + about usage and support, see . + + + Non-default settings are currently supported only on Linux. + + + + temp_buffers (integer) diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index 937bb2e8ac..e09cb55efc 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1391,13 +1391,14 @@ export PG_OOM_ADJUST_VALUE=0 using large values of . To use this feature in PostgreSQL you need a kernel with CONFIG_HUGETLBFS=y and - CONFIG_HUGETLB_PAGE=y. You will also have to adjust - the kernel setting vm.nr_hugepages. 
To estimate the - number of huge pages needed, start PostgreSQL - without huge pages enabled and check the - postmaster's anonymous shared memory segment size, as well as the system's - huge page size, using the /proc file system. This might - look like: + CONFIG_HUGETLB_PAGE=y. You will also have to configure + the operating system to provide enough huge pages of the desired size. + To estimate the number of huge pages needed, start + PostgreSQL without huge pages enabled and check + the postmaster's anonymous shared memory segment size, as well as the + system's default and supported huge page sizes, using the + /proc and /sys file systems. + This might look like: $ head -1 $PGDATA/postmaster.pid 4170 @@ -1405,27 +1406,40 @@ $ pmap 4170 | awk '/rw-s/ && /zero/ {print $2}' 6490428K $ grep ^Hugepagesize /proc/meminfo Hugepagesize: 2048 kB +$ ls /sys/kernel/mm/hugepages +hugepages-1048576kB hugepages-2048kB + + In this example the default is 2MB, but you can also explicitly request + either 2MB or 1GB with . + + Assuming 2MB huge pages, 6490428 / 2048 gives approximately 3169.154, so in this example we need at - least 3170 huge pages, which we can set with: + least 3170 huge pages. A larger setting would be + appropriate if other programs on the machine also need huge pages. + We can set this with: -$ sysctl -w vm.nr_hugepages=3170 +# sysctl -w vm.nr_hugepages=3170 - A larger setting would be appropriate if other programs on the machine - also need huge pages. Don't forget to add this setting - to /etc/sysctl.conf so that it will be reapplied - after reboots. + Don't forget to add this setting to /etc/sysctl.conf + so that it is reapplied after reboots. For non-default huge page sizes, + we can instead use: + +# echo 3170 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + + It is also possible to provide these settings at boot time using + kernel parameters such as hugepagesz=2M hugepages=3170. 
Sometimes the kernel is not able to allocate the desired number of huge - pages immediately, so it might be necessary to repeat the command or to - reboot. (Immediately after a reboot, most of the machine's memory - should be available to convert into huge pages.) To verify the huge - page allocation situation, use: + pages immediately due to fragmentation, so it might be necessary + to repeat the command or to reboot. (Immediately after a reboot, most of + the machine's memory should be available to convert into huge pages.) + To verify the huge page allocation situation for a given size, use: -$ grep Huge /proc/meminfo +$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages @@ -1438,8 +1452,9 @@ $ grep Huge /proc/meminfo The default behavior for huge pages in - PostgreSQL is to use them when possible and - to fall back to normal pages when failing. To enforce the use of huge + PostgreSQL is to use them when possible, with + the system's default huge page size, and + to fall back to normal pages on failure. To enforce the use of huge pages, you can set to on in postgresql.conf. Note that with this setting PostgreSQL will fail to diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 198a6985bf..203555822d 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -32,6 +32,7 @@ #endif #include "miscadmin.h" +#include "port/pg_bitutils.h" #include "portability/mem.h" #include "storage/dsm.h" #include "storage/fd.h" @@ -448,7 +449,7 @@ PGSharedMemoryAttach(IpcMemoryId shmId, #ifdef MAP_HUGETLB /* - * Identify the huge page size to use. + * Identify the huge page size to use, and compute the related mmap flags. * * Some Linux kernel versions have a bug causing mmap() to fail on requests * that are not a multiple of the hugepage size. 
Versions without that bug @@ -464,25 +465,13 @@ PGSharedMemoryAttach(IpcMemoryId shmId, * hugepage sizes, we might want to think about more invasive strategies, * such as increasing shared_buffers to absorb the extra space. * - * Returns the (real or assumed) page size into *hugepagesize, + * Returns the (real, assumed or config provided) page size into *hugepagesize, * and the hugepage-related mmap flags to use into *mmap_flags. - * - * Currently *mmap_flags is always just MAP_HUGETLB. Someday, on systems - * that support it, we might OR in additional bits to specify a particular - * non-default huge page size. */ static void GetHugePageSize(Size *hugepagesize, int *mmap_flags) { - /* - * If we fail to find out the system's default huge page size, assume it - * is 2MB. This will work fine when the actual size is less. If it's - * more, we might get mmap() or munmap() failures due to unaligned - * requests; but at this writing, there are no reports of any non-Linux - * systems being picky about that. - */ - *hugepagesize = 2 * 1024 * 1024; - *mmap_flags = MAP_HUGETLB; + Size default_hugepagesize = 0; /* * System-dependent code to find out the default huge page size. @@ -491,6 +480,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) * nnnn kB". Ignore any failures, falling back to the preset default. */ #ifdef __linux__ + { FILE *fp = AllocateFile("/proc/meminfo", "r"); char buf[128]; @@ -505,7 +495,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { if (ch == 'k') { - *hugepagesize = sz * (Size) 1024; + default_hugepagesize = sz * (Size) 1024; break; } /* We could accept other units besides kB, if needed */ @@ -515,6 +505,44 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } } #endif /* __linux__ */ + + if (huge_page_size != 0) + { + /* If huge page size is requested explicitly, use that. */ + *hugepagesize = (Size) huge_page_size * 1024; + } + else if (default_hugepagesize != 0) + { + /* Otherwise use the system default, if we have it. 
*/ + *hugepagesize = default_hugepagesize; + } + else + { + /* + * If we fail to find out the system's default huge page size, and no + * huge page size is requested explicitly, assume it is 2MB. This will + * work fine when the actual size is less. If it's more, we might get + * mmap() or munmap() failures due to unaligned requests; but at this + * writing, there are no reports of any non-Linux systems being picky + * about that. + */ + *hugepagesize = 2 * 1024 * 1024; + } + + *mmap_flags = MAP_HUGETLB; + + /* + * On recent enough Linux, also include the explicit page size, if + * necessary. + */ +#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT) + if (*hugepagesize != default_hugepagesize) + { + int shift = pg_ceil_log2_64(*hugepagesize); + + *mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif } #endif /* MAP_HUGETLB */ @@ -583,7 +611,7 @@ CreateAnonymousSegment(Size *size) "(currently %zu bytes), reduce PostgreSQL's shared " "memory usage, perhaps by reducing shared_buffers or " "max_connections.", - *size) : 0)); + allocsize) : 0)); } *size = allocsize; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 031ca0327f..99a3e4f6f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -20,11 +20,14 @@ #include #include #include -#include +#ifndef WIN32 +#include +#endif #include #ifdef HAVE_SYSLOG #include #endif +#include #include "access/commit_ts.h" #include "access/gin.h" @@ -198,6 +201,7 @@ static bool check_max_wal_senders(int *newval, void **extra, GucSource source); static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source); static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source); static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source); +static bool check_huge_page_size(int *newval, void **extra, GucSource source); static void assign_pgstat_temp_directory(const char *newval, void *extra); static 
bool check_application_name(char **newval, void **extra, GucSource source); static void assign_application_name(const char *newval, void *extra); @@ -576,6 +580,7 @@ int ssl_renegotiation_limit; * need to be duplicated in all the different implementations of pg_shmem.c. */ int huge_pages; +int huge_page_size; /* * These variables are all dummies that don't do anything, except in some @@ -3381,6 +3386,17 @@ static struct config_int ConfigureNamesInt[] = NULL, assign_tcp_user_timeout, show_tcp_user_timeout }, + { + {"huge_page_size", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("The size of huge page that should be requested."), + NULL, + GUC_UNIT_KB + }, + &huge_page_size, + 0, 0, INT_MAX, + check_huge_page_size, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL @@ -11565,6 +11581,20 @@ check_maintenance_io_concurrency(int *newval, void **extra, GucSource source) return true; } +static bool +check_huge_page_size(int *newval, void **extra, GucSource source) +{ +#if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)) + /* Recent enough Linux only, for now. See GetHugePageSize(). 
*/ + if (*newval != 0) + { + GUC_check_errdetail("huge_page_size must be 0 on this platform."); + return false; + } +#endif + return true; +} + static void assign_pgstat_temp_directory(const char *newval, void *extra) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e430e33c7b..29e0152196 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -122,6 +122,8 @@ # (change requires restart) #huge_pages = try # on, off, or try # (change requires restart) +#huge_page_size = 0 # zero for system default + # (change requires restart) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 0de26b3427..9992932a00 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -44,6 +44,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ /* GUC variables */ extern int shared_memory_type; extern int huge_pages; +extern int huge_page_size; /* Possible values for huge_pages */ typedef enum