Support long distance matching for zstd compression

zstd compression supports a special mode for finding matches in the distant
past, which may result in a better compression ratio, at the expense of
using more memory (the window size is 128MB).

To enable this optional mode, use the "long" keyword when specifying the
compression method (--compress=zstd:long).

Author: Justin Pryzby
Reviewed-by: Tomas Vondra, Jacob Champion
Discussion: https://postgr.es/m/20230224191840.GD1653@telsasoft.com
Discussion: https://postgr.es/m/20220327205020.GM28503@telsasoft.com
This commit is contained in:
Tomas Vondra 2023-04-06 17:18:38 +02:00
parent 983ec23007
commit 2820adf775
12 changed files with 127 additions and 6 deletions

View File

@ -2729,7 +2729,8 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
level. Otherwise, it should be a comma-separated list of items,
each of the form <replaceable>keyword</replaceable> or
<replaceable>keyword=value</replaceable>. Currently, the supported
keywords are <literal>level</literal> and <literal>workers</literal>.
keywords are <literal>level</literal>, <literal>long</literal> and
<literal>workers</literal>.
</para>
<para>
@ -2746,6 +2747,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
<literal>3</literal>).
</para>
<para>
The <literal>long</literal> keyword enables long-distance matching
mode, for improved compression ratio, at the expense of higher memory
use. Long-distance mode is supported only for
<literal>zstd</literal>.
</para>
<para>
The <literal>workers</literal> keyword sets the number of threads
that should be used for parallel compression. Parallel compression

View File

@ -424,8 +424,8 @@ PostgreSQL documentation
level. Otherwise, it should be a comma-separated list of items,
each of the form <literal>keyword</literal> or
<literal>keyword=value</literal>.
Currently, the supported keywords are <literal>level</literal>
and <literal>workers</literal>.
Currently, the supported keywords are <literal>level</literal>,
<literal>long</literal>, and <literal>workers</literal>.
The detail string cannot be used when the compression method
is specified as a plain integer.
</para>

View File

@ -681,6 +681,8 @@ PostgreSQL documentation
as though it had been fed through <application>gzip</application>,
<application>lz4</application>, or <application>zstd</application>;
but the default is not to compress.
With zstd compression, <literal>long</literal> mode may improve the
compression ratio, at the cost of increased memory use.
</para>
<para>
The tar archive format currently does not support compression at all.

View File

@ -118,6 +118,18 @@ bbsink_zstd_begin_backup(bbsink *sink)
compress->workers, ZSTD_getErrorName(ret)));
}
if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
{
ret = ZSTD_CCtx_setParameter(mysink->cctx,
ZSTD_c_enableLongDistanceMatching,
compress->long_distance);
if (ZSTD_isError(ret))
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not set compression flag for %s: %s",
"long", ZSTD_getErrorName(ret)));
}
/*
* We need our own buffer, because we're going to pass different data to
* the next sink than what gets passed to us.

View File

@ -106,6 +106,19 @@ bbstreamer_zstd_compressor_new(bbstreamer *next, pg_compress_specification *comp
compress->workers, ZSTD_getErrorName(ret));
}
if ((compress->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0)
{
ret = ZSTD_CCtx_setParameter(streamer->cctx,
ZSTD_c_enableLongDistanceMatching,
compress->long_distance);
if (ZSTD_isError(ret))
{
pg_log_error("could not set compression flag for %s: %s",
"long", ZSTD_getErrorName(ret));
exit(1);
}
}
/* Initialize the ZSTD output buffer. */
streamer->zstd_outBuf.dst = streamer->base.bbs_buffer.data;
streamer->zstd_outBuf.size = streamer->base.bbs_buffer.maxlen;

View File

@ -139,7 +139,14 @@ SKIP:
'gzip:workers=3',
'invalid compression specification: compression algorithm "gzip" does not accept a worker count',
'failure on worker count for gzip'
],);
],
[
'gzip:long',
'invalid compression specification: compression algorithm "gzip" does not support long-distance mode',
'failure on long mode for gzip'
],
);
for my $cft (@compression_failure_tests)
{
my $cfail = quotemeta($client_fails . $cft->[1]);

View File

@ -80,6 +80,11 @@ _ZstdCStreamParams(pg_compress_specification compress)
_Zstd_CCtx_setParam_or_die(cstream, ZSTD_c_compressionLevel,
compress.level, "level");
if (compress.options & PG_COMPRESSION_OPTION_LONG_DISTANCE)
_Zstd_CCtx_setParam_or_die(cstream,
ZSTD_c_enableLongDistanceMatching,
compress.long_distance, "long");
return cstream;
}

View File

@ -267,11 +267,12 @@ my %pgdump_runs = (
],
},
# Exercise long mode for test coverage
compression_zstd_plain => {
test_key => 'compression',
compile_option => 'zstd',
dump_cmd => [
'pg_dump', '--format=plain', '--compress=zstd',
'pg_dump', '--format=plain', '--compress=zstd:long',
"--file=$tempdir/compression_zstd_plain.sql.zst", 'postgres',
],
# Decompress the generated file to run through the tests.

View File

@ -49,6 +49,14 @@ my @test_configuration = (
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
{
'compression_method' => 'zstd',
'backup_flags' => [ '--compress', 'server-zstd:level=1,long' ],
'backup_archive' => 'base.tar.zst',
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
});
for my $tc (@test_configuration)

View File

@ -50,6 +50,14 @@ my @test_configuration = (
'decompress_flags' => ['-d'],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
{
'compression_method' => 'zstd',
'backup_flags' => ['--compress', 'client-zstd:level=1,long'],
'backup_archive' => 'base.tar.zst',
'decompress_program' => $ENV{'ZSTD'},
'decompress_flags' => [ '-d' ],
'enabled' => check_pg_config("#define USE_ZSTD 1")
},
{
'compression_method' => 'parallel zstd',
'backup_flags' => [ '--compress', 'client-zstd:workers=3' ],

View File

@ -12,7 +12,7 @@
* Otherwise, a compression specification is a comma-separated list of items,
* each having the form keyword or keyword=value.
*
* Currently, the only supported keywords are "level" and "workers".
* Currently, the supported keywords are "level", "long", and "workers".
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
*
@ -38,6 +38,8 @@
static int expect_integer_value(char *keyword, char *value,
pg_compress_specification *result);
static bool expect_boolean_value(char *keyword, char *value,
pg_compress_specification *result);
/*
* Look up a compression algorithm by name. Returns true and sets *algorithm
@ -232,6 +234,11 @@ parse_compress_specification(pg_compress_algorithm algorithm, char *specificatio
result->workers = expect_integer_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_WORKERS;
}
else if (strcmp(keyword, "long") == 0)
{
result->long_distance = expect_boolean_value(keyword, value, result);
result->options |= PG_COMPRESSION_OPTION_LONG_DISTANCE;
}
else
result->parse_error =
psprintf(_("unrecognized compression option: \"%s\""), keyword);
@ -289,6 +296,43 @@ expect_integer_value(char *keyword, char *value, pg_compress_specification *resu
return ivalue;
}
/*
 * Parse 'value' as a boolean and return the result.
 *
 * A keyword given without a value (value == NULL) is treated as true, so
 * that a bare keyword such as "long" enables the option.
 *
 * If parsing fails, set result->parse_error to an appropriate message
 * and return false. Because false is also a legitimate parse result, the
 * caller must check result->parse_error to determine if the call was
 * successful.
 *
 * Valid values are: yes, no, on, off, 1, 0 (compared with pg_strcasecmp).
 *
 * Inspired by ParseVariableBool().
 */
static bool
expect_boolean_value(char *keyword, char *value, pg_compress_specification *result)
{
	/* A bare keyword with no "=value" part means "enable". */
	if (value == NULL)
		return true;

	/* Spellings accepted as true. */
	if (pg_strcasecmp(value, "yes") == 0 ||
		pg_strcasecmp(value, "on") == 0 ||
		pg_strcasecmp(value, "1") == 0)
		return true;

	/* Spellings accepted as false. */
	if (pg_strcasecmp(value, "no") == 0 ||
		pg_strcasecmp(value, "off") == 0 ||
		pg_strcasecmp(value, "0") == 0)
		return false;

	/* Anything else is an error; the return value is only a placeholder. */
	result->parse_error =
		psprintf(_("value for compression option \"%s\" must be a boolean"),
				 keyword);
	return false;
}
/*
* Returns NULL if the compression specification string was syntactically
* valid and semantically sensible. Otherwise, returns an error message.
@ -354,6 +398,17 @@ validate_compress_specification(pg_compress_specification *spec)
get_compress_algorithm_name(spec->algorithm));
}
/*
* Of the compression algorithms that we currently support, only zstd
* supports long-distance mode.
*/
if ((spec->options & PG_COMPRESSION_OPTION_LONG_DISTANCE) != 0 &&
(spec->algorithm != PG_COMPRESSION_ZSTD))
{
return psprintf(_("compression algorithm \"%s\" does not support long-distance mode"),
get_compress_algorithm_name(spec->algorithm));
}
return NULL;
}

View File

@ -27,6 +27,7 @@ typedef enum pg_compress_algorithm
} pg_compress_algorithm;
#define PG_COMPRESSION_OPTION_WORKERS (1 << 0)
#define PG_COMPRESSION_OPTION_LONG_DISTANCE (1 << 1)
typedef struct pg_compress_specification
{
@ -34,6 +35,7 @@ typedef struct pg_compress_specification
unsigned options; /* OR of PG_COMPRESSION_OPTION constants */
int level;
int workers;
bool long_distance;
char *parse_error; /* NULL if parsing was OK, else message */
} pg_compress_specification;