diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index 3ace204249..40f5d72056 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.83 2008/02/29 17:47:41 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.84 2008/03/07 23:20:21 tgl Exp $ * * * INTERFACE ROUTINES @@ -576,7 +576,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, /* ---------- * Compress and/or save external until data fits into target length * - * 1: Inline compress attributes with attstorage 'x' + * 1: Inline compress attributes with attstorage 'x', and store very + * large attributes with attstorage 'x' or 'e' external immediately * 2: Store attributes with attstorage 'x' or 'e' external * 3: Inline compress attributes with attstorage 'm' * 4: Store attributes with attstorage 'm' external @@ -595,7 +596,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, maxDataLen = TOAST_TUPLE_TARGET - hoff; /* - * Look for attributes with attstorage 'x' to compress + * Look for attributes with attstorage 'x' to compress. Also find large + * attributes with attstorage 'x' or 'e', and store them external. */ while (heap_compute_data_size(tupleDesc, toast_values, toast_isnull) > maxDataLen) @@ -606,7 +608,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, Datum new_value; /* - * Search for the biggest yet uncompressed internal attribute + * Search for the biggest yet unprocessed internal attribute */ for (i = 0; i < numAttrs; i++) { @@ -616,7 +618,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, continue; /* can't happen, toast_action would be 'p' */ if (VARATT_IS_COMPRESSED(toast_values[i])) continue; - if (att[i]->attstorage != 'x') + if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e') continue; if (toast_sizes[i] > biggest_size) { @@ -629,30 +631,58 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, break; /* - * Attempt to compress it inline + * Attempt to compress it inline, if it has attstorage 'x' */ i = biggest_attno; - old_value = toast_values[i]; - new_value = toast_compress_datum(old_value); - - if (DatumGetPointer(new_value) != NULL) + if (att[i]->attstorage == 'x') { - /* successful compression */ - if (toast_free[i]) - pfree(DatumGetPointer(old_value)); - toast_values[i] = new_value; - toast_free[i] = true; - toast_sizes[i] = VARSIZE(toast_values[i]); - need_change = true; - need_free = true; + old_value = toast_values[i]; + new_value = toast_compress_datum(old_value); + + if (DatumGetPointer(new_value) != NULL) + { + /* successful compression */ + if (toast_free[i]) + pfree(DatumGetPointer(old_value)); + toast_values[i] = new_value; + toast_free[i] = true; + toast_sizes[i] = VARSIZE(toast_values[i]); + need_change = true; + need_free = true; + } + else + { + /* incompressible, ignore on subsequent compression passes */ + toast_action[i] = 'x'; + } } else { - /* - * incompressible data, ignore on subsequent compression passes - */ + /* has attstorage 'e', ignore on subsequent compression passes */ toast_action[i] = 'x'; } + + /* + * If this value is by itself more than maxDataLen (after compression + * if any), push it out to the toast table immediately, if possible. + * This avoids uselessly compressing other fields in the common case + * where we have one long field and several short ones. + * + * XXX maybe the threshold should be less than maxDataLen? + */ + if (toast_sizes[i] > maxDataLen && + rel->rd_rel->reltoastrelid != InvalidOid) + { + old_value = toast_values[i]; + toast_action[i] = 'p'; + toast_values[i] = toast_save_datum(rel, toast_values[i], + use_wal, use_fsm); + if (toast_free[i]) + pfree(DatumGetPointer(old_value)); + toast_free[i] = true; + need_change = true; + need_free = true; + } } /* @@ -761,9 +791,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, } else { - /* - * incompressible data, ignore on subsequent compression passes - */ + /* incompressible, ignore on subsequent compression passes */ toast_action[i] = 'x'; } } @@ -1047,16 +1075,28 @@ toast_compress_datum(Datum value) Assert(!VARATT_IS_COMPRESSED(value)); /* - * No point in wasting a palloc cycle if value is too short for - * compression + * No point in wasting a palloc cycle if value size is out of the + * allowed range for compression */ - if (valsize < PGLZ_strategy_default->min_input_size) + if (valsize < PGLZ_strategy_default->min_input_size || + valsize > PGLZ_strategy_default->max_input_size) return PointerGetDatum(NULL); tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize)); + + /* + * We recheck the actual size even if pglz_compress() reports success, + * because it might be satisfied with having saved as little as one byte + * in the compressed data --- which could turn into a net loss once you + * consider header and alignment padding. Worst case, the compressed + * format might require three padding bytes (plus header, which is included + * in VARSIZE(tmp)), whereas the uncompressed format would take only one + * header byte and no padding if the value is short enough. So we insist + * on a savings of more than 2 bytes to ensure we have a gain. + */ if (pglz_compress(VARDATA_ANY(value), valsize, (PGLZ_Header *) tmp, PGLZ_strategy_default) && - VARSIZE(tmp) < VARSIZE_ANY(value)) + VARSIZE(tmp) < valsize - 2) { /* successful compression */ return PointerGetDatum(tmp); diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c index a7cd8f6eae..0716b25298 100644 --- a/src/backend/utils/adt/pg_lzcompress.c +++ b/src/backend/utils/adt/pg_lzcompress.c @@ -4,7 +4,7 @@ * This is an implementation of LZ compression for PostgreSQL. * It uses a simple history table and generates 2-3 byte tags * capable of backward copy information for 3-273 bytes with - * an offset of max. 4095. + * a max offset of 4095. * * Entry routines: * @@ -166,13 +166,12 @@ * * Copyright (c) 1999-2008, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.29 2008/01/01 19:45:52 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.30 2008/03/07 23:20:21 tgl Exp $ * ---------- */ #include "postgres.h" -#include -#include +#include #include "utils/pg_lzcompress.h" @@ -211,27 +210,23 @@ typedef struct PGLZ_HistEntry * ---------- */ static const PGLZ_Strategy strategy_default_data = { - 256, /* Data chunks less than 256 bytes are not - * compressed */ - 6144, /* Data chunks >= 6K force compression, unless - * compressed output is larger than input */ - 20, /* Below 6K, compression rates below 20% mean - * fallback to uncompressed */ - 128, /* Stop history lookup if a match of 128 bytes - * is found */ - 10 /* Lower good match size by 10% at every - * lookup loop iteration */ + 32, /* Data chunks less than 32 bytes are not compressed */ + 1024 * 1024, /* Data chunks over 1MB are not compressed either */ + 25, /* Require 25% compression rate, or not worth it */ + 1024, /* Give up if no compression in the first 1KB */ + 128, /* Stop history lookup if a match of 128 bytes is found */ + 10 /* Lower good match size by 10% at every loop iteration */ }; const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_default_data; static const PGLZ_Strategy strategy_always_data = { - 0, /* Chunks of any size are compressed */ - 0, - 0, /* It's enough to save one single byte */ - 128, /* Stop history lookup if a match of 128 bytes - * is found */ - 6 /* Look harder for a good match */ + 0, /* Chunks of any size are compressed */ + INT_MAX, + 0, /* It's enough to save one single byte */ + INT_MAX, /* Never give up early */ + 128, /* Stop history lookup if a match of 128 bytes is found */ + 6 /* Look harder for a good match */ }; const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data; @@ -491,6 +486,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, unsigned char *ctrlp = &ctrl_dummy; unsigned char ctrlb = 0; unsigned char ctrl = 0; + bool found_match = false; int32 match_len; int32 match_off; int32 good_match; @@ -506,11 +502,12 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, strategy = PGLZ_strategy_default; /* - * If the strategy forbids compression (at all or if source chunk too - * small), fail. + * If the strategy forbids compression (at all or if source chunk size + * out of range), fail. */ if (strategy->match_size_good <= 0 || - slen < strategy->min_input_size) + slen < strategy->min_input_size || + slen > strategy->max_input_size) return false; /* @@ -519,41 +516,44 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, dest->rawsize = slen; /* - * Limit the match size to the maximum implementation allowed value + * Limit the match parameters to the supported range. */ - if ((good_match = strategy->match_size_good) > PGLZ_MAX_MATCH) + good_match = strategy->match_size_good; + if (good_match > PGLZ_MAX_MATCH) good_match = PGLZ_MAX_MATCH; - if (good_match < 17) + else if (good_match < 17) good_match = 17; - if ((good_drop = strategy->match_size_drop) < 0) + good_drop = strategy->match_size_drop; + if (good_drop < 0) good_drop = 0; - if (good_drop > 100) + else if (good_drop > 100) good_drop = 100; + need_rate = strategy->min_comp_rate; + if (need_rate < 0) + need_rate = 0; + else if (need_rate > 99) + need_rate = 99; + + /* + * Compute the maximum result size allowed by the strategy, namely + * the input size minus the minimum wanted compression rate. This had + * better be <= slen, else we might overrun the provided output buffer. + */ + if (slen > (INT_MAX/100)) + { + /* Approximate to avoid overflow */ + result_max = (slen / 100) * (100 - need_rate); + } + else + result_max = (slen * (100 - need_rate)) / 100; + /* * Initialize the history lists to empty. We do not need to zero the * hist_entries[] array; its entries are initialized as they are used. */ - memset((void *) hist_start, 0, sizeof(hist_start)); - - /* - * Compute the maximum result size allowed by the strategy. If the input - * size exceeds force_input_size, the max result size is the input size - * itself. Otherwise, it is the input size minus the minimum wanted - * compression rate. - */ - if (slen >= strategy->force_input_size) - result_max = slen; - else - { - need_rate = strategy->min_comp_rate; - if (need_rate < 0) - need_rate = 0; - else if (need_rate > 99) - need_rate = 99; - result_max = slen - ((slen * need_rate) / 100); - } + memset(hist_start, 0, sizeof(hist_start)); /* * Compress the source directly into the output buffer. @@ -570,6 +570,15 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, if (bp - bstart >= result_max) return false; + /* + * If we've emitted more than first_success_by bytes without finding + * anything compressible at all, fail. This lets us fall out + * reasonably quickly when looking at incompressible input (such as + * pre-compressed data). + */ + if (!found_match && bp - bstart >= strategy->first_success_by) + return false; + /* * Try to find a match in the history */ @@ -586,9 +595,10 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, pglz_hist_add(hist_start, hist_entries, hist_next, hist_recycle, dp, dend); - dp++; /* Do not do this ++ in the line above! */ + dp++; /* Do not do this ++ in the line above! */ /* The macro would do it four times - Jan. */ } + found_match = true; } else { @@ -599,7 +609,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, pglz_hist_add(hist_start, hist_entries, hist_next, hist_recycle, dp, dend); - dp++; /* Do not do this ++ in the line above! */ + dp++; /* Do not do this ++ in the line above! */ /* The macro would do it four times - Jan. */ } } diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h index a3c49ae7a7..e81ae0d5ca 100644 --- a/src/include/utils/pg_lzcompress.h +++ b/src/include/utils/pg_lzcompress.h @@ -3,7 +3,7 @@ * * Definitions for the builtin LZ compressor * - * $PostgreSQL: pgsql/src/include/utils/pg_lzcompress.h,v 1.16 2007/11/15 21:14:45 momjian Exp $ + * $PostgreSQL: pgsql/src/include/utils/pg_lzcompress.h,v 1.17 2008/03/07 23:20:21 tgl Exp $ * ---------- */ @@ -14,7 +14,7 @@ /* ---------- * PGLZ_Header - * - * The information at the top of the compressed data. + * The information at the start of the compressed data. * ---------- */ typedef struct PGLZ_Header @@ -48,19 +48,17 @@ typedef struct PGLZ_Header * * Some values that control the compression algorithm. * - * min_input_size Minimum input data size to start compression. + * min_input_size Minimum input data size to consider compression. * - * force_input_size Minimum input data size to force compression - * even if the compression rate drops below - * min_comp_rate. But in any case the output - * must be smaller than the input. If that isn't - * the case, the compressor will throw away its - * output and copy the original, uncompressed data - * to the output buffer. + * max_input_size Maximum input data size to consider compression. * - * min_comp_rate Minimum compression rate (0-99%) to require for - * inputs smaller than force_input_size. If not - * achieved, the output will be uncompressed. + * min_comp_rate Minimum compression rate (0-99%) to require. + * Regardless of min_comp_rate, the output must be + * smaller than the input, else we don't store + * compressed. + * + * first_success_by Abandon compression if we find no compressible + * data within the first this-many bytes. * * match_size_good The initial GOOD match size when starting history * lookup. When looking up the history to find a @@ -81,8 +79,9 @@ typedef struct PGLZ_Header typedef struct PGLZ_Strategy { int32 min_input_size; - int32 force_input_size; + int32 max_input_size; int32 min_comp_rate; + int32 first_success_by; int32 match_size_good; int32 match_size_drop; } PGLZ_Strategy; @@ -91,21 +90,11 @@ typedef struct PGLZ_Strategy /* ---------- * The standard strategies * - * PGLZ_strategy_default Starts compression only if input is - * at least 256 bytes large. Stores output - * uncompressed if compression does not - * gain at least 20% size reducture but - * input does not exceed 6K. Stops history - * lookup if at least a 128 byte long - * match has been found. + * PGLZ_strategy_default Recommended default strategy for TOAST. * - * This is the default strategy if none - * is given to pglz_compress(). - * - * PGLZ_strategy_always Starts compression on any infinitely - * small input and does fallback to - * uncompressed storage only if output - * would be larger than input. + * PGLZ_strategy_always Try to compress inputs of any length. + * Fallback to uncompressed storage only if + * output would be larger than input. * ---------- */ extern const PGLZ_Strategy *const PGLZ_strategy_default;