Use TRUNCATE to preserve relfilenode for pg_largeobject + index.

Commit 9a974cbcba arranged to preserve
the relfilenode of user tables across pg_upgrade, but failed to notice
that pg_upgrade treats pg_largeobject as a user table and thus it needs
the same treatment. Otherwise, large objects will appear to vanish
after a pg_upgrade.

Commit d498e052b4 fixed this problem
by teaching pg_dump to UPDATE pg_class.relfilenode for pg_largeobject
and its index. However, because an UPDATE on the catalog rows doesn't
change anything on disk, this can leave stray files behind in the new
cluster. They will normally be empty, but it's a little bit untidy.

Hence, this commit arranges to do the same thing using DDL. Specifically,
it makes TRUNCATE work for the pg_largeobject catalog when in
binary-upgrade mode, and it then uses that command in binary-upgrade
dumps as a way of setting pg_class.relfilenode for pg_largeobject and
its index. That way, the old files are removed from the new cluster.

Discussion: http://postgr.es/m/CA+TgmoYYMXGUJO5GZk1-MByJGu_bB8CbOL6GJQC8=Bzt6x6vDg@mail.gmail.com
This commit is contained in:
Robert Haas 2022-07-28 16:03:42 -04:00
parent e09d7a1262
commit bbe08b8869
4 changed files with 120 additions and 15 deletions

View File

@ -40,6 +40,7 @@
#include "catalog/pg_depend.h"
#include "catalog/pg_foreign_table.h"
#include "catalog/pg_inherits.h"
#include "catalog/pg_largeobject.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_statistic_ext.h"
@ -2185,7 +2186,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple)
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is not a table", relname)));
if (!allowSystemTableMods && IsSystemClass(relid, reltuple))
/*
* Most system catalogs can't be truncated at all, or at least not unless
* allow_system_table_mods=on. As an exception, however, we allow
* pg_largeobject to be truncated as part of pg_upgrade, because we need
* to change its relfilenode to match the old cluster, and allowing a
* TRUNCATE command to be executed is the easiest way of doing that.
*/
if (!allowSystemTableMods && IsSystemClass(relid, reltuple)
&& (!IsBinaryUpgrade || relid != LargeObjectRelationId))
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("permission denied: \"%s\" is a system catalog",

View File

@ -319,6 +319,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
{
char *path;
int ret;
BlockNumber segno = 0;
path = relpath(rlocator, forkNum);
@ -353,8 +354,22 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
/* Prevent other backends' fds from holding on to the disk space */
ret = do_truncate(path);
/* Register request to unlink first segment later */
register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
/*
* Except during a binary upgrade, register request to unlink first
* segment later, rather than now.
*
* If we're performing a binary upgrade, the dangers described in the
* header comments for mdunlink() do not exist, since after a crash
* or even a simple ERROR, the upgrade fails and the whole new cluster
* must be recreated from scratch. And, on the other hand, it is
* important to remove the files from disk immediately, because we
* may be about to reuse the same relfilenumber.
*/
if (!IsBinaryUpgrade)
{
register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
++segno;
}
}
/*
@ -363,15 +378,17 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
if (ret >= 0)
{
char *segpath = (char *) palloc(strlen(path) + 12);
BlockNumber segno;
/*
* Note that because we loop until getting ENOENT, we will correctly
* remove all inactive segments as well as active ones.
*/
for (segno = 1;; segno++)
for (;; segno++)
{
sprintf(segpath, "%s.%u", path, segno);
if (segno == 0)
strcpy(segpath, path);
else
sprintf(segpath, "%s.%u", path, segno);
if (!RelFileLocatorBackendIsTemp(rlocator))
{

View File

@ -41,6 +41,7 @@
#include "access/tupdesc_details.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/binary_upgrade.h"
#include "catalog/catalog.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
@ -3707,9 +3708,36 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
TransactionId freezeXid = InvalidTransactionId;
RelFileLocator newrlocator;
/* Allocate a new relfilenumber */
newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
NULL, persistence);
if (!IsBinaryUpgrade)
{
/* Allocate a new relfilenumber */
newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
NULL, persistence);
}
else if (relation->rd_rel->relkind == RELKIND_INDEX)
{
if (!OidIsValid(binary_upgrade_next_index_pg_class_relfilenumber))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("index relfilenumber value not set when in binary upgrade mode")));
newrelfilenumber = binary_upgrade_next_index_pg_class_relfilenumber;
binary_upgrade_next_index_pg_class_relfilenumber = InvalidOid;
}
else if (relation->rd_rel->relkind == RELKIND_RELATION)
{
if (!OidIsValid(binary_upgrade_next_heap_pg_class_relfilenumber))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("heap relfilenumber value not set when in binary upgrade mode")));
newrelfilenumber = binary_upgrade_next_heap_pg_class_relfilenumber;
binary_upgrade_next_heap_pg_class_relfilenumber = InvalidOid;
}
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unexpected request for new relfilenumber in binary upgrade mode")));
/*
* Get a writable copy of the pg_class tuple for the given relation.
@ -3724,9 +3752,37 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
classform = (Form_pg_class) GETSTRUCT(tuple);
/*
* Schedule unlinking of the old storage at transaction commit.
* Schedule unlinking of the old storage at transaction commit, except
* when performing a binary upgrade, when we must do it immediately.
*/
RelationDropStorage(relation);
if (IsBinaryUpgrade)
{
SMgrRelation srel;
/*
* During a binary upgrade, we use this code path to ensure that
* pg_largeobject and its index have the same relfilenumbers as in
* the old cluster. This is necessary because pg_upgrade treats
* pg_largeobject like a user table, not a system table. It is however
* possible that a table or index may need to end up with the same
* relfilenumber in the new cluster as what it had in the old cluster.
* Hence, we can't wait until commit time to remove the old storage.
*
* In general, this function needs to have transactional semantics,
* and removing the old storage before commit time surely isn't.
* However, it doesn't really matter, because if a binary upgrade
* fails at this stage, the new cluster will need to be recreated
* anyway.
*/
srel = smgropen(relation->rd_locator, relation->rd_backend);
smgrdounlinkall(&srel, 1, false);
smgrclose(srel);
}
else
{
/* Not a binary upgrade, so just schedule it to happen later. */
RelationDropStorage(relation);
}
/*
* Create storage for the main fork of the new relfilenumber. If it's a

View File

@ -3141,6 +3141,7 @@ dumpDatabase(Archive *fout)
PGresult *lo_res;
PQExpBuffer loFrozenQry = createPQExpBuffer();
PQExpBuffer loOutQry = createPQExpBuffer();
PQExpBuffer loVacQry = createPQExpBuffer();
int i_relfrozenxid,
i_relfilenode,
i_oid,
@ -3167,15 +3168,36 @@ dumpDatabase(Archive *fout)
i_relfilenode = PQfnumber(lo_res, "relfilenode");
i_oid = PQfnumber(lo_res, "oid");
appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve values for pg_largeobject and its index\n");
appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n");
appendPQExpBufferStr(loVacQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n");
for (int i = 0; i < PQntuples(lo_res); ++i)
{
Oid oid;
RelFileNumber relfilenumber;
appendPQExpBuffer(loOutQry, "UPDATE pg_catalog.pg_class\n"
"SET relfrozenxid = '%u', relminmxid = '%u', relfilenode = '%u'\n"
"SET relfrozenxid = '%u', relminmxid = '%u'\n"
"WHERE oid = %u;\n",
atooid(PQgetvalue(lo_res, i, i_relfrozenxid)),
atooid(PQgetvalue(lo_res, i, i_relminmxid)),
atooid(PQgetvalue(lo_res, i, i_relfilenode)),
atooid(PQgetvalue(lo_res, i, i_oid)));
atooid(PQgetvalue(lo_res, i, i_relfilenode)));
oid = atooid(PQgetvalue(lo_res, i, i_oid));
relfilenumber = atooid(PQgetvalue(lo_res, i, i_relfilenode));
if (oid == LargeObjectRelationId)
appendPQExpBuffer(loVacQry,
"SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
relfilenumber);
else if (oid == LargeObjectLOidPNIndexId)
appendPQExpBuffer(loVacQry,
"SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
relfilenumber);
}
appendPQExpBufferStr(loVacQry,
"TRUNCATE pg_catalog.pg_largeobject;\n");
appendPQExpBufferStr(loOutQry, loVacQry->data);
ArchiveEntry(fout, nilCatalogId, createDumpId(),
ARCHIVE_OPTS(.tag = "pg_largeobject",
@ -3187,6 +3209,7 @@ dumpDatabase(Archive *fout)
destroyPQExpBuffer(loFrozenQry);
destroyPQExpBuffer(loOutQry);
destroyPQExpBuffer(loVacQry);
}
PQclear(res);