Don't open formally non-existent segments in _mdfd_getseg().

Before this commit, _mdfd_getseg(), in contrast to mdnblocks(), did not
verify whether all segments leading up to the to-be-opened one were
RELSEG_SIZE sized. That is e.g. not the case after truncating a
relation, because later segments just get truncated to zero length, not
removed.

Once a "non-existent" segment has been opened in a session, mdnblocks()
will return wrong results, causing errors like "could not read block %u
in file" when accessing blocks. Closing the session, or the later
arrival of relevant invalidation messages, would "fix" the problem.

That, so far, was mostly harmless, because most segment accesses are
only done after an mdnblocks() call. But since 428b1d6b29 we try to
open segments that might have been deleted, to trigger kernel writeback
from a backend's queue of recent writes.

To fix, check segment sizes in _mdfd_getseg() when opening previously
unopened segments. In practice this shouldn't imply a lot of additional
lseek() calls, because mdnblocks() will most of the time already have
opened all relevant segments.

This commit also fixes a second problem, namely that _mdfd_getseg(
EXTENSION_RETURN_NULL) extends files during recovery, which is not
desirable for the mdwriteback() case.  Add EXTENSION_REALLY_RETURN_NULL,
which does not behave that way, and use it.

Reported-By: Thom Brown
Author: Andres Freund, Abhijit Menon-Sen
Reviewed-By: Robert Haas, Fabien Coelho
Discussion: CAA-aLv6Dp_ZsV-44QA-2zgkqWKQq=GedBX2dRSrWpxqovXK=Pg@mail.gmail.com
Fixes: 428b1d6b29
This commit is contained in:
Andres Freund 2016-04-26 20:32:51 -07:00
parent c6ff84b06a
commit 72a98a6395

View File

@ -165,9 +165,14 @@ static CycleCtr mdckpt_cycle_ctr = 0;
typedef enum /* behavior for mdopen & _mdfd_getseg */
{
EXTENSION_FAIL, /* ereport if segment not present */
EXTENSION_RETURN_NULL, /* return NULL if not present */
EXTENSION_CREATE /* create new segments as needed */
/* ereport if segment not present, create in recovery */
EXTENSION_FAIL,
/* return NULL if not present, create in recovery */
EXTENSION_RETURN_NULL,
/* return NULL if not present */
EXTENSION_REALLY_RETURN_NULL,
/* create new segments as needed */
EXTENSION_CREATE
} ExtensionBehavior;
/* local routines */
@ -591,7 +596,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
if (fd < 0)
{
if (behavior == EXTENSION_RETURN_NULL &&
if ((behavior == EXTENSION_RETURN_NULL ||
behavior == EXTENSION_REALLY_RETURN_NULL) &&
FILE_POSSIBLY_DELETED(errno))
{
pfree(path);
@ -685,7 +691,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
segnum_end;
v = _mdfd_getseg(reln, forknum, blocknum, false,
EXTENSION_RETURN_NULL);
EXTENSION_REALLY_RETURN_NULL);
/*
* We might be flushing buffers of already removed relations, that's
@ -1774,7 +1780,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
BlockNumber nextsegno;
if (!v)
return NULL; /* only possible if EXTENSION_RETURN_NULL */
return NULL; /* if EXTENSION_(REALLY_)RETURN_NULL */
targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
@ -1783,23 +1789,34 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
if (v->mdfd_chain == NULL)
{
/*
* Normally we will create new segments only if authorized by the
* caller (i.e., we are doing mdextend()). But when doing WAL
* recovery, create segments anyway; this allows cases such as
* replaying WAL data that has a write into a high-numbered
* segment of a relation that was later deleted. We want to go
* ahead and create the segments so we can finish out the replay.
*
* We have to maintain the invariant that segments before the last
* active segment are of size RELSEG_SIZE; therefore, pad them out
* with zeroes if needed. (This only matters if caller is
* extending the relation discontiguously, but that can happen in
* hash indexes.)
*/
if (behavior == EXTENSION_CREATE || InRecovery)
BlockNumber nblocks = _mdnblocks(reln, forknum, v);
int flags = 0;
if (nblocks > ((BlockNumber) RELSEG_SIZE))
elog(FATAL, "segment too big");
if (behavior == EXTENSION_CREATE ||
(InRecovery && behavior != EXTENSION_REALLY_RETURN_NULL))
{
if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
/*
* Normally we will create new segments only if authorized by
* the caller (i.e., we are doing mdextend()). But when doing
* WAL recovery, create segments anyway; this allows cases
* such as replaying WAL data that has a write into a
* high-numbered segment of a relation that was later deleted.
* We want to go ahead and create the segments so we can
* finish out the replay. However if the caller has specified
* EXTENSION_REALLY_RETURN_NULL, then extension is not desired
* even in recovery; we won't reach this point in that case.
*
* We have to maintain the invariant that segments before the
* last active segment are of size RELSEG_SIZE; therefore, if
* extending, pad them out with zeroes if needed. (This only
* matters if in recovery, or if the caller is extending the
* relation discontiguously, but that can happen in hash
* indexes.)
*/
if (nblocks < ((BlockNumber) RELSEG_SIZE))
{
char *zerobuf = palloc0(BLCKSZ);
@ -1808,16 +1825,41 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
zerobuf, skipFsync);
pfree(zerobuf);
}
v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
flags = O_CREAT;
}
else
else if (nblocks < ((BlockNumber) RELSEG_SIZE))
{
/* We won't create segment if not existent */
v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
/*
* When not extending, only open the next segment if the
* current one is exactly RELSEG_SIZE. If not (this branch),
* either return NULL or fail.
*/
if (behavior == EXTENSION_RETURN_NULL ||
behavior == EXTENSION_REALLY_RETURN_NULL)
{
/*
* Some callers discern between reasons for _mdfd_getseg()
* returning NULL based on errno. As there's no failing
* syscall involved in this case, explicitly set errno to
* ENOENT, as that seems the closest interpretation.
*/
errno = ENOENT;
return NULL;
}
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
_mdfd_segpath(reln, forknum, nextsegno),
blkno, nblocks)));
}
v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, flags);
if (v->mdfd_chain == NULL)
{
if (behavior == EXTENSION_RETURN_NULL &&
if ((behavior == EXTENSION_RETURN_NULL ||
behavior == EXTENSION_REALLY_RETURN_NULL) &&
FILE_POSSIBLY_DELETED(errno))
return NULL;
ereport(ERROR,