diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 28f147fcaf..1852aef672 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -56,7 +56,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.85 2004/07/21 22:31:20 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.86 2004/08/15 23:44:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -322,16 +322,15 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) wstate->btws_zeropage = (Page) palloc0(BLCKSZ); smgrwrite(wstate->index->rd_smgr, wstate->btws_pages_written++, (char *) wstate->btws_zeropage, - !wstate->btws_use_wal); + true); } /* - * Now write the page. If not using WAL, say isTemp = true, to suppress - * duplicate fsync. If we are using WAL, it surely isn't a temp index, - * so !use_wal is a sufficient condition. + * Now write the page. We say isTemp = true even if it's not a + * temp index, because there's no need for smgr to schedule an fsync + * for this write; we'll do it ourselves before ending the build. */ - smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, - !wstate->btws_use_wal); + smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true); if (blkno == wstate->btws_pages_written) wstate->btws_pages_written++; @@ -802,9 +801,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * If we weren't using WAL, and the index isn't temp, we must fsync it - * down to disk before it's safe to commit the transaction. + * If the index isn't temp, we must fsync it down to disk before it's + * safe to commit the transaction. (For a temp index we don't care + * since the index will be uninteresting after a crash anyway.) + * + * It's obvious that we must do this when not WAL-logging the build. + * It's less obvious that we have to do it even if we did WAL-log the + * index pages. The reason is that since we're building outside + * shared buffers, a CHECKPOINT occurring during the build has no way + * to flush the previously written data to disk (indeed it won't know + * the index even exists). A crash later on would replay WAL from the + * checkpoint, therefore it wouldn't replay our earlier WAL entries. + * If we do not fsync those pages here, they might still not be on disk + * when the crash occurs. */ - if (!wstate->btws_use_wal && !wstate->index->rd_istemp) + if (!wstate->index->rd_istemp) smgrimmedsync(wstate->index->rd_smgr); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 87a2fb4856..a7eac55f25 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.125 2004/08/13 04:50:28 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.126 2004/08/15 23:44:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -5479,18 +5479,29 @@ copy_relation_data(Relation rel, SMgrRelation dst) } /* - * Now write the page. If not using WAL, say isTemp = true, to - * suppress duplicate fsync. If we are using WAL, it surely isn't a - * temp rel, so !use_wal is a sufficient condition. + * Now write the page. We say isTemp = true even if it's not a + * temp rel, because there's no need for smgr to schedule an fsync + * for this write; we'll do it ourselves below. */ - smgrwrite(dst, blkno, buf, !use_wal); + smgrwrite(dst, blkno, buf, true); } /* - * If we weren't using WAL, and the rel isn't temp, we must fsync it - * down to disk before it's safe to commit the transaction. + * If the rel isn't temp, we must fsync it down to disk before it's + * safe to commit the transaction. (For a temp rel we don't care + * since the rel will be uninteresting after a crash anyway.) + * + * It's obvious that we must do this when not WAL-logging the copy. + * It's less obvious that we have to do it even if we did WAL-log the + * copied pages. The reason is that since we're copying outside + * shared buffers, a CHECKPOINT occurring during the copy has no way + * to flush the previously written data to disk (indeed it won't know + * the new rel even exists). A crash later on would replay WAL from the + * checkpoint, therefore it wouldn't replay our earlier WAL entries. + * If we do not fsync those pages here, they might still not be on disk + * when the crash occurs. */ - if (!use_wal && !rel->rd_istemp) + if (!rel->rd_istemp) smgrimmedsync(dst); } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c9efd6341b..aba6b4a7d8 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.77 2004/07/17 03:28:55 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.78 2004/08/15 23:44:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -621,14 +621,15 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) * * Synchronously force all of the specified relation down to disk. * - * This is really only useful for non-WAL-logged index building: - * instead of incrementally WAL-logging the index build steps, - * we can just write completed index pages to disk with smgrwrite + * This is useful for building completely new relations (eg, new + * indexes). Instead of incrementally WAL-logging the index build + * steps, we can just write completed index pages to disk with smgrwrite * or smgrextend, and then fsync the completed index file before * committing the transaction. (This is sufficient for purposes of * crash recovery, since it effectively duplicates forcing a checkpoint - * for the completed index. But it is *not* workable if one wishes - * to use the WAL log for PITR or replication purposes.) + * for the completed index. But it is *not* sufficient if one wishes + * to use the WAL log for PITR or replication purposes: in that case + * we have to make WAL entries as well.) * * The preceding writes should specify isTemp = true to avoid * duplicative fsyncs.