Attached is a patch against current CVS that fixes both of the known
problems with sequences: failure to flush XLOG after a transaction
that only does "SELECT nextval()", and failure to force a new WAL
record to be written on the first nextval after a checkpoint.
(The latter uses Vadim's idea of looking at the sequence page LSN.)
I haven't tested it really extensively, but it seems to cure the
reported problems.
Some notes:
1. I found what I believe is another bug in the sequence logic:
fetch = log = fetch - log + SEQ_LOG_VALS;
should be
fetch = log = fetch + SEQ_LOG_VALS;
I can't see any reason to reduce the number of values prefetched
by the number formerly prefetched. Also, if the sequence's "cache"
setting is large (more than SEQ_LOG_VALS), the original code could
easily fail to fetch as many values as it was supposed to cache,
let alone additional ones to be prefetched and logged.
2. I renamed XLogCtl->RedoRecPtr to SavedRedoRecPtr, and renamed
the associated routines to SetSavedRedoRecPtr/GetSavedRedoRecPtr,
in hopes of reducing confusion.
3. I believe it'd now be possible to remove SavedRedoRecPtr and
SetSavedRedoRecPtr/GetSavedRedoRecPtr entirely, in favor of letting
the postmaster fetch the updated pointer with GetRedoRecPtr just
like a backend would. This would be cleaner and less code ... but
someone might object that it introduces a risk of postmaster hangup,
if some backend crashes whilst holding info_lck. I consider that
risk minuscule given the short intervals in which info_lck is held,
but it can't be denied that the risk is not zero. Thoughts?
Comments? Unless I hear objections I will patch this in current
and the 7.2 branch. (If we agree to remove SavedRedoRecPtr,
though, I don't think we should back-patch that change.)
regards, tom lane
*** src/backend/access/transam/xact.c.orig Tue Mar 12 07:56:31 2002
--- src/backend/access/transam/xact.c Thu Mar 14 20:00:50 2002
***************
*** 546,577 ****
xid = GetCurrentTransactionId();
/*
! * We needn't write anything in xlog or clog if the transaction was
! * read-only, which we check by testing if it made any xlog entries.
*/
! if (MyLastRecPtr.xrecoff != 0)
{
- XLogRecData rdata;
- xl_xact_commit xlrec;
XLogRecPtr recptr;
BufmgrCommit();
- xlrec.xtime = time(NULL);
- rdata.buffer = InvalidBuffer;
- rdata.data = (char *) (&xlrec);
- rdata.len = SizeOfXactCommit;
- rdata.next = NULL;
-
START_CRIT_SECTION();
! /*
! * SHOULD SAVE ARRAY OF RELFILENODE-s TO DROP
! */
! recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, &rdata);
/*
! * Sleep before commit! So we can flush more than one commit
* records per single fsync. (The idea is some other backend may
* do the XLogFlush while we're sleeping. This needs work still,
* because on most Unixen, the minimum select() delay is 10msec or
--- 546,593 ----
xid = GetCurrentTransactionId();
/*
! * We only need to log the commit in xlog and clog if the transaction made
! * any transaction-controlled XLOG entries. (Otherwise, its XID appears
! * nowhere in permanent storage, so no one will ever care if it
! * committed.) However, we must flush XLOG to disk if we made any XLOG
! * entries, whether in or out of transaction control. For example, if we
! * reported a nextval() result to the client, this ensures that any XLOG
! * record generated by nextval will hit the disk before we report the
! * transaction committed.
*/
! if (MyXactMadeXLogEntry)
{
XLogRecPtr recptr;
BufmgrCommit();
START_CRIT_SECTION();
! if (MyLastRecPtr.xrecoff != 0)
! {
! /* Need to emit a commit record */
! XLogRecData rdata;
! xl_xact_commit xlrec;
!
! xlrec.xtime = time(NULL);
! rdata.buffer = InvalidBuffer;
! rdata.data = (char *) (&xlrec);
! rdata.len = SizeOfXactCommit;
! rdata.next = NULL;
!
! /*
! * XXX SHOULD SAVE ARRAY OF RELFILENODE-s TO DROP
! */
! recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, &rdata);
! }
! else
! {
! /* Just flush through last record written by me */
! recptr = ProcLastRecEnd;
! }
/*
! * Sleep before flush! So we can flush more than one commit
* records per single fsync. (The idea is some other backend may
* do the XLogFlush while we're sleeping. This needs work still,
* because on most Unixen, the minimum select() delay is 10msec or
***************
*** 593,607 ****
XLogFlush(recptr);
! /* Break the chain of back-links in the XLOG records I output */
! MyLastRecPtr.xrecoff = 0;
!
! /* Mark the transaction committed in clog */
! TransactionIdCommit(xid);
END_CRIT_SECTION();
}
/* Show myself as out of the transaction in PROC array */
MyProc->logRec.xrecoff = 0;
--- 609,625 ----
XLogFlush(recptr);
! /* Mark the transaction committed in clog, if needed */
! if (MyLastRecPtr.xrecoff != 0)
! TransactionIdCommit(xid);
END_CRIT_SECTION();
}
+ /* Break the chain of back-links in the XLOG records I output */
+ MyLastRecPtr.xrecoff = 0;
+ MyXactMadeXLogEntry = false;
+
/* Show myself as out of the transaction in PROC array */
MyProc->logRec.xrecoff = 0;
***************
*** 689,696 ****
TransactionId xid = GetCurrentTransactionId();
/*
! * We needn't write anything in xlog or clog if the transaction was
! * read-only, which we check by testing if it made any xlog entries.
*
* Extra check here is to catch case that we aborted partway through
* RecordTransactionCommit ...
--- 707,717 ----
TransactionId xid = GetCurrentTransactionId();
/*
! * We only need to log the abort in xlog and clog if the transaction made
! * any transaction-controlled XLOG entries. (Otherwise, its XID appears
! * nowhere in permanent storage, so no one will ever care if it
! * committed.) We do not flush XLOG to disk in any case, since the
! * default assumption after a crash would be that we aborted, anyway.
*
* Extra check here is to catch case that we aborted partway through
* RecordTransactionCommit ...
***************
*** 714,724 ****
*/
recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, &rdata);
- /*
- * There's no need for XLogFlush here, since the default
- * assumption would be that we aborted, anyway.
- */
-
/* Mark the transaction aborted in clog */
TransactionIdAbort(xid);
--- 735,740 ----
***************
*** 727,732 ****
--- 743,750 ----
/* Break the chain of back-links in the XLOG records I output */
MyLastRecPtr.xrecoff = 0;
+ MyXactMadeXLogEntry = false;
+
/* Show myself as out of the transaction in PROC array */
MyProc->logRec.xrecoff = 0;
*** src/backend/access/transam/xlog.c.orig Tue Mar 12 07:56:31 2002
--- src/backend/access/transam/xlog.c Thu Mar 14 20:29:51 2002
***************
*** 131,157 ****
/*
* MyLastRecPtr points to the start of the last XLOG record inserted by the
! * current transaction. If MyLastRecPtr.xrecoff == 0, then we are not in
! * a transaction or the transaction has not yet made any loggable changes.
*
* Note that XLOG records inserted outside transaction control are not
! * reflected into MyLastRecPtr.
*/
XLogRecPtr MyLastRecPtr = {0, 0};
/*
* ProcLastRecPtr points to the start of the last XLOG record inserted by the
* current backend. It is updated for all inserts, transaction-controlled
! * or not.
*/
static XLogRecPtr ProcLastRecPtr = {0, 0};
/*
* RedoRecPtr is this backend's local copy of the REDO record pointer
* (which is almost but not quite the same as a pointer to the most recent
* CHECKPOINT record). We update this from the shared-memory copy,
* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
! * hold the Insert lock). See XLogInsert for details.
*/
static XLogRecPtr RedoRecPtr;
--- 131,166 ----
/*
* MyLastRecPtr points to the start of the last XLOG record inserted by the
! * current transaction. If MyLastRecPtr.xrecoff == 0, then the current
! * xact hasn't yet inserted any transaction-controlled XLOG records.
*
* Note that XLOG records inserted outside transaction control are not
! * reflected into MyLastRecPtr. They do, however, cause MyXactMadeXLogEntry
! * to be set true. The latter can be used to test whether the current xact
! * made any loggable changes (including out-of-xact changes, such as
! * sequence updates).
*/
XLogRecPtr MyLastRecPtr = {0, 0};
+ bool MyXactMadeXLogEntry = false;
+
/*
* ProcLastRecPtr points to the start of the last XLOG record inserted by the
* current backend. It is updated for all inserts, transaction-controlled
! * or not. ProcLastRecEnd is similar but points to end+1 of last record.
*/
static XLogRecPtr ProcLastRecPtr = {0, 0};
+ XLogRecPtr ProcLastRecEnd = {0, 0};
+
/*
* RedoRecPtr is this backend's local copy of the REDO record pointer
* (which is almost but not quite the same as a pointer to the most recent
* CHECKPOINT record). We update this from the shared-memory copy,
* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
! * hold the Insert lock). See XLogInsert for details. We are also allowed
! * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
! * see GetRedoRecPtr.
*/
static XLogRecPtr RedoRecPtr;
***************
*** 272,278 ****
StartUpID ThisStartUpID;
/* This value is not protected by *any* lock... */
! XLogRecPtr RedoRecPtr; /* see SetRedoRecPtr/GetRedoRecPtr */
slock_t info_lck; /* locks shared LogwrtRqst/LogwrtResult */
} XLogCtlData;
--- 281,288 ----
StartUpID ThisStartUpID;
/* This value is not protected by *any* lock... */
! /* see SetSavedRedoRecPtr/GetSavedRedoRecPtr */
! XLogRecPtr SavedRedoRecPtr;
slock_t info_lck; /* locks shared LogwrtRqst/LogwrtResult */
} XLogCtlData;
***************
*** 777,782 ****
--- 787,793 ----
MyLastRecPtr = RecPtr;
ProcLastRecPtr = RecPtr;
Insert->PrevRecord = RecPtr;
+ MyXactMadeXLogEntry = true;
Insert->currpos += SizeOfXLogRecord;
freespace -= SizeOfXLogRecord;
***************
*** 855,860 ****
--- 866,873 ----
SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
}
+ ProcLastRecEnd = RecPtr;
+
END_CRIT_SECTION();
return (RecPtr);
***************
*** 2538,2544 ****
ThisStartUpID = checkPoint.ThisStartUpID;
RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
! XLogCtl->RedoRecPtr = checkPoint.redo;
if (XLByteLT(RecPtr, checkPoint.redo))
elog(PANIC, "invalid redo in checkpoint record");
--- 2551,2557 ----
ThisStartUpID = checkPoint.ThisStartUpID;
RedoRecPtr = XLogCtl->Insert.RedoRecPtr =
! XLogCtl->SavedRedoRecPtr = checkPoint.redo;
if (XLByteLT(RecPtr, checkPoint.redo))
elog(PANIC, "invalid redo in checkpoint record");
***************
*** 2824,2855 ****
SetThisStartUpID(void)
{
ThisStartUpID = XLogCtl->ThisStartUpID;
! RedoRecPtr = XLogCtl->RedoRecPtr;
}
/*
* CheckPoint process called by postmaster saves copy of new RedoRecPtr
! * in shmem (using SetRedoRecPtr). When checkpointer completes, postmaster
! * calls GetRedoRecPtr to update its own copy of RedoRecPtr, so that
! * subsequently-spawned backends will start out with a reasonably up-to-date
! * local RedoRecPtr. Since these operations are not protected by any lock
! * and copying an XLogRecPtr isn't atomic, it's unsafe to use either of these
! * routines at other times!
! *
! * Note: once spawned, a backend must update its local RedoRecPtr from
! * XLogCtl->Insert.RedoRecPtr while holding the insert lock. This is
! * done in XLogInsert().
*/
void
! SetRedoRecPtr(void)
{
! XLogCtl->RedoRecPtr = RedoRecPtr;
}
void
GetRedoRecPtr(void)
{
! RedoRecPtr = XLogCtl->RedoRecPtr;
}
/*
--- 2837,2883 ----
SetThisStartUpID(void)
{
ThisStartUpID = XLogCtl->ThisStartUpID;
! RedoRecPtr = XLogCtl->SavedRedoRecPtr;
}
/*
* CheckPoint process called by postmaster saves copy of new RedoRecPtr
! * in shmem (using SetSavedRedoRecPtr). When checkpointer completes,
! * postmaster calls GetSavedRedoRecPtr to update its own copy of RedoRecPtr,
! * so that subsequently-spawned backends will start out with a reasonably
! * up-to-date local RedoRecPtr. Since these operations are not protected by
! * any lock and copying an XLogRecPtr isn't atomic, it's unsafe to use either
! * of these routines at other times!
*/
void
! SetSavedRedoRecPtr(void)
{
! XLogCtl->SavedRedoRecPtr = RedoRecPtr;
}
void
+ GetSavedRedoRecPtr(void)
+ {
+ RedoRecPtr = XLogCtl->SavedRedoRecPtr;
+ }
+
+ /*
+ * Once spawned, a backend may update its local RedoRecPtr from
+ * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
+ * to do so. This is done in XLogInsert() or GetRedoRecPtr().
+ */
+ XLogRecPtr
GetRedoRecPtr(void)
{
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
! Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
! RedoRecPtr = xlogctl->Insert.RedoRecPtr;
! SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
!
! return RedoRecPtr;
}
/*
***************
*** 2862,2867 ****
--- 2890,2896 ----
/* suppress in-transaction check in CreateCheckPoint */
MyLastRecPtr.xrecoff = 0;
+ MyXactMadeXLogEntry = false;
CritSectionCount++;
CreateDummyCaches();
***************
*** 2886,2892 ****
uint32 _logId;
uint32 _logSeg;
! if (MyLastRecPtr.xrecoff != 0)
elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
/*
--- 2915,2921 ----
uint32 _logId;
uint32 _logSeg;
! if (MyXactMadeXLogEntry)
elog(ERROR, "CreateCheckPoint: cannot be called inside transaction block");
/*
***************
*** 2972,2980 ****
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls;
! * this must be done while holding the insert lock.
*/
! RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
/*
* Get UNDO record ptr - this is oldest of PROC->logRec values. We do
--- 3001,3016 ----
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls;
! * this must be done while holding the insert lock AND the info_lck.
*/
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! SpinLockAcquire_NoHoldoff(&xlogctl->info_lck);
! RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
! SpinLockRelease_NoHoldoff(&xlogctl->info_lck);
! }
/*
* Get UNDO record ptr - this is oldest of PROC->logRec values. We do
*** src/backend/bootstrap/bootstrap.c.orig Tue Mar 12 07:56:32 2002
--- src/backend/bootstrap/bootstrap.c Thu Mar 14 20:19:51 2002
***************
*** 386,392 ****
InitDummyProcess(); /* needed to get LWLocks */
CreateDummyCaches();
CreateCheckPoint(false);
! SetRedoRecPtr();
proc_exit(0); /* done */
case BS_XLOG_STARTUP:
--- 386,392 ----
InitDummyProcess(); /* needed to get LWLocks */
CreateDummyCaches();
CreateCheckPoint(false);
! SetSavedRedoRecPtr(); /* pass redo ptr back to postmaster */
proc_exit(0); /* done */
case BS_XLOG_STARTUP:
*** src/backend/commands/sequence.c.orig Tue Mar 12 07:56:35 2002
--- src/backend/commands/sequence.c Thu Mar 14 22:06:22 2002
***************
*** 286,291 ****
--- 286,292 ----
char *seqname = get_seq_name(seqin);
SeqTable elm;
Buffer buf;
+ Page page;
Form_pg_sequence seq;
int64 incby,
maxv,
***************
*** 316,321 ****
--- 317,323 ----
seq = read_info("nextval", elm, &buf); /* lock page' buffer and
* read tuple */
+ page = BufferGetPage(buf);
last = next = result = seq->last_value;
incby = seq->increment_by;
***************
*** 331,341 ****
log--;
}
if (log < fetch)
{
! fetch = log = fetch - log + SEQ_LOG_VALS;
logit = true;
}
while (fetch) /* try to fetch cache [+ log ] numbers */
{
--- 333,365 ----
log--;
}
+ /*
+ * Decide whether we should emit a WAL log record. If so, force up
+ * the fetch count to grab SEQ_LOG_VALS more values than we actually
+ * need to cache. (These will then be usable without logging.)
+ *
+ * If this is the first nextval after a checkpoint, we must force
+ * a new WAL record to be written anyway, else replay starting from the
+ * checkpoint would fail to advance the sequence past the logged
+ * values. In this case we may as well fetch extra values.
+ */
if (log < fetch)
{
! /* forced log to satisfy local demand for values */
! fetch = log = fetch + SEQ_LOG_VALS;
logit = true;
}
+ else
+ {
+ XLogRecPtr redoptr = GetRedoRecPtr();
+
+ if (XLByteLE(PageGetLSN(page), redoptr))
+ {
+ /* last update of seq was before checkpoint */
+ fetch = log = fetch + SEQ_LOG_VALS;
+ logit = true;
+ }
+ }
while (fetch) /* try to fetch cache [+ log ] numbers */
{
***************
*** 386,391 ****
--- 410,418 ----
}
}
+ log -= fetch; /* adjust for any unfetched numbers */
+ Assert(log >= 0);
+
/* save info in local cache */
elm->last = result; /* last returned number */
elm->cached = last; /* last fetched number */
***************
*** 396,402 ****
xl_seq_rec xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
- Page page = BufferGetPage(buf);
xlrec.node = elm->rel->rd_node;
rdata[0].buffer = InvalidBuffer;
--- 423,428 ----
***************
*** 417,431 ****
PageSetLSN(page, recptr);
PageSetSUI(page, ThisStartUpID);
-
- if (fetch) /* not all numbers were fetched */
- log -= fetch;
}
/* update on-disk data */
seq->last_value = last; /* last fetched number */
seq->is_called = true;
- Assert(log >= 0);
seq->log_cnt = log; /* how much is logged */
END_CRIT_SECTION();
--- 443,453 ----
*** src/backend/postmaster/postmaster.c.orig Tue Mar 12 07:57:01 2002
--- src/backend/postmaster/postmaster.c Thu Mar 14 20:20:01 2002
***************
*** 1683,1689 ****
{
checkpointed = time(NULL);
/* Update RedoRecPtr for future child backends */
! GetRedoRecPtr();
}
}
else
--- 1683,1689 ----
{
checkpointed = time(NULL);
/* Update RedoRecPtr for future child backends */
! GetSavedRedoRecPtr();
}
}
else
*** src/include/access/xlog.h.orig Fri Nov 16 12:17:19 2001
--- src/include/access/xlog.h Thu Mar 14 20:20:51 2002
***************
*** 178,183 ****
--- 178,185 ----
extern StartUpID ThisStartUpID; /* current SUI */
extern bool InRecovery;
extern XLogRecPtr MyLastRecPtr;
+ extern bool MyXactMadeXLogEntry;
+ extern XLogRecPtr ProcLastRecEnd;
/* these variables are GUC parameters related to XLOG */
extern int CheckPointSegments;
***************
*** 205,212 ****
extern void CreateCheckPoint(bool shutdown);
extern void SetThisStartUpID(void);
extern void XLogPutNextOid(Oid nextOid);
! extern void SetRedoRecPtr(void);
! extern void GetRedoRecPtr(void);
/* in storage/ipc/sinval.c, but don't want to declare in sinval.h because
* we'd have to include xlog.h into that ...
--- 207,215 ----
extern void CreateCheckPoint(bool shutdown);
extern void SetThisStartUpID(void);
extern void XLogPutNextOid(Oid nextOid);
! extern void SetSavedRedoRecPtr(void);
! extern void GetSavedRedoRecPtr(void);
! extern XLogRecPtr GetRedoRecPtr(void);
/* in storage/ipc/sinval.c, but don't want to declare in sinval.h because
* we'd have to include xlog.h into that ...