Avoiding unnecessary reads in recovery

Поиск
Список
Период
Сортировка
От Heikki Linnakangas
Тема Avoiding unnecessary reads in recovery
Дата
Msg-id 462F4E33.50904@enterprisedb.com
обсуждение исходный текст
Ответы Re: Avoiding unnecessary reads in recovery  (Heikki Linnakangas <heikki@enterprisedb.com>)
Re: Avoiding unnecessary reads in recovery  (Gregory Stark <stark@enterprisedb.com>)
Re: Avoiding unnecessary reads in recovery  ("Simon Riggs" <simon@2ndquadrant.com>)
Re: Avoiding unnecessary reads in recovery  (Jim Nasby <decibel@decibel.org>)
Список pgsql-hackers
In recovery, with full_page_writes=on, we read in each page only to
overwrite the contents with a full page image. That's a waste of time,
and can have a surprisingly large effect on recovery time.

As a quick test on my laptop, I initialized a DBT-2 test with 5
warehouses, and let it run for 2 minutes without think-times to generate
some WAL. Then I did a "kill -9 postmaster", and took a copy of the data
directory to use for testing recovery.

With CVS HEAD, the recovery took ~ 2 minutes. With the attached patch,
it took 5 seconds. (yes, I used the same not-yet-recovered data
directory in both tests, and cleared the os cache with "echo 1 >
/proc/sys/vm/drop_caches").

I was surprised how big a difference it makes, but when you think about
it, it's logical. Without the patch, it's doing roughly the same I/O as
the test itself: reading in pages, modifying them, and writing them
back. With the patch, all the reads are done sequentially from the WAL,
and then written back in a batch at the end of the WAL replay, which is a
lot more efficient.

It's interesting that (with the patch) full_page_writes can *shorten*
your recovery time. I've always thought it to have a purely negative
effect on performance.

I'll leave it up to the jury if this tiny little change is appropriate
after feature freeze...

While working on this, this comment in ReadBuffer caught my eye:

>     /*
>      * During WAL recovery, the first access to any data page should
>      * overwrite the whole page from the WAL; so a clobbered page
>      * header is not reason to fail.  Hence, when InRecovery we may
>      * always act as though zero_damaged_pages is ON.
>      */
>     if (zero_damaged_pages || InRecovery)
>     {

But that assumption only holds if full_page_writes is enabled, right? I
changed that in the attached patch as well; even if the rest of the patch
isn't accepted, that part of it should still be applied, I think.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlogutils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlogutils.c,v
retrieving revision 1.49
diff -c -r1.49 xlogutils.c
*** src/backend/access/transam/xlogutils.c    5 Jan 2007 22:19:24 -0000    1.49
--- src/backend/access/transam/xlogutils.c    25 Apr 2007 11:40:09 -0000
***************
*** 226,232 ****
      if (blkno < lastblock)
      {
          /* page exists in file */
!         buffer = ReadBuffer(reln, blkno);
      }
      else
      {
--- 226,235 ----
      if (blkno < lastblock)
      {
          /* page exists in file */
!         if(init)
!             buffer = ZapBuffer(reln, blkno);
!         else
!             buffer = ReadBuffer(reln, blkno);
      }
      else
      {
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.216
diff -c -r1.216 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c    30 Mar 2007 18:34:55 -0000    1.216
--- src/backend/storage/buffer/bufmgr.c    25 Apr 2007 11:44:27 -0000
***************
*** 97,102 ****
--- 97,103 ----
  static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
                    int set_flag_bits);
  static void buffer_write_error_callback(void *arg);
+ static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum, bool alloc_only);
  static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
              bool *foundPtr);
  static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
***************
*** 121,126 ****
--- 122,148 ----
  Buffer
  ReadBuffer(Relation reln, BlockNumber blockNum)
  {
+     return ReadBuffer_common(reln, blockNum, false);
+ }
+
+ /*
+  * ZapBuffer -- like ReadBuffer, but doesn't read the contents of the page
+  *        from disk. The caller is expected to completely rewrite the page,
+  *        regardless of the current contents. This should only be used in
+  *        recovery where there's no concurrent readers that might see the
+  *        contents of the page before the caller rewrites it.
+  */
+ Buffer
+ ZapBuffer(Relation reln, BlockNumber blockNum)
+ {
+     Assert(InRecovery);
+
+     return ReadBuffer_common(reln, blockNum, true);
+ }
+
+ static Buffer
+ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool alloc_only)
+ {
      volatile BufferDesc *bufHdr;
      Block        bufBlock;
      bool        found;
***************
*** 253,269 ****
      }
      else
      {
          smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
          /* check for garbage data */
          if (!PageHeaderIsValid((PageHeader) bufBlock))
          {
              /*
!              * During WAL recovery, the first access to any data page should
!              * overwrite the whole page from the WAL; so a clobbered page
!              * header is not reason to fail.  Hence, when InRecovery we may
!              * always act as though zero_damaged_pages is ON.
               */
!             if (zero_damaged_pages || InRecovery)
              {
                  ereport(WARNING,
                          (errcode(ERRCODE_DATA_CORRUPTED),
--- 275,293 ----
      }
      else
      {
+         if(!alloc_only)
+         {
          smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
          /* check for garbage data */
          if (!PageHeaderIsValid((PageHeader) bufBlock))
          {
              /*
!              * When full_pages_writes is enabled, the first access to any data page should
!              * overwrite the whole page from the WAL during recovery; so a clobbered page
!              * header is not reason to fail.  Hence, we may
!              * act as though zero_damaged_pages is ON.
               */
!             if (zero_damaged_pages || (InRecovery && fullPageWrites))
              {
                  ereport(WARNING,
                          (errcode(ERRCODE_DATA_CORRUPTED),
***************
*** 277,282 ****
--- 301,307 ----
                   errmsg("invalid page header in block %u of relation \"%s\"",
                          blockNum, RelationGetRelationName(reln))));
          }
+         }
      }

      if (isLocalBuf)
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.384
diff -c -r1.384 guc.c
*** src/backend/utils/misc/guc.c    12 Apr 2007 06:53:47 -0000    1.384
--- src/backend/utils/misc/guc.c    25 Apr 2007 11:19:52 -0000
***************
*** 103,109 ****
  extern int    CommitDelay;
  extern int    CommitSiblings;
  extern char *default_tablespace;
- extern bool fullPageWrites;

  #ifdef TRACE_SORT
  extern bool trace_sort;
--- 103,108 ----
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h    5 Jan 2007 22:19:51 -0000    1.76
--- src/include/access/xlog.h    25 Apr 2007 11:19:46 -0000
***************
*** 142,147 ****
--- 142,148 ----
  extern int    XLogArchiveTimeout;
  extern char *XLOG_sync_method;
  extern const char XLOG_sync_method_default[];
+ extern bool fullPageWrites;

  #define XLogArchivingActive()    (XLogArchiveCommand[0] != '\0')

Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.102
diff -c -r1.102 bufmgr.h
*** src/include/storage/bufmgr.h    5 Jan 2007 22:19:57 -0000    1.102
--- src/include/storage/bufmgr.h    25 Apr 2007 11:39:35 -0000
***************
*** 111,116 ****
--- 111,117 ----
   * prototypes for functions in bufmgr.c
   */
  extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+ extern Buffer ZapBuffer(Relation reln, BlockNumber blockNum);
  extern void ReleaseBuffer(Buffer buffer);
  extern void UnlockReleaseBuffer(Buffer buffer);
  extern void MarkBufferDirty(Buffer buffer);

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Dave Page
Дата:
Сообщение: Re: Buildfarm: Stage logs not available for MSVC builds
Следующее
От: Kenneth Marshall
Дата:
Сообщение: Re: [PATCHES] Full page writes improvement, code update