Re: I/O on select count(*)

Поиск
Список
Период
Сортировка
От Robert Lor
Тема Re: I/O on select count(*)
Дата
Msg-id 482CA9BE.8060204@sun.com
обсуждение исходный текст
Ответ на Re: I/O on select count(*)  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-performance
Tom Lane wrote:
> Hmm, the problem would be trying to figure out what percentage of writes
> could be blamed solely on hint-bit updates and not any other change to
> the page.  I don't think that the bufmgr currently keeps enough state to
> know that, but you could probably modify it easily enough, since callers
> distinguish MarkBufferDirty from SetBufferCommitInfoNeedsSave.  Define
> another flag bit that's set only by the first, and test it during
> write-out.
>
Ok, I made a few changes to bufmgr per my understanding of your
description above and with my limited understanding of the code. Patch
is attached.

Assuming the patch is correct, the effect of writes due to hint bits is
quite significant. I collected the data below by runing pgbench in one
terminal and psql on another to run the query.

Is the data plausible?

-Robert

--------------


Backend PID    : 16189
SQL Statement  : select count(*) from accounts;
Execution time : 17.33 sec

============ Buffer Read Counts ============
Tablespace   Database      Table      Count
      1663      16384       2600          1
      1663      16384       2601          1
      1663      16384       2615          1
      1663      16384       1255          2
      1663      16384       2602          2
      1663      16384       2603          2
      1663      16384       2616          2
      1663      16384       2650          2
      1663      16384       2678          2
      1663      16384       1247          3
      1663      16384       1249          3
      1663      16384       2610          3
      1663      16384       2655          3
      1663      16384       2679          3
      1663      16384       2684          3
      1663      16384       2687          3
      1663      16384       2690          3
      1663      16384       2691          3
      1663      16384       2703          4
      1663      16384       1259          5
      1663      16384       2653          5
      1663      16384       2662          5
      1663      16384       2663          5
      1663      16384       2659          7
      1663      16384      16397       8390

======== Dirty Buffer Write Counts =========
Tablespace   Database      Table      Count
      1663      16384      16402          2
      1663      16384      16394         11
      1663      16384      16397       4771

========== Hint Bits Write Counts ==========
Tablespace   Database      Table      Count
      1663      16384      16397       4508

Total buffer cache hits      :       732
Total buffer cache misses    :      7731
Average read time from cache :      9136 (ns)
Average read time from disk  :    384201 (ns)
Average write time to disk   :    210709 (ns)


Backend PID    : 16189
SQL Statement  : select count(*) from accounts;
Execution time : 12.72 sec

============ Buffer Read Counts ============
Tablespace   Database      Table      Count
      1663      16384      16397       8392

======== Dirty Buffer Write Counts =========
Tablespace   Database      Table      Count
      1663      16384      16394          6
      1663      16384      16402          7
      1663      16384      16397       2870

========== Hint Bits Write Counts ==========
Tablespace   Database      Table      Count
      1663      16384      16402          2
      1663      16384      16397       2010

Total buffer cache hits      :       606
Total buffer cache misses    :      7786
Average read time from cache :      6949 (ns)
Average read time from disk  :    706288 (ns)
Average write time to disk   :     90426 (ns)

Index: bufmgr.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.228
diff -u -3 -p -r1.228 bufmgr.c
--- bufmgr.c    1 Jan 2008 19:45:51 -0000    1.228
+++ bufmgr.c    15 May 2008 20:56:38 -0000
@@ -42,6 +42,7 @@
 #include "storage/smgr.h"
 #include "utils/resowner.h"
 #include "pgstat.h"
+#include "pg_trace.h"


 /* Note: these two macros only work on shared buffers, not local ones! */
@@ -171,6 +172,7 @@ ReadBuffer_common(Relation reln, BlockNu
     if (isExtend)
         blockNum = smgrnblocks(reln->rd_smgr);

+    TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, reln->rd_node.spcNode, reln->rd_node.dbNode, reln->rd_node.relNode,
isLocalBuf);
     pgstat_count_buffer_read(reln);

     if (isLocalBuf)
@@ -200,12 +202,16 @@ ReadBuffer_common(Relation reln, BlockNu
     {
         if (!isExtend)
         {
+            TRACE_POSTGRESQL_BUFFER_HIT();
             /* Just need to update stats before we exit */
             pgstat_count_buffer_hit(reln);

             if (VacuumCostActive)
                 VacuumCostBalance += VacuumCostPageHit;

+            TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum,
+                 reln->rd_node.spcNode, reln->rd_node.dbNode,
+                 reln->rd_node.relNode, isLocalBuf, found);
             return BufferDescriptorGetBuffer(bufHdr);
         }

@@ -257,6 +263,7 @@ ReadBuffer_common(Relation reln, BlockNu
             } while (!StartBufferIO(bufHdr, true));
         }
     }
+    TRACE_POSTGRESQL_BUFFER_MISS();

     /*
      * if we have gotten to this point, we have allocated a buffer for the
@@ -324,6 +331,9 @@ ReadBuffer_common(Relation reln, BlockNu
     if (VacuumCostActive)
         VacuumCostBalance += VacuumCostPageMiss;

+    TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, reln->rd_node.spcNode,
+        reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf, found);
+
     return BufferDescriptorGetBuffer(bufHdr);
 }

@@ -466,6 +476,11 @@ BufferAlloc(Relation reln,
              * happens to be trying to split the page the first one got from
              * StrategyGetBuffer.)
              */
+
+            TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_START(blockNum,
+              reln->rd_node.spcNode, reln->rd_node.dbNode,
+              reln->rd_node.relNode);
+
             if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
             {
                 /*
@@ -488,6 +503,11 @@ BufferAlloc(Relation reln,
                 /* OK, do the I/O */
                 FlushBuffer(buf, NULL);
                 LWLockRelease(buf->content_lock);
+
+                TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_DONE(
+                  blockNum, reln->rd_node.spcNode,
+                  reln->rd_node.dbNode, reln->rd_node.relNode,
+                  (oldFlags & BM_HINT_BITS_TEST));
             }
             else
             {
@@ -1734,6 +1754,10 @@ FlushBuffer(volatile BufferDesc *buf, SM
     buf->flags &= ~BM_JUST_DIRTIED;
     UnlockBufHdr(buf);

+    TRACE_POSTGRESQL_BUFFER_WRITE_START(buf->tag.blockNum,
+         reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+         reln->smgr_rnode.relNode);
+
     smgrwrite(reln,
               buf->tag.blockNum,
               (char *) BufHdrGetBlock(buf),
@@ -1741,6 +1765,10 @@ FlushBuffer(volatile BufferDesc *buf, SM

     BufferFlushCount++;

+    TRACE_POSTGRESQL_BUFFER_WRITE_DONE(buf->tag.blockNum,
+         reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+         reln->smgr_rnode.relNode, (buf->flags & BM_HINT_BITS_TEST));
+
     /*
      * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
      * end the io_in_progress state.
@@ -2155,7 +2183,7 @@ SetBufferCommitInfoNeedsSave(Buffer buff
         Assert(bufHdr->refcount > 0);
         if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
             VacuumCostBalance += VacuumCostPageDirty;
-        bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+        bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED | BM_HINT_BITS_TEST);
         UnlockBufHdr(bufHdr);
     }
 }
@@ -2492,7 +2520,7 @@ TerminateBufferIO(volatile BufferDesc *b
     Assert(buf->flags & BM_IO_IN_PROGRESS);
     buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
     if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-        buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+        buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED | BM_HINT_BITS_TEST);
     buf->flags |= set_flag_bits;

     UnlockBufHdr(buf);
Index: buf_internals.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.95
diff -u -3 -p -r1.95 buf_internals.h
--- buf_internals.h    1 Jan 2008 19:45:58 -0000    1.95
+++ buf_internals.h    15 May 2008 20:59:42 -0000
@@ -36,6 +36,7 @@
 #define BM_JUST_DIRTIED            (1 << 5)        /* dirtied since write started */
 #define BM_PIN_COUNT_WAITER        (1 << 6)        /* have waiter for sole pin */
 #define BM_CHECKPOINT_NEEDED    (1 << 7)        /* must write for checkpoint */
+#define BM_HINT_BITS_TEST    (1 << 8)        /* test effect of writes due to hint bits */

 typedef bits16 BufFlags;


В списке pgsql-performance по дате отправления:

Предыдущее
От: "Subbiah Stalin-XCGF84"
Дата:
Сообщение: Re: Update performance degrades over time
Следующее
От: James Mansion
Дата:
Сообщение: Re: I/O on select count(*)