Re: I/O on select count(*)
От | Robert Lor |
---|---|
Тема | Re: I/O on select count(*) |
Дата | |
Msg-id | 482CA9BE.8060204@sun.com обсуждение исходный текст |
Ответ на | Re: I/O on select count(*) (Tom Lane <tgl@sss.pgh.pa.us>) |
Список | pgsql-performance |
Tom Lane wrote: > Hmm, the problem would be trying to figure out what percentage of writes > could be blamed solely on hint-bit updates and not any other change to > the page. I don't think that the bufmgr currently keeps enough state to > know that, but you could probably modify it easily enough, since callers > distinguish MarkBufferDirty from SetBufferCommitInfoNeedsSave. Define > another flag bit that's set only by the first, and test it during > write-out. > Ok, I made a few changes to bufmgr per my understanding of your description above and with my limited understanding of the code. Patch is attached. Assuming the patch is correct, the effect of writes due to hint bits is quite significant. I collected the data below by runing pgbench in one terminal and psql on another to run the query. Is the data plausible? -Robert -------------- Backend PID : 16189 SQL Statement : select count(*) from accounts; Execution time : 17.33 sec ============ Buffer Read Counts ============ Tablespace Database Table Count 1663 16384 2600 1 1663 16384 2601 1 1663 16384 2615 1 1663 16384 1255 2 1663 16384 2602 2 1663 16384 2603 2 1663 16384 2616 2 1663 16384 2650 2 1663 16384 2678 2 1663 16384 1247 3 1663 16384 1249 3 1663 16384 2610 3 1663 16384 2655 3 1663 16384 2679 3 1663 16384 2684 3 1663 16384 2687 3 1663 16384 2690 3 1663 16384 2691 3 1663 16384 2703 4 1663 16384 1259 5 1663 16384 2653 5 1663 16384 2662 5 1663 16384 2663 5 1663 16384 2659 7 1663 16384 16397 8390 ======== Dirty Buffer Write Counts ========= Tablespace Database Table Count 1663 16384 16402 2 1663 16384 16394 11 1663 16384 16397 4771 ========== Hint Bits Write Counts ========== Tablespace Database Table Count 1663 16384 16397 4508 Total buffer cache hits : 732 Total buffer cache misses : 7731 Average read time from cache : 9136 (ns) Average read time from disk : 384201 (ns) Average write time to disk : 210709 (ns) Backend PID : 16189 SQL Statement : select count(*) from accounts; Execution time : 12.72 sec ============ Buffer Read Counts ============ Tablespace Database Table Count 1663 16384 16397 8392 ======== Dirty Buffer Write Counts ========= Tablespace Database Table Count 1663 16384 16394 6 1663 16384 16402 7 1663 16384 16397 2870 ========== Hint Bits Write Counts ========== Tablespace Database Table Count 1663 16384 16402 2 1663 16384 16397 2010 Total buffer cache hits : 606 Total buffer cache misses : 7786 Average read time from cache : 6949 (ns) Average read time from disk : 706288 (ns) Average write time to disk : 90426 (ns) Index: bufmgr.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.228 diff -u -3 -p -r1.228 bufmgr.c --- bufmgr.c 1 Jan 2008 19:45:51 -0000 1.228 +++ bufmgr.c 15 May 2008 20:56:38 -0000 @@ -42,6 +42,7 @@ #include "storage/smgr.h" #include "utils/resowner.h" #include "pgstat.h" +#include "pg_trace.h" /* Note: these two macros only work on shared buffers, not local ones! */ @@ -171,6 +172,7 @@ ReadBuffer_common(Relation reln, BlockNu if (isExtend) blockNum = smgrnblocks(reln->rd_smgr); + TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, reln->rd_node.spcNode, reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf); pgstat_count_buffer_read(reln); if (isLocalBuf) @@ -200,12 +202,16 @@ ReadBuffer_common(Relation reln, BlockNu { if (!isExtend) { + TRACE_POSTGRESQL_BUFFER_HIT(); /* Just need to update stats before we exit */ pgstat_count_buffer_hit(reln); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageHit; + TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, + reln->rd_node.spcNode, reln->rd_node.dbNode, + reln->rd_node.relNode, isLocalBuf, found); return BufferDescriptorGetBuffer(bufHdr); } @@ -257,6 +263,7 @@ ReadBuffer_common(Relation reln, BlockNu } while (!StartBufferIO(bufHdr, true)); } } + TRACE_POSTGRESQL_BUFFER_MISS(); /* * if we have gotten to this point, we have allocated a buffer for the @@ -324,6 +331,9 @@ ReadBuffer_common(Relation reln, BlockNu if (VacuumCostActive) VacuumCostBalance += VacuumCostPageMiss; + TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, reln->rd_node.spcNode, + reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf, found); + return BufferDescriptorGetBuffer(bufHdr); } @@ -466,6 +476,11 @@ BufferAlloc(Relation reln, * happens to be trying to split the page the first one got from * StrategyGetBuffer.) */ + + TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_START(blockNum, + reln->rd_node.spcNode, reln->rd_node.dbNode, + reln->rd_node.relNode); + if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED)) { /* @@ -488,6 +503,11 @@ BufferAlloc(Relation reln, /* OK, do the I/O */ FlushBuffer(buf, NULL); LWLockRelease(buf->content_lock); + + TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_DONE( + blockNum, reln->rd_node.spcNode, + reln->rd_node.dbNode, reln->rd_node.relNode, + (oldFlags & BM_HINT_BITS_TEST)); } else { @@ -1734,6 +1754,10 @@ FlushBuffer(volatile BufferDesc *buf, SM buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); + TRACE_POSTGRESQL_BUFFER_WRITE_START(buf->tag.blockNum, + reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, + reln->smgr_rnode.relNode); + smgrwrite(reln, buf->tag.blockNum, (char *) BufHdrGetBlock(buf), @@ -1741,6 +1765,10 @@ FlushBuffer(volatile BufferDesc *buf, SM BufferFlushCount++; + TRACE_POSTGRESQL_BUFFER_WRITE_DONE(buf->tag.blockNum, + reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, + reln->smgr_rnode.relNode, (buf->flags & BM_HINT_BITS_TEST)); + /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and * end the io_in_progress state. @@ -2155,7 +2183,7 @@ SetBufferCommitInfoNeedsSave(Buffer buff Assert(bufHdr->refcount > 0); if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED | BM_HINT_BITS_TEST); UnlockBufHdr(bufHdr); } } @@ -2492,7 +2520,7 @@ TerminateBufferIO(volatile BufferDesc *b Assert(buf->flags & BM_IO_IN_PROGRESS); buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR); if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED)) - buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); + buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED | BM_HINT_BITS_TEST); buf->flags |= set_flag_bits; UnlockBufHdr(buf); Index: buf_internals.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v retrieving revision 1.95 diff -u -3 -p -r1.95 buf_internals.h --- buf_internals.h 1 Jan 2008 19:45:58 -0000 1.95 +++ buf_internals.h 15 May 2008 20:59:42 -0000 @@ -36,6 +36,7 @@ #define BM_JUST_DIRTIED (1 << 5) /* dirtied since write started */ #define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole pin */ #define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */ +#define BM_HINT_BITS_TEST (1 << 8) /* test effect of writes due to hint bits */ typedef bits16 BufFlags;
В списке pgsql-performance по дате отправления: