Re: Instrument checkpoint sync calls

Поиск
Список
Период
Сортировка
От Greg Smith
Тема Re: Instrument checkpoint sync calls
Дата
Msg-id 4CF52875.9040806@2ndquadrant.com
обсуждение исходный текст
Ответ на Re: Instrument checkpoint sync calls  (Jeff Janes <jeff.janes@gmail.com>)
Ответы Re: Instrument checkpoint sync calls  (Cédric Villemain <cedric.villemain.debian@gmail.com>)
Re: Instrument checkpoint sync calls  (Jeff Janes <jeff.janes@gmail.com>)
Список pgsql-hackers
Jeff Janes wrote:
> For the individual file sync times emitted under debug1, it would be
> very handy if the file being synced was identified, for example
> "relation base/16384/16523". Rather than being numbered sequentially
> within a given checkpoint.
>

I was numbering them sequentially so that it's straightforward to graph
the sync times in an external analysis tool, but the relation data is
helpful too.  New patch reflecting all upthread suggestions is
attached.  The output looks like this now at DEBUG1:

LOG:  checkpoint starting: xlog
DEBUG:  checkpoint sync: number=1 file=base/16424/11645
time=11589.549000 msec
DEBUG:  checkpoint sync: number=2 file=base/16424/16438 time=16.148000 msec
DEBUG:  checkpoint sync: number=3 file=base/16424/16437 time=53.530000 msec
DEBUG:  checkpoint sync: number=4 file=base/16424/16447 time=10.214000 msec
DEBUG:  checkpoint sync: number=5 file=base/16424/11607 time=1.499000 msec
DEBUG:  checkpoint sync: number=6 file=base/16424/16425_fsm
time=2.921000 msec
DEBUG:  checkpoint sync: number=7 file=base/16424/16437.1 time=4.237000 msec
DEBUG:  checkpoint sync: number=8 file=base/16424/16428_fsm
time=1.654000 msec
DEBUG:  checkpoint sync: number=9 file=base/16424/16442 time=7.920000 msec
DEBUG:  checkpoint sync: number=10 file=base/16424/16428_vm
time=2.613000 msec
DEBUG:  checkpoint sync: number=11 file=base/16424/11618 time=1.468000 msec
DEBUG:  checkpoint sync: number=12 file=base/16424/16437_fsm
time=2.638000 msec
DEBUG:  checkpoint sync: number=13 file=base/16424/16428 time=2.883000 msec
DEBUG:  checkpoint sync: number=14 file=base/16424/16425 time=3.369000 msec
DEBUG:  checkpoint sync: number=15 file=base/16424/16437_vm
time=8.686000 msec
DEBUG:  checkpoint sync: number=16 file=base/16424/16425_vm
time=5.984000 msec
LOG:  checkpoint complete: wrote 2074 buffers (50.6%); 0 transaction log
file(s) added, 0 removed, 3 recycled; write=0.617 s, sync=11.715 s,
total=22.167 s; sync files=16, longest=11.589 s, average=0.724 s

I kept the units for the DEBUG level ones in msec because that's a
better scale for the common really short syncs during good behavior.
But the summary info in seconds now appears at the end of the existing
"checkpoint complete" message, so only one line to parse for those
looking to analyze the gross checkpoint data.  That looks to work well
enough for finding situations like the big ext3 spikes.  You can easily
see one in this example by the fact that "longest=11.589 s" is almost
the entirety of "sync=11.715 s".  That's the really key thing there's
currently no visibility into, that's made obvious with this patch.

This might be ready for some proper review now.  I know there's at least
one blatant bug still in here I haven't found yet, related to how the
averages are computed.  I saw this once:

LOG:  checkpoint complete: wrote 0 buffers (0.0%); 0 transaction log
file(s) added, 0 removed, 1 recycled; write=0.000 s, sync=0.000 s,
total=0.001 s; sync files=0, longest=0.000 s,
average=-9223372036854775808.-2147483 s

After an immediate checkpoint, so at least one path not quite right yet.

--
Greg Smith   2ndQuadrant US    greg@2ndQuadrant.com   Baltimore, MD


diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ede6ceb..6f6eb3b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7063,10 +7063,15 @@ LogCheckpointEnd(bool restartpoint)
 {
     long        write_secs,
                 sync_secs,
-                total_secs;
+                total_secs,
+                longest_secs,
+                average_secs;
     int            write_usecs,
                 sync_usecs,
-                total_usecs;
+                total_usecs,
+                longest_usecs,
+                average_usecs;
+    double        average_sync_time;

     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

@@ -7082,18 +7087,35 @@ LogCheckpointEnd(bool restartpoint)
                         CheckpointStats.ckpt_sync_end_t,
                         &sync_secs, &sync_usecs);

+    /*
+     * Timing values returned from CheckpointStats are in milliseconds.
+     * Convert to the second plus microsecond form that TimestampDifference
+     * returns for homogeneous printing.
+     */
+    longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+    longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync - longest_secs * 1000);
+
+    average_sync_time = CheckpointStats.ckpt_longest_sync / CheckpointStats.ckpt_sync_rels;
+    average_secs = (long) (average_sync_time / 1000);
+    average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+
     if (restartpoint)
         elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
-             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+             "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
              CheckpointStats.ckpt_bufs_written,
              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
              write_secs, write_usecs / 1000,
              sync_secs, sync_usecs / 1000,
-             total_secs, total_usecs / 1000);
+             total_secs, total_usecs / 1000,
+             CheckpointStats.ckpt_sync_rels,
+             longest_secs, longest_usecs / 1000,
+             average_secs, average_usecs / 1000);
     else
         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
              "%d transaction log file(s) added, %d removed, %d recycled; "
-             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+             "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
              CheckpointStats.ckpt_bufs_written,
              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
              CheckpointStats.ckpt_segs_added,
@@ -7101,7 +7123,10 @@ LogCheckpointEnd(bool restartpoint)
              CheckpointStats.ckpt_segs_recycled,
              write_secs, write_usecs / 1000,
              sync_secs, sync_usecs / 1000,
-             total_secs, total_usecs / 1000);
+             total_secs, total_usecs / 1000,
+             CheckpointStats.ckpt_sync_rels,
+             longest_secs, longest_usecs / 1000,
+             average_secs, average_usecs / 1000);
 }

 /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..cadd938 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -20,6 +20,7 @@

 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
@@ -927,6 +928,13 @@ mdsync(void)
     PendingOperationEntry *entry;
     int            absorb_counter;

+    /* Statistics on sync times */
+    int processed = 0;
+    instr_time sync_start, sync_end, sync_diff;
+    double elapsed;
+    double longest = 0;
+    double total_elapsed = 0;
+
     /*
      * This is only called during checkpoints, and checkpoints should only
      * occur in processes that have created a pendingOpsTable.
@@ -1069,9 +1077,29 @@ mdsync(void)
                 seg = _mdfd_getseg(reln, entry->tag.forknum,
                               entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                    false, EXTENSION_RETURN_NULL);
+
+                if (log_checkpoints)
+                    INSTR_TIME_SET_CURRENT(sync_start);
+
                 if (seg != NULL &&
                     FileSync(seg->mdfd_vfd) >= 0)
+                {
+                    if (log_checkpoints)
+                    {
+                        INSTR_TIME_SET_CURRENT(sync_end);
+                        sync_diff = sync_end;
+                        INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+                        elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+                        if (elapsed > longest)
+                            longest = elapsed;
+                        total_elapsed += elapsed;
+                        processed++;
+                        elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec",
+                            processed, FilePathName(seg->mdfd_vfd), elapsed);
+                    }
+
                     break;        /* success; break out of retry loop */
+                    }

                 /*
                  * XXX is there any point in allowing more than one retry?
@@ -1113,6 +1141,11 @@ mdsync(void)
             elog(ERROR, "pendingOpsTable corrupted");
     }                            /* end loop over hashtable entries */

+    /* Return sync performance metrics for report at checkpoint end */
+    CheckpointStats.ckpt_sync_rels = processed;
+    CheckpointStats.ckpt_longest_sync = longest;
+    CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
     /* Flag successful completion of mdsync */
     mdsync_in_progress = false;
 }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..ec4d9db 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -257,6 +257,13 @@ typedef struct CheckpointStatsData
     int            ckpt_segs_added;    /* # of new xlog segments created */
     int            ckpt_segs_removed;        /* # of xlog segments deleted */
     int            ckpt_segs_recycled;        /* # of xlog segments recycled */
+
+    int         ckpt_sync_rels;        /* # of relations synced */
+    double         ckpt_longest_sync;    /* Longest sync for one relation */
+    double         ckpt_agg_sync_time;    /* The sum of all the individual sync
+                                       times, which is not necessarily the
+                                       same as the total elapsed time for
+                                       the entire sync phase. */
 } CheckpointStatsData;

 extern CheckpointStatsData CheckpointStats;

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Tom Lane
Дата:
Сообщение: Re: crash-safe visibility map, take three
Следующее
От: Tom Lane
Дата:
Сообщение: Re: crash-safe visibility map, take three