Re: Instrument checkpoint sync calls

Поиск
Список
Период
Сортировка
От Greg Smith
Тема Re: Instrument checkpoint sync calls
Дата
Msg-id 4CFC02D8.70302@2ndquadrant.com
обсуждение исходный текст
Ответ на Re: Instrument checkpoint sync calls  (Jeff Janes <jeff.janes@gmail.com>)
Ответы Re: Instrument checkpoint sync calls  (Jeff Janes <jeff.janes@gmail.com>)
Re: Instrument checkpoint sync calls  (Robert Haas <robertmhaas@gmail.com>)
Список pgsql-hackers
Jeff Janes wrote:
> I've attached a tiny patch to apply over yours, to deal with this and
> with the case where no files are synced.
>

Thanks for that.  That obvious error eluded me because in most of the
patch update testing I was doing (on ext3), the longest sync was always
about the same length as the total sync time.

Attached patch (in correct diff form this time!) collects up all
changes.  That includes elimination of a potential race condition if
someone changes log_checkpoints while a long sync phase is executing.  I
don't know whether that can happen, and it obviously won't give accurate
stats going back to the beginning of the checkpoint in that case, but
it  tries to defend aginst producing complete garbage if that value
changes out from under it.

This is the first version of this patch I feel fairly good about; no
open concerns left on my side.  Jeff, since you're now the de-facto
credited reviewer of this one by virtue of suggesting bug fixes, could
you take this update out for a spin too?

> Combining this instrumentation patch with the backend sync one already
> committed, the net result under log_min_messages=debug1is somewhat
> undesirable in that I can now see the individual sync times for the
> syncs done by the checkpoint writer, but I do not get times for the
> syncs done by the backend (I only get informed of their existence).
>

On a properly working system, backend syncs shouldn't be happening.  So
if you see them, I think the question shouldn't be "how long are they
taking?", it's "how do I get rid of them?"  From that perspective,
knowing of their existence is sufficient to suggest the necessary tuning
changes, such as dropping bgwriter_delay.

When you get into a situation where they are showing up, a whole lot of
them can happen in a very brief period; enough that I'd actually be
concerned about the added timing overhead, something I normally don't
care about very much.  That's the main reason I didn't bother
instrumenting those too--the situations where they happen are ones
already running low on resources.

Big writes for things that can only be written out at checkpoint time,
on the other hand, are unavoidable in the current design.  Providing
detail on them is going to be relevant unless there's a major
refactoring of how those happen.

--
Greg Smith   2ndQuadrant US    greg@2ndQuadrant.com   Baltimore, MD
PostgreSQL Training, Services and Support        www.2ndQuadrant.us


diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ae9ed8f..6adc139 100644
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
*************** LogCheckpointEnd(bool restartpoint)
*** 6955,6964 ****
  {
      long        write_secs,
                  sync_secs,
!                 total_secs;
      int            write_usecs,
                  sync_usecs,
!                 total_usecs;

      CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

--- 6955,6969 ----
  {
      long        write_secs,
                  sync_secs,
!                 total_secs,
!                 longest_secs,
!                 average_secs;
      int            write_usecs,
                  sync_usecs,
!                 total_usecs,
!                 longest_usecs,
!                 average_usecs;
!     double        average_sync_time;

      CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

*************** LogCheckpointEnd(bool restartpoint)
*** 6974,6991 ****
                          CheckpointStats.ckpt_sync_end_t,
                          &sync_secs, &sync_usecs);

      if (restartpoint)
          elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
               CheckpointStats.ckpt_bufs_written,
               (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
               write_secs, write_usecs / 1000,
               sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000);
      else
          elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
               "%d transaction log file(s) added, %d removed, %d recycled; "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
               CheckpointStats.ckpt_bufs_written,
               (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
               CheckpointStats.ckpt_segs_added,
--- 6979,7017 ----
                          CheckpointStats.ckpt_sync_end_t,
                          &sync_secs, &sync_usecs);

+     /*
+      * Timing values returned from CheckpointStats are in milliseconds.
+      * Convert to the second plus microsecond form that TimestampDifference
+      * returns for homogeneous printing.
+      */
+     longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+     longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync -
+         longest_secs * 1000);
+
+     average_sync_time = 0;
+     if (CheckpointStats.ckpt_sync_rels > 0)
+         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+             CheckpointStats.ckpt_sync_rels;
+     average_secs = (long) (average_sync_time / 1000);
+     average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+
      if (restartpoint)
          elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
!              "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
               CheckpointStats.ckpt_bufs_written,
               (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
               write_secs, write_usecs / 1000,
               sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000,
!              CheckpointStats.ckpt_sync_rels,
!              longest_secs, longest_usecs / 1000,
!              average_secs, average_usecs / 1000);
      else
          elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
               "%d transaction log file(s) added, %d removed, %d recycled; "
!              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
!              "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
               CheckpointStats.ckpt_bufs_written,
               (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
               CheckpointStats.ckpt_segs_added,
*************** LogCheckpointEnd(bool restartpoint)
*** 6993,6999 ****
               CheckpointStats.ckpt_segs_recycled,
               write_secs, write_usecs / 1000,
               sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000);
  }

  /*
--- 7019,7028 ----
               CheckpointStats.ckpt_segs_recycled,
               write_secs, write_usecs / 1000,
               sync_secs, sync_usecs / 1000,
!              total_secs, total_usecs / 1000,
!              CheckpointStats.ckpt_sync_rels,
!              longest_secs, longest_usecs / 1000,
!              average_secs, average_usecs / 1000);
  }

  /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..1f602a7 100644
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 20,25 ****
--- 20,26 ----

  #include "catalog/catalog.h"
  #include "miscadmin.h"
+ #include "portability/instr_time.h"
  #include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "storage/bufmgr.h"
*************** mdsync(void)
*** 927,932 ****
--- 928,940 ----
      PendingOperationEntry *entry;
      int            absorb_counter;

+     /* Statistics on sync times */
+     int processed = 0;
+     instr_time sync_start, sync_end, sync_diff;
+     double elapsed;
+     double longest = 0;
+     double total_elapsed = 0;
+
      /*
       * This is only called during checkpoints, and checkpoints should only
       * occur in processes that have created a pendingOpsTable.
*************** mdsync(void)
*** 1069,1077 ****
--- 1077,1107 ----
                  seg = _mdfd_getseg(reln, entry->tag.forknum,
                                entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                     false, EXTENSION_RETURN_NULL);
+
+                 if (log_checkpoints)
+                     INSTR_TIME_SET_CURRENT(sync_start);
+                 else
+                     INSTR_TIME_SET_ZERO(sync_start);
+
                  if (seg != NULL &&
                      FileSync(seg->mdfd_vfd) >= 0)
+                 {
+                     if (log_checkpoints && (! INSTR_TIME_IS_ZERO(sync_start)))
+                     {
+                         INSTR_TIME_SET_CURRENT(sync_end);
+                         sync_diff = sync_end;
+                         INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+                         elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+                         if (elapsed > longest)
+                             longest = elapsed;
+                         total_elapsed += elapsed;
+                         processed++;
+                         elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec",
+                             processed, FilePathName(seg->mdfd_vfd), elapsed);
+                     }
+
                      break;        /* success; break out of retry loop */
+                 }

                  /*
                   * XXX is there any point in allowing more than one retry?
*************** mdsync(void)
*** 1113,1118 ****
--- 1143,1153 ----
              elog(ERROR, "pendingOpsTable corrupted");
      }                            /* end loop over hashtable entries */

+     /* Return sync performance metrics for report at checkpoint end */
+     CheckpointStats.ckpt_sync_rels = processed;
+     CheckpointStats.ckpt_longest_sync = longest;
+     CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
      /* Flag successful completion of mdsync */
      mdsync_in_progress = false;
  }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..d95f843 100644
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
*************** typedef struct CheckpointStatsData
*** 257,262 ****
--- 257,269 ----
      int            ckpt_segs_added;    /* # of new xlog segments created */
      int            ckpt_segs_removed;        /* # of xlog segments deleted */
      int            ckpt_segs_recycled;        /* # of xlog segments recycled */
+
+     int            ckpt_sync_rels;        /* # of relations synced */
+     double        ckpt_longest_sync;    /* Longest sync for one relation */
+     double        ckpt_agg_sync_time;    /* The sum of all the individual sync
+                                        times, which is not necessarily the
+                                        same as the total elapsed time for
+                                        the entire sync phase. */
  } CheckpointStatsData;

  extern CheckpointStatsData CheckpointStats;

В списке pgsql-hackers по дате отправления:

Предыдущее
От: "Kevin Grittner"
Дата:
Сообщение: Re: V3: Idle in transaction cancellation
Следующее
От: Greg Smith
Дата:
Сообщение: Re: Spread checkpoint sync