Re: BRIN FSM vacuuming questions

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: BRIN FSM vacuuming questions
Дата
Msg-id 10177.1522777176@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: BRIN FSM vacuuming questions  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-hackers
I wrote:
> [ assorted complaining about BRIN FSM management ]

Here's a proposed patch for this.  Changes:

* Do RecordPageWithFreeSpace unconditionally in brin_page_cleanup,
and do FreeSpaceMapVacuum unconditionally in brin_vacuum_scan.
Because of the FSM's roundoff behavior, the old complications here
weren't really buying any savings.

* Elsewhere, we are trying to update the FSM for just a single-page
status update, so use FreeSpaceMapVacuumRange() to limit how much
of the upper FSM gets traversed.

* Fix a couple of places that neglected to do the upper-page
vacuuming at all after recording new free space.  If the policy
is to be that BRIN should do that, it should do it everywhere.

* Remove premature RecordPageWithFreeSpace call in brin_getinsertbuffer
where it's about to return an extended page to the caller.  The caller
should do that, instead, after it's inserted its new tuple.  Fix the
one caller that forgot to do so.

* Simplify logic in brin_doupdate's same-page-update case by
postponing brin_initialize_empty_new_buffer to after the critical
section; I see little point in doing it before.

* Avoid repeat calls of RelationGetNumberOfBlocks in brin_vacuum_scan.

* Avoid duplicate BufferGetBlockNumber and BufferGetPage calls in
a couple of places where we already had the right values.

* Move a BRIN_elog call out of a critical section; emitting log output
inside a critical section is pretty unsafe, and deferring the call until
after the critical section costs us nothing.

* I moved the "*extended = false" step in brin_getinsertbuffer into
the routine's main loop.  There's no actual bug there, since the loop
can't iterate with *extended still true, but it doesn't seem very
future-proof as coded; and it's certainly not documented as a loop
invariant.

* Assorted comment improvements.

The use of FreeSpaceMapVacuumRange makes this a HEAD-only patch.
I'm not sure if any of the other changes are worth back-patching.

            regards, tom lane

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0e5849e..6ed115f 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1121,16 +1121,22 @@ initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
 static void
 terminate_brin_buildstate(BrinBuildState *state)
 {
-    /* release the last index buffer used */
+    /*
+     * Release the last index buffer used.  We might as well ensure that
+     * whatever free space remains in that page is available in FSM, too.
+     */
     if (!BufferIsInvalid(state->bs_currentInsertBuf))
     {
         Page        page;
+        Size        freespace;
+        BlockNumber blk;

         page = BufferGetPage(state->bs_currentInsertBuf);
-        RecordPageWithFreeSpace(state->bs_irel,
-                                BufferGetBlockNumber(state->bs_currentInsertBuf),
-                                PageGetFreeSpace(page));
+        freespace = PageGetFreeSpace(page);
+        blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
         ReleaseBuffer(state->bs_currentInsertBuf);
+        RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+        FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
     }

     brin_free_desc(state->bs_bdesc);
@@ -1445,14 +1451,15 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
 static void
 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
 {
-    bool        vacuum_fsm = false;
+    BlockNumber nblocks;
     BlockNumber blkno;

     /*
      * Scan the index in physical order, and clean up any possible mess in
      * each page.
      */
-    for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
+    nblocks = RelationGetNumberOfBlocks(idxrel);
+    for (blkno = 0; blkno < nblocks; blkno++)
     {
         Buffer        buf;

@@ -1461,15 +1468,15 @@ brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
         buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
                                  RBM_NORMAL, strategy);

-        vacuum_fsm |= brin_page_cleanup(idxrel, buf);
+        brin_page_cleanup(idxrel, buf);

         ReleaseBuffer(buf);
     }

     /*
-     * If we made any change to the FSM, make sure the new info is visible all
-     * the way to the top.
+     * Update all upper pages in the index's FSM, as well.  This ensures not
+     * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
+     * but also that any pre-existing damage or out-of-dateness is repaired.
      */
-    if (vacuum_fsm)
-        FreeSpaceMapVacuum(idxrel);
+    FreeSpaceMapVacuum(idxrel);
 }
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index 60a7025..040cb62 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -64,6 +64,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
     BrinTuple  *oldtup;
     Size        oldsz;
     Buffer        newbuf;
+    BlockNumber newblk = InvalidBlockNumber;
     bool        extended;

     Assert(newsz == MAXALIGN(newsz));
@@ -101,6 +102,8 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
             Assert(!extended);
             newbuf = InvalidBuffer;
         }
+        else
+            newblk = BufferGetBlockNumber(newbuf);
     }
     else
     {
@@ -136,7 +139,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
                 brin_initialize_empty_new_buffer(idxrel, newbuf);
             UnlockReleaseBuffer(newbuf);
             if (extended)
-                FreeSpaceMapVacuum(idxrel);
+                FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
         }
         return false;
     }
@@ -152,11 +155,12 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
         LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
         if (BufferIsValid(newbuf))
         {
+            /* As above, initialize and record new page if we got one */
             if (extended)
                 brin_initialize_empty_new_buffer(idxrel, newbuf);
             UnlockReleaseBuffer(newbuf);
             if (extended)
-                FreeSpaceMapVacuum(idxrel);
+                FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
         }
         return false;
     }
@@ -173,14 +177,6 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
     if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
         brin_can_do_samepage_update(oldbuf, origsz, newsz))
     {
-        if (BufferIsValid(newbuf))
-        {
-            /* as above */
-            if (extended)
-                brin_initialize_empty_new_buffer(idxrel, newbuf);
-            UnlockReleaseBuffer(newbuf);
-        }
-
         START_CRIT_SECTION();
         if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz))
             elog(ERROR, "failed to replace BRIN tuple");
@@ -210,8 +206,15 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,

         LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

-        if (extended)
-            FreeSpaceMapVacuum(idxrel);
+        if (BufferIsValid(newbuf))
+        {
+            /* As above, initialize and record new page if we got one */
+            if (extended)
+                brin_initialize_empty_new_buffer(idxrel, newbuf);
+            UnlockReleaseBuffer(newbuf);
+            if (extended)
+                FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+        }

         return true;
     }
@@ -234,7 +237,6 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
         Buffer        revmapbuf;
         ItemPointerData newtid;
         OffsetNumber newoff;
-        BlockNumber newblk = InvalidBlockNumber;
         Size        freespace = 0;

         revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
@@ -247,7 +249,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
          * need to do that here.
          */
         if (extended)
-            brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);
+            brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

         PageIndexTupleDeleteNoCompact(oldpage, oldoff);
         newoff = PageAddItem(newpage, (Item) newtup, newsz,
@@ -259,12 +261,9 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,

         /* needed to update FSM below */
         if (extended)
-        {
-            newblk = BufferGetBlockNumber(newbuf);
             freespace = br_page_get_freespace(newpage);
-        }

-        ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
+        ItemPointerSet(&newtid, newblk, newoff);
         brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
         MarkBufferDirty(revmapbuf);

@@ -311,9 +310,8 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,

         if (extended)
         {
-            Assert(BlockNumberIsValid(newblk));
             RecordPageWithFreeSpace(idxrel, newblk, freespace);
-            FreeSpaceMapVacuum(idxrel);
+            FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
         }

         return true;
@@ -350,6 +348,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
     Page        page;
     BlockNumber blk;
     OffsetNumber off;
+    Size        freespace = 0;
     Buffer        revmapbuf;
     ItemPointerData tid;
     bool        extended;
@@ -410,15 +409,16 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
     /* Execute the actual insertion */
     START_CRIT_SECTION();
     if (extended)
-        brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
+        brin_page_init(page, BRIN_PAGETYPE_REGULAR);
     off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
                       false, false);
     if (off == InvalidOffsetNumber)
-        elog(ERROR, "could not insert new index tuple to page");
+        elog(ERROR, "failed to add BRIN tuple to new page");
     MarkBufferDirty(*buffer);

-    BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
-               blk, off, heapBlk));
+    /* needed to update FSM below */
+    if (extended)
+        freespace = br_page_get_freespace(page);

     ItemPointerSet(&tid, blk, off);
     brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
@@ -456,8 +456,14 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
     LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
     LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

+    BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+               blk, off, heapBlk));
+
     if (extended)
-        FreeSpaceMapVacuum(idxrel);
+    {
+        RecordPageWithFreeSpace(idxrel, blk, freespace);
+        FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
+    }

     return off;
 }
@@ -599,17 +605,22 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
 }

 /*
- * Given a BRIN index page, initialize it if necessary, and record it into the
- * FSM if necessary.  Return value is true if the FSM itself needs "vacuuming".
+ * Given a BRIN index page, initialize it if necessary, and record its
+ * current free space in the FSM.
+ *
  * The main use for this is when, during vacuuming, an uninitialized page is
  * found, which could be the result of relation extension followed by a crash
  * before the page can be used.
+ *
+ * Here, we don't bother to update upper FSM pages, instead expecting that our
+ * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
+ * in this file, it's generally a good idea to propagate additions of free
+ * space into the upper FSM pages immediately.
  */
-bool
+void
 brin_page_cleanup(Relation idxrel, Buffer buf)
 {
     Page        page = BufferGetPage(buf);
-    Size        freespace;

     /*
      * If a page was left uninitialized, initialize it now; also record it in
@@ -631,7 +642,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
         {
             brin_initialize_empty_new_buffer(idxrel, buf);
             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-            return true;
+            return;
         }
         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     }
@@ -639,24 +650,18 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
     /* Nothing to be done for non-regular index pages */
     if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
         BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
-        return false;
+        return;

     /* Measure free space and record it */
-    freespace = br_page_get_freespace(page);
-    if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf)))
-    {
-        RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace);
-        return true;
-    }
-
-    return false;
+    RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+                            br_page_get_freespace(page));
 }

 /*
  * Return a pinned and exclusively locked buffer which can be used to insert an
  * index item of size itemsz (caller must ensure not to request sizes
  * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
- * an order determined to avoid deadlocks.)
+ * an order determined to avoid deadlocks).
  *
  * If we find that the old page is no longer a regular index page (because
  * of a revmap extension), the old buffer is unlocked and we return
@@ -665,12 +670,18 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
  * If there's no existing page with enough free space to accommodate the new
  * item, the relation is extended.  If this happens, *extended is set to true,
  * and it is the caller's responsibility to initialize the page (and WAL-log
- * that fact) prior to use.
+ * that fact) prior to use.  The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
  *
- * Note that in some corner cases it is possible for this routine to extend the
- * relation and then not return the buffer.  It is this routine's
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true.  This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page.  It is this routine's
  * responsibility to WAL-log the page initialization and to record the page in
- * FSM if that happens.  Such a buffer may later be reused by this routine.
+ * FSM if that happens, since the caller certainly can't do it.
  */
 static Buffer
 brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
@@ -684,22 +695,22 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
     /* callers must have checked */
     Assert(itemsz <= BrinMaxItemSize);

-    *extended = false;
-
     if (BufferIsValid(oldbuf))
         oldblk = BufferGetBlockNumber(oldbuf);
     else
         oldblk = InvalidBlockNumber;

+    /* Choose initial target page, re-using existing target if known */
+    newblk = RelationGetTargetBlock(irel);
+    if (newblk == InvalidBlockNumber)
+        newblk = GetPageWithFreeSpace(irel, itemsz);
+
     /*
      * Loop until we find a page with sufficient free space.  By the time we
      * return to caller out of this loop, both buffers are valid and locked;
-     * if we have to restart here, neither buffer is locked and buf is not a
-     * pinned buffer.
+     * if we have to restart here, neither page is locked and newblk isn't
+     * pinned (if it's even valid).
      */
-    newblk = RelationGetTargetBlock(irel);
-    if (newblk == InvalidBlockNumber)
-        newblk = GetPageWithFreeSpace(irel, itemsz);
     for (;;)
     {
         Buffer        buf;
@@ -707,6 +718,8 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,

         CHECK_FOR_INTERRUPTS();

+        *extended = false;
+
         if (newblk == InvalidBlockNumber)
         {
             /*
@@ -741,9 +754,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,

         /*
          * We lock the old buffer first, if it's earlier than the new one; but
-         * before we do, we need to check that it hasn't been turned into a
-         * revmap page concurrently; if we detect that it happened, give up
-         * and tell caller to start over.
+         * then we need to check that it hasn't been turned into a revmap page
+         * concurrently.  If we detect that that happened, give up and tell
+         * caller to start over.
          */
         if (BufferIsValid(oldbuf) && oldblk < newblk)
         {
@@ -761,16 +774,20 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
                  * it first.
                  */
                 if (*extended)
-                {
                     brin_initialize_empty_new_buffer(irel, buf);
-                    /* shouldn't matter, but don't confuse caller */
-                    *extended = false;
-                }

                 if (extensionLockHeld)
                     UnlockRelationForExtension(irel, ExclusiveLock);

                 ReleaseBuffer(buf);
+
+                if (*extended)
+                {
+                    FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
+                    /* shouldn't matter, but don't confuse caller */
+                    *extended = false;
+                }
+
                 return InvalidBuffer;
             }
         }
@@ -785,9 +802,6 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
         /*
          * We have a new buffer to insert into.  Check that the new page has
          * enough free space, and return it if it does; otherwise start over.
-         * Note that we allow for the FSM to be out of date here, and in that
-         * case we update it and move on.
-         *
          * (br_page_get_freespace also checks that the FSM didn't hand us a
          * page that has since been repurposed for the revmap.)
          */
@@ -795,16 +809,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
             BrinMaxItemSize : br_page_get_freespace(page);
         if (freespace >= itemsz)
         {
-            RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
-
-            /*
-             * Since the target block specification can get lost on cache
-             * invalidations, make sure we update the more permanent FSM with
-             * data about it before going away.
-             */
-            if (*extended)
-                RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
-                                        freespace);
+            RelationSetTargetBlock(irel, newblk);

             /*
              * Lock the old buffer if not locked already.  Note that in this
@@ -832,6 +837,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
         if (*extended)
         {
             brin_initialize_empty_new_buffer(irel, buf);
+            /* since this should not happen, skip FreeSpaceMapVacuum */

             ereport(ERROR,
                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
@@ -845,6 +851,10 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
         if (BufferIsValid(oldbuf) && oldblk <= newblk)
             LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

+        /*
+         * Update the FSM with the new, presumably smaller, freespace value
+         * for this page, then search for a new target page.
+         */
         newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
     }
 }
@@ -859,6 +869,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
  * there is no mechanism to get the space back and the index would bloat.
  * Also, because we would not WAL-log the action that would initialize the
  * page, the page would go uninitialized in a standby (or after recovery).
+ *
+ * While we record the page in FSM here, caller is responsible for doing FSM
+ * upper-page update if that seems appropriate.
  */
 static void
 brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h
index 8b389de..5189d5d 100644
--- a/src/include/access/brin_pageops.h
+++ b/src/include/access/brin_pageops.h
@@ -33,6 +33,6 @@ extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
 extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
                    BrinRevmap *revmap, Buffer buf);

-extern bool brin_page_cleanup(Relation idxrel, Buffer buf);
+extern void brin_page_cleanup(Relation idxrel, Buffer buf);

 #endif                            /* BRIN_PAGEOPS_H */

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Magnus Hagander
Дата:
Сообщение: Re: pgsql: Validate page level checksums in base backups
Следующее
От: Jesper Pedersen
Дата:
Сообщение: Re: [HACKERS] Runtime Partition Pruning