Discussion: Split _bt_insertonpg to two functions
Here's a patch that: Moves the logic to find a page with enough room from _bt_insertonpg to a new function, _bt_findinsertloc. It makes the code more readable, and simplifies the forthcoming Grouped Index Tuples patch. Also, the insert location within page used to be calculated twice for unique indexes, once in _bt_checkunique and second time in _bt_insertonpg. That's a waste of cycles, and this patch fixes that. I couldn't measure a difference with pgbench, but this micro-benchmark shows it: > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" Without patch: real 0m7.260s With patch: real 0m6.963s On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, to remove any other variables. It's not a huge difference, but it's worth having, and performance wasn't the main motivation of the patch anyway. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/access/nbtree/nbtinsert.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v retrieving revision 1.152 diff -c -r1.152 nbtinsert.c *** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 1.152 --- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000 *************** *** 46,58 **** static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, ! Relation heapRel, Buffer buf, ScanKey itup_scankey); static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, - int keysz, ScanKey scankey, IndexTuple itup, ! 
OffsetNumber afteritem, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, --- 46,63 ---- static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, ! Relation heapRel, Buffer buf, OffsetNumber ioffset, ScanKey itup_scankey); + static void _bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup); static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, IndexTuple itup, ! OffsetNumber newitemoff, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, *************** *** 86,91 **** --- 91,97 ---- ScanKey itup_scankey; BTStack stack; Buffer buf; + OffsetNumber offset; /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, itup); *************** *** 94,99 **** --- 100,107 ---- /* find the first page containing this key */ stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); + offset = InvalidOffsetNumber; + /* trade in our read lock for a write lock */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BT_WRITE); *************** *** 128,134 **** { TransactionId xwait; ! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); if (TransactionIdIsValid(xwait)) { --- 136,143 ---- { TransactionId xwait; ! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); ! xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey); if (TransactionIdIsValid(xwait)) { *************** *** 142,148 **** } /* do the insertion */ ! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); /* be tidy */ _bt_freestack(stack); --- 151,158 ---- } /* do the insertion */ ! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); ! 
_bt_insertonpg(rel, buf, stack, itup, offset, false); /* be tidy */ _bt_freestack(stack); *************** *** 152,169 **** /* * _bt_check_unique() -- Check for violation of unique index constraint * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). */ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, ! Buffer buf, ScanKey itup_scankey) { TupleDesc itupdesc = RelationGetDescr(rel); int natts = rel->rd_rel->relnatts; ! OffsetNumber offset, ! maxoff; Page page; BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; --- 162,182 ---- /* * _bt_check_unique() -- Check for violation of unique index constraint * + * offset points to the first possible item that could conflict. It can + * also point to end-of-page, which means that the first tuple to check + * is the first tuple on the next page. + * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). */ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, ! Buffer buf, OffsetNumber offset, ScanKey itup_scankey) { TupleDesc itupdesc = RelationGetDescr(rel); int natts = rel->rd_rel->relnatts; ! OffsetNumber maxoff; Page page; BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; *************** *** 173,184 **** maxoff = PageGetMaxOffsetNumber(page); /* - * Find first item >= proposed new item. Note we could also get a pointer - * to end-of-page here. - */ - offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); - - /* * Scan over all equal tuples, looking for live conflicts. */ for (;;) --- 186,191 ---- *************** *** 342,374 **** return InvalidTransactionId; } ! /*---------- ! * _bt_insertonpg() -- Insert a tuple on a particular page in the index. ! * ! 
* This recursive procedure does the following things: ! * ! * + finds the right place to insert the tuple. ! * + if necessary, splits the target page (making sure that the ! * split is equitable as far as post-insert free space goes). ! * + inserts the tuple. ! * + if the page was split, pops the parent stack, and finds the ! * right place to insert the new child pointer (by walking ! * right using information stored in the parent stack). ! * + invokes itself with the appropriate tuple for the right ! * child page on the parent. ! * + updates the metapage if a true root or fast root is split. ! * ! * On entry, we must have the right buffer in which to do the ! * insertion, and the buffer must be pinned and write-locked. On return, ! * we will have dropped both the pin and the lock on the buffer. ! * ! * If 'afteritem' is >0 then the new tuple must be inserted after the ! * existing item of that number, noplace else. If 'afteritem' is 0 ! * then the procedure finds the exact spot to insert it by searching. ! * (keysz and scankey parameters are used ONLY if afteritem == 0. ! * The scankey must be an insertion-type scankey.) * ! * NOTE: if the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys --- in fact, * if the new key is equal to the page's "high key" we can place it on * the next page. If it is equal to the high key, and there's not room --- 349,359 ---- return InvalidTransactionId; } ! ! /* ! * _bt_findinsertloc() -- Finds an insert location for a tuple * ! * If the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys --- in fact, * if the new key is equal to the page's "high key" we can place it on * the next page. If it is equal to the high key, and there's not room *************** *** 379,414 **** * Once we have chosen the page to put the key on, we'll insert it before * any existing equal keys because of the way _bt_binsrch() works. 
* ! * The locking interactions in this code are critical. You should ! * grok Lehman and Yao's paper before making any changes. In addition, ! * you need to understand how we disambiguate duplicate keys in this ! * implementation, in order to be able to find our location using ! * L&Y "move right" operations. Since we may insert duplicate user ! * keys, and since these dups may propagate up the tree, we use the ! * 'afteritem' parameter to position ourselves correctly for the ! * insertion on internal pages. ! *---------- */ static void ! _bt_insertonpg(Relation rel, ! Buffer buf, ! BTStack stack, ! int keysz, ! ScanKey scankey, ! IndexTuple itup, ! OffsetNumber afteritem, ! bool split_only_page) { ! Page page; BTPageOpaque lpageop; OffsetNumber newitemoff; ! OffsetNumber firstright = InvalidOffsetNumber; ! Size itemsz; - page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ --- 364,403 ---- * Once we have chosen the page to put the key on, we'll insert it before * any existing equal keys because of the way _bt_binsrch() works. * ! * If there's not enough room in the space, we try to make room by ! * removing any LP_DELETEd tuples. ! * ! * On entry, *buf and *offsetptr point to the first legal position ! * where the new tuple could be inserted. The caller should hold an ! * exclusive lock on *buf. *offsetptr can also be set to ! * InvalidOffsetNumber, in which case the function will search the right ! * location within the page if needed. On exit, they point to the chosen ! * insert location. If findinsertloc decided to move right, the lock and ! * pin on the original page will be released and the new page returned to ! * the caller is exclusively locked instead. ! * ! * newtup is the new tuple we're inserting, and scankey is an insertion ! * type scan key for it. */ static void ! 
_bt_findinsertloc(Relation rel, ! Buffer *bufptr, ! OffsetNumber *offsetptr, ! int keysz, ! ScanKey scankey, ! IndexTuple newtup) { ! Buffer buf = *bufptr; ! Page page = BufferGetPage(buf); ! Size itemsz; BTPageOpaque lpageop; + bool movedright, vacuumed; OffsetNumber newitemoff; ! OffsetNumber firstlegaloff = *offsetptr; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! itemsz = IndexTupleDSize(*newtup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ *************** *** 429,525 **** "Consider a function index of an MD5 hash of the value, " "or use full text indexing."))); - /* - * Determine exactly where new item will go. - */ - if (afteritem > 0) - newitemoff = afteritem + 1; - else - { - /*---------- - * If we will need to split the page to put the item here, - * check whether we can put the tuple somewhere to the right, - * instead. Keep scanning right until we - * (a) find a page with enough free space, - * (b) reach the last page where the tuple can legally go, or - * (c) get tired of searching. - * (c) is not flippant; it is important because if there are many - * pages' worth of equal keys, it's better to split one of the early - * pages than to scan all the way to the end of the run of equal keys - * on every insert. We implement "get tired" as a random choice, - * since stopping after scanning a fixed number of pages wouldn't work - * well (we'd never reach the right-hand side of previously split - * pages). Currently the probability of moving right is set at 0.99, - * which may seem too high to change the behavior much, but it does an - * excellent job of preventing O(N^2) behavior with many equal keys. 
- *---------- - */ - bool movedright = false; - - while (PageGetFreeSpace(page) < itemsz) - { - Buffer rbuf; - /* - * before considering moving right, see if we can obtain enough - * space by erasing LP_DELETE items - */ - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) - { - _bt_vacuum_one_page(rel, buf); - if (PageGetFreeSpace(page) >= itemsz) - break; /* OK, now we have enough space */ - } ! /* ! * nope, so check conditions (b) and (c) enumerated above ! */ ! if (P_RIGHTMOST(lpageop) || ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || ! random() <= (MAX_RANDOM_VALUE / 100)) ! break; ! /* ! * step right to next non-dead page ! * ! * must write-lock that page before releasing write lock on ! * current page; else someone else's _bt_check_unique scan could ! * fail to see our insertion. write locks on intermediate dead ! * pages won't do because we don't know when they will get ! * de-linked from the tree. ! */ ! rbuf = InvalidBuffer; ! for (;;) ! { ! BlockNumber rblkno = lpageop->btpo_next; ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); ! page = BufferGetPage(rbuf); ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! if (!P_IGNORE(lpageop)) ! break; ! if (P_RIGHTMOST(lpageop)) ! elog(ERROR, "fell off the end of \"%s\"", ! RelationGetRelationName(rel)); ! } ! _bt_relbuf(rel, buf); ! buf = rbuf; ! movedright = true; } /* ! * Now we are on the right page, so find the insert position. If we ! * moved right at all, we know we should insert at the start of the ! * page, else must find the position by searching. */ ! if (movedright) ! newitemoff = P_FIRSTDATAKEY(lpageop); ! else ! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); } /* * Do we need to split the page to fit the item on it? * * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, --- 418,573 ---- "Consider a function index of an MD5 hash of the value, " "or use full text indexing."))); ! /*---------- ! 
* If we will need to split the page to put the item on this page, ! * check whether we can put the tuple somewhere to the right, ! * instead. Keep scanning right until we ! * (a) find a page with enough free space, ! * (b) reach the last page where the tuple can legally go, or ! * (c) get tired of searching. ! * (c) is not flippant; it is important because if there are many ! * pages' worth of equal keys, it's better to split one of the early ! * pages than to scan all the way to the end of the run of equal keys ! * on every insert. We implement "get tired" as a random choice, ! * since stopping after scanning a fixed number of pages wouldn't work ! * well (we'd never reach the right-hand side of previously split ! * pages). Currently the probability of moving right is set at 0.99, ! * which may seem too high to change the behavior much, but it does an ! * excellent job of preventing O(N^2) behavior with many equal keys. ! *---------- ! */ ! movedright = false; ! vacuumed = false; ! while (PageGetFreeSpace(page) < itemsz) ! { ! Buffer rbuf; ! /* ! * before considering moving right, see if we can obtain enough ! * space by erasing LP_DELETE items ! */ ! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) ! { ! _bt_vacuum_one_page(rel, buf); ! /* remember that we vacuumed this page, because that makes ! * the hint supplied by the caller invalid */ ! vacuumed = true; ! if (PageGetFreeSpace(page) >= itemsz) ! break; /* OK, now we have enough space */ } /* ! * nope, so check conditions (b) and (c) enumerated above */ ! if (P_RIGHTMOST(lpageop) || ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || ! random() <= (MAX_RANDOM_VALUE / 100)) ! break; ! ! /* ! * step right to next non-dead page ! * ! * must write-lock that page before releasing write lock on ! * current page; else someone else's _bt_check_unique scan could ! * fail to see our insertion. write locks on intermediate dead ! * pages won't do because we don't know when they will get ! * de-linked from the tree. 
! */ ! rbuf = InvalidBuffer; ! ! for (;;) ! { ! BlockNumber rblkno = lpageop->btpo_next; ! ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); ! page = BufferGetPage(rbuf); ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! if (!P_IGNORE(lpageop)) ! break; ! if (P_RIGHTMOST(lpageop)) ! elog(ERROR, "fell off the end of \"%s\"", ! RelationGetRelationName(rel)); ! } ! _bt_relbuf(rel, buf); ! buf = rbuf; ! movedright = true; ! vacuumed = false; } /* + * Now we are on the right page, so find the insert position. If we + * moved right at all, we know we should insert at the start of the + * page. If we didn't move right, we can use the firstlegaloff hint + * if the caller supplied one, unless we vacuumed the page which + * might have moved tuples around making the hint invalid. If we + * didn't move right or can't use the hint, find the position + * by searching. + */ + if (movedright) + newitemoff = P_FIRSTDATAKEY(lpageop); + else if(firstlegaloff != InvalidOffsetNumber && !vacuumed) + newitemoff = firstlegaloff; + else + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + + *bufptr = buf; + *offsetptr = newitemoff; + } + + /*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page (making sure that the + * split is equitable as far as post-insert free space goes). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the right buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. 
On return, + * we will have dropped both the pin and the lock on the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propagate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + *---------- + */ + static void + _bt_insertonpg(Relation rel, + Buffer buf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page) + { + Page page; + BTPageOpaque lpageop; + OffsetNumber firstright = InvalidOffsetNumber; + Size itemsz; + + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* * Do we need to split the page to fit the item on it? * * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, *************** *** 1427,1433 **** /* Recursively update the parent */ _bt_insertonpg(rel, pbuf, stack->bts_parent, ! 0, NULL, new_item, stack->bts_offset, is_only); /* be tidy */ --- 1475,1481 ---- /* Recursively update the parent */ _bt_insertonpg(rel, pbuf, stack->bts_parent, ! new_item, stack->bts_offset + 1, is_only); /* be tidy */
Your patch has been added to the PostgreSQL unapplied patches list at: http://momjian.postgresql.org/cgi-bin/pgpatches It will be applied as soon as one of the PostgreSQL committers reviews and approves it. --------------------------------------------------------------------------- Heikki Linnakangas wrote: > Here's a patch that: > > Moves the logic to find a page with enough room from _bt_insertonpg to a > new function, _bt_findinsertloc. It makes the code more readable, and > simplifies the forthcoming Grouped Index Tuples patch. > > Also, the insert location within page used to be calculated twice for > unique indexes, once in _bt_checkunique and second time in > _bt_insertonpg. That's a waste of cycles, and this patch fixes that. > > > I couldn't measure a difference with pgbench, but this micro-benchmark > shows it: > > > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; > INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" > > Without patch: real 0m7.260s > With patch: real 0m6.963s > > On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, > to remove any other variables. > > It's not a huge difference, but it's worth having, and performance > wasn't the main motivation of the patch anyway. > > -- > Heikki Linnakangas > EnterpriseDB http://www.enterprisedb.com > > ---------------------------(end of broadcast)--------------------------- > TIP 7: You can help support the PostgreSQL project by donating at > > http://www.postgresql.org/about/donate -- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. +
Patch applied. Thanks. --------------------------------------------------------------------------- Heikki Linnakangas wrote: > Here's a patch that: > > Moves the logic to find a page with enough room from _bt_insertonpg to a > new function, _bt_findinsertloc. It makes the code more readable, and > simplifies the forthcoming Grouped Index Tuples patch. > > Also, the insert location within page used to be calculated twice for > unique indexes, once in _bt_checkunique and second time in > _bt_insertonpg. That's a waste of cycles, and this patch fixes that. > > > I couldn't measure a difference with pgbench, but this micro-benchmark > shows it: > > > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; > INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" > > Without patch: real 0m7.260s > With patch: real 0m6.963s > > On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, > to remove any other variables. > > It's not a huge difference, but it's worth having, and performance > wasn't the main motivation of the patch anyway. > > -- > Heikki Linnakangas > EnterpriseDB http://www.enterprisedb.com [ text/x-patch is unsupported, treating like TEXT/PLAIN ] > Index: src/backend/access/nbtree/nbtinsert.c > =================================================================== > RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v > retrieving revision 1.152 > diff -c -r1.152 nbtinsert.c > *** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 1.152 > --- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000 > *************** > *** 46,58 **** > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! 
Relation heapRel, Buffer buf, > ScanKey itup_scankey); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > - int keysz, ScanKey scankey, > IndexTuple itup, > ! OffsetNumber afteritem, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > --- 46,63 ---- > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! Relation heapRel, Buffer buf, OffsetNumber ioffset, > ScanKey itup_scankey); > + static void _bt_findinsertloc(Relation rel, > + Buffer *bufptr, > + OffsetNumber *offsetptr, > + int keysz, > + ScanKey scankey, > + IndexTuple newtup); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > IndexTuple itup, > ! OffsetNumber newitemoff, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > *************** > *** 86,91 **** > --- 91,97 ---- > ScanKey itup_scankey; > BTStack stack; > Buffer buf; > + OffsetNumber offset; > > /* we need an insertion scan key to do our search, so build one */ > itup_scankey = _bt_mkscankey(rel, itup); > *************** > *** 94,99 **** > --- 100,107 ---- > /* find the first page containing this key */ > stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); > > + offset = InvalidOffsetNumber; > + > /* trade in our read lock for a write lock */ > LockBuffer(buf, BUFFER_LOCK_UNLOCK); > LockBuffer(buf, BT_WRITE); > *************** > *** 128,134 **** > { > TransactionId xwait; > > ! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > --- 136,143 ---- > { > TransactionId xwait; > > ! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > ! 
xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > *************** > *** 142,148 **** > } > > /* do the insertion */ > ! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); > > /* be tidy */ > _bt_freestack(stack); > --- 151,158 ---- > } > > /* do the insertion */ > ! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); > ! _bt_insertonpg(rel, buf, stack, itup, offset, false); > > /* be tidy */ > _bt_freestack(stack); > *************** > *** 152,169 **** > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, ScanKey itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! OffsetNumber offset, > ! maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > --- 162,182 ---- > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > + * offset points to the first possible item that could conflict. It can > + * also point to end-of-page, which means that the first tuple to check > + * is the first tuple on the next page. > + * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, OffsetNumber offset, ScanKey itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! 
OffsetNumber maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > *************** > *** 173,184 **** > maxoff = PageGetMaxOffsetNumber(page); > > /* > - * Find first item >= proposed new item. Note we could also get a pointer > - * to end-of-page here. > - */ > - offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > - > - /* > * Scan over all equal tuples, looking for live conflicts. > */ > for (;;) > --- 186,191 ---- > *************** > *** 342,374 **** > return InvalidTransactionId; > } > > ! /*---------- > ! * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > ! * > ! * This recursive procedure does the following things: > ! * > ! * + finds the right place to insert the tuple. > ! * + if necessary, splits the target page (making sure that the > ! * split is equitable as far as post-insert free space goes). > ! * + inserts the tuple. > ! * + if the page was split, pops the parent stack, and finds the > ! * right place to insert the new child pointer (by walking > ! * right using information stored in the parent stack). > ! * + invokes itself with the appropriate tuple for the right > ! * child page on the parent. > ! * + updates the metapage if a true root or fast root is split. > ! * > ! * On entry, we must have the right buffer in which to do the > ! * insertion, and the buffer must be pinned and write-locked. On return, > ! * we will have dropped both the pin and the lock on the buffer. > ! * > ! * If 'afteritem' is >0 then the new tuple must be inserted after the > ! * existing item of that number, noplace else. If 'afteritem' is 0 > ! * then the procedure finds the exact spot to insert it by searching. > ! * (keysz and scankey parameters are used ONLY if afteritem == 0. > ! * The scankey must be an insertion-type scankey.) > * > ! 
* NOTE: if the new key is equal to one or more existing keys, we can > * legitimately place it anywhere in the series of equal keys --- in fact, > * if the new key is equal to the page's "high key" we can place it on > * the next page. If it is equal to the high key, and there's not room > --- 349,359 ---- > return InvalidTransactionId; > } > > ! > ! /* > ! * _bt_findinsertloc() -- Finds an insert location for a tuple > * > ! * If the new key is equal to one or more existing keys, we can > * legitimately place it anywhere in the series of equal keys --- in fact, > * if the new key is equal to the page's "high key" we can place it on > * the next page. If it is equal to the high key, and there's not room > *************** > *** 379,414 **** > * Once we have chosen the page to put the key on, we'll insert it before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * The locking interactions in this code are critical. You should > ! * grok Lehman and Yao's paper before making any changes. In addition, > ! * you need to understand how we disambiguate duplicate keys in this > ! * implementation, in order to be able to find our location using > ! * L&Y "move right" operations. Since we may insert duplicate user > ! * keys, and since these dups may propagate up the tree, we use the > ! * 'afteritem' parameter to position ourselves correctly for the > ! * insertion on internal pages. > ! *---------- > */ > static void > ! _bt_insertonpg(Relation rel, > ! Buffer buf, > ! BTStack stack, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple itup, > ! OffsetNumber afteritem, > ! bool split_only_page) > { > ! Page page; > BTPageOpaque lpageop; > OffsetNumber newitemoff; > ! OffsetNumber firstright = InvalidOffsetNumber; > ! Size itemsz; > > - page = BufferGetPage(buf); > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! 
itemsz = IndexTupleDSize(*itup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > * need to be consistent */ > > --- 364,403 ---- > * Once we have chosen the page to put the key on, we'll insert it before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * If there's not enough room in the space, we try to make room by > ! * removing any LP_DELETEd tuples. > ! * > ! * On entry, *buf and *offsetptr point to the first legal position > ! * where the new tuple could be inserted. The caller should hold an > ! * exclusive lock on *buf. *offsetptr can also be set to > ! * InvalidOffsetNumber, in which case the function will search the right > ! * location within the page if needed. On exit, they point to the chosen > ! * insert location. If findinsertloc decided to move right, the lock and > ! * pin on the original page will be released and the new page returned to > ! * the caller is exclusively locked instead. > ! * > ! * newtup is the new tuple we're inserting, and scankey is an insertion > ! * type scan key for it. > */ > static void > ! _bt_findinsertloc(Relation rel, > ! Buffer *bufptr, > ! OffsetNumber *offsetptr, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple newtup) > { > ! Buffer buf = *bufptr; > ! Page page = BufferGetPage(buf); > ! Size itemsz; > BTPageOpaque lpageop; > + bool movedright, vacuumed; > OffsetNumber newitemoff; > ! OffsetNumber firstlegaloff = *offsetptr; > > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! itemsz = IndexTupleDSize(*newtup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > * need to be consistent */ > > *************** > *** 429,525 **** > "Consider a function index of an MD5 hash of the value, " > "or use full text indexing."))); > > - /* > - * Determine exactly where new item will go. 
> - */ > - if (afteritem > 0) > - newitemoff = afteritem + 1; > - else > - { > - /*---------- > - * If we will need to split the page to put the item here, > - * check whether we can put the tuple somewhere to the right, > - * instead. Keep scanning right until we > - * (a) find a page with enough free space, > - * (b) reach the last page where the tuple can legally go, or > - * (c) get tired of searching. > - * (c) is not flippant; it is important because if there are many > - * pages' worth of equal keys, it's better to split one of the early > - * pages than to scan all the way to the end of the run of equal keys > - * on every insert. We implement "get tired" as a random choice, > - * since stopping after scanning a fixed number of pages wouldn't work > - * well (we'd never reach the right-hand side of previously split > - * pages). Currently the probability of moving right is set at 0.99, > - * which may seem too high to change the behavior much, but it does an > - * excellent job of preventing O(N^2) behavior with many equal keys. > - *---------- > - */ > - bool movedright = false; > - > - while (PageGetFreeSpace(page) < itemsz) > - { > - Buffer rbuf; > > - /* > - * before considering moving right, see if we can obtain enough > - * space by erasing LP_DELETE items > - */ > - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > - { > - _bt_vacuum_one_page(rel, buf); > - if (PageGetFreeSpace(page) >= itemsz) > - break; /* OK, now we have enough space */ > - } > > ! /* > ! * nope, so check conditions (b) and (c) enumerated above > ! */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write lock on > ! * current page; else someone else's _bt_check_unique scan could > ! * fail to see our insertion. write locks on intermediate dead > ! 
* pages won't do because we don't know when they will get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > } > > /* > ! * Now we are on the right page, so find the insert position. If we > ! * moved right at all, we know we should insert at the start of the > ! * page, else must find the position by searching. > */ > ! if (movedright) > ! newitemoff = P_FIRSTDATAKEY(lpageop); > ! else > ! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); > } > > /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, > --- 418,573 ---- > "Consider a function index of an MD5 hash of the value, " > "or use full text indexing."))); > > > > ! /*---------- > ! * If we will need to split the page to put the item on this page, > ! * check whether we can put the tuple somewhere to the right, > ! * instead. Keep scanning right until we > ! * (a) find a page with enough free space, > ! * (b) reach the last page where the tuple can legally go, or > ! * (c) get tired of searching. > ! * (c) is not flippant; it is important because if there are many > ! * pages' worth of equal keys, it's better to split one of the early > ! * pages than to scan all the way to the end of the run of equal keys > ! * on every insert. We implement "get tired" as a random choice, > ! * since stopping after scanning a fixed number of pages wouldn't work > ! * well (we'd never reach the right-hand side of previously split > ! * pages). 
Currently the probability of moving right is set at 0.99, > ! * which may seem too high to change the behavior much, but it does an > ! * excellent job of preventing O(N^2) behavior with many equal keys. > ! *---------- > ! */ > ! movedright = false; > ! vacuumed = false; > ! while (PageGetFreeSpace(page) < itemsz) > ! { > ! Buffer rbuf; > > ! /* > ! * before considering moving right, see if we can obtain enough > ! * space by erasing LP_DELETE items > ! */ > ! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > ! { > ! _bt_vacuum_one_page(rel, buf); > > ! /* remember that we vacuumed this page, because that makes > ! * the hint supplied by the caller invalid */ > ! vacuumed = true; > > ! if (PageGetFreeSpace(page) >= itemsz) > ! break; /* OK, now we have enough space */ > } > > /* > ! * nope, so check conditions (b) and (c) enumerated above > */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > ! > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write lock on > ! * current page; else someone else's _bt_check_unique scan could > ! * fail to see our insertion. write locks on intermediate dead > ! * pages won't do because we don't know when they will get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > ! > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > ! > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > ! vacuumed = false; > } > > /* > + * Now we are on the right page, so find the insert position. 
If we > + * moved right at all, we know we should insert at the start of the > + * page. If we didn't move right, we can use the firstlegaloff hint > + * if the caller supplied one, unless we vacuumed the page which > + * might have moved tuples around making the hint invalid. If we > + * didn't move right or can't use the hint, find the position > + * by searching. > + */ > + if (movedright) > + newitemoff = P_FIRSTDATAKEY(lpageop); > + else if(firstlegaloff != InvalidOffsetNumber && !vacuumed) > + newitemoff = firstlegaloff; > + else > + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); > + > + *bufptr = buf; > + *offsetptr = newitemoff; > + } > + > + /*---------- > + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > + * > + * This recursive procedure does the following things: > + * > + * + if necessary, splits the target page (making sure that the > + * split is equitable as far as post-insert free space goes). > + * + inserts the tuple. > + * + if the page was split, pops the parent stack, and finds the > + * right place to insert the new child pointer (by walking > + * right using information stored in the parent stack). > + * + invokes itself with the appropriate tuple for the right > + * child page on the parent. > + * + updates the metapage if a true root or fast root is split. > + * > + * On entry, we must have the right buffer in which to do the > + * insertion, and the buffer must be pinned and write-locked. On return, > + * we will have dropped both the pin and the lock on the buffer. > + * > + * The locking interactions in this code are critical. You should > + * grok Lehman and Yao's paper before making any changes. In addition, > + * you need to understand how we disambiguate duplicate keys in this > + * implementation, in order to be able to find our location using > + * L&Y "move right" operations. 
Since we may insert duplicate user > + * keys, and since these dups may propagate up the tree, we use the > + * 'afteritem' parameter to position ourselves correctly for the > + * insertion on internal pages. > + *---------- > + */ > + static void > + _bt_insertonpg(Relation rel, > + Buffer buf, > + BTStack stack, > + IndexTuple itup, > + OffsetNumber newitemoff, > + bool split_only_page) > + { > + Page page; > + BTPageOpaque lpageop; > + OffsetNumber firstright = InvalidOffsetNumber; > + Size itemsz; > + > + page = BufferGetPage(buf); > + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > + > + itemsz = IndexTupleDSize(*itup); > + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > + * need to be consistent */ > + > + /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, > *************** > *** 1427,1433 **** > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! 0, NULL, new_item, stack->bts_offset, > is_only); > > /* be tidy */ > --- 1475,1481 ---- > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! new_item, stack->bts_offset + 1, > is_only); > > /* be tidy */ > > ---------------------------(end of broadcast)--------------------------- > TIP 7: You can help support the PostgreSQL project by donating at > > http://www.postgresql.org/about/donate -- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. +