Re: Split _bt_insertonpg to two functions

From: Bruce Momjian
Subject: Re: Split _bt_insertonpg to two functions
Date:
Msg-id: 200703032013.l23KD8V07740@momjian.us
In reply to: Split _bt_insertonpg to two functions  (Heikki Linnakangas <heikki@enterprisedb.com>)
List: pgsql-patches
Patch applied.  Thanks.

---------------------------------------------------------------------------


Heikki Linnakangas wrote:
> Here's a patch that:
>
> Moves the logic to find a page with enough room from _bt_insertonpg to a
> new function, _bt_findinsertloc. It makes the code more readable, and
> simplifies the forthcoming Grouped Index Tuples patch.
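>
> For reference, the declarations after the split (as in the patch below)
> show the new division of labor: _bt_insertonpg no longer takes
> keysz/scankey/afteritem, just the final offset already chosen by
> _bt_findinsertloc:
>
>   static void _bt_findinsertloc(Relation rel, Buffer *bufptr,
>                                 OffsetNumber *offsetptr, int keysz,
>                                 ScanKey scankey, IndexTuple newtup);
>   static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack,
>                              IndexTuple itup, OffsetNumber newitemoff,
>                              bool split_only_page);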
>
> Also, the insert location within the page used to be calculated twice for
> unique indexes: once in _bt_check_unique and a second time in
> _bt_insertonpg. That's a waste of cycles, and this patch fixes that.
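>
> Roughly, the unique-index path in _bt_doinsert now becomes (a sketch of
> the flow, with the wait/retry and error handling elided):
>
>   offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
>   xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
>   /* ... if xwait is valid, wait for that transaction and retry ... */
>   _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
>   _bt_insertonpg(rel, buf, stack, itup, offset, false);
>
> so the _bt_binsrch result from the uniqueness check is reused as the
> insert-location hint instead of searching the page a second time.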
>
>
> I couldn't measure a difference with pgbench, but this micro-benchmark
> shows it:
>
>  > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);"
>  > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync
>  > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest;
> INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;"
>
> Without patch:    real    0m7.260s
> With patch:    real    0m6.963s
>
> On my laptop, with fsync=off, full_page_writes=off, and
> checkpoint_segments = 10, to remove any other variables.
>
> It's not a huge difference, but it's worth having, and performance
> wasn't the main motivation of the patch anyway.
>
> --
>    Heikki Linnakangas
>    EnterpriseDB   http://www.enterprisedb.com

> Index: src/backend/access/nbtree/nbtinsert.c
> ===================================================================
> RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
> retrieving revision 1.152
> diff -c -r1.152 nbtinsert.c
> *** src/backend/access/nbtree/nbtinsert.c    21 Feb 2007 20:02:17 -0000    1.152
> --- src/backend/access/nbtree/nbtinsert.c    26 Feb 2007 09:37:16 -0000
> ***************
> *** 46,58 ****
>   static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
>
>   static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
> !                  Relation heapRel, Buffer buf,
>                    ScanKey itup_scankey);
>   static void _bt_insertonpg(Relation rel, Buffer buf,
>                  BTStack stack,
> -                int keysz, ScanKey scankey,
>                  IndexTuple itup,
> !                OffsetNumber afteritem,
>                  bool split_only_page);
>   static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
>             OffsetNumber newitemoff, Size newitemsz,
> --- 46,63 ----
>   static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
>
>   static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
> !                  Relation heapRel, Buffer buf, OffsetNumber ioffset,
>                    ScanKey itup_scankey);
> + static void _bt_findinsertloc(Relation rel,
> +                   Buffer *bufptr,
> +                   OffsetNumber *offsetptr,
> +                   int keysz,
> +                   ScanKey scankey,
> +                   IndexTuple newtup);
>   static void _bt_insertonpg(Relation rel, Buffer buf,
>                  BTStack stack,
>                  IndexTuple itup,
> !                OffsetNumber newitemoff,
>                  bool split_only_page);
>   static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
>             OffsetNumber newitemoff, Size newitemsz,
> ***************
> *** 86,91 ****
> --- 91,97 ----
>       ScanKey        itup_scankey;
>       BTStack        stack;
>       Buffer        buf;
> +     OffsetNumber offset;
>
>       /* we need an insertion scan key to do our search, so build one */
>       itup_scankey = _bt_mkscankey(rel, itup);
> ***************
> *** 94,99 ****
> --- 100,107 ----
>       /* find the first page containing this key */
>       stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
>
> +     offset = InvalidOffsetNumber;
> +
>       /* trade in our read lock for a write lock */
>       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
>       LockBuffer(buf, BT_WRITE);
> ***************
> *** 128,134 ****
>       {
>           TransactionId xwait;
>
> !         xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
>
>           if (TransactionIdIsValid(xwait))
>           {
> --- 136,143 ----
>       {
>           TransactionId xwait;
>
> !         offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
> !         xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
>
>           if (TransactionIdIsValid(xwait))
>           {
> ***************
> *** 142,148 ****
>       }
>
>       /* do the insertion */
> !     _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
>
>       /* be tidy */
>       _bt_freestack(stack);
> --- 151,158 ----
>       }
>
>       /* do the insertion */
> !     _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
> !     _bt_insertonpg(rel, buf, stack, itup, offset, false);
>
>       /* be tidy */
>       _bt_freestack(stack);
> ***************
> *** 152,169 ****
>   /*
>    *    _bt_check_unique() -- Check for violation of unique index constraint
>    *
>    * Returns InvalidTransactionId if there is no conflict, else an xact ID
>    * we must wait for to see if it commits a conflicting tuple.    If an actual
>    * conflict is detected, no return --- just ereport().
>    */
>   static TransactionId
>   _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
> !                  Buffer buf, ScanKey itup_scankey)
>   {
>       TupleDesc    itupdesc = RelationGetDescr(rel);
>       int            natts = rel->rd_rel->relnatts;
> !     OffsetNumber offset,
> !                 maxoff;
>       Page        page;
>       BTPageOpaque opaque;
>       Buffer        nbuf = InvalidBuffer;
> --- 162,182 ----
>   /*
>    *    _bt_check_unique() -- Check for violation of unique index constraint
>    *
> +  * offset points to the first possible item that could conflict. It can
> +  * also point to end-of-page, which means that the first tuple to check
> +  * is the first tuple on the next page.
> +  *
>    * Returns InvalidTransactionId if there is no conflict, else an xact ID
>    * we must wait for to see if it commits a conflicting tuple.    If an actual
>    * conflict is detected, no return --- just ereport().
>    */
>   static TransactionId
>   _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
> !                  Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
>   {
>       TupleDesc    itupdesc = RelationGetDescr(rel);
>       int            natts = rel->rd_rel->relnatts;
> !     OffsetNumber maxoff;
>       Page        page;
>       BTPageOpaque opaque;
>       Buffer        nbuf = InvalidBuffer;
> ***************
> *** 173,184 ****
>       maxoff = PageGetMaxOffsetNumber(page);
>
>       /*
> -      * Find first item >= proposed new item.  Note we could also get a pointer
> -      * to end-of-page here.
> -      */
> -     offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
> -
> -     /*
>        * Scan over all equal tuples, looking for live conflicts.
>        */
>       for (;;)
> --- 186,191 ----
> ***************
> *** 342,374 ****
>       return InvalidTransactionId;
>   }
>
> ! /*----------
> !  *    _bt_insertonpg() -- Insert a tuple on a particular page in the index.
> !  *
> !  *        This recursive procedure does the following things:
> !  *
> !  *            +  finds the right place to insert the tuple.
> !  *            +  if necessary, splits the target page (making sure that the
> !  *               split is equitable as far as post-insert free space goes).
> !  *            +  inserts the tuple.
> !  *            +  if the page was split, pops the parent stack, and finds the
> !  *               right place to insert the new child pointer (by walking
> !  *               right using information stored in the parent stack).
> !  *            +  invokes itself with the appropriate tuple for the right
> !  *               child page on the parent.
> !  *            +  updates the metapage if a true root or fast root is split.
> !  *
> !  *        On entry, we must have the right buffer in which to do the
> !  *        insertion, and the buffer must be pinned and write-locked.    On return,
> !  *        we will have dropped both the pin and the lock on the buffer.
> !  *
> !  *        If 'afteritem' is >0 then the new tuple must be inserted after the
> !  *        existing item of that number, noplace else.  If 'afteritem' is 0
> !  *        then the procedure finds the exact spot to insert it by searching.
> !  *        (keysz and scankey parameters are used ONLY if afteritem == 0.
> !  *        The scankey must be an insertion-type scankey.)
>    *
> !  *        NOTE: if the new key is equal to one or more existing keys, we can
>    *        legitimately place it anywhere in the series of equal keys --- in fact,
>    *        if the new key is equal to the page's "high key" we can place it on
>    *        the next page.    If it is equal to the high key, and there's not room
> --- 349,359 ----
>       return InvalidTransactionId;
>   }
>
> !
> ! /*
> !  *    _bt_findinsertloc() -- Finds an insert location for a tuple
>    *
> !  *        If the new key is equal to one or more existing keys, we can
>    *        legitimately place it anywhere in the series of equal keys --- in fact,
>    *        if the new key is equal to the page's "high key" we can place it on
>    *        the next page.    If it is equal to the high key, and there's not room
> ***************
> *** 379,414 ****
>    *        Once we have chosen the page to put the key on, we'll insert it before
>    *        any existing equal keys because of the way _bt_binsrch() works.
>    *
> !  *        The locking interactions in this code are critical.  You should
> !  *        grok Lehman and Yao's paper before making any changes.  In addition,
> !  *        you need to understand how we disambiguate duplicate keys in this
> !  *        implementation, in order to be able to find our location using
> !  *        L&Y "move right" operations.  Since we may insert duplicate user
> !  *        keys, and since these dups may propagate up the tree, we use the
> !  *        'afteritem' parameter to position ourselves correctly for the
> !  *        insertion on internal pages.
> !  *----------
>    */
>   static void
> ! _bt_insertonpg(Relation rel,
> !                Buffer buf,
> !                BTStack stack,
> !                int keysz,
> !                ScanKey scankey,
> !                IndexTuple itup,
> !                OffsetNumber afteritem,
> !                bool split_only_page)
>   {
> !     Page        page;
>       BTPageOpaque lpageop;
>       OffsetNumber newitemoff;
> !     OffsetNumber firstright = InvalidOffsetNumber;
> !     Size        itemsz;
>
> -     page = BufferGetPage(buf);
>       lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
>
> !     itemsz = IndexTupleDSize(*itup);
>       itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
>                                    * need to be consistent */
>
> --- 364,403 ----
>    *        Once we have chosen the page to put the key on, we'll insert it before
>    *        any existing equal keys because of the way _bt_binsrch() works.
>    *
> !  *        If there's not enough room on the page, we try to make room by
> !  *        removing any LP_DELETEd tuples.
> !  *
> !  *        On entry, *buf and *offsetptr point to the first legal position
> !  *        where the new tuple could be inserted. The caller should hold an
> !  *        exclusive lock on *buf. *offsetptr can also be set to
> !  *        InvalidOffsetNumber, in which case the function will find the right
> !  *        location within the page if needed. On exit, they point to the chosen
> !  *        insert location. If findinsertloc decided to move right, the lock and
> !  *        pin on the original page will be released and the new page returned to
> !  *        the caller is exclusively locked instead.
> !  *
> !  *        newtup is the new tuple we're inserting, and scankey is an insertion
> !  *        type scan key for it.
>    */
>   static void
> ! _bt_findinsertloc(Relation rel,
> !                   Buffer *bufptr,
> !                   OffsetNumber *offsetptr,
> !                   int keysz,
> !                   ScanKey scankey,
> !                   IndexTuple newtup)
>   {
> !     Buffer buf = *bufptr;
> !     Page page = BufferGetPage(buf);
> !     Size itemsz;
>       BTPageOpaque lpageop;
> +     bool movedright, vacuumed;
>       OffsetNumber newitemoff;
> !     OffsetNumber firstlegaloff = *offsetptr;
>
>       lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
>
> !     itemsz = IndexTupleDSize(*newtup);
>       itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
>                                    * need to be consistent */
>
> ***************
> *** 429,525 ****
>                   "Consider a function index of an MD5 hash of the value, "
>                   "or use full text indexing.")));
>
> -     /*
> -      * Determine exactly where new item will go.
> -      */
> -     if (afteritem > 0)
> -         newitemoff = afteritem + 1;
> -     else
> -     {
> -         /*----------
> -          * If we will need to split the page to put the item here,
> -          * check whether we can put the tuple somewhere to the right,
> -          * instead.  Keep scanning right until we
> -          *        (a) find a page with enough free space,
> -          *        (b) reach the last page where the tuple can legally go, or
> -          *        (c) get tired of searching.
> -          * (c) is not flippant; it is important because if there are many
> -          * pages' worth of equal keys, it's better to split one of the early
> -          * pages than to scan all the way to the end of the run of equal keys
> -          * on every insert.  We implement "get tired" as a random choice,
> -          * since stopping after scanning a fixed number of pages wouldn't work
> -          * well (we'd never reach the right-hand side of previously split
> -          * pages).    Currently the probability of moving right is set at 0.99,
> -          * which may seem too high to change the behavior much, but it does an
> -          * excellent job of preventing O(N^2) behavior with many equal keys.
> -          *----------
> -          */
> -         bool        movedright = false;
> -
> -         while (PageGetFreeSpace(page) < itemsz)
> -         {
> -             Buffer        rbuf;
>
> -             /*
> -              * before considering moving right, see if we can obtain enough
> -              * space by erasing LP_DELETE items
> -              */
> -             if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
> -             {
> -                 _bt_vacuum_one_page(rel, buf);
> -                 if (PageGetFreeSpace(page) >= itemsz)
> -                     break;        /* OK, now we have enough space */
> -             }
>
> !             /*
> !              * nope, so check conditions (b) and (c) enumerated above
> !              */
> !             if (P_RIGHTMOST(lpageop) ||
> !                 _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
> !                 random() <= (MAX_RANDOM_VALUE / 100))
> !                 break;
>
> !             /*
> !              * step right to next non-dead page
> !              *
> !              * must write-lock that page before releasing write lock on
> !              * current page; else someone else's _bt_check_unique scan could
> !              * fail to see our insertion.  write locks on intermediate dead
> !              * pages won't do because we don't know when they will get
> !              * de-linked from the tree.
> !              */
> !             rbuf = InvalidBuffer;
>
> !             for (;;)
> !             {
> !                 BlockNumber rblkno = lpageop->btpo_next;
>
> !                 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
> !                 page = BufferGetPage(rbuf);
> !                 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
> !                 if (!P_IGNORE(lpageop))
> !                     break;
> !                 if (P_RIGHTMOST(lpageop))
> !                     elog(ERROR, "fell off the end of \"%s\"",
> !                          RelationGetRelationName(rel));
> !             }
> !             _bt_relbuf(rel, buf);
> !             buf = rbuf;
> !             movedright = true;
>           }
>
>           /*
> !          * Now we are on the right page, so find the insert position. If we
> !          * moved right at all, we know we should insert at the start of the
> !          * page, else must find the position by searching.
>            */
> !         if (movedright)
> !             newitemoff = P_FIRSTDATAKEY(lpageop);
> !         else
> !             newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
>       }
>
>       /*
>        * Do we need to split the page to fit the item on it?
>        *
>        * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
> --- 418,573 ----
>                   "Consider a function index of an MD5 hash of the value, "
>                   "or use full text indexing.")));
>
>
>
> !     /*----------
> !      * If we will need to split the page to put the item on this page,
> !      * check whether we can put the tuple somewhere to the right,
> !      * instead.  Keep scanning right until we
> !      *        (a) find a page with enough free space,
> !      *        (b) reach the last page where the tuple can legally go, or
> !      *        (c) get tired of searching.
> !      * (c) is not flippant; it is important because if there are many
> !      * pages' worth of equal keys, it's better to split one of the early
> !      * pages than to scan all the way to the end of the run of equal keys
> !      * on every insert.  We implement "get tired" as a random choice,
> !      * since stopping after scanning a fixed number of pages wouldn't work
> !      * well (we'd never reach the right-hand side of previously split
> !      * pages).    Currently the probability of moving right is set at 0.99,
> !      * which may seem too high to change the behavior much, but it does an
> !      * excellent job of preventing O(N^2) behavior with many equal keys.
> !      *----------
> !      */
> !     movedright = false;
> !     vacuumed = false;
> !     while (PageGetFreeSpace(page) < itemsz)
> !     {
> !         Buffer        rbuf;
>
> !         /*
> !          * before considering moving right, see if we can obtain enough
> !          * space by erasing LP_DELETE items
> !          */
> !         if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
> !         {
> !             _bt_vacuum_one_page(rel, buf);
>
> !             /* remember that we vacuumed this page, because that makes
> !              * the hint supplied by the caller invalid */
> !             vacuumed = true;
>
> !             if (PageGetFreeSpace(page) >= itemsz)
> !                 break;        /* OK, now we have enough space */
>           }
>
>           /*
> !          * nope, so check conditions (b) and (c) enumerated above
>            */
> !         if (P_RIGHTMOST(lpageop) ||
> !             _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
> !             random() <= (MAX_RANDOM_VALUE / 100))
> !             break;
> !
> !         /*
> !          * step right to next non-dead page
> !          *
> !          * must write-lock that page before releasing write lock on
> !          * current page; else someone else's _bt_check_unique scan could
> !          * fail to see our insertion.  write locks on intermediate dead
> !          * pages won't do because we don't know when they will get
> !          * de-linked from the tree.
> !          */
> !         rbuf = InvalidBuffer;
> !
> !         for (;;)
> !         {
> !             BlockNumber rblkno = lpageop->btpo_next;
> !
> !             rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
> !             page = BufferGetPage(rbuf);
> !             lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
> !             if (!P_IGNORE(lpageop))
> !                 break;
> !             if (P_RIGHTMOST(lpageop))
> !                 elog(ERROR, "fell off the end of \"%s\"",
> !                      RelationGetRelationName(rel));
> !         }
> !         _bt_relbuf(rel, buf);
> !         buf = rbuf;
> !         movedright = true;
> !         vacuumed = false;
>       }
>
>       /*
> +      * Now we are on the right page, so find the insert position. If we
> +      * moved right at all, we know we should insert at the start of the
> +      * page. If we didn't move right, we can use the firstlegaloff hint
> +      * if the caller supplied one, unless we vacuumed the page which
> +      * might have moved tuples around making the hint invalid. If we
> +      * didn't move right or can't use the hint, find the position
> +      * by searching.
> +      */
> +     if (movedright)
> +         newitemoff = P_FIRSTDATAKEY(lpageop);
> +     else if(firstlegaloff != InvalidOffsetNumber && !vacuumed)
> +         newitemoff = firstlegaloff;
> +     else
> +         newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
> +
> +     *bufptr = buf;
> +     *offsetptr = newitemoff;
> + }
> +
> + /*----------
> +  *    _bt_insertonpg() -- Insert a tuple on a particular page in the index.
> +  *
> +  *        This recursive procedure does the following things:
> +  *
> +  *            +  if necessary, splits the target page (making sure that the
> +  *               split is equitable as far as post-insert free space goes).
> +  *            +  inserts the tuple.
> +  *            +  if the page was split, pops the parent stack, and finds the
> +  *               right place to insert the new child pointer (by walking
> +  *               right using information stored in the parent stack).
> +  *            +  invokes itself with the appropriate tuple for the right
> +  *               child page on the parent.
> +  *            +  updates the metapage if a true root or fast root is split.
> +  *
> +  *        On entry, we must have the right buffer in which to do the
> +  *        insertion, and the buffer must be pinned and write-locked.    On return,
> +  *        we will have dropped both the pin and the lock on the buffer.
> +  *
> +  *        The locking interactions in this code are critical.  You should
> +  *        grok Lehman and Yao's paper before making any changes.  In addition,
> +  *        you need to understand how we disambiguate duplicate keys in this
> +  *        implementation, in order to be able to find our location using
> +  *        L&Y "move right" operations.  Since we may insert duplicate user
> +  *        keys, and since these dups may propagate up the tree, we use the
> +  *        'newitemoff' parameter to position ourselves correctly for the
> +  *        insertion on internal pages.
> +  *----------
> +  */
> + static void
> + _bt_insertonpg(Relation rel,
> +                Buffer buf,
> +                BTStack stack,
> +                IndexTuple itup,
> +                OffsetNumber newitemoff,
> +                bool split_only_page)
> + {
> +     Page        page;
> +     BTPageOpaque lpageop;
> +     OffsetNumber firstright = InvalidOffsetNumber;
> +     Size        itemsz;
> +
> +     page = BufferGetPage(buf);
> +     lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
> +
> +     itemsz = IndexTupleDSize(*itup);
> +     itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
> +                                  * need to be consistent */
> +
> +     /*
>        * Do we need to split the page to fit the item on it?
>        *
>        * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
> ***************
> *** 1427,1433 ****
>
>           /* Recursively update the parent */
>           _bt_insertonpg(rel, pbuf, stack->bts_parent,
> !                        0, NULL, new_item, stack->bts_offset,
>                          is_only);
>
>           /* be tidy */
> --- 1475,1481 ----
>
>           /* Recursively update the parent */
>           _bt_insertonpg(rel, pbuf, stack->bts_parent,
> !                        new_item, stack->bts_offset + 1,
>                          is_only);
>
>           /* be tidy */


--
  Bruce Momjian  <bruce@momjian.us>          http://momjian.us
  EnterpriseDB                               http://www.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
