Discussion: Split _bt_insertonpg to two functions
Here's a patch that: Moves the logic to find a page with enough room from _bt_insertonpg to a new function, _bt_findinsertloc. It makes the code more readable, and simplifies the forthcoming Grouped Index Tuples patch. Also, the insert location within page used to be calculated twice for unique indexes, once in _bt_checkunique and second time in _bt_insertonpg. That's a waste of cycles, and this patch fixes that. I couldn't measure a difference with pgbench, but this micro-benchmark shows it: > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" Without patch: real 0m7.260s With patch: real 0m6.963s On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, to remove any other variables. It's not a huge difference, but it's worth having, and performance wasn't the main motivation of the patch anyway. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/access/nbtree/nbtinsert.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v retrieving revision 1.152 diff -c -r1.152 nbtinsert.c *** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 1.152 --- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000 *************** *** 46,58 **** static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, ! Relation heapRel, Buffer buf, ScanKey itup_scankey); static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, - int keysz, ScanKey scankey, IndexTuple itup, ! 
OffsetNumber afteritem, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, --- 46,63 ---- static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, ! Relation heapRel, Buffer buf, OffsetNumber ioffset, ScanKey itup_scankey); + static void _bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup); static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, IndexTuple itup, ! OffsetNumber newitemoff, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, *************** *** 86,91 **** --- 91,97 ---- ScanKey itup_scankey; BTStack stack; Buffer buf; + OffsetNumber offset; /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, itup); *************** *** 94,99 **** --- 100,107 ---- /* find the first page containing this key */ stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); + offset = InvalidOffsetNumber; + /* trade in our read lock for a write lock */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BT_WRITE); *************** *** 128,134 **** { TransactionId xwait; ! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); if (TransactionIdIsValid(xwait)) { --- 136,143 ---- { TransactionId xwait; ! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); ! xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey); if (TransactionIdIsValid(xwait)) { *************** *** 142,148 **** } /* do the insertion */ ! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); /* be tidy */ _bt_freestack(stack); --- 151,158 ---- } /* do the insertion */ ! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); ! 
_bt_insertonpg(rel, buf, stack, itup, offset, false); /* be tidy */ _bt_freestack(stack); *************** *** 152,169 **** /* * _bt_check_unique() -- Check for violation of unique index constraint * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). */ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, ! Buffer buf, ScanKey itup_scankey) { TupleDesc itupdesc = RelationGetDescr(rel); int natts = rel->rd_rel->relnatts; ! OffsetNumber offset, ! maxoff; Page page; BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; --- 162,182 ---- /* * _bt_check_unique() -- Check for violation of unique index constraint * + * offset points to the first possible item that could conflict. It can + * also point to end-of-page, which means that the first tuple to check + * is the first tuple on the next page. + * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). */ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, ! Buffer buf, OffsetNumber offset, ScanKey itup_scankey) { TupleDesc itupdesc = RelationGetDescr(rel); int natts = rel->rd_rel->relnatts; ! OffsetNumber maxoff; Page page; BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; *************** *** 173,184 **** maxoff = PageGetMaxOffsetNumber(page); /* - * Find first item >= proposed new item. Note we could also get a pointer - * to end-of-page here. - */ - offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); - - /* * Scan over all equal tuples, looking for live conflicts. */ for (;;) --- 186,191 ---- *************** *** 342,374 **** return InvalidTransactionId; } ! /*---------- ! * _bt_insertonpg() -- Insert a tuple on a particular page in the index. ! * ! 
* This recursive procedure does the following things: ! * ! * + finds the right place to insert the tuple. ! * + if necessary, splits the target page (making sure that the ! * split is equitable as far as post-insert free space goes). ! * + inserts the tuple. ! * + if the page was split, pops the parent stack, and finds the ! * right place to insert the new child pointer (by walking ! * right using information stored in the parent stack). ! * + invokes itself with the appropriate tuple for the right ! * child page on the parent. ! * + updates the metapage if a true root or fast root is split. ! * ! * On entry, we must have the right buffer in which to do the ! * insertion, and the buffer must be pinned and write-locked. On return, ! * we will have dropped both the pin and the lock on the buffer. ! * ! * If 'afteritem' is >0 then the new tuple must be inserted after the ! * existing item of that number, noplace else. If 'afteritem' is 0 ! * then the procedure finds the exact spot to insert it by searching. ! * (keysz and scankey parameters are used ONLY if afteritem == 0. ! * The scankey must be an insertion-type scankey.) * ! * NOTE: if the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys --- in fact, * if the new key is equal to the page's "high key" we can place it on * the next page. If it is equal to the high key, and there's not room --- 349,359 ---- return InvalidTransactionId; } ! ! /* ! * _bt_findinsertloc() -- Finds an insert location for a tuple * ! * If the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys --- in fact, * if the new key is equal to the page's "high key" we can place it on * the next page. If it is equal to the high key, and there's not room *************** *** 379,414 **** * Once we have chosen the page to put the key on, we'll insert it before * any existing equal keys because of the way _bt_binsrch() works. 
* ! * The locking interactions in this code are critical. You should ! * grok Lehman and Yao's paper before making any changes. In addition, ! * you need to understand how we disambiguate duplicate keys in this ! * implementation, in order to be able to find our location using ! * L&Y "move right" operations. Since we may insert duplicate user ! * keys, and since these dups may propagate up the tree, we use the ! * 'afteritem' parameter to position ourselves correctly for the ! * insertion on internal pages. ! *---------- */ static void ! _bt_insertonpg(Relation rel, ! Buffer buf, ! BTStack stack, ! int keysz, ! ScanKey scankey, ! IndexTuple itup, ! OffsetNumber afteritem, ! bool split_only_page) { ! Page page; BTPageOpaque lpageop; OffsetNumber newitemoff; ! OffsetNumber firstright = InvalidOffsetNumber; ! Size itemsz; - page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ --- 364,403 ---- * Once we have chosen the page to put the key on, we'll insert it before * any existing equal keys because of the way _bt_binsrch() works. * ! * If there's not enough room in the space, we try to make room by ! * removing any LP_DELETEd tuples. ! * ! * On entry, *buf and *offsetptr point to the first legal position ! * where the new tuple could be inserted. The caller should hold an ! * exclusive lock on *buf. *offsetptr can also be set to ! * InvalidOffsetNumber, in which case the function will search the right ! * location within the page if needed. On exit, they point to the chosen ! * insert location. If findinsertloc decided to move right, the lock and ! * pin on the original page will be released and the new page returned to ! * the caller is exclusively locked instead. ! * ! * newtup is the new tuple we're inserting, and scankey is an insertion ! * type scan key for it. */ static void ! 
_bt_findinsertloc(Relation rel, ! Buffer *bufptr, ! OffsetNumber *offsetptr, ! int keysz, ! ScanKey scankey, ! IndexTuple newtup) { ! Buffer buf = *bufptr; ! Page page = BufferGetPage(buf); ! Size itemsz; BTPageOpaque lpageop; + bool movedright, vacuumed; OffsetNumber newitemoff; ! OffsetNumber firstlegaloff = *offsetptr; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! itemsz = IndexTupleDSize(*newtup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ *************** *** 429,525 **** "Consider a function index of an MD5 hash of the value, " "or use full text indexing."))); - /* - * Determine exactly where new item will go. - */ - if (afteritem > 0) - newitemoff = afteritem + 1; - else - { - /*---------- - * If we will need to split the page to put the item here, - * check whether we can put the tuple somewhere to the right, - * instead. Keep scanning right until we - * (a) find a page with enough free space, - * (b) reach the last page where the tuple can legally go, or - * (c) get tired of searching. - * (c) is not flippant; it is important because if there are many - * pages' worth of equal keys, it's better to split one of the early - * pages than to scan all the way to the end of the run of equal keys - * on every insert. We implement "get tired" as a random choice, - * since stopping after scanning a fixed number of pages wouldn't work - * well (we'd never reach the right-hand side of previously split - * pages). Currently the probability of moving right is set at 0.99, - * which may seem too high to change the behavior much, but it does an - * excellent job of preventing O(N^2) behavior with many equal keys. 
- *---------- - */ - bool movedright = false; - - while (PageGetFreeSpace(page) < itemsz) - { - Buffer rbuf; - /* - * before considering moving right, see if we can obtain enough - * space by erasing LP_DELETE items - */ - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) - { - _bt_vacuum_one_page(rel, buf); - if (PageGetFreeSpace(page) >= itemsz) - break; /* OK, now we have enough space */ - } ! /* ! * nope, so check conditions (b) and (c) enumerated above ! */ ! if (P_RIGHTMOST(lpageop) || ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || ! random() <= (MAX_RANDOM_VALUE / 100)) ! break; ! /* ! * step right to next non-dead page ! * ! * must write-lock that page before releasing write lock on ! * current page; else someone else's _bt_check_unique scan could ! * fail to see our insertion. write locks on intermediate dead ! * pages won't do because we don't know when they will get ! * de-linked from the tree. ! */ ! rbuf = InvalidBuffer; ! for (;;) ! { ! BlockNumber rblkno = lpageop->btpo_next; ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); ! page = BufferGetPage(rbuf); ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! if (!P_IGNORE(lpageop)) ! break; ! if (P_RIGHTMOST(lpageop)) ! elog(ERROR, "fell off the end of \"%s\"", ! RelationGetRelationName(rel)); ! } ! _bt_relbuf(rel, buf); ! buf = rbuf; ! movedright = true; } /* ! * Now we are on the right page, so find the insert position. If we ! * moved right at all, we know we should insert at the start of the ! * page, else must find the position by searching. */ ! if (movedright) ! newitemoff = P_FIRSTDATAKEY(lpageop); ! else ! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); } /* * Do we need to split the page to fit the item on it? * * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, --- 418,573 ---- "Consider a function index of an MD5 hash of the value, " "or use full text indexing."))); ! /*---------- ! 
* If we will need to split the page to put the item on this page, ! * check whether we can put the tuple somewhere to the right, ! * instead. Keep scanning right until we ! * (a) find a page with enough free space, ! * (b) reach the last page where the tuple can legally go, or ! * (c) get tired of searching. ! * (c) is not flippant; it is important because if there are many ! * pages' worth of equal keys, it's better to split one of the early ! * pages than to scan all the way to the end of the run of equal keys ! * on every insert. We implement "get tired" as a random choice, ! * since stopping after scanning a fixed number of pages wouldn't work ! * well (we'd never reach the right-hand side of previously split ! * pages). Currently the probability of moving right is set at 0.99, ! * which may seem too high to change the behavior much, but it does an ! * excellent job of preventing O(N^2) behavior with many equal keys. ! *---------- ! */ ! movedright = false; ! vacuumed = false; ! while (PageGetFreeSpace(page) < itemsz) ! { ! Buffer rbuf; ! /* ! * before considering moving right, see if we can obtain enough ! * space by erasing LP_DELETE items ! */ ! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) ! { ! _bt_vacuum_one_page(rel, buf); ! /* remember that we vacuumed this page, because that makes ! * the hint supplied by the caller invalid */ ! vacuumed = true; ! if (PageGetFreeSpace(page) >= itemsz) ! break; /* OK, now we have enough space */ } /* ! * nope, so check conditions (b) and (c) enumerated above */ ! if (P_RIGHTMOST(lpageop) || ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || ! random() <= (MAX_RANDOM_VALUE / 100)) ! break; ! ! /* ! * step right to next non-dead page ! * ! * must write-lock that page before releasing write lock on ! * current page; else someone else's _bt_check_unique scan could ! * fail to see our insertion. write locks on intermediate dead ! * pages won't do because we don't know when they will get ! * de-linked from the tree. 
! */ ! rbuf = InvalidBuffer; ! ! for (;;) ! { ! BlockNumber rblkno = lpageop->btpo_next; ! ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); ! page = BufferGetPage(rbuf); ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); ! if (!P_IGNORE(lpageop)) ! break; ! if (P_RIGHTMOST(lpageop)) ! elog(ERROR, "fell off the end of \"%s\"", ! RelationGetRelationName(rel)); ! } ! _bt_relbuf(rel, buf); ! buf = rbuf; ! movedright = true; ! vacuumed = false; } /* + * Now we are on the right page, so find the insert position. If we + * moved right at all, we know we should insert at the start of the + * page. If we didn't move right, we can use the firstlegaloff hint + * if the caller supplied one, unless we vacuumed the page which + * might have moved tuples around making the hint invalid. If we + * didn't move right or can't use the hint, find the position + * by searching. + */ + if (movedright) + newitemoff = P_FIRSTDATAKEY(lpageop); + else if(firstlegaloff != InvalidOffsetNumber && !vacuumed) + newitemoff = firstlegaloff; + else + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + + *bufptr = buf; + *offsetptr = newitemoff; + } + + /*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page (making sure that the + * split is equitable as far as post-insert free space goes). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the right buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. 
On return, + * we will have dropped both the pin and the lock on the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propagate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + *---------- + */ + static void + _bt_insertonpg(Relation rel, + Buffer buf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page) + { + Page page; + BTPageOpaque lpageop; + OffsetNumber firstright = InvalidOffsetNumber; + Size itemsz; + + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* * Do we need to split the page to fit the item on it? * * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, *************** *** 1427,1433 **** /* Recursively update the parent */ _bt_insertonpg(rel, pbuf, stack->bts_parent, ! 0, NULL, new_item, stack->bts_offset, is_only); /* be tidy */ --- 1475,1481 ---- /* Recursively update the parent */ _bt_insertonpg(rel, pbuf, stack->bts_parent, ! new_item, stack->bts_offset + 1, is_only); /* be tidy */
Your patch has been added to the PostgreSQL unapplied patches list at: http://momjian.postgresql.org/cgi-bin/pgpatches It will be applied as soon as one of the PostgreSQL committers reviews and approves it. --------------------------------------------------------------------------- Heikki Linnakangas wrote: > Here's a patch that: > > Moves the logic to find a page with enough room from _bt_insertonpg to a > new function, _bt_findinsertloc. It makes the code more readable, and > simplifies the forthcoming Grouped Index Tuples patch. > > Also, the insert location within page used to be calculated twice for > unique indexes, once in _bt_checkunique and second time in > _bt_insertonpg. That's a waste of cycles, and this patch fixes that. > > > I couldn't measure a difference with pgbench, but this micro-benchmark > shows it: > > > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; > INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" > > Without patch: real 0m7.260s > With patch: real 0m6.963s > > On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, > to remove any other variables. > > It's not a huge difference, but it's worth having, and performance > wasn't the main motivation of the patch anyway. > > -- > Heikki Linnakangas > EnterpriseDB http://www.enterprisedb.com > > ---------------------------(end of broadcast)--------------------------- > TIP 7: You can help support the PostgreSQL project by donating at > > http://www.postgresql.org/about/donate -- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. +
Patch applied. Thanks. --------------------------------------------------------------------------- Heikki Linnakangas wrote: > Here's a patch that: > > Moves the logic to find a page with enough room from _bt_insertonpg to a > new function, _bt_findinsertloc. It makes the code more readable, and > simplifies the forthcoming Grouped Index Tuples patch. > > Also, the insert location within page used to be calculated twice for > unique indexes, once in _bt_checkunique and second time in > _bt_insertonpg. That's a waste of cycles, and this patch fixes that. > > > I couldn't measure a difference with pgbench, but this micro-benchmark > shows it: > > > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; > INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" > > Without patch: real 0m7.260s > With patch: real 0m6.963s > > On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, > to remove any other variables. > > It's not a huge difference, but it's worth having, and performance > wasn't the main motivation of the patch anyway. > > -- > Heikki Linnakangas > EnterpriseDB http://www.enterprisedb.com [ text/x-patch is unsupported, treating like TEXT/PLAIN ] > Index: src/backend/access/nbtree/nbtinsert.c > =================================================================== > RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v > retrieving revision 1.152 > diff -c -r1.152 nbtinsert.c > *** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 1.152 > --- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000 > *************** > *** 46,58 **** > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! 
Relation heapRel, Buffer buf, > ScanKey itup_scankey); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > - int keysz, ScanKey scankey, > IndexTuple itup, > ! OffsetNumber afteritem, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > --- 46,63 ---- > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! Relation heapRel, Buffer buf, OffsetNumber ioffset, > ScanKey itup_scankey); > + static void _bt_findinsertloc(Relation rel, > + Buffer *bufptr, > + OffsetNumber *offsetptr, > + int keysz, > + ScanKey scankey, > + IndexTuple newtup); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > IndexTuple itup, > ! OffsetNumber newitemoff, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > *************** > *** 86,91 **** > --- 91,97 ---- > ScanKey itup_scankey; > BTStack stack; > Buffer buf; > + OffsetNumber offset; > > /* we need an insertion scan key to do our search, so build one */ > itup_scankey = _bt_mkscankey(rel, itup); > *************** > *** 94,99 **** > --- 100,107 ---- > /* find the first page containing this key */ > stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); > > + offset = InvalidOffsetNumber; > + > /* trade in our read lock for a write lock */ > LockBuffer(buf, BUFFER_LOCK_UNLOCK); > LockBuffer(buf, BT_WRITE); > *************** > *** 128,134 **** > { > TransactionId xwait; > > ! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > --- 136,143 ---- > { > TransactionId xwait; > > ! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > ! 
xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > *************** > *** 142,148 **** > } > > /* do the insertion */ > ! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); > > /* be tidy */ > _bt_freestack(stack); > --- 151,158 ---- > } > > /* do the insertion */ > ! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); > ! _bt_insertonpg(rel, buf, stack, itup, offset, false); > > /* be tidy */ > _bt_freestack(stack); > *************** > *** 152,169 **** > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, ScanKey itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! OffsetNumber offset, > ! maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > --- 162,182 ---- > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > + * offset points to the first possible item that could conflict. It can > + * also point to end-of-page, which means that the first tuple to check > + * is the first tuple on the next page. > + * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, OffsetNumber offset, ScanKey itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! 
OffsetNumber maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > *************** > *** 173,184 **** > maxoff = PageGetMaxOffsetNumber(page); > > /* > - * Find first item >= proposed new item. Note we could also get a pointer > - * to end-of-page here. > - */ > - offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > - > - /* > * Scan over all equal tuples, looking for live conflicts. > */ > for (;;) > --- 186,191 ---- > *************** > *** 342,374 **** > return InvalidTransactionId; > } > > ! /*---------- > ! * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > ! * > ! * This recursive procedure does the following things: > ! * > ! * + finds the right place to insert the tuple. > ! * + if necessary, splits the target page (making sure that the > ! * split is equitable as far as post-insert free space goes). > ! * + inserts the tuple. > ! * + if the page was split, pops the parent stack, and finds the > ! * right place to insert the new child pointer (by walking > ! * right using information stored in the parent stack). > ! * + invokes itself with the appropriate tuple for the right > ! * child page on the parent. > ! * + updates the metapage if a true root or fast root is split. > ! * > ! * On entry, we must have the right buffer in which to do the > ! * insertion, and the buffer must be pinned and write-locked. On return, > ! * we will have dropped both the pin and the lock on the buffer. > ! * > ! * If 'afteritem' is >0 then the new tuple must be inserted after the > ! * existing item of that number, noplace else. If 'afteritem' is 0 > ! * then the procedure finds the exact spot to insert it by searching. > ! * (keysz and scankey parameters are used ONLY if afteritem == 0. > ! * The scankey must be an insertion-type scankey.) > * > ! 
* NOTE: if the new key is equal to one or more existing keys, we can > * legitimately place it anywhere in the series of equal keys --- in fact, > * if the new key is equal to the page's "high key" we can place it on > * the next page. If it is equal to the high key, and there's not room > --- 349,359 ---- > return InvalidTransactionId; > } > > ! > ! /* > ! * _bt_findinsertloc() -- Finds an insert location for a tuple > * > ! * If the new key is equal to one or more existing keys, we can > * legitimately place it anywhere in the series of equal keys --- in fact, > * if the new key is equal to the page's "high key" we can place it on > * the next page. If it is equal to the high key, and there's not room > *************** > *** 379,414 **** > * Once we have chosen the page to put the key on, we'll insert it before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * The locking interactions in this code are critical. You should > ! * grok Lehman and Yao's paper before making any changes. In addition, > ! * you need to understand how we disambiguate duplicate keys in this > ! * implementation, in order to be able to find our location using > ! * L&Y "move right" operations. Since we may insert duplicate user > ! * keys, and since these dups may propagate up the tree, we use the > ! * 'afteritem' parameter to position ourselves correctly for the > ! * insertion on internal pages. > ! *---------- > */ > static void > ! _bt_insertonpg(Relation rel, > ! Buffer buf, > ! BTStack stack, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple itup, > ! OffsetNumber afteritem, > ! bool split_only_page) > { > ! Page page; > BTPageOpaque lpageop; > OffsetNumber newitemoff; > ! OffsetNumber firstright = InvalidOffsetNumber; > ! Size itemsz; > > - page = BufferGetPage(buf); > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! 
itemsz = IndexTupleDSize(*itup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > * need to be consistent */ > > --- 364,403 ---- > * Once we have chosen the page to put the key on, we'll insert it before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * If there's not enough room in the space, we try to make room by > ! * removing any LP_DELETEd tuples. > ! * > ! * On entry, *buf and *offsetptr point to the first legal position > ! * where the new tuple could be inserted. The caller should hold an > ! * exclusive lock on *buf. *offsetptr can also be set to > ! * InvalidOffsetNumber, in which case the function will search the right > ! * location within the page if needed. On exit, they point to the chosen > ! * insert location. If findinsertloc decided to move right, the lock and > ! * pin on the original page will be released and the new page returned to > ! * the caller is exclusively locked instead. > ! * > ! * newtup is the new tuple we're inserting, and scankey is an insertion > ! * type scan key for it. > */ > static void > ! _bt_findinsertloc(Relation rel, > ! Buffer *bufptr, > ! OffsetNumber *offsetptr, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple newtup) > { > ! Buffer buf = *bufptr; > ! Page page = BufferGetPage(buf); > ! Size itemsz; > BTPageOpaque lpageop; > + bool movedright, vacuumed; > OffsetNumber newitemoff; > ! OffsetNumber firstlegaloff = *offsetptr; > > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! itemsz = IndexTupleDSize(*newtup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > * need to be consistent */ > > *************** > *** 429,525 **** > "Consider a function index of an MD5 hash of the value, " > "or use full text indexing."))); > > - /* > - * Determine exactly where new item will go. 
> - */ > - if (afteritem > 0) > - newitemoff = afteritem + 1; > - else > - { > - /*---------- > - * If we will need to split the page to put the item here, > - * check whether we can put the tuple somewhere to the right, > - * instead. Keep scanning right until we > - * (a) find a page with enough free space, > - * (b) reach the last page where the tuple can legally go, or > - * (c) get tired of searching. > - * (c) is not flippant; it is important because if there are many > - * pages' worth of equal keys, it's better to split one of the early > - * pages than to scan all the way to the end of the run of equal keys > - * on every insert. We implement "get tired" as a random choice, > - * since stopping after scanning a fixed number of pages wouldn't work > - * well (we'd never reach the right-hand side of previously split > - * pages). Currently the probability of moving right is set at 0.99, > - * which may seem too high to change the behavior much, but it does an > - * excellent job of preventing O(N^2) behavior with many equal keys. > - *---------- > - */ > - bool movedright = false; > - > - while (PageGetFreeSpace(page) < itemsz) > - { > - Buffer rbuf; > > - /* > - * before considering moving right, see if we can obtain enough > - * space by erasing LP_DELETE items > - */ > - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > - { > - _bt_vacuum_one_page(rel, buf); > - if (PageGetFreeSpace(page) >= itemsz) > - break; /* OK, now we have enough space */ > - } > > ! /* > ! * nope, so check conditions (b) and (c) enumerated above > ! */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write lock on > ! * current page; else someone else's _bt_check_unique scan could > ! * fail to see our insertion. write locks on intermediate dead > ! 
* pages won't do because we don't know when they will get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > } > > /* > ! * Now we are on the right page, so find the insert position. If we > ! * moved right at all, we know we should insert at the start of the > ! * page, else must find the position by searching. > */ > ! if (movedright) > ! newitemoff = P_FIRSTDATAKEY(lpageop); > ! else > ! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); > } > > /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, > --- 418,573 ---- > "Consider a function index of an MD5 hash of the value, " > "or use full text indexing."))); > > > > ! /*---------- > ! * If we will need to split the page to put the item on this page, > ! * check whether we can put the tuple somewhere to the right, > ! * instead. Keep scanning right until we > ! * (a) find a page with enough free space, > ! * (b) reach the last page where the tuple can legally go, or > ! * (c) get tired of searching. > ! * (c) is not flippant; it is important because if there are many > ! * pages' worth of equal keys, it's better to split one of the early > ! * pages than to scan all the way to the end of the run of equal keys > ! * on every insert. We implement "get tired" as a random choice, > ! * since stopping after scanning a fixed number of pages wouldn't work > ! * well (we'd never reach the right-hand side of previously split > ! * pages). 
Currently the probability of moving right is set at 0.99, > ! * which may seem too high to change the behavior much, but it does an > ! * excellent job of preventing O(N^2) behavior with many equal keys. > ! *---------- > ! */ > ! movedright = false; > ! vacuumed = false; > ! while (PageGetFreeSpace(page) < itemsz) > ! { > ! Buffer rbuf; > > ! /* > ! * before considering moving right, see if we can obtain enough > ! * space by erasing LP_DELETE items > ! */ > ! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > ! { > ! _bt_vacuum_one_page(rel, buf); > > ! /* remember that we vacuumed this page, because that makes > ! * the hint supplied by the caller invalid */ > ! vacuumed = true; > > ! if (PageGetFreeSpace(page) >= itemsz) > ! break; /* OK, now we have enough space */ > } > > /* > ! * nope, so check conditions (b) and (c) enumerated above > */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > ! > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write lock on > ! * current page; else someone else's _bt_check_unique scan could > ! * fail to see our insertion. write locks on intermediate dead > ! * pages won't do because we don't know when they will get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > ! > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > ! > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > ! vacuumed = false; > } > > /* > + * Now we are on the right page, so find the insert position. 
If we > + * moved right at all, we know we should insert at the start of the > + * page. If we didn't move right, we can use the firstlegaloff hint > + * if the caller supplied one, unless we vacuumed the page which > + * might have moved tuples around making the hint invalid. If we > + * didn't move right or can't use the hint, find the position > + * by searching. > + */ > + if (movedright) > + newitemoff = P_FIRSTDATAKEY(lpageop); > + else if(firstlegaloff != InvalidOffsetNumber && !vacuumed) > + newitemoff = firstlegaloff; > + else > + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); > + > + *bufptr = buf; > + *offsetptr = newitemoff; > + } > + > + /*---------- > + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > + * > + * This recursive procedure does the following things: > + * > + * + if necessary, splits the target page (making sure that the > + * split is equitable as far as post-insert free space goes). > + * + inserts the tuple. > + * + if the page was split, pops the parent stack, and finds the > + * right place to insert the new child pointer (by walking > + * right using information stored in the parent stack). > + * + invokes itself with the appropriate tuple for the right > + * child page on the parent. > + * + updates the metapage if a true root or fast root is split. > + * > + * On entry, we must have the right buffer in which to do the > + * insertion, and the buffer must be pinned and write-locked. On return, > + * we will have dropped both the pin and the lock on the buffer. > + * > + * The locking interactions in this code are critical. You should > + * grok Lehman and Yao's paper before making any changes. In addition, > + * you need to understand how we disambiguate duplicate keys in this > + * implementation, in order to be able to find our location using > + * L&Y "move right" operations. 
Since we may insert duplicate user > + * keys, and since these dups may propagate up the tree, we use the > + * 'afteritem' parameter to position ourselves correctly for the > + * insertion on internal pages. > + *---------- > + */ > + static void > + _bt_insertonpg(Relation rel, > + Buffer buf, > + BTStack stack, > + IndexTuple itup, > + OffsetNumber newitemoff, > + bool split_only_page) > + { > + Page page; > + BTPageOpaque lpageop; > + OffsetNumber firstright = InvalidOffsetNumber; > + Size itemsz; > + > + page = BufferGetPage(buf); > + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > + > + itemsz = IndexTupleDSize(*itup); > + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we > + * need to be consistent */ > + > + /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, > *************** > *** 1427,1433 **** > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! 0, NULL, new_item, stack->bts_offset, > is_only); > > /* be tidy */ > --- 1475,1481 ---- > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! new_item, stack->bts_offset + 1, > is_only); > > /* be tidy */ > > ---------------------------(end of broadcast)--------------------------- > TIP 7: You can help support the PostgreSQL project by donating at > > http://www.postgresql.org/about/donate -- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. +