*** a/doc/src/sgml/gist.sgml --- b/doc/src/sgml/gist.sgml *************** *** 642,647 **** my_distance(PG_FUNCTION_ARGS) --- 642,704 ---- + + GiST buffering build + + Building large GiST indexes (indexes which don't fit in cache) + by sequential insertions tends to be slow, because a large fraction of + insertions involves actual IO operations. The exception is well-ordered + datasets, where nearby index tuples are routed to nearby index pages. + PostgreSQL from version 9.2 supports a buffering index build technique + which can dramatically reduce the number of IO operations during index build. + This technique is an application of the buffer tree and is based on the + following works. + + + + + + Efficient Bulk Operations on Dynamic R-trees + + + + + + + The Buffer Tree: A new technique for optimal I/O-algorithms + + + + + + + The implemented technique can be briefly described as follows. Additional + buffers are attached to all nodes of some levels of the tree. Leaf + nodes never contain buffers. When an index tuple reaches a node with a + buffer, it is placed into the buffer. When a buffer overflows, it is emptied + into lower buffers or leaf pages. One emptying process can trigger + cascading emptying processes of underlying buffers. If a split occurs in a + node with a buffer, then this buffer splits into two buffers using the + penalty function. Processing index tuples in a + bulk manner in small sub-trees gives a dramatic economy of IO operations. + However, a buffering index build requires some extra CPU resources because + of extra penalty calls. Also, it can influence + the produced index quality in both positive and negative directions. + That influence depends on various factors including dataset distribution, + operator class implementation etc. + + + + Buffering build can be turned off, turned on, or be in automatic mode. In + automatic mode it is initially off, but it turns on when the index size + reaches . That behaviour is + controlled by the BUFFERING parameter. 
Also the step of levels + where buffers appear and the size of buffers are available for tuning via the + LEVELSTEP and BUFFERSIZE parameters. + See for details. + + + *** a/doc/src/sgml/ref/create_index.sgml --- b/doc/src/sgml/ref/create_index.sgml *************** *** 341,346 **** CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ name + + GiST indexes additionally accept these parameters: + + + + + + BUFFERING + + + Determines the mode of the buffering build technique described in + . With OFF it is + disabled. With ON it is enabled during the whole build process. With + AUTO (default) it is initially disabled, but it turns on when the + index size reaches . + + + + + + LEVELSTEP + + + In a buffering build, buffers are located at tree levels i * LEVELSTEP, + i > 0 (we use upward level numbering; level = 0 corresponds to leaf pages). + By default LEVELSTEP is calculated so that a sub-tree + of LEVELSTEP height fits + and . + + + + + + BUFFERSIZE + + + Maximum size of a node buffer in pages. By default it is calculated so that + half-emptying a node buffer fills on average one page per underlying node + buffer. This ratio guarantees effective IO usage. In some cases a lower + BUFFERSIZE can give comparable IO economy with less CPU + overhead. 
+ + + + + *** a/src/backend/access/common/reloptions.c --- b/src/backend/access/common/reloptions.c *************** *** 30,35 **** --- 30,38 ---- #include "utils/memutils.h" #include "utils/rel.h" + + static void validateBufferingOption(char *value); + /* * Contents of pg_class.reloptions * *************** *** 159,164 **** static relopt_int intRelOpts[] = --- 162,183 ---- RELOPT_KIND_HEAP | RELOPT_KIND_TOAST }, -1, 0, 2000000000 }, + { + { + "levelstep", + "Level step in GiST index buffering build", + RELOPT_KIND_GIST + }, + -1, 1, 100 + }, + { + { + "buffersize", + "Buffer size in GiST index buffering build", + RELOPT_KIND_GIST + }, + -1, 1, 1000000000 + }, /* list terminator */ {{NULL}} }; *************** *** 219,224 **** static relopt_real realRelOpts[] = --- 238,254 ---- static relopt_string stringRelOpts[] = { + { + { + "buffering", + "Enables buffering build for this GiST index", + RELOPT_KIND_GIST + }, + 4, + false, + validateBufferingOption, + "auto" + }, /* list terminator */ {{NULL}} }; *************** *** 1267,1269 **** tablespace_reloptions(Datum reloptions, bool validate) --- 1297,1318 ---- return (bytea *) tsopts; } + + /* + * Validator for "buffering" option of GiST indexed. Allows only "on", "off" and + * "auto" values. + */ + static void + validateBufferingOption(char *value) + { + if (!value || + ( + strcmp(value, "on") && + strcmp(value, "off") && + strcmp(value, "auto") + ) + ) + { + elog(ERROR, "Only \"on\", \"off\" and \"auto\" values are available for \"buffering\" option."); + } + } *** a/src/backend/access/gist/Makefile --- b/src/backend/access/gist/Makefile *************** *** 13,18 **** top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \ ! gistproc.o gistsplit.o include $(top_srcdir)/src/backend/common.mk --- 13,18 ---- include $(top_builddir)/src/Makefile.global OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \ ! 
gistproc.o gistsplit.o gistbuild.o gistbuildbuffers.o include $(top_srcdir)/src/backend/common.mk *** a/src/backend/access/gist/README --- b/src/backend/access/gist/README *************** *** 24,29 **** The current implementation of GiST supports: --- 24,30 ---- * provides NULL-safe interface to GiST core * Concurrency * Recovery support via WAL logging + * Buffering build algorithm The support for concurrency implemented in PostgreSQL was developed based on the paper "Access Methods for Next-Generation Database Systems" by *************** *** 31,36 **** Marcel Kornaker: --- 32,43 ---- http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz + The buffering build algorithm for GiST was developed based on the paper "Efficient + Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold + and Jeffrey Scott Vitter. + + http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf + The original algorithms were modified in several ways: * They had to be adapted to PostgreSQL conventions. For example, the SEARCH *************** *** 278,283 **** would complicate the insertion algorithm. So when an insertion sees a page --- 285,394 ---- with F_FOLLOW_RIGHT set, it immediately tries to bring the split that crashed in the middle to completion by adding the downlink in the parent. + Buffering build algorithm + -------------------- + + In the buffering build algorithm, levels are numbered upwards and the leaf page + level has number zero. An example is given in the picture below. Such numbering is + used in order to make pages keep their level number during their whole lifetime even if + the root splits. + + Level Tree + + 3 * + / \ + 2 * * + / | \ / | \ + 1 * * * * * * + / \ / \ / \ / \ / \ / \ + 0 o o o o o o o o o o o o + + * - internal page + o - leaf page + + In the buffering build algorithm, additional buffers are associated with pages. Leaf + pages never have buffers. 
Internal pages have buffers with some level step. + I.e. pages on levels level_step*i (i >= 1) have buffers. If level_step = 1 then + each internal page has a buffer. + + Level Tree (level_step = 1) Tree (level_step = 2) + + 3 *(b) * + / \ / \ + 2 *(b) *(b) *(b) *(b) + / | \ / | \ / | \ / | \ + 1 *(b) *(b) *(b) *(b) *(b) *(b) * * * * * * + / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ + 0 o o o o o o o o o o o o o o o o o o o o o o o o + + (b) - buffer + + Each buffer can be in one of the two following states: + 1) The last page of the buffer is kept in main memory. A node buffer + automatically switches to this state when a new index tuple is added to the buffer or an index + tuple is removed from it. + 2) All pages of the buffer are on disk. A node buffer automatically + switches to this state when a new emptying process starts. + + When an index tuple is inserted, its initial path can end at the following points: + 1) A leaf page, if no level has buffers, i.e. root level <= level_step. + 2) The buffer of the root page, if the root page has a buffer. + 3) The buffer of the topmost level page, if the root page doesn't have a buffer. + + New index tuples are processed until the root level buffer (or the buffer of the topmost + level page) becomes half filled. When some buffer becomes half filled, + its emptying process starts. + + The buffer emptying process means that index tuples from the buffer are moved into + underlying buffers (if any) or leaf pages. The buffer emptying process involves + the following: + 1) Pages of the emptying buffer are loaded one by one into main memory until + emptying stops. + 2) The last pages of the underlying buffers are loaded into main memory. + 3) The page associated with the buffer is cached. + 4) Pages between the buffer and the underlying buffers, if level_step != 1, are cached. 
+ (note that pages associated with underlying buffers aren't loaded) + For emptying to leaf pages the list of those items is the following: + 1) Pages of the emptying buffer are loaded one by one into main memory until + emptying stops. + 2) The page associated with the buffer is cached. + 3) Pages between the buffer and the leaf pages, if level_step != 1, are cached. + 4) Leaf pages are cached. + An illustration of these requirements is given below. + + Buffer emptying to other buffers Buffer emptying to leaf pages + + +(cb) +(cb) + / \ / \ + + + + + + / \ / \ / \ / \ + *(ab) *(ab) *(ab) *(ab) x x x x + + + - cached internal page + x - cached leaf page + * - non-cached internal page + (b) - buffer with loaded last page + + One buffer emptying process can trigger other buffer emptying processes. + The buffer emptying stack is the data structure responsible for the + sequence of buffer emptying. Each node buffer which is half filled should be + inserted into the buffer emptying stack. + + When we move from buffer emptying on a higher level to buffer emptying + on a lower level, the loaded part of the tree (only pages of the tree, not the buffers) + remains in main memory. The tree parts stack is the data structure which + represents the hierarchy of loaded tree parts. + + If a split occurs on a page which has a buffer, then index tuples are + relocated between the buffers of the pages produced by the + split, using the penalty method. + + When all index tuples are inserted, there are still some index tuples in the buffers. + At this moment the final buffer emptying starts. Each level has a list of non-empty + buffers. Final emptying contains a loop over all tree levels starting from the topmost. + On each level all its buffers are sequentially emptied until all buffers of + the level are empty. Since no index tuples move upwards during buffer emptying, + all the buffers are empty when final emptying is finished. 
Authors: Teodor Sigaev *** a/src/backend/access/gist/gist.c --- b/src/backend/access/gist/gist.c *************** *** 24,38 **** #include "utils/memutils.h" #include "utils/rel.h" - /* Working state for gistbuild and its callback */ - typedef struct - { - GISTSTATE giststate; - int numindexattrs; - double indtuples; - MemoryContext tmpCtx; - } GISTBuildState; - /* A List of these is used represent a split-in-progress. */ typedef struct { --- 24,29 ---- *************** *** 41,56 **** typedef struct } GISTPageSplitInfo; /* non-export function prototypes */ - static void gistbuildCallback(Relation index, - HeapTuple htup, - Datum *values, - bool *isnull, - bool tupleIsAlive, - void *state); - static void gistdoinsert(Relation r, - IndexTuple itup, - Size freespace, - GISTSTATE *GISTstate); static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate); static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, --- 32,37 ---- *************** *** 89,226 **** createTempGistContext(void) } /* - * Routine to build an index. Basically calls insert over and over. - * - * XXX: it would be nice to implement some sort of bulk-loading - * algorithm, but it is not clear how to do that. - */ - Datum - gistbuild(PG_FUNCTION_ARGS) - { - Relation heap = (Relation) PG_GETARG_POINTER(0); - Relation index = (Relation) PG_GETARG_POINTER(1); - IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - IndexBuildResult *result; - double reltuples; - GISTBuildState buildstate; - Buffer buffer; - Page page; - - /* - * We expect to be called exactly once for any index relation. If that's - * not the case, big trouble's what we have. 
- */ - if (RelationGetNumberOfBlocks(index) != 0) - elog(ERROR, "index \"%s\" already contains data", - RelationGetRelationName(index)); - - /* no locking is needed */ - initGISTstate(&buildstate.giststate, index); - - /* initialize the root page */ - buffer = gistNewBuffer(index); - Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); - page = BufferGetPage(buffer); - - START_CRIT_SECTION(); - - GISTInitBuffer(buffer, F_LEAF); - - MarkBufferDirty(buffer); - - if (RelationNeedsWAL(index)) - { - XLogRecPtr recptr; - XLogRecData rdata; - - rdata.data = (char *) &(index->rd_node); - rdata.len = sizeof(RelFileNode); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); - PageSetLSN(page, recptr); - PageSetTLI(page, ThisTimeLineID); - } - else - PageSetLSN(page, GetXLogRecPtrForTemp()); - - UnlockReleaseBuffer(buffer); - - END_CRIT_SECTION(); - - /* build the index */ - buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs; - buildstate.indtuples = 0; - - /* - * create a temporary memory context that is reset once for each tuple - * inserted into the index - */ - buildstate.tmpCtx = createTempGistContext(); - - /* do the heap scan */ - reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, - gistbuildCallback, (void *) &buildstate); - - /* okay, all heap tuples are indexed */ - MemoryContextDelete(buildstate.tmpCtx); - - freeGISTstate(&buildstate.giststate); - - /* - * Return statistics - */ - result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - - result->heap_tuples = reltuples; - result->index_tuples = buildstate.indtuples; - - PG_RETURN_POINTER(result); - } - - /* - * Per-tuple callback from IndexBuildHeapScan - */ - static void - gistbuildCallback(Relation index, - HeapTuple htup, - Datum *values, - bool *isnull, - bool tupleIsAlive, - void *state) - { - GISTBuildState *buildstate = (GISTBuildState *) state; - IndexTuple itup; - MemoryContext oldCtx; - - oldCtx = 
MemoryContextSwitchTo(buildstate->tmpCtx); - - /* form an index tuple and point it at the heap tuple */ - itup = gistFormTuple(&buildstate->giststate, index, - values, isnull, true /* size is currently bogus */ ); - itup->t_tid = htup->t_self; - - /* - * Since we already have the index relation locked, we call gistdoinsert - * directly. Normal access method calls dispatch through gistinsert, - * which locks the relation for write. This is the right thing to do if - * you're inserting single tups, but not when you're initializing the - * whole index at once. - * - * In this path we respect the fillfactor setting, whereas insertions - * after initial build do not. - */ - gistdoinsert(index, itup, - RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR), - &buildstate->giststate); - - buildstate->indtuples += 1; - MemoryContextSwitchTo(oldCtx); - MemoryContextReset(buildstate->tmpCtx); - } - - /* * gistbuildempty() -- build an empty gist index in the initialization fork */ Datum --- 70,75 ---- *************** *** 275,281 **** gistinsert(PG_FUNCTION_ARGS) PG_RETURN_BOOL(false); } - /* * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple * at that offset is atomically removed along with inserting the new tuples. --- 124,129 ---- *************** *** 508,514 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate, /* Write the WAL record */ if (RelationNeedsWAL(state->r)) recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf, ! dist, oldrlink, oldnsn, leftchildbuf); else recptr = GetXLogRecPtrForTemp(); --- 356,362 ---- /* Write the WAL record */ if (RelationNeedsWAL(state->r)) recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf, ! 
dist, oldrlink, oldnsn, leftchildbuf, false); else recptr = GetXLogRecPtrForTemp(); *************** *** 608,614 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate, * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ ! static void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; --- 456,462 ---- * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ ! void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; *************** *** 917,924 **** gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) { /* * Page was split while we looked elsewhere. We didn't see the ! * downlink to the right page when we scanned the parent, so ! * add it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal --- 765,772 ---- { /* * Page was split while we looked elsewhere. We didn't see the ! * downlink to the right page when we scanned the parent, so add ! * it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal *************** *** 965,971 **** gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); ! return NULL; /* keep compiler quiet */ } /* --- 813,819 ---- elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); ! 
return NULL; /* keep compiler quiet */ } /* *************** *** 1414,1419 **** initGISTstate(GISTSTATE *giststate, Relation index) --- 1262,1268 ---- else giststate->supportCollation[i] = DEFAULT_COLLATION_OID; } + giststate->gfbb = NULL; } void *** /dev/null --- b/src/backend/access/gist/gistbuild.c *************** *** 0 **** --- 1,1091 ---- + /*------------------------------------------------------------------------- + * + * gistbuild.c + * build algorithm for GiST indexes implementation. + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistbuild.c + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "access/genam.h" + #include "access/gist_private.h" + #include "catalog/index.h" + #include "catalog/pg_collation.h" + #include "miscadmin.h" + #include "optimizer/cost.h" + #include "storage/bufmgr.h" + #include "storage/indexfsm.h" + #include "storage/smgr.h" + #include "utils/memutils.h" + #include "utils/rel.h" + + #define LEAF_PAGES_STATS_STEP 128 + #define LEAF_PAGES_STATS_COUNT 16 + + + /* Working state for gistbuild and its callback */ + typedef struct + { + GISTSTATE giststate; + int numindexattrs; + int64 indtuples; + int64 indtuplesSize; + + /* + * Buffering build mode. Possible values: 'y' - we are in buffering build + * mode. 'a' - we are now in regular build mode, but can switch to + * buffering build mode when we decide to. 'n' - we are in regular build + * mode and aren't going to switch. 
+ */ + char bufferingMode; + MemoryContext tmpCtx; + } GISTBuildState; + + static void freeUnreferencedPath(GISTBufferingInsertStack * path); + static bool gistBufferingBuildPlaceToPage(GISTInsertState *state, + GISTBufferingInsertStack * path, + GISTSTATE *giststate, + IndexTuple *itup, int ntup, + OffsetNumber oldoffnum); + static void processItup(GISTSTATE *giststate, GISTInsertState *state, + GISTBuildBuffers * gfbb, IndexTuple itup, + GISTBufferingInsertStack * startparent); + static void gistFindCorrectParent(GISTSTATE *giststate, Relation r, + GISTBufferingInsertStack * child); + static void processEmptyingStack(GISTSTATE *giststate, GISTInsertState *state); + static void gistBufferingBuildInsert(Relation index, IndexTuple itup, + GISTBuildState *buildstate); + static void gistBuildCallback(Relation index, + HeapTuple htup, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state); + static int getMaxLevel(Relation index); + static bool initBuffering(GISTBuildState *buildstate, Relation index); + + /* + * Free unreferenced parts of path; + */ + static void + freeUnreferencedPath(GISTBufferingInsertStack * path) + { + while (path->refCount == 0) + { + /* + * Path part is unreferenced. We can free it and decrease reference + * count of parent. If parent becomes unreferenced too procedure + * should be repeated for it. + */ + GISTBufferingInsertStack *tmp = path->parent; + + pfree(path); + path = tmp; + if (path) + path->refCount--; + else + break; + } + } + + /* + * Decrease reference count of path part and remove unreferenced path parts if + * any. + */ + void + decreasePathRefcount(GISTBufferingInsertStack * path) + { + path->refCount--; + freeUnreferencedPath(path); + } + + /* + * Index tuple insert function of buffering build algorithm. In simpler than + * regular insert function in the fact that it don't takes care about + * concurrency. It invokes buffer relocation function when it splits page. 
Also + * it take several oldoffnums as a parameter because buffer relocation can alter + * a number of parent index tuples. + */ + static bool + gistBufferingBuildPlaceToPage(GISTInsertState *state, + GISTBufferingInsertStack * path, + GISTSTATE *giststate, + IndexTuple *itup, int ntup, + OffsetNumber oldoffnum) + { + GISTBuildBuffers *gfbb = giststate->gfbb; + Buffer buffer = ReadBuffer(state->r, path->blkno); + Page page = (Page) BufferGetPage(buffer); + bool is_leaf = (GistPageIsLeaf(page)) ? true : false; + int i; + bool is_split; + XLogRecPtr recptr; + + LockBuffer(buffer, GIST_EXCLUSIVE); + + /* Remove old index tuple if needed */ + if (OffsetNumberIsValid(oldoffnum)) + PageIndexTupleDelete(page, oldoffnum); + + /* Check if there is enough space for insertion */ + is_split = gistnospace(page, itup, ntup, + InvalidOffsetNumber, state->freespace); + + if (is_split) + { + /* no space for insertion */ + IndexTuple *itvec; + int tlen; + SplitedPageLayout *dist = NULL, + *ptr; + SplitedPageLayout rootpg; + BlockNumber blkno = BufferGetBlockNumber(buffer); + BlockNumber oldrlink = InvalidBlockNumber; + bool is_rootsplit; + XLogRecPtr recptr; + GistNSN oldnsn = {0, 0}; + + is_rootsplit = (blkno == GIST_ROOT_BLKNO); + + /* + * Form index tuples vector to split. Old tuple was already removed + * from the vector. + */ + itvec = gistextractpage(page, &tlen); + itvec = gistjoinvector(itvec, &tlen, itup, ntup); + dist = gistSplit(state->r, page, itvec, tlen, giststate); + + /* + * Set up pages to work with. Allocate new buffers for all but the + * leftmost page. The original page becomes the new leftmost page, and + * is just replaced with the new contents. + * + * For a root-split, allocate new buffers for all child pages, the + * original page is overwritten with new root page containing + * downlinks to the new child pages. 
+ */ + ptr = dist; + if (!is_rootsplit) + { + oldnsn = GistPageGetOpaque(page)->nsn; + oldrlink = GistPageGetOpaque(page)->rightlink; + dist->buffer = buffer; + dist->block.blkno = BufferGetBlockNumber(buffer); + dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); + + /* clean all flags except F_LEAF */ + GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; + + ptr = ptr->next; + } + for (; ptr; ptr = ptr->next) + { + /* Allocate new page */ + ptr->buffer = gistNewBuffer(state->r); + GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); + ptr->page = BufferGetPage(ptr->buffer); + ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); + } + + /* + * Now that we know which blocks the new pages go to, set up downlink + * tuples to point to them. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); + GistTupleSetValid(ptr->itup); + } + + { + StringInfoData sb; + + initStringInfo(&sb); + for (ptr = dist; ptr; ptr = ptr->next) + appendStringInfo(&sb, "%u ", ptr->block.blkno); + } + + if (is_rootsplit) + { + /* + * Adjust the top element in the insert stacks for the new root + * page. + */ + GISTBufferingInsertStack *oldroot = gfbb->rootitem; + + gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc( + gfbb->context, sizeof(GISTBufferingInsertStack)); + gfbb->rootitem->parent = NULL; + gfbb->rootitem->blkno = GIST_ROOT_BLKNO; + gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber; + gfbb->rootitem->level = oldroot->level + 1; + gfbb->rootitem->refCount = 1; + + oldroot->parent = gfbb->rootitem; + oldroot->blkno = dist->block.blkno; + oldroot->downlinkoffnum = InvalidOffsetNumber; + } + + /* + * Parent may be changed from the moment we set it. So, let us adjust + * the parent. + */ + if (!is_rootsplit) + gistFindCorrectParent(giststate, state->r, path); + + /* + * Relocate index tuples from buffer of splitted page between buffers + * of the pages produced by split. 
+ */ + relocateBuildBuffersOnSplit(giststate->gfbb, giststate, state->r, + path, buffer, dist); + + /* + * If this is a root split, we construct the new root page with the + * downlinks here directly, instead of do recursive call for their + * insertion. Add the new root page to the list along with the child + * pages. + */ + if (is_rootsplit) + { + IndexTuple *downlinks; + int ndownlinks = 0; + int i; + + rootpg.buffer = buffer; + rootpg.page = PageGetTempPageCopySpecial( + BufferGetPage(rootpg.buffer)); + GistPageGetOpaque(rootpg.page)->flags = 0; + + /* Prepare a vector of all the downlinks */ + for (ptr = dist; ptr; ptr = ptr->next) + ndownlinks++; + downlinks = palloc(sizeof(IndexTuple) * ndownlinks); + for (i = 0, ptr = dist; ptr; ptr = ptr->next) + downlinks[i++] = ptr->itup; + + rootpg.block.blkno = GIST_ROOT_BLKNO; + rootpg.block.num = ndownlinks; + rootpg.list = gistfillitupvec(downlinks, ndownlinks, + &(rootpg.lenlist)); + rootpg.itup = NULL; + + rootpg.next = dist; + dist = &rootpg; + } + + /* + * Fill all pages. All the pages are new, ie. freshly allocated empty + * pages, or a temporary copy of the old page. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + { + char *data = (char *) (ptr->list); + + for (i = 0; i < ptr->block.num; i++) + { + if (PageAddItem(ptr->page, + (Item) data, IndexTupleSize((IndexTuple) data), + i + FirstOffsetNumber, false, false) == + InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(state->r)); + data += IndexTupleSize((IndexTuple) data); + } + + /* Set up rightlinks */ + if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) + GistPageGetOpaque(ptr->page)->rightlink = + ptr->next->block.blkno; + else + GistPageGetOpaque(ptr->page)->rightlink = oldrlink; + } + + /* Mark buffers dirty */ + for (ptr = dist; ptr; ptr = ptr->next) + MarkBufferDirty(ptr->buffer); + + PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); + dist->page = BufferGetPage(dist->buffer); + + /* TODO: Write the WAL record */ + if (RelationNeedsWAL(state->r)) + recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf, + dist, oldrlink, oldnsn, InvalidBuffer, true); + else + recptr = GetXLogRecPtrForTemp(); + + for (ptr = dist; ptr; ptr = ptr->next) + { + PageSetLSN(ptr->page, recptr); + PageSetTLI(ptr->page, ThisTimeLineID); + } + + /* Release buffers, except the one holding the inserted/updated tuple */ + for (ptr = dist; ptr; ptr = ptr->next) + { + if (BufferIsValid(ptr->buffer)) + UnlockReleaseBuffer(ptr->buffer); + } + + /* + * If it wasn't root split, we have to insert downlinks to parent + * page. + */ + if (!is_rootsplit) + { + IndexTuple *itups; + int cnt = 0, + i; + + /* + * Count number of downlinks for insert. 
+ */ + for (ptr = dist; ptr; ptr = ptr->next) + { + cnt++; + } + + /* Allocate array of new index tuples and array of offset numbers */ + itups = (IndexTuple *) palloc(sizeof(IndexTuple) * cnt); + + /* Fill those arrays */ + i = 0; + for (ptr = dist; ptr; ptr = ptr->next) + { + itups[i] = ptr->itup; + i++; + } + + gistBufferingBuildPlaceToPage(state, path->parent, giststate, itups, + cnt, path->downlinkoffnum); + } + } + else + { + /* + * Enough of space. Just insert index tuples to the page. + */ + gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); + MarkBufferDirty(buffer); + + /* Write the WAL record */ + if (RelationNeedsWAL(state->r)) + { + recptr = gistXLogUpdate(state->r->rd_node, buffer, &oldoffnum, + OffsetNumberIsValid(oldoffnum) ? 1 : 0, itup, ntup, InvalidBuffer); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + else + { + recptr = GetXLogRecPtrForTemp(); + PageSetLSN(page, recptr); + } + UnlockReleaseBuffer(buffer); + } + + return is_split; + } + + /* + * Process index tuple. Run index tuple down until it meet leaf page or + * node buffer. + */ + static void + processItup(GISTSTATE *giststate, GISTInsertState *state, + GISTBuildBuffers * gfbb, IndexTuple itup, + GISTBufferingInsertStack * startparent) + { + GISTBufferingInsertStack *path; + BlockNumber childblkno; + Buffer buffer; + + /* + * NULL in startparent means that we start index tuple processing from the + * root. + */ + if (!startparent) + path = gfbb->rootitem; + else + path = startparent; + + /* + * Loop until we are on leaf page (level == 0) or we reach level with + * buffers (if it wasn't level that we've start at). + */ + for (;;) + { + ItemId iid; + IndexTuple idxtuple, + newtup; + Page page; + OffsetNumber childoffnum; + GISTBufferingInsertStack *parent; + + /* + * Do we meet a level with buffers? Surely buffer of page we start + * from doesn't matter. + */ + if (path != startparent && LEVEL_HAS_BUFFERS(path->level, gfbb)) + break; + + /* Do we meet leaf page? 
*/ + if (path->level == 0) + break; + + /* Choose child for insertion */ + buffer = ReadBuffer(state->r, path->blkno); + LockBuffer(buffer, GIST_EXCLUSIVE); + + page = (Page) BufferGetPage(buffer); + childoffnum = gistchoose(state->r, page, itup, giststate); + iid = PageGetItemId(page, childoffnum); + idxtuple = (IndexTuple) PageGetItem(page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + /* Adjust key representing child if needed */ + newtup = gistgetadjusted(state->r, idxtuple, itup, giststate); + + UnlockReleaseBuffer(buffer); + + if (newtup) + gistBufferingBuildPlaceToPage(state, path, giststate, + &newtup, 1, childoffnum); + + /* Create new path item representing current page */ + parent = path; + path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context, + sizeof(GISTBufferingInsertStack)); + path->parent = parent; + path->level = parent->level - 1; + path->blkno = childblkno; + path->downlinkoffnum = childoffnum; + + /* It's unreferenced just now */ + path->refCount = 0; + + /* Adjust reference count of parent */ + if (parent) + parent->refCount++; + } + + if (LEVEL_HAS_BUFFERS(path->level, gfbb)) + { + /* + * We've reached level with buffers. Now place index tuple to the + * buffer and add buffer emptying stack element if buffer overflows. + */ + bool wasOverflowed; + NodeBuffer *childNodeBuffer; + + childNodeBuffer = getNodeBuffer(gfbb, giststate, path->blkno, + path->downlinkoffnum, path->parent, + true); + wasOverflowed = BUFFER_IS_OVERLOW(childNodeBuffer, gfbb); + pushItupToNodeBuffer(gfbb, childNodeBuffer, itup); + if (!wasOverflowed && BUFFER_IS_OVERLOW(childNodeBuffer, gfbb)) + { + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + gfbb->bufferEmptyingStack = lcons(childNodeBuffer, gfbb->bufferEmptyingStack); + MemoryContextSwitchTo(oldcxt); + } + } + else + { + /* + * We've reached leaf level. So, place index tuple here. 
+ */ + gistBufferingBuildPlaceToPage(state, path, giststate, &itup, 1, InvalidOffsetNumber); + } + + /* + * Free unreferenced path items if any. Path item may be referenced by + * node buffer. + */ + freeUnreferencedPath(path); + } + + + /* + * Find correct parent using rightlinks in buffering index build. + */ + static void + gistFindCorrectParent(GISTSTATE *giststate, Relation r, GISTBufferingInsertStack * child) + { + GISTBuildBuffers *gfbb = giststate->gfbb; + GISTBufferingInsertStack *parent = child->parent; + OffsetNumber i, + maxoff; + ItemId iid; + IndexTuple idxtuple; + Buffer buffer; + Page page; + bool copied = false; + + buffer = ReadBuffer(r, parent->blkno); + page = BufferGetPage(buffer); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistcheckpage(r, buffer); + + /* Check if it has not moved */ + if (child->downlinkoffnum != InvalidOffsetNumber) + { + iid = PageGetItemId(page, child->downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + { + /* Still there */ + UnlockReleaseBuffer(buffer); + return; + } + } + + /* parent is changed, look child in right links until found */ + while (true) + { + maxoff = PageGetMaxOffsetNumber(page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + { + /* yes!!, found */ + child->downlinkoffnum = i; + UnlockReleaseBuffer(buffer); + return; + } + } + + /* + * We should copy parent path item because some other path items can + * refer to it. 
+ */ + if (!copied) + { + parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context, + sizeof(GISTBufferingInsertStack)); + memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack)); + if (parent->parent) + parent->parent->refCount++; + decreasePathRefcount(child->parent); + child->parent = parent; + parent->refCount = 1; + copied = true; + } + + parent->blkno = GistPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); + + if (parent->blkno == InvalidBlockNumber) + { + /* + * End of chain and still didn't find parent. Should not happen + * during index build. + */ + break; + } + + /* Next page */ + buffer = ReadBuffer(r, parent->blkno); + page = BufferGetPage(buffer); + LockBuffer(buffer, GIST_EXCLUSIVE); + gistcheckpage(r, buffer); + } + + elog(ERROR, "failed to re-find parent for block %u", child->blkno); + } + + /* + * Process buffers emptying stack. Emptying of one buffer can cause emptying + * of other buffers. This function iterates until this cascading emptying + * process finished, e.g. until buffers emptying stack is empty. + */ + static void + processEmptyingStack(GISTSTATE *giststate, GISTInsertState *state) + { + GISTBuildBuffers *gfbb = giststate->gfbb; + int requiredSizeDecrease = gfbb->pagesPerBuffer * BLCKSZ; + + /* Iterate while we have elements in buffers emptying stack. */ + while (gfbb->bufferEmptyingStack != NIL) + { + int initialBusySize, + busySize; + NodeBuffer *emptyingNodeBuffer; + + /* Get node buffer from emptying stack. */ + emptyingNodeBuffer = (NodeBuffer *) linitial(gfbb->bufferEmptyingStack); + gfbb->bufferEmptyingStack = list_delete_first(gfbb->bufferEmptyingStack); + + /* + * We are going to load last pages of buffers where emptying will be + * to. So let's unload any previously loaded buffers. + */ + unloadNodeBuffers(gfbb); + + /* + * We'are going to empty buffer at half of maximal size. So let's + * memorise initial busy size. 
+ */ + initialBusySize = getNodeBufferBusySize(gfbb, emptyingNodeBuffer); + + /* Variables for split of current emptying buffer detection. */ + gfbb->currentEmptyingBufferSplit = false; + gfbb->currentEmptyingBufferBlockNumber = emptyingNodeBuffer->nodeBlocknum; + + while (true) + { + IndexTuple itup; + + /* Get one index tuple from node buffer and process it */ + if (!popItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup)) + break; + processItup(giststate, state, gfbb, itup, + emptyingNodeBuffer->path); + + /* Free all the memory allocated during index tuple processing */ + MemoryContextReset(CurrentMemoryContext); + + /* + * If current emptying node buffer split we should stop emptying + * just because there is no such node buffer anymore. + */ + if (gfbb->currentEmptyingBufferSplit) + break; + + busySize = getNodeBufferBusySize(gfbb, emptyingNodeBuffer); + + /* + * If we've processed half of buffer size limit and buffer is not + * overflowed anymore we should stop in order to avoid exceeding + * of limit in lower buffers. + */ + if (initialBusySize - busySize >= requiredSizeDecrease && + !BUFFER_IS_OVERLOW(emptyingNodeBuffer, gfbb)) + break; + } + } + } + + /* + * Insert function for buffering index build. 
+ */ + static void + gistBufferingBuildInsert(Relation index, IndexTuple itup, + GISTBuildState *buildstate) + { + GISTBuildBuffers *gfbb = buildstate->giststate.gfbb; + GISTInsertState insertstate; + + memset(&insertstate, 0, sizeof(GISTInsertState)); + insertstate.freespace = RelationGetTargetPageFreeSpace(index, + GIST_DEFAULT_FILLFACTOR); + insertstate.r = index; + + /* We are ready for index tuple processing */ + processItup(&buildstate->giststate, &insertstate, gfbb, itup, NULL); + + /* Process buffer emptying stack if any */ + processEmptyingStack(&buildstate->giststate, &insertstate); + } + + /* + * Per-tuple callback from IndexBuildHeapScan + */ + static void + gistBuildCallback(Relation index, + HeapTuple htup, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) + { + GISTBuildState *buildstate = (GISTBuildState *) state; + IndexTuple itup; + MemoryContext oldCtx; + + oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + /* form an index tuple and point it at the heap tuple */ + itup = gistFormTuple(&buildstate->giststate, index, + values, isnull, true /* size is currently bogus */ ); + itup->t_tid = htup->t_self; + + if (buildstate->bufferingMode == 'y') + { + /* We've decided to use buffering. So let's use buffering insert. */ + gistBufferingBuildInsert(index, itup, buildstate); + } + else + { + /* + * We didn't decide to use buffering yet or aren't goint to use it at + * all. Since we already have the index relation locked, we call + * gistdoinsert directly. Normal access method calls dispatch through + * gistinsert, which locks the relation for write. This is the right + * thing to do if you're inserting single tups, but not when you're + * initializing the whole index at once. + * + * In this path we respect the fillfactor setting, whereas insertions + * after initial build do not. 
+ */ + gistdoinsert(index, itup, + RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR), + &buildstate->giststate); + } + + buildstate->indtuples += 1; + buildstate->indtuplesSize += IndexTupleSize(itup); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); + + /* + * For automatic switching to buffering mode, check whether index fits to + * effective cache. + */ + if (buildstate->bufferingMode == 'a' && + effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) + { + /* + * Index doesn't fit to effective cache anymore. Trying to switch to + * buffering build mode. + */ + if (initBuffering(buildstate, index)) + { + /* + * Buffering build is successfully initialized. Now we can set + * appropriate flag. + */ + buildstate->bufferingMode = 'y'; + elog(INFO, "switching to buffered mode"); + } + else + { + /* + * Failed to switch to buffering build due to not enough memory + * settings. Mark that we aren't going to switch anymore. + */ + buildstate->bufferingMode = 'n'; + } + } + } + + /* + * Get maximum level number of GiST index. + */ + static int + getMaxLevel(Relation index) + { + int maxLevel = 0; + BlockNumber blkno = GIST_ROOT_BLKNO; + + while (true) + { + Buffer buffer; + Page page; + IndexTuple itup; + + buffer = ReadBuffer(index, blkno); + page = (Page) BufferGetPage(buffer); + if (GistPageIsLeaf(page)) + { + /* Page is leaf. We've counted height of tree. */ + ReleaseBuffer(buffer); + break; + } + /* Page is not leaf. Iterate to underlying page. */ + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + ReleaseBuffer(buffer); + maxLevel++; + } + return maxLevel; + } + + /* + * Initial calculations for GiST buffering build. 
+ */ + static bool + initBuffering(GISTBuildState *buildstate, Relation index) + { + int pagesPerBuffer = -1; + Size pageFreeSpace; + Size itupAvgSize; + int i, + maxIndexTuplesCount; + int effectiveMemory; + int levelStep = 0; + GISTBuildBuffers *gfbb; + + /* try to get user difened options */ + if (index->rd_options) + { + GiSTOptions *options = (GiSTOptions *) index->rd_options; + + levelStep = options->levelStep; + pagesPerBuffer = options->bufferSize; + } + + /* calc number of index tuples which fit in page */ + pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - + sizeof(GISTPageOpaqueData) - sizeof(ItemIdData); + + if (buildstate->indtuples > 0) + { + itupAvgSize = round((double) buildstate->indtuplesSize / + (double) buildstate->indtuples); + } + else + { + itupAvgSize = (Size) MAXALIGN(sizeof(IndexTupleData)); + for (i = 0; i < index->rd_att->natts; i++) + { + if (index->rd_att->attrs[i]->attlen < 0) + itupAvgSize += VARHDRSZ; + else + itupAvgSize += index->rd_att->attrs[i]->attlen; + } + } + maxIndexTuplesCount = pageFreeSpace / itupAvgSize; + + /* calculate level step if it isn't specified by user */ + effectiveMemory = Min(maintenance_work_mem * 1024 / BLCKSZ, + effective_cache_size); + if (levelStep <= 0) + { + levelStep = (int) log((double) effectiveMemory / 4.0) / + log((double) maxIndexTuplesCount); + if (levelStep < 0) + levelStep = 0; + } + + /* calculate buffer size if it isn't specified by user */ + if (pagesPerBuffer <= 0) + { + pagesPerBuffer = 2; + for (i = 0; i < levelStep; i++) + { + pagesPerBuffer *= maxIndexTuplesCount; + } + } + + if (levelStep > 0) + { + /* Enough of memory for at least level_step == 1. */ + gfbb = palloc(sizeof(GISTBuildBuffers)); + gfbb->pagesPerBuffer = pagesPerBuffer; + gfbb->levelStep = levelStep; + initGiSTBuildBuffers(gfbb, buildstate->indtuples > 0 ? 
getMaxLevel(index) : 0); + buildstate->giststate.gfbb = gfbb; + elog(INFO, "Level step = %d, pagesPerBuffer = %d", + levelStep, pagesPerBuffer); + return true; + } + else + { + /* Not enough memory for buffering build. */ + return false; + } + } + + /* + * Routine to build an index. Basically calls insert over and over. + * + * XXX: it would be nice to implement some sort of bulk-loading + * algorithm, but it is not clear how to do that. + */ + Datum + gistbuild(PG_FUNCTION_ARGS) + { + Relation heap = (Relation) PG_GETARG_POINTER(0); + Relation index = (Relation) PG_GETARG_POINTER(1); + IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); + IndexBuildResult *result; + double reltuples; + GISTBuildState buildstate; + Buffer buffer; + Page page; + MemoryContext oldcxt = CurrentMemoryContext; + + if (index->rd_options) + { + /* Get buffering mode from the options string */ + GiSTOptions *options = (GiSTOptions *) index->rd_options; + char *bufferingMode = (char *) options + options->bufferingModeOffset; + + if (!strcmp(bufferingMode, "on")) + buildstate.bufferingMode = 'y'; + else if (!strcmp(bufferingMode, "off")) + buildstate.bufferingMode = 'n'; + else + buildstate.bufferingMode = 'a'; + } + else + { + /* Automatic buffering mode switching by default */ + buildstate.bufferingMode = 'a'; + } + + /* + * We expect to be called exactly once for any index relation. If that's + * not the case, big trouble's what we have. 
+ */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* no locking is needed */ + initGISTstate(&buildstate.giststate, index); + + /* initialize the root page */ + buffer = gistNewBuffer(index); + Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); + page = BufferGetPage(buffer); + + START_CRIT_SECTION(); + + GISTInitBuffer(buffer, F_LEAF); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + XLogRecData rdata; + + rdata.data = (char *) &(index->rd_node); + rdata.len = sizeof(RelFileNode); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + else + PageSetLSN(page, GetXLogRecPtrForTemp()); + + UnlockReleaseBuffer(buffer); + + END_CRIT_SECTION(); + + /* build the index */ + buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs; + buildstate.indtuples = 0; + buildstate.indtuplesSize = 0; + + /* + * create a temporary memory context that is reset once for each tuple + * inserted into the index + */ + buildstate.tmpCtx = createTempGistContext(); + + if (buildstate.bufferingMode == 'y') + { + if (!initBuffering(&buildstate, index)) + buildstate.bufferingMode = 'n'; + } + + /* + * Do the heap scan. + */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, + gistBuildCallback, (void *) &buildstate); + + /* + * If buffering build do final node buffers emptying. 
+ */ + if (buildstate.bufferingMode == 'y') + { + int i; + GISTInsertState insertstate; + NodeBuffer *nodeBuffer; + MemoryContext oldCtx; + GISTBuildBuffers *gfbb = buildstate.giststate.gfbb; + + oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); + + memset(&insertstate, 0, sizeof(GISTInsertState)); + insertstate.freespace = RelationGetTargetPageFreeSpace(index, + GIST_DEFAULT_FILLFACTOR); + insertstate.r = index; + + for (;;) + { + nodeBuffer = getNodeBuffer(gfbb, &buildstate.giststate, GIST_ROOT_BLKNO, + InvalidOffsetNumber, NULL, false); + if (!nodeBuffer || nodeBuffer->blocksCount <= 0) + break; + MemoryContextSwitchTo(gfbb->context); + gfbb->bufferEmptyingStack = lcons(nodeBuffer, gfbb->bufferEmptyingStack); + MemoryContextSwitchTo(buildstate.tmpCtx); + processEmptyingStack(&buildstate.giststate, &insertstate); + } + + /* + * Iterate through the levels from the most higher. + */ + for (i = gfbb->buffersOnLevelsCount - 1; i >= 0; i--) + { + bool nonEmpty = true; + + /* + * Until we have non-empty node buffers on the level, iterate over + * them and initial emptying of non-empty ones. + */ + while (nonEmpty) + { + ListCell *p; + + nonEmpty = false; + + for (p = list_head(gfbb->buffersOnLevels[i]); p; p = p->next) + { + /* Get next node buffer */ + nodeBuffer = (NodeBuffer *) p->data.ptr_value; + + /* Skip empty node buffer */ + if (nodeBuffer->blocksCount == 0) + continue; + + /* Memorize that we met non-empty buffer. 
*/ + nonEmpty = true; + + /* Process emptying of node buffer */ + MemoryContextSwitchTo(gfbb->context); + gfbb->bufferEmptyingStack = lcons(nodeBuffer, gfbb->bufferEmptyingStack); + MemoryContextSwitchTo(buildstate.tmpCtx); + processEmptyingStack(&buildstate.giststate, &insertstate); + } + } + } + MemoryContextSwitchTo(oldCtx); + } + + /* okay, all heap tuples are indexed */ + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(buildstate.tmpCtx); + + freeGISTstate(&buildstate.giststate); + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = (double) buildstate.indtuples; + + PG_RETURN_POINTER(result); + } *** /dev/null --- b/src/backend/access/gist/gistbuildbuffers.c *************** *** 0 **** --- 1,884 ---- + + /*------------------------------------------------------------------------- + * + * gistbuildbuffers.c + * buffers management functions for GiST buffering build algorithm. 
+ * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gist/gistbuildbuffers.c + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "access/genam.h" + #include "access/gist_private.h" + #include "catalog/index.h" + #include "catalog/pg_collation.h" + #include "miscadmin.h" + #include "storage/bufmgr.h" + #include "storage/indexfsm.h" + #include "storage/buffile.h" + #include "utils/memutils.h" + #include "utils/rel.h" + + static NodeBufferPage *allocateNewPageBuffer(GISTBuildBuffers * gfbb); + static void addLoadedBuffer(GISTBuildBuffers * gfbb, BlockNumber blocknum); + static void loadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer); + static void unloadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer); + static void placeItupToPage(NodeBufferPage * pageBuffer, IndexTuple item); + static void getItupFromPage(NodeBufferPage * pageBuffer, IndexTuple *item); + static int freeBlocks_cmp(const void *a, const void *b); + static long getFreeBlock(GISTBuildBuffers * gfbb); + static void releaseBlock(GISTBuildBuffers * gfbb, long blocknum); + + /* + * Initialize GiST buffering build structure. + */ + void + initGiSTBuildBuffers(GISTBuildBuffers * gfbb, int maxLevel) + { + HASHCTL hashCtl; + + /* + * Create temporary file initialize data structures for free pages + * management. + */ + gfbb->pfile = BufFileCreateTemp(true); + gfbb->nFileBlocks = 0; + gfbb->nFreeBlocks = 0; + gfbb->blocksSorted = false; + gfbb->freeBlocksLen = 32; + gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); + + /* + * Current memory context will be used for all in-memory data structures + * of buffers. + */ + gfbb->context = CurrentMemoryContext; + + /* + * nodeBuffersTab hash is association between index blocks and it's + * buffer. 
+ */ + hashCtl.keysize = sizeof(BlockNumber); + hashCtl.entrysize = sizeof(NodeBuffer); + hashCtl.hcxt = CurrentMemoryContext; + hashCtl.hash = tag_hash; + hashCtl.match = memcmp; + gfbb->nodeBuffersTab = hash_create("gistbuildbuffers", + 1024, + &hashCtl, + HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION + | HASH_COMPARE); + + /* + * Stack of node buffers which was planned for emptying. + */ + gfbb->bufferEmptyingStack = NIL; + + gfbb->currentEmptyingBufferBlockNumber = InvalidBlockNumber; + gfbb->currentEmptyingBufferSplit = false; + + /* + * Per-level node buffers lists for final buffers emptying process. + */ + gfbb->buffersOnLevelsLen = 16; + gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * + gfbb->buffersOnLevelsLen); + gfbb->buffersOnLevelsCount = 0; + + /* + * "Prepared" blocks are index blocks which buffers are currently loaded + * into memory + */ + gfbb->loadedBuffersLen = 32; + gfbb->loadedBuffers = (BlockNumber *) palloc(gfbb->loadedBuffersLen * + sizeof(BlockNumber)); + gfbb->loadedBuffersCount = 0; + + gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context, + sizeof + (GISTBufferingInsertStack)); + gfbb->rootitem->parent = NULL; + gfbb->rootitem->blkno = GIST_ROOT_BLKNO; + gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber; + gfbb->rootitem->level = maxLevel; + } + + /* + * Return NodeBuffer structure by it's block number. If createNew flag is + * specified then new NodeBuffer structure will be created on it's absence. + */ + NodeBuffer * + getNodeBuffer(GISTBuildBuffers * gfbb, GISTSTATE *giststate, + BlockNumber nodeBlocknum, + OffsetNumber downlinkoffnum, + GISTBufferingInsertStack * parent, bool createNew) + { + NodeBuffer *nodeBuffer; + bool found; + + /* + * Find nodeBuffer in hash table + */ + nodeBuffer = (NodeBuffer *) hash_search(gfbb->nodeBuffersTab, + (const void *) &nodeBlocknum, + createNew ? 
HASH_ENTER : HASH_FIND, + &found); + if (!found) + { + GISTBufferingInsertStack *path; + int levelIndex; + int i; + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + /* + * Node buffer wasn't found. Create new if required. + */ + if (!createNew) + return NULL; + + if (nodeBlocknum != GIST_ROOT_BLKNO) + { + path = (GISTBufferingInsertStack *) palloc(sizeof(GISTBufferingInsertStack)); + path->parent = parent; + path->blkno = nodeBlocknum; + path->downlinkoffnum = downlinkoffnum; + path->level = parent->level - 1; + path->refCount = 0; + parent->refCount++; + Assert(path->level > 0); + } + else + { + path = gfbb->rootitem; + } + + path->refCount++; + + /* + * New node buffer. Fill data structure with default values. + */ + nodeBuffer->pageBuffer = NULL; + nodeBuffer->blocksCount = 0; + nodeBuffer->level = path->level; + nodeBuffer->path = path; + + /* + * Maintain level lists of buffers. We don't put root node buffer to + * that lists because it can disappear on node split. And we evade + * removing from that lists due to possible scan by final emptying. + * Also, it order to evade removing split old NodeBuffer structure is + * reused on that non-root node buffer split. + */ + if (nodeBlocknum != GIST_ROOT_BLKNO) + { + /* Calc index of node buffer level. */ + levelIndex = (nodeBuffer->level - gfbb->levelStep) / gfbb->levelStep; + + /* + * Probably, we should increase number of allocated buffers lists. + */ + while (levelIndex >= gfbb->buffersOnLevelsLen) + { + gfbb->buffersOnLevelsLen *= 2; + gfbb->buffersOnLevels = + (List **) repalloc(gfbb->buffersOnLevels, + gfbb->buffersOnLevelsLen * + sizeof(List *)); + } + + /* Initialize new buffers lists as empty. 
*/ + if (levelIndex >= gfbb->buffersOnLevelsCount) + { + for (i = gfbb->buffersOnLevelsCount; i <= levelIndex; i++) + { + gfbb->buffersOnLevels[i] = NIL; + } + gfbb->buffersOnLevelsCount = levelIndex + 1; + } + + /* Add node buffer to the corresponding list */ + gfbb->buffersOnLevels[levelIndex] = lcons( + nodeBuffer, gfbb->buffersOnLevels[levelIndex]); + } + MemoryContextSwitchTo(oldcxt); + } + else + { + if (parent != nodeBuffer->path->parent) + { + /* + * Other parent path item was provided than we've remembered. We + * trust caller to provide more correct parent than we have. + */ + decreasePathRefcount(nodeBuffer->path->parent); + nodeBuffer->path->parent = parent; + parent->refCount++; + } + } + + return nodeBuffer; + } + + /* + * Allocate memory for buffer page. + */ + static NodeBufferPage * + allocateNewPageBuffer(GISTBuildBuffers * gfbb) + { + NodeBufferPage *pageBuffer; + + /* + * Allocate memory for page in appropriate context. + */ + pageBuffer = (NodeBufferPage *) MemoryContextAlloc(gfbb->context, BLCKSZ); + + /* + * Set page free space + */ + PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; + return pageBuffer; + } + + /* + * Add specified block number into preparedBlocks array. + */ + static void + addLoadedBuffer(GISTBuildBuffers * gfbb, BlockNumber blocknum) + { + if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) + { + /* + * not enough memory is currently allocated + */ + gfbb->loadedBuffersLen *= 2; + gfbb->loadedBuffers = (BlockNumber *) repalloc(gfbb->loadedBuffers, + gfbb->loadedBuffersLen * + sizeof(BlockNumber)); + } + gfbb->loadedBuffers[gfbb->loadedBuffersCount] = blocknum; + gfbb->loadedBuffersCount++; + } + + + /* + * Load last page of node buffer into main memory. 
+ */ + static void + loadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer) + { + /* Check if we really should load something */ + if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) + { + /* Read block from temporary file */ + nodeBuffer->pageBuffer = allocateNewPageBuffer(gfbb); + BufFileSeekBlock(gfbb->pfile, nodeBuffer->pageBlocknum); + BufFileRead(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ); + releaseBlock(gfbb, nodeBuffer->pageBlocknum); + nodeBuffer->pageBlocknum = InvalidBlockNumber; + + /* Mark node buffer as loaded */ + addLoadedBuffer(gfbb, nodeBuffer->nodeBlocknum); + } + } + + /* + * Write last page of node buffer to the disk. + */ + static void + unloadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer) + { + /* Check if we have something to write */ + if (nodeBuffer->pageBuffer) + { + BlockNumber blkno; + + /* Write block to the temporary file */ + blkno = getFreeBlock(gfbb); + BufFileSeekBlock(gfbb->pfile, blkno); + BufFileWrite(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ); + nodeBuffer->pageBlocknum = blkno; + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + } + } + + /* + * Write last pages of all node buffers to the disk. + */ + void + unloadNodeBuffers(GISTBuildBuffers * gfbb) + { + int i; + + /* Iterate over loaded node buffers */ + for (i = 0; i < gfbb->loadedBuffersCount; i++) + { + NodeBuffer *nodeBuffer; + bool found; + + nodeBuffer = hash_search(gfbb->nodeBuffersTab, &gfbb->loadedBuffers[i], + HASH_FIND, &found); + + /* + * Node buffer can be not found. It can disappear during page split. + * So, it's ok, just skip it. + */ + if (!found) + continue; + + unloadNodeBuffer(gfbb, nodeBuffer); + } + gfbb->loadedBuffersCount = 0; + } + + /* + * Add item to buffer page. 
+ */ + static void + placeItupToPage(NodeBufferPage * pageBuffer, IndexTuple itup) + { + /* + * Get pointer to page free space start + */ + char *ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer) - MAXALIGN(IndexTupleSize(itup)); + size_t itupSize, + alignedSize; + + itupSize = MAXALIGN(IndexTupleSize(itup)); + alignedSize = IndexTupleSize(itup); + + /* + * There should be enough of space + */ + Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(IndexTupleSize(itup))); + + /* + * Reduce free space value of page + */ + PAGE_FREE_SPACE(pageBuffer) -= alignedSize; + + /* + * Copy index tuple to free space + */ + memcpy(ptr, itup, IndexTupleSize(itup)); + } + + /* + * Get last item from buffer page and remove it from page. + */ + static void + getItupFromPage(NodeBufferPage * pageBuffer, IndexTuple *itup) + { + /* + * Get pointer to last index tuple + */ + IndexTuple ptr = (IndexTuple) ((char *) pageBuffer + + BUFFER_PAGE_DATA_OFFSET + + PAGE_FREE_SPACE(pageBuffer)); + + /* + * Page shouldn't be empty + */ + Assert(!PAGE_IS_EMPTY(pageBuffer)); + + /* + * Allocate memory for returned index tuple copy + */ + *itup = (IndexTuple) palloc(IndexTupleSize(ptr)); + + /* + * Copy data + */ + memcpy(*itup, ptr, IndexTupleSize(ptr)); + + /* + * Increase free space value of page + */ + PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(IndexTupleSize(*itup)); + } + + /* + * Push new index tuple to node buffer. + */ + void + pushItupToNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer, + IndexTuple itup) + { + /* + * Most part of memory operations will be in buffering build persistent + * context. So, let's switch to it. 
+ */ + MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); + + if (nodeBuffer->blocksCount == 0) + { + nodeBuffer->pageBuffer = allocateNewPageBuffer(gfbb); + nodeBuffer->pageBuffer->prev = InvalidBlockNumber; + nodeBuffer->blocksCount = 1; + addLoadedBuffer(gfbb, nodeBuffer->nodeBlocknum); + } + + /* Load last page of node buffer if needed */ + if (!nodeBuffer->pageBuffer) + { + loadNodeBuffer(gfbb, nodeBuffer); + } + + /* + * Check if there is enough space on the last page for the tuple + */ + if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) + { + /* + * Swap previous block to disk and allocate new one + */ + BlockNumber blkno; + + nodeBuffer->blocksCount++; + + blkno = getFreeBlock(gfbb); + BufFileSeekBlock(gfbb->pfile, blkno); + BufFileWrite(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ); + + PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = + BLCKSZ - MAXALIGN(offsetof(NodeBufferPage, tupledata)); + nodeBuffer->pageBuffer->prev = blkno; + } + + placeItupToPage(nodeBuffer->pageBuffer, itup); + + /* + * Restore memory context + */ + MemoryContextSwitchTo(oldcxt); + } + + /* + * Removes one index tuple from node buffer. Returns true if success and false + * if node buffer is empty. + */ + bool + popItupFromNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer, + IndexTuple *itup) + { + /* + * If node buffer is empty then return false. + */ + if (nodeBuffer->blocksCount <= 0) + return false; + + /* Load last page of node buffer if needed */ + if (!nodeBuffer->pageBuffer) + loadNodeBuffer(gfbb, nodeBuffer); + + /* + * Get index tuple from last non-empty page and mark it as dirty. + */ + getItupFromPage(nodeBuffer->pageBuffer, itup); + + /* + * Check if the page which the index tuple was got from is now empty + */ + if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) + { + BlockNumber prevblkno; + + /* + * If it's empty then we need to release buffer file block and free + * page buffer. 
+ */ + nodeBuffer->blocksCount--; + + /* + * If there's more pages, fetch previous one + */ + prevblkno = nodeBuffer->pageBuffer->prev; + if (prevblkno != InvalidBlockNumber) + { + Assert(nodeBuffer->blocksCount > 0); + BufFileSeekBlock(gfbb->pfile, prevblkno); + BufFileRead(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ); + + releaseBlock(gfbb, prevblkno); + } + else + { + Assert(nodeBuffer->blocksCount == 0); + pfree(nodeBuffer->pageBuffer); + nodeBuffer->pageBuffer = NULL; + } + } + return true; + } + + /* + * qsort comparator for sorting freeBlocks[] into decreasing order. + */ + static int + freeBlocks_cmp(const void *a, const void *b) + { + long ablk = *((const long *) a); + long bblk = *((const long *) b); + + /* + * can't just subtract because long might be wider than int + */ + if (ablk < bblk) + return 1; + if (ablk > bblk) + return -1; + return 0; + } + + /* + * Select a currently unused block for writing to. + * + * NB: should only be called when writer is ready to write immediately, + * to ensure that first write pass is sequential. + */ + static long + getFreeBlock(GISTBuildBuffers * gfbb) + { + /* + * If there are multiple free blocks, we select the one appearing last in + * freeBlocks[] (after sorting the array if needed). If there are none, + * assign the next block at the end of the file. + */ + if (gfbb->nFreeBlocks > 0) + { + if (!gfbb->blocksSorted) + { + qsort((void *) gfbb->freeBlocks, gfbb->nFreeBlocks, + sizeof(long), freeBlocks_cmp); + gfbb->blocksSorted = true; + } + return gfbb->freeBlocks[--gfbb->nFreeBlocks]; + } + else + return gfbb->nFileBlocks++; + } + + /* + * Return a block# to the freelist. + */ + static void + releaseBlock(GISTBuildBuffers * gfbb, long blocknum) + { + int ndx; + + /* + * Enlarge freeBlocks array if full. 
+ */ + if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) + { + gfbb->freeBlocksLen *= 2; + gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, + gfbb->freeBlocksLen * + sizeof(long)); + } + + /* + * Add blocknum to array, and mark the array unsorted if it's no longer in + * decreasing order. + */ + ndx = gfbb->nFreeBlocks++; + gfbb->freeBlocks[ndx] = blocknum; + if (ndx > 0 && gfbb->freeBlocks[ndx - 1] < blocknum) + gfbb->blocksSorted = false; + } + + /* + * Free buffering build data structure. + */ + void + freeGiSTBuildBuffers(GISTBuildBuffers * gfbb) + { + /* Close buffers file. */ + BufFileClose(gfbb->pfile); + + /* All other things will be free on memory context release */ + } + + /* + * Data structure representing information about node buffer for index tuples + * relocation from splitted node buffer. + */ + typedef struct + { + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + SplitedPageLayout *dist; + NodeBuffer *nodeBuffer; + } RelocationBufferInfo; + + /* + * Maintain data structures on page split. + */ + void + relocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb, GISTSTATE *giststate, + Relation r, GISTBufferingInsertStack * path, + Buffer buffer, SplitedPageLayout *dist) + { + RelocationBufferInfo *relocationBuffersInfos; + bool found; + NodeBuffer *nodeBuffer; + BlockNumber blocknum; + IndexTuple itup; + int splitPagesCount = 0, + i; + GISTENTRY entry[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + SplitedPageLayout *ptr; + int level = path->level; + NodeBuffer nodebuf; + + blocknum = BufferGetBlockNumber(buffer); + + /* + * If splitted page level doesn't have buffers, then everything is done. + * Otherwise we also need to relocate index tuples of buffer of splitted + * page. + */ + if (!LEVEL_HAS_BUFFERS(level, gfbb)) + return; + + /* + * Get pointer of node buffer of splitted page and remove it from the + * hash. 
+ */ + nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, + HASH_FIND, &found); + if (!found) + { + /* + * Node buffer should anyway be created at this moment. Either by + * index tuples insertion or page split. + */ + elog(ERROR, + "node buffer of splitting page (%u) doesn't exists while it should.", + blocknum); + } + + /* + * We are going to perform some operations with node buffers hash. Thus, + * it unsafe to operate with already removed hash item and impossible if + * we reuse it. Let's save it. + */ + memcpy(&nodebuf, nodeBuffer, sizeof(NodeBuffer)); + nodeBuffer = &nodebuf; + + if (blocknum == GIST_ROOT_BLKNO) + { + /* + * If root split, we don't reuse NodeBuffer data structure. So let's + * remove it from hash. + */ + hash_search(gfbb->nodeBuffersTab, &blocknum, HASH_REMOVE, &found); + Assert(found); + } + + /* + * Count pages produced by split and save pointer data structure of the + * last one. + */ + for (ptr = dist; ptr; ptr = ptr->next) + { + splitPagesCount++; + } + + /* + * Allocate memory for information about relocation buffers. + */ + relocationBuffersInfos = + (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * + splitPagesCount); + + /* + * Fill relocation buffers information for node buffers of pages produced + * by split. + */ + i = 0; + for (ptr = dist; ptr; ptr = ptr->next) + { + NodeBuffer *newNodeBuffer; + + /* + * Decompress parent index tuple of node buffer page. + */ + gistDeCompressAtt(giststate, r, + ptr->itup, NULL, (OffsetNumber) 0, + relocationBuffersInfos[i].entry, + relocationBuffersInfos[i].isnull); + + newNodeBuffer = getNodeBuffer(gfbb, giststate, ptr->block.blkno, + path->downlinkoffnum, path->parent, true); + + /* + * Fill relocation information + */ + relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; + if (newNodeBuffer->nodeBlocknum == blocknum) + { + /* + * Reuse of NodeBuffer data structure of splitted node. Old + * version was copied. 
+ */ + newNodeBuffer->blocksCount = 0; + newNodeBuffer->pageBuffer = NULL; + newNodeBuffer->pageBlocknum = InvalidBlockNumber; + } + + /* + * Fill node buffer structure + */ + relocationBuffersInfos[i].dist = ptr; + + i++; + } + + /* + * Loop of index tuples relocation. + */ + while (popItupFromNodeBuffer(gfbb, nodeBuffer, &itup)) + { + float sum_grow, + which_grow[INDEX_MAX_KEYS]; + int i, + which; + IndexTuple newtup; + bool wasOverflow; + + /* + * Choose node buffer for index tuple insert. + */ + + gistDeCompressAtt(giststate, r, + itup, NULL, (OffsetNumber) 0, entry, isnull); + + which = -1; + *which_grow = -1.0f; + sum_grow = 1.0f; + + for (i = 0; i < splitPagesCount && sum_grow; i++) + { + int j; + RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; + + sum_grow = 0.0f; + for (j = 0; j < r->rd_att->natts; j++) + { + float usize; + + usize = gistpenalty(giststate, j, + &splitPageInfo->entry[j], + splitPageInfo->isnull[j], + &entry[j], isnull[j]); + + if (which_grow[j] < 0 || usize < which_grow[j]) + { + which = i; + which_grow[j] = usize; + if (j < r->rd_att->natts - 1 && i == 0) + which_grow[j + 1] = -1; + sum_grow += which_grow[j]; + } + else if (which_grow[j] == usize) + sum_grow += usize; + else + { + sum_grow = 1; + break; + } + } + } + + wasOverflow = + BUFFER_IS_OVERLOW(relocationBuffersInfos[which].nodeBuffer, gfbb); + + /* + * push item to selected node buffer + */ + pushItupToNodeBuffer(gfbb, relocationBuffersInfos[which].nodeBuffer, + itup); + + /* + * If node buffer was just overflowed then we should add it to the + * emptying stack. 
+ */ + if (BUFFER_IS_OVERLOW(relocationBuffersInfos[which].nodeBuffer, gfbb) + && !wasOverflow) + { + MemoryContext oldcxt = CurrentMemoryContext; + + MemoryContextSwitchTo(gfbb->context); + gfbb->bufferEmptyingStack = + lcons(relocationBuffersInfos[which].nodeBuffer, + gfbb->bufferEmptyingStack); + MemoryContextSwitchTo(oldcxt); + } + + /* + * adjust tuple of parent page + */ + newtup = gistgetadjusted(r, relocationBuffersInfos[which].dist->itup, + itup, giststate); + if (newtup) + { + /* + * Parent page index tuple expands. We need to update old index + * tuple with the new one. + */ + gistDeCompressAtt(giststate, r, + newtup, NULL, (OffsetNumber) 0, + relocationBuffersInfos[which].entry, + relocationBuffersInfos[which].isnull); + + relocationBuffersInfos[which].dist->itup = newtup; + } + } + + /* Report about splitting for current emptying buffer */ + if (blocknum == gfbb->currentEmptyingBufferBlockNumber) + gfbb->currentEmptyingBufferSplit = true; + + /* + * If root split, we don't reuse NodeBuffer data structure. So, one + * reference to path item goes away. + */ + if (blocknum == GIST_ROOT_BLKNO) + decreasePathRefcount(nodeBuffer->path); + + pfree(relocationBuffersInfos); + } + + /* + * Return size of node buffer occupied by stored index tuples. + */ + int + getNodeBufferBusySize(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer) + { + int size; + + /* + * No occupied buffer file blocks means that node buffer is empty. + */ + if (nodeBuffer->blocksCount == 0) + return 0; + if (!nodeBuffer->pageBuffer) + loadNodeBuffer(gfbb, nodeBuffer); + + /* + * We assume only the last page to be not fully filled. 
+ */ + size = (BLCKSZ - MAXALIGN(sizeof(uint32))) * nodeBuffer->blocksCount; + size -= PAGE_FREE_SPACE(nodeBuffer->pageBuffer); + return size; + } *** a/src/backend/access/gist/gistutil.c --- b/src/backend/access/gist/gistutil.c *************** *** 670,682 **** gistoptions(PG_FUNCTION_ARGS) { Datum reloptions = PG_GETARG_DATUM(0); bool validate = PG_GETARG_BOOL(1); ! bytea *result; ! result = default_reloptions(reloptions, validate, RELOPT_KIND_GIST); - if (result) - PG_RETURN_BYTEA_P(result); - PG_RETURN_NULL(); } /* --- 670,701 ---- { Datum reloptions = PG_GETARG_DATUM(0); bool validate = PG_GETARG_BOOL(1); ! relopt_value *options; ! GiSTOptions *rdopts; ! int numoptions; ! static const relopt_parse_elt tab[] = { ! {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)}, ! {"levelstep", RELOPT_TYPE_INT, offsetof(GiSTOptions, levelStep)}, ! {"buffersize", RELOPT_TYPE_INT, offsetof(GiSTOptions, bufferSize)}, ! {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)} ! }; ! options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST, ! &numoptions); ! ! /* if none set, we're done */ ! if (numoptions == 0) ! PG_RETURN_NULL(); ! ! rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions); ! ! fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions, ! validate, tab, lengthof(tab)); ! ! pfree(options); ! ! PG_RETURN_BYTEA_P(rdopts); } /* *** a/src/backend/access/gist/gistxlog.c --- b/src/backend/access/gist/gistxlog.c *************** *** 266,272 **** gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageGetOpaque(page)->nsn = xldata->orignsn; ! if (i < xlrec.data->npage - 1 && !isrootsplit) GistMarkFollowRight(page); else GistClearFollowRight(page); --- 266,273 ---- else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageGetOpaque(page)->nsn = xldata->orignsn; ! if (i < xlrec.data->npage - 1 && !isrootsplit && ! 
!xldata->noFollowFight) GistMarkFollowRight(page); else GistClearFollowRight(page); *************** *** 414,420 **** XLogRecPtr gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN orignsn, ! Buffer leftchildbuf) { XLogRecData *rdata; gistxlogPageSplit xlrec; --- 415,421 ---- gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN orignsn, ! Buffer leftchildbuf, bool noFollowFight) { XLogRecData *rdata; gistxlogPageSplit xlrec; *************** *** 436,441 **** gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, --- 437,443 ---- xlrec.npage = (uint16) npage; xlrec.leftchild = BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; + xlrec.noFollowFight = noFollowFight; rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(gistxlogPageSplit); *** a/src/include/access/gist_private.h --- b/src/include/access/gist_private.h *************** *** 17,29 **** --- 17,72 ---- #include "access/gist.h" #include "access/itup.h" #include "storage/bufmgr.h" + #include "storage/buffile.h" #include "utils/rbtree.h" + #include "utils/hsearch.h" + + /* Has specified level buffers? 
*/
+ #define LEVEL_HAS_BUFFERS(level,gfbb) ((level) != 0 && (level) % (gfbb)->levelStep == 0)
+ /* Is specified buffer at least half-filled (should be planned for emptying)?*/
+ #define BUFFER_IS_OVERLOW(nodeBuffer,gfbb) ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
/* Buffer lock modes */
#define GIST_SHARE BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
+ typedef struct
+ {
+ BlockNumber prev;
+ uint32 freespace;
+ char tupledata[1];
+ } NodeBufferPage;
+
+ #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(NodeBufferPage, tupledata))
+ /* Returns free space in node buffer page */
+ #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
+ /* Checks if node buffer page is empty */
+ #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
+ /* Checks if node buffer page doesn't contain sufficient space for index tuple */
+ #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
+ MAXALIGN(IndexTupleSize(itup)))
+
+ /* Buffer of tree node data structure */
+ typedef struct NodeBuffer
+ {
+ /* number of page containing node */
+ BlockNumber nodeBlocknum;
+
+ /* count of blocks occupied by buffer */
+ int blocksCount;
+
+ BlockNumber pageBlocknum;
+ NodeBufferPage *pageBuffer;
+
+ /* corresponding downlink number in parent page */
+ OffsetNumber downlinkoffnum;
+ /* level of corresponding node */
+ int level;
+
+ struct GISTBufferingInsertStack *path;
+ } NodeBuffer;
+
/*
* GISTSTATE: information needed for any GiST index operation
*
***************
*** 44,49 **** typedef struct GISTSTATE
--- 87,94 ----
/* Collations to pass to the support functions */
Oid supportCollation[INDEX_MAX_KEYS];
+ struct GISTBuildBuffers *gfbb;
+
TupleDesc tupdesc;
} GISTSTATE;
***************
*** 170,175 **** typedef struct gistxlogPageSplit
--- 215,221 ----
BlockNumber leftchild; /* like in gistxlogPageUpdate */
uint16 npage; /* # of pages in the split */
+ bool noFollowFight; /* skip followRight flag setting 
*/ /* * follow: 1. gistxlogPage and array of IndexTupleData per page *************** *** 225,230 **** typedef struct GISTInsertStack --- 271,346 ---- struct GISTInsertStack *parent; } GISTInsertStack; + /* + * Extended GISTInsertStack for buffering GiST index build. It additionally hold + * level number of page. + */ + typedef struct GISTBufferingInsertStack + { + /* current page */ + BlockNumber blkno; + + /* offset of the downlink in the parent page, that points to this page */ + OffsetNumber downlinkoffnum; + + /* pointer to parent */ + struct GISTBufferingInsertStack *parent; + + int refCount; + + /* level number */ + int level; + } GISTBufferingInsertStack; + + /* + * Data structure with general information about build buffers. + */ + typedef struct GISTBuildBuffers + { + /* memory context which is persistent during buffering build */ + MemoryContext context; + /* underlying files */ + BufFile *pfile; + /* # of blocks used in underlying files */ + long nFileBlocks; + /* is freeBlocks[] currently in order? 
*/ + bool blocksSorted; + /* resizable array of free blocks */ + long *freeBlocks; + /* # of currently free blocks */ + int nFreeBlocks; + /* current allocated length of freeBlocks[] */ + int freeBlocksLen; + + /* hash for buffers by block number */ + HTAB *nodeBuffersTab; + + /* stack of buffers for emptying */ + List *bufferEmptyingStack; + /* number of currently emptying buffer */ + BlockNumber currentEmptyingBufferBlockNumber; + /* whether currently emptying buffer was split - a signal to stop emptying */ + bool currentEmptyingBufferSplit; + + /* step of levels for buffers location */ + int levelStep; + /* maximal number of pages occupied by buffer */ + int pagesPerBuffer; + + /* array of lists of non-empty buffers on levels for final emptying */ + List **buffersOnLevels; + int buffersOnLevelsLen; + int buffersOnLevelsCount; + + /* dynamic array of block numbers of buffer loaded into main memory */ + BlockNumber *loadedBuffers; + /* number of block numbers */ + int loadedBuffersCount; + /* length of array */ + int loadedBuffersLen; + GISTBufferingInsertStack *rootitem; + } GISTBuildBuffers; + typedef struct GistSplitVector { GIST_SPLITVEC splitVector; /* to/from PickSplit method */ *************** *** 286,291 **** extern Datum gistinsert(PG_FUNCTION_ARGS); --- 402,411 ---- extern MemoryContext createTempGistContext(void); extern void initGISTstate(GISTSTATE *giststate, Relation index); extern void freeGISTstate(GISTSTATE *giststate); + void gistdoinsert(Relation r, + IndexTuple itup, + Size freespace, + GISTSTATE *GISTstate); extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); *************** *** 305,311 **** extern XLogRecPtr gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN oldnsn, ! 
Buffer leftchild); /* gistget.c */ extern Datum gistgettuple(PG_FUNCTION_ARGS); --- 425,431 ---- BlockNumber blkno, bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN oldnsn, ! Buffer leftchild, bool noFollowFight); /* gistget.c */ extern Datum gistgettuple(PG_FUNCTION_ARGS); *************** *** 313,318 **** extern Datum gistgetbitmap(PG_FUNCTION_ARGS); --- 433,450 ---- /* gistutil.c */ + /* + * Storage type for GiST's reloptions + */ + typedef struct GiSTOptions + { + int32 vl_len_; /* varlena header (do not touch directly!) */ + int fillfactor; /* page fill factor in percent (0..100) */ + int bufferingModeOffset; /* use buffering build? */ + int levelStep; /* level step in buffering build */ + int bufferSize; /* buffer size in buffering build */ + } GiSTOptions; + #define GiSTPageSize \ ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) *************** *** 380,383 **** extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, --- 512,536 ---- GistSplitVector *v, GistEntryVector *entryvec, int attno); + /* gistbuild.c */ + void decreasePathRefcount(GISTBufferingInsertStack * path); + + /* gistbuildbuffers.c */ + void initGiSTBuildBuffers(GISTBuildBuffers * gfbb, int maxLevel); + extern NodeBuffer *getNodeBuffer(GISTBuildBuffers * gfbb, GISTSTATE *giststate, + BlockNumber blkno, OffsetNumber downlinkoffnu, + GISTBufferingInsertStack * parent, bool createNew); + void pushItupToNodeBuffer(GISTBuildBuffers * gfbb, + NodeBuffer * nodeBuffer, IndexTuple item); + bool popItupFromNodeBuffer(GISTBuildBuffers * gfbb, + NodeBuffer * nodeBuffer, IndexTuple *item); + void freeGiSTBuildBuffers(GISTBuildBuffers * gfbb); + extern void relocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb, + GISTSTATE *giststate, Relation r, + GISTBufferingInsertStack * path, Buffer buffer, + SplitedPageLayout *dist); + int getNodeBufferBusySize(GISTBuildBuffers * gfbb, + NodeBuffer * nodeBuffer); + void unloadNodeBuffers(GISTBuildBuffers * 
gfbb); + #endif /* GIST_PRIVATE_H */