Re: We need to rethink relation cache entry rebuild

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: We need to rethink relation cache entry rebuild
Дата
Msg-id 4389.1263167864@sss.pgh.pa.us
обсуждение исходный текст
Ответ на We need to rethink relation cache entry rebuild  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-hackers
I wrote:
> Basically I think we have to fix this by ensuring that an error escape
> can't occur while a relcache entry is in a partially rebuilt state.

Attached is a draft patch for this.  In addition to fixing the stated
problem, it also takes care of a thinko that I found along the way:
RelationClearRelation assumes that any rd_indexprs or rd_indpred trees
can be freed by deleting the rd_indexcxt context, as the comments in
rel.h imply.  But actually the code that loads those fields was putting
the trees directly into CacheMemoryContext, meaning that a cache flush
on an index that has expressions or a predicate would result in a
session-lifespan memory leak.

I think this is not too complicated to back-patch --- if anything the
logic is simpler than before.  Comments?

            regards, tom lane

Index: src/backend/utils/cache/relcache.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v
retrieving revision 1.297
diff -c -r1.297 relcache.c
*** src/backend/utils/cache/relcache.c    7 Jan 2010 20:39:45 -0000    1.297
--- src/backend/utils/cache/relcache.c    10 Jan 2010 23:45:58 -0000
***************
*** 198,203 ****
--- 198,204 ----

  /* non-export function prototypes */

+ static void RelationDestroyRelation(Relation relation);
  static void RelationClearRelation(Relation relation, bool rebuild);

  static void RelationReloadIndexInfo(Relation relation);
***************
*** 211,220 ****
            int natts, const FormData_pg_attribute *attrs);

  static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
! static Relation AllocateRelationDesc(Relation relation, Form_pg_class relp);
  static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
  static void RelationBuildTupleDesc(Relation relation);
! static Relation RelationBuildDesc(Oid targetRelId, Relation oldrelation);
  static void RelationInitPhysicalAddr(Relation relation);
  static void load_critical_index(Oid indexoid);
  static TupleDesc GetPgClassDescriptor(void);
--- 212,221 ----
            int natts, const FormData_pg_attribute *attrs);

  static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK);
! static Relation AllocateRelationDesc(Form_pg_class relp);
  static void RelationParseRelOptions(Relation relation, HeapTuple tuple);
  static void RelationBuildTupleDesc(Relation relation);
! static Relation RelationBuildDesc(Oid targetRelId, bool insertIt);
  static void RelationInitPhysicalAddr(Relation relation);
  static void load_critical_index(Oid indexoid);
  static TupleDesc GetPgClassDescriptor(void);
***************
*** 305,319 ****
   *        AllocateRelationDesc
   *
   *        This is used to allocate memory for a new relation descriptor
!  *        and initialize the rd_rel field.
!  *
!  *        If 'relation' is NULL, allocate a new RelationData object.
!  *        If not, reuse the given object (that path is taken only when
!  *        we have to rebuild a relcache entry during RelationClearRelation).
   */
  static Relation
! AllocateRelationDesc(Relation relation, Form_pg_class relp)
  {
      MemoryContext oldcxt;
      Form_pg_class relationForm;

--- 306,317 ----
   *        AllocateRelationDesc
   *
   *        This is used to allocate memory for a new relation descriptor
!  *        and initialize the rd_rel field from the given pg_class tuple.
   */
  static Relation
! AllocateRelationDesc(Form_pg_class relp)
  {
+     Relation    relation;
      MemoryContext oldcxt;
      Form_pg_class relationForm;

***************
*** 321,335 ****
      oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

      /*
!      * allocate space for new relation descriptor, if needed
       */
!     if (relation == NULL)
!         relation = (Relation) palloc(sizeof(RelationData));

      /*
!      * clear all fields of reldesc
       */
-     MemSet(relation, 0, sizeof(RelationData));
      relation->rd_targblock = InvalidBlockNumber;
      relation->rd_fsm_nblocks = InvalidBlockNumber;
      relation->rd_vm_nblocks = InvalidBlockNumber;
--- 319,331 ----
      oldcxt = MemoryContextSwitchTo(CacheMemoryContext);

      /*
!      * allocate and zero space for new relation descriptor
       */
!     relation = (Relation) palloc0(sizeof(RelationData));

      /*
!      * clear fields of reldesc that should initialize to something non-zero
       */
      relation->rd_targblock = InvalidBlockNumber;
      relation->rd_fsm_nblocks = InvalidBlockNumber;
      relation->rd_vm_nblocks = InvalidBlockNumber;
***************
*** 387,393 ****
      {
          case RELKIND_RELATION:
          case RELKIND_TOASTVALUE:
-         case RELKIND_UNCATALOGED:
          case RELKIND_INDEX:
              break;
          default:
--- 383,388 ----
***************
*** 415,420 ****
--- 410,416 ----
          relation->rd_options = MemoryContextAlloc(CacheMemoryContext,
                                                    VARSIZE(options));
          memcpy(relation->rd_options, options, VARSIZE(options));
+         pfree(options);
      }
  }

***************
*** 805,828 ****
  /*
   *        RelationBuildDesc
   *
!  *        Build a relation descriptor --- either a new one, or by
!  *        recycling the given old relation object.  The latter case
!  *        supports rebuilding a relcache entry without invalidating
!  *        pointers to it.  The caller must hold at least
   *        AccessShareLock on the target relid.
   *
   *        Returns NULL if no pg_class row could be found for the given relid
   *        (suggesting we are trying to access a just-deleted relation).
   *        Any other error is reported via elog.
   */
  static Relation
! RelationBuildDesc(Oid targetRelId, Relation oldrelation)
  {
      Relation    relation;
      Oid            relid;
      HeapTuple    pg_class_tuple;
      Form_pg_class relp;
-     MemoryContext oldcxt;

      /*
       * find the tuple in pg_class corresponding to the given relation id
--- 801,822 ----
  /*
   *        RelationBuildDesc
   *
!  *        Build a relation descriptor.  The caller must hold at least
   *        AccessShareLock on the target relid.
   *
+  *        The new descriptor is inserted into the hash table if insertIt is true.
+  *
   *        Returns NULL if no pg_class row could be found for the given relid
   *        (suggesting we are trying to access a just-deleted relation).
   *        Any other error is reported via elog.
   */
  static Relation
! RelationBuildDesc(Oid targetRelId, bool insertIt)
  {
      Relation    relation;
      Oid            relid;
      HeapTuple    pg_class_tuple;
      Form_pg_class relp;

      /*
       * find the tuple in pg_class corresponding to the given relation id
***************
*** 845,851 ****
       * allocate storage for the relation descriptor, and copy pg_class_tuple
       * to relation->rd_rel.
       */
!     relation = AllocateRelationDesc(oldrelation, relp);

      /*
       * initialize the relation's relation id (relation->rd_id)
--- 839,845 ----
       * allocate storage for the relation descriptor, and copy pg_class_tuple
       * to relation->rd_rel.
       */
!     relation = AllocateRelationDesc(relp);

      /*
       * initialize the relation's relation id (relation->rd_id)
***************
*** 916,926 ****
      heap_freetuple(pg_class_tuple);

      /*
!      * Insert newly created relation into relcache hash tables.
       */
!     oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
!     RelationCacheInsert(relation);
!     MemoryContextSwitchTo(oldcxt);

      /* It's fully valid */
      relation->rd_isvalid = true;
--- 910,919 ----
      heap_freetuple(pg_class_tuple);

      /*
!      * Insert newly created relation into relcache hash table, if requested.
       */
!     if (insertIt)
!         RelationCacheInsert(relation);

      /* It's fully valid */
      relation->rd_isvalid = true;
***************
*** 1569,1577 ****
      if (RelationIsValid(rd))
      {
          RelationIncrementReferenceCount(rd);
!         /* revalidate nailed index if necessary */
          if (!rd->rd_isvalid)
!             RelationReloadIndexInfo(rd);
          return rd;
      }

--- 1562,1580 ----
      if (RelationIsValid(rd))
      {
          RelationIncrementReferenceCount(rd);
!         /* revalidate cache entry if necessary */
          if (!rd->rd_isvalid)
!         {
!             /*
!              * Indexes only have a limited number of possible schema changes,
!              * and we don't want to use the full-blown procedure because it's
!              * a headache for indexes that reload itself depends on.
!              */
!             if (rd->rd_rel->relkind == RELKIND_INDEX)
!                 RelationReloadIndexInfo(rd);
!             else
!                 RelationClearRelation(rd, true);
!         }
          return rd;
      }

***************
*** 1579,1585 ****
       * no reldesc in the cache, so have RelationBuildDesc() build one and add
       * it.
       */
!     rd = RelationBuildDesc(relationId, NULL);
      if (RelationIsValid(rd))
          RelationIncrementReferenceCount(rd);
      return rd;
--- 1582,1588 ----
       * no reldesc in the cache, so have RelationBuildDesc() build one and add
       * it.
       */
!     rd = RelationBuildDesc(relationId, true);
      if (RelationIsValid(rd))
          RelationIncrementReferenceCount(rd);
      return rd;
***************
*** 1761,1766 ****
--- 1764,1813 ----
  }

  /*
+  * RelationDestroyRelation
+  *
+  *    Physically delete a relation cache entry and all subsidiary data.
+  *    Caller must already have unhooked the entry from the hash table.
+  */
+ static void
+ RelationDestroyRelation(Relation relation)
+ {
+     Assert(RelationHasReferenceCountZero(relation));
+
+     /*
+      * Make sure smgr and lower levels close the relation's files, if they
+      * weren't closed already.  (This was probably done by caller, but let's
+      * just be real sure.)
+      */
+     RelationCloseSmgr(relation);
+
+     /*
+      * Free all the subsidiary data structures of the relcache entry,
+      * then the entry itself.
+      */
+     if (relation->rd_rel)
+         pfree(relation->rd_rel);
+     /* can't use DecrTupleDescRefCount here */
+     Assert(relation->rd_att->tdrefcount > 0);
+     if (--relation->rd_att->tdrefcount == 0)
+         FreeTupleDesc(relation->rd_att);
+     list_free(relation->rd_indexlist);
+     bms_free(relation->rd_indexattr);
+     FreeTriggerDesc(relation->trigdesc);
+     if (relation->rd_options)
+         pfree(relation->rd_options);
+     if (relation->rd_indextuple)
+         pfree(relation->rd_indextuple);
+     if (relation->rd_am)
+         pfree(relation->rd_am);
+     if (relation->rd_indexcxt)
+         MemoryContextDelete(relation->rd_indexcxt);
+     if (relation->rd_rulescxt)
+         MemoryContextDelete(relation->rd_rulescxt);
+     pfree(relation);
+ }
+
+ /*
   * RelationClearRelation
   *
   *     Physically blow away a relation cache entry, or reset it and rebuild
***************
*** 1776,1782 ****
  RelationClearRelation(Relation relation, bool rebuild)
  {
      Oid            old_reltype = relation->rd_rel->reltype;
-     MemoryContext oldcxt;

      /*
       * Make sure smgr and lower levels close the relation's files, if they
--- 1823,1828 ----
***************
*** 1791,1797 ****
       * Never, never ever blow away a nailed-in system relation, because we'd
       * be unable to recover.  However, we must reset rd_targblock, in case we
       * got called because of a relation cache flush that was triggered by
!      * VACUUM.
       *
       * If it's a nailed index, then we need to re-read the pg_class row to see
       * if its relfilenode changed.    We can't necessarily do that here, because
--- 1837,1843 ----
       * Never, never ever blow away a nailed-in system relation, because we'd
       * be unable to recover.  However, we must reset rd_targblock, in case we
       * got called because of a relation cache flush that was triggered by
!      * VACUUM.  Likewise reset the fsm and vm size info.
       *
       * If it's a nailed index, then we need to re-read the pg_class row to see
       * if its relfilenode changed.    We can't necessarily do that here, because
***************
*** 1830,1869 ****
          return;
      }

!     /*
!      * Remove relation from hash tables
!      *
!      * Note: we might be reinserting it momentarily, but we must not have it
!      * visible in the hash tables until it's valid again, so don't try to
!      * optimize this away...
!      */
!     oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
!     RelationCacheDelete(relation);
!     MemoryContextSwitchTo(oldcxt);
!
!     /* Clear out catcache's entries for this relation */
!     CatalogCacheFlushRelation(RelationGetRelid(relation));

      /*
!      * Free all the subsidiary data structures of the relcache entry. We
!      * cannot free rd_att if we are trying to rebuild the entry, however,
!      * because pointers to it may be cached in various places. The rule
!      * manager might also have pointers into the rewrite rules. So to begin
!      * with, we can only get rid of these fields:
       */
!     FreeTriggerDesc(relation->trigdesc);
!     if (relation->rd_indextuple)
!         pfree(relation->rd_indextuple);
!     if (relation->rd_am)
!         pfree(relation->rd_am);
!     if (relation->rd_rel)
!         pfree(relation->rd_rel);
!     if (relation->rd_options)
!         pfree(relation->rd_options);
!     list_free(relation->rd_indexlist);
!     bms_free(relation->rd_indexattr);
!     if (relation->rd_indexcxt)
!         MemoryContextDelete(relation->rd_indexcxt);

      /*
       * If we're really done with the relcache entry, blow it away. But if
--- 1876,1889 ----
          return;
      }

!     /* Mark it invalid until we've finished rebuild */
!     relation->rd_isvalid = false;

      /*
!      * Clear out catcache's entries for this relation.  This is a bit of
!      * a hack, but it's a convenient place to do it.
       */
!     CatalogCacheFlushRelation(RelationGetRelid(relation));

      /*
       * If we're really done with the relcache entry, blow it away. But if
***************
*** 1873,1956 ****
       */
      if (!rebuild)
      {
!         /* ok to zap remaining substructure */
          flush_rowtype_cache(old_reltype);
!         /* can't use DecrTupleDescRefCount here */
!         Assert(relation->rd_att->tdrefcount > 0);
!         if (--relation->rd_att->tdrefcount == 0)
!             FreeTupleDesc(relation->rd_att);
!         if (relation->rd_rulescxt)
!             MemoryContextDelete(relation->rd_rulescxt);
!         pfree(relation);
      }
      else
      {
          /*
!          * When rebuilding an open relcache entry, must preserve ref count and
!          * rd_createSubid/rd_newRelfilenodeSubid state.  Also attempt to
!          * preserve the tupledesc and rewrite-rule substructures in place.
!          * (Note: the refcount mechanism for tupledescs may eventually ensure
!          * that we don't really need to preserve the tupledesc in-place, but
!          * for now there are still a lot of places that assume an open rel's
!          * tupledesc won't move.)
           *
           * Note that this process does not touch CurrentResourceOwner; which
           * is good because whatever ref counts the entry may have do not
           * necessarily belong to that resource owner.
           */
          Oid            save_relid = RelationGetRelid(relation);
!         int            old_refcnt = relation->rd_refcnt;
!         SubTransactionId old_createSubid = relation->rd_createSubid;
!         SubTransactionId old_newRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
!         struct PgStat_TableStatus *old_pgstat_info = relation->pgstat_info;
!         TupleDesc    old_att = relation->rd_att;
!         RuleLock   *old_rules = relation->rd_rules;
!         MemoryContext old_rulescxt = relation->rd_rulescxt;

!         if (RelationBuildDesc(save_relid, relation) != relation)
          {
              /* Should only get here if relation was deleted */
              flush_rowtype_cache(old_reltype);
!             Assert(old_att->tdrefcount > 0);
!             if (--old_att->tdrefcount == 0)
!                 FreeTupleDesc(old_att);
!             if (old_rulescxt)
!                 MemoryContextDelete(old_rulescxt);
!             pfree(relation);
              elog(ERROR, "relation %u deleted while still in use", save_relid);
          }
-         relation->rd_refcnt = old_refcnt;
-         relation->rd_createSubid = old_createSubid;
-         relation->rd_newRelfilenodeSubid = old_newRelfilenodeSubid;
-         relation->pgstat_info = old_pgstat_info;

!         if (equalTupleDescs(old_att, relation->rd_att))
!         {
!             /* needn't flush typcache here */
!             Assert(relation->rd_att->tdrefcount == 1);
!             if (--relation->rd_att->tdrefcount == 0)
!                 FreeTupleDesc(relation->rd_att);
!             relation->rd_att = old_att;
!         }
!         else
!         {
              flush_rowtype_cache(old_reltype);
!             Assert(old_att->tdrefcount > 0);
!             if (--old_att->tdrefcount == 0)
!                 FreeTupleDesc(old_att);
!         }
!         if (equalRuleLocks(old_rules, relation->rd_rules))
          {
!             if (relation->rd_rulescxt)
!                 MemoryContextDelete(relation->rd_rulescxt);
!             relation->rd_rules = old_rules;
!             relation->rd_rulescxt = old_rulescxt;
          }
!         else
          {
!             if (old_rulescxt)
!                 MemoryContextDelete(old_rulescxt);
          }
      }
  }

--- 1893,2008 ----
       */
      if (!rebuild)
      {
!         /* Flush any rowtype cache entry */
          flush_rowtype_cache(old_reltype);
!
!         /* Remove it from the hash table */
!         RelationCacheDelete(relation);
!
!         /* And release storage */
!         RelationDestroyRelation(relation);
      }
      else
      {
          /*
!          * Our strategy for rebuilding an open relcache entry is to build
!          * a new entry from scratch, swap its contents with the old entry,
!          * and finally delete the new entry (along with any infrastructure
!          * swapped over from the old entry).  This is to avoid trouble in case
!          * an error causes us to lose control partway through.  The old entry
!          * will still be marked !rd_isvalid, so we'll try to rebuild it again
!          * on next access.  Meanwhile it's not any less valid than it was
!          * before, so any code that might expect to continue accessing it
!          * isn't hurt by the rebuild failure.  (Consider for example a
!          * subtransaction that ALTERs a table and then gets cancelled partway
!          * through the cache entry rebuild.  The outer transaction should
!          * still see the not-modified cache entry as valid.)  The worst
!          * consequence of an error is leaking the necessarily-unreferenced
!          * new entry, and this shouldn't happen often enough for that to be
!          * a big problem.
!          *
!          * When rebuilding an open relcache entry, we must preserve ref count
!          * and rd_createSubid/rd_newRelfilenodeSubid state.  Also attempt to
!          * preserve the tupledesc and rewrite-rule substructures in place,
!          * because various places assume that these structures won't move
!          * while they are working with an open relcache entry.  (Note: the
!          * refcount mechanism for tupledescs may eventually allow us to remove
!          * this hack for the tupledesc.)
           *
           * Note that this process does not touch CurrentResourceOwner; which
           * is good because whatever ref counts the entry may have do not
           * necessarily belong to that resource owner.
           */
+         Relation    newrel;
          Oid            save_relid = RelationGetRelid(relation);
!         bool        keep_tupdesc;
!         bool        keep_rules;

!         /* Build temporary entry, but don't link it into hashtable */
!         newrel = RelationBuildDesc(save_relid, false);
!         if (newrel == NULL)
          {
              /* Should only get here if relation was deleted */
              flush_rowtype_cache(old_reltype);
!             RelationCacheDelete(relation);
!             RelationDestroyRelation(relation);
              elog(ERROR, "relation %u deleted while still in use", save_relid);
          }

!         keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att);
!         keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules);
!         if (!keep_tupdesc)
              flush_rowtype_cache(old_reltype);
!
!         /*
!          * Perform swapping of the relcache entry contents.  Within this
!          * process the old entry is momentarily invalid, so there *must*
!          * be no possibility of CHECK_FOR_INTERRUPTS within this sequence.
!          * Do it in all-in-line code for safety.
!          *
!          * Since the vast majority of fields should be swapped, our method
!          * is to swap the whole structures and then re-swap those few fields
!          * we didn't want swapped.
!          */
          {
!             RelationData tmpstruct;
!
!             memcpy(&tmpstruct, newrel, sizeof(RelationData));
!             memcpy(newrel, relation, sizeof(RelationData));
!             memcpy(relation, &tmpstruct, sizeof(RelationData));
          }
!
! #define SWAPFIELD(fldtype, fldname) \
!         do { \
!             fldtype _tmp = newrel->fldname; \
!             newrel->fldname = relation->fldname; \
!             relation->fldname = _tmp; \
!         } while (0)
!
!         /* rd_smgr must not be swapped, due to back-links from smgr level */
!         SWAPFIELD(SMgrRelation, rd_smgr);
!         /* rd_refcnt must be preserved */
!         SWAPFIELD(int, rd_refcnt);
!         /* isnailed shouldn't change */
!         Assert(newrel->rd_isnailed == relation->rd_isnailed);
!         /* creation sub-XIDs must be preserved */
!         SWAPFIELD(SubTransactionId, rd_createSubid);
!         SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
!         /* preserve old tupledesc and rules if no logical change */
!         if (keep_tupdesc)
!             SWAPFIELD(TupleDesc, rd_att);
!         if (keep_rules)
          {
!             SWAPFIELD(RuleLock *, rd_rules);
!             SWAPFIELD(MemoryContext, rd_rulescxt);
          }
+         /* pgstat_info must be preserved */
+         SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);
+
+ #undef SWAPFIELD
+
+         /* And now we can throw away the temporary entry */
+         RelationDestroyRelation(newrel);
      }
  }

***************
*** 2836,2842 ****
      Relation    ird;

      LockRelationOid(indexoid, AccessShareLock);
!     ird = RelationBuildDesc(indexoid, NULL);
      if (ird == NULL)
          elog(PANIC, "could not open critical system index %u", indexoid);
      ird->rd_isnailed = true;
--- 2888,2894 ----
      Relation    ird;

      LockRelationOid(indexoid, AccessShareLock);
!     ird = RelationBuildDesc(indexoid, true);
      if (ird == NULL)
          elog(PANIC, "could not open critical system index %u", indexoid);
      ird->rd_isnailed = true;
***************
*** 3293,3299 ****
      fix_opfuncids((Node *) result);

      /* Now save a copy of the completed tree in the relcache entry. */
!     oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
      relation->rd_indexprs = (List *) copyObject(result);
      MemoryContextSwitchTo(oldcxt);

--- 3345,3351 ----
      fix_opfuncids((Node *) result);

      /* Now save a copy of the completed tree in the relcache entry. */
!     oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt);
      relation->rd_indexprs = (List *) copyObject(result);
      MemoryContextSwitchTo(oldcxt);

***************
*** 3368,3374 ****
      fix_opfuncids((Node *) result);

      /* Now save a copy of the completed tree in the relcache entry. */
!     oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
      relation->rd_indpred = (List *) copyObject(result);
      MemoryContextSwitchTo(oldcxt);

--- 3420,3426 ----
      fix_opfuncids((Node *) result);

      /* Now save a copy of the completed tree in the relcache entry. */
!     oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt);
      relation->rd_indpred = (List *) copyObject(result);
      MemoryContextSwitchTo(oldcxt);


В списке pgsql-hackers по дате отправления:

Предыдущее
От: "David E. Wheeler"
Дата:
Сообщение: Re: Feature patch 1 for plperl [PATCH]
Следующее
От: Tom Lane
Дата:
Сообщение: Re: Feature patch 1 for plperl [PATCH]