Re: We need to rethink relation cache entry rebuild
От | Tom Lane |
---|---|
Тема | Re: We need to rethink relation cache entry rebuild |
Дата | |
Msg-id | 4389.1263167864@sss.pgh.pa.us обсуждение исходный текст |
Ответ на | We need to rethink relation cache entry rebuild (Tom Lane <tgl@sss.pgh.pa.us>) |
Список | pgsql-hackers |
I wrote: > Basically I think we have to fix this by ensuring that an error escape > can't occur while a relcache entry is in a partially rebuilt state. Attached is a draft patch for this. In addition to fixing the stated problem, it also takes care of a thinko that I found along the way: RelationClearRelation assumes that any rd_indexprs or rd_indpred trees can be freed by deleting the rd_indexcxt context, as the comments in rel.h imply. But actually the code that loads those fields was putting the trees directly into CacheMemoryContext, meaning that a cache flush on an index that has expressions or a predicate would result in a session-lifespan memory leak. I think this is not too complicated to back-patch --- if anything the logic is simpler than before. Comments? regards, tom lane Index: src/backend/utils/cache/relcache.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v retrieving revision 1.297 diff -c -r1.297 relcache.c *** src/backend/utils/cache/relcache.c 7 Jan 2010 20:39:45 -0000 1.297 --- src/backend/utils/cache/relcache.c 10 Jan 2010 23:45:58 -0000 *************** *** 198,203 **** --- 198,204 ---- /* non-export function prototypes */ + static void RelationDestroyRelation(Relation relation); static void RelationClearRelation(Relation relation, bool rebuild); static void RelationReloadIndexInfo(Relation relation); *************** *** 211,220 **** int natts, const FormData_pg_attribute *attrs); static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK); ! static Relation AllocateRelationDesc(Relation relation, Form_pg_class relp); static void RelationParseRelOptions(Relation relation, HeapTuple tuple); static void RelationBuildTupleDesc(Relation relation); ! static Relation RelationBuildDesc(Oid targetRelId, Relation oldrelation); static void RelationInitPhysicalAddr(Relation relation); static void load_critical_index(Oid indexoid); static TupleDesc GetPgClassDescriptor(void); --- 212,221 ---- int natts, const FormData_pg_attribute *attrs); static HeapTuple ScanPgRelation(Oid targetRelId, bool indexOK); ! static Relation AllocateRelationDesc(Form_pg_class relp); static void RelationParseRelOptions(Relation relation, HeapTuple tuple); static void RelationBuildTupleDesc(Relation relation); ! static Relation RelationBuildDesc(Oid targetRelId, bool insertIt); static void RelationInitPhysicalAddr(Relation relation); static void load_critical_index(Oid indexoid); static TupleDesc GetPgClassDescriptor(void); *************** *** 305,319 **** * AllocateRelationDesc * * This is used to allocate memory for a new relation descriptor ! * and initialize the rd_rel field. ! * ! * If 'relation' is NULL, allocate a new RelationData object. ! * If not, reuse the given object (that path is taken only when ! * we have to rebuild a relcache entry during RelationClearRelation). */ static Relation ! AllocateRelationDesc(Relation relation, Form_pg_class relp) { MemoryContext oldcxt; Form_pg_class relationForm; --- 306,317 ---- * AllocateRelationDesc * * This is used to allocate memory for a new relation descriptor ! * and initialize the rd_rel field from the given pg_class tuple. */ static Relation ! AllocateRelationDesc(Form_pg_class relp) { + Relation relation; MemoryContext oldcxt; Form_pg_class relationForm; *************** *** 321,335 **** oldcxt = MemoryContextSwitchTo(CacheMemoryContext); /* ! * allocate space for new relation descriptor, if needed */ ! if (relation == NULL) ! relation = (Relation) palloc(sizeof(RelationData)); /* ! * clear all fields of reldesc */ - MemSet(relation, 0, sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; relation->rd_vm_nblocks = InvalidBlockNumber; --- 319,331 ---- oldcxt = MemoryContextSwitchTo(CacheMemoryContext); /* ! * allocate and zero space for new relation descriptor */ ! relation = (Relation) palloc0(sizeof(RelationData)); /* ! * clear fields of reldesc that should initialize to something non-zero */ relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; relation->rd_vm_nblocks = InvalidBlockNumber; *************** *** 387,393 **** { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_UNCATALOGED: case RELKIND_INDEX: break; default: --- 383,388 ---- *************** *** 415,420 **** --- 410,416 ---- relation->rd_options = MemoryContextAlloc(CacheMemoryContext, VARSIZE(options)); memcpy(relation->rd_options, options, VARSIZE(options)); + pfree(options); } } *************** *** 805,828 **** /* * RelationBuildDesc * ! * Build a relation descriptor --- either a new one, or by ! * recycling the given old relation object. The latter case ! * supports rebuilding a relcache entry without invalidating ! * pointers to it. The caller must hold at least * AccessShareLock on the target relid. * * Returns NULL if no pg_class row could be found for the given relid * (suggesting we are trying to access a just-deleted relation). * Any other error is reported via elog. */ static Relation ! RelationBuildDesc(Oid targetRelId, Relation oldrelation) { Relation relation; Oid relid; HeapTuple pg_class_tuple; Form_pg_class relp; - MemoryContext oldcxt; /* * find the tuple in pg_class corresponding to the given relation id --- 801,822 ---- /* * RelationBuildDesc * ! * Build a relation descriptor. The caller must hold at least * AccessShareLock on the target relid. * + * The new descriptor is inserted into the hash table if insertIt is true. + * * Returns NULL if no pg_class row could be found for the given relid * (suggesting we are trying to access a just-deleted relation). * Any other error is reported via elog. */ static Relation ! RelationBuildDesc(Oid targetRelId, bool insertIt) { Relation relation; Oid relid; HeapTuple pg_class_tuple; Form_pg_class relp; /* * find the tuple in pg_class corresponding to the given relation id *************** *** 845,851 **** * allocate storage for the relation descriptor, and copy pg_class_tuple * to relation->rd_rel. */ ! relation = AllocateRelationDesc(oldrelation, relp); /* * initialize the relation's relation id (relation->rd_id) --- 839,845 ---- * allocate storage for the relation descriptor, and copy pg_class_tuple * to relation->rd_rel. */ ! relation = AllocateRelationDesc(relp); /* * initialize the relation's relation id (relation->rd_id) *************** *** 916,926 **** heap_freetuple(pg_class_tuple); /* ! * Insert newly created relation into relcache hash tables. */ ! oldcxt = MemoryContextSwitchTo(CacheMemoryContext); ! RelationCacheInsert(relation); ! MemoryContextSwitchTo(oldcxt); /* It's fully valid */ relation->rd_isvalid = true; --- 910,919 ---- heap_freetuple(pg_class_tuple); /* ! * Insert newly created relation into relcache hash table, if requested. */ ! if (insertIt) ! RelationCacheInsert(relation); /* It's fully valid */ relation->rd_isvalid = true; *************** *** 1569,1577 **** if (RelationIsValid(rd)) { RelationIncrementReferenceCount(rd); ! /* revalidate nailed index if necessary */ if (!rd->rd_isvalid) ! RelationReloadIndexInfo(rd); return rd; } --- 1562,1580 ---- if (RelationIsValid(rd)) { RelationIncrementReferenceCount(rd); ! /* revalidate cache entry if necessary */ if (!rd->rd_isvalid) ! { ! /* ! * Indexes only have a limited number of possible schema changes, ! * and we don't want to use the full-blown procedure because it's ! * a headache for indexes that reload itself depends on. ! */ ! if (rd->rd_rel->relkind == RELKIND_INDEX) ! RelationReloadIndexInfo(rd); ! else ! RelationClearRelation(rd, true); ! } return rd; } *************** *** 1579,1585 **** * no reldesc in the cache, so have RelationBuildDesc() build one and add * it. */ ! rd = RelationBuildDesc(relationId, NULL); if (RelationIsValid(rd)) RelationIncrementReferenceCount(rd); return rd; --- 1582,1588 ---- * no reldesc in the cache, so have RelationBuildDesc() build one and add * it. */ ! rd = RelationBuildDesc(relationId, true); if (RelationIsValid(rd)) RelationIncrementReferenceCount(rd); return rd; *************** *** 1761,1766 **** --- 1764,1813 ---- } /* + * RelationDestroyRelation + * + * Physically delete a relation cache entry and all subsidiary data. + * Caller must already have unhooked the entry from the hash table. + */ + static void + RelationDestroyRelation(Relation relation) + { + Assert(RelationHasReferenceCountZero(relation)); + + /* + * Make sure smgr and lower levels close the relation's files, if they + * weren't closed already. (This was probably done by caller, but let's + * just be real sure.) + */ + RelationCloseSmgr(relation); + + /* + * Free all the subsidiary data structures of the relcache entry, + * then the entry itself. + */ + if (relation->rd_rel) + pfree(relation->rd_rel); + /* can't use DecrTupleDescRefCount here */ + Assert(relation->rd_att->tdrefcount > 0); + if (--relation->rd_att->tdrefcount == 0) + FreeTupleDesc(relation->rd_att); + list_free(relation->rd_indexlist); + bms_free(relation->rd_indexattr); + FreeTriggerDesc(relation->trigdesc); + if (relation->rd_options) + pfree(relation->rd_options); + if (relation->rd_indextuple) + pfree(relation->rd_indextuple); + if (relation->rd_am) + pfree(relation->rd_am); + if (relation->rd_indexcxt) + MemoryContextDelete(relation->rd_indexcxt); + if (relation->rd_rulescxt) + MemoryContextDelete(relation->rd_rulescxt); + pfree(relation); + } + + /* * RelationClearRelation * * Physically blow away a relation cache entry, or reset it and rebuild *************** *** 1776,1782 **** RelationClearRelation(Relation relation, bool rebuild) { Oid old_reltype = relation->rd_rel->reltype; - MemoryContext oldcxt; /* * Make sure smgr and lower levels close the relation's files, if they --- 1823,1828 ---- *************** *** 1791,1797 **** * Never, never ever blow away a nailed-in system relation, because we'd * be unable to recover. However, we must reset rd_targblock, in case we * got called because of a relation cache flush that was triggered by ! * VACUUM. * * If it's a nailed index, then we need to re-read the pg_class row to see * if its relfilenode changed. We can't necessarily do that here, because --- 1837,1843 ---- * Never, never ever blow away a nailed-in system relation, because we'd * be unable to recover. However, we must reset rd_targblock, in case we * got called because of a relation cache flush that was triggered by ! * VACUUM. Likewise reset the fsm and vm size info. * * If it's a nailed index, then we need to re-read the pg_class row to see * if its relfilenode changed. We can't necessarily do that here, because *************** *** 1830,1869 **** return; } ! /* ! * Remove relation from hash tables ! * ! * Note: we might be reinserting it momentarily, but we must not have it ! * visible in the hash tables until it's valid again, so don't try to ! * optimize this away... ! */ ! oldcxt = MemoryContextSwitchTo(CacheMemoryContext); ! RelationCacheDelete(relation); ! MemoryContextSwitchTo(oldcxt); ! ! /* Clear out catcache's entries for this relation */ ! CatalogCacheFlushRelation(RelationGetRelid(relation)); /* ! * Free all the subsidiary data structures of the relcache entry. We ! * cannot free rd_att if we are trying to rebuild the entry, however, ! * because pointers to it may be cached in various places. The rule ! * manager might also have pointers into the rewrite rules. So to begin ! * with, we can only get rid of these fields: */ ! FreeTriggerDesc(relation->trigdesc); ! if (relation->rd_indextuple) ! pfree(relation->rd_indextuple); ! if (relation->rd_am) ! pfree(relation->rd_am); ! if (relation->rd_rel) ! pfree(relation->rd_rel); ! if (relation->rd_options) ! pfree(relation->rd_options); ! list_free(relation->rd_indexlist); ! bms_free(relation->rd_indexattr); ! if (relation->rd_indexcxt) ! MemoryContextDelete(relation->rd_indexcxt); /* * If we're really done with the relcache entry, blow it away. But if --- 1876,1889 ---- return; } ! /* Mark it invalid until we've finished rebuild */ ! relation->rd_isvalid = false; /* ! * Clear out catcache's entries for this relation. This is a bit of ! * a hack, but it's a convenient place to do it. */ ! CatalogCacheFlushRelation(RelationGetRelid(relation)); /* * If we're really done with the relcache entry, blow it away. But if *************** *** 1873,1956 **** */ if (!rebuild) { ! /* ok to zap remaining substructure */ flush_rowtype_cache(old_reltype); ! /* can't use DecrTupleDescRefCount here */ ! Assert(relation->rd_att->tdrefcount > 0); ! if (--relation->rd_att->tdrefcount == 0) ! FreeTupleDesc(relation->rd_att); ! if (relation->rd_rulescxt) ! MemoryContextDelete(relation->rd_rulescxt); ! pfree(relation); } else { /* ! * When rebuilding an open relcache entry, must preserve ref count and ! * rd_createSubid/rd_newRelfilenodeSubid state. Also attempt to ! * preserve the tupledesc and rewrite-rule substructures in place. ! * (Note: the refcount mechanism for tupledescs may eventually ensure ! * that we don't really need to preserve the tupledesc in-place, but ! * for now there are still a lot of places that assume an open rel's ! * tupledesc won't move.) * * Note that this process does not touch CurrentResourceOwner; which * is good because whatever ref counts the entry may have do not * necessarily belong to that resource owner. */ Oid save_relid = RelationGetRelid(relation); ! int old_refcnt = relation->rd_refcnt; ! SubTransactionId old_createSubid = relation->rd_createSubid; ! SubTransactionId old_newRelfilenodeSubid = relation->rd_newRelfilenodeSubid; ! struct PgStat_TableStatus *old_pgstat_info = relation->pgstat_info; ! TupleDesc old_att = relation->rd_att; ! RuleLock *old_rules = relation->rd_rules; ! MemoryContext old_rulescxt = relation->rd_rulescxt; ! if (RelationBuildDesc(save_relid, relation) != relation) { /* Should only get here if relation was deleted */ flush_rowtype_cache(old_reltype); ! Assert(old_att->tdrefcount > 0); ! if (--old_att->tdrefcount == 0) ! FreeTupleDesc(old_att); ! if (old_rulescxt) ! MemoryContextDelete(old_rulescxt); ! pfree(relation); elog(ERROR, "relation %u deleted while still in use", save_relid); } - relation->rd_refcnt = old_refcnt; - relation->rd_createSubid = old_createSubid; - relation->rd_newRelfilenodeSubid = old_newRelfilenodeSubid; - relation->pgstat_info = old_pgstat_info; ! if (equalTupleDescs(old_att, relation->rd_att)) ! { ! /* needn't flush typcache here */ ! Assert(relation->rd_att->tdrefcount == 1); ! if (--relation->rd_att->tdrefcount == 0) ! FreeTupleDesc(relation->rd_att); ! relation->rd_att = old_att; ! } ! else ! { flush_rowtype_cache(old_reltype); ! Assert(old_att->tdrefcount > 0); ! if (--old_att->tdrefcount == 0) ! FreeTupleDesc(old_att); ! } ! if (equalRuleLocks(old_rules, relation->rd_rules)) { ! if (relation->rd_rulescxt) ! MemoryContextDelete(relation->rd_rulescxt); ! relation->rd_rules = old_rules; ! relation->rd_rulescxt = old_rulescxt; } ! else { ! if (old_rulescxt) ! MemoryContextDelete(old_rulescxt); } } } --- 1893,2008 ---- */ if (!rebuild) { ! /* Flush any rowtype cache entry */ flush_rowtype_cache(old_reltype); ! ! /* Remove it from the hash table */ ! RelationCacheDelete(relation); ! ! /* And release storage */ ! RelationDestroyRelation(relation); } else { /* ! * Our strategy for rebuilding an open relcache entry is to build ! * a new entry from scratch, swap its contents with the old entry, ! * and finally delete the new entry (along with any infrastructure ! * swapped over from the old entry). This is to avoid trouble in case ! * an error causes us to lose control partway through. The old entry ! * will still be marked !rd_isvalid, so we'll try to rebuild it again ! * on next access. Meanwhile it's not any less valid than it was ! * before, so any code that might expect to continue accessing it ! * isn't hurt by the rebuild failure. (Consider for example a ! * subtransaction that ALTERs a table and then gets cancelled partway ! * through the cache entry rebuild. The outer transaction should ! * still see the not-modified cache entry as valid.) The worst ! * consequence of an error is leaking the necessarily-unreferenced ! * new entry, and this shouldn't happen often enough for that to be ! * a big problem. ! * ! * When rebuilding an open relcache entry, we must preserve ref count ! * and rd_createSubid/rd_newRelfilenodeSubid state. Also attempt to ! * preserve the tupledesc and rewrite-rule substructures in place, ! * because various places assume that these structures won't move ! * while they are working with an open relcache entry. (Note: the ! * refcount mechanism for tupledescs may eventually allow us to remove ! * this hack for the tupledesc.) * * Note that this process does not touch CurrentResourceOwner; which * is good because whatever ref counts the entry may have do not * necessarily belong to that resource owner. */ + Relation newrel; Oid save_relid = RelationGetRelid(relation); ! bool keep_tupdesc; ! bool keep_rules; ! /* Build temporary entry, but don't link it into hashtable */ ! newrel = RelationBuildDesc(save_relid, false); ! if (newrel == NULL) { /* Should only get here if relation was deleted */ flush_rowtype_cache(old_reltype); ! RelationCacheDelete(relation); ! RelationDestroyRelation(relation); elog(ERROR, "relation %u deleted while still in use", save_relid); } ! keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att); ! keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules); ! if (!keep_tupdesc) flush_rowtype_cache(old_reltype); ! ! /* ! * Perform swapping of the relcache entry contents. Within this ! * process the old entry is momentarily invalid, so there *must* ! * be no possibility of CHECK_FOR_INTERRUPTS within this sequence. ! * Do it in all-in-line code for safety. ! * ! * Since the vast majority of fields should be swapped, our method ! * is to swap the whole structures and then re-swap those few fields ! * we didn't want swapped. ! */ { ! RelationData tmpstruct; ! ! memcpy(&tmpstruct, newrel, sizeof(RelationData)); ! memcpy(newrel, relation, sizeof(RelationData)); ! memcpy(relation, &tmpstruct, sizeof(RelationData)); } ! ! #define SWAPFIELD(fldtype, fldname) \ ! do { \ ! fldtype _tmp = newrel->fldname; \ ! newrel->fldname = relation->fldname; \ ! relation->fldname = _tmp; \ ! } while (0) ! ! /* rd_smgr must not be swapped, due to back-links from smgr level */ ! SWAPFIELD(SMgrRelation, rd_smgr); ! /* rd_refcnt must be preserved */ ! SWAPFIELD(int, rd_refcnt); ! /* isnailed shouldn't change */ ! Assert(newrel->rd_isnailed == relation->rd_isnailed); ! /* creation sub-XIDs must be preserved */ ! SWAPFIELD(SubTransactionId, rd_createSubid); ! SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid); ! /* preserve old tupledesc and rules if no logical change */ ! if (keep_tupdesc) ! SWAPFIELD(TupleDesc, rd_att); ! if (keep_rules) { ! SWAPFIELD(RuleLock *, rd_rules); ! SWAPFIELD(MemoryContext, rd_rulescxt); } + /* pgstat_info must be preserved */ + SWAPFIELD(struct PgStat_TableStatus *, pgstat_info); + + #undef SWAPFIELD + + /* And now we can throw away the temporary entry */ + RelationDestroyRelation(newrel); } } *************** *** 2836,2842 **** Relation ird; LockRelationOid(indexoid, AccessShareLock); ! ird = RelationBuildDesc(indexoid, NULL); if (ird == NULL) elog(PANIC, "could not open critical system index %u", indexoid); ird->rd_isnailed = true; --- 2888,2894 ---- Relation ird; LockRelationOid(indexoid, AccessShareLock); ! ird = RelationBuildDesc(indexoid, true); if (ird == NULL) elog(PANIC, "could not open critical system index %u", indexoid); ird->rd_isnailed = true; *************** *** 3293,3299 **** fix_opfuncids((Node *) result); /* Now save a copy of the completed tree in the relcache entry. */ ! oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_indexprs = (List *) copyObject(result); MemoryContextSwitchTo(oldcxt); --- 3345,3351 ---- fix_opfuncids((Node *) result); /* Now save a copy of the completed tree in the relcache entry. */ ! oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt); relation->rd_indexprs = (List *) copyObject(result); MemoryContextSwitchTo(oldcxt); *************** *** 3368,3374 **** fix_opfuncids((Node *) result); /* Now save a copy of the completed tree in the relcache entry. */ ! oldcxt = MemoryContextSwitchTo(CacheMemoryContext); relation->rd_indpred = (List *) copyObject(result); MemoryContextSwitchTo(oldcxt); --- 3420,3426 ---- fix_opfuncids((Node *) result); /* Now save a copy of the completed tree in the relcache entry. */ ! oldcxt = MemoryContextSwitchTo(relation->rd_indexcxt); relation->rd_indpred = (List *) copyObject(result); MemoryContextSwitchTo(oldcxt);
В списке pgsql-hackers по дате отправления: