Обсуждение: ERROR: cache lookup failed for relation 17442 (repost)

Поиск
Список
Период
Сортировка

ERROR: cache lookup failed for relation 17442 (repost)

От
Michael Guerin
Дата:
Hi All,

    I've been getting these errors ("ERROR:  cache lookup failed for
relation 17442")  in my logs for a while now.   It originally seemed
like a hardware problem, however now we getting them pretty consistently
on a couple servers.  I've scalled down the schema to the one table and
the function involved and included a code snipet to make a bunch of
connections and loop around calling the same function.   It usually
takes 100-2000 iterations before these messages start appearing in the
log.  I've also included the original function, this takes 10,000
iterations for the error to start showing.   I should note, we've been
getting these erros since version 7, this is the first time they were
reproducable..

With the original function, the log messages were slightly different and
usually caused the server to reset:
i.e.
ERROR:  type "t" already exists
ERROR:  duplicate key violates unique constraint
"pg_type_typname_nsp_index"
ERROR:  duplicate key violates unique constraint
"pg_type_typname_nsp_index"
ERROR:  duplicate key violates unique constraint
"pg_type_typname_nsp_index"
CONTEXT:  SQL statement "create temp table tmp_children ( uniqid bigint,
memberid bigint, membertype varchar(50), ownerid smallint, tag
varchar(50), level int4 )"
     PL/pgSQL function "fngetcompositeids2" line 14 at SQL statement
ERROR:  duplicate key violates unique constraint
"pg_type_typname_nsp_index"
ERROR:  cache lookup failed for type 2449707570
FATAL:  cache lookup failed for type 2449707570

Environment info:  Postgres v8, suse linix with latest kernal patches,
filesystem: reiserfs.

Please let me know if you need anymore information.  No data is need,
just the schema included.

Thanks
Michael









--
-- PostgreSQL database dump
--

SET client_encoding = 'SQL_ASCII';
SET check_function_bodies = false;
SET client_min_messages = warning;

--
-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: fiasco
--

COMMENT ON SCHEMA public IS 'Standard public schema';


SET search_path = public, pg_catalog;

--
-- Name: plpgsql_call_handler(); Type: FUNCTION; Schema: public; Owner: fiasco
--

CREATE FUNCTION plpgsql_call_handler() RETURNS language_handler
    AS '$libdir/plpgsql', 'plpgsql_call_handler'
    LANGUAGE c;


ALTER FUNCTION public.plpgsql_call_handler() OWNER TO fiasco;

--
-- Name: plpgsql; Type: PROCEDURAL LANGUAGE; Schema: public; Owner:
--

CREATE TRUSTED PROCEDURAL LANGUAGE plpgsql HANDLER plpgsql_call_handler;


--
-- Name: date_value; Type: TYPE; Schema: public; Owner: guerin
--

CREATE TYPE date_value AS (
    tsvalue double precision,
    tsdate timestamp without time zone
);


ALTER TYPE public.date_value OWNER TO guerin;

--
-- Name: object_composite_row_3; Type: TYPE; Schema: public; Owner: guerin
--

CREATE TYPE object_composite_row_3 AS (
    uniqid bigint,
    memberid bigint,
    membertype character varying(50),
    ownerid smallint,
    tag character varying(50),
    "level" integer
);


ALTER TYPE public.object_composite_row_3 OWNER TO guerin;

--
-- Name: fngetcompositeids2(bigint); Type: FUNCTION; Schema: public; Owner: fiasco
--

CREATE FUNCTION fngetcompositeids2(bigint) RETURNS SETOF object_composite_row_3
    AS $_$

 declare
    r object_composite_row_3%rowtype;
    pid alias for $1;
    x   int;
    qry text;
    retval int;
    rec record;

 begin
    x := 1;

   if not table_exists('tchildren') then
     create temp table tchildren (
       uniqid bigint,
       memberid bigint,
       membertype varchar(50),
       ownerid smallint,
       tag varchar(50),
       level int4
     );
   else
     EXECUTE 'truncate table tchildren;';
   end if;

   EXECUTE 'insert into tchildren
             select uniqId,memberId,memberType,ownerid,tag, ' || x || ' as level
             from tblObjectComposite
             where uniqid = ' || pid || '::bigint;';

   qry:= 'select count(*) as numrows from (select * from tchildren where level = ' || x || ' limit 1) as x;';
   FOR rec IN EXECUTE qry LOOP
     retVal:= rec.numrows;
   END LOOP;

   while (retVal > 0) loop
       x := x + 1;
       EXECUTE 'insert into tchildren
                 select oc.uniqid,oc.memberid, oc.membertype,oc.ownerid,oc.tag, ' || x  ||
               ' from tblobjectcomposite oc, tchildren tmp
                  where  oc.uniqid = tmp.memberid and level = (' || (x-1) || ');';

       qry:= 'select count(*) as numrows from (select * from tchildren where level = ' || x || ' limit 1) as x;';
       FOR rec IN EXECUTE qry LOOP
         retVal:= rec.numrows;
       END LOOP;
   end loop;

   FOR r IN EXECUTE '
     select uniqId,memberId,memberType,ownerid,tag,level from tchildren order by level' LOOP
     RETURN NEXT r;
   END LOOP;

   return;
 END;

 $_$
    LANGUAGE plpgsql;


ALTER FUNCTION public.fngetcompositeids2(bigint) OWNER TO fiasco;

--
-- Name: fngetcompositeids2_orig(bigint); Type: FUNCTION; Schema: public; Owner: guerin
--

CREATE FUNCTION fngetcompositeids2_orig(bigint) RETURNS SETOF object_composite_row_3
    AS $_$

 declare
    r object_composite_row_3%rowtype;
    pid alias for $1;
    x   int;
    qry text;
    retval int;
    rec record;

 begin
    x := 1;

   create temp table tmp_children (
       uniqid bigint,
       memberid bigint,
       membertype varchar(50),
       ownerid smallint,
       tag varchar(50),
       level int4
   );

   EXECUTE 'insert into tmp_children
             select uniqId,memberId,memberType,ownerid,tag, ' || x || ' as level
             from tblObjectComposite
             where uniqid = ' || pid || '::bigint;';

   qry:= 'select count(*) as numrows from (select * from tmp_children where level = ' || x || ' limit 1) as x;';
   FOR rec IN EXECUTE qry LOOP
     retVal:= rec.numrows;
   END LOOP;

   while (retVal > 0) loop
       x := x + 1;
       EXECUTE 'insert into tmp_children
                 select oc.uniqid,oc.memberid, oc.membertype,oc.ownerid,oc.tag, ' || x  ||
               ' from tblobjectcomposite oc, tmp_children tmp
                  where  oc.uniqid = tmp.memberid and level = (' || (x-1) || ');';

       qry:= 'select count(*) as numrows from (select * from tmp_children where level = ' || x || ' limit 1) as x;';
       FOR rec IN EXECUTE qry LOOP
         retVal:= rec.numrows;
       END LOOP;
   end loop;

   FOR r IN EXECUTE '
     select uniqId,memberId,memberType,ownerid,tag,level from tmp_children order by level' LOOP
     RETURN NEXT r;
   END LOOP;

   drop table tmp_children;

   return;
 END;


 $_$
    LANGUAGE plpgsql;


ALTER FUNCTION public.fngetcompositeids2_orig(bigint) OWNER TO guerin;

--
-- Name: table_exists(character varying); Type: FUNCTION; Schema: public; Owner: fiasco
--

CREATE FUNCTION table_exists(character varying) RETURNS boolean
    AS $_$

DECLARE
    t_name ALIAS for $1;
    t_result VARCHAR;
BEGIN

    if EXISTS (SELECT relname FROM pg_class WHERE relname ~* ('^' || t_name || '$')
        AND relkind = 'r' and pg_table_is_visible(oid)) then
        RETURN TRUE;
    ELSE
        RETURN FALSE;
    END IF;
END;
$_$
    LANGUAGE plpgsql;


ALTER FUNCTION public.table_exists(character varying) OWNER TO fiasco;

--
-- Name: activity; Type: VIEW; Schema: public; Owner: guerin
--

CREATE VIEW activity AS
    SELECT c.relname, l.relation, l."database", l."transaction", l.pid, l."mode", l.granted FROM (pg_locks l JOIN
pg_classc ON ((l.relation = c.oid))); 


ALTER TABLE public.activity OWNER TO guerin;

SET default_tablespace = '';

SET default_with_oids = true;

--
-- Name: pga_forms; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_forms (
    formname character varying(64),
    formsource text
);


ALTER TABLE public.pga_forms OWNER TO sa;

--
-- Name: pga_layout; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_layout (
    tablename character varying(64),
    nrcols smallint,
    colnames text,
    colwidth text
);


ALTER TABLE public.pga_layout OWNER TO sa;

--
-- Name: pga_queries; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_queries (
    queryname character varying(64),
    querytype character(1),
    querycommand text,
    querytables text,
    querylinks text,
    queryresults text,
    querycomments text
);


ALTER TABLE public.pga_queries OWNER TO sa;

--
-- Name: pga_reports; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_reports (
    reportname character varying(64),
    reportsource text,
    reportbody text,
    reportprocs text,
    reportoptions text
);


ALTER TABLE public.pga_reports OWNER TO sa;

--
-- Name: pga_schema; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_schema (
    schemaname character varying(64),
    schematables text,
    schemalinks text
);


ALTER TABLE public.pga_schema OWNER TO sa;

--
-- Name: pga_scripts; Type: TABLE; Schema: public; Owner: sa; Tablespace:
--

CREATE TABLE pga_scripts (
    scriptname character varying(64),
    scriptsource text
);


ALTER TABLE public.pga_scripts OWNER TO sa;

SET default_with_oids = false;

--
-- Name: tblobjectcomposite; Type: TABLE; Schema: public; Owner: fiasco; Tablespace:
--

CREATE TABLE tblobjectcomposite (
    uniqid bigint,
    "type" character varying(50),
    memberid bigint,
    tag character varying(50),
    membertype character varying(50),
    ownerid smallint
);


ALTER TABLE public.tblobjectcomposite OWNER TO fiasco;

--
-- Data for Name: pga_forms; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_forms (formname, formsource) FROM stdin;
\.


--
-- Data for Name: pga_layout; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_layout (tablename, nrcols, colnames, colwidth) FROM stdin;
\.


--
-- Data for Name: pga_queries; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_queries (queryname, querytype, querycommand, querytables, querylinks, queryresults, querycomments) FROM stdin;
\.


--
-- Data for Name: pga_reports; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_reports (reportname, reportsource, reportbody, reportprocs, reportoptions) FROM stdin;
\.


--
-- Data for Name: pga_schema; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_schema (schemaname, schematables, schemalinks) FROM stdin;
\.


--
-- Data for Name: pga_scripts; Type: TABLE DATA; Schema: public; Owner: sa
--

COPY pga_scripts (scriptname, scriptsource) FROM stdin;
\.


--
-- Data for Name: tblobjectcomposite; Type: TABLE DATA; Schema: public; Owner: fiasco
--

COPY tblobjectcomposite (uniqid, "type", memberid, tag, membertype, ownerid) FROM stdin;
\.


--
-- Name: composite_memberid; Type: INDEX; Schema: public; Owner: fiasco; Tablespace:
--

CREATE INDEX composite_memberid ON tblobjectcomposite USING btree (memberid);


ALTER INDEX public.composite_memberid OWNER TO fiasco;

--
-- Name: composite_uniqid; Type: INDEX; Schema: public; Owner: fiasco; Tablespace:
--

CREATE INDEX composite_uniqid ON tblobjectcomposite USING btree (uniqid);


ALTER INDEX public.composite_uniqid OWNER TO fiasco;

--
-- Name: idx_unique_composite; Type: INDEX; Schema: public; Owner: fiasco; Tablespace:
--

CREATE UNIQUE INDEX idx_unique_composite ON tblobjectcomposite USING btree (uniqid, tag);


ALTER INDEX public.idx_unique_composite OWNER TO fiasco;

--
-- Name: public; Type: ACL; Schema: -; Owner: fiasco
--

REVOKE ALL ON SCHEMA public FROM PUBLIC;
REVOKE ALL ON SCHEMA public FROM fiasco;
GRANT ALL ON SCHEMA public TO fiasco;
GRANT ALL ON SCHEMA public TO PUBLIC;


--
-- Name: pga_forms; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_forms FROM PUBLIC;
REVOKE ALL ON TABLE pga_forms FROM sa;
GRANT ALL ON TABLE pga_forms TO sa;
GRANT ALL ON TABLE pga_forms TO PUBLIC;


--
-- Name: pga_layout; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_layout FROM PUBLIC;
REVOKE ALL ON TABLE pga_layout FROM sa;
GRANT ALL ON TABLE pga_layout TO sa;
GRANT ALL ON TABLE pga_layout TO PUBLIC;


--
-- Name: pga_queries; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_queries FROM PUBLIC;
REVOKE ALL ON TABLE pga_queries FROM sa;
GRANT ALL ON TABLE pga_queries TO sa;
GRANT ALL ON TABLE pga_queries TO PUBLIC;


--
-- Name: pga_reports; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_reports FROM PUBLIC;
REVOKE ALL ON TABLE pga_reports FROM sa;
GRANT ALL ON TABLE pga_reports TO sa;
GRANT ALL ON TABLE pga_reports TO PUBLIC;


--
-- Name: pga_schema; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_schema FROM PUBLIC;
REVOKE ALL ON TABLE pga_schema FROM sa;
GRANT ALL ON TABLE pga_schema TO sa;
GRANT ALL ON TABLE pga_schema TO PUBLIC;


--
-- Name: pga_scripts; Type: ACL; Schema: public; Owner: sa
--

REVOKE ALL ON TABLE pga_scripts FROM PUBLIC;
REVOKE ALL ON TABLE pga_scripts FROM sa;
GRANT ALL ON TABLE pga_scripts TO sa;
GRANT ALL ON TABLE pga_scripts TO PUBLIC;


--
-- Name: tblobjectcomposite; Type: ACL; Schema: public; Owner: fiasco
--

REVOKE ALL ON TABLE tblobjectcomposite FROM PUBLIC;
REVOKE ALL ON TABLE tblobjectcomposite FROM fiasco;
GRANT ALL ON TABLE tblobjectcomposite TO fiasco;
GRANT ALL ON TABLE tblobjectcomposite TO PUBLIC;


--
-- PostgreSQL database dump complete
--



Вложения

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Tom Lane
Дата:
Michael Guerin <guerin@rentec.com> writes:
>     I've been getting these errors ("ERROR:  cache lookup failed for
> relation 17442")  in my logs for a while now.

Turning on verbose error logging shows that the error invariably comes
from RelationIsVisible():

2005-02-07 16:44:35 EST 2994 ERROR:  XX000: cache lookup failed for relation 168051
2005-02-07 16:44:35 EST 2994 CONTEXT:  SQL statement "SELECT  EXISTS (SELECT relname FROM pg_class WHERE relname ~*
('^'||  $1  || '$') AND relkind = 'r' and pg_table_is_visible(oid))" 
    PL/pgSQL function "table_exists" line 7 at if
    PL/pgSQL function "fngetcompositeids2" line 13 at if
2005-02-07 16:44:35 EST 2994 LOCATION:  RelationIsVisible, namespace.c:320

What is happening, I believe, is that the SELECT finds a row in pg_class
that has already been deleted (because the temp table in question
belonged to another backend that already dropped it).  Under MVCC rules
the row is still visible to the SELECT.  However, pg_table_is_visible
depends on backend internal catalog lookup functions, and those lookups
always follow SnapshotNow rules ... so there is a window in which the
pg_table_is_visible call can fail because the other backend's DROP
already committed.

The only quick-fix way I can see for making this safe is to modify
pg_table_is_visible and friends to silently return FALSE instead of
erroring out when they are handed an OID that doesn't seem to match any
existing catalog row.  There was some talk of doing this once before:
http://archives.postgresql.org/pgsql-hackers/2002-11/msg00067.php
but we never got around to it, mainly because it seemed too likely to
mask mistakes.

A more general issue is that we now have a pile of "catalog inquiry"
functions that all make use of backend internal lookups and therefore
reflect SnapshotNow behavior to the user.  This has bothered me for some
time, mainly because I'm worried that it could result in pg_dump failing
to emit a consistent snapshot --- eg, it could emit a CREATE VIEW
command that reflects a view change that occurred later than the start
of the dump.  Defending against this looks horridly messy though :-(
... it almost seems to require duplicate sets of catalog lookup code.

In the meantime, you might want to think about identifying your own temp
tables by matching their relnamespace to current_schemas(), instead of
relying on pg_table_is_visible.

            regards, tom lane

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Tom Lane
Дата:
Also ... the visibility issue doesn't seem to explain the other symptom
you reported:

> With the original function, the log messages were slightly different and
> usually caused the server to reset:
> ERROR:  duplicate key violates unique constraint
> "pg_type_typname_nsp_index"

What was the "original function" exactly?

            regards, tom lane

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Hans-Jürgen Schönig
Дата:
Michael Guerin wrote:
> Hi All,
>
>    I've been getting these errors ("ERROR:  cache lookup failed for
> relation 17442")  in my logs for a while now.   It originally seemed
> like a hardware problem, however now we getting them pretty consistently
> on a couple servers.  I've scalled down the schema to the one table and
> the function involved and included a code snipet to make a bunch of
> connections and loop around calling the same function.   It usually
> takes 100-2000 iterations before these messages start appearing in the
> log.  I've also included the original function, this takes 10,000
> iterations for the error to start showing.   I should note, we've been
> getting these erros since version 7, this is the first time they were
> reproducable..
>
> With the original function, the log messages were slightly different and
> usually caused the server to reset:
> i.e.
> ERROR:  type "t" already exists
> ERROR:  duplicate key violates unique constraint
> "pg_type_typname_nsp_index"
> ERROR:  duplicate key violates unique constraint
> "pg_type_typname_nsp_index"
> ERROR:  duplicate key violates unique constraint
> "pg_type_typname_nsp_index"
> CONTEXT:  SQL statement "create temp table tmp_children ( uniqid bigint,
> memberid bigint, membertype varchar(50), ownerid smallint, tag
> varchar(50), level int4 )"
>     PL/pgSQL function "fngetcompositeids2" line 14 at SQL statement
> ERROR:  duplicate key violates unique constraint
> "pg_type_typname_nsp_index"
> ERROR:  cache lookup failed for type 2449707570
> FATAL:  cache lookup failed for type 2449707570
>
> Environment info:  Postgres v8, suse linix with latest kernal patches,
> filesystem: reiserfs.
>
> Please let me know if you need anymore information.  No data is need,
> just the schema included.
>
> Thanks
> Michael
>


Michael,

The interesting thing about this bug is: We had the same thing on a
customer's machine some time ago. It actually occurred after a certain
script (nothing big) was run the 100.001st time (maybe) on an empty
database. So this one does not seem to be related to the schema - it is
more or less random ...
The interesting thing is: We copied the data directory from the customer
and we were not able to reproduce the same behaviour on a different machine.
The strange thing is: After doing a checkpoint and restarting the
database the problem still occurred. Starting the same binary thing on a
different machine did not show that error ...
We stepped through it with gdb but we could not find anything strange ...
Can you reliably reproduce the problem after a arbitrary amount of
iterations on a different machine? We couldn't ...

Looking at the code: This is a null pointer caught by the system ...
Something seems to corrupt memory ...
Hans

--
Cybertec Geschwinde u Schoenig
Schoengrabern 134, A-2020 Hollabrunn, Austria
Tel: +43/660/816 40 77
www.cybertec.at, www.postgresql.at



Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Alvaro Herrera
Дата:
On Mon, Feb 07, 2005 at 05:07:47PM -0500, Tom Lane wrote:

> A more general issue is that we now have a pile of "catalog inquiry"
> functions that all make use of backend internal lookups and therefore
> reflect SnapshotNow behavior to the user.  This has bothered me for
> some
> time,

Can we pass a Snapshot as a parameter to the internal lookups?  Or set a
global variable to indicate this is not really a internal lookup, and
reset it afterwards?

--
Alvaro Herrera (<alvherre[@]dcc.uchile.cl>)
"The problem with the future is that it keeps turning into the present"
(Hobbes)

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Tom Lane
Дата:
Alvaro Herrera <alvherre@dcc.uchile.cl> writes:
> On Mon, Feb 07, 2005 at 05:07:47PM -0500, Tom Lane wrote:
>> A more general issue is that we now have a pile of "catalog inquiry"
>> functions that all make use of backend internal lookups and therefore
>> reflect SnapshotNow behavior to the user.

> Can we pass a Snapshot as a parameter to the internal lookups?  Or set a
> global variable to indicate this is not really a internal lookup, and
> reset it afterwards?

The first seems awfully messy, and the second awfully dangerous.

            regards, tom lane

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Michael Guerin
Дата:
Tom Lane wrote:

>Also ... the visibility issue doesn't seem to explain the other symptom
>you reported:
>
>
>
>>With the original function, the log messages were slightly different and
>>usually caused the server to reset:
>>ERROR:  duplicate key violates unique constraint
>>"pg_type_typname_nsp_index"
>>
>>
>
>What was the "original function" exactly?
>
>            regards, tom lane
>
>
It's included in the schema call uspgetcompositeids2_orig.   The only
differences is that this function creates and drops the temp table
within the function, the newer function keeps the temp table around for
the life of the connections.

-michael

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Tom Lane
Дата:
Michael Guerin <guerin@rentec.com> writes:
> Tom Lane wrote:
>> What was the "original function" exactly?
>>
> It's included in the schema call uspgetcompositeids2_orig.   The only
> differences is that this function creates and drops the temp table
> within the function, the newer function keeps the temp table around for
> the life of the connections.

So the appropriate test script looks like what?

            regards, tom lane

Re: ERROR: cache lookup failed for relation 17442 (repost)

От
Michael Guerin
Дата:
Tom Lane wrote:

>Michael Guerin <guerin@rentec.com> writes:
>
>
>>Tom Lane wrote:
>>
>>
>>>What was the "original function" exactly?
>>>
>>>
>>>
>>It's included in the schema call uspgetcompositeids2_orig.   The only
>>differences is that this function creates and drops the temp table
>>within the function, the newer function keeps the temp table around for
>>the life of the connections.
>>
>>
>
>So the appropriate test script looks like what?
>
>            regards, tom lane
>
>
I'll have to try and re-create this error, I thought the two were
releated.  I'll post it when I can replicate that error, so far I
haven't been able to duplicate it.

thanks
michael