Re: problem (bug?) with "in (subquery)"
От | Tom Lane |
---|---|
Тема | Re: problem (bug?) with "in (subquery)" |
Дата | |
Msg-id | 16528.1121447565@sss.pgh.pa.us обсуждение исходный текст |
Ответ на | Re: problem (bug?) with "in (subquery)" (Luca Pireddu <lucap@shaw.ca>) |
Список | pgsql-sql |
Luca Pireddu <lucap@shaw.ca> writes: > On July 15, 2005 08:58, Tom Lane wrote: >> Ah-hah: this one is the fault of create_unique_path, which quoth > In any case, it looks like Tom has already found the problem :-) Thanks guys! On closer analysis, the test in create_unique_path is almost but not quite completely wrong :-(. Here is the patch against 8.0 branch, if you need it right away. regards, tom lane Index: src/backend/optimizer/util/pathnode.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v retrieving revision 1.111 diff -c -r1.111 pathnode.c *** src/backend/optimizer/util/pathnode.c 31 Dec 2004 22:00:23 -0000 1.111 --- src/backend/optimizer/util/pathnode.c 15 Jul 2005 17:03:06 -0000 *************** *** 34,40 **** #include "utils/syscache.h" ! static bool is_distinct_query(Query *query); static bool hash_safe_tlist(List *tlist); --- 34,41 ---- #include "utils/syscache.h" ! static List *translate_sub_tlist(List *tlist, int relid); ! static bool query_is_distinct_for(Query *query, List *colnos); static bool hash_safe_tlist(List *tlist); *************** *** 642,655 **** pathnode->subpath = subpath; /* ! * If the input is a subquery whose output must be unique already, we ! * don't need to do anything. */ ! if (rel->rtekind == RTE_SUBQUERY) { RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable); ! if (is_distinct_query(rte->subquery)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->rows= rel->rows; --- 643,683 ---- pathnode->subpath = subpath; /* ! * Try to identify the targetlist that will actually be unique-ified. ! * In current usage, this routine is only used for sub-selects of IN ! * clauses, so we should be able to find the tlist in in_info_list. ! */ ! sub_targetlist = NIL; ! foreach(l, root->in_info_list) ! { ! InClauseInfo *ininfo = (InClauseInfo *) lfirst(l); ! ! if (bms_equal(ininfo->righthand, rel->relids)) ! { ! sub_targetlist = ininfo->sub_targetlist; ! break; ! } ! } ! ! /* ! * If the input is a subquery whose output must be unique already, ! * then we don't need to do anything. The test for uniqueness has ! * to consider exactly which columns we are extracting; for example ! * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct. ! * So we cannot check for this optimization unless we found our own ! * targetlist above, and it consists only of simple Vars referencing ! * subquery outputs. (Possibly we could do something with expressions ! * in the subquery outputs, too, but for now keep it simple.) */ ! if (sub_targetlist && rel->rtekind == RTE_SUBQUERY) { RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable); + List *sub_tlist_colnos; ! sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid); ! ! if (sub_tlist_colnos && ! query_is_distinct_for(rte->subquery, sub_tlist_colnos)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->rows = rel->rows; *************** *** 664,686 **** } /* - * Try to identify the targetlist that will actually be unique-ified. - * In current usage, this routine is only used for sub-selects of IN - * clauses, so we should be able to find the tlist in in_info_list. - */ - sub_targetlist = NIL; - foreach(l, root->in_info_list) - { - InClauseInfo *ininfo = (InClauseInfo *) lfirst(l); - - if (bms_equal(ininfo->righthand, rel->relids)) - { - sub_targetlist = ininfo->sub_targetlist; - break; - } - } - - /* * If we know the targetlist, try to estimate number of result rows; * otherwise punt. */ --- 692,697 ---- *************** *** 755,804 **** } /* ! * is_distinct_query - does query never return duplicate rows? */ ! static bool ! is_distinct_query(Query *query) { ! /* DISTINCT (but not DISTINCT ON) guarantees uniqueness */ ! if (has_distinct_clause(query)) ! return true; ! /* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */ ! if (query->setOperations) { ! SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; ! Assert(IsA(topop, SetOperationStmt)); ! Assert(topop->op != SETOP_NONE); ! if (!topop->all) return true; } /* ! * GROUP BY guarantees uniqueness if all the grouped columns appear in ! * the output. In our implementation this means checking they are non ! * resjunk columns. */ if (query->groupClause) { ! ListCell *gl; ! ! foreach(gl, query->groupClause) { ! GroupClause *grpcl = (GroupClause *) lfirst(gl); TargetEntry *tle = get_sortgroupclause_tle(grpcl, query->targetList); ! if (tle->resdom->resjunk) ! break; } ! if (!gl) /* got to the end? */ return true; } /* * XXX Are there any othercases in which we can easily see the result * must be distinct? */ --- 766,888 ---- } /* ! * translate_sub_tlist - get subquery column numbers represented by tlist ! * ! * The given targetlist should contain only Vars referencing the given relid. ! * Extract their varattnos (ie, the column numbers of the subquery) and return ! * as an integer List. ! * ! * If any of the tlist items is not a simple Var, we cannot determine whether ! * the subquery's uniqueness condition (if any) matches ours, so punt and ! * return NIL. */ ! static List * ! translate_sub_tlist(List *tlist, int relid) { ! List *result = NIL; ! ListCell *l; ! foreach(l, tlist) { ! Var *var = (Var *) lfirst(l); ! if (!var || !IsA(var, Var) || ! var->varno != relid) ! return NIL; /* punt */ ! result = lappend_int(result, var->varattno); ! } ! return result; ! } ! ! /* ! * query_is_distinct_for - does query never return duplicates of the ! * specified columns? ! * ! * colnos is an integer list of output column numbers (resno's). We are ! * interested in whether rows consisting of just these columns are certain ! * to be distinct. ! */ ! static bool ! query_is_distinct_for(Query *query, List *colnos) ! { ! ListCell *l; ! ! /* ! * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the ! * columns in the DISTINCT clause appear in colnos. ! */ ! if (query->distinctClause) ! { ! foreach(l, query->distinctClause) ! { ! SortClause *scl = (SortClause *) lfirst(l); ! TargetEntry *tle = get_sortgroupclause_tle(scl, ! query->targetList); ! ! if (!list_member_int(colnos, tle->resdom->resno)) ! break; /* exit early if no match */ ! } ! if (l == NULL) /* had matches for all? */ return true; } /* ! * Similarly, GROUP BY guarantees uniqueness if all the grouped columns ! * appear in colnos. */ if (query->groupClause) { ! foreach(l, query->groupClause) { ! GroupClause *grpcl = (GroupClause *) lfirst(l); TargetEntry *tle = get_sortgroupclause_tle(grpcl, query->targetList); ! if (!list_member_int(colnos, tle->resdom->resno)) ! break; /* exit early if no match */ } ! if (l == NULL) /* had matches for all? */ ! return true; ! } ! else ! { ! /* ! * If we have no GROUP BY, but do have aggregates or HAVING, then ! * the result is at most one row so it's surely unique. ! */ ! if (query->hasAggs || query->havingQual) return true; } /* + * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row, + * except with ALL + */ + if (query->setOperations) + { + SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; + + Assert(IsA(topop, SetOperationStmt)); + Assert(topop->op != SETOP_NONE); + + if (!topop->all) + { + /* We're good if all the nonjunk output columns are in colnos */ + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (!tle->resdom->resjunk && + !list_member_int(colnos, tle->resdom->resno)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ + return true; + } + } + + /* * XXX Are there any other cases in which we can easily see the result * must be distinct? */ Index: src/test/regress/expected/subselect.out =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/expected/subselect.out,v retrieving revision 1.10.4.2 diff -c -r1.10.4.2 subselect.out *** src/test/regress/expected/subselect.out 1 Feb 2005 23:09:00 -0000 1.10.4.2 --- src/test/regress/expected/subselect.out 15 Jul 2005 17:03:06 -0000 *************** *** 203,208 **** --- 203,265 ---- (1 row) -- + -- Test cases to check for overenthusiastic optimization of + -- "IN (SELECT DISTINCT ...)" and related cases. Per example from + -- Luca Pireddu and Michael Fuhr. + -- + CREATE TEMP TABLE foo (id integer); + CREATE TEMP TABLE bar (id1 integer, id2 integer); + INSERT INTO foo VALUES (1); + INSERT INTO bar VALUES (1, 1); + INSERT INTO bar VALUES (2, 2); + INSERT INTO bar VALUES (3, 1); + -- These cases require an extra level of distinct-ing above subquery s + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); + id + ---- + 1 + (1 row) + + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); + id + ---- + 1 + (1 row) + + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + id + ---- + 1 + (1 row) + + -- These cases do not + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); + id + ---- + 1 + (1 row) + + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); + id + ---- + 1 + (1 row) + + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + id + ---- + 1 + (1 row) + + -- -- Test case to catch problems with multiply nested sub-SELECTs not getting -- recalculated properly. Per bug reportfrom Didier Moens. -- Index: src/test/regress/sql/subselect.sql =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/sql/subselect.sql,v retrieving revision 1.7 diff -c -r1.7 subselect.sql *** src/test/regress/sql/subselect.sql 4 Oct 2004 14:42:48 -0000 1.7 --- src/test/regress/sql/subselect.sql 15 Jul 2005 17:03:06 -0000 *************** *** 96,101 **** --- 96,134 ---- where unique1 IN (select distinct hundred from tenk1 b)) ss; -- + -- Test cases to check for overenthusiastic optimization of + -- "IN (SELECT DISTINCT ...)" and related cases. Per example from + -- Luca Pireddu and Michael Fuhr. + -- + + CREATE TEMP TABLE foo (id integer); + CREATE TEMP TABLE bar (id1 integer, id2 integer); + + INSERT INTO foo VALUES (1); + + INSERT INTO bar VALUES (1, 1); + INSERT INTO bar VALUES (2, 2); + INSERT INTO bar VALUES (3, 1); + + -- These cases require an extra level of distinct-ing above subquery s + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + + -- These cases do not + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); + SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + + -- -- Test case to catch problems with multiply nested sub-SELECTs not getting -- recalculated properly. Per bug reportfrom Didier Moens. --
В списке pgsql-sql по дате отправления: