Re: BUG #19340: Wrong result from CORR() function

Поиск

Список

Период

Сортировка

От	Tom Lane
Тема	Re: BUG #19340: Wrong result from CORR() function
Дата	3 декабря 03:17:32
Msg-id	531516.1764721052@sss.pgh.pa.us обсуждение исходный текст
Ответ на	Re: BUG #19340: Wrong result from CORR() function (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы	Re: BUG #19340: Wrong result from CORR() function
Список	pgsql-bugs

Дерево обсуждения

I tried the attached implementation of your idea, and it does seem
very marginally faster than what I did: roughly a 1.8% slowdown
rather than 2%.  That might be different on a different
machine, of course.  But I think we should choose on the basis
of which semantics we like better, rather than such a tiny
performance difference.

I'm coming around to the conclusion that your way is better,
though.  It seems good that "any NaN in the input results in
NaN output", a property that your way has and mine doesn't.

            regards, tom lane

diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 7b97d2be6ca..c2173f378bf 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -3319,9 +3319,16 @@ float8_stddev_samp(PG_FUNCTION_ARGS)
  * As with the preceding aggregates, we use the Youngs-Cramer algorithm to
  * reduce rounding errors in the aggregate final functions.
  *
- * The transition datatype for all these aggregates is a 6-element array of
+ * The transition datatype for all these aggregates is an 8-element array of
  * float8, holding the values N, Sx=sum(X), Sxx=sum((X-Sx/N)^2), Sy=sum(Y),
- * Syy=sum((Y-Sy/N)^2), Sxy=sum((X-Sx/N)*(Y-Sy/N)) in that order.
+ * Syy=sum((Y-Sy/N)^2), Sxy=sum((X-Sx/N)*(Y-Sy/N)), commonX, and commonY
+ * in that order.
+ *
+ * commonX is defined as the common X value if all the X values were the
+ * same, else NaN; likewise for commonY.  This is useful for deciding
+ * whether corr() should return NULL.  Note that this representation cannot
+ * distinguish all-the-values-were-NaN from the-values-weren't-all-the-same,
+ * but that's okay because we want to return NaN for all-NaN input.
  *
  * Note that Y is the first argument to all these aggregates!
  *
@@ -3345,17 +3352,21 @@ float8_regr_accum(PG_FUNCTION_ARGS)
                 Sy,
                 Syy,
                 Sxy,
+                commonX,
+                commonY,
                 tmpX,
                 tmpY,
                 scale;

-    transvalues = check_float8_array(transarray, "float8_regr_accum", 6);
+    transvalues = check_float8_array(transarray, "float8_regr_accum", 8);
     N = transvalues[0];
     Sx = transvalues[1];
     Sxx = transvalues[2];
     Sy = transvalues[3];
     Syy = transvalues[4];
     Sxy = transvalues[5];
+    commonX = transvalues[6];
+    commonY = transvalues[7];

     /*
      * Use the Youngs-Cramer algorithm to incorporate the new values into the
@@ -3373,6 +3384,16 @@ float8_regr_accum(PG_FUNCTION_ARGS)
         Syy += tmpY * tmpY * scale;
         Sxy += tmpX * tmpY * scale;

+        /*
+         * Check to see if we have seen distinct inputs.  We can use a test
+         * that's a bit cheaper than float8_ne() because if commonX is already
+         * NaN, it does not matter whether the != test returns true or not.
+         */
+        if (newvalX != commonX || isnan(newvalX))
+            commonX = get_float8_nan();
+        if (newvalY != commonY || isnan(newvalY))
+            commonY = get_float8_nan();
+
         /*
          * Overflow check.  We only report an overflow error when finite
          * inputs lead to infinite results.  Note also that Sxx, Syy and Sxy
@@ -3410,6 +3431,8 @@ float8_regr_accum(PG_FUNCTION_ARGS)
             Sxx = Sxy = get_float8_nan();
         if (isnan(newvalY) || isinf(newvalY))
             Syy = Sxy = get_float8_nan();
+        commonX = newvalX;
+        commonY = newvalY;
     }

     /*
@@ -3425,12 +3448,14 @@ float8_regr_accum(PG_FUNCTION_ARGS)
         transvalues[3] = Sy;
         transvalues[4] = Syy;
         transvalues[5] = Sxy;
+        transvalues[6] = commonX;
+        transvalues[7] = commonY;

         PG_RETURN_ARRAYTYPE_P(transarray);
     }
     else
     {
-        Datum        transdatums[6];
+        Datum        transdatums[8];
         ArrayType  *result;

         transdatums[0] = Float8GetDatumFast(N);
@@ -3439,8 +3464,10 @@ float8_regr_accum(PG_FUNCTION_ARGS)
         transdatums[3] = Float8GetDatumFast(Sy);
         transdatums[4] = Float8GetDatumFast(Syy);
         transdatums[5] = Float8GetDatumFast(Sxy);
+        transdatums[6] = Float8GetDatumFast(commonX);
+        transdatums[7] = Float8GetDatumFast(commonY);

-        result = construct_array_builtin(transdatums, 6, FLOAT8OID);
+        result = construct_array_builtin(transdatums, 8, FLOAT8OID);

         PG_RETURN_ARRAYTYPE_P(result);
     }
@@ -3733,24 +3760,27 @@ float8_corr(PG_FUNCTION_ARGS)
     float8        N,
                 Sxx,
                 Syy,
-                Sxy;
+                Sxy,
+                commonX,
+                commonY;

-    transvalues = check_float8_array(transarray, "float8_corr", 6);
+    transvalues = check_float8_array(transarray, "float8_corr", 8);
     N = transvalues[0];
     Sxx = transvalues[2];
     Syy = transvalues[4];
     Sxy = transvalues[5];
+    commonX = transvalues[6];
+    commonY = transvalues[7];

     /* if N is 0 we should return NULL */
     if (N < 1.0)
         PG_RETURN_NULL();

-    /* Note that Sxx and Syy are guaranteed to be non-negative */
-
     /* per spec, return NULL for horizontal and vertical lines */
-    if (Sxx == 0 || Syy == 0)
+    if (!isnan(commonX) || !isnan(commonY))
         PG_RETURN_NULL();

+    /* at this point, Sxx and Syy cannot be zero or negative */
     PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy));
 }

diff --git a/src/include/catalog/pg_aggregate.dat b/src/include/catalog/pg_aggregate.dat
index 870769e8f14..d1342f7bb94 100644
--- a/src/include/catalog/pg_aggregate.dat
+++ b/src/include/catalog/pg_aggregate.dat
@@ -505,7 +505,7 @@
   aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0}' },
 { aggfnoid => 'corr', aggtransfn => 'float8_regr_accum',
   aggfinalfn => 'float8_corr', aggcombinefn => 'float8_regr_combine',
-  aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0}' },
+  aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0,0,0}' },

 # boolean-and and boolean-or
 { aggfnoid => 'bool_and', aggtransfn => 'booland_statefunc',

В списке pgsql-bugs по дате отправления:

Вход в личный кабинет

Восстановление пароля

Подтверждение аккаунта

Изменение пароля

Re: BUG #19340: Wrong result from CORR() function