Обсуждение: BUG #13636: psql numericlocale adds comma where it ought not
The following bug has been logged on the website: Bug reference: 13636 Logged by: Jeff Janes Email address: jeff.janes@gmail.com PostgreSQL version: 9.4.4 Operating system: Linux Description: \pset numericlocale on select 1000000::real; float4 -------- 1e,+06 (1 row) There should not be a comma added between e and +. Same with other versions.
jeff.janes@gmail.com writes:
> \pset numericlocale on
> select 1000000::real;
> float4
> --------
> 1e,+06
> (1 row)
> There should not be a comma added between e and +.
Indeed. It looks like the author of format_numeric_locale() never
heard of e-format output. There's some other pretty crummy code in
there, but that's the core problem ...
regards, tom lane
On Fri, Sep 25, 2015 at 11:37 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote: > jeff.janes@gmail.com writes: >> \pset numericlocale on >> select 1000000::real; >> float4 >> -------- >> 1e,+06 >> (1 row) > >> There should not be a comma added between e and +. > > Indeed. It looks like the author of format_numeric_locale() never > heard of e-format output. There's some other pretty crummy code in > there, but that's the core problem ... Does this look reasonable? -- Thomas Munro http://www.enterprisedb.com
Вложения
Thomas Munro <thomas.munro@enterprisedb.com> writes:
> On Fri, Sep 25, 2015 at 11:37 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> Indeed. It looks like the author of format_numeric_locale() never
>> heard of e-format output. There's some other pretty crummy code in
>> there, but that's the core problem ...
> Does this look reasonable?
I thought it needed a rather more thoroughgoing revision: there is no need
for it to assume so much about what is in the column, and good reason for
it not to. (For instance, I note that psql will try to apply this code to
"money" columns, which may be a bad idea, but there can definitely be
stuff in there that doesn't look like a regular number.) It should muck
with digits immediately following the sign, and nothing else. There was
some other useless inefficiency too. I came up with the attached.
I would have borrowed your regression test additions, except I'm afraid
they will fail if the prevailing locale isn't C.
regards, tom lane
diff --git a/src/bin/psql/print.c b/src/bin/psql/print.c
index cab9e6e..f7343d1 100644
*** a/src/bin/psql/print.c
--- b/src/bin/psql/print.c
***************
*** 39,46 ****
*/
volatile bool cancel_pressed = false;
static char *decimal_point;
! static char *grouping;
static char *thousands_sep;
static char default_footer[100];
--- 39,47 ----
*/
volatile bool cancel_pressed = false;
+ /* info for locale-aware numeric formatting; set up by setDecimalLocale() */
static char *decimal_point;
! static int groupdigits;
static char *thousands_sep;
static char default_footer[100];
*************** static void IsPagerNeeded(const printTab
*** 196,239 ****
static void print_aligned_vertical(const printTableContent *cont, FILE *fout);
static int
integer_digits(const char *my_str)
{
! int frac_len;
!
! if (my_str[0] == '-')
my_str++;
!
! frac_len = strchr(my_str, '.') ? strlen(strchr(my_str, '.')) : 0;
!
! return strlen(my_str) - frac_len;
}
! /* Return additional length required for locale-aware numeric output */
static int
additional_numeric_locale_len(const char *my_str)
{
int int_len = integer_digits(my_str),
len = 0;
- int groupdigits = atoi(grouping);
! if (int_len > 0)
! /* Don't count a leading separator */
! len = (int_len / groupdigits - (int_len % groupdigits == 0)) *
! strlen(thousands_sep);
if (strchr(my_str, '.') != NULL)
! len += strlen(decimal_point) - strlen(".");
return len;
}
- static int
- strlen_with_numeric_locale(const char *my_str)
- {
- return strlen(my_str) + additional_numeric_locale_len(my_str);
- }
-
/*
* Returns the appropriately formatted string in a new allocated block,
* caller must free
--- 197,231 ----
static void print_aligned_vertical(const printTableContent *cont, FILE *fout);
+ /* Count number of digits in integral part of number */
static int
integer_digits(const char *my_str)
{
! /* ignoring any sign ... */
! if (my_str[0] == '-' || my_str[0] == '+')
my_str++;
! /* ... count initial integral digits */
! return strspn(my_str, "0123456789");
}
! /* Compute additional length required for locale-aware numeric output */
static int
additional_numeric_locale_len(const char *my_str)
{
int int_len = integer_digits(my_str),
len = 0;
! /* Account for added thousands_sep instances */
! if (int_len > groupdigits)
! len += ((int_len - 1) / groupdigits) * strlen(thousands_sep);
+ /* Account for possible additional length of decimal_point */
if (strchr(my_str, '.') != NULL)
! len += strlen(decimal_point) - 1;
return len;
}
/*
* Returns the appropriately formatted string in a new allocated block,
* caller must free
*************** strlen_with_numeric_locale(const char *m
*** 241,293 ****
static char *
format_numeric_locale(const char *my_str)
{
int i,
- j,
- int_len = integer_digits(my_str),
leading_digits;
! int groupdigits = atoi(grouping);
! int new_str_start = 0;
! char *new_str = pg_malloc(strlen_with_numeric_locale(my_str) + 1);
! leading_digits = (int_len % groupdigits != 0) ?
! int_len % groupdigits : groupdigits;
! if (my_str[0] == '-') /* skip over sign, affects grouping
! * calculations */
{
! new_str[0] = my_str[0];
my_str++;
- new_str_start = 1;
}
! for (i = 0, j = new_str_start;; i++, j++)
{
! /* Hit decimal point? */
! if (my_str[i] == '.')
{
! strcpy(&new_str[j], decimal_point);
! j += strlen(decimal_point);
! /* add fractional part */
! strcpy(&new_str[j], &my_str[i] + 1);
! break;
}
! /* End of string? */
! if (my_str[i] == '\0')
! {
! new_str[j] = '\0';
! break;
! }
! /* Add separator? */
! if (i != 0 && (i - leading_digits) % groupdigits == 0)
! {
! strcpy(&new_str[j], thousands_sep);
! j += strlen(thousands_sep);
! }
! new_str[j] = my_str[i];
! }
return new_str;
}
--- 233,283 ----
static char *
format_numeric_locale(const char *my_str)
{
+ int new_len = strlen(my_str) + additional_numeric_locale_len(my_str);
+ char *new_str = pg_malloc(new_len + 1);
+ int int_len = integer_digits(my_str);
int i,
leading_digits;
! int new_str_pos = 0;
! /* number of digits in first thousands group */
! leading_digits = int_len % groupdigits;
! if (leading_digits == 0)
! leading_digits = groupdigits;
! /* process sign */
! if (my_str[0] == '-' || my_str[0] == '+')
{
! new_str[new_str_pos++] = my_str[0];
my_str++;
}
! /* process integer part of number */
! for (i = 0; i < int_len; i++)
{
! /* Time to insert separator? */
! if (i > 0 && --leading_digits == 0)
{
! strcpy(&new_str[new_str_pos], thousands_sep);
! new_str_pos += strlen(thousands_sep);
! leading_digits = groupdigits;
}
+ new_str[new_str_pos++] = my_str[i];
+ }
! /* handle decimal point if any */
! if (my_str[i] == '.')
! {
! strcpy(&new_str[new_str_pos], decimal_point);
! new_str_pos += strlen(decimal_point);
! i++;
! }
! /* copy the rest (fractional digits and/or exponent, and \0 terminator) */
! strcpy(&new_str[new_str_pos], &my_str[i]);
! /* assert we didn't underestimate new_len (an overestimate is OK) */
! assert(strlen(new_str) <= new_len);
return new_str;
}
*************** setDecimalLocale(void)
*** 3241,3250 ****
decimal_point = pg_strdup(extlconv->decimal_point);
else
decimal_point = "."; /* SQL output standard */
if (*extlconv->grouping && atoi(extlconv->grouping) > 0)
! grouping = pg_strdup(extlconv->grouping);
else
! grouping = "3"; /* most common */
/* similar code exists in formatting.c */
if (*extlconv->thousands_sep)
--- 3231,3241 ----
decimal_point = pg_strdup(extlconv->decimal_point);
else
decimal_point = "."; /* SQL output standard */
+
if (*extlconv->grouping && atoi(extlconv->grouping) > 0)
! groupdigits = atoi(extlconv->grouping);
else
! groupdigits = 3; /* most common */
/* similar code exists in formatting.c */
if (*extlconv->thousands_sep)
On Fri, Sep 25, 2015 at 2:19 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote: > Thomas Munro <thomas.munro@enterprisedb.com> writes: >> On Fri, Sep 25, 2015 at 11:37 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote: >>> Indeed. It looks like the author of format_numeric_locale() never >>> heard of e-format output. There's some other pretty crummy code in >>> there, but that's the core problem ... > >> Does this look reasonable? > > I thought it needed a rather more thoroughgoing revision: there is no need > for it to assume so much about what is in the column, and good reason for > it not to. (For instance, I note that psql will try to apply this code to > "money" columns, which may be a bad idea, but there can definitely be > stuff in there that doesn't look like a regular number.) It should muck > with digits immediately following the sign, and nothing else. There was > some other useless inefficiency too. I came up with the attached. > > I would have borrowed your regression test additions, except I'm afraid > they will fail if the prevailing locale isn't C. Oops, right. (I suppose there could be a schedule of optional extra tests that somehow run with C locale for psql, but perhaps not worth setting up just for this.) -- Thomas Munro http://www.enterprisedb.com
Thomas Munro <thomas.munro@enterprisedb.com> writes:
> On Fri, Sep 25, 2015 at 2:19 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> I would have borrowed your regression test additions, except I'm afraid
>> they will fail if the prevailing locale isn't C.
> Oops, right. (I suppose there could be a schedule of optional extra
> tests that somehow run with C locale for psql, but perhaps not worth
> setting up just for this.)
Yeah, I thought about that too, and likewise decided it probably wasn't
worth the trouble, not yet anyway.
regards, tom lane
On Fri, Sep 25, 2015 at 4:06 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote: > Thomas Munro <thomas.munro@enterprisedb.com> writes: >> On Fri, Sep 25, 2015 at 2:19 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote: >>> I would have borrowed your regression test additions, except I'm afraid >>> they will fail if the prevailing locale isn't C. > >> Oops, right. (I suppose there could be a schedule of optional extra >> tests that somehow run with C locale for psql, but perhaps not worth >> setting up just for this.) > > Yeah, I thought about that too, and likewise decided it probably wasn't > worth the trouble, not yet anyway. About your follow-up commit 6325527d845b629243fb3f605af6747a7a4ac45f, I noticed that glibc localedata has some grouping values of 0 (no grouping at all), for example nl_NL, el_GR, hr_HR, it_IT, pl_PL, es_CU, pt_PT and we don't honour that, if it's 0 we use 3. All the rest begin with 3, except for unm_US which uses 2;2;2;3 (apparently a Delaware language), and I confirmed that now produces strings like "12 34 56", so I guess that obscure locale may be the only case that the commit actually changes on a glibc system. -- Thomas Munro http://www.enterprisedb.com
Thomas Munro <thomas.munro@enterprisedb.com> writes:
> About your follow-up commit 6325527d845b629243fb3f605af6747a7a4ac45f,
> I noticed that glibc localedata has some grouping values of 0 (no
> grouping at all), for example nl_NL, el_GR, hr_HR, it_IT, pl_PL,
> es_CU, pt_PT and we don't honour that, if it's 0 we use 3. All the
> rest begin with 3, except for unm_US which uses 2;2;2;3 (apparently a
> Delaware language), and I confirmed that now produces strings like "12
> 34 56", so I guess that obscure locale may be the only case that the
> commit actually changes on a glibc system.
Yeah, the locales where grouping isn't just 3 are so obscure that
I don't particularly care. If someone from one of those areas
wants to submit a feature patch to implement grouping more fully,
more power to 'em ...
However, I checked this morning and found that the MONEY case that
was niggling me yesterday is indeed a problem, for instance in de_DE:
$ LC_NUMERIC=de_DE psql regression
psql (9.6devel)
Type "help" for help.
regression=# set lc_monetary = 'de_DE';
SET
regression=# select '123456.78'::money;
money
-------------------
12.345.678,00 EUR
(1 row)
regression=# \pset numericlocale on
Locale-adjusted numeric output is on.
regression=# select '123456.78'::money;
money
-------------------
12,345.678,00 EUR
(1 row)
So we're gonna have to do something about that. I considered
a few fixes:
* Remove CASHOID from the set of datatypes that printQuery will choose
to right-justify. This seems likely to annoy people who are used to
having money amounts right-justified.
* Separate "use locale formatting" from "right justify", and apply only
the latter to CASHOID. This would be the cleanest fix but by far the
most invasive. I don't particularly want to do that much work and I
definitely wouldn't want to back-patch it.
* Put a hack into format_numeric_locale() so that it won't mess with
monetary output. This seems feasible because cash_out() always insists
on using a non-empty currency symbol. For example, we could check that
the string includes no characters outside "0123456789+-.eE" and feel
pretty safe that no money value would pass the check.
So I'm inclined to do the third one. Objections, better ideas?
regards, tom lane