Обсуждение: unexpected (to me) sorting order
Hi!
below are some commands to What am I missing?
begin;
create table T_SORT (
ID bigint default 1 not null , -- Primary Key
NAME varchar(100) default ' ' not null
);
alter table T_SORT add constraint T_SORTP1 primary key (
ID
);
insert into T_SORT values ( 1,'FINISH_110_150_1');
insert into T_SORT values ( 2,'FINISH_110_200_1');
insert into T_SORT values ( 3,'FINISH_1.10_20.0_3');
insert into T_SORT values ( 4,'FINISH_1.10_20.0_4');
insert into T_SORT values ( 5,'FINISH_1.10_30.0_3');
insert into T_SORT values ( 6,'FINISH_1.10_30.0_4');
insert into T_SORT values ( 7,'FINISH_120_150_1');
insert into T_SORT values ( 8,'FINISH_120_200_1');
select * from T_SORT order by NAME ;
rollback;
id | name
----+--------------------
1 | FINISH_110_150_1
2 | FINISH_110_200_1
3 | FINISH_1.10_20.0_3
4 | FINISH_1.10_20.0_4
5 | FINISH_1.10_30.0_3
6 | FINISH_1.10_30.0_4
7 | FINISH_120_150_1
8 | FINISH_120_200_1
(8 rows)
why is FINISH_1.10_20.0_3 between
FINISH_110_200_1 and
FINISH_120_150_1
?
That is why is '.' between 1 and 2 as in 110/120 ?
pg_admin III reports the database is created like
CREATE DATABASE bnl
WITH OWNER = bnl
ENCODING = 'UTF8'
TABLESPACE = pg_default
LC_COLLATE = 'en_US.UTF-8'
LC_CTYPE = 'en_US.UTF-8'
CONNECTION LIMIT = -1;
bnl=> select version();
version
--------------------------------------------------------------------------------------------------------------
PostgreSQL 9.3.3 on x86_64-unknown-linux-gnu, compiled by gcc (GCC) 4.6.3 20120306 (Red Hat 4.6.3-2), 64-bit
(1 row)
psql says
psql (9.3.5, server 9.3.3)
It is an Amazon RDS-service
client machine
bnl@prod:~$ uname -a
Linux prod 3.2.0-4-amd64 #1 SMP Debian 3.2.63-2+deb7u1 x86_64 GNU/Linux
bnl@prod:~$ locale
LANG=en_US.UTF-8
LANGUAGE=
LC_CTYPE="en_US.UTF-8"
LC_NUMERIC="en_US.UTF-8"
LC_TIME="en_US.UTF-8"
LC_COLLATE="en_US.UTF-8"
LC_MONETARY="en_US.UTF-8"
LC_MESSAGES="en_US.UTF-8"
LC_PAPER="en_US.UTF-8"
LC_NAME="en_US.UTF-8"
LC_ADDRESS="en_US.UTF-8"
LC_TELEPHONE="en_US.UTF-8"
LC_MEASUREMENT="en_US.UTF-8"
LC_IDENTIFICATION="en_US.UTF-8"
LC_ALL=
--
/Björn
> From: Björn Lundin <b.f.lundin@gmail.com> >To: pgsql-general@postgresql.org >Sent: Wednesday, 8 April 2015, 10:09 >Subject: [GENERAL] unexpected (to me) sorting order > > > >Hi! >below are some commands to >replicate a strange sorting order. > >I do not see why id:s 3-6 are in the middle of the result set. > >What am I missing? > > >begin; > >create table T_SORT ( > ID bigint default 1 not null , -- Primary Key > NAME varchar(100) default ' ' not null >); >alter table T_SORT add constraint T_SORTP1 primary key ( > ID >); > > >insert into T_SORT values ( 1,'FINISH_110_150_1'); >insert into T_SORT values ( 2,'FINISH_110_200_1'); >insert into T_SORT values ( 3,'FINISH_1.10_20.0_3'); >insert into T_SORT values ( 4,'FINISH_1.10_20.0_4'); >insert into T_SORT values ( 5,'FINISH_1.10_30.0_3'); >insert into T_SORT values ( 6,'FINISH_1.10_30.0_4'); >insert into T_SORT values ( 7,'FINISH_120_150_1'); >insert into T_SORT values ( 8,'FINISH_120_200_1'); > >select * from T_SORT order by NAME ; > >rollback; > id | name >----+-------------------- > 1 | FINISH_110_150_1 > 2 | FINISH_110_200_1 > 3 | FINISH_1.10_20.0_3 > 4 | FINISH_1.10_20.0_4 > 5 | FINISH_1.10_30.0_3 > 6 | FINISH_1.10_30.0_4 > 7 | FINISH_120_150_1 > 8 | FINISH_120_200_1 >(8 rows) > >why is FINISH_1.10_20.0_3 between > FINISH_110_200_1 and > FINISH_120_150_1 >? > >That is why is '.' between 1 and 2 as in 110/120 ? > > >pg_admin III reports the database is created like >CREATE DATABASE bnl > WITH OWNER = bnl > ENCODING = 'UTF8' > TABLESPACE = pg_default > LC_COLLATE = 'en_US.UTF-8' > LC_CTYPE = 'en_US.UTF-8' > CONNECTION LIMIT = -1; > > The collation of your "bnl" database is utf8, so the "." punctuation character is seen as a "variable element" and givena lower weighting in the sort to the rest of the characters. That's just how the collate algorithm works in UTF8. Try with LC_COLLATE = 'C' and it should sort how you expect.
> select * from T_SORT order by NAME ; > > rollback; > id | name > ----+-------------------- > 1 | FINISH_110_150_1 > 2 | FINISH_110_200_1 > 3 | FINISH_1.10_20.0_3 > 4 | FINISH_1.10_20.0_4 > 5 | FINISH_1.10_30.0_3 > 6 | FINISH_1.10_30.0_4 > 7 | FINISH_120_150_1 > 8 | FINISH_120_200_1 > (8 rows) Hi, PostreSQL relies on the OS's C lib. So this kind of ordering problems depend on the OS' idea about collations. I get the exact same order on 9.4.1 running on Centos 7.1: chris=# select * from T_SORT order by NAME ; id | name ----+-------------------- 1 | FINISH_110_150_1 2 | FINISH_110_200_1 3 | FINISH_1.10_20.0_3 4 | FINISH_1.10_20.0_4 5 | FINISH_1.10_30.0_3 6 | FINISH_1.10_30.0_4 7 | FINISH_120_150_1 8 | FINISH_120_200_1 (8 rows) But I get this on 9.3.5 running on OS X 10.8 chris=# select * from T_SORT order by NAME ; id | name ----+-------------------- 3 | FINISH_1.10_20.0_3 4 | FINISH_1.10_20.0_4 5 | FINISH_1.10_30.0_3 6 | FINISH_1.10_30.0_4 1 | FINISH_110_150_1 2 | FINISH_110_200_1 7 | FINISH_120_150_1 8 | FINISH_120_200_1 with both databases having Collate = en_US.UTF-8. If I put your data in a file and use the command sort from the shell I get the same effect (this is on the Centos 7.1 box): [chris@mercury ~]$ cat x FINISH_1.10_20.0_3 FINISH_1.10_20.0_4 FINISH_1.10_30.0_3 FINISH_1.10_30.0_4 FINISH_110_150_1 FINISH_110_200_1 FINISH_120_150_1 FINISH_120_200_1 [chris@mercury ~]$ sort x FINISH_110_150_1 FINISH_110_200_1 FINISH_1.10_20.0_3 FINISH_1.10_20.0_4 FINISH_1.10_30.0_3 FINISH_1.10_30.0_4 FINISH_120_150_1 FINISH_120_200_1 [chris@mercury ~]$ I don't know what's the rationale behin this, but it looks like Linux ignores the . when doing the sort. Bye, Chris.
On 2015-04-08 11:33, Glyn Astill wrote: > The collation of your "bnl" database is utf8, so the "." punctuation > character is seen as a "variable element" and given a lower weighting in > the sort to the rest of the characters. That's just how the collate algorithm works in UTF8. > Try with LC_COLLATE = 'C' and it should sort how you expect. > Ok. And as Chris Mair says in his answer, it looks like the '.' is ignored, and then I see the pattern Thanks -- Björn
On 2015-04-08 11:36, Chris Mair wrote: > > I don't know what's the rationale behin this, > but it looks like Linux ignores the . when doing the sort. > Yes, I see that now, and it makes sense Thanks. -- Björn
On Wed, 08 Apr 2015 11:36:01 +0200 Chris Mair <chris@1006.org> wrote: > PostreSQL relies on the OS's C lib. So this kind > of ordering problems depend on the OS' idea about > collations. > > I don't know what's the rationale behin this, > but it looks like Linux ignores the . when doing the sort. Not only '.'. Sorting by VARCHAR is not ASCII order, but "alphabetical" (whatever it means), which causes some behaviour to be more real-life oriented than logically coherent, like this one: select * from ejemplo order by texto; id | texto ----+---------- 2 | Lalin 1 | La Palma 3 | Lasarte It is pretty obvious that ' ' is not between 'l' and 's', but this makes more sense than 1 | La Palma 2 | Lalin 3 | Lasarte -- Alberto Cabello Sánchez <alberto@unex.es>
> From: Chris Mair <chris@1006.org> > To: Björn Lundin <b.f.lundin@gmail.com>; pgsql-general@postgresql.org > Cc: > Sent: Wednesday, 8 April 2015, 10:36 > Subject: Re: [GENERAL] unexpected (to me) sorting order > > >> select * from T_SORT order by NAME ; >> >> rollback; >> id | name >> ----+-------------------- >> 1 | FINISH_110_150_1 >> 2 | FINISH_110_200_1 >> 3 | FINISH_1.10_20.0_3 >> 4 | FINISH_1.10_20.0_4 >> 5 | FINISH_1.10_30.0_3 >> 6 | FINISH_1.10_30.0_4 >> 7 | FINISH_120_150_1 >> 8 | FINISH_120_200_1 >> (8 rows) > > Hi, > > PostreSQL relies on the OS's C lib. So this kind > of ordering problems depend on the OS' idea about > collations. > > I get the exact same order on 9.4.1 running on Centos 7.1: > > chris=# select * from T_SORT order by NAME ; > id | name > ----+-------------------- > 1 | FINISH_110_150_1 > 2 | FINISH_110_200_1 > 3 | FINISH_1.10_20.0_3 > 4 | FINISH_1.10_20.0_4 > 5 | FINISH_1.10_30.0_3 > 6 | FINISH_1.10_30.0_4 > 7 | FINISH_120_150_1 > 8 | FINISH_120_200_1 > (8 rows) > > But I get this on 9.3.5 running on OS X 10.8 > > chris=# select * from T_SORT order by NAME ; > id | name > ----+-------------------- > 3 | FINISH_1.10_20.0_3 > 4 | FINISH_1.10_20.0_4 > 5 | FINISH_1.10_30.0_3 > 6 | FINISH_1.10_30.0_4 > 1 | FINISH_110_150_1 > 2 | FINISH_110_200_1 > 7 | FINISH_120_150_1 > 8 | FINISH_120_200_1 > > with both databases having Collate = en_US.UTF-8. > > If I put your data in a file and use the command sort > from the shell I get the same effect (this is on > the Centos 7.1 box): > > [chris@mercury ~]$ cat x > FINISH_1.10_20.0_3 > FINISH_1.10_20.0_4 > FINISH_1.10_30.0_3 > FINISH_1.10_30.0_4 > FINISH_110_150_1 > FINISH_110_200_1 > FINISH_120_150_1 > FINISH_120_200_1 > > [chris@mercury ~]$ sort x > > FINISH_110_150_1 > FINISH_110_200_1 > FINISH_1.10_20.0_3 > FINISH_1.10_20.0_4 > FINISH_1.10_30.0_3 > FINISH_1.10_30.0_4 > FINISH_120_150_1 > FINISH_120_200_1 > [chris@mercury ~]$ > > I don't know what's the rationale behin this, > but it looks like Linux ignores the . when doing the sort. > > I think this is down to behaviour changes in glibc, there was a thread a while ago where somebody replicating via streamingrep between with different versions of glibc ended up in a bit of a pickle. http://www.postgresql.org/message-id/BA6132ED-1F6B-4A0B-AC22-81278F5AB81E@tripadvisor.com
On 2015-04-08 13:10, Glyn Astill wrote: >> From: Chris Mair <chris@1006.org> > > > > I think this is down to behaviour changes in glibc, there was a thread a while ago where somebody replicating via streamingrep between with different versions of glibc ended up in a bit of a pickle. > > http://www.postgresql.org/message-id/BA6132ED-1F6B-4A0B-AC22-81278F5AB81E@tripadvisor.com > interesting - thanks -- Björn
On Wed, Apr 8, 2015 at 3:33 AM, Glyn Astill <glynastill@yahoo.co.uk> wrote: > >> From: Björn Lundin <b.f.lundin@gmail.com> >>To: pgsql-general@postgresql.org >>Sent: Wednesday, 8 April 2015, 10:09 >>Subject: [GENERAL] unexpected (to me) sorting order >> >>select * from T_SORT order by NAME ; >> >>rollback; >> id | name >>----+-------------------- >> 1 | FINISH_110_150_1 >> 2 | FINISH_110_200_1 >> 3 | FINISH_1.10_20.0_3 >> 4 | FINISH_1.10_20.0_4 >> 5 | FINISH_1.10_30.0_3 >> 6 | FINISH_1.10_30.0_4 >> 7 | FINISH_120_150_1 >> 8 | FINISH_120_200_1 >>(8 rows) >> >>why is FINISH_1.10_20.0_3 between >> FINISH_110_200_1 and >> FINISH_120_150_1 >>? >> >>That is why is '.' between 1 and 2 as in 110/120 ? >> >> >>pg_admin III reports the database is created like >>CREATE DATABASE bnl >> WITH OWNER = bnl >> ENCODING = 'UTF8' >> TABLESPACE = pg_default >> LC_COLLATE = 'en_US.UTF-8' >> LC_CTYPE = 'en_US.UTF-8' >> CONNECTION LIMIT = -1; >> >> > > > > The collation of your "bnl" database is utf8, so the "." punctuation character is seen as a "variable element" and givena lower weighting in the sort to the rest of the characters. That's just how the collate algorithm works in UTF8. utf8 is an encoding method, not a collation. The collation is en_US, encoded in utf8. You can use C collation with utf8 encoding just fine. So just replace UTF8 with en_US in your sentence and you've got it right.
> From: Scott Marlowe <scott.marlowe@gmail.com>
> To: Glyn Astill <glynastill@yahoo.co.uk>
> Cc: Björn Lundin <b.f.lundin@gmail.com>; "pgsql-general@postgresql.org" <pgsql-general@postgresql.org>
> Sent: Thursday, 9 April 2015, 13:23
> Subject: Re: [GENERAL] unexpected (to me) sorting order
>
> On Wed, Apr 8, 2015 at 3:33 AM, Glyn Astill <glynastill@yahoo.co.uk>
> wrote:
>
>>
>>> From: Björn Lundin <b.f.lundin@gmail.com>
>>> To: pgsql-general@postgresql.org
>>> Sent: Wednesday, 8 April 2015, 10:09
>>> Subject: [GENERAL] unexpected (to me) sorting order
>>>
>>> select * from T_SORT order by NAME ;
>>>
>>> rollback;
>>> id | name
>>> ----+--------------------
>>> 1 | FINISH_110_150_1
>>> 2 | FINISH_110_200_1
>>> 3 | FINISH_1.10_20.0_3
>>> 4 | FINISH_1.10_20.0_4
>>> 5 | FINISH_1.10_30.0_3
>>> 6 | FINISH_1.10_30.0_4
>>> 7 | FINISH_120_150_1
>>> 8 | FINISH_120_200_1
>>> (8 rows)
>>>
>>> why is FINISH_1.10_20.0_3 between
>>> FINISH_110_200_1 and
>>> FINISH_120_150_1
>>> ?
>>>
>>> That is why is '.' between 1 and 2 as in 110/120 ?
>>>
>>>
>>> pg_admin III reports the database is created like
>>> CREATE DATABASE bnl
>>> WITH OWNER = bnl
>>> ENCODING = 'UTF8'
>>> TABLESPACE = pg_default
>>> LC_COLLATE = 'en_US.UTF-8'
>>> LC_CTYPE = 'en_US.UTF-8'
>>> CONNECTION LIMIT = -1;
>>>
>>>
>>
>>
>>
>> The collation of your "bnl" database is utf8, so the
> "." punctuation character is seen as a "variable element"
> and given a lower weighting in the sort to the rest of the characters.
> That's just how the collate algorithm works in UTF8.
>
> utf8 is an encoding method, not a collation. The collation is en_US,
> encoded in utf8. You can use C collation with utf8 encoding just fine.
> So just replace UTF8 with en_US in your sentence and you've got it
> right.
> Cc: Björn Lundin <b.f.lundin@gmail.com>; "pgsql-general@postgresql.org" <pgsql-general@postgresql.org>
> Sent: Thursday, 9 April 2015, 13:23
> Subject: Re: [GENERAL] unexpected (to me) sorting order
>
> On Wed, Apr 8, 2015 at 3:33 AM, Glyn Astill <glynastill@yahoo.co.uk>
> wrote:
>
>>
>>> From: Björn Lundin <b.f.lundin@gmail.com>
>>> To: pgsql-general@postgresql.org
>>> Sent: Wednesday, 8 April 2015, 10:09
>>> Subject: [GENERAL] unexpected (to me) sorting order
>>>
>>> select * from T_SORT order by NAME ;
>>>
>>> rollback;
>>> id | name
>>> ----+--------------------
>>> 1 | FINISH_110_150_1
>>> 2 | FINISH_110_200_1
>>> 3 | FINISH_1.10_20.0_3
>>> 4 | FINISH_1.10_20.0_4
>>> 5 | FINISH_1.10_30.0_3
>>> 6 | FINISH_1.10_30.0_4
>>> 7 | FINISH_120_150_1
>>> 8 | FINISH_120_200_1
>>> (8 rows)
>>>
>>> why is FINISH_1.10_20.0_3 between
>>> FINISH_110_200_1 and
>>> FINISH_120_150_1
>>> ?
>>>
>>> That is why is '.' between 1 and 2 as in 110/120 ?
>>>
>>>
>>> pg_admin III reports the database is created like
>>> CREATE DATABASE bnl
>>> WITH OWNER = bnl
>>> ENCODING = 'UTF8'
>>> TABLESPACE = pg_default
>>> LC_COLLATE = 'en_US.UTF-8'
>>> LC_CTYPE = 'en_US.UTF-8'
>>> CONNECTION LIMIT = -1;
>>>
>>>
>>
>>
>>
>> The collation of your "bnl" database is utf8, so the
> "." punctuation character is seen as a "variable element"
> and given a lower weighting in the sort to the rest of the characters.
> That's just how the collate algorithm works in UTF8.
>
> utf8 is an encoding method, not a collation. The collation is en_US,
> encoded in utf8. You can use C collation with utf8 encoding just fine.
> So just replace UTF8 with en_US in your sentence and you've got it
> right.
>
Yes, thanks for the correction there, and we're talking about the wider unicode collate algorithm.
8.4.2015, 12:09, Björn Lundin kirjoitti: > Hi! > below are some commands to > replicate a strange sorting order. > > I do not see why id:s 3-6 are in the middle of the result set. > What am I missing? > > begin; > > create table T_SORT ( > ID bigint default 1 not null , -- Primary Key > NAME varchar(100) default ' ' not null > ); > alter table T_SORT add constraint T_SORTP1 primary key ( > ID > ); > > > insert into T_SORT values ( 1,'FINISH_110_150_1'); > insert into T_SORT values ( 2,'FINISH_110_200_1'); > insert into T_SORT values ( 3,'FINISH_1.10_20.0_3'); > insert into T_SORT values ( 4,'FINISH_1.10_20.0_4'); > insert into T_SORT values ( 5,'FINISH_1.10_30.0_3'); > insert into T_SORT values ( 6,'FINISH_1.10_30.0_4'); > insert into T_SORT values ( 7,'FINISH_120_150_1'); > insert into T_SORT values ( 8,'FINISH_120_200_1'); > > select * from T_SORT order by NAME ; > > rollback; > id | name > ----+-------------------- > 1 | FINISH_110_150_1 > 2 | FINISH_110_200_1 > 3 | FINISH_1.10_20.0_3 > 4 | FINISH_1.10_20.0_4 > 5 | FINISH_1.10_30.0_3 > 6 | FINISH_1.10_30.0_4 > 7 | FINISH_120_150_1 > 8 | FINISH_120_200_1 > (8 rows) > > why is FINISH_1.10_20.0_3 between > FINISH_110_200_1 and > FINISH_120_150_1 > ? > > That is why is '.' between 1 and 2 as in 110/120 ? > > > pg_admin III reports the database is created like > CREATE DATABASE bnl > WITH OWNER = bnl > ENCODING = 'UTF8' > TABLESPACE = pg_default > LC_COLLATE = 'en_US.UTF-8' > LC_CTYPE = 'en_US.UTF-8' > CONNECTION LIMIT = -1; > > > bnl=> select version(); > version > -------------------------------------------------------------------------------------------------------------- > PostgreSQL 9.3.3 on x86_64-unknown-linux-gnu, compiled by gcc (GCC) > 4.6.3 20120306 (Red Hat 4.6.3-2), 64-bit > (1 row) > > psql says > psql (9.3.5, server 9.3.3) > It is an Amazon RDS-service > > > client machine > > bnl@prod:~$ uname -a > Linux prod 3.2.0-4-amd64 #1 SMP Debian 3.2.63-2+deb7u1 x86_64 GNU/Linux > > bnl@prod:~$ locale > LANG=en_US.UTF-8 > LANGUAGE= > LC_CTYPE="en_US.UTF-8" > LC_NUMERIC="en_US.UTF-8" > LC_TIME="en_US.UTF-8" > LC_COLLATE="en_US.UTF-8" > LC_MONETARY="en_US.UTF-8" > LC_MESSAGES="en_US.UTF-8" > LC_PAPER="en_US.UTF-8" > LC_NAME="en_US.UTF-8" > LC_ADDRESS="en_US.UTF-8" > LC_TELEPHONE="en_US.UTF-8" > LC_MEASUREMENT="en_US.UTF-8" > LC_IDENTIFICATION="en_US.UTF-8" > LC_ALL= > > > -- > /Björn Add some more letters lower/upper and so on. Then compare sorting ex. ö/z. Or look my some ex. 0/!/letter order with or without other chars. insert into T_SORT values ( 10,'FINISH_Z'); insert into T_SORT values ( 11,'FINISH_a'); insert into T_SORT values ( 12,'FINISH_b'); insert into T_SORT values ( 13,'FINISH_A'); insert into T_SORT values ( 14,'FINISH_B'); insert into T_SORT values ( 15,'FINISH_ä'); insert into T_SORT values ( 16,'FINISH_Ä'); insert into T_SORT values ( 17,'FINISH_+'); insert into T_SORT values ( 18,'FINISH_@'); insert into T_SORT values ( 19,'FINISH_='); insert into T_SORT values ( 20,'FINISH_]'); insert into T_SORT values ( 21,'FINISH_a0a'); insert into T_SORT values ( 22,'FINISH_a!a'); insert into T_SORT values ( 23,'FINISH_!'); insert into T_SORT values ( 24,'FINISH_012'); insert into T_SORT values ( 25,'FINISH_0aa'); insert into T_SORT values ( 26,'FINISH_!aa'); insert into T_SORT values ( 27,'FINISH_0'); insert into T_SORT values ( 28,'FINISH_!b!b'); insert into T_SORT values ( 29,'FINISH_a!b'); insert into T_SORT values ( 30,'FINISH_b!a'); insert into T_SORT values ( 31,'FINISH_!ab'); insert into T_SORT values ( 32,'FINISH_!b!a'); select * from T_SORT order by NAME ; -- use your db LC_COLLATE -- using COLLATE = you have installed those locales in -- your system, PG use those. select * from T_SORT ORDER BY name COLLATE "en_US" ; select * from T_SORT ORDER BY name COLLATE "fi_FI" ; select * from T_SORT ORDER BY name COLLATE "C" ; -- sorting weight = ascii value - simple select * from T_SORT ORDER BY name COLLATE "POSIX" ; select * from T_SORT ORDER BY name COLLATE "de_DE" ; Sorting - it's not so easy ... but with COLLATE option you can "fix" your order if you need / as you want - almost. http://en.wikipedia.org/wiki/ISO_14651 http://en.wikipedia.org/wiki/European_ordering_rules http://standards.iso.org/ittf/PubliclyAvailableStandards/index.html - ISO/IEC 14651:2011/Amd 1:2012 https://www.debian.org/doc/manuals/intro-i18n/ - how the library works http://en.wikipedia.org/wiki/Internationalization_and_localization ... https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/localeCompare ... -jukka-
9.4.2015, 15:43, Glyn Astill kirjoitti: > > From: Scott Marlowe <scott.marlowe@gmail.com> > > To: Glyn Astill <glynastill@yahoo.co.uk> > > Cc: Björn Lundin <b.f.lundin@gmail.com>; > "pgsql-general@postgresql.org" <pgsql-general@postgresql.org> > > Sent: Thursday, 9 April 2015, 13:23 > > Subject: Re: [GENERAL] unexpected (to me) sorting order > > > > On Wed, Apr 8, 2015 at 3:33 AM, Glyn Astill <glynastill@yahoo.co.uk> > > wrote: > > > >> > >>> From: Björn Lundin <b.f.lundin@gmail.com> > >>> To: pgsql-general@postgresql.org > >>> Sent: Wednesday, 8 April 2015, 10:09 > >>> Subject: [GENERAL] unexpected (to me) sorting order > >>> > >>> select * from T_SORT order by NAME ; > >>> > >>> rollback; > >>> id | name > >>> ----+-------------------- > >>> 1 | FINISH_110_150_1 > >>> 2 | FINISH_110_200_1 > >>> 3 | FINISH_1.10_20.0_3 > >>> 4 | FINISH_1.10_20.0_4 > >>> 5 | FINISH_1.10_30.0_3 > >>> 6 | FINISH_1.10_30.0_4 > >>> 7 | FINISH_120_150_1 > >>> 8 | FINISH_120_200_1 > >>> (8 rows) > >>> > >>> why is FINISH_1.10_20.0_3 between > >>> FINISH_110_200_1 and > >>> FINISH_120_150_1 > >>> ? > >>> > >>> That is why is '.' between 1 and 2 as in 110/120 ? > >>> > >>> > >>> pg_admin III reports the database is created like > >>> CREATE DATABASE bnl > >>> WITH OWNER = bnl > >>> ENCODING = 'UTF8' > >>> TABLESPACE = pg_default > >>> LC_COLLATE = 'en_US.UTF-8' > >>> LC_CTYPE = 'en_US.UTF-8' > >>> CONNECTION LIMIT = -1; > >>> > >>> > >> > >> > >> > >> The collation of your "bnl" database is utf8, so the > > "." punctuation character is seen as a "variable element" > > and given a lower weighting in the sort to the rest of the characters. > > That's just how the collate algorithm works in UTF8. > > > > utf8 is an encoding method, not a collation. The collation is en_US, > > encoded in utf8. You can use C collation with utf8 encoding just fine. > > So just replace UTF8 with en_US in your sentence and you've got it > > right. > > > > Yes, thanks for the correction there, and we're talking about the wider > unicode collate algorithm. Add some more letters lower/upper and so on. Then compare sorting ex. ö/z. Or look 0/! order with or without other chars. We have so many "sorting rules" and standards. insert into T_SORT values ( 10,'FINISH_Z'); insert into T_SORT values ( 11,'FINISH_a'); insert into T_SORT values ( 12,'FINISH_b'); insert into T_SORT values ( 13,'FINISH_A'); insert into T_SORT values ( 14,'FINISH_B'); insert into T_SORT values ( 15,'FINISH_ä'); insert into T_SORT values ( 16,'FINISH_Ä'); insert into T_SORT values ( 17,'FINISH_+'); insert into T_SORT values ( 18,'FINISH_@'); insert into T_SORT values ( 19,'FINISH_='); insert into T_SORT values ( 20,'FINISH_]'); insert into T_SORT values ( 21,'FINISH_a0a'); insert into T_SORT values ( 22,'FINISH_a!a'); insert into T_SORT values ( 23,'FINISH_!'); insert into T_SORT values ( 24,'FINISH_012'); insert into T_SORT values ( 25,'FINISH_0aa'); insert into T_SORT values ( 26,'FINISH_!aa'); insert into T_SORT values ( 27,'FINISH_0'); select * from T_SORT order by NAME ; -- use your db LC_COLLATE -- using COLLATE need that you have installed those locales in -- your system, PG use those. select * from T_SORT ORDER BY name COLLATE "en_US" ; select * from T_SORT ORDER BY name COLLATE "fi_FI" ; select * from T_SORT ORDER BY name COLLATE "C" ; select * from T_SORT ORDER BY name COLLATE "POSIX" ; select * from T_SORT ORDER BY name COLLATE "de_DE" ; Sorting - it's not so easy ... but with COLLATE option you can "fix" your order if you need / as you want http://en.wikipedia.org/wiki/ISO_14651 http://en.wikipedia.org/wiki/European_ordering_rules http://standards.iso.org/ittf/PubliclyAvailableStandards/index.html - ISO/IEC 14651:2011/Amd 1:2012 https://www.debian.org/doc/manuals/intro-i18n/ - how the library works http://en.wikipedia.org/wiki/Internationalization_and_localization ... https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/localeCompare ... -jukka-