Re: Errors in our encoding conversion tables

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: Errors in our encoding conversion tables
Дата
Msg-id 25721.1448598653@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: Errors in our encoding conversion tables  (Tatsuo Ishii <ishii@postgresql.org>)
Список pgsql-hackers
Tatsuo Ishii <ishii@postgresql.org> writes:
> I have started looking into it. I wonder how you created this part
> of your patch:

The code I used is below.

> In the above you seem to disable the conversion from 0x96 of win1250
> to ISO-8859-2 by using the Unicode mapping files in
> src/backend/utils/mb/Unicode. But the corresponding mapping file
> (iso8859_2_to_utf8.map) does include the following entry:

>   {0x0096, 0xc296},

> How do you know 0x96 should be removed from the conversion?

Right, but there is no mapping in the win1250-utf8 files that matches
0xC296 (the UTF-8 encoding of U+0096).  The complaint over in the other
thread is precisely that we have no business translating 0x96 in WIN1250
to this character.  What WIN1250 0x96 could translate to is 0xE28093
(the UTF-8 encoding of U+2013 EN DASH, at least according to
win1250_to_utf8.map) but that Unicode character has no equivalent in
LATIN2.

AFAICS, whoever made these tables just arbitrarily decided that 0x96
in WIN1250 could be mapped to 0x96 in LATIN2, and likewise for a number
of other codes; but those are false equivalences, as you find out if
you try to perform the same conversion via other encoding conversion
paths, ie convert to UTF8 and then to the other encoding.

            regards, tom lane

#include "c.h"
#include "mb/pg_wchar.h"

#include "src/backend/utils/mb/Unicode/iso8859_2_to_utf8.map"
#include "src/backend/utils/mb/Unicode/iso8859_5_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win1250_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win1251_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win866_to_utf8.map"
#include "src/backend/utils/mb/Unicode/koi8r_to_utf8.map"
#include "src/backend/utils/mb/Unicode/koi8u_to_utf8.map"


/*
 * Describes one byte-to-byte conversion table to generate.
 *
 * A source byte is looked up in map1 to obtain its "utf" value, and that
 * value is then searched for in map2 to obtain the destination byte.  The
 * resulting table is printed as a C array named tabname.
 */
typedef struct
{
    const pg_local_to_utf *map1;    /* source encoding's to-UTF8 map */
    int            size1;            /* number of entries in map1 */
    const pg_local_to_utf *map2;    /* destination encoding's to-UTF8 map */
    int            size2;            /* number of entries in map2 */
    const char *tabname;            /* name given to the generated C array */
    int            upper;            /* nonzero: print hex digits in upper case */
} pg_conv_map;

/*
 * Build one pg_conv_map initializer.  Each map is paired with its own
 * element count here, so a map and its size cannot get out of step.
 */
#define CONV_ENTRY(from, to, name, upper) \
    { from, lengthof(from), to, lengthof(to), name, upper }

/*
 * The conversion tables to generate, in output order.  Every pair of
 * encodings appears once in each direction.
 */
static const pg_conv_map maps[] = {
    CONV_ENTRY(LUmapWIN1250, LUmapISO8859_2, "win1250_2_iso88592", 1),
    CONV_ENTRY(LUmapISO8859_2, LUmapWIN1250, "iso88592_2_win1250", 1),
    CONV_ENTRY(LUmapISO8859_5, LUmapKOI8R, "iso2koi", 0),
    CONV_ENTRY(LUmapKOI8R, LUmapISO8859_5, "koi2iso", 0),
    CONV_ENTRY(LUmapWIN1251, LUmapKOI8R, "win2koi", 0),
    CONV_ENTRY(LUmapKOI8R, LUmapWIN1251, "koi2win", 0),
    CONV_ENTRY(LUmapWIN866, LUmapKOI8R, "win8662koi", 0),
    CONV_ENTRY(LUmapKOI8R, LUmapWIN866, "koi2win866", 0),
};

#undef CONV_ENTRY

/*
 * Print one generated conversion table, as C source, on stdout.
 *
 * For every source byte 0x80..0xff: find the first map1 entry whose code
 * equals the byte and take its "utf" value; if that value is nonzero, find
 * the first map2 entry with the same "utf" value and take its code.  A byte
 * with no match at either step is emitted as 0.  Entries are written eight
 * per line, with upper-case hex digits when info->upper is nonzero.
 */
static void
domap(const pg_conv_map *info)
{
    uint32 src;

    printf("    static const unsigned char %s[] = {\n", info->tabname);

    for (src = 0x80; src <= 0xff; src++)
    {
        uint32 utf = 0;
        uint32 dst = 0;
        uint32 column = src % 8;
        int pos;

        /* First hop: source byte -> utf value (first matching entry wins). */
        pos = 0;
        while (pos < info->size1 && info->map1[pos].code != src)
            pos++;
        if (pos < info->size1)
            utf = info->map1[pos].utf;

        /* Second hop: utf value -> destination byte, only if the first hit. */
        if (utf != 0)
        {
            pos = 0;
            while (pos < info->size2 && info->map2[pos].utf != utf)
                pos++;
            if (pos < info->size2)
                dst = info->map2[pos].code;
        }
#if 0
        if (dst)
            printf("0x%02x maps to 0x%02x via U+%04X\n", src, dst, utf);
        else
            printf("0x%02x has no equivalent\n", src);
#endif
        /* Start of a row of eight: indent it. */
        if (column == 0)
            fputs("\t\t", stdout);

        printf(info->upper ? "0x%02X" : "0x%02x", dst);

        /* The final entry gets no trailing comma; otherwise break rows at 8. */
        if (src == 0xff)
            fputs("\n", stdout);
        else
            fputs(column == 7 ? ",\n" : ", ", stdout);
    }
    printf("\t};\n\n");
}

int
main()
{
    int i;

    for (i = 0; i < lengthof(maps); i++)
        domap(maps + i);

    return 0;
}

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Tatsuo Ishii
Дата:
Сообщение: Re: Errors in our encoding conversion tables
Следующее
От: Tom Lane
Дата:
Сообщение: Re: WIP: About CMake v2