pgsql: Teach UtfToLocal/LocalToUtf to support algorithmic encoding conv

Поиск
Список
Период
Сортировка
От Tom Lane
Тема pgsql: Teach UtfToLocal/LocalToUtf to support algorithmic encoding conv
Дата
Msg-id E1Yt5LO-0002Tl-4a@gemulon.postgresql.org
обсуждение исходный текст
Список pgsql-committers
Teach UtfToLocal/LocalToUtf to support algorithmic encoding conversions.

Until now, these functions have only supported encoding conversions using
lookup tables, which is fine as long as there are not too many code points
to convert.  However, GB18030 expects all 1.1 million Unicode code points
to be convertible, which would require a ridiculously-sized lookup table.
Fortunately, a large fraction of those conversions can be expressed through
arithmetic, i.e., the conversions are one-to-one in certain defined ranges.
To support that, provide a callback function that is used after consulting
the lookup tables.  (This patch doesn't actually change anything about the
GB18030 conversion behavior; it just provides infrastructure for fixing it.)

Since this requires changing the APIs of UtfToLocal/LocalToUtf anyway,
take the opportunity to rearrange their argument lists into what seems
to me a saner order.  And beautify the call sites by using lengthof()
instead of error-prone sizeof() arithmetic.

In passing, also mark all the lookup tables used by these calls "const".
This moves an impressive amount of stuff into the text segment, at least
on my machine, and is safer anyhow.

Branch
------
master

Details
-------
http://git.postgresql.org/pg/commitdiff/7730f48ede0d222e7f750541d3d5f0f74d75d99b

Modified Files
--------------
src/backend/utils/mb/Unicode/UCS_to_BIG5.pl        |    4 +-
src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl      |    4 +-
.../utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl        |    8 +-
src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl      |    4 +-
src/backend/utils/mb/Unicode/UCS_to_EUC_KR.pl      |    4 +-
src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl      |    4 +-
src/backend/utils/mb/Unicode/UCS_to_GB18030.pl     |    4 +-
.../utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl      |    8 +-
src/backend/utils/mb/Unicode/UCS_to_SJIS.pl        |    4 +-
src/backend/utils/mb/Unicode/UCS_to_most.pl        |    4 +-
src/backend/utils/mb/Unicode/big5_to_utf8.map      |    2 +-
src/backend/utils/mb/Unicode/euc_cn_to_utf8.map    |    2 +-
.../utils/mb/Unicode/euc_jis_2004_to_utf8.map      |    2 +-
.../mb/Unicode/euc_jis_2004_to_utf8_combined.map   |    2 +-
src/backend/utils/mb/Unicode/euc_jp_to_utf8.map    |    2 +-
src/backend/utils/mb/Unicode/euc_kr_to_utf8.map    |    2 +-
src/backend/utils/mb/Unicode/euc_tw_to_utf8.map    |    2 +-
src/backend/utils/mb/Unicode/gb18030_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/gbk_to_utf8.map       |    2 +-
.../utils/mb/Unicode/iso8859_10_to_utf8.map        |    2 +-
.../utils/mb/Unicode/iso8859_13_to_utf8.map        |    2 +-
.../utils/mb/Unicode/iso8859_14_to_utf8.map        |    2 +-
.../utils/mb/Unicode/iso8859_15_to_utf8.map        |    2 +-
.../utils/mb/Unicode/iso8859_16_to_utf8.map        |    2 +-
src/backend/utils/mb/Unicode/iso8859_2_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_3_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_4_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_5_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_6_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_7_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_8_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/iso8859_9_to_utf8.map |    2 +-
src/backend/utils/mb/Unicode/johab_to_utf8.map     |    2 +-
src/backend/utils/mb/Unicode/koi8r_to_utf8.map     |    2 +-
src/backend/utils/mb/Unicode/koi8u_to_utf8.map     |    2 +-
.../utils/mb/Unicode/shift_jis_2004_to_utf8.map    |    2 +-
.../mb/Unicode/shift_jis_2004_to_utf8_combined.map |    2 +-
src/backend/utils/mb/Unicode/sjis_to_utf8.map      |    2 +-
src/backend/utils/mb/Unicode/uhc_to_utf8.map       |    2 +-
src/backend/utils/mb/Unicode/utf8_to_big5.map      |    2 +-
src/backend/utils/mb/Unicode/utf8_to_euc_cn.map    |    2 +-
.../utils/mb/Unicode/utf8_to_euc_jis_2004.map      |    2 +-
.../mb/Unicode/utf8_to_euc_jis_2004_combined.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_euc_jp.map    |    2 +-
src/backend/utils/mb/Unicode/utf8_to_euc_kr.map    |    2 +-
src/backend/utils/mb/Unicode/utf8_to_euc_tw.map    |    2 +-
src/backend/utils/mb/Unicode/utf8_to_gb18030.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_gbk.map       |    2 +-
.../utils/mb/Unicode/utf8_to_iso8859_10.map        |    2 +-
.../utils/mb/Unicode/utf8_to_iso8859_13.map        |    2 +-
.../utils/mb/Unicode/utf8_to_iso8859_14.map        |    2 +-
.../utils/mb/Unicode/utf8_to_iso8859_15.map        |    2 +-
.../utils/mb/Unicode/utf8_to_iso8859_16.map        |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_2.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_3.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_4.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_5.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_6.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_7.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_8.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_iso8859_9.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_johab.map     |    2 +-
src/backend/utils/mb/Unicode/utf8_to_koi8r.map     |    2 +-
src/backend/utils/mb/Unicode/utf8_to_koi8u.map     |    2 +-
.../utils/mb/Unicode/utf8_to_shift_jis_2004.map    |    2 +-
.../mb/Unicode/utf8_to_shift_jis_2004_combined.map |    2 +-
src/backend/utils/mb/Unicode/utf8_to_sjis.map      |    2 +-
src/backend/utils/mb/Unicode/utf8_to_uhc.map       |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1250.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1251.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1252.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1253.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1254.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1255.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1256.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1257.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win1258.map   |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win866.map    |    2 +-
src/backend/utils/mb/Unicode/utf8_to_win874.map    |    2 +-
src/backend/utils/mb/Unicode/win1250_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1251_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1252_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1253_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1254_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1255_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1256_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1257_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win1258_to_utf8.map   |    2 +-
src/backend/utils/mb/Unicode/win866_to_utf8.map    |    2 +-
src/backend/utils/mb/Unicode/win874_to_utf8.map    |    2 +-
src/backend/utils/mb/conv.c                        |  327 ++++++++++----------
.../mb/conversion_procs/euc_tw_and_big5/big5.c     |   14 +-
.../conversion_procs/utf8_and_big5/utf8_and_big5.c |   14 +-
.../utf8_and_cyrillic/utf8_and_cyrillic.c          |   28 +-
.../utf8_and_euc2004/utf8_and_euc2004.c            |   18 +-
.../utf8_and_euc_cn/utf8_and_euc_cn.c              |   14 +-
.../utf8_and_euc_jp/utf8_and_euc_jp.c              |   14 +-
.../utf8_and_euc_kr/utf8_and_euc_kr.c              |   14 +-
.../utf8_and_euc_tw/utf8_and_euc_tw.c              |   14 +-
.../utf8_and_gb18030/utf8_and_gb18030.c            |   14 +-
.../conversion_procs/utf8_and_gbk/utf8_and_gbk.c   |   14 +-
.../utf8_and_iso8859/utf8_and_iso8859.c            |   80 ++---
.../utf8_and_johab/utf8_and_johab.c                |   14 +-
.../conversion_procs/utf8_and_sjis/utf8_and_sjis.c |   14 +-
.../utf8_and_sjis2004/utf8_and_sjis2004.c          |   18 +-
.../conversion_procs/utf8_and_uhc/utf8_and_uhc.c   |   14 +-
.../conversion_procs/utf8_and_win/utf8_and_win.c   |   72 +++--
src/include/mb/pg_wchar.h                          |   41 ++-
108 files changed, 538 insertions(+), 408 deletions(-)


В списке pgsql-committers по дате отправления:

Предыдущее
От: Simon Riggs
Дата:
Сообщение: pgsql: Separate block sampling functions
Следующее
От: Tom Lane
Дата:
Сообщение: pgsql: Honor traditional SGML NAMELEN limit.