Re: The "char" type versus non-ASCII characters

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: The "char" type versus non-ASCII characters
Дата
Msg-id 2537044.1657747461@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: The "char" type versus non-ASCII characters  (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы Re: The "char" type versus non-ASCII characters  (Tom Lane <tgl@sss.pgh.pa.us>)
Список pgsql-hackers
I wrote:
> Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
>> I think we could consider char to be a single-byte bytea and use the
>> escape format of bytea for char.  That way there is some precedent and
>> we don't add yet another encoding or escape format.

> Do you want to take that as far as changing backslash to print
> as '\\' ?

This came up again today [1], so here's a concrete proposal.
Let's use \ooo for high-bit-set chars, but keep backslash as just
backslash (so it's only semi-compatible with bytea).

            regards, tom lane

[1] https://www.postgresql.org/message-id/CAFM5RapGbBQm%2BdH%3D7K80HcvBvEWiV5Tm7N%3DNRaYURfm98YWc8A%40mail.gmail.com

diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c
index 0df41c2253..e50293bf14 100644
--- a/src/backend/utils/adt/char.c
+++ b/src/backend/utils/adt/char.c
@@ -20,6 +20,11 @@
 #include "libpq/pqformat.h"
 #include "utils/builtins.h"

+#define ISOCTAL(c)   (((c) >= '0') && ((c) <= '7'))
+#define TOOCTAL(c)   ((c) + '0')
+#define FROMOCTAL(c) ((unsigned char) (c) - '0')
+
+
 /*****************************************************************************
  *     USER I/O ROUTINES                                                         *
  *****************************************************************************/
@@ -27,31 +32,53 @@
 /*
  *        charin            - converts "x" to 'x'
  *
- * Note that an empty input string will implicitly be converted to \0.
+ * This accepts the formats charout produces.  If we have multibyte input
+ * that is not in the form '\ooo', then we take its first byte as the value
+ * and silently discard the rest; this is a backwards-compatibility provision.
  */
 Datum
 charin(PG_FUNCTION_ARGS)
 {
     char       *ch = PG_GETARG_CSTRING(0);

+    if (strlen(ch) == 4 && ch[0] == '\\' &&
+        ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+        PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
+                       (FROMOCTAL(ch[2]) << 3) +
+                       FROMOCTAL(ch[3]));
+    /* This will do the right thing for a zero-length input string */
     PG_RETURN_CHAR(ch[0]);
 }

 /*
  *        charout            - converts 'x' to "x"
  *
- * Note that if the char value is \0, the resulting string will appear
- * to be empty (null-terminated after zero characters).  So this is the
- * inverse of the charin() function for such data.
+ * The possible output formats are:
+ * 1. 0x00 is represented as an empty string.
+ * 2. 0x01..0x7F are represented as a single ASCII byte.
+ * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
+ * Case 3 is meant to match the traditional "escape" format of bytea.
  */
 Datum
 charout(PG_FUNCTION_ARGS)
 {
     char        ch = PG_GETARG_CHAR(0);
-    char       *result = (char *) palloc(2);
+    char       *result = (char *) palloc(5);

-    result[0] = ch;
-    result[1] = '\0';
+    if (IS_HIGHBIT_SET(ch))
+    {
+        result[0] = '\\';
+        result[1] = TOOCTAL(((unsigned char) ch) >> 6);
+        result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
+        result[3] = TOOCTAL(((unsigned char) ch) & 07);
+        result[4] = '\0';
+    }
+    else
+    {
+        /* This produces acceptable results for 0x00 as well */
+        result[0] = ch;
+        result[1] = '\0';
+    }
     PG_RETURN_CSTRING(result);
 }

@@ -176,15 +203,20 @@ Datum
 text_char(PG_FUNCTION_ARGS)
 {
     text       *arg1 = PG_GETARG_TEXT_PP(0);
+    char       *ch = VARDATA_ANY(arg1);
     char        result;

     /*
-     * An empty input string is converted to \0 (for consistency with charin).
-     * If the input is longer than one character, the excess data is silently
-     * discarded.
+     * Conversion rules are the same as in charin(), but here we need to
+     * handle the empty-string case honestly.
      */
-    if (VARSIZE_ANY_EXHDR(arg1) > 0)
-        result = *(VARDATA_ANY(arg1));
+    if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
+        ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
+        result = (FROMOCTAL(ch[1]) << 6) +
+            (FROMOCTAL(ch[2]) << 3) +
+            FROMOCTAL(ch[3]);
+    else if (VARSIZE_ANY_EXHDR(arg1) > 0)
+        result = ch[0];
     else
         result = '\0';

@@ -195,13 +227,21 @@ Datum
 char_text(PG_FUNCTION_ARGS)
 {
     char        arg1 = PG_GETARG_CHAR(0);
-    text       *result = palloc(VARHDRSZ + 1);
+    text       *result = palloc(VARHDRSZ + 4);

     /*
-     * Convert \0 to an empty string, for consistency with charout (and
-     * because the text stuff doesn't like embedded nulls all that well).
+     * Conversion rules are the same as in charout(), but here we need to be
+     * honest about converting 0x00 to an empty string.
      */
-    if (arg1 != '\0')
+    if (IS_HIGHBIT_SET(arg1))
+    {
+        SET_VARSIZE(result, VARHDRSZ + 4);
+        (VARDATA(result))[0] = '\\';
+        (VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
+        (VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
+        (VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
+    }
+    else if (arg1 != '\0')
     {
         SET_VARSIZE(result, VARHDRSZ + 1);
         *(VARDATA(result)) = arg1;

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Andrew Dunstan
Дата:
Сообщение: Re: proposal: possibility to read dumped table's name from file
Следующее
От: Tom Lane
Дата:
Сообщение: Re: Bug: Reading from single byte character column type may cause out of bounds memory reads.