Re: COPY for CSV documentation
| От | Andrew Dunstan |
|---|---|
| Тема | Re: COPY for CSV documentation |
| Дата | |
| Msg-id | 407AC373.2010106@dunslane.net обсуждение исходный текст |
| Ответ на | Re: COPY for CSV documentation (Bruce Momjian <pgman@candle.pha.pa.us>) |
| Ответы |
Re: COPY for CSV documentation
Re: COPY for CSV documentation Updated COPY CSV patch |
| Список | pgsql-patches |
Bruce Momjian wrote:
>Andrew Dunstan wrote:
>
>
>>In fact, in the patch I sent in, no quoted string is marked as null when
>>being read (so even if you use \N as the null marker, "\N" will be that
>>literal and not a null marker). And the null marker, whatever it is,
>>should be made quote safe by us throwing an error if it contains the
>>quote marker, just as we now make sure that the null marker is
>>delimiter-safe.
>>
>>
>
>What value does an int column get if the input file has ',,'. Don't
>tell me zero? :-) Error?
>
>
If the null marker is not an empty string, it gets an error, of course -
if it is it gets a null:
[andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table
foo (a int, b text, c text); copy foo from stdin delimiter ',\"' null
'\\\\N';"
ERROR: invalid input syntax for integer: ""
CONTEXT: COPY foo, line 1, column a: ""
[andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table
foo (a int, b text, c text); copy foo from stdin delimiter ',\"' ;"
[andrew@marmaduke pginst]$
I hope that is expected behaviour - it's what *I* expect, at least.
>
>
>>I will check on the write behaviour - it might need ammending too.
>>
>>I'll submit a revised patch based on the original syntax scheme, and
>>then you (Bruce) can make the syntax/psql changes that seem to be agreed
>>on now - is that ok?
>>
>>
>
>OK, go as far as you want and post it. I will turn around a new patch
>in a few hours after you post.
>
>
>
>>The default NULL value issue can be determined at the end of any
>>exhaustive debate we have - in the end it's a one line code change ;-)
>>
>>
>
>Agreed.
>
>
>
Attached patch has these additions to previously posted patch:
. quote character may not appear in NULL marker
. any non-null value that matches the NULL marker is forced to be quoted
when written.
cheers
andrew
Index: src/backend/commands/copy.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v
retrieving revision 1.219
diff -c -r1.219 copy.c
*** src/backend/commands/copy.c 6 Apr 2004 13:21:33 -0000 1.219
--- src/backend/commands/copy.c 12 Apr 2004 16:21:33 -0000
***************
*** 70,76 ****
typedef enum CopyReadResult
{
NORMAL_ATTR,
! END_OF_LINE
} CopyReadResult;
/*
--- 70,77 ----
typedef enum CopyReadResult
{
NORMAL_ATTR,
! END_OF_LINE,
! UNTERMINATED_FIELD
} CopyReadResult;
/*
***************
*** 136,144 ****
--- 137,148 ----
static bool CopyReadLine(void);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
+ static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
+ CopyReadResult *result, bool *isnull);
static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo,
Oid typelem, bool *isnull);
static void CopyAttributeOut(char *string, char *delim);
+ static void CopyAttributeOutCSV(char *string, char *delim, bool force_quote);
static List *CopyGetAttnums(Relation rel, List *attnamelist);
static void limit_printout_length(StringInfo buf);
***************
*** 682,687 ****
--- 686,692 ----
List *attnumlist;
bool binary = false;
bool oids = false;
+ bool csv_mode = false;
char *delim = NULL;
char *null_print = NULL;
Relation rel;
***************
*** 744,751 ****
if (!delim)
delim = "\t";
if (!null_print)
! null_print = "\\N";
/*
* Open and lock the relation, using the appropriate lock type.
--- 749,759 ----
if (!delim)
delim = "\t";
+ if (strlen(delim) > 1)
+ csv_mode = true;
+
if (!null_print)
! null_print = csv_mode ? "" : "\\N";
/*
* Open and lock the relation, using the appropriate lock type.
***************
*** 772,783 ****
"psql's \\copy command also works for anyone.")));
/*
! * Presently, only single-character delimiter strings are supported.
*/
! if (strlen(delim) != 1)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("COPY delimiter must be a single character")));
/*
* Don't allow the delimiter to appear in the null string.
--- 780,806 ----
"psql's \\copy command also works for anyone.")));
/*
! * Only single-character delimiter strings are supported,
! * except in CSV mode, where the string must be
! * delimiter-char quote-char [escape-char]
*/
! if (!csv_mode && strlen(delim) != 1)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("COPY delimiter must be a single character")));
+ else if (csv_mode)
+ {
+ if(strlen(delim) > 3)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("COPY delimiters for CSV must be a 2 or 3 characters")));
+ if (delim[0] == delim[1] ||
+ (strlen(delim) == 3 && delim[0] == delim[2]))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("CSV delimiter character must not be same as quote character or escape character")));
+
+ }
/*
* Don't allow the delimiter to appear in the null string.
***************
*** 788,793 ****
--- 811,833 ----
errmsg("COPY delimiter must not appear in the NULL specification")));
/*
+ * Don't allow the csv quote char to appear in the null string.
+ */
+ if (csv_mode && strchr(null_print, delim[1]) != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("CSV quote character must not appear in the NULL specification")));
+
+ /*
+ * Don't allow OIDs in CSV mode
+ */
+
+ if (csv_mode && oids)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Cannot specify OIDS in CSV mode ")));
+
+ /*
* Don't allow COPY w/ OIDs to or from a table without them
*/
if (oids && !rel->rd_rel->relhasoids)
***************
*** 969,974 ****
--- 1009,1015 ----
FmgrInfo *out_functions;
Oid *elements;
bool *isvarlena;
+ bool csv_mode;
char *string;
Snapshot mySnapshot;
List *cur;
***************
*** 979,984 ****
--- 1020,1026 ----
attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts;
attr_count = length(attnumlist);
+ csv_mode = (strlen(delim) > 1);
/*
* Get info about the columns we need to process.
***************
*** 1051,1057 ****
while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL)
{
bool need_delim = false;
-
CHECK_FOR_INTERRUPTS();
MemoryContextReset(mycontext);
--- 1093,1098 ----
***************
*** 1113,1119 ****
value,
ObjectIdGetDatum(elements[attnum - 1]),
Int32GetDatum(attr[attnum - 1]->atttypmod)));
! CopyAttributeOut(string, delim);
}
else
{
--- 1154,1167 ----
value,
ObjectIdGetDatum(elements[attnum - 1]),
Int32GetDatum(attr[attnum - 1]->atttypmod)));
! if (csv_mode)
! {
! bool force_quote = (strcmp(string,null_print) == 0);
! CopyAttributeOutCSV(string, delim, force_quote);
! }
! else
! CopyAttributeOut(string, delim);
!
}
else
{
***************
*** 1263,1268 ****
--- 1311,1317 ----
Datum *values;
char *nulls;
bool done = false;
+ bool csv_mode;
bool isnull;
ResultRelInfo *resultRelInfo;
EState *estate = CreateExecutorState(); /* for ExecConstraints() */
***************
*** 1280,1285 ****
--- 1329,1335 ----
num_phys_attrs = tupDesc->natts;
attr_count = length(attnumlist);
num_defaults = 0;
+ csv_mode = (strlen(delim) > 1);
/*
* We need a ResultRelInfo so we can use the regular executor's
***************
*** 1499,1504 ****
--- 1549,1555 ----
if (file_has_oids)
{
+ /* can't be in CSV mode here */
string = CopyReadAttribute(delim, null_print,
&result, &isnull);
***************
*** 1537,1544 ****
errmsg("missing data for column \"%s\"",
NameStr(attr[m]->attname))));
! string = CopyReadAttribute(delim, null_print,
! &result, &isnull);
if (isnull)
{
--- 1588,1608 ----
errmsg("missing data for column \"%s\"",
NameStr(attr[m]->attname))));
! if (csv_mode)
! {
! string = CopyReadAttributeCSV(delim, null_print,
! &result, &isnull);
! if (result == UNTERMINATED_FIELD)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("unterminated CSV quoted field")));
! }
! else
! {
! string = CopyReadAttribute(delim, null_print,
! &result, &isnull);
! }
!
if (isnull)
{
***************
*** 2069,2074 ****
--- 2133,2288 ----
return attribute_buf.data;
}
+
+ /*
+ * Read the value of a single attribute in CSV mode,
+ * performing de-escaping as needed. Escaping does not follow the normal
+ * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage.
+ *
+ * Quoted fields can span lines, in which case the line end is embedded
+ * in the returned string.
+ *
+ * delim is a 2- or 3-character string. The first character is the
+ * field delimiter, the second the quote character, the third is the
+ * escape character indise quotes, and defaults to the quote character.
+ *
+ * null_print is the null marker string. Note that this is compared to
+ * the pre-de-escaped input string (thus if it is quoted it is not a NULL).
+ *
+ * *result is set to indicate what terminated the read:
+ * NORMAL_ATTR: column delimiter
+ * END_OF_LINE: end of line
+ * UNTERMINATED_FIELD no quote detected at end of a quoted field
+ *
+ * In any case, the string read up to the terminator (or end of file)
+ * is returned.
+ *
+ * *isnull is set true or false depending on whether the input matched
+ * the null marker. Note that the caller cannot check this since the
+ * returned string will be the post-de-escaping equivalent, which may
+ * look the same as some valid data string.
+ *----------
+ */
+
+ static char *
+ CopyReadAttributeCSV(const char *delim, const char *null_print,
+ CopyReadResult *result, bool *isnull)
+ {
+ char delimc = delim[0];
+ char quotec = delim[1];
+ char escapec = delim[2] ? delim[2] : delim[1];
+ char c;
+ int start_cursor = line_buf.cursor;
+ int end_cursor = start_cursor;;
+ int input_len;
+ bool in_quote = false;
+ bool saw_quote = false;
+
+ /* reset attribute_buf to empty */
+ attribute_buf.len = 0;
+ attribute_buf.data[0] = '\0';
+
+ /* set default status */
+ *result = END_OF_LINE;
+
+ for (;;)
+ {
+ /* handle multiline quoted fields */
+ if (in_quote && line_buf.cursor >= line_buf.len)
+ {
+ bool done;
+
+ switch(eol_type)
+ {
+ case EOL_NL:
+ appendStringInfoString(&attribute_buf,"\n");
+ break;
+ case EOL_CR:
+ appendStringInfoString(&attribute_buf,"\r");
+ break;
+ case EOL_CRNL:
+ appendStringInfoString(&attribute_buf,"\r\n");
+ break;
+ case EOL_UNKNOWN:
+ /* shouldn't happen - just keep going */
+ break;
+ }
+
+ copy_lineno++;
+ done = CopyReadLine();
+ if (done && line_buf.len == 0)
+ break;
+ start_cursor = line_buf.cursor;
+ }
+
+ end_cursor = line_buf.cursor;
+ if (line_buf.cursor >= line_buf.len)
+ break;
+ c = line_buf.data[line_buf.cursor++];
+ /*
+ * unquoted field delimiter
+ */
+ if (!in_quote && c == delimc)
+ {
+ *result = NORMAL_ATTR;
+ break;
+ }
+ /*
+ * start of quoted field (or part of field)
+ */
+ if (!in_quote && c == quotec)
+ {
+ saw_quote = true;
+ in_quote = true;
+ continue;
+ }
+ /*
+ * escape within a quoted field
+ */
+ if (in_quote && c == escapec)
+ {
+ /*
+ * peek at the next char if available, and escape it if it
+ * is an escape char or a quote char
+ */
+ if (line_buf.cursor <= line_buf.len)
+ {
+ char nextc = line_buf.data[line_buf.cursor];
+ if (nextc == escapec || nextc == quotec)
+ {
+ appendStringInfoCharMacro(&attribute_buf, nextc);
+ line_buf.cursor++;
+ continue;
+ }
+ }
+ }
+ /*
+ * end of quoted field.
+ * Must do this test after testing for escape in case quote char
+ * and escape char are the same (which is the common case).
+ */
+ if(in_quote && c == quotec)
+ {
+ in_quote = false;
+ continue;
+ }
+ appendStringInfoCharMacro(&attribute_buf, c);
+ }
+
+ if (in_quote)
+ *result = UNTERMINATED_FIELD;
+
+ /* check whether raw input matched null marker */
+ input_len = end_cursor - start_cursor;
+ if (!saw_quote && input_len == strlen(null_print) &&
+ strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
+ *isnull = true;
+ else
+ *isnull = false;
+
+ return attribute_buf.data;
+ }
+
/*
* Read a binary attribute
*/
***************
*** 2192,2197 ****
--- 2406,2479 ----
break;
}
}
+ }
+
+ /*
+ * Send CSV representation of one attribute, with conversion and
+ * CSV type escaping
+ */
+ static void
+ CopyAttributeOutCSV(char *server_string, char *delim, bool force_quote)
+ {
+ char *string;
+ char c;
+ char delimc = delim[0];
+ char quotec = delim[1];
+ char escapec = delim[2] ? delim[2] : delim[1];
+ bool need_quote = force_quote;
+ char *test_string;
+ bool same_encoding;
+ int mblen;
+ int i;
+
+ same_encoding = (server_encoding == client_encoding);
+ if (!same_encoding)
+ string = (char *) pg_server_to_client((unsigned char *) server_string,
+ strlen(server_string));
+ else
+ string = server_string;
+
+ /* have to run through the string twice,
+ * first time to see if it needs quoting, second to actually send it
+ */
+
+ for(test_string = string;
+ !need_quote && (c = *test_string) != '\0';
+ test_string += mblen)
+ {
+ if (c == delimc || c == quotec || c == '\n' || c == '\r')
+ {
+ need_quote = true;
+ }
+ if (!same_encoding)
+ mblen = pg_encoding_mblen(client_encoding, test_string);
+ else
+ mblen = 1;
+ }
+
+ if (need_quote)
+ CopySendChar(quotec);
+
+ for (; (c = *string) != '\0'; string += mblen)
+ {
+ if (c == quotec || c == escapec)
+ CopySendChar(escapec);
+
+ CopySendChar(c);
+
+ if (!same_encoding)
+ {
+ /* send additional bytes of the char, if any */
+ mblen = pg_encoding_mblen(client_encoding, string);
+ for (i = 1; i < mblen; i++)
+ CopySendChar(string[i]);
+ }
+ else
+ mblen = 1;
+ }
+
+ if (need_quote)
+ CopySendChar(quotec);
}
/*
В списке pgsql-patches по дате отправления: