Re: pg_read_file() with virtual files returns empty string

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: pg_read_file() with virtual files returns empty string
Дата
Msg-id 749696.1593634340@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: pg_read_file() with virtual files returns empty string  (Joe Conway <mail@joeconway.com>)
Ответы Re: pg_read_file() with virtual files returns empty string  (Joe Conway <mail@joeconway.com>)
Список pgsql-hackers
Joe Conway <mail@joeconway.com> writes:
> I did some performance testing of the worst case/largest possible file and found
> that skipping the stat and bulk read does cause a significant regression.

Yeah, I was wondering a little bit if that'd be an issue.

> In the attached patch I was able to get most of the performance degradation back
> -- ~600ms. Hopefully you don't think what I did was "too cute by half" :-). Do
> you think this is good enough or should we go back to using the stat file size
> when it is > 0?

I don't think it's unreasonable to "get in bed" with the innards of the
StringInfo; plenty of other places do already, such as pqformat.h or
pgp_armor_decode, just to name the first couple that I came across in a
quick grep.

However, if we're going to get in bed with it, let's get all the way in
and just read directly into the StringInfo's buffer, as per attached.
This saves all the extra memcpy'ing and reduces the number of fread calls
to at most log(N).

(This also fixes a bug in your version, which is that it captured
the buf.data pointer before any repalloc that might happen.)

            regards, tom lane

diff --git a/contrib/adminpack/expected/adminpack.out b/contrib/adminpack/expected/adminpack.out
index 5738b0f6c4..edf3ebfcba 100644
--- a/contrib/adminpack/expected/adminpack.out
+++ b/contrib/adminpack/expected/adminpack.out
@@ -79,7 +79,7 @@ SELECT pg_file_rename('test_file1', 'test_file2');
 (1 row)

 SELECT pg_read_file('test_file1');  -- not there
-ERROR:  could not stat file "test_file1": No such file or directory
+ERROR:  could not open file "test_file1" for reading: No such file or directory
 SELECT pg_read_file('test_file2');
  pg_read_file
 --------------
@@ -108,7 +108,7 @@ SELECT pg_file_rename('test_file2', 'test_file3', 'test_file3_archive');
 (1 row)

 SELECT pg_read_file('test_file2');  -- not there
-ERROR:  could not stat file "test_file2": No such file or directory
+ERROR:  could not open file "test_file2" for reading: No such file or directory
 SELECT pg_read_file('test_file3');
  pg_read_file
 --------------
diff --git a/src/backend/utils/adt/genfile.c b/src/backend/utils/adt/genfile.c
index ceaa6180da..aa49df4d0c 100644
--- a/src/backend/utils/adt/genfile.c
+++ b/src/backend/utils/adt/genfile.c
@@ -106,33 +106,11 @@ read_binary_file(const char *filename, int64 seek_offset, int64 bytes_to_read,
                  bool missing_ok)
 {
     bytea       *buf;
-    size_t        nbytes;
+    size_t        nbytes = 0;
     FILE       *file;

-    if (bytes_to_read < 0)
-    {
-        if (seek_offset < 0)
-            bytes_to_read = -seek_offset;
-        else
-        {
-            struct stat fst;
-
-            if (stat(filename, &fst) < 0)
-            {
-                if (missing_ok && errno == ENOENT)
-                    return NULL;
-                else
-                    ereport(ERROR,
-                            (errcode_for_file_access(),
-                             errmsg("could not stat file \"%s\": %m", filename)));
-            }
-
-            bytes_to_read = fst.st_size - seek_offset;
-        }
-    }
-
-    /* not sure why anyone thought that int64 length was a good idea */
-    if (bytes_to_read > (MaxAllocSize - VARHDRSZ))
+    /* clamp request size to what we can actually deliver */
+    if (bytes_to_read > (int64) (MaxAllocSize - VARHDRSZ))
         ereport(ERROR,
                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                  errmsg("requested length too large")));
@@ -154,9 +132,56 @@ read_binary_file(const char *filename, int64 seek_offset, int64 bytes_to_read,
                 (errcode_for_file_access(),
                  errmsg("could not seek in file \"%s\": %m", filename)));

-    buf = (bytea *) palloc((Size) bytes_to_read + VARHDRSZ);
+    if (bytes_to_read >= 0)
+    {
+        /* If passed explicit read size just do it */
+        buf = (bytea *) palloc((Size) bytes_to_read + VARHDRSZ);
+
+        nbytes = fread(VARDATA(buf), 1, (size_t) bytes_to_read, file);
+    }
+    else
+    {
+        /* Negative read size, read rest of file */
+        StringInfoData sbuf;
+
+        initStringInfo(&sbuf);
+        /* Leave room in the buffer for the varlena length word */
+        sbuf.len += VARHDRSZ;
+        Assert(sbuf.len < sbuf.maxlen);

-    nbytes = fread(VARDATA(buf), 1, (size_t) bytes_to_read, file);
+        while (!(feof(file) || ferror(file)))
+        {
+            size_t        rbytes;
+
+            /* Minimum amount to read at a time */
+#define MIN_READ_SIZE 4096
+
+            /*
+             * Verify that enlargeStringInfo won't fail; we'd rather give the
+             * error message for that ourselves.
+             */
+            if (((Size) MIN_READ_SIZE) >= (MaxAllocSize - (Size) sbuf.len))
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("requested length too large")));
+
+            /* OK, ensure that we can read at least MIN_READ_SIZE */
+            enlargeStringInfo(&sbuf, MIN_READ_SIZE);
+
+            /*
+             * stringinfo.c likes to allocate in powers of 2, so it's likely
+             * that much more space is available than we asked for.  Use all
+             * of it, rather than making more fread calls than necessary.
+             */
+            rbytes = fread(sbuf.data + sbuf.len, 1,
+                           (size_t) (sbuf.maxlen - sbuf.len - 1), file);
+            sbuf.len += rbytes;
+            nbytes += rbytes;
+        }
+
+        /* Now we can commandeer the stringinfo's buffer as the result */
+        buf = (bytea *) sbuf.data;
+    }

     if (ferror(file))
         ereport(ERROR,

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Andrew Dunstan
Дата:
Сообщение: Re: POC: rational number type (fractions)
Следующее
От: Stephen Frost
Дата:
Сообщение: Re: v12 and TimeLine switches and backups/restores