faster version of AllocSetFreeIndex for x86 architecture

Поиск
Список
Период
Сортировка
От Atsushi Ogawa
Тема faster version of AllocSetFreeIndex for x86 architecture
Дата
Msg-id 4A253CF0.1000702@hi-ho.ne.jp
обсуждение исходный текст
Ответы Re: faster version of AllocSetFreeIndex for x86 architecture  (Jeremy Kerr <jk@ozlabs.org>)
Re: faster version of AllocSetFreeIndex for x86 architecture  (Simon Riggs <simon@2ndQuadrant.com>)
Список pgsql-hackers
Hi,
I made a faster version of AllocSetFreeIndex for x86 architecture.

Attached files are benchmark programs and patch file.

 alloc_test.pl: benchmark script
 alloc_test.c: benchmark program
 aset_free_index.patch: patch for util/mmgr/aset.c

This benchmark compares the original function with a faster version.
To try the benchmark, only execute alloc_test.pl. This script compiles
alloc_test.c and execute the benchmark.

Results of benchmark script:
Xeon(Core architecture), RedHat EL4, gcc 3.4.6
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.780 0.780 0.820 0.870 0.930 0.970 1.030 1.080 1.130 0.950
 patched : 0.380 0.170 0.170 0.170 0.170 0.180 0.170 0.180 0.180 0.280

Core2, Windows XP, gcc 3.4.4 (cygwin)
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.249 0.249 0.515 0.452 0.577 0.671 0.796 0.890 0.999 1.577
 patched : 0.358 0.218 0.202 0.218 0.218 0.218 0.202 0.218 0.218 0.218

Xeon(Pentium4 architecture), RedHal EL4, gcc 3.4.6
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.510 0.520 0.620 0.860 0.970 1.260 1.150 1.220 1.290 0.860
 patched : 0.620 0.530 0.530 0.540 0.540 0.530 0.540 0.530 0.530 0.490

The effect of the patch that I measured by oprofile is:
- test program: pgbench -c 1 -t 50000 (fsync=off)

original:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
66854     6.6725  AllocSetAlloc
47679     4.7587  base_yyparse
29058     2.9002  hash_search_with_hash_value
22053     2.2011  SearchCatCache
19264     1.9227  MemoryContextAllocZeroAligned
16223     1.6192  base_yylex
13819     1.3792  ScanKeywordLookup
13305     1.3279  expression_tree_walker
12144     1.2121  LWLockAcquire
11850     1.1827  XLogInsert
11817     1.1794  AllocSetFree

patched:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
47610     4.9333  AllocSetAlloc
47441     4.9158  base_yyparse
28243     2.9265  hash_search_with_hash_value
22197     2.3000  SearchCatCache
18984     1.9671  MemoryContextAllocZeroAligned
15747     1.6317  base_yylex
13368     1.3852  ScanKeywordLookup
12889     1.3356  expression_tree_walker
12092     1.2530  LWLockAcquire
12078     1.2515  XLogInsert
(skip)
6248      0.6474  AllocSetFree

I think this patch improves AllocSetAlloc/AllocSetFree performance.

Best regards,

---
Atsushi Ogawa
a_ogawa@hi-ho.ne.jp




#!/usr/bin/perl

system "gcc -O2 -o alloc_test alloc_test.c";

my @test_bytes = (4,8,16,32,64,128,256,512,1024,
    '8 16 28 36 12 4 8 64 1024 8 24 12 8 64 16');
my $cnt = 10000000;

my @old_result;
my @new_result;
my $t0, $t1, $e;

foreach $e (@test_bytes) {
    $t0 = (times)[2];
    system "./alloc_test old $cnt $e";
    push @old_result, (times)[2] - $t0;

    $t0 = (times)[2];
    system "./alloc_test new $cnt $e";
    push @new_result, (times)[2] - $t0;
}

print " bytes   : ";
foreach $e (@test_bytes) {
    $e = 'mix' if($e =~ /\d+ \d+/);
    printf("%5s ", $e);
}
print "\n";

print " original: ";
foreach $e (@old_result) { printf("%.3f ", $e); }
print "\n";

print " patched : ";
foreach $e (@new_result) { printf("%.3f ", $e); }
print "\n";

#include <stdio.h>

#define Assert(condition)

#define ALLOC_MINBITS 3
#define ALLOCSET_NUM_FREELISTS 11
typedef size_t Size;

/*
 * faster version of AllocSetFreeIndex for x86 architecure.
 * this function runs in O(1).
 */
static inline int
AllocSetFreeIndex_new(Size size)
{
    int idx;

    if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
        size = (1 << ALLOC_MINBITS);

    /* bsr(Bit Scan Reverse): Search the most significant set bit */
    __asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));

    return idx - (ALLOC_MINBITS - 1);
}

static inline int
AllocSetFreeIndex(Size size)
{
    int         idx = 0;

    if (size > 0)
    {
        size = (size - 1) >> ALLOC_MINBITS;
        while (size != 0)
        {
            idx++;
            size >>= 1;
        }
        Assert(idx < ALLOCSET_NUM_FREELISTS);
    }

    return idx;
}

int main(int argc, char *argv[])
{
    int        loop_cnt;
    int        size[16];
    int        i, j;
    int        result = 0;

    if(argc < 4) {
        fprintf(stderr, "usage: asettest (new|old) loop_cnt size...\n");
        return 1;
    }

    loop_cnt = atoi(argv[2]);

    for(i = 0; i < 16; i++) {
        if(argc <= i + 3) {
            size[i] = size[0];
        } else {
            size[i] = atoi(argv[i + 3]);
        }
    }

    if(strcmp(argv[1], "new") == 0) {
        for(i = 0; i < loop_cnt; i++) {
            for(j = 0; j < 16; j++) {
                result += AllocSetFreeIndex_new(size[j]);
            }
        }
    } else if(strcmp(argv[1], "old") == 0) {
        for(i = 0; i < loop_cnt; i++) {
            for(j = 0; j < 16; j++) {
                result += AllocSetFreeIndex(size[j]);
            }
        }
    } else {
        fprintf(stderr, "usage: asettest (new|old) size loop_cnt\n");
        return 1;
    }

    fprintf(stderr, "%s, size:%d, loop:%d, checksum:%d\n",
        argv[1], size[0], loop_cnt, result);

    return 0;
}

*** ./src/backend/utils/mmgr/aset.c.orig    2009-06-01 23:12:10.000000000 +0900
--- ./src/backend/utils/mmgr/aset.c    2009-06-02 08:47:30.000000000 +0900
***************
*** 263,268 ****
--- 263,287 ----
   *        that size <= ALLOC_CHUNK_LIMIT.
   * ----------
   */
+ #if defined(__i386__) && defined(__GNUC__)
+ /*
+  * faster version of AllocSetFreeIndex for x86 architecure.
+  * this function runs in O(1).
+  */
+ static inline int
+ AllocSetFreeIndex(Size size)
+ {
+     int idx;
+
+     if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
+         size = (1 << ALLOC_MINBITS);
+
+     /* bsr(Bit Scan Reverse): Search the most significant set bit */
+     __asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));
+
+     return idx - (ALLOC_MINBITS - 1);
+ }
+ #else
  static inline int
  AllocSetFreeIndex(Size size)
  {
***************
*** 281,286 ****
--- 300,306 ----

      return idx;
  }
+ #endif /* defined(__i386__) && defined(__GNUC__) */

  #ifdef RANDOMIZE_ALLOCATED_MEMORY


В списке pgsql-hackers по дате отправления:

Предыдущее
От: Greg Stark
Дата:
Сообщение: Re: PostgreSQL Developer meeting minutes up
Следующее
От: Marko Kreen
Дата:
Сообщение: Re: PostgreSQL Developer meeting minutes up