c - Why is it faster to do a branch than a lookup? - Stack Overflow

I have just been trying to benchmark some PRNG code for generating hex characters - I have an input of

I have just been trying to benchmark some PRNG code for generating hex characters - I have an input of random values buf, which I want to turn into a series of hex characters, in-place.

Code

#define _POSIX_C_SOURCE 199309L

#include <stdio.h>
#include <stdalign.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>

// Lookup table mapping a nibble value (0-15) to its ASCII hex digit.
static const char HEX_CHARS[16] = "0123456789ABCDEF";

static void a(size_t n, char buf[static n]) {
    // single loop, branch: mask each byte to its low nibble, then map
    // 0-9 -> '0'-'9' and 10-15 -> 'A'-'F' with a conditional.
    for (size_t i = 0; i < n; i++) {
        char nibble = buf[i] & 15;
        buf[i] = (char)(nibble < 10 ? nibble + '0' : nibble + 'A' - 10);
    }
}

static void b(size_t n, char buf[static n]) {
    // double loop, branch: first pass masks every byte to its low nibble,
    // second pass converts each value 0-15 to its ASCII hex digit.
    char *const end = buf + n;
    for (char *p = buf; p != end; p++)
        *p &= 15;
    for (char *p = buf; p != end; p++)
        *p = (char)(*p < 10 ? *p + '0' : *p + 'A' - 10);
}

static void c(size_t n, char buf[static n]) {
    // single loop, lookup: index a 16-entry digit table with the low nibble.
    static const char digits[16] = "0123456789ABCDEF";
    for (size_t i = 0; i < n; i++)
        buf[i] = digits[buf[i] & 15];
}

static void d(size_t n, char buf[static n]) {
    // double loop, lookup: mask pass first, then a table-lookup pass.
    static const char digits[16] = "0123456789ABCDEF";
    for (size_t i = 0; i < n; i++)
        buf[i] &= 0xF;
    for (size_t i = 0; i < n; i++)
        buf[i] = digits[(size_t)buf[i]];
}

static void test(
    const size_t n,
    const char input[const static n],
    const char *const name,
    void (*const fn)(size_t n, char buf[static n])
) {
    // Run fn() M times over a private copy of input and print the total and
    // per-iteration average wall-clock time (CLOCK_MONOTONIC).
    const size_t M = 1 << 16;

    char *buf = calloc(n, 1);
    if (buf == NULL) {          // unchecked calloc would crash in the copy below
        perror("calloc");
        abort();
    }
    memcpy(buf, input, n);      // buf and input never overlap, so memcpy suffices

    // NOTE(review): every iteration after the first runs fn on the previous
    // iteration's output, not on the original random bytes, so the value
    // distribution is no longer uniform after the first pass.
    double ns_total = 0;
    for (size_t i = 0; i < M; i++) {
        struct timespec t_start, t_end;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        fn(n, buf);
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        double ns_start = (double)t_start.tv_sec * 1e9 + (double)t_start.tv_nsec;
        double ns_end = (double)t_end.tv_sec * 1e9 + (double)t_end.tv_nsec;
        ns_total += ns_end - ns_start;
    }

    printf("%s() took %f sec (avg %f ms)\n", name, ns_total * 1e-9, ns_total / (double)M * 1e-6);

    free(buf);
}

int main(void) {
    const size_t N = 1UL << 24;  // 16 MiB of random input

    // generate random data
    char *input = calloc(N, 1);
    if (input == NULL) {         // unchecked calloc would crash in fread
        perror("calloc");
        abort();
    }
    FILE *file = fopen("/dev/urandom", "r");
    if (file == NULL) {          // unchecked fopen would crash in fread
        perror("fopen");
        abort();
    }
    if (fread(input, 1, N, file) != N) {
        perror("fread");
        abort();
    }
    fclose(file);

    test(N, input, "a", a);
    test(N, input, "b", b);
    test(N, input, "c", c);
    test(N, input, "d", d);

    free(input);
}

Disassembly

Results

tested on Ryzen 9 7940HS

-O3

a() took 41.452250 sec (avg 0.632511 ms)
b() took 68.869114 sec (avg 1.050859 ms)
c() took 323.392966 sec (avg 4.934585 ms)
d() took 365.612924 sec (avg 5.578810 ms)

-O3 -march=native

a() took 30.370241 sec (avg 0.463413 ms)
b() took 59.591973 sec (avg 0.909301 ms)
c() took 315.819261 sec (avg 4.819019 ms)
d() took 332.411909 sec (avg 5.072203 ms)

Why is it so much faster to do a branch (a(), b()), compared to a lookup (c(), d())? Since the input data is fully random, the branch predictor shouldn't be able to predict which branch will be taken, so I would expect a lookup to win.

I have just been trying to benchmark some PRNG code for generating hex characters - I have an input of random values buf, which I want to turn into a series of hex characters, in-place.

Code

#define _POSIX_C_SOURCE 199309L

#include <stdio.h>
#include <stdalign.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>

// Lookup table mapping a nibble value (0-15) to its ASCII hex digit.
static const char HEX_CHARS[16] = "0123456789ABCDEF";

static void a(size_t n, char buf[static n]) {
    // single loop, branch: mask each byte to its low nibble, then map
    // 0-9 -> '0'-'9' and 10-15 -> 'A'-'F' with a conditional.
    for (size_t i = 0; i < n; i++) {
        char nibble = buf[i] & 15;
        buf[i] = (char)(nibble < 10 ? nibble + '0' : nibble + 'A' - 10);
    }
}

static void b(size_t n, char buf[static n]) {
    // double loop, branch: first pass masks every byte to its low nibble,
    // second pass converts each value 0-15 to its ASCII hex digit.
    char *const end = buf + n;
    for (char *p = buf; p != end; p++)
        *p &= 15;
    for (char *p = buf; p != end; p++)
        *p = (char)(*p < 10 ? *p + '0' : *p + 'A' - 10);
}

static void c(size_t n, char buf[static n]) {
    // single loop, lookup: index a 16-entry digit table with the low nibble.
    static const char digits[16] = "0123456789ABCDEF";
    for (size_t i = 0; i < n; i++)
        buf[i] = digits[buf[i] & 15];
}

static void d(size_t n, char buf[static n]) {
    // double loop, lookup: mask pass first, then a table-lookup pass.
    static const char digits[16] = "0123456789ABCDEF";
    for (size_t i = 0; i < n; i++)
        buf[i] &= 0xF;
    for (size_t i = 0; i < n; i++)
        buf[i] = digits[(size_t)buf[i]];
}

static void test(
    const size_t n,
    const char input[const static n],
    const char *const name,
    void (*const fn)(size_t n, char buf[static n])
) {
    // Run fn() M times over a private copy of input and print the total and
    // per-iteration average wall-clock time (CLOCK_MONOTONIC).
    const size_t M = 1 << 16;

    char *buf = calloc(n, 1);
    if (buf == NULL) {          // unchecked calloc would crash in the copy below
        perror("calloc");
        abort();
    }
    memcpy(buf, input, n);      // buf and input never overlap, so memcpy suffices

    // NOTE(review): every iteration after the first runs fn on the previous
    // iteration's output, not on the original random bytes, so the value
    // distribution is no longer uniform after the first pass.
    double ns_total = 0;
    for (size_t i = 0; i < M; i++) {
        struct timespec t_start, t_end;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        fn(n, buf);
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        double ns_start = (double)t_start.tv_sec * 1e9 + (double)t_start.tv_nsec;
        double ns_end = (double)t_end.tv_sec * 1e9 + (double)t_end.tv_nsec;
        ns_total += ns_end - ns_start;
    }

    printf("%s() took %f sec (avg %f ms)\n", name, ns_total * 1e-9, ns_total / (double)M * 1e-6);

    free(buf);
}

int main(void) {
    const size_t N = 1UL << 24;  // 16 MiB of random input

    // generate random data
    char *input = calloc(N, 1);
    if (input == NULL) {         // unchecked calloc would crash in fread
        perror("calloc");
        abort();
    }
    FILE *file = fopen("/dev/urandom", "r");
    if (file == NULL) {          // unchecked fopen would crash in fread
        perror("fopen");
        abort();
    }
    if (fread(input, 1, N, file) != N) {
        perror("fread");
        abort();
    }
    fclose(file);

    test(N, input, "a", a);
    test(N, input, "b", b);
    test(N, input, "c", c);
    test(N, input, "d", d);

    free(input);
}

Disassembly

https://godbolt.org/z/WK95x4x86

Results

tested on Ryzen 9 7940HS

-O3

a() took 41.452250 sec (avg 0.632511 ms)
b() took 68.869114 sec (avg 1.050859 ms)
c() took 323.392966 sec (avg 4.934585 ms)
d() took 365.612924 sec (avg 5.578810 ms)

-O3 -march=native

a() took 30.370241 sec (avg 0.463413 ms)
b() took 59.591973 sec (avg 0.909301 ms)
c() took 315.819261 sec (avg 4.819019 ms)
d() took 332.411909 sec (avg 5.072203 ms)

Why is it so much faster to do a branch (a(), b()), compared to a lookup (c(), d())? Since the input data is fully random, the branch predictor shouldn't be able to predict which branch will be taken, so I would expect a lookup to win.

Share Improve this question edited Feb 26 at 15:24 TylerH 21.1k79 gold badges79 silver badges114 bronze badges asked Feb 3 at 5:12 AnonAnon 3711 silver badge10 bronze badges 5
  • 1 You need to run the tests multiple times, hundreds or even thousands of times, and get the average. – Some programmer dude Commented Feb 3 at 5:37
  • @Someprogrammerdude That's why I ran it with 2^33 elements. Do you not believe that is sufficient? – Anon Commented Feb 3 at 5:49
  • 1 That's not really what I said. The test function needs to run the function in a loop. – Some programmer dude Commented Feb 3 at 5:53
  • Have you tried running the test program several times to see if results are consistent? – nielsen Commented Feb 3 at 7:13
  • @Someprogrammerdude Done, although it hasn't made much of difference (D was most likely an outlier) – Anon Commented Feb 3 at 7:39
Add a comment  | 

1 Answer 1

Reset to default 6

The function a is getting vectorized. Notice that in the Assembly code you linked to the operations in the loop are performed using xmm registers:

        movdqu  xmm0, XMMWORD PTR [rax]
        add     rax, 16
        pand    xmm0, xmm6
        movdqa  xmm2, xmm0
        movdqa  xmm1, xmm0
        pcmpgtb xmm0, xmm5
        paddb   xmm2, xmm4
        paddb   xmm1, xmm3
        pand    xmm1, xmm0
        pandn   xmm0, xmm2
        por     xmm0, xmm1
        movups  XMMWORD PTR [rax-16], xmm0

Meanwhile c is not vectorized at all:

c:
        test    rdi, rdi
        je      .L1
        add     rdi, rsi
.L3:
        movzx   eax, BYTE PTR [rsi]
        add     rsi, 1
        and     eax, 15
        movzx   eax, BYTE PTR HEX_CHARS[rax]
        mov     BYTE PTR [rsi-1], al
        cmp     rdi, rsi
        jne     .L3
.L1:
        ret
HEX_CHARS:
        .ascii  "0123456789ABCDEF"

With xmm registers in SSE the CPU can perform the byte operations in a on 16 bytes at once, which is consistent with your results (notice how the function a finishes in a small fraction of the time taken by the function c). The function c can’t be vectorized because its loop is a random look-up. So this answers your first question.

I tested your code on my computer (Ryzen 5 5500U) and got the following results:

-O3 without -march=native:

a() took 1.089963 sec
b() took 2.084695 sec
c() took 3.504775 sec
d() took 3.938886 sec

-O3 with -march=native:

a() took 0.988197 sec
b() took 1.905577 sec
c() took 3.507487 sec
d() took 3.866236 sec

As you can see c performed the same in both cases, so I’m not sure why in your tests it got so much slower with -march=native. d can get a bit of a speed-up because of AVX (you can see the generated code on Godbolt).

发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745248656a4618541.html

相关推荐

  • c - Why is it faster to do a branch than a lookup? - Stack Overflow

    I have just been trying to benchmark some PRNG code for generating hex characters - I have an input of

    17小时前
    20

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信