Why is `switch` so slow?
In a bytecode interpreting loop, after several tests, I was surprised to see that using switch is the worst choice to make. Making calls through a function pointer array, or using GCC's computed goto extension, is 10~20% faster, with the computed goto version being the fastest. I've tested this with my real toy VM with 97 instructions and with the mini fake VM pasted below.

Why is using switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>

enum {
    ADD1 = 1,
    ADD2,
    SUB3,
    SUB4,
    MUL5,
    MUL6,
};

static unsigned int number;

static void add1(void) { number += 1; }
static void add2(void) { number += 2; }
static void sub3(void) { number -= 3; }
static void sub4(void) { number -= 4; }
static void mul5(void) { number *= 5; }
static void mul6(void) { number *= 6; }

/* Dispatch with a switch inside the interpreter loop. */
static void interpret_bytecodes_switch(uint8_t *bcs)
{
    while (true) {
        switch (*bcs++) {
        case 0:
            return;
        case ADD1:
            add1();
            break;
        case ADD2:
            add2();
            break;
        case SUB3:
            sub3();
            break;
        case SUB4:
            sub4();
            break;
        case MUL5:
            mul5();
            break;
        case MUL6:
            mul6();
            break;
        }
    }
}

/* Dispatch through an array of function pointers indexed by opcode. */
static void interpret_bytecodes_function_pointer(uint8_t *bcs)
{
    void (*fs[])(void) = {
        NULL,
        add1,
        add2,
        sub3,
        sub4,
        mul5,
        mul6,
    };
    while (*bcs) {
        fs[*bcs++]();
    }
}

/* Dispatch with GCC's computed goto extension (labels as values). */
static void interpret_bytecodes_goto(uint8_t *bcs)
{
    void *labels[] = {
        &&l_exit,
        &&l_add1,
        &&l_add2,
        &&l_sub3,
        &&l_sub4,
        &&l_mul5,
        &&l_mul6,
    };
#define JUMP goto *labels[*bcs++]
    JUMP;
l_exit:
    return;
l_add1:
    add1();
    JUMP;
l_add2:
    add2();
    JUMP;
l_sub3:
    sub3();
    JUMP;
l_sub4:
    sub4();
    JUMP;
l_mul5:
    mul5();
    JUMP;
l_mul6:
    mul6();
    JUMP;
#undef JUMP
}

struct timer {
    clock_t start, end;
};

static void timer_start(struct timer *self)
{
    self->start = clock();
}

static void timer_end(struct timer *self)
{
    self->end = clock();
}

static double timer_measure(struct timer *self)
{
    return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}

static void test(void (*f)(uint8_t *), uint8_t *bcs)
{
    number = 0;
    struct timer timer;
    timer_start(&timer);
    f(bcs);
    timer_end(&timer);
    printf("%u %.3fs\n", number, timer_measure(&timer));
}

int main(void)
{
    const int n = 300000000;
    srand((unsigned)time(NULL));
    uint8_t *bcs = malloc(n + 1);
    for (int i = 0; i < n; ++i) {
        bcs[i] = rand() % 5 + 1;
    }
    bcs[n] = 0;
    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);
    return 0;
}
Result
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the relevant disassembly of the code posted here, after -O3 optimization.
interpret_bytecodes_switch:
.L8:
        addq    $1, %rdi
        cmpb    $6, -1(%rdi)
        ja      .L8
        movzbl  -1(%rdi), %edx
        jmp     *.L11(,%rdx,8)
.L11:
        .quad   .L10
        .quad   .L12
        .quad   .L13
        .quad   .L14
        .quad   .L15
        .quad   .L16
        .quad   .L17
.L16:
        leal    (%rax,%rax,4), %eax
        jmp     .L8
.L15:
        subl    $4, %eax
        jmp     .L8
.L14:
        subl    $3, %eax
        jmp     .L8
.L13:
        addl    $2, %eax
        jmp     .L8
.L12:
        addl    $1, %eax
        jmp     .L8
.L10:
        movl    %eax, number(%rip)
        ret
.L17:
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        jmp     .L8

interpret_bytecodes_function_pointer:
        pushq   %rbx
        movq    %rdi, %rbx
        subq    $64, %rsp
        movzbl  (%rdi), %eax
        movq    $0, (%rsp)
        movq    $add1, 8(%rsp)
        movq    $add2, 16(%rsp)
        movq    $sub3, 24(%rsp)
        movq    $sub4, 32(%rsp)
        movq    $mul5, 40(%rsp)
        testb   %al, %al
        movq    $mul6, 48(%rsp)
        je      .L19
.L23:
        addq    $1, %rbx
        call    *(%rsp,%rax,8)
        movzbl  (%rbx), %eax
        testb   %al, %al
        jne     .L23
.L19:
        addq    $64, %rsp
        popq    %rbx
        ret

interpret_bytecodes_goto:
        movzbl  (%rdi), %eax
        movq    $.L27, -72(%rsp)
        addq    $2, %rdi
        movq    $.L28, -64(%rsp)
        movq    $.L29, -56(%rsp)
        movq    $.L30, -48(%rsp)
        movq    $.L31, -40(%rsp)
        movq    $.L32, -32(%rsp)
        movq    $.L33, -24(%rsp)
        movq    -72(%rsp,%rax,8), %rax
        jmp     *%rax
.L33:
        movl    number(%rip), %eax
        leal    (%rax,%rax,2), %eax
        addl    %eax, %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
.L35:
        addq    $1, %rdi
        jmp     *%rax
.L28:
        addl    $1, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L30:
        subl    $3, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L31:
        subl    $4, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L32:
        movl    number(%rip), %eax
        leal    (%rax,%rax,4), %eax
        movl    %eax, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L29:
        addl    $2, number(%rip)
        movzbl  -1(%rdi), %eax
        movq    -72(%rsp,%rax,8), %rax
        jmp     .L35
.L27:
        rep ret
Answer

switch is the slowest because it has to manage a default case, and that may add a bounds test that you didn't implement yourself.
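For instance (a hypothetical variant, not part of the original benchmark), with GCC you can promise the compiler that no out-of-range opcode ever occurs by putting __builtin_unreachable() in the default case; whether the cmpb/ja range check visible at the top of the switch disassembly above actually disappears depends on the compiler version:

/* Sketch only: reuses the functions and enum from the program above.
 * default: __builtin_unreachable() tells GCC that opcodes outside
 * 0..MUL6 never occur, which allows it to drop the range check
 * before the jump-table dispatch. */
static void interpret_bytecodes_switch_no_check(uint8_t *bcs)
{
    while (true) {
        switch (*bcs++) {
        case 0:    return;
        case ADD1: add1(); break;
        case ADD2: add2(); break;
        case SUB3: sub3(); break;
        case SUB4: sub4(); break;
        case MUL5: mul5(); break;
        case MUL6: mul6(); break;
        default:   __builtin_unreachable();
        }
    }
}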
switch also handles the more general case where the case values are not arranged in a simple sequence; extra computation may be needed for that.
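To illustrate that second point with a hypothetical snippet (not from the question): when the case values are sparse rather than consecutive, the compiler cannot simply use the value as an index into a dense jump table and typically falls back to compare chains, a binary search, or extra index arithmetic.

/* Sketch of a sparse switch: 7, 40 and 1000 do not form a dense range,
 * so a plain jump table indexed by v is not possible without extra work. */
static const char *describe(unsigned v)
{
    switch (v) {
    case 7:    return "seven";
    case 40:   return "forty";
    case 1000: return "one thousand";
    default:   return "other";
    }
}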