After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I ran some tests with its code and found something weird: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimization.
#include <cmath> #include <cstdio> const int N = 4000; float cosine[N][N]; float sine[N][N]; int main() { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { float ang = i*j*2*M_PI/N; cosine[i][j] = cos(ang); sine[i][j] = sin(ang); } } } With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N; To
double ang = i*j*2*M_PI/N; I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be so much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
.file "main.cpp" .globl cosine .bss .align 32 .type cosine, @object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, @object .size sine, 64000000 sine: .zero 64000000 .text .globl main .type main, @function main: .LFB87: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 movq %rsp, %rbp .cfi_offset 6, -16 .cfi_def_cfa_register 6 subq $16, %rsp movl $0, -4(%rbp) jmp .L2 .L5: movl $0, -8(%rbp) jmp .L3 .L4: movl -4(%rbp), %eax imull -8(%rbp), %eax addl %eax, %eax cvtsi2sd %eax, %xmm0 movsd .LC0(%rip), %xmm1 mulsd %xmm1, %xmm0 movsd .LC1(%rip), %xmm1 divsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movss %xmm0, -12(%rbp) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call cos unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, cosine(,%rax,4) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call sin unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, sine(,%rax,4) addl $1, -8(%rbp) .L3: cmpl $3999, -8(%rbp) setle %al testb %al, %al jne .L4 addl $1, -4(%rbp) .L2: cmpl $3999, -4(%rbp) setle %al testb %al, %al jne .L5 movl $0, %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE87: .size main, .-main .section .rodata .align 4 .type _ZL1N, @object .size _ZL1N, 4 _ZL1N: .long 4000 .align 8 .LC0: .long 1413754136 .long 1074340347 .align 8 .LC1: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",@progbits Assembly for first test with O3:
.file "main.cpp" .text .p2align 4,,15 .globl main .type main, @function main: .LFB121: .cfi_startproc pushq %r15 .cfi_def_cfa_offset 16 xorl %r15d, %r15d .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 movl $cosine+16000, %r14d .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 xorl %r13d, %r13d .cfi_offset 13, -32 pushq %r12 .cfi_def_cfa_offset 40 pushq %rbp .cfi_def_cfa_offset 48 pushq %rbx .cfi_def_cfa_offset 56 subq $24, %rsp .cfi_def_cfa_offset 80 .p2align 4,,10 .p2align 3 .L2: movslq %r15d, %rbp .cfi_offset 3, -56 .cfi_offset 6, -48 .cfi_offset 12, -40 movl %r13d, %r12d movl $0x3f800000, %edx imulq $16000, %rbp, %rbp xorl %eax, %eax leaq cosine(%rbp), %rbx addq $sine, %rbp jmp .L5 .p2align 4,,10 .p2align 3 .L3: movl %r12d, %eax leaq 8(%rsp), %rsi leaq 12(%rsp), %rdi subl %r13d, %eax cvtsi2sd %eax, %xmm0 mulsd .LC2(%rip), %xmm0 divsd .LC3(%rip), %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 call sincosf movl 8(%rsp), %edx movl 12(%rsp), %eax .L5: movl %edx, (%rbx) addq $4, %rbx movl %eax, 0(%rbp) addl %r13d, %r12d addq $4, %rbp cmpq %r14, %rbx jne .L3 addl $1, %r15d addl $2, %r13d leaq 16000(%rbx), %r14 cmpl $4000, %r15d jne .L2 addq $24, %rsp .cfi_def_cfa_offset 56 xorl %eax, %eax popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 .cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE121: .size main, .-main .globl cosine .bss .align 32 .type cosine, @object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, @object .size sine, 64000000 sine: .zero 64000000 .section .rodata.cst8,"aM",@progbits,8 .align 8 .LC2: .long 1413754136 .long 1074340347 .align 8 .LC3: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",@progbits
Here's a possibility:
In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single precision.
You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.
Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With