Why are sin/cos slower when optimizations are enabled?

Question

After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I ran some tests with the code from that question and found something weird: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimizations.

// Benchmark from the question: fills two N x N tables with cos/sin values.
// The point under test is that `ang` is a float while cos/sin are called
// unqualified (not std::cos / std::sin).
#include <cmath>
#include <cstdio>

const int N = 4000;

// Global result tables: N * N * sizeof(float) = 64,000,000 bytes each.
float cosine[N][N];
float sine[N][N];

int main() {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // i*j*2 is computed in int; multiplying by M_PI promotes the
            // expression to double, and the result is narrowed to float here.
            float ang = i*j*2*M_PI/N;
            cosine[i][j] = cos(ang);
            sine[i][j] = sin(ang);
        }
    }
}

With the above code I get:

With -O0: 2.402s

With -O1: 9.004s

With -O2: 9.013s

With -O3: 9.001s

Now if I change

float ang = i*j*2*M_PI/N;

To

double ang = i*j*2*M_PI/N;

I get:

With -O0: 2.362s

With -O1: 1.188s

With -O2: 1.197s

With -O3: 1.197s

How can the first test be so much faster without optimizations?

I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.

EDIT: Changed the title to better describe the problem.

EDIT: Added assembly code

Assembly for first test with O0:

    # GCC 4.5.2 output for the float version of the test, built with -O0.
    # Syntax: AT&T / GAS, x86-64. Instructions are exactly as posted; only
    # the line structure has been restored and comments added.
    #
    # Stack slots used by main:
    #   -4(%rbp)  = i   (outer loop counter)
    #   -8(%rbp)  = j   (inner loop counter)
    #   -12(%rbp) = ang (float)
    .file   "main.cpp"
.globl cosine
    .bss
    .align 32
    .type   cosine, @object
    .size   cosine, 64000000
cosine:
    .zero   64000000
.globl sine
    .align 32
    .type   sine, @object
    .size   sine, 64000000
sine:
    .zero   64000000
    .text
.globl main
    .type   main, @function
main:
.LFB87:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    movq    %rsp, %rbp
    .cfi_offset 6, -16
    .cfi_def_cfa_register 6
    subq    $16, %rsp
    movl    $0, -4(%rbp)                # i = 0
    jmp .L2
.L5:
    movl    $0, -8(%rbp)                # j = 0
    jmp .L3
.L4:
    # ang = (float)((double)(i*j*2) * .LC0 / .LC1)
    movl    -4(%rbp), %eax
    imull   -8(%rbp), %eax              # eax = i * j
    addl    %eax, %eax                  # eax = i * j * 2
    cvtsi2sd    %eax, %xmm0             # int -> double
    movsd   .LC0(%rip), %xmm1
    mulsd   %xmm1, %xmm0
    movsd   .LC1(%rip), %xmm1
    divsd   %xmm1, %xmm0
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0            # double -> float
    movss   %xmm0, -12(%rbp)            # store ang

    # cosine[i][j] = (float)cos((double)ang): the double-precision cos is called
    movss   -12(%rbp), %xmm0
    cvtps2pd    %xmm0, %xmm0            # float -> double for the call
    call    cos
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0            # double result -> float
    movl    -8(%rbp), %eax
    cltq
    movl    -4(%rbp), %edx
    movslq  %edx, %rdx
    imulq   $4000, %rdx, %rdx
    leaq    (%rdx,%rax), %rax           # rax = i*4000 + j
    movss   %xmm0, cosine(,%rax,4)

    # sine[i][j] = (float)sin((double)ang): same pattern with sin
    movss   -12(%rbp), %xmm0
    cvtps2pd    %xmm0, %xmm0
    call    sin
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0
    movl    -8(%rbp), %eax
    cltq
    movl    -4(%rbp), %edx
    movslq  %edx, %rdx
    imulq   $4000, %rdx, %rdx
    leaq    (%rdx,%rax), %rax           # rax = i*4000 + j
    movss   %xmm0, sine(,%rax,4)
    addl    $1, -8(%rbp)                # j++
.L3:
    cmpl    $3999, -8(%rbp)             # inner loop continues while j <= 3999
    setle   %al
    testb   %al, %al
    jne .L4
    addl    $1, -4(%rbp)                # i++
.L2:
    cmpl    $3999, -4(%rbp)             # outer loop continues while i <= 3999
    setle   %al
    testb   %al, %al
    jne .L5
    movl    $0, %eax                    # return 0
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE87:
    .size   main, .-main
    .section    .rodata
    .align 4
    .type   _ZL1N, @object
    .size   _ZL1N, 4
_ZL1N:
    .long   4000
    .align 8
.LC0:
    # 0x400921FB54442D18 = pi as an IEEE-754 double (low word first)
    .long   1413754136
    .long   1074340347
    .align 8
.LC1:
    # 0x40AF400000000000 = 4000.0 as an IEEE-754 double (low word first)
    .long   0
    .long   1085227008
    .ident  "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section    .note.GNU-stack,"",@progbits

Assembly for first test with O3:

    # GCC 4.5.2 output for the float version of the test, built with -O3.
    # Syntax: AT&T / GAS, x86-64. Instructions are exactly as posted; only
    # the line structure has been restored and comments added.
    #
    # Unlike the -O0 build, the loop body makes a single call to the
    # single-precision sincosf instead of separate calls to cos and sin.
    #
    # Register roles inside the loops:
    #   r15d = i                          r13d = 2*i
    #   r12d = running multiple of 2*i    r14  = end address of current cosine row
    #   rbx  = write pointer into cosine  rbp  = write pointer into sine
    .file   "main.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB121:
    .cfi_startproc
    pushq   %r15
    .cfi_def_cfa_offset 16
    xorl    %r15d, %r15d                # i = 0
    .cfi_offset 15, -16
    pushq   %r14
    .cfi_def_cfa_offset 24
    movl    $cosine+16000, %r14d        # end of row 0 (4000 floats * 4 bytes)
    .cfi_offset 14, -24
    pushq   %r13
    .cfi_def_cfa_offset 32
    xorl    %r13d, %r13d                # 2*i = 0
    .cfi_offset 13, -32
    pushq   %r12
    .cfi_def_cfa_offset 40
    pushq   %rbp
    .cfi_def_cfa_offset 48
    pushq   %rbx
    .cfi_def_cfa_offset 56
    subq    $24, %rsp                   # room for the two sincosf outputs
    .cfi_def_cfa_offset 80
    .p2align 4,,10
    .p2align 3
.L2:
    # Start of a row: set up the row pointers and preload the j == 0 results.
    movslq  %r15d, %rbp
    .cfi_offset 3, -56
    .cfi_offset 6, -48
    .cfi_offset 12, -40
    movl    %r13d, %r12d
    movl    $0x3f800000, %edx           # bit pattern of 1.0f -> cosine[i][0]
    imulq   $16000, %rbp, %rbp          # byte offset of row i
    xorl    %eax, %eax                  # bit pattern of 0.0f -> sine[i][0]
    leaq    cosine(%rbp), %rbx
    addq    $sine, %rbp
    jmp .L5
    .p2align 4,,10
    .p2align 3
.L3:
    # ang = (float)((double)(i*j*2) * .LC2 / .LC3), then sincosf(ang, &s, &c)
    movl    %r12d, %eax
    leaq    8(%rsp), %rsi               # second pointer argument
    leaq    12(%rsp), %rdi              # first pointer argument
    subl    %r13d, %eax                 # eax = i*j*2 for the current j
    cvtsi2sd    %eax, %xmm0             # int -> double
    mulsd   .LC2(%rip), %xmm0
    divsd   .LC3(%rip), %xmm0
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0            # double -> float argument
    call    sincosf
    movl    8(%rsp), %edx               # value stored to cosine below
    movl    12(%rsp), %eax              # value stored to sine below
.L5:
    movl    %edx, (%rbx)                # cosine[i][j]
    addq    $4, %rbx
    movl    %eax, 0(%rbp)               # sine[i][j]
    addl    %r13d, %r12d
    addq    $4, %rbp
    cmpq    %r14, %rbx                  # reached the end of this row?
    jne .L3
    addl    $1, %r15d                   # i++
    addl    $2, %r13d                   # keep r13d == 2*i
    leaq    16000(%rbx), %r14           # end address of the next row
    cmpl    $4000, %r15d
    jne .L2
    addq    $24, %rsp
    .cfi_def_cfa_offset 56
    xorl    %eax, %eax                  # return 0
    popq    %rbx
    .cfi_def_cfa_offset 48
    popq    %rbp
    .cfi_def_cfa_offset 40
    popq    %r12
    .cfi_def_cfa_offset 32
    popq    %r13
    .cfi_def_cfa_offset 24
    popq    %r14
    .cfi_def_cfa_offset 16
    popq    %r15
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE121:
    .size   main, .-main
.globl cosine
    .bss
    .align 32
    .type   cosine, @object
    .size   cosine, 64000000
cosine:
    .zero   64000000
.globl sine
    .align 32
    .type   sine, @object
    .size   sine, 64000000
sine:
    .zero   64000000
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC2:
    # 0x400921FB54442D18 = pi as an IEEE-754 double (low word first)
    .long   1413754136
    .long   1074340347
    .align 8
.LC3:
    # 0x40AF400000000000 = 4000.0 as an IEEE-754 double (low word first)
    .long   0
    .long   1085227008
    .ident  "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section    .note.GNU-stack,"",@progbits

Cory Nelson · Accepted Answer

Here's a possibility:

In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.

You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.

Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.

This shouldn't account for a 9x performance difference, though.

Why are sin/cos slower when optimizations are enabled?

Tags:

fbafelipe

1 Answer

Cory Nelson

Recent Activity

Donate For Us

Why are sin/cos slower when optimizations are enabled?

Tags:

fbafelipe

1 Answer

Cory Nelson

Related questions

Recent Activity

Donate For Us