Cosine and sine are computed by evaluating a Chebyshev polynomial with Horner's method, e.g. a0 + x(a1 + x(a2 + ...)). The fused multiply-add instructions this generates form a dependency chain, meaning we can only execute one every clock cycle.
This is a waste, so in code that calls many trig functions, such as the following, we would ideally evaluate two trig functions at the same time and interleave the instructions of their Horner evaluations.
However, gcc 15.2.0 generates calls to glibc's vectorized trig implementations rather than inlining them (https://godbolt.org/z/sWfMfW93b), even with -flto, and these out-of-line calls prevent the instructions from being interleaved. How can I get gcc to produce the assembly I want?
#include <math.h>
#include <stddef.h>
#define LANES 4
void twiddle(double * restrict x_r, double * restrict x_i, int m, int n)
{
double angles[LANES];
for (int k = 0; k < m; k++) {
for (int j = 0; j < n; j += LANES) {
double base = -2.0 * M_PI / (m * n) * k * j;
#pragma omp simd
for (int j2 = 0; j2 < LANES; j2++) {
angles[j2] = base * j2;
}
#pragma omp simd
for (int j2 = 0; j2 < LANES; j2++) {
double tmp_r = cos(angles[j2]) * x_r[k * n + (j + j2)] -
sin(angles[j2]) * x_i[k * n + (j + j2)];
double tmp_i = cos(angles[j2]) * x_i[k * n + (j + j2)] +
sin(angles[j2]) * x_r[k * n + (j + j2)];
x_r[k * n + (j + j2)] = tmp_r;
x_i[k * n + (j + j2)] = tmp_i;
}
}
}
}
Generated hot loop:
.L4:
vxorpd %xmm5, %xmm5, %xmm5
vcvtsi2sdl %ebx, %xmm5, %xmm0
vmulsd -120(%rbp), %xmm0, %xmm0
vmovddup %xmm0, %xmm0
vandpd .LC1(%rip), %xmm0, %xmm1
vmulpd .LC2(%rip), %xmm0, %xmm0
vmovdqa %xmm1, -80(%rbp)
vmovapd %xmm0, -64(%rbp)
vmovapd -80(%rbp), %ymm0
call _ZGVdN4v_sin
vmovapd %ymm0, -112(%rbp)
vmovapd -80(%rbp), %ymm0
call _ZGVdN4v_cos
vmovupd (%r15,%rbx,8), %ymm1
vmovapd -112(%rbp), %ymm4
vmovapd %ymm0, %ymm3
vmovupd (%r14,%rbx,8), %ymm0
vmulpd %ymm3, %ymm1, %ymm2
vmulpd %ymm4, %ymm1, %ymm1
vfmadd231pd %ymm0, %ymm4, %ymm2
vfmsub132pd %ymm3, %ymm1, %ymm0
vmovupd %ymm2, (%r15,%rbx,8)
vmovupd %ymm0, (%r14,%rbx,8)
addq $4, %rbx
cmpq %rbx, %r12
jne .L4
Example of self written code: https://godbolt.org/z/Ysc8KxWeY
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With