I'm compiling C++ code with MSVC. My sample code below is SSE2 because I'm using a dispatcher, which means Visual Studio won't dump the assembly for AVX or AVX512. My actual use case is AVX2 and AVX512.
I've tried about three times now to use arrays of SIMD vectors to replace bunches of SIMD loads and stores. It's tempting, but it seems like a sucker bet: it never speeds things up and, if anything, tends to make the code slower. This is the first time I've looked at the assembly, for a project where the array code is definitely slower than the load/store code, and I was surprised to see 13 instructions for the array code versus only 8 for the load/store code to get the same result.
My questions are:
Is the 13-instruction asm block below clearly and significantly slower than the 8-instruction block?
Are arrays of SIMD vectors inherently inefficient compared to equivalent code using loads/stores on arrays of fundamental data types? Or is it just the MSVC compiler not doing a good job? Or does the answer change for different array sizes because of cache effects? My SIMD arrays are usually pretty small, typically about 200 doubles.
//__m128d* TR_vec = new __m128d[vec_arraysize];
//double* TR = (double*)TR_vec;
//const __m128d vec_dcp = _mm_set1_pd(dc_p);
//const __m128d vec_dcq = _mm_set1_pd(dc_q);
for (jj = bsteps - 1; jj > -1; jj--)
{
    jjN = ((jj + 1) + vectorsize - 1) & (-vectorsize);
    for (kk = 0; kk < jjN; kk += vectorsize)
    {
#ifdef ARRAY_OF_SIMDS
        vndx = kk >> 1;
        TR_vec[vndx] = _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, TR_vec[vndx]));
#else
        _mm_store_pd(TR + kk, _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, _mm_load_pd(TR + kk))));
#endif
    }
}
//; 29161:
TR_vec[vndx] = _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, TR_vec[vndx]));
movaps xmm1, xmm7
mov rax, rcx
movaps xmm0, xmm6
mulpd xmm0, XMMWORD PTR [r8]
add rax, rax
add rdx, 2
add r8, 16
mulpd xmm1, XMMWORD PTR [rdi+rax*8]
mov rax, rcx
add rax, rax
addpd xmm1, xmm0
movups XMMWORD PTR [rdi+rax*8], xmm1
cmp rdx, r9
//; 29168:
_mm_store_pd(TR + kk, _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, _mm_load_pd(TR + kk))));
movaps xmm1, xmm6
movaps xmm0, xmm7
mulpd xmm1, XMMWORD PTR [rdi+rax*8+8]
mulpd xmm0, XMMWORD PTR [rdi+rax*8]
addpd xmm1, xmm0
movups XMMWORD PTR [rdi+rax*8], xmm1
add rax, 2
cmp rax, rcx
//AVX2 dumped by Visual Studio:
TR_vec[vndx] = _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_load_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, TR_vec[vndx]));
mov rax, rcx
shl rax, 5
vmulpd ymm1, ymm5, YMMWORD PTR [rax+rdi]
movsxd rax, r8d
add r8d, 4
vmulpd ymm0, ymm4, YMMWORD PTR [rdi+rax*8]
mov rax, rcx
shl rax, 5
vaddpd ymm1, ymm0, ymm1
vmovupd YMMWORD PTR [rax+rdi], ymm1
cmp rdx, r9
_mm256_store_pd(TR + kk, _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_load_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, _mm256_load_pd(TR + kk))));
vmulpd ymm1, ymm5, YMMWORD PTR [rcx]
vmulpd ymm0, ymm4, YMMWORD PTR [rcx+8]
lea rcx, QWORD PTR [rcx+32]
vaddpd ymm1, ymm0, ymm1
vmovupd YMMWORD PTR [rcx-32], ymm1
sub rdx, 1
I think @PeterCordes provided the answers to this question in his excellent comments:
The MSVC asm for the array-of-simds __m256d* code is "clearly" significantly slower than the MSVC asm for the double* code.
But this is a case of poor MSVC code generation, not an inherent problem with the __m256d* approach. When the same source code is compiled with the Visual Studio LLVM toolset (clang-cl), I get the following comparative asm. The __m256d* version is only slightly slower than the double* version, as would be expected given the single extra instruction.
//__m256d* TR_vec = new __m256d[vec_arraysize];
//double* TR = (double*)TR_vec;
//const __m256d vec_dcp = _mm256_set1_pd(dc_p);
//const __m256d vec_dcq = _mm256_set1_pd(dc_q);
for (jj = bsteps - 1; jj > -1; jj--)
{
    jjN = ((jj + 1) + vectorsize - 1) & (-vectorsize);
    for (kk = 0; kk < jjN; kk += vectorsize)
    {
#ifdef ARRAY_OF_SIMDS
        vndx = kk >> 2;
        TR_vec[vndx] = _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, TR_vec[vndx]));
        //#DEBUG_VALUE: kk <- $r10
        //#DEBUG_VALUE: vndx <- [DW_OP_constu 2, DW_OP_shr, DW_OP_stack_value] $r10
        //vmulpd ymm0, ymm3, ymmword ptr [rdi + 8*r10]
        //vmulpd ymm1, ymm2, ymmword ptr [rdi + 8*r10 + 8]
        //vaddpd ymm0, ymm0, ymm1
        //vmovapd ymmword ptr [rdi + 8*r10], ymm0
#else
        _mm256_storeu_pd(TR + kk, _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, _mm256_loadu_pd(TR + kk))));
        //#DEBUG_VALUE: :kk <- $r10
        //vmulpd ymm0, ymm3, ymmword ptr [rdi + 8*r10]
        //vmulpd ymm1, ymm2, ymmword ptr [rdi + 8*r10 + 8]
        //vaddpd ymm0, ymm0, ymm1
        //vmovapd ymmword ptr [rdi + 8*r10], ymm0
#endif
    }
}
To enable FMA contraction across separate mul and add intrinsics, use -ffp-contract=fast (see the Clang manual). #pragma STDC FP_CONTRACT ON in the source may also work, but that might only apply within expressions, not across statements.
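If the build already targets an FMA-capable CPU (any AVX2 machine), another option is to write the fused multiply-add explicitly with _mm256_fmadd_pd instead of relying on the compiler to contract the separate mul and add. This is only an untested sketch, reusing the same variables as above (vec_dcp, vec_dcq, TR, TR_vec, kk, vndx):
// Sketch only: _mm256_fmadd_pd(a, b, c) computes a*b + c in a single instruction,
// so the two multiplies plus an add become one multiply plus one FMA.
// Requires <immintrin.h> and an FMA-capable target (e.g. /arch:AVX2).
// Note the product is not rounded before the add, so results can differ in the
// last bit from the separate mul + add version.
TR_vec[vndx] = _mm256_fmadd_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1)),
                               _mm256_mul_pd(vec_dcq, TR_vec[vndx]));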
After a lot of trial and error I discovered that for Visual Studio clang-cl you need to prefix Clang-specific options with /clang:, so for example -ffp-contract=fast goes on the command line as /clang:-ffp-contract=fast.