I'm compiling C++ code with MSVC. My sample code below is SSE2 because I'm using a dispatcher, which means Visual Studio won't dump the assembly for AVX or AVX512. My actual use case is AVX2 and AVX512.
I've tried about three times now to use arrays of SIMD vectors to replace bunches of SIMD loads and stores. It's tempting, but it seems like a sucker bet: it never speeds things up and, if anything, tends to make the code slower. This is the first time I've looked at the assembly, for a project where the array code is definitely slower than the load/store code, and I was surprised to see 13 instructions for the array code versus only 8 for the load/store code to get the same result.
My questions are:
Is the 13-instruction asm block below clearly and significantly slower than the 8-instruction block?
Are arrays of SIMD vectors inherently inefficient compared to equivalent code using loads/stores on arrays of fundamental data types? Or is it just the MSVC compiler not doing a good job? Or does the answer change for different array sizes because of cache effects? My SIMD arrays are usually pretty small, typically about 200 doubles.
//__m128d* TR_vec = new __m128d[vec_arraysize];
//double* TR = (double*)TR_vec;
//const __m128d vec_dcp = _mm_set1_pd(dc_p);
//const __m128d vec_dcq = _mm_set1_pd(dc_q);
for (jj = bsteps - 1; jj > -1; jj--)
{
    jjN = ((jj + 1) + vectorsize - 1) & (-vectorsize);
    for (kk = 0; kk < jjN; kk += vectorsize)
    {
#ifdef ARRAY_OF_SIMDS
        vndx = kk >> 1;
        TR_vec[vndx] = _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, TR_vec[vndx]));
#else
        _mm_store_pd(TR + kk, _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, _mm_load_pd(TR + kk))));
#endif
    }
}
//; 29161:
TR_vec[vndx] = _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, TR_vec[vndx]));
movaps xmm1, xmm7
mov rax, rcx
movaps xmm0, xmm6
mulpd xmm0, XMMWORD PTR [r8]
add rax, rax
add rdx, 2
add r8, 16
mulpd xmm1, XMMWORD PTR [rdi+rax*8]
mov rax, rcx
add rax, rax
addpd xmm1, xmm0
movups XMMWORD PTR [rdi+rax*8], xmm1
cmp rdx, r9
//; 29168:
_mm_store_pd(TR + kk, _mm_add_pd(_mm_mul_pd(vec_dcp, _mm_load_pd(TR + (kk + 1))), _mm_mul_pd(vec_dcq, _mm_load_pd(TR + kk))));
movaps xmm1, xmm6
movaps xmm0, xmm7
mulpd xmm1, XMMWORD PTR [rdi+rax*8+8]
mulpd xmm0, XMMWORD PTR [rdi+rax*8]
addpd xmm1, xmm0
movups XMMWORD PTR [rdi+rax*8], xmm1
add rax, 2
cmp rax, rcx
//AVX2 dumped by Visual Studio:
TR_vec[vndx] = _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_load_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, TR_vec[vndx]));
mov rax, rcx
shl rax, 5
vmulpd ymm1, ymm5, YMMWORD PTR [rax+rdi]
movsxd rax, r8d
add r8d, 4
vmulpd ymm0, ymm4, YMMWORD PTR [rdi+rax*8]
mov rax, rcx
shl rax, 5
vaddpd ymm1, ymm0, ymm1
vmovupd YMMWORD PTR [rax+rdi], ymm1
cmp rdx, r9
_mm256_store_pd(TR + kk, _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_load_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, _mm256_load_pd(TR + kk))));
vmulpd ymm1, ymm5, YMMWORD PTR [rcx]
vmulpd ymm0, ymm4, YMMWORD PTR [rcx+8]
lea rcx, QWORD PTR [rcx+32]
vaddpd ymm1, ymm0, ymm1
vmovupd YMMWORD PTR [rcx-32], ymm1
sub rdx, 1
I think @PeterCordes provided the answers to this question in his excellent comments:
The MSVC asm for the array-of-simds __m256d* code is "clearly" significantly slower than the MSVC asm for the double* code.
But this is a case of poor MSVC code generation, not an inherent problem with the __m256d* approach. When the same source code is compiled with the Visual Studio LLVM toolset (clang-cl), I get the following comparative asm. The __m256d* version is only slightly slower than the double* version, as would be expected given the single extra instruction.
//__m256d* TR_vec = new __m256d[vec_arraysize];
//double* TR = (double*)TR_vec;
//const __m256d vec_dcp = _mm256_set1_pd(dc_p);
//const __m256d vec_dcq = _mm256_set1_pd(dc_q);
for (jj = bsteps - 1; jj > -1; jj--)
{
    jjN = ((jj + 1) + vectorsize - 1) & (-vectorsize);
    for (kk = 0; kk < jjN; kk += vectorsize)
    {
#ifdef ARRAY_OF_SIMDS
        vndx = kk >> 2;
        TR_vec[vndx] = _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, TR_vec[vndx]));
        //#DEBUG_VALUE: kk <- $r10
        //#DEBUG_VALUE: vndx <- [DW_OP_constu 2, DW_OP_shr, DW_OP_stack_value] $r10
        //vmulpd ymm0, ymm3, ymmword ptr [rdi + 8*r10]
        //vmulpd ymm1, ymm2, ymmword ptr [rdi + 8*r10 + 8]
        //vaddpd ymm0, ymm0, ymm1
        //vmovapd ymmword ptr [rdi + 8*r10], ymm0
#else
        _mm256_storeu_pd(TR + kk, _mm256_add_pd(_mm256_mul_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1))), _mm256_mul_pd(vec_dcq, _mm256_loadu_pd(TR + kk))));
        //#DEBUG_VALUE: :kk <- $r10
        //vmulpd ymm0, ymm3, ymmword ptr [rdi + 8*r10]
        //vmulpd ymm1, ymm2, ymmword ptr [rdi + 8*r10 + 8]
        //vaddpd ymm0, ymm0, ymm1
        //vmovapd ymmword ptr [rdi + 8*r10], ymm0
#endif
    }
}
To enable FMA contraction across separate mul and add intrinsics, use -ffp-contract=fast (see the Clang manual). #pragma STDC FP_CONTRACT ON in the source may also work, but that might only apply within expressions, not across statements.
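If the build already targets an FMA-capable CPU (any AVX2 machine), another option is to write the fused multiply-add explicitly with _mm256_fmadd_pd instead of relying on the compiler to contract the separate mul and add. This is only an untested sketch, reusing the same variables as above (vec_dcp, vec_dcq, TR, TR_vec, kk, vndx):
// Sketch only: _mm256_fmadd_pd(a, b, c) computes a*b + c in a single instruction,
// so the two multiplies plus an add become one multiply plus one FMA.
// Requires <immintrin.h> and an FMA-capable target (e.g. /arch:AVX2).
// Note the product is not rounded before the add, so results can differ in the
// last bit from the separate mul + add version.
TR_vec[vndx] = _mm256_fmadd_pd(vec_dcp, _mm256_loadu_pd(TR + (kk + 1)),
                               _mm256_mul_pd(vec_dcq, TR_vec[vndx]));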
After a lot of trial and error I discovered that for Visual Studio clang-cl you need to prefix Clang-specific options with /clang:, so for example -ffp-contract=fast goes on the command line as /clang:-ffp-contract=fast.