Optimization using prefetch

Question

I want to understand how to use the PREFETCH* instructions. To experiment with them, I wrote the following code (MASM copy routines followed by a C++ benchmark driver):

.model flat

.code

; void fast_mem_copy_sse(int *dst, int *src, int n)      (cdecl, 32-bit x86)
;
; Copies n ints from src to dst with aligned 128-bit SSE loads and stores.
; Each loop iteration moves 8 XMM registers = 128 bytes = 32 ints.
;
; Preconditions:
;   - dst and src are 16-byte aligned (required by MOVAPS).
;   - n is a multiple of 32: the loop ends only when the remaining count
;     reaches exactly zero.  n <= 0 copies nothing.
;
; ESI and EDI are callee-saved under cdecl, so they are saved and restored.
?fast_mem_copy_sse@@YAXPAH0H@Z PROC
    PUSH    esi
    PUSH    edi

    ; The two pushes moved ESP down by 8, hence the extra "+ 8" below.
    MOV     edi, [esp + 8 + 4]  ; dst
    MOV     esi, [esp + 8 + 8]  ; src
    MOV     ecx, [esp + 8 + 12] ; n = number of ints to copy (not bytes)

    TEST    ecx, ecx
    JLE     copy_done_1         ; nothing to do for n <= 0

copy_loop_1:
    ; Load one 128-byte block from the source ...
    MOVAPS  xmm0, [esi + 0 * 4 * 4]
    MOVAPS  xmm1, [esi + 1 * 4 * 4]
    MOVAPS  xmm2, [esi + 2 * 4 * 4]
    MOVAPS  xmm3, [esi + 3 * 4 * 4]
    MOVAPS  xmm4, [esi + 4 * 4 * 4]
    MOVAPS  xmm5, [esi + 5 * 4 * 4]
    MOVAPS  xmm6, [esi + 6 * 4 * 4]
    MOVAPS  xmm7, [esi + 7 * 4 * 4]

    ; ... and store it to the destination.
    MOVAPS  [edi + 0 * 4 * 4], xmm0
    MOVAPS  [edi + 1 * 4 * 4], xmm1
    MOVAPS  [edi + 2 * 4 * 4], xmm2
    MOVAPS  [edi + 3 * 4 * 4], xmm3
    MOVAPS  [edi + 4 * 4 * 4], xmm4
    MOVAPS  [edi + 5 * 4 * 4], xmm5
    MOVAPS  [edi + 6 * 4 * 4], xmm6
    MOVAPS  [edi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8          ; advance both pointers by 128 bytes
    ADD     edi, 4*4*8

    SUB     ecx, 4*8            ; 32 ints consumed
    JNZ     copy_loop_1

copy_done_1:
    POP     edi
    POP     esi
    RET
?fast_mem_copy_sse@@YAXPAH0H@Z ENDP

; void fast_mem_copy_sse_movntdq(int *dst, int *src, int n)   (cdecl, 32-bit x86)
;
; Copies n ints from src to dst using aligned SSE loads (MOVAPS) and
; non-temporal 128-bit stores (MOVNTDQ).  Each loop iteration moves
; 8 XMM registers = 128 bytes = 32 ints.
;
; Preconditions:
;   - dst and src are 16-byte aligned (required by MOVAPS / MOVNTDQ).
;   - n is a multiple of 32: the loop ends only when the remaining count
;     reaches exactly zero.  n <= 0 copies nothing.
;
; ESI and EDI are callee-saved under cdecl, so they are saved and restored.
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z PROC
    PUSH    esi
    PUSH    edi

    ; The two pushes moved ESP down by 8, hence the extra "+ 8" below.
    MOV     edi, [esp + 8 + 4]  ; dst
    MOV     esi, [esp + 8 + 8]  ; src
    MOV     ecx, [esp + 8 + 12] ; n = number of ints to copy (not bytes)

    TEST    ecx, ecx
    JLE     copy_done_2         ; nothing to do for n <= 0

copy_loop_2:
    ; Load one 128-byte block from the source ...
    MOVAPS  xmm0, [esi + 0 * 4 * 4]
    MOVAPS  xmm1, [esi + 1 * 4 * 4]
    MOVAPS  xmm2, [esi + 2 * 4 * 4]
    MOVAPS  xmm3, [esi + 3 * 4 * 4]
    MOVAPS  xmm4, [esi + 4 * 4 * 4]
    MOVAPS  xmm5, [esi + 5 * 4 * 4]
    MOVAPS  xmm6, [esi + 6 * 4 * 4]
    MOVAPS  xmm7, [esi + 7 * 4 * 4]

    ; ... and write it to the destination with non-temporal stores.
    MOVNTDQ [edi + 0 * 4 * 4], xmm0
    MOVNTDQ [edi + 1 * 4 * 4], xmm1
    MOVNTDQ [edi + 2 * 4 * 4], xmm2
    MOVNTDQ [edi + 3 * 4 * 4], xmm3
    MOVNTDQ [edi + 4 * 4 * 4], xmm4
    MOVNTDQ [edi + 5 * 4 * 4], xmm5
    MOVNTDQ [edi + 6 * 4 * 4], xmm6
    MOVNTDQ [edi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8          ; advance both pointers by 128 bytes
    ADD     edi, 4*4*8

    SUB     ecx, 4*8            ; 32 ints consumed
    JNZ     copy_loop_2

    ; Non-temporal stores are weakly ordered; fence so the copied data is
    ; ordered before anything the caller does after this routine returns.
    SFENCE

copy_done_2:
    POP     edi
    POP     esi
    RET
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z ENDP

; void fast_mem_copy_sse_prefetch(int *dst, int *src, int n)  (cdecl, 32-bit x86)
;
; Same copy as fast_mem_copy_sse (aligned MOVAPS loads and stores, 128 bytes
; = 32 ints per iteration), with a PREFETCHT0 hint issued on the source block
; at the top of every iteration.
;
; Preconditions:
;   - dst and src are 16-byte aligned (required by MOVAPS).
;   - n is a multiple of 32: the loop ends only when the remaining count
;     reaches exactly zero.  n <= 0 copies nothing.
;
; NOTE(review): the prefetch targets the very address the next MOVAPS reads,
; so it gets no lead time over the demand load; prefetching a later block
; would be needed for it to have a chance of helping.
;
; ESI and EDI are callee-saved under cdecl, so they are saved and restored.
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z PROC
    PUSH    esi
    PUSH    edi

    ; The two pushes moved ESP down by 8, hence the extra "+ 8" below.
    MOV     edi, [esp + 8 + 4]  ; dst
    MOV     esi, [esp + 8 + 8]  ; src
    MOV     ecx, [esp + 8 + 12] ; n = number of ints to copy (not bytes)

    TEST    ecx, ecx
    JLE     copy_done_3         ; nothing to do for n <= 0

copy_loop_3:
    ; Alternative kept from the experiment: one hint per 16-byte chunk.
    ;PREFETCHT0 [esi + 0 * 4 * 4]
    ;PREFETCHT0 [esi + 1 * 4 * 4]
    ;PREFETCHT0 [esi + 2 * 4 * 4]
    ;PREFETCHT0 [esi + 3 * 4 * 4]
    ;PREFETCHT0 [esi + 4 * 4 * 4]
    ;PREFETCHT0 [esi + 5 * 4 * 4]
    ;PREFETCHT0 [esi + 6 * 4 * 4]
    ;PREFETCHT0 [esi + 7 * 4 * 4]
    PREFETCHT0 [esi]            ; hint the start of the block loaded below

    ; Load one 128-byte block from the source ...
    MOVAPS  xmm0, [esi + 0 * 4 * 4]
    MOVAPS  xmm1, [esi + 1 * 4 * 4]
    MOVAPS  xmm2, [esi + 2 * 4 * 4]
    MOVAPS  xmm3, [esi + 3 * 4 * 4]
    MOVAPS  xmm4, [esi + 4 * 4 * 4]
    MOVAPS  xmm5, [esi + 5 * 4 * 4]
    MOVAPS  xmm6, [esi + 6 * 4 * 4]
    MOVAPS  xmm7, [esi + 7 * 4 * 4]

    ; ... and store it to the destination.
    MOVAPS  [edi + 0 * 4 * 4], xmm0
    MOVAPS  [edi + 1 * 4 * 4], xmm1
    MOVAPS  [edi + 2 * 4 * 4], xmm2
    MOVAPS  [edi + 3 * 4 * 4], xmm3
    MOVAPS  [edi + 4 * 4 * 4], xmm4
    MOVAPS  [edi + 5 * 4 * 4], xmm5
    MOVAPS  [edi + 6 * 4 * 4], xmm6
    MOVAPS  [edi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8          ; advance both pointers by 128 bytes
    ADD     edi, 4*4*8

    SUB     ecx, 4*8            ; 32 ints consumed
    JNZ     copy_loop_3

copy_done_3:
    POP     edi
    POP     esi
    RET
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z ENDP

END

#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <iostream>

//#define CHECK

// Benchmark dimensions. The bodies are parenthesized so each macro expands
// as a single value in any expression (e.g. `x / BLOCK_SIZE`), not just in
// pure products.
#define BLOCK_SIZE          (8*8)       // ints per block
#define AMOUNT_OF_BLOCKS    (200*4)     // blocks per buffer
#define AMOUNT_OF_RUNS      100000      // timed repetitions per routine

// Copy routines implemented in the assembly listing (the mangled
// ?fast_mem_copy_sse*@@YAXPAH0H@Z procedures).
// n is a count of ints: each assembly loop iteration moves 8 x 16 bytes and
// subtracts 32 from the count, looping until it is exactly zero, so n has to
// be a multiple of 32.
// NOTE(review): the routines use MOVAPS, so both buffers are presumably
// required to be 16-byte aligned - confirm against the callers.
void fast_mem_copy_sse(int *dst, int *src, int n);          // MOVAPS loads + MOVAPS stores
void fast_mem_copy_sse_movntdq(int *dst, int *src, int n);  // MOVAPS loads + MOVNTDQ stores
void fast_mem_copy_sse_prefetch(int *dst, int *src, int n); // as the first, plus PREFETCHT0

// Scalar baseline for the benchmark: copies n ints from src to dst one
// element at a time. Does nothing when n <= 0.
void fast_mem_copy(int *dst, int *src, int n)
{
    int *out = dst;
    const int *in = src;

    for (int remaining = n; remaining > 0; --remaining) {
        *out++ = *in++;
    }
}

int main() 
{
    clock_t t;

    _declspec(align(16)) int a[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
    _declspec(align(16)) int b[AMOUNT_OF_BLOCKS*BLOCK_SIZE];

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy work wrong; j = " << j << "
";
            }
        }
#endif
    }

    t = clock() - t;
    std::cout << "fast_mem_copy took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).
";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS); 

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse work wrong; j = " << j << "
";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).
";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse_movntdq(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse_movntdq work wrong; j = " << j << "
";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse_movntdq took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).
";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse_prefetch(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse_prefetch work wrong; j = " << j << "
";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse_prefetch took me " << t << " clicks (" << ((float)t / CLOCKS_PER_SEC) << " seconds).
";

    system("PAUSE");
    return 0;
}

I got the following result:

fast_mem_copy took me 11262 clicks (11.262 seconds).
fast_mem_copy_sse took me 1940 clicks (1.94 seconds).
fast_mem_copy_sse_movntdq took me 3570 clicks (3.57 seconds).
fast_mem_copy_sse_prefetch took me 1970 clicks (1.97 seconds).

So what is wrong? Or is hardware prefetching already at work in fast_mem_copy_sse, so that there is no point in issuing explicit prefetch instructions? Also, I ran VTune and it reported no cache misses.

Zan Lynx · Accepted Answer

Prefetching will only help if you do it far enough ahead to matter. I believe CPU speeds are up to the point that it now takes about 200 CPU cycles to fetch from RAM. With a loop like yours you'd need to be prefetching probably 10 iterations ahead.

Also, if you are doing simple copy loops that proceed in sequential access, the CPU hardware is already doing prefetch for you.

Optimization using prefetch

Tags:

optimization

assembly

sse

user1358552

1 Answers

Zan Lynx

Recent Activity

Donate For Us

Optimization using prefetch

Tags:

optimization

assembly

sse

user1358552

1 Answers

Zan Lynx

Related questions

Recent Activity

Donate For Us