I want to understand how to use PREFETCH* instructions. For this I wrote some code:
.model flat
.code
?fast_mem_copy_sse@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_1:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_1
RET
?fast_mem_copy_sse@@YAXPAH0H@Z ENDP
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_2:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVNTDQ [esi + 0 * 4 * 4], xmm0
MOVNTDQ [esi + 1 * 4 * 4], xmm1
MOVNTDQ [esi + 2 * 4 * 4], xmm2
MOVNTDQ [esi + 3 * 4 * 4], xmm3
MOVNTDQ [esi + 4 * 4 * 4], xmm4
MOVNTDQ [esi + 5 * 4 * 4], xmm5
MOVNTDQ [esi + 6 * 4 * 4], xmm6
MOVNTDQ [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_2
RET
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z ENDP
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_3:
;PREFETCHT0 [edi + 0 * 4 * 4]
;PREFETCHT0 [edi + 1 * 4 * 4]
;PREFETCHT0 [edi + 2 * 4 * 4]
;PREFETCHT0 [edi + 3 * 4 * 4]
;PREFETCHT0 [edi + 4 * 4 * 4]
;PREFETCHT0 [edi + 5 * 4 * 4]
;PREFETCHT0 [edi + 6 * 4 * 4]
;PREFETCHT0 [edi + 7 * 4 * 4]
PREFETCHT0 [edi]
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_3
RET
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z ENDP
END
#include <string.h>
#include <iostream>
#include <time.h>
//#define CHECK
#define BLOCK_SIZE 8*8
#define AMOUNT_OF_BLOCKS 200*4
#define AMOUNT_OF_RUNS 100000
void fast_mem_copy_sse(int *dst, int *src, int n);
void fast_mem_copy_sse_movntdq(int *dst, int *src, int n);
void fast_mem_copy_sse_prefetch(int *dst, int *src, int n);
void fast_mem_copy(int *dst, int *src, int n)
{
for (int i = 0; i < n; i++) {
*(dst + i) = *(src + i);
}
}
int main()
{
clock_t t;
_declspec(align(16)) int a[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
_declspec(align(16)) int b[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_movntdq(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_movntdq work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_movntdq took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_prefetch(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_prefetch work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_prefetch took me " << t << " clicks (" << ((float)t / CLOCKS_PER_SEC) << " seconds).\n";
system("PAUSE");
return 0;
}
I got the following result:
fast_mem_copy took me 11262 clicks (11.262 seconds).
fast_mem_copy_sse took me 1940 clicks (1.94 seconds).
fast_mem_copy_sse_movntdq took me 3570 clicks (3.57 seconds).
fast_mem_copy_sse_prefetch took me 1970 clicks (1.97 seconds).
So what is wrong? Or in fast_mem_copy_sse are using hardware prefetch and there is no any sense to use instruction for prefetch? Also I used VTune and it told me that there is no cache misses.
Prefetching will only help if you do it far enough ahead to matter. I believe CPU speeds are up to the point that it now takes about 200 CPU cycles to fetch from RAM. With a loop like yours you'd need to be prefetching probably 10 iterations ahead.
Also, if you are doing simple copy loops that proceed in sequential access, the CPU hardware is already doing prefetch for you.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With