After watching Day 25 I want to comment on the memory bandwidth thing. I seriously doubt any code will get the 32 GB/s Casey was looking up online. That number is the maximum memory bandwidth the CPU supports; real bandwidth will be lower. To test this, I wrote a small test application that does memcpy from one memory buffer to another. And I am not seeing the maximum number from the spec sheet (on my laptop that would be 25.6 GB/s for the i7-4750HQ CPU). To see if maybe we can write a better memcpy, I also implemented the copy with SSE and AVX instructions - still not getting that number.
So here are the numbers for my laptop with an i7-4750HQ CPU (16 GB of memory, two 8 GB modules), compiled with "clang -O2".
Simple memcpy gives me 6.51 GiB/s.
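This isn't literally the code from MemSpeed.cpp (linked at the end), just a minimal sketch of how the measurement works on Linux - copy a big buffer a number of times and divide bytes copied by elapsed time; the buffer size and repeat count here are arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main(void)
{
    const size_t kSize = 256 * 1024 * 1024; // 256 MiB per buffer (arbitrary)
    const int kRepeat = 10;

    uint8_t* src = (uint8_t*)malloc(kSize);
    uint8_t* dst = (uint8_t*)malloc(kSize);
    memset(src, 1, kSize); // touch the pages up front so page faults
    memset(dst, 0, kSize); // don't end up in the measurement

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < kRepeat; i++)
    {
        memcpy(dst, src, kSize);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double seconds = (t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    double copied = (double)kSize * kRepeat / (1024.0 * 1024.0 * 1024.0);
    printf("%.2f GiB/s\n", copied / seconds);

    free(src);
    free(dst);
    return 0;
}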
Using SSE2 with the following code gives me 3.97 GiB/s. That means memcpy is better optimized than naive SSE2.
// dst and src must be 16-byte aligned
// size must be multiple of 16*8 = 128 bytes
static void CopyWithSSE(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 8 * sizeof(__m128i);
while (size)
{
__m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
__m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
__m128 c = _mm_load_ps((float*)(src + 2*sizeof(__m128)));
__m128 d = _mm_load_ps((float*)(src + 3*sizeof(__m128)));
__m128 e = _mm_load_ps((float*)(src + 4*sizeof(__m128)));
__m128 f = _mm_load_ps((float*)(src + 5*sizeof(__m128)));
__m128 g = _mm_load_ps((float*)(src + 6*sizeof(__m128)));
__m128 h = _mm_load_ps((float*)(src + 7*sizeof(__m128)));
_mm_store_ps((float*)(dst + 0*sizeof(__m128)), a);
_mm_store_ps((float*)(dst + 1*sizeof(__m128)), b);
_mm_store_ps((float*)(dst + 2*sizeof(__m128)), c);
_mm_store_ps((float*)(dst + 3*sizeof(__m128)), d);
_mm_store_ps((float*)(dst + 4*sizeof(__m128)), e);
_mm_store_ps((float*)(dst + 5*sizeof(__m128)), f);
_mm_store_ps((float*)(dst + 6*sizeof(__m128)), g);
_mm_store_ps((float*)(dst + 7*sizeof(__m128)), h);
size -= stride;
src += stride;
dst += stride;
}
}
I tried using fewer registers to see if the code can be smaller. That still gives 3.97 GiB/s.
// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSESmall(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 2 * sizeof(__m128);
while (size)
{
__m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
__m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
_mm_store_ps((float*)(dst + 0*sizeof(__m128)), a);
_mm_store_ps((float*)(dst + 1*sizeof(__m128)), b);
size -= stride;
src += stride;
dst += stride;
}
}
Then I tried the store instruction that doesn't pollute the cache. That gives 6.67 GiB/s. That's very close to memcpy, and I'm guessing the C runtime on Linux uses this instruction.
// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSENoCache(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 2 * sizeof(__m128);
while (size)
{
__m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
__m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
_mm_stream_ps((float*)(dst + 0*sizeof(__m128)), a);
_mm_stream_ps((float*)(dst + 1*sizeof(__m128)), b);
size -= stride;
src += stride;
dst += stride;
}
}
I tried using prefetch instructions, but that did not give any reasonable speedup. I'm guessing modern CPUs can predict linear memory access pretty well and prefetch automatically.
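I didn't paste that variant above; it was essentially the small SSE loop with a prefetch added - something like the sketch below (the function name and the 512-byte prefetch distance are just for illustration; needs <xmmintrin.h>):

// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSEPrefetch(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        // prefetch a bit ahead of the current read position;
        // prefetching past the end of the buffer is harmless
        _mm_prefetch((const char*)(src + 512), _MM_HINT_T0);
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        _mm_store_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_store_ps((float*)(dst + 1*sizeof(__m128)), b);
        size -= stride;
        src += stride;
        dst += stride;
    }
}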
Then I tried using AVX instructions. This gives 4.00 GiB/s, which is not better than SSE.
// dst and src must be 32-byte aligned
// size must be multiple of 32*16 = 512 bytes
static void CopyWithAVX(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 16 * sizeof(__m256i);
while (size)
{
__m256i a = _mm256_load_si256((__m256i*)src + 0);
__m256i b = _mm256_load_si256((__m256i*)src + 1);
__m256i c = _mm256_load_si256((__m256i*)src + 2);
__m256i d = _mm256_load_si256((__m256i*)src + 3);
__m256i e = _mm256_load_si256((__m256i*)src + 4);
__m256i f = _mm256_load_si256((__m256i*)src + 5);
__m256i g = _mm256_load_si256((__m256i*)src + 6);
__m256i h = _mm256_load_si256((__m256i*)src + 7);
__m256i i = _mm256_load_si256((__m256i*)src + 8);
__m256i j = _mm256_load_si256((__m256i*)src + 9);
__m256i k = _mm256_load_si256((__m256i*)src + 10);
__m256i l = _mm256_load_si256((__m256i*)src + 11);
__m256i m = _mm256_load_si256((__m256i*)src + 12);
__m256i n = _mm256_load_si256((__m256i*)src + 13);
__m256i o = _mm256_load_si256((__m256i*)src + 14);
__m256i p = _mm256_load_si256((__m256i*)src + 15);
_mm256_store_si256((__m256i*)dst + 0, a);
_mm256_store_si256((__m256i*)dst + 1, b);
_mm256_store_si256((__m256i*)dst + 2, c);
_mm256_store_si256((__m256i*)dst + 3, d);
_mm256_store_si256((__m256i*)dst + 4, e);
_mm256_store_si256((__m256i*)dst + 5, f);
_mm256_store_si256((__m256i*)dst + 6, g);
_mm256_store_si256((__m256i*)dst + 7, h);
_mm256_store_si256((__m256i*)dst + 8, i);
_mm256_store_si256((__m256i*)dst + 9, j);
_mm256_store_si256((__m256i*)dst + 10, k);
_mm256_store_si256((__m256i*)dst + 11, l);
_mm256_store_si256((__m256i*)dst + 12, m);
_mm256_store_si256((__m256i*)dst + 13, n);
_mm256_store_si256((__m256i*)dst + 14, o);
_mm256_store_si256((__m256i*)dst + 15, p);
size -= stride;
src += stride;
dst += stride;
}
}
Let's see if reducing the register count helps, or at least doesn't make everything worse. It doesn't help; I'm getting 3.99 GiB/s for this.
// dst and src must be 32-byte aligned
// size must be multiple of 32*2 = 64 bytes
static void CopyWithAVXSmall(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 2 * sizeof(__m256i);
while (size)
{
__m256i a = _mm256_load_si256((__m256i*)src + 0);
__m256i b = _mm256_load_si256((__m256i*)src + 1);
_mm256_store_si256((__m256i*)dst + 0, a);
_mm256_store_si256((__m256i*)dst + 1, b);
size -= stride;
src += stride;
dst += stride;
}
}
Using the store instruction that doesn't pollute the cache helps. Now I'm getting 6.64 GiB/s - the same speed as with SSE.
// dst and src must be 32-byte aligned
// size must be multiple of 32*2 = 64 bytes
static void CopyWithAVXNoCache(uint8_t* dst, uint8_t* src, size_t size)
{
size_t stride = 2 * sizeof(__m256i);
while (size)
{
__m256i a = _mm256_load_si256((__m256i*)src + 0);
__m256i b = _mm256_load_si256((__m256i*)src + 1);
_mm256_stream_si256((__m256i*)dst + 0, a);
_mm256_stream_si256((__m256i*)dst + 1, b);
size -= stride;
src += stride;
dst += stride;
}
}
Then I tried a couple of crazy things.
First I tried the "rep movsb", "rep movsl" and "rep movsq" instructions. These are typically not recommended on modern CPUs, but I was surprised that they give better speed than just using SSE instructions - ~5.5 GiB/s for all three variants. So for smaller moves, using "rep movsb" is fine in my opinion. I also tried unaligned addresses (not a multiple of 16), still OK - around 5.5 GiB/s. I'm guessing modern CPUs recognize the "rep movsX" instructions as a special case and do "the right thing" automatically.
static void __movsb(void* dst, const void* src, size_t size)
{
__asm__ __volatile__("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}
static void __movsd(void* dst, const void* src, size_t size)
{
__asm__ __volatile__("rep movsl" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}
static void __movsq(void* dst, const void* src, size_t size)
{
__asm__ __volatile__("rep movsq" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}
Then I went for some completely crazy stuff - copying in parallel with two threads:
const size_t kThreadCount = 2;
struct ThreadWorkData
{
uint8_t* src;
uint8_t* dst;
size_t size;
volatile bool RunThread;
};
static ThreadWorkData ThreadData[kThreadCount];
static volatile long ThreadsReady;
static void* ThreadProc(void* Arg)
{
size_t ThreadIndex = (size_t)Arg;
ThreadWorkData* MyData = &ThreadData[ThreadIndex];
for (;;)
{
while (!MyData->RunThread)
{
}
CopyWithSSENoCache(MyData->dst, MyData->src, MyData->size);
__sync_add_and_fetch(&ThreadsReady, 1);
MyData->RunThread = false;
}
return 0;
}
static void SetupThreads()
{
for (size_t i=0; i<kThreadCount; i++)
{
pthread_t thread;
pthread_create(&thread, 0, ThreadProc, (void*)i);
}
}
// dst and src must be 32-byte aligned
// size must be multiple of 32*2*kThreadCount = 64*kThreadCount bytes
static void CopyWithThreads(uint8_t* dst, uint8_t* src, size_t size)
{
size_t size1 = size / kThreadCount;
ThreadsReady = 0;
for (size_t i=0; i<kThreadCount; i++)
{
ThreadData[i].dst = dst;
ThreadData[i].src = src;
ThreadData[i].size = size1;
ThreadData[i].RunThread = true;
dst += size1;
src += size1;
}
while (ThreadsReady != kThreadCount)
{
}
}
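Just to make the sketch complete, usage is: start the workers once, then call CopyWithThreads wherever a copy is needed. Something like this (the buffer setup with _mm_malloc here is my own illustration, not from MemSpeed.cpp):

static void ExampleUsage(void)
{
    size_t size = 64 * 1024 * 1024; // must be a multiple of 32*kThreadCount bytes
    uint8_t* src = (uint8_t*)_mm_malloc(size, 32); // 32-byte alignment satisfies the copy routines
    uint8_t* dst = (uint8_t*)_mm_malloc(size, 32);
    memset(src, 1, size); // touch the pages

    SetupThreads();                  // start the worker threads once, at startup
    CopyWithThreads(dst, src, size); // afterwards this can be called repeatedly

    _mm_free(src);
    _mm_free(dst);
}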
This gives 7.7 GiB/s, so a bit better than the other solutions. Increasing the thread count to 4 (my CPU is quad-core) doesn't help; the speed stays the same.
So in summary, here are the speeds for my laptop with the i7-4750HQ, compiled with clang, running under Linux:
memcpy = 6.51 GiB/s
CopyWithSSE = 3.97 GiB/s
CopyWithSSESmall = 3.97 GiB/s
CopyWithSSENoCache = 6.67 GiB/s
CopyWithAVX = 4.00 GiB/s
CopyWithAVXSmall = 3.99 GiB/s
CopyWithAVXNoCache = 6.64 GiB/s
CopyWithRepMovsb = 5.69 GiB/s
CopyWithRepMovsd = 5.22 GiB/s
CopyWithRepMovsq = 5.19 GiB/s
CopyWithRepMovsbUnaligned = 5.11 GiB/s
CopyWithThreads = 7.70 GiB/s
Numbers on my desktop with an i5-750 (no AVX instruction set), compiled with Visual Studio 2013 (x64):
memcpy = 3.46 GiB/s
CopyWithSSE = 3.48 GiB/s
CopyWithSSESmall = 3.43 GiB/s
CopyWithSSENoCache = 4.79 GiB/s
CopyWithRepMovsb = 4.08 GiB/s
CopyWithRepMovsd = 4.11 GiB/s
CopyWithRepMovsq = 4.01 GiB/s
CopyWithRepMovsbUnaligned = 3.93 GiB/s
CopyWithThreads = 4.44 GiB/s
Only memcpy is less efficient there (apparently it doesn't use the SSE store instructions that don't pollute the cache).
My conclusion on all this: if you want to implement a fast memcpy, don't bother with SSE on modern CPUs. Just use the "classic" rep movsb instruction. MSVC has an intrinsic for it (__movsb); on GCC/clang it is pretty trivial to implement yourself (see the code above). And don't expect the numbers to get close to the max bandwidth figures you see in spec sheets.
Also, about CopyMemory - it is actually a #define that ends up calling memcpy. In VS2013, CopyMemory (and its friends) are defined in minwinbase.h like this:
#define MoveMemory RtlMoveMemory
#define CopyMemory RtlCopyMemory
#define FillMemory RtlFillMemory
#define ZeroMemory RtlZeroMemory
The RtlXYZ functions are defined in winnt.h:
#define RtlEqualMemory(Destination,Source,Length) (!memcmp((Destination),(Source),(Length)))
#define RtlMoveMemory(Destination,Source,Length) memmove((Destination),(Source),(Length))
#define RtlCopyMemory(Destination,Source,Length) memcpy((Destination),(Source),(Length))
#define RtlFillMemory(Destination,Length,Fill) memset((Destination),(Fill),(Length))
#define RtlZeroMemory(Destination,Length) memset((Destination),0,(Length))
So there is no "Windows" function for copying memory. And because we are implementing everything ourselves, and Casey said he won't use the standard library for anything, it is not "fair" to use CopyMemory :) We need to implement it ourselves. My suggestion:
void CopyMemory(void* dst, const void* src, size_t size)
{
#ifdef _MSC_VER
__movsb(dst, src, size);
#elif defined(__i386__) || defined(__x86_64__)
__asm__ __volatile__("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
#else
#error TODO: implement for other architectures
#endif
}
All my source code is available here: MemSpeed.cpp. It can be compiled with MSVC2013, clang and gcc.