Memory bandwidth + implementing memcpy

After watching Day 25 I want to comment on the memory bandwidth discussion. I seriously doubt any code will reach the 32 GB/s figure Casey was looking up online. That number is the maximum memory bandwidth the CPU supports; real bandwidth will be lower. To test this, I wrote a small test application that does memcpy from one memory buffer to another. And I am not seeing the max number from the spec sheet (on my laptop it is 25.6 GB/s for the i7-4750HQ CPU). To see if maybe we can write a better memcpy, I also implemented memcpy with SSE and AVX instructions - still not getting that number.

So the numbers for my laptop with an i7-4750HQ CPU (16GB of memory, two 8GB modules), using "clang -O2", are the following.

Simple memcpy gives me 6.51 GiB/s.
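
The measurement itself is nothing fancy - roughly this sketch (not the exact code from MemSpeed.cpp; the 256 MiB buffer size, the run count, and the posix_memalign/clock_gettime choices here are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main(void)
{
    const size_t kSize = 256 * 1024 * 1024; // much larger than any CPU cache
    const int kRuns = 10;

    uint8_t* src;
    uint8_t* dst;
    posix_memalign((void**)&src, 32, kSize);
    posix_memalign((void**)&dst, 32, kSize);
    memset(src, 1, kSize); // touch the pages so they are actually mapped
    memset(dst, 0, kSize);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < kRuns; i++)
    {
        memcpy(dst, src, kSize);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double seconds = (t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    printf("memcpy = %.2f GiB/s\n",
           (double)kSize * kRuns / seconds / (1024.0 * 1024.0 * 1024.0));
    return 0;
}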

Using SSE2 with the following code gives me 3.97 GiB/s. That means memcpy is better optimized than naive SSE2.
// dst and src must be 16-byte aligned
// size must be multiple of 16*8 = 128 bytes
static void CopyWithSSE(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 8 * sizeof(__m128);
    while (size)
    {
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        __m128 c = _mm_load_ps((float*)(src + 2*sizeof(__m128)));
        __m128 d = _mm_load_ps((float*)(src + 3*sizeof(__m128)));
        __m128 e = _mm_load_ps((float*)(src + 4*sizeof(__m128)));
        __m128 f = _mm_load_ps((float*)(src + 5*sizeof(__m128)));
        __m128 g = _mm_load_ps((float*)(src + 6*sizeof(__m128)));
        __m128 h = _mm_load_ps((float*)(src + 7*sizeof(__m128)));
        _mm_store_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_store_ps((float*)(dst + 1*sizeof(__m128)), b);
        _mm_store_ps((float*)(dst + 2*sizeof(__m128)), c);
        _mm_store_ps((float*)(dst + 3*sizeof(__m128)), d);
        _mm_store_ps((float*)(dst + 4*sizeof(__m128)), e);
        _mm_store_ps((float*)(dst + 5*sizeof(__m128)), f);
        _mm_store_ps((float*)(dst + 6*sizeof(__m128)), g);
        _mm_store_ps((float*)(dst + 7*sizeof(__m128)), h);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


I tried using fewer registers to see if smaller code helps. That still gives 3.97 GiB/s.
// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSESmall(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        _mm_store_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_store_ps((float*)(dst + 1*sizeof(__m128)), b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


Then I tried the store instruction that doesn't pollute the cache (_mm_stream_ps). That gives 6.67 GiB/s. Streaming stores also avoid reading the destination cache lines into the cache before overwriting them, so less total traffic goes across the memory bus. That's very close to memcpy, and I'm guessing the C runtime on Linux uses this instruction.
// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSENoCache(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        _mm_stream_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_stream_ps((float*)(dst + 1*sizeof(__m128)), b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


I tried to use prefetch instructions, but that did not give any noticeable speedup. I'm guessing modern CPUs can predict linear memory access patterns pretty efficiently and prefetch automatically.
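
For reference, the prefetch variant looked roughly like this (a sketch, not necessarily the exact code I tested - the _MM_HINT_NTA hint and the 512-byte look-ahead distance are arbitrary choices):

// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSEPrefetch(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        // hint the cache line a few iterations ahead of the loads
        _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        _mm_stream_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_stream_ps((float*)(dst + 1*sizeof(__m128)), b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}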

Then I tried using AVX instructions. This gives 4.00 GiB/s - not better than SSE.
// dst and src must be 32-byte aligned
// size must be multiple of 32*16 = 512 bytes
static void CopyWithAVX(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 16 * sizeof(__m256i);
    while (size)
    {
        __m256i a = _mm256_load_si256((__m256i*)src + 0);
        __m256i b = _mm256_load_si256((__m256i*)src + 1);
        __m256i c = _mm256_load_si256((__m256i*)src + 2);
        __m256i d = _mm256_load_si256((__m256i*)src + 3);
        __m256i e = _mm256_load_si256((__m256i*)src + 4);
        __m256i f = _mm256_load_si256((__m256i*)src + 5);
        __m256i g = _mm256_load_si256((__m256i*)src + 6);
        __m256i h = _mm256_load_si256((__m256i*)src + 7);
        __m256i i = _mm256_load_si256((__m256i*)src + 8);
        __m256i j = _mm256_load_si256((__m256i*)src + 9);
        __m256i k = _mm256_load_si256((__m256i*)src + 10);
        __m256i l = _mm256_load_si256((__m256i*)src + 11);
        __m256i m = _mm256_load_si256((__m256i*)src + 12);
        __m256i n = _mm256_load_si256((__m256i*)src + 13);
        __m256i o = _mm256_load_si256((__m256i*)src + 14);
        __m256i p = _mm256_load_si256((__m256i*)src + 15);
        _mm256_store_si256((__m256i*)dst + 0, a);
        _mm256_store_si256((__m256i*)dst + 1, b);
        _mm256_store_si256((__m256i*)dst + 2, c);
        _mm256_store_si256((__m256i*)dst + 3, d);
        _mm256_store_si256((__m256i*)dst + 4, e);
        _mm256_store_si256((__m256i*)dst + 5, f);
        _mm256_store_si256((__m256i*)dst + 6, g);
        _mm256_store_si256((__m256i*)dst + 7, h);
        _mm256_store_si256((__m256i*)dst + 8, i);
        _mm256_store_si256((__m256i*)dst + 9, j);
        _mm256_store_si256((__m256i*)dst + 10, k);
        _mm256_store_si256((__m256i*)dst + 11, l);
        _mm256_store_si256((__m256i*)dst + 12, m);
        _mm256_store_si256((__m256i*)dst + 13, n);
        _mm256_store_si256((__m256i*)dst + 14, o);
        _mm256_store_si256((__m256i*)dst + 15, p);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


Let's see if reducing the register count helps, or at least doesn't make everything worse. It doesn't help: I'm getting 3.99 GiB/s for this.
// dst and src must be 32-byte aligned
// size must be multiple of 32*2 = 64 bytes
static void CopyWithAVXSmall(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m256i);
    while (size)
    {
        __m256i a = _mm256_load_si256((__m256i*)src + 0);
        __m256i b = _mm256_load_si256((__m256i*)src + 1);
        _mm256_store_si256((__m256i*)dst + 0, a);
        _mm256_store_si256((__m256i*)dst + 1, b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


Using the store instruction that doesn't pollute the cache helps here too. Now I'm getting 6.64 GiB/s - the same speed as with SSE.
// dst and src must be 32-byte aligned
// size must be multiple of 32*2 = 64 bytes
static void CopyWithAVXNoCache(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m256i);
    while (size)
    {
        __m256i a = _mm256_load_si256((__m256i*)src + 0);
        __m256i b = _mm256_load_si256((__m256i*)src + 1);
        _mm256_stream_si256((__m256i*)dst + 0, a);
        _mm256_stream_si256((__m256i*)dst + 1, b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


Then I tried a couple of crazy things.
First I tried the "rep movsb", "rep movsl" and "rep movsq" instructions. These are typically not recommended on modern CPUs. But I was surprised that they give better speed than just using SSE instructions - around 5.5 GiB/s for all three variants. So for smaller moves, using "rep movsb" is OK in my opinion. I tried unaligned addresses (not a multiple of 16) - still OK, around 5.5 GiB/s. I'm guessing modern CPUs recognize the "rep movsX" instructions as a special case and do "the right thing" automatically.

// Note: rcx holds the element count, not the byte count (bytes, dwords and
// qwords respectively), so the caller has to pass size in bytes to __movsb,
// size/4 to __movsd, and size/8 to __movsq.
static void __movsb(void* dst, const void* src, size_t size)
{
    __asm__ __volatile__("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}

static void __movsd(void* dst, const void* src, size_t size)
{
    __asm__ __volatile__("rep movsl" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}

static void __movsq(void* dst, const void* src, size_t size)
{
    __asm__ __volatile__("rep movsq" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}


Then I went with completely crazy stuff - copying in parallel with two threads:
const size_t kThreadCount = 2;

struct ThreadWorkData
{
    uint8_t* src;
    uint8_t* dst;
    size_t size;

    volatile bool RunThread;
};

static ThreadWorkData ThreadData[kThreadCount];
static volatile long ThreadsReady;

static void* ThreadProc(void* Arg)
{
    size_t ThreadIndex = (size_t)Arg;
    ThreadWorkData* MyData = &ThreadData[ThreadIndex];
    for (;;)
    {
        while (!MyData->RunThread)
        {
        }
        CopyWithSSENoCache(MyData->dst, MyData->src, MyData->size);
        __sync_add_and_fetch(&ThreadsReady, 1);
        MyData->RunThread = false;
    } 
    return 0;
}

static void SetupThreads()
{
    for (size_t i=0; i<kThreadCount; i++)
    {
        pthread_t thread;
        pthread_create(&thread, 0, ThreadProc, (void*)i);
    }
}

// dst and src must be 32-byte aligned
// size must be multiple of 32*2*kThreadCount = 64*kThreadCount bytes
static void CopyWithThreads(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t size1 = size / kThreadCount;

    ThreadsReady = 0;
    for (size_t i=0; i<kThreadCount; i++)
    {
        ThreadData[i].dst = dst;
        ThreadData[i].src = src;
        ThreadData[i].size = size1;
        ThreadData[i].RunThread = true;

        dst += size1;
        src += size1;
    }

    while (ThreadsReady != kThreadCount)
    {
    }
}

This gives 7.7 GiB/s - a bit better than the other solutions. Increasing the thread count to 4 (my CPU is quad-core) doesn't help; the speed stays the same, presumably because the cores already saturate the shared memory bandwidth.

So in summary, here are the speeds for my laptop with the i7-4750HQ, compiled with clang, running under Linux:
memcpy = 6.51 GiB/s
CopyWithSSE = 3.97 GiB/s
CopyWithSSESmall = 3.97 GiB/s
CopyWithSSENoCache = 6.67 GiB/s
CopyWithAVX = 4.00 GiB/s
CopyWithAVXSmall = 3.99 GiB/s
CopyWithAVXNoCache = 6.64 GiB/s
CopyWithRepMovsb = 5.69 GiB/s
CopyWithRepMovsd = 5.22 GiB/s
CopyWithRepMovsq = 5.19 GiB/s
CopyWithRepMovsbUnaligned = 5.11 GiB/s
CopyWithThreads = 7.70 GiB/s


Numbers on my desktop with i5-750 (no AVX instruction set), compiled with Visual Studio 2013 (x64):
memcpy = 3.46 GiB/s
CopyWithSSE = 3.48 GiB/s
CopyWithSSESmall = 3.43 GiB/s
CopyWithSSENoCache = 4.79 GiB/s
CopyWithRepMovsb = 4.08 GiB/s
CopyWithRepMovsd = 4.11 GiB/s
CopyWithRepMovsq = 4.01 GiB/s
CopyWithRepMovsbUnaligned = 3.93 GiB/s
CopyWithThreads = 4.44 GiB/s

Only memcpy is less efficient (apparently it doesn't use the SSE store instructions that don't pollute the cache).

My conclusion on all this - if you want to implement a fast memcpy, don't bother with SSE on modern CPUs. Just use the "classic" rep movsb instruction. MSVC has an intrinsic for that (__movsb); on GCC/clang it is pretty trivial to implement (see the code above). And don't expect the numbers to get close to the max bandwidth numbers you see in spec sheets.

Also, about CopyMemory - it is actually a #define that expands to memcpy. For VS2013, CopyMemory (and its friends) are defined in minwinbase.h like this:
#define MoveMemory RtlMoveMemory
#define CopyMemory RtlCopyMemory
#define FillMemory RtlFillMemory
#define ZeroMemory RtlZeroMemory


The RtlXyz macros are defined in the winnt.h file:
#define RtlEqualMemory(Destination,Source,Length) (!memcmp((Destination),(Source),(Length)))
#define RtlMoveMemory(Destination,Source,Length) memmove((Destination),(Source),(Length))
#define RtlCopyMemory(Destination,Source,Length) memcpy((Destination),(Source),(Length))
#define RtlFillMemory(Destination,Length,Fill) memset((Destination),(Fill),(Length))
#define RtlZeroMemory(Destination,Length) memset((Destination),0,(Length))


So there is no "Windows" function for copying memory. And because we are implementing everything ourselves, and Casey said he won't use the standard library for anything, it is not "fair" to use CopyMemory :) We need to implement it ourselves. My suggestion:
void CopyMemory(void* dst, const void* src, size_t size)
{
#ifdef _MSC_VER
    __movsb(dst, src, size);
#elif defined(__i386__) || defined(__x86_64__)
    __asm__ __volatile__("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
#else
    #error TODO: implement for other architectures
#endif
}


All my source code is available here: MemSpeed.cpp. It can be compiled with MSVC2013, clang and gcc.

Edited by Mārtiņš Možeiko.
Nice and interesting post.

Can you test the Linux build against the Win build on the same hardware, please?

I am interested because I have been on Linux since about when the Windows 10 preview came along. I frankly find the new Windows Metro style obscene, and extremely ugly. It's like: wtf? Every time I see it, I perceive a worse experience than the last time. It's like having a cutting tool to your eyeballs! It has "bad taste" written all over it. Who the fuck would consider writing a new API for a freaking TILING engine? Haha. That's what it is. TILING 2.0. It's ugly as hell, and it's annoying ;)

But on the other hand, I find Linux to feel very slow and unresponsive. Also frequent strange crashes and hangups - not of the OS, but of the apps I am using: Firefox, System Monitor, file copying and so on. It's also idling at like 5-10% CPU on every core, which I find terrible. Right now it uses 10% on core 3 for doing absolutely nothing. And I would never consider using this platform for serious coding, especially of games or other performant code. And I am frankly in a bit of shock when I hear of other people doing it. What have I missed?

Windows is blazing fast if done right, and the MSDN papers have proven me wrong so many times that I now trust them completely. Every time I suspected there was something wrong with the API, it always turned out that there was not, and that I was just misunderstanding something. I don't know how many times I realized that the real asshole was me. I would gauge MSDN has a failure rate of about 0.1% or something. You can hardly get better information than that.

While some of the higher-level features don't seem all that right to me, the core OS is just about as fast and well done as it could be, on average. And after 30+ years, what would you expect? But Linux seems as sluggish and slow as every time I tried it in the last 15 years. And now that Ubuntu is bundled with spyware and a disabled firewall, with Google's and Amazon's hands all over it, the net result is the feeling that it's the worst of two worlds. It's slow, and it will become a launchpad for marketing powers to flourish. I don't like that. And it doesn't help to move distros, as they will be where the users are. I do not like that at all.

However, upon seeing that my Win32 applications run very well on the Wine platform - way better than in Linux itself, and close to Win32 - I guess I am, to some extent, still considering it as an option for the future.

Sorry for ranting in your thread.

I'd be very interested to read the things you have to say about memory speeds.
Kladdehelvete wrote:
> Can you test the Linux build against the Win build on the same hardware, please?

Sorry, I don't have Windows on the same hardware as Linux. And I think it's pointless to do that anyway: these functions are pure memory operations, so there won't be a difference in speed, because nothing depends on the OS (as long as the memory is really in physical memory, not in the pagefile).

The only thing such a test would show is the difference between compilers (MSVC vs clang/gcc). I can test that if you want. For the clang-built executable running on my i7-4750HQ laptop, see my numbers above. Running the Visual C++ compiled executable under Wine gives the following numbers:
[mmozeiko@dev ~]$ WINEARCH=win64 WINEPREFIX=~/.wine64 wine ./MemSpeed.exe
memcpy = 4.48 GiB/s
CopyWithSSE = 4.48 GiB/s
CopyWithSSESmall = 4.48 GiB/s
CopyWithSSENoCache = 7.67 GiB/s
CopyWithAVX = 4.51 GiB/s
CopyWithAVXSmall = 4.49 GiB/s
CopyWithAVXNoCache = 7.64 GiB/s
CopyWithRepMovsb = 5.85 GiB/s
CopyWithRepMovsd = 5.60 GiB/s
CopyWithRepMovsq = 5.57 GiB/s
CopyWithRepMovsbUnaligned = 5.50 GiB/s
CopyWithThreads = 7.55 GiB/s


As you can see, the numbers are pretty close to the Linux native ones. Except memcpy, which makes sense - I made MSVC link statically with the C runtime, so Wine executes the memcpy from Microsoft Visual C/C++ instead of calling the glibc memcpy.

Kladdehelvete wrote:
> But on the other hand I find Linux to feel very slow and unresponsive.
In my experience it is the other way around. Linux is much more responsive for me - in I/O, process creation, threading, etc. In my work I often need to compile all of llvm/clang - it is a very large C++ project. On the same hardware, doing that under Windows using Visual C++ takes 2x or 3x longer than using clang/gcc on Linux. Using clang/gcc on Windows is still slower than on Linux. My guess would be that this is because Windows simply doesn't optimize such low-level stuff anymore; nowadays they change only high-level stuff (UI & Metro). But Linux does (example). But let's leave this for a different thread.

Edited by Mārtiņš Možeiko.
FYI, I did a naïve port of your code to Mac OS X and did a run on my 2012 Retina MacBook Pro. I get some interesting results:

memcpy = 7.44 GiB/s
CopyWithSSE = 3.92 GiB/s
CopyWithSSESmall = 3.75 GiB/s
CopyWithSSENoCache = 0.34 GiB/s
CopyWithAVX = 4.09 GiB/s
CopyWithAVXSmall = 4.08 GiB/s
CopyWithAVXNoCache = 5.87 GiB/s
CopyWithRepMovsb = 7.59 GiB/s
CopyWithRepMovsd = 7.17 GiB/s
CopyWithRepMovsq = 7.25 GiB/s
CopyWithRepMovsbUnaligned = 6.90 GiB/s
CopyWithThreads = 0.67 GiB/s


And btw, the "port" was only:
1. link with -mavx2
2. add a clock_gettime impl found on Stack Overflow:
#ifdef __MACH__
#include <sys/time.h>
#define CLOCK_REALTIME 0 
#define CLOCK_MONOTONIC 0 
//clock_gettime is not implemented on OSX
int clock_gettime(int /*clk_id*/, struct timespec* t) {
    struct timeval now;
    int rv = gettimeofday(&now, NULL);
    if (rv) return rv;
    t->tv_sec  = now.tv_sec;
    t->tv_nsec = now.tv_usec * 1000;
    return 0;
}
#endif
Can you check the assembly code for the CopyWithSSENoCache function? Something isn't right there.
And you need to use only the "-mavx" compiler flag. I'm not using AVX2 instructions, just AVX.

For timing on OSX I would use the functions from the <mach/mach_time.h> header. mach_absolute_time() returns ticks as a uint64_t, and with mach_timebase_info(...) you can get how many ticks are in a second. These functions are very similar to QueryPerformanceCounter and QueryPerformanceFrequency on Windows.
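
A minimal sketch of what I mean (mach_timebase_info gives the numerator/denominator for converting ticks to nanoseconds):

#include <mach/mach_time.h>
#include <stdint.h>

static double SecondsBetween(uint64_t begin, uint64_t end)
{
    mach_timebase_info_data_t info;
    mach_timebase_info(&info); // ticks -> nanoseconds conversion factors
    return 1e-9 * (double)(end - begin) * info.numer / info.denom;
}

// usage:
//   uint64_t t0 = mach_absolute_time();
//   /* code to measure */
//   uint64_t t1 = mach_absolute_time();
//   double elapsed = SecondsBetween(t0, t1);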

Edited by Mārtiņš Možeiko.
Hrm, I'm sorry - I just forgot the optimization flag.
Now the results seem more plausible:
memcpy = 7.54 GiB/s
CopyWithSSE = 5.57 GiB/s
CopyWithSSESmall = 5.55 GiB/s
CopyWithSSENoCache = 7.88 GiB/s
CopyWithAVX = 5.51 GiB/s
CopyWithAVXSmall = 5.53 GiB/s
CopyWithAVXNoCache = 7.75 GiB/s
CopyWithRepMovsb = 7.30 GiB/s
CopyWithRepMovsd = 7.10 GiB/s
CopyWithRepMovsq = 7.29 GiB/s
CopyWithRepMovsbUnaligned = 6.77 GiB/s
CopyWithThreads = 8.62 GiB/s
If your stdlib is any good, then it probably selects an algorithm based on the size, the architecture, the degree of overlap (if it's memmove), and whether the source and destination are both aligned or not. That last point is important on SSE; it's the difference between movaps and movups.

As several people discovered, rep movsb isn't as bad as it's often alleged to be. So much so that even modern memcpy/memmove implementations just go ahead and use it if the block isn't very big, or to do the initial and final unaligned parts.
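
As an illustration of that pattern - my own sketch, not lifted from any actual stdlib - rep movsb covers the unaligned head and tail, and an aligned SSE loop does the bulk in between (this also shows the movups/movaps distinction mentioned above):

#include <emmintrin.h>
#include <stdint.h>

static void RepMovsb(uint8_t* dst, uint8_t* src, size_t size)
{
    __asm__ __volatile__("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
}

static void CopyHeadBodyTail(uint8_t* dst, uint8_t* src, size_t size)
{
    // head: copy single bytes until dst is 16-byte aligned
    size_t head = (16 - ((uintptr_t)dst & 15)) & 15;
    if (head > size) head = size;
    RepMovsb(dst, src, head);
    dst += head; src += head; size -= head;

    // body: aligned stores (movaps); loads may still be unaligned (movups)
    while (size >= 16)
    {
        _mm_store_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
        dst += 16; src += 16; size -= 16;
    }

    // tail: the remaining 0..15 bytes
    RepMovsb(dst, src, size);
}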

For larger blocks, modern stdlibs will often use a copy routine tuned for the specific CPU, detected once at startup time.
The latest Intel recommendations (they seem to agree with the numbers everyone has posted):

"Beginning with processors based on Intel microarchitecture code name Ivy Bridge, REP string operation using MOVSB and STOSB can provide both flexible and high-performance REP string operations for software in common situations like memory copy and set operations."

and

"For processors supporting enhanced REP MOVSB/STOSB, implementing memcpy with REP MOVSB will provide even more compact benefits in code size and better throughput than using the combination of REP MOVSD+B. For processors based on Intel microarchitecture code name Ivy Bridge, implementing memcpy using ERMSB might not reach the same level of throughput as using 256-bit or 128-bit AVX alternatives, depending on length and alignment factors."
Yeah, these things are very microarchitecture-dependent. I believe the optimal way of doing memcpy varies depending on whether you are on Sandy Bridge, Ivy Bridge or Haswell.
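
Since it is so microarchitecture-dependent, a runtime check helps. For example, the ERMSB capability from the Intel quote above can be detected with CPUID - a sketch using GCC/clang's <cpuid.h> (ERMSB is bit 9 of EBX from CPUID leaf 7, subleaf 0):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    int ermsb = (ebx >> 9) & 1; // CPUID.(EAX=7,ECX=0):EBX bit 9 = ERMSB
    printf("Enhanced REP MOVSB/STOSB: %s\n", ermsb ? "yes" : "no");
    return 0;
}
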
Hey, mmozeiko, I have some questions about the CopyWithSSENoCache function.

// dst and src must be 16-byte aligned
// size must be multiple of 16*2 = 32 bytes
static void CopyWithSSENoCache(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        __m128 b = _mm_load_ps((float*)(src + 1*sizeof(__m128)));
        _mm_stream_ps((float*)(dst + 0*sizeof(__m128)), a);
        _mm_stream_ps((float*)(dst + 1*sizeof(__m128)), b);

        size -= stride;
        src += stride;
        dst += stride;
    }
}


I don't know much about SSE so I'm asking in case there are details I'm not aware of.

Why do you use two loads and two streams? Couldn't we use only one, and not have the requirement of the size being a multiple of 32? Is it because the CPU can issue two of these on different ports at the same time? I tested it with only 1 load and 1 stream and it seems to work with not much performance difference (on my CPU, which is quite old now - a 2009 Lynnfield i7 860).

Is there a reason for not using load_si128 and stream_si128 instead (load_si128 has "better" throughput)? I tried it and it seems there is no difference. I also tried the stream_load_si128 SSE 4.1 instruction, but it didn't seem to matter at all.

static void CopyWithSSENoCache_single(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = sizeof(__m128);
    while (size)
    {
        __m128 a = _mm_load_ps((float*)(src + 0*sizeof(__m128)));
        _mm_stream_ps((float*)(dst + 0*sizeof(__m128)), a);
        
        size -= stride;
        src += stride;
        dst += stride;
    }
}

static void CopyWithSSENoCache_load_si128_stream_ps(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        __m128i a = _mm_load_si128((__m128i*)(src + 0*sizeof(__m128)));
        __m128i b = _mm_load_si128((__m128i*)(src + 1*sizeof(__m128)));
        _mm_stream_ps((float*)(dst + 0*sizeof(__m128)), _mm_castsi128_ps(a));
        _mm_stream_ps((float*)(dst + 1*sizeof(__m128)), _mm_castsi128_ps(b));
        
        size -= stride;
        src += stride;
        dst += stride;
    }
}

static void CopyWithSSENoCache_load_si128_stream_si128(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        __m128i a = _mm_load_si128((__m128i*)(src + 0*sizeof(__m128)));
        __m128i b = _mm_load_si128((__m128i*)(src + 1*sizeof(__m128)));
        _mm_stream_si128((__m128i*)(dst + 0*sizeof(__m128i)), a);
        _mm_stream_si128((__m128i*)(dst + 1*sizeof(__m128i)), b);
        
        size -= stride;
        src += stride;
        dst += stride;
    }
}

static void CopyWithSSENoCache_stream_load_si128_stream_si128(uint8_t* dst, uint8_t* src, size_t size)
{
    size_t stride = 2 * sizeof(__m128);
    while (size)
    {
        /* SSE 4.1 */
        __m128i a = _mm_stream_load_si128((__m128i*)(src + 0*sizeof(__m128)));
        __m128i b = _mm_stream_load_si128((__m128i*)(src + 1*sizeof(__m128)));
        _mm_stream_si128((__m128i*)(dst + 0*sizeof(__m128i)), a);
        _mm_stream_si128((__m128i*)(dst + 1*sizeof(__m128i)), b);
        
        size -= stride;
        src += stride;
        dst += stride;
    }
}


/* Different runs give different results, but all SSENoCache functions seem equivalent. */
memcpy = 3.81 GiB/s
CopyWithSSE = 3.86 GiB/s
CopyWithSSESmall = 3.81 GiB/s
CopyWithSSENoCache = 5.14 GiB/s
CopyWithSSENoCache_single = 5.18 GiB/s
CopyWithSSENoCache_load_si128_stream_ps = 5.13 GiB/s
CopyWithSSENoCache_load_si128_stream_si128 = 5.17 GiB/s
CopyWithSSENoCache_stream_load_si128_stream_si128 = 5.13 GiB/s
CopyWithRepMovsb = 4.33 GiB/s
CopyWithRepMovsd = 4.36 GiB/s
CopyWithRepMovsq = 4.46 GiB/s
CopyWithRepMovsbUnaligned = 4.36 GiB/s
CopyWithThreads = 5.06 GiB/s
My intuition usually tells me to put two or more of the same kind of SSE instruction next to each other, because usually there is some other code around that pipelines well with these instructions. If you have only one instruction, then often there is some kind of bottleneck in the pipeline and it does not run as fast as expected. I am pretty sure one pair of load+store instructions in a loop was slower than the two pairs I wrote here, but I don't remember much of what I measured 5 years ago here :)

No idea about the differences between integer and float loads. I actually did not know that there are any differences in throughput for these. My assumption was that loading is loading memory - no matter how it will be used, integer or float. I never bothered looking up the exact numbers...

Edited by Mārtiņš Možeiko.
Thanks.