Threads not running in parallel

I have a problem where a certain number of threads in my thread queue don't wake up when they are supposed to. It seems like windows schedules some threads to be on the same processor and those don't get woken up until the previous thread finishes its work and goes back to sleep. The effect of that is that the actual work process time doubles and sometimes tripples.

It looks like this in my app:


I'm using 8 threads and I have an i7-3770K with 4 cores and 8 hyper-threads.
The job pushing code is just incrementing a semaphore and the threads wait on that with waitforSingleObject().

Below is a small program that demonstrates the issue.
If I manually set the affinity to be even across all threads the problem goes away mostly, at least in this test program. In the main app that doesn't work, because there is often one thread that drastically lags behind, slowing everything down, even more than using the operating systems scheduling.

Is this normal behaviour or am I just missing something totally obvious?

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#define USE_AFFINITY false

#include <windows.h>
#include <cstdio>

struct Timer {
	double frequency;
	LARGE_INTEGER timeStamp;
	double dt;

	void init() {
		LARGE_INTEGER frequency;
		QueryPerformanceFrequency(&frequency);
		this->frequency = (double)frequency.QuadPart;
	}

	void start() { QueryPerformanceCounter(&timeStamp); }

	double stop() {
		LARGE_INTEGER newTimeStamp;
		QueryPerformanceCounter(&newTimeStamp);

		dt = newTimeStamp.QuadPart - timeStamp.QuadPart;
		dt /= frequency;

		return dt;
	}
};

struct ThreadSettings {
	bool active;
	HANDLE semaphore;

	void* data;
	void (*function)(void* data);
};

DWORD WINAPI threadProcessX(LPVOID data) {
	ThreadSettings* settings = (ThreadSettings*)data;
	while(true) {
		WaitForSingleObjectEx(settings->semaphore, INFINITE, FALSE);
		settings->function(settings->data);
		settings->active = false;
	}

	return 0;
}

int main(int argc, char** argv) {
	if(USE_AFFINITY) {
		__int64 threadMask = SetThreadAffinityMask(GetCurrentThread(), 1);
	}

	ThreadSettings settings[8] = {};
	int i = 0;
	for(auto& it : settings) {
		it.active = false;
		it.semaphore = CreateSemaphoreA(0, 0, 1, 0);

		HANDLE thread = CreateThread(0, 0, threadProcessX, &it, 0, 0);
		if(USE_AFFINITY) {
			SetThreadAffinityMask(thread, 2 << i);
		}

		CloseHandle(thread);
		i++;
	}

	Timer timer;
	timer.init();

	struct ThreadData {
		Timer timer;
		int count;
		float temp;
	};

	auto threadFunc = [](void* data) {
		ThreadData* d = (ThreadData*)data;
		d->timer.start();

		d->temp = 0;
		int count = 5000000 / d->count;
		for(int i = 0 ; i < count; i++) {
			d->temp += 123 * 456; // Do some work.
		}

		d->timer.stop();
	};
	
	for(int i = 0; i < 8; i++) {
		int threadCount = i+1;

		timer.start();
		ThreadData threadData[8];
		for(int i = 0; i < threadCount; i++) {
			Timer tim = timer;
			threadData[i] = {tim, threadCount};
		}

		for(int i = 0; i < threadCount; i++) {
			if(i < threadCount-1) {
				settings[i].active = true;
				settings[i].function = threadFunc;
				settings[i].data = threadData + i;
				ReleaseSemaphore(settings[i].semaphore, 1, 0);
			} else {
				threadFunc(threadData + i);
			}
		}

		while(true) {
			bool done = true;
			for(int i = 0; i < threadCount; i++) {
				ThreadSettings* s = settings + i;
				if(s->active) {
					done = false;
					break;
				}
			}
			if(done) break;
		}

		timer.stop();

		printf("\n");
		for(int i = 0; i < threadCount; i++) {
			ThreadData* it = threadData + i;
			float startTime = (it->timer.timeStamp.QuadPart - timer.timeStamp.QuadPart) / timer.frequency;
			printf("Time : %f, Start time: %f\n", it->timer.dt, startTime);
		}
		printf("Total: %f, \n", timer.dt);
	}

	return 0;
}


Automatic scheduling:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Time : 0.003572, Start time: 0.000000
Total: 0.003572, 

Time : 0.001689, Start time: 0.000002
Time : 0.001667, Start time: 0.000001
Total: 0.001691, 

Time : 0.001112, Start time: 0.000001
Time : 0.001229, Start time: 0.000002
Time : 0.001111, Start time: 0.000001
Total: 0.001232, 

Time : 0.000837, Start time: 0.000162
Time : 0.000834, Start time: 0.000016
Time : 0.000872, Start time: 0.000013
Time : 0.000834, Start time: 0.000001
Total: 0.000999, 

Time : 0.000668, Start time: 0.000001
Time : 0.000668, Start time: 0.000002
Time : 0.000688, Start time: 0.000002
Time : 0.000667, Start time: 0.000671
Time : 0.000667, Start time: 0.000002
Total: 0.001339, 

Time : 0.000556, Start time: 0.000001
Time : 0.000556, Start time: 0.000002
Time : 0.000556, Start time: 0.000002
Time : 0.000556, Start time: 0.000558
Time : 0.000556, Start time: 0.000559
Time : 0.000555, Start time: 0.000002
Total: 0.001114, 

Time : 0.000476, Start time: 0.000001
Time : 0.000476, Start time: 0.000001
Time : 0.000476, Start time: 0.000002
Time : 0.000476, Start time: 0.000479
Time : 0.000477, Start time: 0.000479
Time : 0.000477, Start time: 0.000478
Time : 0.000476, Start time: 0.000002
Total: 0.000956, 

Time : 0.000417, Start time: 0.000001
Time : 0.000425, Start time: 0.000002
Time : 0.000419, Start time: 0.000419
Time : 0.000436, Start time: 0.000030
Time : 0.000443, Start time: 0.000466
Time : 0.000437, Start time: 0.000864
Time : 0.000420, Start time: 0.000427
Time : 0.000553, Start time: 0.000003
Total: 0.001301, 


Fixed scheduling:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Time : 0.003348, Start time: 0.000000
Total: 0.003348, 

Time : 0.001667, Start time: 0.000002
Time : 0.001667, Start time: 0.000001
Total: 0.001669, 

Time : 0.001112, Start time: 0.000001
Time : 0.001117, Start time: 0.000003
Time : 0.001112, Start time: 0.000001
Total: 0.001120, 

Time : 0.000834, Start time: 0.000001
Time : 0.000833, Start time: 0.000002
Time : 0.000834, Start time: 0.000002
Time : 0.000833, Start time: 0.000001
Total: 0.000836, 

Time : 0.000667, Start time: 0.000001
Time : 0.000667, Start time: 0.000002
Time : 0.000666, Start time: 0.000002
Time : 0.000667, Start time: 0.000003
Time : 0.000731, Start time: 0.000002
Total: 0.000733, 

Time : 0.000556, Start time: 0.000001
Time : 0.000556, Start time: 0.000002
Time : 0.000556, Start time: 0.000002
Time : 0.000556, Start time: 0.000003
Time : 0.000556, Start time: 0.000004
Time : 0.000556, Start time: 0.000002
Total: 0.000559, 

Time : 0.000476, Start time: 0.000001
Time : 0.000476, Start time: 0.000002
Time : 0.000476, Start time: 0.000002
Time : 0.000476, Start time: 0.000003
Time : 0.000476, Start time: 0.000003
Time : 0.000476, Start time: 0.000004
Time : 0.000476, Start time: 0.000003
Total: 0.000480, 

Time : 0.000417, Start time: 0.000001
Time : 0.000417, Start time: 0.000002
Time : 0.000417, Start time: 0.000002
Time : 0.000417, Start time: 0.000003
Time : 0.000417, Start time: 0.000003
Time : 0.000417, Start time: 0.000004
Time : 0.000417, Start time: 0.000004
Time : 0.000417, Start time: 0.000003
Total: 0.000421, 



Edited by Guitarfreak on
I don't see this on my machine (also 4 core, 8 thread).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Time : 0.001343, Start time: 0.000000
Total: 0.001343, 

Time : 0.000627, Start time: 0.000037
Time : 0.000641, Start time: 0.000001
Total: 0.000664, 

Time : 0.000419, Start time: 0.000005
Time : 0.000419, Start time: 0.000005
Time : 0.000419, Start time: 0.000002
Total: 0.000424, 

Time : 0.000313, Start time: 0.000008
Time : 0.000313, Start time: 0.000005
Time : 0.000313, Start time: 0.000006
Time : 0.000313, Start time: 0.000002
Total: 0.000322, 

Time : 0.000251, Start time: 0.000008
Time : 0.000251, Start time: 0.000005
Time : 0.000251, Start time: 0.000006
Time : 0.000251, Start time: 0.000008
Time : 0.000252, Start time: 0.000004
Total: 0.000259, 

Time : 0.000210, Start time: 0.000047
Time : 0.000226, Start time: 0.000018
Time : 0.000234, Start time: 0.000019
Time : 0.000232, Start time: 0.000020
Time : 0.000214, Start time: 0.000047
Time : 0.000231, Start time: 0.000018
Total: 0.000261, 

Time : 0.000179, Start time: 0.000006
Time : 0.000180, Start time: 0.000009
Time : 0.000179, Start time: 0.000008
Time : 0.000180, Start time: 0.000012
Time : 0.000190, Start time: 0.000008
Time : 0.000180, Start time: 0.000014
Time : 0.000179, Start time: 0.000008
Total: 0.000199, 

Time : 0.000157, Start time: 0.000065
Time : 0.000176, Start time: 0.000019
Time : 0.000177, Start time: 0.000023
Time : 0.000173, Start time: 0.000041
Time : 0.000157, Start time: 0.000065
Time : 0.000177, Start time: 0.000025
Time : 0.000158, Start time: 0.000067
Time : 0.000175, Start time: 0.000021
Total: 0.000225, 


Maybe you are running some software in background that does some weird stuff? Like antivirus or similar security software. They tend to affect performance in unexpected ways.

Edited by Mārtiņš Možeiko on
Thanks for looking at it. No antivirus or anything like that is running. I tried to look through the active processes and closed anything that wasn't needed, but that didn't change anything. I glanced at running services but not sure what I should be looking for that could cause the issue. I also tried disabling hyper threading.
Could it be a driver issue? Maybe there is a better tool to debug threads that would tell me exactly what they are doing.
Result my machine, i7 860 (4 cores 8 threads), Windows 7 64bit.

Set affinity is false:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Time : 0.012561, Start time: 0.000000
Total: 0.012561,

Time : 0.005042, Start time: 0.000008
Time : 0.005104, Start time: 0.000003
Total: 0.005106,

Time : 0.002347, Start time: 0.000006
Time : 0.002347, Start time: 0.000015
Time : 0.002435, Start time: 0.000002
Total: 0.002437,

Time : 0.001783, Start time: 0.000058
Time : 0.001768, Start time: 0.000055
Time : 0.003089, Start time: 0.000105
Time : 0.001783, Start time: 0.000054
Total: 0.003194,

Time : 0.001411, Start time: 0.000055
Time : 0.001411, Start time: 0.000052
Time : 0.001424, Start time: 0.000053
Time : 0.001408, Start time: 0.001469
Time : 0.001409, Start time: 0.000052
Total: 0.002877,

Time : 0.001174, Start time: 0.000052
Time : 0.001174, Start time: 0.000049
Time : 0.001174, Start time: 0.000050
Time : 0.001221, Start time: 0.001225
Time : 0.001219, Start time: 0.001225
Time : 0.001173, Start time: 0.000049
Total: 0.002446,

Time : 0.003114, Start time: 0.000007
Time : 0.001012, Start time: 0.000011
Time : 0.001041, Start time: 0.000010
Time : 0.001035, Start time: 0.001052
Time : 0.001023, Start time: 0.002524
Time : 0.001025, Start time: 0.001025
Time : 0.001046, Start time: 0.000012
Total: 0.003547,

Time : 0.000905, Start time: 0.000004
Time : 0.000881, Start time: 0.000006
Time : 0.000893, Start time: 0.001203
Time : 0.000929, Start time: 0.001826
Time : 0.000880, Start time: 0.000911
Time : 0.000882, Start time: 0.000889
Time : 0.000881, Start time: 0.000011
Time : 0.000881, Start time: 0.000008
Total: 0.002755,


Set affinity is true:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Time : 0.010693, Start time: 0.000000
Total: 0.010693,                     
                                     
Time : 0.005328, Start time: 0.000008
Time : 0.005390, Start time: 0.000003
Total: 0.005393,                     
                                     
Time : 0.003520, Start time: 0.000006
Time : 0.003458, Start time: 0.000018
Time : 0.003567, Start time: 0.000004
Total: 0.003571,                     
                                     
Time : 0.001734, Start time: 0.000007
Time : 0.001753, Start time: 0.000009
Time : 0.001721, Start time: 0.000012
Time : 0.001747, Start time: 0.000007
Total: 0.001763,                     
                                     
Time : 0.001040, Start time: 0.000003
Time : 0.001058, Start time: 0.000008
Time : 0.001040, Start time: 0.000005
Time : 0.001042, Start time: 0.000014
Time : 0.001040, Start time: 0.000004
Total: 0.001066,                     
                                     
Time : 0.000874, Start time: 0.000003
Time : 0.000874, Start time: 0.000005
Time : 0.000873, Start time: 0.000005
Time : 0.000875, Start time: 0.000006
Time : 0.000874, Start time: 0.000008
Time : 0.000887, Start time: 0.000005
Total: 0.000893,                     
                                     
Time : 0.000747, Start time: 0.000003
Time : 0.000747, Start time: 0.000004
Time : 0.000748, Start time: 0.000004
Time : 0.000748, Start time: 0.000005
Time : 0.000748, Start time: 0.000006
Time : 0.000732, Start time: 0.003233
Time : 0.000747, Start time: 0.000005
Total: 0.003965,                     
                                     
Time : 0.000641, Start time: 0.000005
Time : 0.000641, Start time: 0.000038
Time : 0.000641, Start time: 0.000038
Time : 0.000641, Start time: 0.000038
Time : 0.000641, Start time: 0.000040
Time : 0.000640, Start time: 0.000039
Time : 0.000641, Start time: 0.000041
Time : 0.000641, Start time: 0.000039
Total: 0.000683,