pointer aliasing problem

Hello,
I'm writing an FFT function, which takes 3 float pointers to arrays of floats.
After every iteration of the main loop, the compiler reads the pointers off the stack again; my guess is that this is because of pointer aliasing.
Does anybody know how I can solve it?
code below,
thanks
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// FFT_only_dest_SSE: butterfly passes over the float buffer `src_cmplex`, done in place with
// SSE intrinsics, four floats per inner iteration (the post describes this as part of an FFT).
//
// Parameters (as used by the visible code):
//   src_cmplex  - buffer that is both read and written; accessed with aligned loads/stores,
//                 so every accessed address must be 16-byte aligned.
//                 Presumably interleaved real/imaginary floats - TODO confirm.
//   dest_cmplex - not referenced in the visible snippet.
//   logpower2   - log2 of the transform size; NumberOfCmplxSamples = 1 << logpower2.
//   index       - not referenced in the visible snippet (and shadowed by the inner loop variable).
//   SinLUT      - table read at offset (power2 - 4 + LutIndex); multiplied with the pair-swapped difference.
//   CosLUT      - table read at the same offset; multiplied with the plain difference.
//
// NOTE(review): the snippet as posted omits the function's opening and closing braces.
static void FFT_only_dest_SSE(r32*__restrict src_cmplex,r32*__restrict dest_cmplex,u32 logpower2,s32*__restrict index,r32*__restrict SinLUT,r32*__restrict CosLUT)

// t1/t2 are not used anywhere in the visible snippet.
unsigned int t1,t2;
u32 NumberOfCmplxSamples=(1<<(logpower2));
	// power2 is the current block length; it is halved after every outer pass.
	u32 power2=NumberOfCmplxSamples;
	// halfpower2 is kept in step with power2 but is not read in the visible snippet.
	u32 halfpower2=power2>>1;
	// Local __restrict copies of the LUT pointers; per the author, an attempt to work around the suspected aliasing.
	r32*__restrict now_sin_lut=SinLUT;
	r32*__restrict now_cos_lut=CosLUT;
// Outer loop: logpower2-1 passes.
// NOTE(review): if u32 is unsigned and logpower2 can be 0, logpower2-1 wraps around - confirm callers never pass 0.
for(u32 j=0;j<logpower2-1;j++,power2>>=1,halfpower2>>=1)
	{
	
		// Middle loop: walks the buffer block by block. It has no increment of its own;
		// k is advanced only by the inner loop.
		for(u32 k=0;k<NumberOfCmplxSamples;)
		{
			u32 limit=k+power2;
			// The LUT offset restarts at 0 for every block.
			u32 LutIndex=0;
			// Inner loop: `index` is a float offset into src_cmplex, starting at k*2 and advancing
			// 4 floats (one __m128) per iteration. This local shadows the `index` parameter.
			for(u32 index=k*2;k<limit;k+=4,index+=4,LutIndex+=4)
			{

				// IACA_START
				// Load the two operand vectors, power2 floats apart.
				__m128 sample1_3realandcomplex=_mm_load_ps(src_cmplex+index);
				__m128 sample2_4realandcomplex=_mm_load_ps(src_cmplex+index+power2);
				__m128 sum=_mm_add_ps(sample1_3realandcomplex,sample2_4realandcomplex);
				
				__m128 diff=_mm_sub_ps(sample1_3realandcomplex,sample2_4realandcomplex);
				// 0b10110001 (0xB1) selects elements 1,0,3,2: swaps the two floats inside each adjacent pair.
				__m128 diff_shuffled=_mm_shuffle_ps(diff,diff,0b10110001);

				__m128 trig1,trig2;
		
				 // Both tables are read at the same offset, power2 - 4 + LutIndex.
				 trig2=_mm_load_ps((now_sin_lut+(power2)-4+LutIndex));;
				 trig1=_mm_load_ps((now_cos_lut+(power2)-4+LutIndex));
				// Per lane: store2[i] = diff[i]*trig1[i] + diff_shuffled[i]*trig2[i].
				// Presumably the sign pattern of a complex multiply lives in the LUT contents - TODO confirm.
				__m128 store2=_mm_add_ps(_mm_mul_ps(diff,trig1),_mm_mul_ps(diff_shuffled,trig2));
				// Results overwrite the inputs in src_cmplex (in place).
				_mm_store_ps(src_cmplex+index,sum);
				_mm_store_ps(src_cmplex+index+power2,store2);
		
			}
			// IACA_END
		
		}

	}


and assembly of the main loop:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
; 32-bit (x86) compiler output for the inner loop of the C++ snippet above, Intel syntax.
; Register roles below are inferred by matching the instructions against that source:
;   edi       - address of the first operand vector (receives `sum`)
;   ecx       - address of the second operand vector (receives `store2`)
;   esi       - address of the LUT entry multiplied with `diff` (the cos table in the source)
;   edx + esi - address of the LUT entry multiplied with `diff_shuffled` (the sin table in the source),
;               so edx presumably holds the byte distance between the two tables
;   eax       - remaining inner-loop iterations
movaps xmm0, xmmword ptr [ecx]          ; xmm0 = second operand vector
movaps xmm2, xmmword ptr [edi]          ; xmm2 = first operand vector
movaps xmm1, xmmword ptr [edx+esi*1]    ; xmm1 = LUT entry later multiplied with the shuffled difference
subps xmm2, xmm0                        ; xmm2 = diff = first - second
addps xmm0, xmmword ptr [edi]           ; xmm0 = sum = second + first (first is re-read from memory)
movaps xmmword ptr [edi], xmm0          ; store sum over the first operand
movaps xmm0, xmm2                       ; copy diff so it can be shuffled
shufps xmm0, xmm2, 0xb1                 ; xmm0 = diff with the two floats of each pair swapped
add edi, 0x10                           ; advance first-operand pointer by 16 bytes (4 floats)
mulps xmm1, xmm0                        ; xmm1 = diff_shuffled * LUT entry at [edx+esi]
movaps xmm0, xmmword ptr [esi]          ; xmm0 = LUT entry multiplied with the plain difference
add esi, 0x10                           ; advance LUT pointer by 16 bytes
mulps xmm0, xmm2                        ; xmm0 = diff * LUT entry at [esi]
addps xmm1, xmm0                        ; xmm1 = store2
movaps xmmword ptr [ecx], xmm1          ; store store2 over the second operand
add ecx, 0x10                           ; advance second-operand pointer by 16 bytes
sub eax, 0x1                            ; one fewer iteration left
jnz 0xffffffc1                          ; loop while eax != 0 (target presumably the top of this loop)
; After the loop, three registers are reloaded from ebp-relative stack slots.
; These are the reloads the post asks about (marked bold with BBCode by the author).
[b]mov edx, dword ptr [ebp-0x78]
mov esi, dword ptr [ebp-0x7c]
mov edi, dword ptr [ebp-0x8c][/b]

Edited by The_8th_mage on Reason: mmozeiko found a bit of redundant code i had to take away. thx
What does CosSSE function do?
Oops, a bit of redundancy I didn't catch because it got optimized away. It basically just returns the loaded __m128. I will fix my first post, but it didn't fix the problem.
IMHO the compiler simply doesn't have enough free registers in 32-bit code to keep all the needed values in registers.

If you compile it targeting 64-bit code, there won't be any loads from the stack after the inner loop:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
; 64-bit (x64) compiler output for the same loops, Intel syntax. The excerpt starts at the
; inner-loop label, so register setup is not visible; roles below are inferred by matching
; the instructions against the C++ source in the post:
;   r14  - base of the sample buffer          r11d - float index into it (`index`)
;   rdi  - offset added to both indices       r10d - LUT offset (`LutIndex`)
;          (`power2` in the source)
;   r15  - LUT multiplied with the shuffled difference (sin table in the source)
;   rbp  - LUT multiplied with the plain difference (cos table in the source);
;          used here as an ordinary base register, not as a frame pointer
;   r9   - remaining inner-loop iterations
; No loads from the stack appear anywhere in this excerpt.
$LL3@FFT_only_d:
	movaps	xmm2, XMMWORD PTR [r14+r11*4]	; xmm2 = first operand vector
	lea	rcx, QWORD PTR [r14+r11*4]	; rcx = address of first operand (reused for the store)
	mov	eax, r11d			; rax = index (32-bit write zero-extends into rax)
	add	rax, rdi			; rax = index + rdi
	add	r11d, 4				; index += 4 floats
	lea	rdx, QWORD PTR [r14+rax*4]	; rdx = address of second operand
	mov	eax, r10d			; rax = LUT offset (zero-extended)
	add	r10d, 4				; LUT offset += 4 floats
	add	rax, rdi			; rax = LUT offset + rdi
	movaps	xmm0, XMMWORD PTR [rdx]		; xmm0 = second operand vector
	subps	xmm2, xmm0			; xmm2 = diff = first - second
	addps	xmm0, XMMWORD PTR [rcx]		; xmm0 = sum
	movaps	xmm1, XMMWORD PTR [r15+rax*4-16]	; LUT entry at (rax - 4) floats, for the shuffled difference
	movaps	XMMWORD PTR [rcx], xmm0		; store sum over the first operand
	movaps	xmm0, xmm2			; copy diff so it can be shuffled
	shufps	xmm0, xmm2, 177				; 000000b1H
	mulps	xmm1, xmm0			; xmm1 = diff_shuffled * LUT entry from r15
	movaps	xmm0, XMMWORD PTR [rbp+rax*4-16]	; LUT entry at (rax - 4) floats, for the plain difference
	mulps	xmm0, xmm2			; xmm0 = diff * LUT entry from rbp
	addps	xmm1, xmm0			; xmm1 = store2
	movaps	XMMWORD PTR [rdx], xmm1		; store store2 over the second operand
	dec	r9				; one fewer inner iteration left
	jne	SHORT $LL3@FFT_only_d		; repeat inner loop while r9 != 0
$LN1@FFT_only_d:
	cmp	r8d, esi			; presumably the middle-loop test (k < NumberOfCmplxSamples), unsigned
	jb	$LL5@FFT_only_d			; next block; target label is outside this excerpt
$LN7@FFT_only_d:
	shr	ebx, 1				; halve ebx; presumably one of the >>= 1 updates in the outer loop
	dec	r12				; one fewer outer pass left
	jne	$LL8@FFT_only_dp		; next outer pass; target label is outside this excerpt