Hello,
I'm writing an FFT function which takes 3 float pointers to arrays of float.
After every iteration of the main loop, the compiler reloads the pointers from the stack; my guess is that this is caused by pointer aliasing.
Does anybody know how I can solve it?
The code is below.
Thanks.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40 | static void FFT_only_dest_SSE(r32*__restrict src_cmplex,r32*__restrict dest_cmplex,u32 logpower2,s32*__restrict index,r32*__restrict SinLUT,r32*__restrict CosLUT)
// Runs logpower2-1 passes over src_cmplex, in place, four floats at a time.
// In each pass the buffer is walked in blocks of 2*power2 floats; every group
// of four floats in the first half of a block is combined with the group
// power2 floats further on: the sum overwrites the first-half group and a
// LUT-weighted difference overwrites the second-half group. power2 halves
// after every pass.
//
//   src_cmplex  - buffer that is read and overwritten. Accessed only through
//                 _mm_load_ps/_mm_store_ps, so every address formed below has
//                 to be 16-byte aligned.
//                 (presumably interleaved real/imaginary floats, going by the
//                 local variable names - confirm against the caller)
//   dest_cmplex - not referenced anywhere in the code shown.
//   logpower2   - 1<<logpower2 is used as the sample count.
//   index       - not referenced in the code shown; the innermost loop
//                 declares its own u32 `index`, which shadows this parameter.
//   SinLUT,
//   CosLUT      - tables read with _mm_load_ps at float offset
//                 power2-4+LutIndex, so they need 16-byte alignment as well.
//
// NOTE(review): the '{' that opens the function body and the matching final
// '}' are not present in this listing; the snippet may be missing further
// lines, so this describes only what is visible.
// NOTE(review): if u32 is an unsigned type (typedef not shown), calling this
// with logpower2==0 makes `logpower2-1` wrap to a huge value - confirm that
// callers never pass 0.
// t1 and t2 are never used in the code shown.
unsigned int t1,t2;
u32 NumberOfCmplxSamples=(1<<(logpower2));
// power2 is the per-pass block size; it starts at the full sample count.
u32 power2=NumberOfCmplxSamples;
// halfpower2 is kept in step with power2 but is never read in the code shown.
u32 halfpower2=power2>>1;
r32*__restrict now_sin_lut=SinLUT;//local restrict-qualified copy: the author's attempt to overcome the aliasing.
r32*__restrict now_cos_lut=CosLUT;
// One pass per j; power2 and halfpower2 are halved in the loop header.
for(u32 j=0;j<logpower2-1;j++,power2>>=1,halfpower2>>=1)
{
// k has no increment here: it is advanced by the innermost loop, which moves
// it forward by exactly power2 per block.
for(u32 k=0;k<NumberOfCmplxSamples;)
{
u32 limit=k+power2;
// The LUT offset restarts at 0 for every block.
u32 LutIndex=0;
// `index` is a float offset into src_cmplex starting at 2*k. k, index and
// LutIndex all step by 4, so this loop runs power2/4 times and `index` covers
// the first power2 floats of the block.
for(u32 index=k*2;k<limit;k+=4,index+=4,LutIndex+=4)
{
// IACA_START
// Four floats from the first half of the block and the four floats power2
// further on.
__m128 sample1_3realandcomplex=_mm_load_ps(src_cmplex+index);
__m128 sample2_4realandcomplex=_mm_load_ps(src_cmplex+index+power2);
__m128 sum=_mm_add_ps(sample1_3realandcomplex,sample2_4realandcomplex);
__m128 diff=_mm_sub_ps(sample1_3realandcomplex,sample2_4realandcomplex);
// 0b10110001 selects lanes 1,0,3,2: the two floats of each adjacent pair
// in diff are swapped.
__m128 diff_shuffled=_mm_shuffle_ps(diff,diff,0b10110001);
__m128 trig1,trig2;
// Both tables are read at the same float offset, power2-4+LutIndex.
trig2=_mm_load_ps((now_sin_lut+(power2)-4+LutIndex));;
trig1=_mm_load_ps((now_cos_lut+(power2)-4+LutIndex));
// store2 = diff*cos-table values + pair-swapped diff*sin-table values.
// (any sign needed for a complex multiply would have to be baked into the
// table contents - TODO confirm)
__m128 store2=_mm_add_ps(_mm_mul_ps(diff,trig1),_mm_mul_ps(diff_shuffled,trig2));
// Results overwrite the two input groups in place.
_mm_store_ps(src_cmplex+index,sum);
_mm_store_ps(src_cmplex+index+power2,store2);
}
// IACA_END
}
}
|
And here is the assembly generated for the main loop:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 | movaps xmm0, xmmword ptr [ecx] ; xmm0 = four floats at ecx (subtrahend below)
movaps xmm2, xmmword ptr [edi] ; xmm2 = four floats at edi
movaps xmm1, xmmword ptr [edx+esi*1] ; xmm1 = four floats at edx+esi; edx acts as a fixed offset from esi
subps xmm2, xmm0 ; xmm2 = [edi] - [ecx]  (the difference)
addps xmm0, xmmword ptr [edi] ; xmm0 = [ecx] + [edi]  (the sum); [edi] is read a second time here
movaps xmmword ptr [edi], xmm0 ; sum written back to [edi]
movaps xmm0, xmm2 ; copy of the difference for the shuffle
shufps xmm0, xmm2, 0xb1 ; 0xb1 picks lanes 1,0,3,2: swap the floats of each adjacent pair
add edi, 0x10 ; edi advances 16 bytes (four floats)
mulps xmm1, xmm0 ; xmm1 = [edx+esi] * pair-swapped difference
movaps xmm0, xmmword ptr [esi] ; xmm0 = four floats at esi
add esi, 0x10 ; esi advances 16 bytes; edx+esi advances with it
mulps xmm0, xmm2 ; xmm0 = [esi] * difference
addps xmm1, xmm0 ; xmm1 = sum of the two products
movaps xmmword ptr [ecx], xmm1 ; result written back to [ecx]
add ecx, 0x10 ; ecx advances 16 bytes
sub eax, 0x1 ; eax is the remaining-iteration counter
jnz 0xffffffc1 ; backward branch while eax != 0 (presumably to the first movaps above)
[b]mov edx, dword ptr [ebp-0x78] ; from here on the code is past the jnz: it runs when the loop exits, not on every iteration
mov esi, dword ptr [ebp-0x7c] ; esi reloaded from an ebp-relative slot (it was advanced inside the loop)
mov edi, dword ptr [ebp-0x8c][/b] ; edi reloaded from an ebp-relative slot (it was advanced inside the loop)
|