I'm writing an FFT function which takes 3 float pointers to arrays of float.
After every iteration of the main loop, the compiler reads the pointers off the stack again; my guess is that this is because of pointer aliasing.
Does anybody know how I can solve it?
code below,
thanks
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | static void FFT_only_dest_SSE(r32*__restrict src_cmplex,r32*__restrict dest_cmplex,u32 logpower2,s32*__restrict index,r32*__restrict SinLUT,r32*__restrict CosLUT) unsigned int t1,t2; u32 NumberOfCmplxSamples=(1<<(logpower2)); u32 power2=NumberOfCmplxSamples; u32 halfpower2=power2>>1; r32*__restrict now_sin_lut=SinLUT;//trying to overcome the aliasing. r32*__restrict now_cos_lut=CosLUT; for(u32 j=0;j<logpower2-1;j++,power2>>=1,halfpower2>>=1) { for(u32 k=0;k<NumberOfCmplxSamples;) { u32 limit=k+power2; u32 LutIndex=0; for(u32 index=k*2;k<limit;k+=4,index+=4,LutIndex+=4) { // IACA_START __m128 sample1_3realandcomplex=_mm_load_ps(src_cmplex+index); __m128 sample2_4realandcomplex=_mm_load_ps(src_cmplex+index+power2); __m128 sum=_mm_add_ps(sample1_3realandcomplex,sample2_4realandcomplex); __m128 diff=_mm_sub_ps(sample1_3realandcomplex,sample2_4realandcomplex); __m128 diff_shuffled=_mm_shuffle_ps(diff,diff,0b10110001); __m128 trig1,trig2; trig2=_mm_load_ps((now_sin_lut+(power2)-4+LutIndex));; trig1=_mm_load_ps((now_cos_lut+(power2)-4+LutIndex)); __m128 store2=_mm_add_ps(_mm_mul_ps(diff,trig1),_mm_mul_ps(diff_shuffled,trig2)); _mm_store_ps(src_cmplex+index,sum); _mm_store_ps(src_cmplex+index+power2,store2); } // IACA_END } } |
And the assembly of the main loop:
movaps xmm0, xmmword ptr [ecx]
movaps xmm2, xmmword ptr [edi]
movaps xmm1, xmmword ptr [edx+esi*1]
subps xmm2, xmm0
addps xmm0, xmmword ptr [edi]
movaps xmmword ptr [edi], xmm0
movaps xmm0, xmm2
shufps xmm0, xmm2, 0xb1
add edi, 0x10
mulps xmm1, xmm0
movaps xmm0, xmmword ptr [esi]
add esi, 0x10
mulps xmm0, xmm2
addps xmm1, xmm0
movaps xmmword ptr [ecx], xmm1
add ecx, 0x10
sub eax, 0x1
jnz 0xffffffc1
[b]mov edx, dword ptr [ebp-0x78]
mov esi, dword ptr [ebp-0x7c]
mov edi, dword ptr [ebp-0x8c][/b]