9 posts
Edited by twelvefifteen on Reason: Initial post
Hey guys. After finishing the first SIMD section of HmH (~days 115-120) I started SIMDizing some functions in an image processing codebase of mine. While converting a function to get the average color of an image, I ran into a scenario which led me to use what I now know is called a horizontal add. I'd like to make sure I have the right idea with this technique, and whether it's even necessary at all in this case. Any help is appreciated.

Here's the code:

  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 static f32 HorizontalAdd(__m128 PackedSingle) { f32* PackedSinglePtr = (f32*)&PackedSingle; f32 Result = (PackedSinglePtr[0] + PackedSinglePtr[1] + PackedSinglePtr[2] + PackedSinglePtr[3]); return(Result); } static v4 GetMeanColor(loaded_raster* Raster) { __m128i MaskFF_4x = _mm_set1_epi32(0xFF); __m128 Inv255_4x = _mm_set1_ps(1.0f / 255.0f); __m128 Accumulator = _mm_set1_ps(0.0f); u32* SourceDest = (u32*)Raster->Address; for(s32 Y = 0; Y < Raster->Height; Y++) { for(s32 X = 0; X < Raster->Width; X += 4) { __m128i C = _mm_loadu_si128((__m128i*)SourceDest); __m128 Texelb = _mm_cvtepi32_ps(_mm_and_si128(C, MaskFF_4x)); __m128 Texelg = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 8), MaskFF_4x)); __m128 Texelr = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 16), MaskFF_4x)); __m128 Texela = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(C, 24), MaskFF_4x)); Texelb = _mm_mul_ps(Texelb, Inv255_4x); Texelg = _mm_mul_ps(Texelg, Inv255_4x); Texelr = _mm_mul_ps(Texelr, Inv255_4x); Texela = _mm_mul_ps(Texela, Inv255_4x); Accumulator = _mm_add_ps(Accumulator, _mm_set_ps(HorizontalAdd(Texela), HorizontalAdd(Texelb), HorizontalAdd(Texelg), HorizontalAdd(Texelr))); SourceDest += 4; } } __m128 InvPixelCount = _mm_set1_ps(1.0f / (Raster->Width*Raster->Height)); Accumulator = _mm_mul_ps(Accumulator, InvPixelCount); v4 Result; _mm_storeu_ps((f32*)&Result, Accumulator); return(Result); } 
498 posts
Edited by ratchetfreak on
You are halfway there.

You can have 4 accumulators, then you don't need to do a horizontal add until after the loop.

 1 2 3 4 Accumulatora = _mm_add_ps(Accumulatora, Texela); Accumulatorb = _mm_add_ps(Accumulatorb, Texelb); Accumulatorr = _mm_add_ps(Accumulatorr, Texelr); Accumulatorg = _mm_add_ps(Accumulatorg, Texelg); 
9 posts