1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195 | #include "fs_lib.h"
#include "fs_rect.h"
#include "fs_vector.h"
#include <immintrin.h>
#include <Windows.h>
// Alignment attribute in MSVC __declspec syntax; requests 16-byte alignment
// for the declaration it is attached to.
// NOTE(review): 256-bit aligned accesses (_mm256_load_si256 /
// _mm256_store_si256) require 32-byte alignment, so an object declared with
// ALIGN is not guaranteed to be safe for them — confirm every use.
#define ALIGN __declspec(align(16))
// Two-component vector of fsr32.
typedef fsVec<fsr32, 2> fsv2f;
// Rectangle with fsi32 coordinates (second template argument is 2,
// presumably the number of dimensions — confirm against fs_rect.h).
typedef fsRect<fsi32, 2> fsr2i;
// Four-component fsr32 vector used as a colour; this file reads it through
// the .r, .g, .b and .a members.
typedef fsVec<fsr32, 4> fsColorf;
// Pixel surface that the fill routine in this file renders into.
struct fsOffscreenBuffer {
// Pixel storage, one fsu32 per pixel. The fill routine unpacks bits 0-7,
// 8-15 and 16-23 as R, G and B and writes alpha into bits 24-31.
fsu32 *buffer;
// Row length in pixels; the fill routine also uses it as the row stride
// (row y starts at buffer + y * width).
fsu32 width;
// Height of the surface; main sets it from the rectangle height. The fill
// routine does not read it.
fsu32 height;
// Not read or written anywhere in this part of the file.
// NOTE(review): presumably a row stride — confirm its unit (bytes or pixels)
// before relying on it.
fsu32 pitch;
};
// Per-draw state handed to the fill routine.
struct fsRenderContext {
// Destination surface.
fsOffscreenBuffer *buffer;
// Every requested rectangle is intersected with this rectangle before any
// pixel is touched.
fsr2i clipRect;
// Row step of the fill loop: one call visits every scanlineTotal-th row,
// starting at the row selected by its `scanline` argument. Must be non-zero
// (it is used as a divisor).
fsu32 scanlineTotal;
};
// Advances *minY to the first row at or after it that belongs to the given
// scanline slot.
//
// Rows are treated as groups of `scanlineTotal` consecutive rows and
// `scanline` is the slot inside a group. Assuming *minY is non-negative and
// scanline < scanlineTotal, the result is the smallest row >= *minY whose
// index modulo scanlineTotal equals `scanline`.
//
// minY          in/out: first candidate row on entry, adjusted row on exit.
// scanline      slot index within a group.
// scanlineTotal group size; must be non-zero (used as a divisor).
//
// The arithmetic mixes a signed row with unsigned operands, so it is carried
// out in unsigned and converted back on assignment.
static void
_fix_minY(fsi32 *minY, fsu32 scanline, fsu32 scanlineTotal) {
    const fsi32 top = *minY;

    if (top % scanlineTotal != 0) {
        // First row of the group that contains `top`, moved to this slot.
        fsi32 candidate = (top / scanlineTotal) * scanlineTotal + scanline;
        // The slot may lie above `top` inside that group; use the next group.
        if (candidate < top) {
            candidate += scanlineTotal;
        }
        *minY = candidate;
        return;
    }

    // `top` is the first row of a group: the slot is `scanline` rows further.
    *minY = top + scanline;
}
// Rounds the horizontal span [*minX, *maxX) outwards to multiples of 8
// pixels and builds the per-lane write masks for the partially covered
// 8-pixel blocks at both ends. A lane is all-ones when its pixel lies inside
// the original span and zero otherwise; lane i corresponds to pixel
// (block start + i).
//
// minX       in/out: left edge; rounded down to a multiple of 8 on exit.
// maxX       in/out: right edge (exclusive); rounded up to a multiple of 8.
// startMask  out: mask for the first 8-pixel block of the rounded span.
// endMask    out: mask for the last 8-pixel block of the rounded span.
//
// When the rounded span is a single block, that block is both the first and
// the last one, so *startMask carries both the left and the right clip. A
// caller that applies only *startMask to a one-block row therefore cannot
// write past the right edge.
//
// The masks are built in registers from lane-index comparisons, so no
// aligned stack array (and no 32-byte alignment guarantee) is needed.
static void
_mm256_get_clip_masks_aligned(fsi32 *minX, fsi32 *maxX, __m256i *startMask, __m256i *endMask) {
    const fsi32 x0 = *minX;
    const fsi32 x1 = *maxX;

    // Lanes of the first block that lie left of x0 (0 when x0 is aligned).
    const fsi32 skippedLanes = x0 & 7;
    // Lanes of the last block that lie left of x1 (0 means the block is full).
    const fsi32 tailLanes = x1 & 7;

    const fsi32 alignedMinX = x0 & ~7;
    const fsi32 alignedMaxX = tailLanes ? (x1 & ~7) + 8 : x1;

    const __m256i laneIndex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);

    // Lane i is enabled when i >= skippedLanes.
    __m256i head = _mm256_cmpgt_epi32(laneIndex, _mm256_set1_epi32(skippedLanes - 1));
    // Lane i is enabled when i < tailLanes (all lanes when the edge is aligned).
    const __m256i tail = _mm256_cmpgt_epi32(_mm256_set1_epi32(tailLanes ? tailLanes : 8), laneIndex);

    // Single block: the left and right clips must be applied together.
    if (alignedMaxX - alignedMinX <= 8) {
        head = _mm256_and_si256(head, tail);
    }

    *minX = alignedMinX;
    *maxX = alignedMaxX;
    *startMask = head;
    *endMask = tail;
}
// Blends a solid colour over a rectangle of the context's buffer, eight
// pixels per step, visiting only the rows that belong to one scanline slot.
//
// rect      rectangle to fill; it is intersected with context->clipRect and
//           nothing is drawn when the intersection has no area.
// color     source colour; each component is scaled by 255 before use.
// context   destination buffer, clip rectangle and row step.
// scanline  slot of this call; rows advance by context->scanlineTotal.
//
// Per channel the result is dst * (1 - color.a) + color * 255, and the
// output alpha is color.a * 255. The colour is added without being
// multiplied by its alpha, i.e. it is presumably expected to be
// premultiplied — confirm with callers. There is no clamping, so a channel
// that exceeds 255 would spill into the neighbouring channel's bits.
//
// The span is rounded outwards to 8-pixel blocks. Every block is loaded and
// stored in full; lanes outside the rectangle are written back with the
// value just read. The buffer must therefore be addressable up to those
// 8-pixel boundaries on every visited row.
//
// Unaligned loads/stores are used because nothing here guarantees that the
// pixel pointer is 32-byte aligned (that depends on how the buffer was
// allocated and on its width).
static void
_fill_rect_avx2(fsr2i rect, fsColorf color, fsRenderContext *context, fsu32 scanline) {
    fsr2i fillRect = fs_rect_intersect(rect, context->clipRect);
    if (!fs_rect_has_area(fillRect)) {
        return;
    }

    fsi32 minX = fs_rect_left_edge(fillRect);
    fsi32 minY = fs_rect_top_edge(fillRect);
    fsi32 maxX = fs_rect_right_edge(fillRect);
    fsi32 maxY = fs_rect_bottom_edge(fillRect);

    // Move the first row onto this call's scanline slot.
    fsu32 scanlineTotal = context->scanlineTotal;
    _fix_minY(&minY, scanline, scanlineTotal);

    // Must be computed before the colour is scaled to the 0..255 range.
    fsr32 invSrcAlpha = 1.f - color.a;
    color *= 255.f;

    __m256i kMaskFF = _mm256_set1_epi32(0xFF);
    __m256 colorR_8 = _mm256_set1_ps(color.r);
    __m256 colorG_8 = _mm256_set1_ps(color.g);
    __m256 colorB_8 = _mm256_set1_ps(color.b);
    __m256 colorA_8 = _mm256_set1_ps(color.a);
    __m256 invSrcAlpha_8 = _mm256_set1_ps(invSrcAlpha);

    // Rounds minX/maxX to multiples of 8 and yields the edge-block masks.
    __m256i rowStartClipMask_8, rowEndClipMask_8;
    _mm256_get_clip_masks_aligned(&minX, &maxX, &rowStartClipMask_8, &rowEndClipMask_8);

    fsOffscreenBuffer *buffer = context->buffer;
    for (fsi32 y = minY; y < maxY; y += scanlineTotal) {
        // Rows are addressed with a stride of buffer->width pixels.
        fsu32 *pixel = buffer->buffer + (intptr_t)(y * buffer->width + minX);

        // The first block of every row uses the start mask.
        __m256i writeMask_8 = rowStartClipMask_8;
        for (fsi32 x = minX; x < maxX; x += 8) {
            __m256i pixel_8 = _mm256_loadu_si256((const __m256i *)pixel);

            // Unpack the destination channels (R in the low byte).
            __m256i pixelR_8 = _mm256_and_si256(pixel_8, kMaskFF);
            __m256i pixelG_8 = _mm256_and_si256(_mm256_srli_epi32(pixel_8, 8), kMaskFF);
            __m256i pixelB_8 = _mm256_and_si256(_mm256_srli_epi32(pixel_8, 16), kMaskFF);

            __m256 blendedR_8 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(pixelR_8), invSrcAlpha_8), colorR_8);
            __m256 blendedG_8 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(pixelG_8), invSrcAlpha_8), colorG_8);
            __m256 blendedB_8 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(pixelB_8), invSrcAlpha_8), colorB_8);
            __m256 blendedA_8 = colorA_8;

            // Repack as R | G << 8 | B << 16 | A << 24.
            __m256i blendedRG_8 = _mm256_or_si256(_mm256_cvtps_epi32(blendedR_8), _mm256_slli_epi32(_mm256_cvtps_epi32(blendedG_8), 8));
            __m256i blendedBA_8 = _mm256_or_si256(_mm256_slli_epi32(_mm256_cvtps_epi32(blendedB_8), 16), _mm256_slli_epi32(_mm256_cvtps_epi32(blendedA_8), 24));
            __m256i out_8 = _mm256_or_si256(blendedRG_8, blendedBA_8);

            // Keep the loaded value in lanes that are masked out.
            __m256i maskedOut_8 = _mm256_or_si256(_mm256_and_si256(writeMask_8, out_8), _mm256_andnot_si256(writeMask_8, pixel_8));
            _mm256_storeu_si256((__m256i *)pixel, maskedOut_8);
            pixel += 8;

            // Mask for the next block: full unless that block is the last one.
            if (x + 16 < maxX) {
                writeMask_8 = _mm256_set1_epi32(-1);
            } else {
                writeMask_8 = rowEndClipMask_8;
            }
        }
    }
}
// Performance-counter frequency. main fills it in with
// QueryPerformanceFrequency, and get_seconds_elapsed divides tick deltas by
// it, so it must be set before get_seconds_elapsed is called.
LARGE_INTEGER g_timebaseFreq;
uint64_t
get_absolute_time() {
LARGE_INTEGER time;
QueryPerformanceCounter(&time);
return time.QuadPart;
}
// Converts the distance between two performance-counter readings to seconds.
//
// absStart  earlier reading, in counter ticks.
// absEnd    later reading, in counter ticks.
// Returns (absEnd - absStart) / g_timebaseFreq as a float; g_timebaseFreq
// must already hold the counter frequency.
float
get_seconds_elapsed(uint64_t absStart, uint64_t absEnd) {
    const float ticksPerSecond = (float)g_timebaseFreq.QuadPart;
    const float tickDelta = (float)(absEnd - absStart);
    return tickDelta / ticksPerSecond;
}
// Micro-benchmark: fills a 1240x780 buffer 100 times with _fill_rect_avx2
// and prints the duration of every fill in milliseconds.
//
// Returns 0 on success and 1 when the pixel buffer cannot be allocated.
int main(int argc, const char * argv[])
{
    QueryPerformanceFrequency(&g_timebaseFreq);

    fsr2i rect = fs_rect_create(0, 0, 1240, 780);
    fsColorf color = fs_vcreate<fsr32, 4>(0.f, 0.4f, 0.8f, 1.f);

    fsOffscreenBuffer buffer;
    buffer.width = rect.size.w;
    buffer.height = rect.size.h;
    // NOTE(review): pitch is not read by _fill_rect_avx2; it is set to the row
    // size in bytes so the field is never left indeterminate — confirm the
    // intended unit.
    buffer.pitch = (fsu32)(buffer.width * sizeof(fsu32));
    // Zero-filled so the blend never reads uninitialised pixels.
    buffer.buffer = (fsu32 *)calloc((size_t)buffer.width * buffer.height, sizeof(fsu32));
    if (!buffer.buffer) {
        fprintf(stderr, "failed to allocate %u x %u pixel buffer\n",
                (unsigned)buffer.width, (unsigned)buffer.height);
        return 1;
    }

    fsRenderContext renderContext;
    renderContext.buffer = &buffer;
    renderContext.clipRect = rect;
    // A step of 1 with slot 0 makes a single call cover every row.
    renderContext.scanlineTotal = 1;

    for (int it = 0; it < 100; it++) {
        uint64_t startTime = get_absolute_time();
        _fill_rect_avx2(rect, color, &renderContext, 0);
        uint64_t endTime = get_absolute_time();

        float elapsed = get_seconds_elapsed(startTime, endTime);
        printf("%d, %f\n", it, elapsed * 1000.f);
    }

    free(buffer.buffer);
    return 0;
}
|