1 module CPUblit.composing.blitter;
2 
3 import CPUblit.composing.common;
4 
5 /*
6  * CPUblit
7  * Blitter composing functions.
8  * Author: Laszlo Szeremi
9  *
 * The functions can be used on 8, 16, and 32 bit datatypes. These cannot deal with alignments related to datatypes less 
 * than 8 bit, or with 24 bit.
 * 8 and 16 bit blitters copy an image over another by either treating 0 as transparency, or getting transparency
 * information from the mask operator, which must be either U.min (for overwriting) or U.max (for transparency). For 8 bit
 * images the mask must be 8 bit too; for 16 bit images the mask can be either 8 or 16 bit.
 * 32 bit blitter copies an image over another by either using the alpha channel from the src operator or from a supplied
 * mask. The mask can be either 32 bit or 8 bit, based on pointer type.
17  */
18 
19 @nogc pure nothrow {
20 	///2 operator blitter
21 	void blitter(T)(T* src, T* dest, size_t length) {
22 		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
23 		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
24 		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
25 		while (length >= MAINLOOP_LENGTH) {
26 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
27 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
28 			static if(is(T == ubyte))
29 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
30 			else static if(is(T == ushort))
31 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
32 			else static if(is(T == uint))
33 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
34 			destV = srcV | (destV & maskV);
35 			_mm_storeu_si128(cast(__m128i*)dest, destV);
36 			src += MAINLOOP_LENGTH;
37 			dest += MAINLOOP_LENGTH;
38 			length -= MAINLOOP_LENGTH;
39 		}
40 		if(length >= HALFLOAD_LENGTH){
41 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
42 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
43 			static if(is(T == ubyte))
44 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
45 			else static if(is(T == ushort))
46 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
47 			else static if(is(T == uint))
48 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
49 			destV = srcV | (destV & maskV);
50 			_mm_storel_epi64(cast(__m128i*)dest, destV);
51 			src += HALFLOAD_LENGTH;
52 			dest += HALFLOAD_LENGTH;
53 			length -= HALFLOAD_LENGTH;
54 		}
55 		if(length >= QUTRLOAD_LENGTH){
56 			__m128i srcV = _mm_loadu_si32(src);
57 			__m128i destV = _mm_loadu_si32(dest);
58 			static if(is(T == ubyte))
59 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
60 			else static if(is(T == ushort))
61 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
62 			else static if(is(T == uint))
63 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
64 			destV = srcV | (destV & maskV);
65 			_mm_storeu_si32(dest, destV);
66 			static if(!is(T == uint)){
67 				src += QUTRLOAD_LENGTH;
68 				dest += QUTRLOAD_LENGTH;
69 				length -= QUTRLOAD_LENGTH;
70 			}
71 		}
72 		static if(is(T == ubyte)){
73 			while(length){
74 				const ubyte mask = *src ? ubyte.min : ubyte.max;
75 				*dest = *src | (*dest & mask);
76 				src++;
77 				dest++;
78 				length--;
79 			}
80 		}else static if(is(T == ushort)){
81 			if(length){
82 				const ushort mask = *src ? ushort.min : ushort.max;
83 				*dest = *src | (*dest & mask);
84 			}
85 		}
86 	}
87 	///3 operator blitter
88 	void blitter(T)(T* src, T* dest, T* dest0, size_t length) {
89 		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
90 		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
91 		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
92 		while (length >= MAINLOOP_LENGTH) {
93 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
94 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
95 			static if(is(T == ubyte))
96 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
97 			else static if(is(T == ushort))
98 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
99 			else static if(is(T == uint))
100 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
101 			destV = srcV | (destV & maskV);
102 			_mm_storeu_si128(cast(__m128i*)dest0, destV);
103 			src += MAINLOOP_LENGTH;
104 			dest += MAINLOOP_LENGTH;
105 			dest0 += MAINLOOP_LENGTH;
106 			length -= MAINLOOP_LENGTH;
107 		}
108 		if (length >= HALFLOAD_LENGTH) {
109 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
110 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
111 			static if(is(T == ubyte))
112 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
113 			else static if(is(T == ushort))
114 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
115 			else static if(is(T == uint))
116 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
117 			destV = srcV | (destV & maskV);
118 			_mm_storel_epi64(cast(__m128i*)dest0, destV);
119 			src += HALFLOAD_LENGTH;
120 			dest += HALFLOAD_LENGTH;
121 			dest0 += HALFLOAD_LENGTH;
122 			length -= HALFLOAD_LENGTH;
123 		}
124 		if (length >= QUTRLOAD_LENGTH) {
125 			__m128i srcV = _mm_loadu_si32(src);
126 			__m128i destV = _mm_loadu_si32(dest);
127 			static if(is(T == ubyte))
128 				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
129 			else static if(is(T == ushort))
130 				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
131 			else static if(is(T == uint))
132 				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
133 			destV = srcV | (destV & maskV);
134 			_mm_storeu_si32(dest0, destV);
135 			static if(!is(T == uint)){
136 				src += QUTRLOAD_LENGTH;
137 				dest += QUTRLOAD_LENGTH;
138 				dest0 += QUTRLOAD_LENGTH;
139 				length -= QUTRLOAD_LENGTH;
140 			}
141 		}
142 		static if(is(T == ubyte)) {
143 			while (length) {
144 				const ubyte mask = *src ? ubyte.min : ubyte.max;
145 				*dest0 = *src | (*dest & mask);
146 				src++;
147 				dest++;
148 				dest0++;
149 				length--;
150 			}
151 		} else static if(is(T == ushort)) {
152 			if (length) {
153 				const ushort mask = *src ? ushort.min : ushort.max;
154 				*dest0 = *src | (*dest & mask);
155 			}
156 		}
157 	}
158 	///3 operator blitter
159 	void blitter(T,M)(T* src, T* dest, size_t length, M* mask) {
160 		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
161 		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
162 		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
163 		while(length >= MAINLOOP_LENGTH){
164 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
165 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
166 			static if (is(T == ubyte)) {
167 				static assert(is(T == M), "8 bit mask and image types must match!");
168 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
169 			} else static if (is(T == ushort)) {
170 				static if (is(M == ushort)) {
171 					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
172 				} else static if (is(M == ubyte)) {
173 					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
174 					maskV = _mm_unpacklo_epi8(maskV, maskV);
175 				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
176 			} else static if(is(T == uint)) {
177 				static if (is(M == uint)) {
178 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
179 							SSE2_NULLVECT);
180 				} else static if (is(M == ubyte)) {
181 					__m128i maskV;
182 					maskV[0] = mask[0];
183 					maskV[1] = mask[1];
184 					maskV[2] = mask[2];
185 					maskV[3] = mask[3];
186 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
187 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
188 			}
189 			destV = srcV | (destV & maskV);
190 			_mm_storeu_si128(cast(__m128i*)dest, destV);
191 			src += MAINLOOP_LENGTH;
192 			dest += MAINLOOP_LENGTH;
193 			mask += MAINLOOP_LENGTH;
194 			length -= MAINLOOP_LENGTH;
195 		}
196 		if(length >= HALFLOAD_LENGTH){
197 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
198 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
199 			static if (is(T == ubyte)) {
200 				static assert(is(T == M), "8 bit mask and image types must match!");
201 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
202 			} else static if (is(T == ushort)) {
203 				static if (is(M == ushort)) {
204 					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
205 				} else static if (is(M == ubyte)) {
206 					__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
207 					maskV = _mm_unpacklo_epi8(maskV, maskV);
208 				} else static assert (0, "16 bit blitter only works with ");
209 			} else static if(is(T == uint)) {
210 				static if (is(M == uint)) {
211 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadl_epi64(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
212 							SSE2_NULLVECT);
213 				} else static if (is(M == ubyte)) {
214 					__m128i maskV;
215 					maskV[0] = mask[0];
216 					maskV[1] = mask[1];
217 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
218 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
219 			}
220 			destV = srcV | (destV & maskV);
221 			_mm_storel_epi64(cast(__m128i*)dest, destV);
222 			src += HALFLOAD_LENGTH;
223 			dest += HALFLOAD_LENGTH;
224 			mask += HALFLOAD_LENGTH;
225 			length -= HALFLOAD_LENGTH;
226 		}
227 		if(length >= QUTRLOAD_LENGTH){
228 			__m128i srcV = _mm_loadu_si32(src);
229 			__m128i destV = _mm_loadu_si32(dest);
230 			static if (is(T == ubyte)) {
231 				static assert(is(T == M), "8 bit mask and image types must match!");
232 				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
233 			} else static if (is(T == ushort)) {
234 				static if (is(M == ushort)) {
235 					__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
236 				} else static if (is(M == ubyte)) {
237 					__m128i maskV;// = _mm_loadl_epi64(cast(__m128i*)mask);
238 					maskV[0] = (mask[0]<<24) | (mask[0]<<16) | (mask[1]<<8) | mask[1];
239 				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
240 			} else static if(is(T == uint)) {
241 				static if (is(M == uint)) {
242 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si32(mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
243 							SSE2_NULLVECT);
244 				} else static if (is(M == ubyte)) {
245 					__m128i maskV;
246 					maskV[0] = mask[0];
247 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
248 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
249 			}
250 			destV = srcV | (destV & maskV);
251 			_mm_storeu_si32(dest, destV);
252 			static if(!is(T == uint)){
253 				src += QUTRLOAD_LENGTH;
254 				dest += QUTRLOAD_LENGTH;
255 				mask += QUTRLOAD_LENGTH;
256 				length -= QUTRLOAD_LENGTH;
257 			}
258 		}
259 		static if(is(T == ubyte)){
260 			while(length){
261 				*dest = *src | (*dest & *mask);
262 				src++;
263 				dest++;
264 				mask++;
265 				length--;
266 			}
267 		}else static if(is(T == ushort)){
268 			if(length){
269 				*dest = *src | (*dest & *mask);
270 			}
271 		}
272 	}
273 	///4 operator blitter
274 	void blitter(T,M)(T* src, T* dest, T* dest0, size_t length, M* mask) {
275 		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
276 		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
277 		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
278 		while (length >= MAINLOOP_LENGTH) {
279 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
280 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
281 			static if (is(T == ubyte)) {
282 				static assert(is(T == M), "8 bit mask and image types must match!");
283 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
284 			} else static if (is(T == ushort)) {
285 				static if (is(M == ushort)) {
286 					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
287 				} else static if (is(M == ubyte)) {
288 					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
289 					maskV = _mm_unpacklo_epi8(maskV, maskV);
290 				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
291 			} else static if(is(T == uint)) {
292 				static if (is(M == uint)) {
293 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
294 							SSE2_NULLVECT);
295 				} else static if (is(M == ubyte)) {
296 					__m128i maskV;
297 					maskV[0] = mask[0];
298 					maskV[1] = mask[1];
299 					maskV[2] = mask[2];
300 					maskV[3] = mask[3];
301 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
302 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
303 			}
304 			destV = srcV | (destV & maskV);
305 			_mm_storeu_si128(cast(__m128i*)dest0, destV);
306 			src += MAINLOOP_LENGTH;
307 			dest += MAINLOOP_LENGTH;
308 			dest0 += MAINLOOP_LENGTH;
309 			mask += MAINLOOP_LENGTH;
310 			length -= MAINLOOP_LENGTH;
311 		}
312 		if (length >= HALFLOAD_LENGTH) {
313 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
314 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
315 			static if (is(T == ubyte)) {
316 				static assert(is(T == M), "8 bit mask and image types must match!");
317 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
318 			} else static if (is(T == ushort)) {
319 				static if (is(M == ushort)) {
320 					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
321 				} else static if (is(M == ubyte)) {
322 					__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
323 					maskV = _mm_unpacklo_epi8(maskV, maskV);
324 				} else static assert (0, "16 bit blitter only works with ");
325 			} else static if(is(T == uint)) {
326 				static if (is(M == uint)) {
327 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadl_epi64(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
328 							SSE2_NULLVECT);
329 				} else static if (is(M == ubyte)) {
330 					__m128i maskV;
331 					maskV[0] = mask[0];
332 					maskV[1] = mask[1];
333 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
334 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
335 			}
336 			destV = srcV | (destV & maskV);
337 			_mm_storel_epi64(cast(__m128i*)dest0, destV);
338 			src += HALFLOAD_LENGTH;
339 			dest += HALFLOAD_LENGTH;
340 			dest0 += HALFLOAD_LENGTH;
341 			mask += HALFLOAD_LENGTH;
342 			length -= HALFLOAD_LENGTH;
343 		}
344 		if (length >= QUTRLOAD_LENGTH) {
345 			__m128i srcV = _mm_loadu_si32(src);
346 			__m128i destV = _mm_loadu_si32(dest);
347 			static if (is(T == ubyte)) {
348 				static assert(is(T == M), "8 bit mask and image types must match!");
349 				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
350 			} else static if (is(T == ushort)) {
351 				static if (is(M == ushort)) {
352 					__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
353 				} else static if (is(M == ubyte)) {
354 					__m128i maskV;// = _mm_loadl_epi64(cast(__m128i*)mask);
355 					maskV[0] = (mask[0]<<24) | (mask[0]<<16) | (mask[1]<<8) | mask[1];
356 				} else static assert (0, "16 bit blitter only works with ");
357 			} else static if(is(T == uint)) {
358 				static if (is(M == uint)) {
359 					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si32(mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK, 
360 							SSE2_NULLVECT);
361 				} else static if (is(M == ubyte)) {
362 					__m128i maskV;
363 					maskV[0] = mask[0];
364 					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
365 				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
366 			}
367 			destV = srcV | (destV & maskV);
368 			_mm_storeu_si32(dest0, destV);
369 			static if(!is(T == uint)){
370 				src += QUTRLOAD_LENGTH;
371 				dest += QUTRLOAD_LENGTH;
372 				dest0 += QUTRLOAD_LENGTH;
373 				mask += QUTRLOAD_LENGTH;
374 				length -= QUTRLOAD_LENGTH;
375 			}
376 		}
377 		static if(is(T == ubyte)) {
378 			while (length) {
379 				*dest0 = *src | (*dest & *mask);
380 				src++;
381 				dest++;
382 				dest0++;
383 				mask++;
384 				length--;
385 			}
386 		} else static if(is(T == ushort)) {
387 			if (length) {
388 				*dest0 = *src | (*dest & *mask);
389 			}
390 		}
391 	}
392 	///Blitter with dummy master value
393 	void blitter(T)(T* src, T* dest, size_t length, ubyte value) {
394 		blitter(src, dest, length);
395 	}
396 	///Blitter with dummy master value
397 	void blitter(T)(T* src, T* dest, T* dest0, size_t length, ubyte value) {
398 		blitter(src, dest, dest0, length);
399 	}
400 	///Blitter with dummy master value
401 	void blitter(T,M)(T* src, T* dest, size_t length, M* mask, ubyte value) {
402 		blitter(src, dest, length, mask);
403 	}
404 	///Blitter with dummy master value
405 	void blitter(T,M)(T* src, T* dest, T* dest0, size_t length, M* mask, ubyte value) {
406 		blitter(src, dest, dest0, length, mask);
407 	}
408 
409 }
410 
unittest {
	//Calls the four non-dummy blitter overloads once for image type T and mask type M.
	//All buffers keep their default contents; after each call the buffer that was written is handed to
	//testArrayForValue.
	static void exercise(T, M)() {
		enum size_t LEN = 255;
		T[LEN] src, dest, dest0;
		M[LEN] mask;
		blitter(src.ptr, dest.ptr, LEN);
		testArrayForValue(dest);
		blitter(src.ptr, dest.ptr, dest0.ptr, LEN);
		testArrayForValue(dest0);
		blitter(src.ptr, dest.ptr, LEN, mask.ptr);
		testArrayForValue(dest);
		blitter(src.ptr, dest.ptr, dest0.ptr, LEN, mask.ptr);
		testArrayForValue(dest0);
	}
	//Mask type equal to the image type
	exercise!(ubyte, ubyte)();
	exercise!(ushort, ushort)();
	exercise!(uint, uint)();
	//8 bit masks on wider images
	exercise!(ushort, ubyte)();
	exercise!(uint, ubyte)();
}