module CPUblit.composing.blitter;

import CPUblit.composing.common;

/*
 * CPUblit
 * Blitter composing functions.
 * Author: Laszlo Szeremi
 *
 * The functions can be used on 8, 16, and 32 bit datatypes. These cannot deal with alignments related to datatypes less
 * than 8 bit, or with 24 bit.
 * 8 and 16 bit blitters copy an image over another with either treating 0 as transparency, or getting transparency
 * information from the mask operator, which must be either U.min (for overwriting) or U.max (for transparency). Mask can
 * be 8 and 16 bit.
 * 32 bit blitter copies an image over another by either using the alpha channel from the src operator or from a supplied
 * mask. Mask can be either 32 bit or 8 bit, based on pointer type.
 */

@nogc pure nothrow {
	/**
	 * 2 operator blitter.
	 * Copies `length` elements from `src` onto `dest` in place. For 8 and 16 bit types a source value of 0 is
	 * treated as transparent; for 32 bit types transparency is taken from the alpha channel of `src`
	 * (selected by ALPHABLEND_SSE2_AMASK — alpha of 0 keeps the destination pixel).
	 */
	void blitter(T)(T* src, T* dest, size_t length) {
		static enum MAINLOOP_LENGTH = 16 / T.sizeof;	// elements per full 128 bit vector
		static enum HALFLOAD_LENGTH = 8 / T.sizeof;		// elements per 64 bit load
		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;		// elements per 32 bit load
		// Main loop: 128 bits (16 bytes) per iteration.
		while (length >= MAINLOOP_LENGTH) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			// maskV lanes become all-ones where src is transparent, so those destination lanes survive the OR.
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storeu_si128(cast(__m128i*)dest, destV);
			src += MAINLOOP_LENGTH;
			dest += MAINLOOP_LENGTH;
			length -= MAINLOOP_LENGTH;
		}
		// 64 bit remainder.
		if (length >= HALFLOAD_LENGTH) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storel_epi64(cast(__m128i*)dest, destV);
			src += HALFLOAD_LENGTH;
			dest += HALFLOAD_LENGTH;
			length -= HALFLOAD_LENGTH;
		}
		// 32 bit remainder.
		if (length >= QUTRLOAD_LENGTH) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storeu_si32(dest, destV);
			// For uint the 32 bit load was the last possible element, so no pointer stepping is needed.
			static if (!is(T == uint)) {
				src += QUTRLOAD_LENGTH;
				dest += QUTRLOAD_LENGTH;
				length -= QUTRLOAD_LENGTH;
			}
		}
		// Scalar tail: up to 3 leftover ubyte elements, or a single leftover ushort element.
		static if (is(T == ubyte)) {
			while (length) {
				const ubyte mask = *src ? ubyte.min : ubyte.max;
				*dest = *src | (*dest & mask);
				src++;
				dest++;
				length--;
			}
		} else static if (is(T == ushort)) {
			if (length) {
				const ushort mask = *src ? ushort.min : ushort.max;
				*dest = *src | (*dest & mask);
			}
		}
	}
	/**
	 * 3 operator blitter.
	 * Same composition rule as the 2 operator variant, but reads from `dest` and writes the result to `dest0`,
	 * leaving `dest` untouched.
	 */
	void blitter(T)(T* src, T* dest, T* dest0, size_t length) {
		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
		while (length >= MAINLOOP_LENGTH) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += MAINLOOP_LENGTH;
			dest += MAINLOOP_LENGTH;
			dest0 += MAINLOOP_LENGTH;
			length -= MAINLOOP_LENGTH;
		}
		if (length >= HALFLOAD_LENGTH) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += HALFLOAD_LENGTH;
			dest += HALFLOAD_LENGTH;
			dest0 += HALFLOAD_LENGTH;
			length -= HALFLOAD_LENGTH;
		}
		if (length >= QUTRLOAD_LENGTH) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(T == ubyte))
				__m128i maskV = _mm_cmpeq_epi8(srcV, SSE2_NULLVECT);
			else static if (is(T == ushort))
				__m128i maskV = _mm_cmpeq_epi16(srcV, SSE2_NULLVECT);
			else static if (is(T == uint))
				__m128i maskV = _mm_cmpeq_epi32(srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK, SSE2_NULLVECT);
			destV = srcV | (destV & maskV);
			_mm_storeu_si32(dest0, destV);
			static if (!is(T == uint)) {
				src += QUTRLOAD_LENGTH;
				dest += QUTRLOAD_LENGTH;
				dest0 += QUTRLOAD_LENGTH;
				length -= QUTRLOAD_LENGTH;
			}
		}
		static if (is(T == ubyte)) {
			while (length) {
				const ubyte mask = *src ? ubyte.min : ubyte.max;
				*dest0 = *src | (*dest & mask);
				src++;
				dest++;
				dest0++;
				length--;
			}
		} else static if (is(T == ushort)) {
			if (length) {
				const ushort mask = *src ? ushort.min : ushort.max;
				*dest0 = *src | (*dest & mask);
			}
		}
	}
	/**
	 * 3 operator blitter with an external mask.
	 * Transparency comes from `mask` instead of `src`: each mask value must be U.min (overwrite) or U.max
	 * (keep destination). 8 bit images need an 8 bit mask; 16 bit images accept 8 or 16 bit masks (8 bit masks
	 * are widened by duplicating each byte); 32 bit images accept 8 or 32 bit masks (tested against zero).
	 */
	void blitter(T,M)(T* src, T* dest, size_t length, M* mask) {
		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
		while (length >= MAINLOOP_LENGTH) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					// Widen 8 bit mask values to 16 bit by duplicating each byte into both halves of its lane.
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
					maskV = _mm_unpacklo_epi8(maskV, maskV);
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storeu_si128(cast(__m128i*)dest, destV);
			src += MAINLOOP_LENGTH;
			dest += MAINLOOP_LENGTH;
			mask += MAINLOOP_LENGTH;
			length -= MAINLOOP_LENGTH;
		}
		if (length >= HALFLOAD_LENGTH) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV = _mm_loadu_si32(mask);
					maskV = _mm_unpacklo_epi8(maskV, maskV);
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadl_epi64(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storel_epi64(cast(__m128i*)dest, destV);
			src += HALFLOAD_LENGTH;
			dest += HALFLOAD_LENGTH;
			mask += HALFLOAD_LENGTH;
			length -= HALFLOAD_LENGTH;
		}
		if (length >= QUTRLOAD_LENGTH) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					// Duplicate each mask byte into both bytes of its 16 bit lane; mask[0] belongs to the
					// low (first) pixel on little-endian x86, matching _mm_unpacklo_epi8 above.
					maskV[0] = (mask[1]<<24) | (mask[1]<<16) | (mask[0]<<8) | mask[0];
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si32(mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storeu_si32(dest, destV);
			static if (!is(T == uint)) {
				src += QUTRLOAD_LENGTH;
				dest += QUTRLOAD_LENGTH;
				mask += QUTRLOAD_LENGTH;
				length -= QUTRLOAD_LENGTH;
			}
		}
		static if (is(T == ubyte)) {
			while (length) {
				*dest = *src | (*dest & *mask);
				src++;
				dest++;
				mask++;
				length--;
			}
		} else static if (is(T == ushort)) {
			if (length) {
				// An 8 bit mask must be widened to cover the whole 16 bit pixel, like in the SIMD paths.
				static if (is(M == ubyte))
					const ushort m = cast(ushort)((*mask << 8) | *mask);
				else
					const ushort m = *mask;
				*dest = *src | (*dest & m);
			}
		}
	}
	/**
	 * 4 operator blitter with an external mask.
	 * Same as the masked 3 operator variant, but writes the result to `dest0` and leaves `dest` untouched.
	 */
	void blitter(T,M)(T* src, T* dest, T* dest0, size_t length, M* mask) {
		static enum MAINLOOP_LENGTH = 16 / T.sizeof;
		static enum HALFLOAD_LENGTH = 8 / T.sizeof;
		static enum QUTRLOAD_LENGTH = 4 / T.sizeof;
		while (length >= MAINLOOP_LENGTH) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
					maskV = _mm_unpacklo_epi8(maskV, maskV);
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += MAINLOOP_LENGTH;
			dest += MAINLOOP_LENGTH;
			dest0 += MAINLOOP_LENGTH;
			mask += MAINLOOP_LENGTH;
			length -= MAINLOOP_LENGTH;
		}
		if (length >= HALFLOAD_LENGTH) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV = _mm_loadu_si32(mask);
					maskV = _mm_unpacklo_epi8(maskV, maskV);
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadl_epi64(cast(__m128i*)mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += HALFLOAD_LENGTH;
			dest += HALFLOAD_LENGTH;
			dest0 += HALFLOAD_LENGTH;
			mask += HALFLOAD_LENGTH;
			length -= HALFLOAD_LENGTH;
		}
		if (length >= QUTRLOAD_LENGTH) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(T == ubyte)) {
				static assert(is(T == M), "8 bit mask and image types must match!");
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(T == ushort)) {
				static if (is(M == ushort)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					// Duplicate each mask byte into both bytes of its 16 bit lane; mask[0] belongs to the
					// low (first) pixel on little-endian x86, matching _mm_unpacklo_epi8 above.
					maskV[0] = (mask[1]<<24) | (mask[1]<<16) | (mask[0]<<8) | mask[0];
				} else static assert (0, "16 bit blitter only works with 8 or 16 bit masks!");
			} else static if (is(T == uint)) {
				static if (is(M == uint)) {
					__m128i maskV = _mm_cmpeq_epi32(_mm_loadu_si32(mask) & cast(__m128i)ALPHABLEND_SSE2_AMASK,
							SSE2_NULLVECT);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV = _mm_cmpeq_epi32(maskV, SSE2_NULLVECT);
				} else static assert (0, "32 bit blitter only works with 8 or 32 bit masks!");
			}
			destV = srcV | (destV & maskV);
			_mm_storeu_si32(dest0, destV);
			static if (!is(T == uint)) {
				src += QUTRLOAD_LENGTH;
				dest += QUTRLOAD_LENGTH;
				dest0 += QUTRLOAD_LENGTH;
				mask += QUTRLOAD_LENGTH;
				length -= QUTRLOAD_LENGTH;
			}
		}
		static if (is(T == ubyte)) {
			while (length) {
				*dest0 = *src | (*dest & *mask);
				src++;
				dest++;
				dest0++;
				mask++;
				length--;
			}
		} else static if (is(T == ushort)) {
			if (length) {
				// An 8 bit mask must be widened to cover the whole 16 bit pixel, like in the SIMD paths.
				static if (is(M == ubyte))
					const ushort m = cast(ushort)((*mask << 8) | *mask);
				else
					const ushort m = *mask;
				*dest0 = *src | (*dest & m);
			}
		}
	}
	///Blitter with dummy master value (`value` is ignored; kept for interface uniformity with other composers).
	void blitter(T)(T* src, T* dest, size_t length, ubyte value) {
		blitter(src, dest, length);
	}
	///Blitter with dummy master value (`value` is ignored; kept for interface uniformity with other composers).
	void blitter(T)(T* src, T* dest, T* dest0, size_t length, ubyte value) {
		blitter(src, dest, dest0, length);
	}
	///Blitter with dummy master value (`value` is ignored; kept for interface uniformity with other composers).
	void blitter(T,M)(T* src, T* dest, size_t length, M* mask, ubyte value) {
		blitter(src, dest, length, mask);
	}
	///Blitter with dummy master value (`value` is ignored; kept for interface uniformity with other composers).
	void blitter(T,M)(T* src, T* dest, T* dest0, size_t length, M* mask, ubyte value) {
		blitter(src, dest, dest0, length, mask);
	}

}

unittest {
	// All-zero sources and masks must leave the destination unchanged (still all zero),
	// exercising every vector path plus the scalar tails via the odd length of 255.
	{
		ubyte[255] a, b, c, d;
		blitter(a.ptr, b.ptr, 255);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255);
		testArrayForValue(c);
		blitter(a.ptr, b.ptr, 255, d.ptr);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255, d.ptr);
		testArrayForValue(c);
	}
	{
		ushort[255] a, b, c, d;
		blitter(a.ptr, b.ptr, 255);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255);
		testArrayForValue(c);
		blitter(a.ptr, b.ptr, 255, d.ptr);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255, d.ptr);
		testArrayForValue(c);
	}
	{
		uint[255] a, b, c, d;
		blitter(a.ptr, b.ptr, 255);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255);
		testArrayForValue(c);
		blitter(a.ptr, b.ptr, 255, d.ptr);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255, d.ptr);
		testArrayForValue(c);
	}
	{
		ushort[255] a, b, c;
		ubyte[255] d;
		blitter(a.ptr, b.ptr, 255);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255);
		testArrayForValue(c);
		blitter(a.ptr, b.ptr, 255, d.ptr);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255, d.ptr);
		testArrayForValue(c);
	}
	{
		uint[255] a, b, c;
		ubyte[255] d;
		blitter(a.ptr, b.ptr, 255);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255);
		testArrayForValue(c);
		blitter(a.ptr, b.ptr, 255, d.ptr);
		testArrayForValue(b);
		blitter(a.ptr, b.ptr, c.ptr, 255, d.ptr);
		testArrayForValue(c);
	}
}