module CPUblit.composing.alphablend;

import CPUblit.composing.common;

/*
 * CPUblit
 * Alpha-blending functions.
 * Author: Laszlo Szeremi
 *
 * Alpha-blending composes two images together using the following formula:
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest[rgba]) + (mask[aaaa] * src[rgba])
 * where `mask` is either a separate mask (a master value or a per-pixel one), or the alpha channel
 * extracted from src.
 * For speed's sake, these functions use integer arithmetic, so the following formula is used instead:
 * dest0[rgba] = (((256 - mask[aaaa]) * dest[rgba]) + ((1 + mask[aaaa]) * src[rgba])) >>> 8
 * This approach should have no downsides: the 256/+1 offsets guarantee that a mask value of 0 keeps dest
 * unchanged and a mask value of 255 copies src exactly.
 * When a master alpha is used, it is multiplied with whichever mask is in effect.
 *
 * These functions only work with 8 bit channels, and many require 32 bit values.
 * Masks can be either 8 bit per pixel, or 32 bit per pixel with the ability of processing up to 4 channels
 * independently (only when using vectors).
 *
 * Note on differences between the vector and non-vector implementations: vector implementations process all
 * four channels to save on complexity; non-vector implementations only process the three color channels to
 * save on processing speed.
 */
@nogc pure nothrow {
	/**
	 * 2 operator alpha-blending function.
	 */
	public void alphaBlend(uint* src, uint* dest, size_t length) {
		static if (USE_INTEL_INTRINSICS) {
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				//extract the alpha channel, then replicate it to all four channels
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				const int src1 = 1 + lsrc.a;
				const int src256 = 256 - lsrc.a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest = ldest;
				dest++;
				length--;
			}
		}
	}
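	/* Illustrative addition (not part of the original file): a worked example of the integer formula from
	 * the module header. With mask = 255 the source wins exactly, since
	 * ((256 - 255) * dest + (1 + 255) * src) >>> 8 == (dest + 256 * src) >>> 8 == src for any 8 bit dest;
	 * with mask = 0 the destination is kept exactly, since (256 * dest + src) >>> 8 == dest for any 8 bit
	 * src. The test below checks both endpoints through the public `alphaBlend`, using `Color32Bit` so the
	 * alpha lands in the active channel layout. */
	unittest {
		Color32Bit[4] src, dest;
		foreach (i; 0 .. 4) {
			src[i].r = 200;
			src[i].g = 200;
			src[i].b = 200;
			src[i].a = 255;		//fully opaque: the source must replace the destination
			dest[i].r = 17;
			dest[i].g = 17;
			dest[i].b = 17;
		}
		alphaBlend(cast(uint*)src.ptr, cast(uint*)dest.ptr, 4);
		foreach (px; dest) assert(px.r == 200 && px.g == 200 && px.b == 200);
		foreach (ref px; src) px.a = 0;		//fully transparent: the destination must be kept
		alphaBlend(cast(uint*)src.ptr, cast(uint*)dest.ptr, 4);
		foreach (px; dest) assert(px.r == 200 && px.g == 200 && px.b == 200);
	}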
	/**
	 * 3 operator alpha-blending function.
	 * The result is written to `dest0` instead of `dest`.
	 */
	public void alphaBlend(uint* src, uint* dest, uint* dest0, size_t length) {
		static if (USE_INTEL_INTRINSICS) {
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				dest0 += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				dest0 += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				const int src1 = 1 + lsrc.a;
				const int src256 = 256 - lsrc.a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest0 = ldest;
				dest++;
				dest0++;
				length--;
			}
		}
	}
	/**
	 * 3 operator alpha-blending function.
	 * Mask is either 8 or 32 bit per pixel.
	 */
	public void alphaBlend(M)(uint* src, uint* dest, size_t length, M* mask) {
		static if (USE_INTEL_INTRINSICS) {
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				mask += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				mask += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				static if (is(M == uint)) {
					const int src1 = 1 + (*mask & 0xFF);
					const int src256 = 256 - (*mask & 0xFF);
				} else static if (is(M == ubyte)) {
					const int src1 = 1 + *mask;
					const int src256 = 256 - *mask;
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest = ldest;
				dest++;
				mask++;
				length--;
			}
		}
	}
	/**
	 * 4 operator alpha-blending function.
	 * Mask is either 8 or 32 bit per pixel.
	 */
	public void alphaBlend(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
		static if (USE_INTEL_INTRINSICS) {
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				dest0 += 4;
				mask += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				dest0 += 2;
				mask += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				static if (is(M == uint)) {
					const int src1 = 1 + (*mask & 0xFF);
					const int src256 = 256 - (*mask & 0xFF);
				} else static if (is(M == ubyte)) {
					const int src1 = 1 + *mask;
					const int src256 = 256 - *mask;
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest0 = ldest;
				dest++;
				mask++;
				dest0++;
				length--;
			}
		}
	}
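	/* Illustrative addition (not part of the original file): a usage sketch of the masked overloads, where
	 * an 8 bit per-pixel mask drives the blend independently of the alpha stored in `src` (e.g. stencils).
	 * A mask value of 255 takes the source, 0 keeps the destination, intermediate values mix the two. */
	unittest {
		Color32Bit[2] src, dest;
		src[0].g = 255;		//pixel 0: pure green source
		src[1].b = 255;		//pixel 1: pure blue source
		ubyte[2] mask;
		mask[0] = 255;		//blend pixel 0 fully...
		mask[1] = 0;		//...but keep pixel 1 as it was
		alphaBlend(cast(uint*)src.ptr, cast(uint*)dest.ptr, 2, mask.ptr);
		assert(dest[0].g == 255);	//source copied
		assert(dest[1].b == 0);		//destination (zero-initialized) kept
	}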
	/**
	 * Fixed-value alpha-blending, 3 operator.
	 * `value` is applied to every pixel as the blend factor.
	 */
	public void alphaBlendFV(V)(uint* src, uint* dest, size_t length, V value) {
		static if (USE_INTEL_INTRINSICS) {
			//the blend factor is uniform, so the mask constants are computed once, up front;
			//only the low 64 bits are ever unpacked, so setting two lanes is enough
			__m128i maskV;
			static if (is(V == uint)) {
				maskV[0] = value;
				maskV[1] = value;
			} else static if (is(V == ubyte)) {
				maskV[0] = value;
				maskV[1] = value;
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Value must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			static if (is(V == uint)) {
				//the non-vector path uses the low byte of a 32 bit value
				const int src1 = 1 + (value & 0xFF);
				const int src256 = 256 - (value & 0xFF);
			} else static if (is(V == ubyte)) {
				const int src1 = 1 + value;
				const int src256 = 256 - value;
			} else static assert (0, "Value must be either 8 or 32 bits!");
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest = ldest;
				dest++;
				length--;
			}
		}
	}
	/**
	 * Fixed-value alpha-blending, 4 operator.
	 * The result is written to `dest0`.
	 */
	public void alphaBlendFV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		static if (USE_INTEL_INTRINSICS) {
			//the shifts replicate an 8 bit value to every channel, matching the 3 operator variant
			__m128i maskV;
			static if (is(V == uint)) {
				maskV[0] = value;
				maskV[1] = value;
			} else static if (is(V == ubyte)) {
				maskV[0] = value;
				maskV[1] = value;
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Value must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				dest0 += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				//the result goes to dest0, like in the main loop
				_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				dest0 += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			static if (is(V == uint)) {
				//the non-vector path uses the low byte of a 32 bit value
				const int src1 = 1 + (value & 0xFF);
				const int src256 = 256 - (value & 0xFF);
			} else static if (is(V == ubyte)) {
				const int src1 = 1 + value;
				const int src256 = 256 - value;
			} else static assert (0, "Value must be either 8 or 32 bits!");
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest0 = ldest;
				dest++;
				dest0++;
				length--;
			}
		}
	}
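	/* Illustrative addition (not part of the original file): a fixed blend factor applied to every pixel
	 * implements a crossfade between two buffers; a value of 128 yields a near-50% mix, as
	 * (129 * 255 + 128 * 0) >>> 8 == 128. */
	unittest {
		Color32Bit[1] src, dest;
		src[0].r = 255;		//one extreme of the fade
		dest[0].r = 0;		//the other extreme
		alphaBlendFV!ubyte(cast(uint*)src.ptr, cast(uint*)dest.ptr, 1, 128);
		assert(dest[0].r == 128);
	}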
	/**
	 * Alpha-blending with per-pixel alpha plus a fixed master value.
	 * `value` controls the overall alpha through an extra multiplication on the alpha extracted from the
	 * pixels.
	 * 2 operator.
	 */
	public void alphaBlendMV(V)(uint* src, uint* dest, size_t length, V value) {
		static if (USE_INTEL_INTRINSICS) {
			//the master value is uniform, so it is expanded to 16 bit lanes (+1 for the multiply trick)
			//once, up front; only the low 64 bits are ever unpacked, so setting two lanes is enough
			__m128i masterV;
			static if (is(V == uint)) {
				masterV[0] = value;
				masterV[1] = value;
			} else static if (is(V == ubyte)) {
				masterV[0] = value;
				masterV[1] = value;
				masterV |= _mm_slli_epi32(masterV, 8);
				masterV |= _mm_slli_epi32(masterV, 16);
			} else static assert (0, "Value must be either 8 or 32 bits!");
			masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				//the non-vector path uses the low byte of a 32 bit master value
				const int a = (lsrc.a * ((value & 0xFF) + 1)) >>> 8;
				const int src1 = 1 + a;
				const int src256 = 256 - a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest = ldest;
				dest++;
				length--;
			}
		}
	}
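	/* Illustrative addition (not part of the original file): the master value scales the alpha taken from
	 * the pixels themselves, so a fade-out only needs a changing `value` while the pixel data stays as-is.
	 * Here a fully opaque pixel is suppressed entirely by a master value of 0. */
	unittest {
		Color32Bit[1] src, dest;
		src[0].r = 200;
		src[0].a = 255;		//the pixel itself is fully opaque
		dest[0].r = 55;
		alphaBlendMV!ubyte(cast(uint*)src.ptr, cast(uint*)dest.ptr, 1, 0);
		assert(dest[0].r == 55);	//master value 0 keeps the destination untouched
	}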
	/**
	 * Alpha-blending with per-pixel alpha plus a fixed master value.
	 * `value` controls the overall alpha through an extra multiplication on the alpha extracted from the
	 * pixels.
	 * 3 operator.
	 */
	public void alphaBlendMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		static if (USE_INTEL_INTRINSICS) {
			//only the low 64 bits of the master value are ever unpacked, so setting two lanes is enough
			__m128i masterV;
			static if (is(V == uint)) {
				masterV[0] = value;
				masterV[1] = value;
			} else static if (is(V == ubyte)) {
				masterV[0] = value;
				masterV[1] = value;
				masterV |= _mm_slli_epi32(masterV, 8);
				masterV |= _mm_slli_epi32(masterV, 16);
			} else static assert (0, "Value must be either 8 or 32 bits!");
			masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				dest0 += 4;
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				dest0 += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				//the non-vector path uses the low byte of a 32 bit master value
				const int a = (lsrc.a * ((value & 0xFF) + 1)) >>> 8;
				const int src1 = 1 + a;
				const int src256 = 256 - a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest0 = ldest;
				dest++;
				dest0++;
				length--;
			}
		}
	}
	/**
	 * Alpha-blending with a per-pixel mask plus a fixed master value.
	 * `value` controls the overall alpha through an extra multiplication on the provided mask.
	 * 3 operator.
	 */
	public void alphaBlendMV(V,M)(uint* src, uint* dest, size_t length, M* mask, V value) {
		static if (USE_INTEL_INTRINSICS) {
			__m128i masterV;
			static if (is(V == uint)) {
				masterV[0] = value;
				masterV[1] = value;
			} else static if (is(V == ubyte)) {
				masterV[0] = value;
				masterV[1] = value;
				masterV |= _mm_slli_epi32(masterV, 8);
				masterV |= _mm_slli_epi32(masterV, 16);
			} else static assert (0, "Value must be either 8 or 32 bits!");
			masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				mask += 4;	//the mask pointer must advance alongside the pixel pointers
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				mask += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				static if (is(M == uint)) {
					const int m = *mask & 0xFF;	//the non-vector path uses the low byte of a 32 bit mask
				} else static if (is(M == ubyte)) {
					const int m = *mask;
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				const int a = (m * ((value & 0xFF) + 1)) >>> 8;
				const int src1 = 1 + a;
				const int src256 = 256 - a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest = ldest;
				dest++;
				mask++;
				length--;
			}
		}
	}
	/**
	 * Alpha-blending with a per-pixel mask plus a fixed master value.
	 * `value` controls the overall alpha through an extra multiplication on the provided mask.
	 * 4 operator.
	 */
	public void alphaBlendMV(V,M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
		static if (USE_INTEL_INTRINSICS) {
			__m128i masterV;
			static if (is(V == uint)) {
				masterV[0] = value;
				masterV[1] = value;
			} else static if (is(V == ubyte)) {
				masterV[0] = value;
				masterV[1] = value;
				masterV |= _mm_slli_epi32(masterV, 8);
				masterV |= _mm_slli_epi32(masterV, 16);
			} else static assert (0, "Value must be either 8 or 32 bits!");
			masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			while (length >= 4) {
				__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
				__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV[2] = mask[2];
					maskV[3] = mask[3];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
				_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
				src += 4;
				dest += 4;
				dest0 += 4;
				mask += 4;	//the mask pointer must advance alongside the pixel pointers
				length -= 4;
			}
			if (length >= 2) {
				__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
				__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV[1] = mask[1];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
				src += 2;
				dest += 2;
				dest0 += 2;
				mask += 2;
				length -= 2;
			}
			if (length) {
				__m128i srcV = _mm_loadu_si32(src);
				__m128i destV = _mm_loadu_si32(dest);
				static if (is(M == uint)) {
					__m128i maskV = _mm_loadu_si32(mask);
				} else static if (is(M == ubyte)) {
					__m128i maskV;
					maskV[0] = mask[0];
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
				mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
				__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
				mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
				__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
				__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
				src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
				_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			}
		} else {
			while (length) {
				Color32Bit lsrc = *cast(Color32Bit*)src, ldest = *cast(Color32Bit*)dest;
				static if (is(M == uint)) {
					const int m = *mask & 0xFF;	//the non-vector path uses the low byte of a 32 bit mask
				} else static if (is(M == ubyte)) {
					const int m = *mask;
				} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
				const int a = (m * ((value & 0xFF) + 1)) >>> 8;
				const int src1 = 1 + a;
				const int src256 = 256 - a;
				ldest.r = cast(ubyte)((lsrc.r * src1 + ldest.r * src256)>>>8);
				ldest.g = cast(ubyte)((lsrc.g * src1 + ldest.g * src256)>>>8);
				ldest.b = cast(ubyte)((lsrc.b * src1 + ldest.b * src256)>>>8);
				src++;
				*cast(Color32Bit*)dest0 = ldest;
				dest++;
				mask++;
				dest0++;
				length--;
			}
		}
	}
}
unittest {
	uint[255] a, b, c, d;
	ubyte[255] e;
	//0 values should stay 0
	alphaBlend(a.ptr, b.ptr, 255);
	testArrayForValue(b);
	alphaBlend(a.ptr, b.ptr, 255, d.ptr);
	testArrayForValue(b);
	alphaBlend(a.ptr, b.ptr, c.ptr, 255);
	testArrayForValue(c);
	alphaBlend(a.ptr, b.ptr, c.ptr, 255, d.ptr);
	testArrayForValue(c);
	alphaBlend(a.ptr, b.ptr, 255, e.ptr);
	testArrayForValue(b);
	alphaBlend(a.ptr, b.ptr, c.ptr, 255, e.ptr);
	testArrayForValue(c);
	alphaBlendFV!ubyte(a.ptr, b.ptr, 255, 0x0F);
	testArrayForValue(b);
	alphaBlendFV!ubyte(a.ptr, b.ptr, c.ptr, 255, 0x0F);
	testArrayForValue(c);
	alphaBlendFV!uint(a.ptr, b.ptr, 255, 0x0F0F0F0F);
	testArrayForValue(b);
	alphaBlendFV!uint(a.ptr, b.ptr, c.ptr, 255, 0x0F0F0F0F);
	testArrayForValue(c);
	alphaBlendMV!ubyte(a.ptr, b.ptr, 255, ubyte.max);
	testArrayForValue(b);
	alphaBlendMV!ubyte(a.ptr, b.ptr, 255, d.ptr, ubyte.max);
	testArrayForValue(b);
	alphaBlendMV!ubyte(a.ptr, b.ptr, c.ptr, 255, ubyte.max);
	testArrayForValue(c);
	alphaBlendMV!ubyte(a.ptr, b.ptr, c.ptr, 255, d.ptr, ubyte.max);
	testArrayForValue(c);
	alphaBlendMV!uint(a.ptr, b.ptr, 255, uint.max);
	testArrayForValue(b);
	alphaBlendMV!uint(a.ptr, b.ptr, 255, d.ptr, uint.max);
	testArrayForValue(b);
	alphaBlendMV!uint(a.ptr, b.ptr, c.ptr, 255, uint.max);
	testArrayForValue(c);
	alphaBlendMV!uint(a.ptr, b.ptr, c.ptr, 255, d.ptr, uint.max);
	testArrayForValue(c);
}