module CPUblit.composing.screen;

import CPUblit.composing.common;

/*
 * CPUblit
 * Screen-blend compose functions.
 * Author: Laszlo Szeremi
 *
 * Screen-blend functions compose two images together using the following formula:
 * dest0[rgba] = 1 - (1 - src[rgba]) * (1 - dest[rgba])
 * This is translated to the following integer formula:
 * dest0[rgba] = 255 - (((256 - src[rgba]) * (255 - dest[rgba])) >>> 8)
 * If the alpha channel is enabled, it controls the blend between the screened value and the
 * original one:
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest[rgba]) + (mask[aaaa] * (1 - (1 - src[rgba]) * (1 - dest[rgba])))
 * In integer arithmetic, this is:
 * dest0[rgba] = ((256 - mask[aaaa]) * dest[rgba] + (mask[aaaa] + 1) * (255 - (((256 - src[rgba]) * (255 - dest[rgba])) >>> 8))) >>> 8
 */
@nogc pure nothrow {
	/**
	 * 2 Operator screen function.
	 * Composes `length` 32 bit (RGBA) pixels of `src` onto `dest` in place
	 * (`dest = screen(src, dest)`), 4 pixels per SSE2 iteration, then a 2 pixel
	 * and a 1 pixel tail.
	 */
	void screen(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
			// (256 - src) per channel, widened to 16 bits
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			// ((256 - src) * (255 - dest)) >>> 8
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpackhi_epi8(destV, SSE2_NULLVECT))), 8);
			// 255 - result, packed back down to 8 bit channels
			_mm_storeu_si128(cast(__m128i*) dest, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, src_hi)));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			_mm_storel_epi64(cast(__m128i*) dest, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			_mm_storeu_si32(dest, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
		}
	}
	/**
	 * 3 Operator screen function.
	 * Composes `length` 32 bit (RGBA) pixels of `src` onto `dest`, writing the
	 * result to `dest0` (`dest0 = screen(src, dest)`); `src` and `dest` are left
	 * unmodified.
	 */
	void screen(uint* src, uint* dest, uint* dest0, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpackhi_epi8(destV, SSE2_NULLVECT))), 8);
			_mm_storeu_si128(cast(__m128i*) dest0, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, src_hi)));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			_mm_storel_epi64(cast(__m128i*) dest0, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
			_mm_storeu_si32(dest0, _mm_subs_epu8(SSE2_FULLVECT,
					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
		}
	}
	/**
	 * 2 Operator screen function with blending.
	 * The alpha channel of `src` controls the blend between `dest` and the
	 * screened value; the result is written back to `dest`.
	 */
	void screenBl(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
			// screened value: 255 - (((256 - src) * (255 - dest)) >>> 8)
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

			// blend: (screened * (mask + 1) + dest * (256 - mask)) >>> 8
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 Operator screen function with blending.
215 */ 216 void screenBl(uint* src, uint* dest, uint* dest0, size_t length) { 217 while (length >= 4) { 218 __m128i srcV = _mm_loadu_si128(cast(__m128i*) src); 219 __m128i destV = _mm_loadu_si128(cast(__m128i*) dest); 220 __m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK; 221 version (cpublit_revalpha) { 222 maskV |= _mm_srli_epi32(maskV, 8); 223 maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A] 224 } else { 225 maskV |= _mm_slli_epi32(maskV, 8); 226 maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A] 227 } 228 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 229 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 230 __m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 231 _mm_unpackhi_epi8(srcV, SSE2_NULLVECT)); 232 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 233 __m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT); 234 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 235 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 236 src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, 237 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8); 238 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 239 src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi); 240 241 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 242 __m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT); 243 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 244 __m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1); 245 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 246 mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi); 247 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 248 src_hi = _mm_mullo_epi16(src_hi, mask0_hi); 249 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 250 dest_hi = _mm_mullo_epi16(dest_hi, mask_hi); 251 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 
252 src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8); 253 _mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi)); 254 src += 4; 255 dest += 4; 256 dest0 += 4; 257 length -= 4; 258 } 259 if (length >= 2) { 260 __m128i srcV = _mm_loadl_epi64(cast(__m128i*) src); 261 __m128i destV = _mm_loadl_epi64(cast(__m128i*) dest); 262 __m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK; 263 version (cpublit_revalpha) { 264 maskV |= _mm_srli_epi32(maskV, 8); 265 maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A] 266 } else { 267 maskV |= _mm_slli_epi32(maskV, 8); 268 maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A] 269 } 270 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 271 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 272 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 273 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 274 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 275 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 276 277 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 278 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 279 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 280 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 281 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 282 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 283 _mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 284 src += 2; 285 dest += 2; 286 dest0 += 2; 287 length -= 2; 288 } 289 if (length) { 290 __m128i srcV = _mm_loadu_si32(src); 291 __m128i destV = _mm_loadu_si32(dest); 292 __m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK; 293 version (cpublit_revalpha) { 294 maskV |= _mm_srli_epi32(maskV, 8); 295 maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A] 296 } else { 297 maskV |= _mm_slli_epi32(maskV, 8); 298 maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A] 299 } 300 __m128i src_lo = 
_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 301 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 302 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 303 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 304 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 305 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 306 307 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 308 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 309 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 310 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 311 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 312 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 313 _mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 314 } 315 } 316 317 /** 318 * 3 Operator multiply function with blending and masking. 319 */ 320 void screen(M)(uint* src, uint* dest, size_t length, M* mask) { 321 while (length >= 4) { 322 __m128i srcV = _mm_loadu_si128(cast(__m128i*) src); 323 __m128i destV = _mm_loadu_si128(cast(__m128i*) dest); 324 static if (is(M == uint)) { 325 __m128i maskV = _mm_loadu_si128(cast(__m128i*)mask); 326 } else static if (is(M == ubyte)) { 327 __m128i maskV; 328 maskV[0] = mask[0]; 329 maskV[1] = mask[1]; 330 maskV[2] = mask[2]; 331 maskV[3] = mask[3]; 332 maskV |= _mm_slli_epi32(maskV, 8); 333 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 334 } else static assert (0, "Alpha mask must be either 8 or 32 bits!"); 335 336 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 337 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 338 __m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 339 _mm_unpackhi_epi8(srcV, SSE2_NULLVECT)); 340 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 341 __m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT); 342 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 343 
_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 344 src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, 345 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8); 346 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 347 src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi); 348 349 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 350 __m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT); 351 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 352 __m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1); 353 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 354 mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi); 355 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 356 src_hi = _mm_mullo_epi16(src_hi, mask0_hi); 357 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 358 dest_hi = _mm_mullo_epi16(dest_hi, mask_hi); 359 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 360 src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8); 361 _mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi)); 362 src += 4; 363 dest += 4; 364 mask += 4; 365 length -= 4; 366 } 367 if (length >= 2) { 368 __m128i srcV = _mm_loadl_epi64(cast(__m128i*) src); 369 __m128i destV = _mm_loadl_epi64(cast(__m128i*) dest); 370 static if (is(M == uint)) { 371 __m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask); 372 } else static if (is(M == ubyte)) { 373 __m128i maskV; 374 maskV[0] = mask[0]; 375 maskV[1] = mask[1]; 376 maskV |= _mm_slli_epi32(maskV, 8); 377 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 378 } 379 380 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 381 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 382 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 383 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 384 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, 
dest_lo)), 8); 385 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 386 387 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 388 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 389 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 390 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 391 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 392 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 393 _mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 394 src += 2; 395 dest += 2; 396 mask += 2; 397 length -= 2; 398 } 399 if (length) { 400 __m128i srcV = _mm_loadu_si32(src); 401 __m128i destV = _mm_loadu_si32(dest); 402 static if (is(M == uint)) { 403 __m128i maskV = _mm_loadu_si32(mask); 404 } else static if (is(M == ubyte)) { 405 __m128i maskV; 406 maskV[0] = mask[0]; 407 maskV |= _mm_slli_epi32(maskV, 8); 408 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 409 } 410 411 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 412 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 413 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 414 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 415 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 416 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 417 418 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 419 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 420 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 421 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 422 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 423 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 424 _mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 425 } 426 } 427 /** 428 * 4 Operator multiply function with blending and masking. 
429 */ 430 void screen(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) { 431 while (length >= 4) { 432 __m128i srcV = _mm_loadu_si128(cast(__m128i*) src); 433 __m128i destV = _mm_loadu_si128(cast(__m128i*) dest); 434 static if (is(M == uint)) { 435 __m128i maskV = _mm_loadu_si128(cast(__m128i*)mask); 436 } else static if (is(M == ubyte)) { 437 __m128i maskV; 438 maskV[0] = mask[0]; 439 maskV[1] = mask[1]; 440 maskV[2] = mask[2]; 441 maskV[3] = mask[3]; 442 maskV |= _mm_slli_epi32(maskV, 8); 443 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 444 } else static assert (0, "Alpha mask must be either 8 or 32 bits!"); 445 446 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 447 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 448 __m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 449 _mm_unpackhi_epi8(srcV, SSE2_NULLVECT)); 450 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 451 __m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT); 452 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 453 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 454 src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, 455 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8); 456 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 457 src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi); 458 459 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 460 __m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT); 461 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 462 __m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1); 463 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 464 mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi); 465 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 466 src_hi = _mm_mullo_epi16(src_hi, mask0_hi); 467 dest_lo = _mm_mullo_epi16(dest_lo, 
mask_lo); 468 dest_hi = _mm_mullo_epi16(dest_hi, mask_hi); 469 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 470 src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8); 471 _mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi)); 472 src += 4; 473 dest += 4; 474 dest0 += 4; 475 mask += 4; 476 length -= 4; 477 } 478 if (length >= 2) { 479 __m128i srcV = _mm_loadl_epi64(cast(__m128i*) src); 480 __m128i destV = _mm_loadl_epi64(cast(__m128i*) dest); 481 static if (is(M == uint)) { 482 __m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask); 483 } else static if (is(M == ubyte)) { 484 __m128i maskV; 485 maskV[0] = mask[0]; 486 maskV[1] = mask[1]; 487 maskV |= _mm_slli_epi32(maskV, 8); 488 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 489 } 490 491 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 492 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 493 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 494 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 495 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 496 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 497 498 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 499 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 500 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 501 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 502 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 503 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 504 _mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 505 src += 2; 506 dest += 2; 507 dest0 += 2; 508 mask += 2; 509 length -= 2; 510 } 511 if (length) { 512 __m128i srcV = _mm_loadu_si32(src); 513 __m128i destV = _mm_loadu_si32(dest); 514 static if (is(M == uint)) { 515 __m128i maskV = _mm_loadu_si32(mask); 516 } else static if (is(M == ubyte)) { 517 __m128i maskV; 518 maskV[0] = mask[0]; 519 maskV |= 
_mm_slli_epi32(maskV, 8); 520 maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A] 521 } 522 523 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 524 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 525 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 526 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 527 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 528 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 529 530 __m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT); 531 __m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1); 532 mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo); 533 src_lo = _mm_mullo_epi16(src_lo, mask0_lo); 534 dest_lo = _mm_mullo_epi16(dest_lo, mask_lo); 535 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 536 _mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 537 } 538 } 539 540 /** 541 * 2 Operator multiply function with master value. 
542 */ 543 void screenMV(V)(uint* src, uint* dest, size_t length, V value) { 544 __m128i masterV; 545 static if (is(V == uint)) { 546 masterV[0] = value; 547 masterV[1] = value; 548 } else static if (is(V == ubyte)) { 549 masterV[0] = value; 550 masterV[1] = value; 551 masterV |= _mm_slli_epi32(masterV, 8); 552 masterV |= _mm_slli_epi32(masterV, 16); 553 } else 554 static assert(0, "Value must be either 8 or 32 bits!"); 555 __m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 556 _mm_unpacklo_epi8(masterV, SSE2_NULLVECT)); 557 __m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, 558 SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1); 559 while (length >= 4) { 560 __m128i srcV = _mm_loadu_si128(cast(__m128i*) src); 561 __m128i destV = _mm_loadu_si128(cast(__m128i*) dest); 562 563 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 564 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 565 __m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 566 _mm_unpackhi_epi8(srcV, SSE2_NULLVECT)); 567 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 568 __m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT); 569 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 570 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 571 src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, 572 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8); 573 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 574 src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi); 575 576 src_lo = _mm_mullo_epi16(src_lo, master_1); 577 src_hi = _mm_mullo_epi16(src_hi, master_1); 578 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 579 dest_hi = _mm_mullo_epi16(dest_hi, master_256); 580 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 581 src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8); 582 _mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi)); 
583 src += 4; 584 dest += 4; 585 length -= 4; 586 } 587 if (length >= 2) { 588 __m128i srcV = _mm_loadl_epi64(cast(__m128i*) src); 589 __m128i destV = _mm_loadl_epi64(cast(__m128i*) dest); 590 591 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 592 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 593 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 594 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 595 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 596 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 597 598 src_lo = _mm_mullo_epi16(src_lo, master_1); 599 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 600 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 601 _mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 602 src += 2; 603 dest += 2; 604 length -= 2; 605 } 606 if (length) { 607 __m128i srcV = _mm_loadu_si32(src); 608 __m128i destV = _mm_loadu_si32(dest); 609 610 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 611 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 612 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 613 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 614 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 615 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 616 617 src_lo = _mm_mullo_epi16(src_lo, master_1); 618 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 619 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 620 _mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 621 } 622 } 623 /** 624 * 3 Operator multiply function with master value. 
625 */ 626 void screenMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) { 627 __m128i masterV; 628 static if (is(V == uint)) { 629 masterV[0] = value; 630 masterV[1] = value; 631 } else static if (is(V == ubyte)) { 632 masterV[0] = value; 633 masterV[1] = value; 634 masterV |= _mm_slli_epi32(masterV, 8); 635 masterV |= _mm_slli_epi32(masterV, 16); 636 } else 637 static assert(0, "Value must be either 8 or 32 bits!"); 638 __m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 639 _mm_unpacklo_epi8(masterV, SSE2_NULLVECT)); 640 __m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, 641 SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1); 642 while (length >= 4) { 643 __m128i srcV = _mm_loadu_si128(cast(__m128i*) src); 644 __m128i destV = _mm_loadu_si128(cast(__m128i*) dest); 645 646 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 647 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 648 __m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 649 _mm_unpackhi_epi8(srcV, SSE2_NULLVECT)); 650 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 651 __m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT); 652 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 653 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 654 src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, 655 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8); 656 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 657 src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi); 658 659 src_lo = _mm_mullo_epi16(src_lo, master_1); 660 src_hi = _mm_mullo_epi16(src_hi, master_1); 661 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 662 dest_hi = _mm_mullo_epi16(dest_hi, master_256); 663 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 664 src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8); 665 _mm_storeu_si128(cast(__m128i*) dest0, 
_mm_packus_epi16(src_lo, src_hi)); 666 src += 4; 667 dest += 4; 668 dest0 += 4; 669 length -= 4; 670 } 671 if (length >= 2) { 672 __m128i srcV = _mm_loadl_epi64(cast(__m128i*) src); 673 __m128i destV = _mm_loadl_epi64(cast(__m128i*) dest); 674 675 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 676 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 677 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 678 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 679 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 680 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 681 682 src_lo = _mm_mullo_epi16(src_lo, master_1); 683 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 684 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 685 _mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 686 src += 2; 687 dest += 2; 688 dest0 += 2; 689 length -= 2; 690 } 691 if (length) { 692 __m128i srcV = _mm_loadu_si32(src); 693 __m128i destV = _mm_loadu_si32(dest); 694 695 __m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, 696 _mm_unpacklo_epi8(srcV, SSE2_NULLVECT)); 697 __m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT); 698 src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, 699 _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8); 700 src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo); 701 702 src_lo = _mm_mullo_epi16(src_lo, master_1); 703 dest_lo = _mm_mullo_epi16(dest_lo, master_256); 704 src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8); 705 _mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT)); 706 } 707 } 708 709 /** 710 * 2 Operator multiply function with blending. 
*/
void screenMVBl(V)(uint* src, uint* dest, size_t length, V value) {
	// Broadcast the master value into the lower two 32 bit lanes; a ubyte
	// value is replicated into all four channels of both lanes.
	__m128i masterV;
	static if (is(V == uint)) {
		masterV[0] = value;
		masterV[1] = value;
	} else static if (is(V == ubyte)) {
		masterV[0] = value;
		masterV[1] = value;
		masterV |= _mm_slli_epi32(masterV, 8);
		masterV |= _mm_slli_epi32(masterV, 16);
	} else
		static assert(0, "Value must be either 8 or 32 bits!");
	// masterV is reused as (master + 1), the multiplier applied to the per-pixel alpha.
	masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
	while (length >= 4) {	// 4 pixels per iteration
		__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
		__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
		// extract the per-pixel alpha of src; channel position depends on build flag
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		// screen: 255 - (((256 - src) * (255 - dest)) >>> 8)
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
		src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

		// scale alpha by the master value, then blend screen result with dest
		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
		_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
		src += 4;
		dest += 4;
		length -= 4;
	}
	if (length >= 2) {	// 2 pixel remainder
		__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
		__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		src += 2;
		dest += 2;
		length -= 2;
	}
	if (length) {	// single pixel remainder
		__m128i srcV = _mm_loadu_si32(src);
		__m128i destV = _mm_loadu_si32(dest);
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
	}
}

/**
 * 3 Operator screen function with master value and alpha blending.
*/
void screenMVBl(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
	// Same as the two-operator screenMVBl above, but the result is written to
	// dest0 instead of overwriting dest.
	__m128i masterV;
	static if (is(V == uint)) {
		masterV[0] = value;
		masterV[1] = value;
	} else static if (is(V == ubyte)) {
		masterV[0] = value;
		masterV[1] = value;
		masterV |= _mm_slli_epi32(masterV, 8);
		masterV |= _mm_slli_epi32(masterV, 16);
	} else
		static assert(0, "Value must be either 8 or 32 bits!");
	// masterV is reused as (master + 1), the multiplier applied to the per-pixel alpha.
	masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
	while (length >= 4) {	// 4 pixels per iteration
		__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
		__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
		// extract the per-pixel alpha of src; channel position depends on build flag
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		// screen: 255 - (((256 - src) * (255 - dest)) >>> 8)
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
		src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

		// scale alpha by the master value, then blend screen result with dest
		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
		_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
		src += 4;
		dest += 4;
		dest0 += 4;
		length -= 4;
	}
	if (length >= 2) {	// 2 pixel remainder
		__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
		__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		src += 2;
		dest += 2;
		dest0 += 2;
		length -= 2;
	}
	if (length) {	// single pixel remainder
		__m128i srcV = _mm_loadu_si32(src);
		__m128i destV = _mm_loadu_si32(dest);
		__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
		version (cpublit_revalpha) {
			maskV |= _mm_srli_epi32(maskV, 8);
			maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
		} else {
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
		}
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
	}
}

/**
 * 3 Operator screen function with masking and master value.
*/
// Screens src onto dest in place; the per-pixel alpha mask (8 or 32 bit) is
// scaled by the master value before blending the result with dest.
void screenMV(M, V)(uint* src, uint* dest, size_t length, M* mask, V value) {
	// Broadcast the master value into the lower two 32 bit lanes; a ubyte
	// value is replicated into all four channels of both lanes.
	__m128i masterV;
	static if (is(V == uint)) {
		masterV[0] = value;
		masterV[1] = value;
	} else static if (is(V == ubyte)) {
		masterV[0] = value;
		masterV[1] = value;
		masterV |= _mm_slli_epi32(masterV, 8);
		masterV |= _mm_slli_epi32(masterV, 16);
	} else
		static assert(0, "Value must be either 8 or 32 bits!");
	// masterV is reused as (master + 1), the multiplier applied to the mask.
	masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
	while (length >= 4) {	// 4 pixels per iteration
		__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
		__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV[1] = mask[1];
			maskV[2] = mask[2];
			maskV[3] = mask[3];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		// screen: 255 - (((256 - src) * (255 - dest)) >>> 8)
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
		src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

		// scale the mask by the master value, then blend screen result with dest
		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
		_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
		src += 4;
		dest += 4;
		mask += 4;
		length -= 4;
	}
	if (length >= 2) {	// 2 pixel remainder
		__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
		__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV[1] = mask[1];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		src += 2;
		dest += 2;
		mask += 2;
		length -= 2;
	}
	if (length) {	// single pixel remainder
		__m128i srcV = _mm_loadu_si32(src);
		__m128i destV = _mm_loadu_si32(dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadu_si32(mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
	}
}

/**
 * 4 Operator screen function with masking and master value.
*/
// Screens src onto dest and writes the result to dest0; the per-pixel alpha
// mask (8 or 32 bit) is scaled by the master value before blending.
void screenMV(M, V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
	// Broadcast the master value into the lower two 32 bit lanes; a ubyte
	// value is replicated into all four channels of both lanes.
	__m128i masterV;
	static if (is(V == uint)) {
		masterV[0] = value;
		masterV[1] = value;
	} else static if (is(V == ubyte)) {
		masterV[0] = value;
		masterV[1] = value;
		masterV |= _mm_slli_epi32(masterV, 8);
		masterV |= _mm_slli_epi32(masterV, 16);
	} else
		static assert(0, "Value must be either 8 or 32 bits!");
	// masterV is reused as (master + 1), the multiplier applied to the mask.
	masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
	while (length >= 4) {	// 4 pixels per iteration
		__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
		__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV[1] = mask[1];
			maskV[2] = mask[2];
			maskV[3] = mask[3];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		// screen: 255 - (((256 - src) * (255 - dest)) >>> 8)
		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
		src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

		// scale the mask by the master value, then blend screen result with dest
		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
		_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
		src += 4;
		dest += 4;
		dest0 += 4;
		mask += 4;
		length -= 4;
	}
	if (length >= 2) {	// 2 pixel remainder
		__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
		__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV[1] = mask[1];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		src += 2;
		dest += 2;
		dest0 += 2;
		mask += 2;
		length -= 2;
	}
	if (length) {	// single pixel remainder
		__m128i srcV = _mm_loadu_si32(src);
		__m128i destV = _mm_loadu_si32(dest);
		static if (is(M == uint)) {
			__m128i maskV = _mm_loadu_si32(mask);
		} else static if (is(M == ubyte)) {
			__m128i maskV;
			maskV[0] = mask[0];
			maskV |= _mm_slli_epi32(maskV, 8);
			maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
		} else static assert (0, "Alpha mask must be either 8 or 32 bits!");

		__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
		__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
		src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
				_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
		src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

		__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
		mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
		__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
		mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
		src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
		dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
		src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
		_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
	}
}
}

unittest {
	uint[] src, src0, dest, dest0, maskA, maskB;
	ubyte[] mask0A, mask0B;
	src.length = 255;
	src0.length = 255;
	dest.length = 255;
	dest0.length = 255;
	maskA.length = 255;
	fillWithSingleValue(maskA, uint.max);	// fully opaque 32 bit mask
	maskB.length = 255;	// all zeros: fully transparent 32 bit mask
	mask0A.length = 255;
	fillWithSingleValue(mask0A, ubyte.max);	// fully opaque 8 bit mask
	mask0B.length = 255;	// all zeros: fully transparent 8 bit mask
	fillWithSingleValue(src, 0x306090FF);
	fillWithSingleValue(src0, 0x30609000);	// same color, zero alpha
	fillWithSingleValue(dest, 0xEE2ADDFF); //result should be `0xF27AF1FF` if A is FF

	//Test basic functions
	screen(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screen(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	//Test blend functions
	screenBl(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenBl(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	// zero src alpha must leave dest unchanged
	screenBl(src0.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenBl(src0.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test master value functions
	screenMV(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	// zero master value must leave dest unchanged
	screenMV(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test mask functions
	screen(src.ptr, dest.ptr, 255, mask0A.ptr);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screen(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screen(src.ptr, dest.ptr, 255, maskA.ptr);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screen(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	// transparent masks must leave dest unchanged
	screen(src.ptr, dest.ptr, 255, mask0B.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	screen(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screen(src.ptr, dest.ptr, 255, maskB.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	screen(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test blend with master value functions
	screenMVBl(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src0.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src0.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	// zero master value must leave dest unchanged regardless of src alpha
	screenMVBl(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src0.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMVBl(src0.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test masking with master value functions
	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);
	//Same cases with 32 bit masks

	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest, 0xF27AF1FF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest0, 0xF27AF1FF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);
}