module CPUblit.composing.mult;

import CPUblit.composing.common;

/*
 * CPUblit
 * Multiply-blend compose functions.
 * Author: Laszlo Szeremi
 *
 * Multiply-blend functions compose two images together using the following formula:
 * dest0[rgba] = src[rgba] * dest[rgba]
 * In integer arithmetic, this is approximated as:
 * dest0[rgba] = ((1 + src[rgba]) * dest[rgba])>>>8
 * If the alpha channel is enabled, it controls the blend between the multiplied value and the
 * original one (with mask[aaaa] normalized to the 0.0-1.0 range):
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest[rgba]) + (mask[aaaa] * src[rgba] * dest[rgba])
 * In integer arithmetic, this becomes:
 * dest0[rgba] = (((256 - mask[aaaa]) * dest[rgba]) + ((1 + mask[aaaa]) * (((1 + src[rgba]) * dest[rgba])>>>8)))>>>8
 */
@nogc pure nothrow {
	/**
	 * 2 operator multiply function without blending.
	 */
	void mult(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			//__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			//src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			//__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			//src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
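	/*
	 * Worked example of the integer approximation above, using the constants from the unit
	 * tests at the end of this module: for the blue channel, src = 0x90 (144) and
	 * dest = 0xDD (221) give ((1 + 144) * 221)>>>8 = 32045>>>8 = 125 = 0x7D, which is how
	 * 0x306090FF composed onto 0xEE2ADDFF yields 0x2D0F7DFF. The scalar sketch below mirrors
	 * the same per-channel math; it is illustrative only (kept out of the build via
	 * `version (none)`) and not part of the module's API.
	 */
	version (none) void multScalarSketch(uint* src, uint* dest, size_t length) {
		foreach (i ; 0 .. length) {
			uint result;
			for (int shift = 0 ; shift < 32 ; shift += 8) {
				const uint s = (src[i] >>> shift) & 0xFF, d = (dest[i] >>> shift) & 0xFF;
				result |= (((1 + s) * d) >>> 8) << shift;	//same math as the SIMD path, one channel at a time
			}
			dest[i] = result;
		}
	}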
	/**
	 * 3 operator multiply function without blending.
	 * Has separate destination.
	 */
	void mult(uint* src, uint* dest, uint* dest0, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			//__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			//src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			//__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			//src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
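	/*
	 * Worked example of the blended path, continuing the numbers above (arithmetic only, no
	 * new API): with the multiplied blue channel mul = 125 and dest = 221, src alpha 0xFF
	 * gives (((256 - 255) * 221) + ((1 + 255) * 125))>>>8 = 125 (full multiply effect), while
	 * src alpha 0x00 gives (((256 - 0) * 221) + ((1 + 0) * 125))>>>8 = 221 (dest left intact),
	 * matching the expectations in the unit tests at the end of this module.
	 */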
	/**
	 * 2 operator multiply function with blending.
	 */
	void multBl(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with blending.
	 * Has separate destination.
	 */
	void multBl(uint* src, uint* dest, uint* dest0, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 2 operator multiply function without blending and with master value.
	 */
	void multMV(V)(uint* src, uint* dest, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			src_hi = _mm_mullo_epi16(src_hi, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
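	/*
	 * Master-value arithmetic in the multMV functions above and below (worked numbers only,
	 * no new API): the multiplied result is blended with dest as
	 * dest0 = (((1 + value) * mul) + ((256 - value) * dest))>>>8, so with mul = 125 and
	 * dest = 221, value 0xFF yields 125 (full effect) and value 0x00 yields 221 (no effect),
	 * as the unit tests at the end of this module expect.
	 */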
	/**
	 * 3 operator multiply function without blending and with master value.
	 * Has separate destination.
	 */
	void multMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			src_hi = _mm_mullo_epi16(src_hi, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with masking.
	 */
	void mult(M)(uint* src, uint* dest, size_t length, M* mask) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 4 operator multiply function with masking.
	 * Has separate destination.
	 */
	void mult(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
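	/*
	 * Usage note for the masked variants above (an illustrative sketch; `alphaMap` is an
	 * assumption, not part of this module): the external mask takes the place of the src
	 * alpha channel, and an 8 bit mask is replicated to all four channels internally.
	 *
	 * ubyte[] alphaMap;	//one 8 bit coverage value per pixel
	 * mult(src, dest, dest0, length, alphaMap.ptr);
	 */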
	/**
	 * 2 operator multiply function with blending and master value.
	 */
	void multMVBl(V)(uint* src, uint* dest, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with blending and master value.
	 * Has separate destination.
	 */
	void multMVBl(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
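	/*
	 * In the master-value functions above and below, the master value scales the alpha mask
	 * itself: effectiveMask = ((1 + value) * mask)>>>8, so value 0xFF leaves the mask
	 * unchanged and value 0x00 zeroes it out, which is why the unit tests at the end of this
	 * module expect a no-op for `ubyte.min` and `uint.min` (worked arithmetic only, no new API).
	 */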
	/**
	 * 3 operator multiply function with masking and master value.
	 */
	void multMV(M,V)(uint* src, uint* dest, size_t length, M* mask, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			mask += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			mask += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 4 operator multiply function with masking and master value.
	 * Has separate destination.
	 */
	void multMV(M,V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			mask += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			mask += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
}
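/*
 * Typical per-scanline usage (an illustrative sketch; `width`, `height`, and the image
 * buffers are assumptions, not part of this module):
 *
 * uint[] srcImg, destImg;	//ARGB32 pixel data, width * height pixels each
 * foreach (y ; 0 .. height)
 *     multBl(srcImg.ptr + y * width, destImg.ptr + y * width, width);
 */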
unittest {
	uint[] src, src0, dest, dest0, maskA, maskB;
	ubyte[] mask0A, mask0B;
	src.length = 255;
	src0.length = 255;
	dest.length = 255;
	dest0.length = 255;
	maskA.length = 255;
	fillWithSingleValue(maskA, uint.max);
	maskB.length = 255;
	mask0A.length = 255;
	fillWithSingleValue(mask0A, ubyte.max);
	mask0B.length = 255;
	fillWithSingleValue(src, 0x306090FF);
	fillWithSingleValue(src0, 0x30609000);
	fillWithSingleValue(dest, 0xEE2ADDFF);//result should be `0x2D0F7DFF` if A is FF

	//Test basic functions
	mult(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	//Test blend functions
	multBl(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multBl(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multBl(src0.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0xEE2ADDFF);
	multBl(src0.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test master value functions
	multMV(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test mask functions
	mult(src.ptr, dest.ptr, 255, mask0A.ptr);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, maskA.ptr);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, mask0B.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, maskB.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test blend with master value functions
	multMVBl(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test masking with master value functions
	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);
	//
	multMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	//fillWithSingleValue(dest0, 0);
}