module CPUblit.composing.mult;

import CPUblit.composing.common;

/*
 * CPUblit
 * Multiply-blend compose functions.
 * Author: Laszlo Szeremi
 *
 * Multiply-blend functions compose two images using the following formula (channel values normalized to 0.0-1.0):
 * dest0[rgba] = src[rgba] * dest[rgba]
 * In 8 bit integer arithmetic, this translates to:
 * dest0[rgba] = ((1 + src[rgba]) * dest[rgba])>>>8
 * If the alpha channel is enabled, it controls the blend between the multiplied value and the original one:
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest[rgba]) + (mask[aaaa] * src[rgba] * dest[rgba])
 * In integer arithmetic, this is:
 * dest0[rgba] = (((256 - mask[aaaa]) * dest[rgba]) + ((1 + mask[aaaa]) * (((1 + src[rgba]) * dest[rgba])>>>8)))>>>8
 */
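/*
 * Worked example of the integer approximation above, for a single 8 bit channel with
 * src = 0x30 and dest = 0xEE:
 * ((1 + 0x30) * 0xEE)>>>8 = (49 * 238)>>>8 = 11662>>>8 = 45 = 0x2D
 * A minimal scalar sketch of the non-blending path, for reference only (it assumes byte n
 * of src multiplies byte n of dest, as in the SIMD code below; `multScalar` is a
 * hypothetical helper, not part of this module's API):
 * ---
 * uint multScalar(uint src, uint dest) {
 *     uint result;
 *     foreach (i; 0 .. 4) {
 *         const uint s = (src >>> (i * 8)) & 0xFF;
 *         const uint d = (dest >>> (i * 8)) & 0xFF;
 *         result |= (((1 + s) * d) >>> 8) << (i * 8);
 *     }
 *     return result;
 * }
 * ---
 */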
@nogc pure nothrow {
	/**
	 * 2 operator multiply function without blending.
	 */
	void mult(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
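			//Widen each 8 bit channel to 16 bits, compute (1 + src) * dest, then shift
			//right by 8 to approximate src * dest / 255 without a division.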
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function without blending.
	 * Has separate destination.
	 */
	void mult(uint* src, uint* dest, uint* dest0, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 2 operator multiply function with blending.
	 */
	void multBl(uint* src, uint* dest, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
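			//The broadcast alpha drives a linear interpolation between the multiplied
			//value (weight A + 1) and the unchanged dest (weight 256 - A).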
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with blending.
	 * Has separate destination.
	 */
	void multBl(uint* src, uint* dest, uint* dest0, size_t length) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 2 operator multiply function without blending and with master value.
	 */
	void multMV(V)(uint* src, uint* dest, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
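		//master_1 and master_256 are the two fixed interpolation weights: each output
		//channel is (multiplied * (master + 1) + dest * (256 - master))>>>8.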
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			src_hi = _mm_mullo_epi16(src_hi, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function without blending and with master value.
	 * Has separate destination.
	 */
	void multMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			src_hi = _mm_mullo_epi16(src_hi, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with masking.
	 */
	void mult(M)(uint* src, uint* dest, size_t length, M* mask) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 4 operator multiply function with masking.
	 * Has separate destination.
	 */
	void mult(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 2 operator multiply function with blending and master value.
	 */
	void multMVBl(V)(uint* src, uint* dest, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
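		//masterV now holds (master + 1) as 16 bit words; each pixel's alpha is scaled
		//by it before being used as the blend weight.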
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with blending and master value.
	 * Has separate destination.
	 */
	void multMVBl(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator multiply function with masking and master value.
	 */
	void multMV(M,V)(uint* src, uint* dest, size_t length, M* mask, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			mask += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			mask += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 4 operator multiply function with masking and master value.
	 * Has separate destination.
	 */
	void multMV(M,V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_hi = _mm_adds_epu16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_unpackhi_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			mask += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			mask += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			}
			__m128i src_lo = _mm_adds_epu16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_unpacklo_epi8(destV, SSE2_NULLVECT)), 8);
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
}
unittest {
	uint[] src, src0, dest, dest0, maskA, maskB;
	ubyte[] mask0A, mask0B;
	src.length = 255;
	src0.length = 255;
	dest.length = 255;
	dest0.length = 255;
	maskA.length = 255;
	fillWithSingleValue(maskA, uint.max);
	maskB.length = 255;
	mask0A.length = 255;
	fillWithSingleValue(mask0A, ubyte.max);
	mask0B.length = 255;
	fillWithSingleValue(src, 0x306090FF);
	fillWithSingleValue(src0, 0x30609000);
	fillWithSingleValue(dest, 0xEE2ADDFF);//result should be `0x2D0F7DFF` if A is FF
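	//e.g. topmost byte: ((1 + 0x30) * 0xEE)>>>8 = (49 * 238)>>>8 = 45 = 0x2D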

	//Test basic functions
	mult(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	//Test blend functions
	multBl(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multBl(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multBl(src0.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0xEE2ADDFF);
	multBl(src0.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test master value functions
	multMV(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test mask functions
	mult(src.ptr, dest.ptr, 255, mask0A.ptr);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, maskA.ptr);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, mask0B.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	mult(src.ptr, dest.ptr, 255, maskB.ptr);
	testArrayForValue(dest, 0xEE2ADDFF);
	mult(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test blend with master value functions
	multMVBl(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMVBl(src0.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	//Test masking with master value functions
	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);
	//
	multMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest, 0x2D0F7DFF);
	fillWithSingleValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.max);
	testArrayForValue(dest0, 0x2D0F7DFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.max);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	fillWithSingleValue(dest0, 0);

	multMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest, 0xEE2ADDFF);
	multMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.min);
	testArrayForValue(dest0, 0xEE2ADDFF);
	//fillWithSingleValue(dest0, 0);
}