1 module CPUblit.composing.screen;
2 
3 import CPUblit.composing.common;
4 
5 /*
6  * CPUblit
7  * Screen-blend compose functions.
8  * Author: Laszlo Szeremi
9  *
10  * Screen-blend functions compose two images together using the following formula:
11  * dest0[rgba] = 1 - (1 - src[rgba]) * (1 - dest[rgba])
12  * This is translated to the following formula:
 * dest0[rgba] = 255 - (((256 - src[rgba]) * (255 - dest[rgba])) >>> 8)
 * If the alpha channel is enabled, it controls the blend between the screened value and the original one.
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest) + (mask[aaaa] * (1 - (1 - src[rgba]) * (1 - dest[rgba])))
 * In integer arithmetic, this is:
 * dest0[rgba] = (((256 - mask[aaaa]) * dest) + ((mask[aaaa] + 1) * (255 - (((256 - src[rgba]) * (255 - dest[rgba])) >>> 8)))) >>> 8
18  */
19 @nogc pure nothrow {
20 	/**
21 	 * 2 Operator screen function.
22 	 */
23 	void screen(uint* src, uint* dest, size_t length) {
24 		while (length >= 4) {
25 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
26 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
27 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
28 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
29 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
30 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
31 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
32 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
33 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
34 					_mm_unpackhi_epi8(destV, SSE2_NULLVECT))), 8);
35 			_mm_storeu_si128(cast(__m128i*) dest, _mm_subs_epu8(SSE2_FULLVECT,
36 					_mm_packus_epi16(src_lo, src_hi)));
37 			src += 4;
38 			dest += 4;
39 			length -= 4;
40 		}
41 		if (length >= 2) {
42 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
43 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
44 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
45 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
46 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
47 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
48 			_mm_storel_epi64(cast(__m128i*) dest, _mm_subs_epu8(SSE2_FULLVECT,
49 					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
50 			src += 2;
51 			dest += 2;
52 			length -= 2;
53 		}
54 		if (length) {
55 			__m128i srcV = _mm_loadu_si32(src);
56 			__m128i destV = _mm_loadu_si32(dest);
57 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
58 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
59 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
60 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
61 			_mm_storeu_si32(dest, _mm_subs_epu8(SSE2_FULLVECT,
62 					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
63 		}
64 	}
65 	/**
66 	 * 3 Operator screen function.
67 	 */
68 	void screen(uint* src, uint* dest, uint* dest0, size_t length) {
69 		while (length >= 4) {
70 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
71 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
72 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
73 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
74 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
75 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
76 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
77 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
78 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
79 					_mm_unpackhi_epi8(destV, SSE2_NULLVECT))), 8);
80 			_mm_storeu_si128(cast(__m128i*) dest0, _mm_subs_epu8(SSE2_FULLVECT,
81 					_mm_packus_epi16(src_lo, src_hi)));
82 			src += 4;
83 			dest += 4;
84 			dest0 += 4;
85 			length -= 4;
86 		}
87 		if (length >= 2) {
88 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
89 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
90 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
91 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
92 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
93 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
94 			_mm_storel_epi64(cast(__m128i*) dest0, _mm_subs_epu8(SSE2_FULLVECT,
95 					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
96 			src += 2;
97 			dest += 2;
98 			dest0 += 2;
99 			length -= 2;
100 		}
101 		if (length) {
102 			__m128i srcV = _mm_loadu_si32(src);
103 			__m128i destV = _mm_loadu_si32(dest);
104 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
105 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
106 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255,
107 					_mm_unpacklo_epi8(destV, SSE2_NULLVECT))), 8);
108 			_mm_storeu_si32(dest0, _mm_subs_epu8(SSE2_FULLVECT,
109 					_mm_packus_epi16(src_lo, SSE2_NULLVECT)));
110 		}
111 	}
	/**
	 * 2 operator screen function with alpha blending.
	 * The alpha channel of `src` controls the blend between the screened value
	 * and the original `dest` value:
	 * dest = ((mask + 1) * screen(src, dest) + (256 - mask) * dest) >>> 8.
	 * The result is stored back into `dest`.
	 */
	void screenBl(uint* src, uint* dest, size_t length) {
		// Main loop: 4 pixels per iteration.
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
			// Isolate the alpha channel, then broadcast it to all four channels.
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			// Screen: (256 - src) per channel, widened to 16 bits...
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
			// ...multiplied by (255 - dest), scaled back with >>> 8...
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
			// ...then subtracted from 255 to get the screened value.
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

			// Blend weights: (mask + 1) for the screened value, (256 - mask) for dest.
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
			// Weighted sum, scaled back to 8 bit range and packed.
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		// Tail: 2 pixels (low 64 bits only).
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		// Tail: final single pixel (low 32 bits only).
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
	/**
	 * 3 operator screen function with alpha blending.
	 * The alpha channel of `src` controls the blend between the screened value
	 * and the original `dest` value:
	 * dest0 = ((mask + 1) * screen(src, dest) + (256 - mask) * dest) >>> 8.
	 * `src` and `dest` are only read; the result is written into `dest0`.
	 */
	void screenBl(uint* src, uint* dest, uint* dest0, size_t length) {
		// Main loop: 4 pixels per iteration.
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
			// Isolate the alpha channel, then broadcast it to all four channels.
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			// Screen: (256 - src) per channel, widened to 16 bits...
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
			// ...multiplied by (255 - dest), scaled back with >>> 8...
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
			// ...then subtracted from 255 to get the screened value.
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

			// Blend weights: (mask + 1) for the screened value, (256 - mask) for dest.
			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
			// Weighted sum, scaled back to 8 bit range and packed.
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		// Tail: 2 pixels (low 64 bits only).
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		// Tail: final single pixel (low 32 bits only).
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
			version (cpublit_revalpha) {
				maskV |= _mm_srli_epi32(maskV, 8);
				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
			} else {
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
			}
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
316 
317 	/**
318 	 * 3 Operator multiply function with blending and masking.
319 	 */
320 	void screen(M)(uint* src, uint* dest, size_t length, M* mask) {
321 		while (length >= 4) {
322 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
323 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
324 			static if (is(M == uint)) {
325 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
326 			} else static if (is(M == ubyte)) {
327 				__m128i maskV;
328 				maskV[0] = mask[0];
329 				maskV[1] = mask[1];
330 				maskV[2] = mask[2];
331 				maskV[3] = mask[3];
332 				maskV |= _mm_slli_epi32(maskV, 8);
333 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
334 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
335 
336 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
337 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
338 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
339 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
340 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
341 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
342 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
343 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
344 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
345 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
346 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
347 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
348 
349 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
350 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
351 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
352 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
353 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
354 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
355 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
356 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
357 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
358 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
359 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
360 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
361 			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
362 			src += 4;
363 			dest += 4;
364 			mask += 4;
365 			length -= 4;
366 		}
367 		if (length >= 2) {
368 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
369 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
370 			static if (is(M == uint)) {
371 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
372 			} else static if (is(M == ubyte)) {
373 				__m128i maskV;
374 				maskV[0] = mask[0];
375 				maskV[1] = mask[1];
376 				maskV |= _mm_slli_epi32(maskV, 8);
377 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
378 			}
379 
380 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
381 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
382 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
383 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
384 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
385 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
386 
387 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
388 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
389 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
390 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
391 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
392 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
393 			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
394 			src += 2;
395 			dest += 2;
396 			mask += 2;
397 			length -= 2;
398 		}
399 		if (length) {
400 			__m128i srcV = _mm_loadu_si32(src);
401 			__m128i destV = _mm_loadu_si32(dest);
402 			static if (is(M == uint)) {
403 				__m128i maskV = _mm_loadu_si32(mask);
404 			} else static if (is(M == ubyte)) {
405 				__m128i maskV;
406 				maskV[0] = mask[0];
407 				maskV |= _mm_slli_epi32(maskV, 8);
408 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
409 			}
410 
411 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
412 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
413 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
414 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
415 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
416 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
417 
418 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
419 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
420 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
421 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
422 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
423 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
424 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
425 		}
426 	}
427 	/**
428 		 * 4 Operator multiply function with blending and masking.
429 		 */
430 	void screen(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
431 		while (length >= 4) {
432 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
433 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
434 			static if (is(M == uint)) {
435 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
436 			} else static if (is(M == ubyte)) {
437 				__m128i maskV;
438 				maskV[0] = mask[0];
439 				maskV[1] = mask[1];
440 				maskV[2] = mask[2];
441 				maskV[3] = mask[3];
442 				maskV |= _mm_slli_epi32(maskV, 8);
443 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
444 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
445 
446 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
447 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
448 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
449 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
450 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
451 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
452 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
453 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
454 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
455 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
456 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
457 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
458 
459 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
460 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
461 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
462 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
463 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
464 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
465 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
466 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
467 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
468 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
469 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
470 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
471 			_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
472 			src += 4;
473 			dest += 4;
474 			dest0 += 4;
475 			mask += 4;
476 			length -= 4;
477 		}
478 		if (length >= 2) {
479 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
480 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
481 			static if (is(M == uint)) {
482 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
483 			} else static if (is(M == ubyte)) {
484 				__m128i maskV;
485 				maskV[0] = mask[0];
486 				maskV[1] = mask[1];
487 				maskV |= _mm_slli_epi32(maskV, 8);
488 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
489 			}
490 
491 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
492 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
493 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
494 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
495 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
496 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
497 
498 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
499 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
500 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
501 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
502 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
503 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
504 			_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
505 			src += 2;
506 			dest += 2;
507 			dest0 += 2;
508 			mask += 2;
509 			length -= 2;
510 		}
511 		if (length) {
512 			__m128i srcV = _mm_loadu_si32(src);
513 			__m128i destV = _mm_loadu_si32(dest);
514 			static if (is(M == uint)) {
515 				__m128i maskV = _mm_loadu_si32(mask);
516 			} else static if (is(M == ubyte)) {
517 				__m128i maskV;
518 				maskV[0] = mask[0];
519 				maskV |= _mm_slli_epi32(maskV, 8);
520 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
521 			}
522 
523 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
524 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
525 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
526 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
527 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
528 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
529 
530 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
531 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
532 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
533 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
534 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
535 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
536 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
537 		}
538 	}
539 
	/**
	 * 2 operator screen function with master value.
	 * `value` (either a 32 bit per-channel value or an 8 bit value broadcast
	 * to all four channels) controls the blend between the screened value and
	 * the original `dest` value for every pixel. The result is stored back
	 * into `dest`.
	 */
	void screenMV(V)(uint* src, uint* dest, size_t length, V value) {
		// Fill the two low 32 bit lanes with the master value; only the low
		// 64 bits are consumed by the unpacklo operations below.
		__m128i masterV;
		static if (is(V == uint)) {
			masterV[0] = value;
			masterV[1] = value;
		} else static if (is(V == ubyte)) {
			// Broadcast the 8 bit value to all four channels of each lane.
			masterV[0] = value;
			masterV[1] = value;
			masterV |= _mm_slli_epi32(masterV, 8);
			masterV |= _mm_slli_epi32(masterV, 16);
		} else
			static assert(0, "Value must be either 8 or 32 bits!");
		// Loop-invariant blend weights: (256 - master) for dest,
		// (master + 1) for the screened value.
		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
				_mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV,
				SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		// Main loop: 4 pixels per iteration.
		while (length >= 4) {
			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);

			// Screened value: 255 - (((256 - src) * (255 - dest)) >>> 8).
			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);

			// Blend with the master value, scale back to 8 bit range and pack.
			src_lo = _mm_mullo_epi16(src_lo, master_1);
			src_hi = _mm_mullo_epi16(src_hi, master_1);
			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
			dest_hi = _mm_mullo_epi16(dest_hi, master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
			src += 4;
			dest += 4;
			length -= 4;
		}
		// Tail: 2 pixels (low 64 bits only).
		if (length >= 2) {
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);

			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			src_lo = _mm_mullo_epi16(src_lo, master_1);
			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
			src += 2;
			dest += 2;
			length -= 2;
		}
		// Tail: final single pixel (low 32 bits only).
		if (length) {
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);

			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);

			src_lo = _mm_mullo_epi16(src_lo, master_1);
			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
		}
	}
623 	/**
624 	 * 3 Operator multiply function with master value.
625 	 */
626 	void screenMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
627 		__m128i masterV;
628 		static if (is(V == uint)) {
629 			masterV[0] = value;
630 			masterV[1] = value;
631 		} else static if (is(V == ubyte)) {
632 			masterV[0] = value;
633 			masterV[1] = value;
634 			masterV |= _mm_slli_epi32(masterV, 8);
635 			masterV |= _mm_slli_epi32(masterV, 16);
636 		} else
637 			static assert(0, "Value must be either 8 or 32 bits!");
638 		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
639 				_mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
640 		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV,
641 				SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
642 		while (length >= 4) {
643 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
644 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
645 
646 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
647 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
648 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
649 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
650 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
651 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
652 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
653 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
654 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
655 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
656 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
657 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
658 
659 			src_lo = _mm_mullo_epi16(src_lo, master_1);
660 			src_hi = _mm_mullo_epi16(src_hi, master_1);
661 			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
662 			dest_hi = _mm_mullo_epi16(dest_hi, master_256);
663 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
664 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
665 			_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
666 			src += 4;
667 			dest += 4;
668 			dest0 += 4;
669 			length -= 4;
670 		}
671 		if (length >= 2) {
672 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
673 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
674 
675 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
676 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
677 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
678 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
679 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
680 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
681 
682 			src_lo = _mm_mullo_epi16(src_lo, master_1);
683 			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
684 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
685 			_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
686 			src += 2;
687 			dest += 2;
688 			dest0 += 2;
689 			length -= 2;
690 		}
691 		if (length) {
692 			__m128i srcV = _mm_loadu_si32(src);
693 			__m128i destV = _mm_loadu_si32(dest);
694 
695 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
696 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
697 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
698 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
699 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
700 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
701 
702 			src_lo = _mm_mullo_epi16(src_lo, master_1);
703 			dest_lo = _mm_mullo_epi16(dest_lo, master_256);
704 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
705 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
706 		}
707 	}
708 
709 	/**
710 	 * 2 Operator multiply function with blending.
711 	 */
712 	void screenMVBl(V)(uint* src, uint* dest, size_t length, V value) {
713 		__m128i masterV;
714 		static if (is(V == uint)) {
715 			masterV[0] = value;
716 			masterV[1] = value;
717 		} else static if (is(V == ubyte)) {
718 			masterV[0] = value;
719 			masterV[1] = value;
720 			masterV |= _mm_slli_epi32(masterV, 8);
721 			masterV |= _mm_slli_epi32(masterV, 16);
722 		} else
723 			static assert(0, "Value must be either 8 or 32 bits!");
724 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
725 		while (length >= 4) {
726 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
727 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
728 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
729 			version (cpublit_revalpha) {
730 				maskV |= _mm_srli_epi32(maskV, 8);
731 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
732 			} else {
733 				maskV |= _mm_slli_epi32(maskV, 8);
734 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
735 			}
736 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
737 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
738 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
739 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
740 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
741 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
742 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
743 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
744 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
745 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
746 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
747 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
748 
749 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
750 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
751 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
752 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
753 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
754 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
755 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
756 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
757 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
758 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
759 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
760 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
761 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
762 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
763 			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
764 			src += 4;
765 			dest += 4;
766 			length -= 4;
767 		}
768 		if (length >= 2) {
769 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
770 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
771 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
772 			version (cpublit_revalpha) {
773 				maskV |= _mm_srli_epi32(maskV, 8);
774 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
775 			} else {
776 				maskV |= _mm_slli_epi32(maskV, 8);
777 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
778 			}
779 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
780 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
781 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
782 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
783 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
784 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
785 
786 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
787 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
788 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
789 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
790 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
791 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
792 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
793 			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
794 			src += 2;
795 			dest += 2;
796 			length -= 2;
797 		}
798 		if (length) {
799 			__m128i srcV = _mm_loadu_si32(src);
800 			__m128i destV = _mm_loadu_si32(dest);
801 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
802 			version (cpublit_revalpha) {
803 				maskV |= _mm_srli_epi32(maskV, 8);
804 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
805 			} else {
806 				maskV |= _mm_slli_epi32(maskV, 8);
807 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
808 			}
809 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
810 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
811 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
812 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
813 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
814 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
815 
816 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
817 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
818 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
819 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
820 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
821 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
822 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
823 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
824 		}
825 	}
826 	/**
827 	 * 3 Operator multiply function with blending.
828 	 */
829 	void screenMVBl(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
830 		__m128i masterV;
831 		static if (is(V == uint)) {
832 			masterV[0] = value;
833 			masterV[1] = value;
834 		} else static if (is(V == ubyte)) {
835 			masterV[0] = value;
836 			masterV[1] = value;
837 			masterV |= _mm_slli_epi32(masterV, 8);
838 			masterV |= _mm_slli_epi32(masterV, 16);
839 		} else
840 			static assert(0, "Value must be either 8 or 32 bits!");
841 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
842 		while (length >= 4) {
843 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
844 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
845 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
846 			version (cpublit_revalpha) {
847 				maskV |= _mm_srli_epi32(maskV, 8);
848 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
849 			} else {
850 				maskV |= _mm_slli_epi32(maskV, 8);
851 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
852 			}
853 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
854 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
855 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
856 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
857 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
858 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
859 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
860 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
861 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
862 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
863 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
864 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
865 
866 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
867 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
868 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
869 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
870 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
871 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
872 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
873 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
874 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
875 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
876 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
877 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
878 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
879 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
880 			_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
881 			src += 4;
882 			dest += 4;
883 			dest0 += 4;
884 			length -= 4;
885 		}
886 		if (length >= 2) {
887 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
888 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
889 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
890 			version (cpublit_revalpha) {
891 				maskV |= _mm_srli_epi32(maskV, 8);
892 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
893 			} else {
894 				maskV |= _mm_slli_epi32(maskV, 8);
895 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
896 			}
897 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
898 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
899 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
900 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
901 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
902 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
903 
904 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
905 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
906 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
907 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
908 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
909 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
910 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
911 			_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
912 			src += 2;
913 			dest += 2;
914 			dest0 += 2;
915 			length -= 2;
916 		}
917 		if (length) {
918 			__m128i srcV = _mm_loadu_si32(src);
919 			__m128i destV = _mm_loadu_si32(dest);
920 			__m128i maskV = srcV & cast(__m128i) ALPHABLEND_SSE2_AMASK;
921 			version (cpublit_revalpha) {
922 				maskV |= _mm_srli_epi32(maskV, 8);
923 				maskV |= _mm_srli_epi32(maskV, 16); //[A,A,A,A]
924 			} else {
925 				maskV |= _mm_slli_epi32(maskV, 8);
926 				maskV |= _mm_slli_epi32(maskV, 16); //[A,A,A,A]
927 			}
928 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
929 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
930 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
931 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
932 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
933 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
934 
935 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
936 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
937 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
938 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
939 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
940 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
941 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
942 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
943 		}
944 	}
945 
946 	/**
947 	 * 3 Operator multiply function with blending and masking.
948 	 */
949 	void screenMV(M, V)(uint* src, uint* dest, size_t length, M* mask, V value) {
950 		__m128i masterV;
951 		static if (is(V == uint)) {
952 			masterV[0] = value;
953 			masterV[1] = value;
954 		} else static if (is(V == ubyte)) {
955 			masterV[0] = value;
956 			masterV[1] = value;
957 			masterV |= _mm_slli_epi32(masterV, 8);
958 			masterV |= _mm_slli_epi32(masterV, 16);
959 		} else
960 			static assert(0, "Value must be either 8 or 32 bits!");
961 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV,	SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
962 		while (length >= 4) {
963 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
964 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
965 			static if (is(M == uint)) {
966 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
967 			} else static if (is(M == ubyte)) {
968 				__m128i maskV;
969 				maskV[0] = mask[0];
970 				maskV[1] = mask[1];
971 				maskV[2] = mask[2];
972 				maskV[3] = mask[3];
973 				maskV |= _mm_slli_epi32(maskV, 8);
974 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
975 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
976 
977 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
978 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
979 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
980 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
981 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
982 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
983 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
984 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
985 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
986 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
987 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
988 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
989 
990 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
991 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
992 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
993 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
994 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
995 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
996 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
997 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
998 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
999 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
1000 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1001 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
1002 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1003 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
1004 			_mm_storeu_si128(cast(__m128i*) dest, _mm_packus_epi16(src_lo, src_hi));
1005 			src += 4;
1006 			dest += 4;
1007 			mask += 4;
1008 			length -= 4;
1009 		}
1010 		if (length >= 2) {
1011 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
1012 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
1013 			static if (is(M == uint)) {
1014 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
1015 			} else static if (is(M == ubyte)) {
1016 				__m128i maskV;
1017 				maskV[0] = mask[0];
1018 				maskV[1] = mask[1];
1019 				maskV |= _mm_slli_epi32(maskV, 8);
1020 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
1021 			}
1022 
1023 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1024 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
1025 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
1026 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
1027 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
1028 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
1029 
1030 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
1031 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
1032 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1033 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
1034 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
1035 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1036 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1037 			_mm_storel_epi64(cast(__m128i*) dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
1038 			src += 2;
1039 			dest += 2;
1040 			mask += 2;
1041 			length -= 2;
1042 		}
1043 		if (length) {
1044 			__m128i srcV = _mm_loadu_si32(src);
1045 			__m128i destV = _mm_loadu_si32(dest);
1046 			static if (is(M == uint)) {
1047 				__m128i maskV = _mm_loadu_si32(mask);
1048 			} else static if (is(M == ubyte)) {
1049 				__m128i maskV;
1050 				maskV[0] = mask[0];
1051 				maskV |= _mm_slli_epi32(maskV, 8);
1052 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
1053 			}
1054 
1055 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1056 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
1057 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
1058 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
1059 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
1060 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
1061 
1062 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
1063 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
1064 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1065 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
1066 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
1067 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1068 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1069 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
1070 		}
1071 	}
1072 	/**
1073 	 * 4 Operator multiply function with blending.
1074 	 */
1075 	void screenMV(M, V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
1076 		__m128i masterV;
1077 		static if (is(V == uint)) {
1078 			masterV[0] = value;
1079 			masterV[1] = value;
1080 		} else static if (is(V == ubyte)) {
1081 			masterV[0] = value;
1082 			masterV[1] = value;
1083 			masterV |= _mm_slli_epi32(masterV, 8);
1084 			masterV |= _mm_slli_epi32(masterV, 16);
1085 		} else
1086 			static assert(0, "Value must be either 8 or 32 bits!");
1087 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV,	SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
1088 		while (length >= 4) {
1089 			__m128i srcV = _mm_loadu_si128(cast(__m128i*) src);
1090 			__m128i destV = _mm_loadu_si128(cast(__m128i*) dest);
1091 			static if (is(M == uint)) {
1092 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
1093 			} else static if (is(M == ubyte)) {
1094 				__m128i maskV;
1095 				maskV[0] = mask[0];
1096 				maskV[1] = mask[1];
1097 				maskV[2] = mask[2];
1098 				maskV[3] = mask[3];
1099 				maskV |= _mm_slli_epi32(maskV, 8);
1100 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
1101 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
1102 
1103 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1104 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
1105 			__m128i src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1106 					_mm_unpackhi_epi8(srcV, SSE2_NULLVECT));
1107 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
1108 			__m128i dest_hi = _mm_unpackhi_epi8(destV, SSE2_NULLVECT);
1109 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
1110 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
1111 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi,
1112 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_hi)), 8);
1113 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
1114 			src_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_hi);
1115 
1116 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
1117 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
1118 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
1119 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
1120 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1121 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1122 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
1123 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
1124 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
1125 			src_hi = _mm_mullo_epi16(src_hi, mask0_hi);
1126 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1127 			dest_hi = _mm_mullo_epi16(dest_hi, mask_hi);
1128 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1129 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
1130 			_mm_storeu_si128(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, src_hi));
1131 			src += 4;
1132 			dest += 4;
1133 			dest0 += 4;
1134 			mask += 4;
1135 			length -= 4;
1136 		}
1137 		if (length >= 2) {
1138 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*) src);
1139 			__m128i destV = _mm_loadl_epi64(cast(__m128i*) dest);
1140 			static if (is(M == uint)) {
1141 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
1142 			} else static if (is(M == ubyte)) {
1143 				__m128i maskV;
1144 				maskV[0] = mask[0];
1145 				maskV[1] = mask[1];
1146 				maskV |= _mm_slli_epi32(maskV, 8);
1147 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
1148 			}
1149 
1150 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1151 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
1152 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
1153 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
1154 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
1155 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
1156 
1157 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
1158 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
1159 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1160 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
1161 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
1162 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1163 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1164 			_mm_storel_epi64(cast(__m128i*) dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
1165 			src += 2;
1166 			dest += 2;
1167 			dest0 += 2;
1168 			mask += 2;
1169 			length -= 2;
1170 		}
1171 		if (length) {
1172 			__m128i srcV = _mm_loadu_si32(src);
1173 			__m128i destV = _mm_loadu_si32(dest);
1174 			static if (is(M == uint)) {
1175 				__m128i maskV = _mm_loadu_si32(mask);
1176 			} else static if (is(M == ubyte)) {
1177 				__m128i maskV;
1178 				maskV[0] = mask[0];
1179 				maskV |= _mm_slli_epi32(maskV, 8);
1180 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
1181 			}
1182 
1183 			__m128i src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256,
1184 					_mm_unpacklo_epi8(srcV, SSE2_NULLVECT));
1185 			__m128i dest_lo = _mm_unpacklo_epi8(destV, SSE2_NULLVECT);
1186 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo,
1187 					_mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, dest_lo)), 8);
1188 			src_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST255, src_lo);
1189 
1190 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
1191 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
1192 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
1193 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
1194 			src_lo = _mm_mullo_epi16(src_lo, mask0_lo);
1195 			dest_lo = _mm_mullo_epi16(dest_lo, mask_lo);
1196 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
1197 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
1198 		}
1199 	}
1200 }
1201 
1202 unittest {
1203 	uint[] src, src0, dest, dest0, maskA, maskB;
1204 	ubyte[] mask0A, mask0B;
1205 	src.length = 255;
1206 	src0.length = 255;
1207 	dest.length = 255;
1208 	dest0.length = 255;
1209 	maskA.length = 255;
1210 	fillWithSingleValue(maskA, uint.max);
1211 	maskB.length = 255;
1212 	mask0A.length = 255;
1213 	fillWithSingleValue(mask0A, ubyte.max);
1214 	mask0B.length = 255;
1215 	fillWithSingleValue(src, 0x306090FF);
1216 	fillWithSingleValue(src0, 0x30609000);
1217 	fillWithSingleValue(dest, 0xEE2ADDFF); //result should be `0xF27AF1FF` if A is FF
1218 
1219 	//Test basic functions
1220 	screen(src.ptr, dest.ptr, 255);
1221 	testArrayForValue(dest, 0xF27AF1FF);
1222 	fillWithSingleValue(dest, 0xEE2ADDFF);
1223 	screen(src.ptr, dest.ptr, dest0.ptr, 255);
1224 	testArrayForValue(dest0, 0xF27AF1FF);
1225 	fillWithSingleValue(dest0, 0);
1226 
1227 	//Test blend functions
1228 	screenBl(src.ptr, dest.ptr, 255);
1229 	testArrayForValue(dest, 0xF27AF1FF);
1230 	fillWithSingleValue(dest, 0xEE2ADDFF);
1231 	screenBl(src.ptr, dest.ptr, dest0.ptr, 255);
1232 	testArrayForValue(dest0, 0xF27AF1FF);
1233 	fillWithSingleValue(dest0, 0);
1234 
1235 	screenBl(src0.ptr, dest.ptr, 255);
1236 	testArrayForValue(dest, 0xEE2ADDFF);
1237 	screenBl(src0.ptr, dest.ptr, dest0.ptr, 255);
1238 	testArrayForValue(dest0, 0xEE2ADDFF);
1239 	fillWithSingleValue(dest0, 0);
1240 
1241 	//Test master value functions
1242 	screenMV(src.ptr, dest.ptr, 255, uint.max);
1243 	testArrayForValue(dest, 0xF27AF1FF);
1244 	fillWithSingleValue(dest, 0xEE2ADDFF);
1245 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
1246 	testArrayForValue(dest0, 0xF27AF1FF);
1247 	fillWithSingleValue(dest0, 0);
1248 
1249 	screenMV(src.ptr, dest.ptr, 255, ubyte.max);
1250 	testArrayForValue(dest, 0xF27AF1FF);
1251 	fillWithSingleValue(dest, 0xEE2ADDFF);
1252 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
1253 	testArrayForValue(dest0, 0xF27AF1FF);
1254 	fillWithSingleValue(dest0, 0);
1255 
1256 	screenMV(src.ptr, dest.ptr, 255, uint.min);
1257 	testArrayForValue(dest, 0xEE2ADDFF);
1258 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
1259 	testArrayForValue(dest0, 0xEE2ADDFF);
1260 	fillWithSingleValue(dest0, 0);
1261 
1262 	screenMV(src.ptr, dest.ptr, 255, ubyte.min);
1263 	testArrayForValue(dest, 0xEE2ADDFF);
1264 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
1265 	testArrayForValue(dest0, 0xEE2ADDFF);
1266 	fillWithSingleValue(dest0, 0);
1267 
1268 	//Test mask functions
1269 	screen(src.ptr, dest.ptr, 255, mask0A.ptr);
1270 	testArrayForValue(dest, 0xF27AF1FF);
1271 	fillWithSingleValue(dest, 0xEE2ADDFF);
1272 	screen(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr);
1273 	testArrayForValue(dest0, 0xF27AF1FF);
1274 	fillWithSingleValue(dest0, 0);
1275 
1276 	screen(src.ptr, dest.ptr, 255, maskA.ptr);
1277 	testArrayForValue(dest, 0xF27AF1FF);
1278 	fillWithSingleValue(dest, 0xEE2ADDFF);
1279 	screen(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr);
1280 	testArrayForValue(dest0, 0xF27AF1FF);
1281 	fillWithSingleValue(dest0, 0);
1282 
1283 	screen(src.ptr, dest.ptr, 255, mask0B.ptr);
1284 	testArrayForValue(dest, 0xEE2ADDFF);
1285 	screen(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr);
1286 	testArrayForValue(dest0, 0xEE2ADDFF);
1287 	fillWithSingleValue(dest0, 0);
1288 
1289 	screen(src.ptr, dest.ptr, 255, maskB.ptr);
1290 	testArrayForValue(dest, 0xEE2ADDFF);
1291 	screen(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr);
1292 	testArrayForValue(dest0, 0xEE2ADDFF);
1293 	fillWithSingleValue(dest0, 0);
1294 
1295 	//Test blend with master value functions
1296 	screenMVBl(src.ptr, dest.ptr, 255, ubyte.max);
1297 	testArrayForValue(dest, 0xF27AF1FF);
1298 	fillWithSingleValue(dest, 0xEE2ADDFF);
1299 	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
1300 	testArrayForValue(dest0, 0xF27AF1FF);
1301 	fillWithSingleValue(dest0, 0);
1302 
1303 	screenMVBl(src0.ptr, dest.ptr, 255, ubyte.max);
1304 	testArrayForValue(dest, 0xEE2ADDFF);
1305 	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
1306 	testArrayForValue(dest0, 0xEE2ADDFF);
1307 	fillWithSingleValue(dest0, 0);
1308 
1309 	screenMVBl(src.ptr, dest.ptr, 255, uint.max);
1310 	testArrayForValue(dest, 0xF27AF1FF);
1311 	fillWithSingleValue(dest, 0xEE2ADDFF);
1312 	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
1313 	testArrayForValue(dest0, 0xF27AF1FF);
1314 	fillWithSingleValue(dest0, 0);
1315 
1316 	screenMVBl(src0.ptr, dest.ptr, 255, uint.max);
1317 	testArrayForValue(dest, 0xEE2ADDFF);
1318 	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.max);
1319 	testArrayForValue(dest0, 0xEE2ADDFF);
1320 	fillWithSingleValue(dest0, 0);
1321 
1322 	screenMVBl(src.ptr, dest.ptr, 255, ubyte.min);
1323 	testArrayForValue(dest, 0xEE2ADDFF);
1324 	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
1325 	testArrayForValue(dest0, 0xEE2ADDFF);
1326 	fillWithSingleValue(dest0, 0);
1327 
1328 	screenMVBl(src0.ptr, dest.ptr, 255, ubyte.min);
1329 	testArrayForValue(dest, 0xEE2ADDFF);
1330 	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
1331 	testArrayForValue(dest0, 0xEE2ADDFF);
1332 	fillWithSingleValue(dest0, 0);
1333 
1334 	screenMVBl(src.ptr, dest.ptr, 255, uint.min);
1335 	testArrayForValue(dest, 0xEE2ADDFF);
1336 	screenMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
1337 	testArrayForValue(dest0, 0xEE2ADDFF);
1338 	fillWithSingleValue(dest0, 0);
1339 
1340 	screenMVBl(src0.ptr, dest.ptr, 255, uint.min);
1341 	testArrayForValue(dest, 0xEE2ADDFF);
1342 	screenMVBl(src0.ptr, dest.ptr, dest0.ptr, 255, uint.min);
1343 	testArrayForValue(dest0, 0xEE2ADDFF);
1344 	fillWithSingleValue(dest0, 0);
1345 
	// Test masked variants combined with a master alpha value (screenMV)
1347 	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.max);
1348 	testArrayForValue(dest, 0xF27AF1FF);
1349 	fillWithSingleValue(dest, 0xEE2ADDFF);
1350 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.max);
1351 	testArrayForValue(dest0, 0xF27AF1FF);
1352 	fillWithSingleValue(dest0, 0);
1353 
1354 	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.max);
1355 	testArrayForValue(dest, 0xEE2ADDFF);
1356 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.max);
1357 	testArrayForValue(dest0, 0xEE2ADDFF);
1358 	fillWithSingleValue(dest0, 0);
1359 
1360 	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.max);
1361 	testArrayForValue(dest, 0xF27AF1FF);
1362 	fillWithSingleValue(dest, 0xEE2ADDFF);
1363 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.max);
1364 	testArrayForValue(dest0, 0xF27AF1FF);
1365 	fillWithSingleValue(dest0, 0);
1366 
1367 	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.max);
1368 	testArrayForValue(dest, 0xEE2ADDFF);
1369 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.max);
1370 	testArrayForValue(dest0, 0xEE2ADDFF);
1371 	fillWithSingleValue(dest0, 0);
1372 
1373 	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, ubyte.min);
1374 	testArrayForValue(dest, 0xEE2ADDFF);
1375 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, ubyte.min);
1376 	testArrayForValue(dest0, 0xEE2ADDFF);
1377 	fillWithSingleValue(dest0, 0);
1378 
1379 	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, ubyte.min);
1380 	testArrayForValue(dest, 0xEE2ADDFF);
1381 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, ubyte.min);
1382 	testArrayForValue(dest0, 0xEE2ADDFF);
1383 	fillWithSingleValue(dest0, 0);
1384 
1385 	screenMV(src.ptr, dest.ptr, 255, mask0A.ptr, uint.min);
1386 	testArrayForValue(dest, 0xEE2ADDFF);
1387 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0A.ptr, uint.min);
1388 	testArrayForValue(dest0, 0xEE2ADDFF);
1389 	fillWithSingleValue(dest0, 0);
1390 
1391 	screenMV(src.ptr, dest.ptr, 255, mask0B.ptr, uint.min);
1392 	testArrayForValue(dest, 0xEE2ADDFF);
1393 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0B.ptr, uint.min);
1394 	testArrayForValue(dest0, 0xEE2ADDFF);
1395 	fillWithSingleValue(dest0, 0);
1396 	//
1397 
1398 	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.max);
1399 	testArrayForValue(dest, 0xF27AF1FF);
1400 	fillWithSingleValue(dest, 0xEE2ADDFF);
1401 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.max);
1402 	testArrayForValue(dest0, 0xF27AF1FF);
1403 	fillWithSingleValue(dest0, 0);
1404 
1405 	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.max);
1406 	testArrayForValue(dest, 0xEE2ADDFF);
1407 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.max);
1408 	testArrayForValue(dest0, 0xEE2ADDFF);
1409 	fillWithSingleValue(dest0, 0);
1410 
1411 	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.max);
1412 	testArrayForValue(dest, 0xF27AF1FF);
1413 	fillWithSingleValue(dest, 0xEE2ADDFF);
1414 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.max);
1415 	testArrayForValue(dest0, 0xF27AF1FF);
1416 	fillWithSingleValue(dest0, 0);
1417 
1418 	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.max);
1419 	testArrayForValue(dest, 0xEE2ADDFF);
1420 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.max);
1421 	testArrayForValue(dest0, 0xEE2ADDFF);
1422 	fillWithSingleValue(dest0, 0);
1423 
1424 	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, ubyte.min);
1425 	testArrayForValue(dest, 0xEE2ADDFF);
1426 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, ubyte.min);
1427 	testArrayForValue(dest0, 0xEE2ADDFF);
1428 	fillWithSingleValue(dest0, 0);
1429 
1430 	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, ubyte.min);
1431 	testArrayForValue(dest, 0xEE2ADDFF);
1432 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, ubyte.min);
1433 	testArrayForValue(dest0, 0xEE2ADDFF);
1434 	fillWithSingleValue(dest0, 0);
1435 
1436 	screenMV(src.ptr, dest.ptr, 255, maskA.ptr, uint.min);
1437 	testArrayForValue(dest, 0xEE2ADDFF);
1438 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskA.ptr, uint.min);
1439 	testArrayForValue(dest0, 0xEE2ADDFF);
1440 	fillWithSingleValue(dest0, 0);
1441 
1442 	screenMV(src.ptr, dest.ptr, 255, maskB.ptr, uint.min);
1443 	testArrayForValue(dest, 0xEE2ADDFF);
1444 	screenMV(src.ptr, dest.ptr, dest0.ptr, 255, maskB.ptr, uint.min);
1445 	testArrayForValue(dest0, 0xEE2ADDFF);
1446 	fillWithSingleValue(dest0, 0);
1447 }