1 module CPUblit.composing.sub;
2 
3 import CPUblit.composing.common;
4 
/*
 * CPUblit
 * Subtract with saturation functions.
 * Author: Laszlo Szeremi
 *
 * These functions compose two images together using the following function:
 * dest0[rgba] = dest[rgba] - src[rgba]
 * If the alpha channel is enabled in the template or a mask is used, then the function will be the following:
 * dest0[rgba] = dest[rgba] - (mask[aaaa] * src[rgba])
 * which translates to the integer implementation:
 * dest0[rgba] = dest[rgba] - ((1 + mask[aaaa]) * src[rgba])>>>8
 *
 * These functions only work with 8 bit channels, and many require 32 bit values.
 * Masks can be either 8 bit per pixel, or 32 bit per pixel with the ability of processing up to 4 channels
 * independently.
 */
21 @nogc pure nothrow {
22 	/**
23 	 * 2 operator subtraction function
24 	 */
25 	public void sub(bool UseAlpha = false)(uint* src, uint* dest, size_t length) {
26 		while(length >= 4) {
27 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
28 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
29 			static if (UseAlpha) {
30 				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
31 				version (cpublit_revalpha) {
32 					maskV |= _mm_srli_epi32(maskV, 8);
33 					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
34 				} else {
35 					maskV |= _mm_slli_epi32(maskV, 8);
36 					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
37 				}
38 				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
39 				__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
40 				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
41 				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
42 				srcV = _mm_packus_epi16(src_lo, src_hi);
43 			}
44 			destV = _mm_subs_epu8(destV, srcV);
45 			_mm_storeu_si128(cast(__m128i*)dest, destV);
46 			src += 4;
47 			dest += 4;
48 			length -= 4;
49 		}
50 		if (length >= 2) {
51 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
52 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
53 			static if (UseAlpha) {
54 				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
55 				version (cpublit_revalpha) {
56 					maskV |= _mm_srli_epi32(maskV, 8);
57 					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
58 				} else {
59 					maskV |= _mm_slli_epi32(maskV, 8);
60 					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
61 				}
62 				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
63 				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
64 				srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
65 			}
66 			destV = _mm_subs_epu8(destV, srcV);
67 			_mm_storel_epi64(cast(__m128i*)dest, destV);
68 			src += 2;
69 			dest += 2;
70 			length -= 2;
71 		}
72 		if (length) {
73 			__m128i srcV = _mm_loadu_si32(src);
74 			__m128i destV = _mm_loadu_si32(dest);
75 			static if (UseAlpha) {
76 				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
77 				version (cpublit_revalpha) {
78 					maskV |= _mm_srli_epi32(maskV, 8);
79 					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
80 				} else {
81 					maskV |= _mm_slli_epi32(maskV, 8);
82 					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
83 				}
84 				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
85 				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
86 				srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
87 			}
88 			destV = _mm_subs_epu8(destV, srcV);
89 			_mm_storeu_si32(dest, destV);
90 		}
91 		
92 	}
	/**
	 * 3 operator subtraction function with separate destination.
	 * Computes dest0[rgba] = dest[rgba] - src[rgba] with unsigned saturation at
	 * zero; `dest` itself is left unmodified. If `UseAlpha` is set, the source
	 * is first scaled by its own alpha channel:
	 * dest0[rgba] = dest[rgba] - ((1 + src[aaaa]) * src[rgba])>>>8.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from (read only).
	 *   dest0 = pointer to the destination buffer.
	 *   length = number of pixels to process.
	 */
	public void sub(bool UseAlpha = false)(uint* src, uint* dest, uint* dest0, size_t length) {
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				//widen alpha to 16 bit and add the constant for the (1 + A) term of the module formula
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
				srcV = _mm_packus_epi16(src_lo, src_hi);
			}
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			}
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			}
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest0, destV);
		}
	}
	/**
	 * 3 operator subtraction function with masking.
	 * Computes dest[rgba] = dest[rgba] - ((1 + mask[aaaa]) * src[rgba])>>>8
	 * with unsigned saturation at zero.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from; receives the result.
	 *   length = number of pixels to process.
	 *   mask = alpha mask; `ubyte*` (one alpha per pixel) or `uint*`
	 *          (32 bit per pixel, with per-channel alpha).
	 */
	public void sub(M)(uint* src, uint* dest, size_t length, M* mask) {
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				//broadcast each 8 bit mask value into all four channels of its pixel
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			//widen mask to 16 bit and add the constant for the (1 + A) term of the module formula
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
			srcV = _mm_packus_epi16(src_lo, src_hi);
			
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest, destV);
			src += 4;
			dest += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest, destV);
			src += 2;
			dest += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest, destV);
		}
	}
	/**
	 * 3 operator subtraction function with separate destination and masking.
	 * Computes dest0[rgba] = dest[rgba] - ((1 + mask[aaaa]) * src[rgba])>>>8
	 * with unsigned saturation at zero; `dest` itself is left unmodified.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from (read only).
	 *   dest0 = pointer to the destination buffer.
	 *   length = number of pixels to process.
	 *   mask = alpha mask; `ubyte*` (one alpha per pixel) or `uint*`
	 *          (32 bit per pixel, with per-channel alpha).
	 */
	public void sub(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				//broadcast each 8 bit mask value into all four channels of its pixel
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			//widen mask to 16 bit and add the constant for the (1 + A) term of the module formula
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
			srcV = _mm_packus_epi16(src_lo, src_hi);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += 4;
			dest += 4;
			dest0 += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += 2;
			dest += 2;
			dest0 += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest0, destV);
		}
	}
	/**
	 * 2 operator subtraction function with master alpha value.
	 * Computes dest[rgba] = dest[rgba] - ((1 + value) * src[rgba])>>>8 with
	 * unsigned saturation at zero. `UseAlpha` determines whether the src's
	 * alpha channel will be used or not; if set, src is additionally scaled by
	 * its own alpha channel before the master value is applied.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from; receives the result.
	 *   length = number of pixels to process.
	 *   value = master alpha; `ubyte` (applied to every channel) or `uint`
	 *           (packed per-channel values).
	 */
	public void subMV(bool UseAlpha = false, V)(uint* src, uint* dest, size_t length, V value) {
		__m128i master_1;
		static if (is(V == uint)) {
			//only the low 64 bits matter: _mm_unpacklo_epi8 below reads just those
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
		} else static if (is(V == ubyte)) {
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
			master_1 |= _mm_slli_epi32(master_1, 8);
			master_1 |= _mm_slli_epi32(master_1, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		//widen to 16 bit channels and add the constant for the (1 + value) term
		master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(master_1, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
				src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, src_hi);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest, destV);
			src += 4;
			dest += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest, destV);
			src += 2;
			dest += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest, destV);
		}
		
	}
	/**
	 * 3 operator subtraction function with separate destination and master alpha value.
	 * Computes dest0[rgba] = dest[rgba] - ((1 + value) * src[rgba])>>>8 with
	 * unsigned saturation at zero; `dest` itself is left unmodified. If
	 * `UseAlpha` is set, src is additionally scaled by its own alpha channel
	 * before the master value is applied.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from (read only).
	 *   dest0 = pointer to the destination buffer.
	 *   length = number of pixels to process.
	 *   value = master alpha; `ubyte` (applied to every channel) or `uint`
	 *           (packed per-channel values).
	 */
	public void subMV(bool UseAlpha = false, V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
		__m128i master_1;
		static if (is(V == uint)) {
			//only the low 64 bits matter: _mm_unpacklo_epi8 below reads just those
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
		} else static if (is(V == ubyte)) {
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
			master_1 |= _mm_slli_epi32(master_1, 8);
			master_1 |= _mm_slli_epi32(master_1, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		//widen to 16 bit channels and add the constant for the (1 + value) term
		master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(master_1, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
				src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
				__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, src_hi);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += 4;
			dest += 4;
			dest0 += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += 2;
			dest += 2;
			dest0 += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (UseAlpha) {
				__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
				version (cpublit_revalpha) {
					maskV |= _mm_srli_epi32(maskV, 8);
					maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
				} else {
					maskV |= _mm_slli_epi32(maskV, 8);
					maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
				}
				__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
				src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			} else {
				__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1), 8);
			}
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest0, destV);
		}
	}
504 	/**
505 	 * 3 operator subtraction function with masking and master alpha value.
506 	 */
507 	public void subMV(M,V)(uint* src, uint* dest, size_t length, M* mask, V value) {
508 		__m128i master_1;
509 		static if (is(V == uint)) {
510 			master_1[0] = value;
511 			master_1[1] = value;
512 			//master_1[2] = value;
513 			//master_1[3] = value;
514 		} else static if (is(V == ubyte)) {
515 			master_1[0] = value;
516 			master_1[1] = value;
517 			//master_1[2] = value;
518 			//master_1[3] = value;
519 			master_1 |= _mm_slli_epi32(master_1, 8);
520 			master_1 |= _mm_slli_epi32(master_1, 16);
521 		} else static assert (0, "Value must be either 8 or 32 bits!");
522 		master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(master_1, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
523 		while(length >= 4) {
524 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
525 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
526 			static if (is(M == uint)) {
527 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
528 			} else static if (is(M == ubyte)) {
529 				__m128i maskV;
530 				maskV[0] = mask[0];
531 				maskV[1] = mask[1];
532 				maskV[2] = mask[2];
533 				maskV[3] = mask[3];
534 				maskV |= _mm_slli_epi32(maskV, 8);
535 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
536 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
537 			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
538 			__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
539 			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
540 			__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
541 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
542 			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, master_1), 8);
543 			srcV = _mm_packus_epi16(src_lo, src_hi);
544 			destV = _mm_subs_epu8(destV, srcV);
545 			_mm_storeu_si128(cast(__m128i*)dest, destV);
546 			src += 4;
547 			dest += 4;
548 			mask += 4;
549 			length -= 4;
550 		}
551 		if (length >= 2) {
552 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
553 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
554 			static if (is(M == uint)) {
555 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
556 			} else static if (is(M == ubyte)) {
557 				__m128i maskV;
558 				maskV[0] = mask[0];
559 				maskV[1] = mask[1];
560 				maskV |= _mm_slli_epi32(maskV, 8);
561 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
562 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
563 			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
564 			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
565 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
566 			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
567 			destV = _mm_subs_epu8(destV, srcV);
568 			_mm_storel_epi64(cast(__m128i*)dest, destV);
569 			src += 2;
570 			dest += 2;
571 			mask += 2;
572 			length -= 2;
573 		}
574 		if (length) {
575 			__m128i srcV = _mm_loadu_si32(src);
576 			__m128i destV = _mm_loadu_si32(dest);
577 			static if (is(M == uint)) {
578 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
579 			} else static if (is(M == ubyte)) {
580 				__m128i maskV;
581 				maskV[0] = mask[0];
582 				maskV[1] = mask[1];
583 				maskV |= _mm_slli_epi32(maskV, 8);
584 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
585 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
586 			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
587 			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
588 			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
589 			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
590 			destV = _mm_subs_epu8(destV, srcV);
591 			_mm_storeu_si32(dest, destV);
592 		}
593 	}
	/**
	 * 3 operator subtraction function with separate destination, masking, and master value.
	 * Computes
	 * dest0[rgba] = dest[rgba] - ((1 + value) * (((1 + mask[aaaa]) * src[rgba])>>>8))>>>8
	 * with unsigned saturation at zero; `dest` itself is left unmodified.
	 * Params:
	 *   src = pointer to the image being subtracted (32 bit RGBA).
	 *   dest = pointer to the image subtracted from (read only).
	 *   dest0 = pointer to the destination buffer.
	 *   length = number of pixels to process.
	 *   mask = alpha mask; `ubyte*` (one alpha per pixel) or `uint*`
	 *          (32 bit per pixel, with per-channel alpha).
	 *   value = master alpha; `ubyte` (applied to every channel) or `uint`
	 *           (packed per-channel values).
	 */
	public void subMV(M, V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
		__m128i master_1;
		static if (is(V == uint)) {
			//only the low 64 bits matter: _mm_unpacklo_epi8 below reads just those
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
		} else static if (is(V == ubyte)) {
			master_1[0] = value;
			master_1[1] = value;
			//master_1[2] = value;
			//master_1[3] = value;
			master_1 |= _mm_slli_epi32(master_1, 8);
			master_1 |= _mm_slli_epi32(master_1, 16);
		} else static assert (0, "Value must be either 8 or 32 bits!");
		//widen to 16 bit channels and add the constant for the (1 + value) term
		master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(master_1, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
		while(length >= 4) {	//main lane: four pixels per iteration
			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				//broadcast each 8 bit mask value into all four channels of its pixel
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV[2] = mask[2];
				maskV[3] = mask[3];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i mask_hi = _mm_adds_epu16(_mm_unpackhi_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			__m128i src_hi = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask_hi), 8);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			src_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, master_1), 8);
			srcV = _mm_packus_epi16(src_lo, src_hi);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si128(cast(__m128i*)dest0, destV);
			src += 4;
			dest += 4;
			dest0 += 4;
			mask += 4;
			length -= 4;
		}
		if (length >= 2) {	//remainder: two pixels
			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV[1] = mask[1];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storel_epi64(cast(__m128i*)dest0, destV);
			src += 2;
			dest += 2;
			dest0 += 2;
			mask += 2;
			length -= 2;
		}
		if (length) {	//remainder: a single pixel
			__m128i srcV = _mm_loadu_si32(src);
			__m128i destV = _mm_loadu_si32(dest);
			static if (is(M == uint)) {
				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
			} else static if (is(M == ubyte)) {
				__m128i maskV;
				maskV[0] = mask[0];
				maskV |= _mm_slli_epi32(maskV, 8);
				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
			__m128i mask_lo = _mm_adds_epu16(_mm_unpacklo_epi8(maskV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
			__m128i src_lo = _mm_srli_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask_lo), 8);
			src_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, master_1), 8);
			srcV = _mm_packus_epi16(src_lo, SSE2_NULLVECT);
			destV = _mm_subs_epu8(destV, srcV);
			_mm_storeu_si32(dest0, destV);
		}
	}
685 }
unittest {
	// Smoke tests for the saturating-subtract composers of this module.
	// Pixels are 4 packed 8-bit channels in a uint; per channel the contract is:
	//   plain:            dest = dest - src                     (saturating at 0)
	//   masked / alpha:   dest = dest - (((mask + 1) * src) >>> 8)
	// so mask==0 leaves dest unchanged and mask==255 subtracts src fully.
	// fillWithSingleValue / testArrayForValue come from CPUblit.composing.common.
	uint[] src, dest, dest0, mask;
	ubyte[] mask0;
	// length 255 is deliberately not a multiple of 4, so the 4-pixel SIMD loop,
	// the 2-pixel tail, and the single-pixel tail are all exercised.
	src.length = 255;
	dest.length = 255;
	dest0.length = 255;
	mask.length = 255;
	mask0.length = 255;
	// Plain subtraction, 2-operand and 3-operand forms: 0x10 - 0x05 = 0x0b per byte.
	fillWithSingleValue(src, 0x05050505);
	fillWithSingleValue(dest, 0x10101010);
	sub!false(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x0b0b0b0b);
	fillWithSingleValue(dest, 0x10101010);
	sub!false(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x0b0b0b0b);
	fillWithSingleValue(dest0, 0);

	//mask value of 0 should generate no change in the output
	// (mask and mask0 are still default-initialized to all zeros here)
	sub(src.ptr, dest.ptr, 255, mask.ptr);
	testArrayForValue(dest, 0x10101010);
	sub(src.ptr, dest.ptr, 255, mask0.ptr);
	testArrayForValue(dest, 0x10101010);
	sub(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr);
	testArrayForValue(dest0, 0x10101010);
	fillWithSingleValue(dest0, 0);
	sub(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr);
	testArrayForValue(dest0, 0x10101010);
	fillWithSingleValue(dest0, 0);

	//mask value of 255 should generate maximum change in the output
	// ((255 + 1) * src) >>> 8 == src, so the result equals plain subtraction.
	fillWithSingleValue(mask, uint.max);
	fillWithSingleValue(mask0, ubyte.max);
	sub(src.ptr, dest.ptr, 255, mask.ptr);
	testArrayForValue(dest, 0x0b0b0b0b);
	fillWithSingleValue(dest, 0x10101010);
	sub(src.ptr, dest.ptr, 255, mask0.ptr);
	testArrayForValue(dest, 0x0b0b0b0b);
	fillWithSingleValue(dest, 0x10101010);
	sub(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr);
	testArrayForValue(dest0, 0x0b0b0b0b);
	fillWithSingleValue(dest0, 0);
	sub(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr);
	testArrayForValue(dest0, 0x0b0b0b0b);
	fillWithSingleValue(dest0, 0);

	//test with alpha channel

	//the least significant byte of a 32 bit pixel is the alpha
	// alpha == 0xFF: full-strength subtraction; the alpha channel itself is
	// also subtracted (0xFF - 0xFF = 0), hence the ...00 expected values.
	fillWithSingleValue(src, 0x050505FF);
	fillWithSingleValue(dest, 0x101010FF);
	sub!true(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	sub!true(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);
	//with alpha value of zero, the destination shouldn't be affected
	fillWithSingleValue(src, 0x05050500);
	sub!true(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x101010FF);
	sub!true(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x101010FF);
	fillWithSingleValue(dest0, 0);

	//test master value functions

	//master value of zero shouldn't affect anything
	// both ubyte and uint master-value overloads are exercised, with the
	// alpha template flag both on and off
	fillWithSingleValue(src, 0x050505FF);
	subMV!false(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0x101010FF);
	fillWithSingleValue(dest0, 0);
	subMV!false(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0x101010FF);
	fillWithSingleValue(dest0, 0);

	//masks should be also "ignored"
	// a zero master value must override the (currently all-0xFF) masks
	subMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.min);
	testArrayForValue(dest, 0x101010FF);
	subMV(src.ptr, dest.ptr, 255, mask.ptr, uint.min);
	testArrayForValue(dest, 0x101010FF);
	subMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr, ubyte.min);
	testArrayForValue(dest0, 0x101010FF);
	fillWithSingleValue(dest0, 0);
	subMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr, uint.min);
	testArrayForValue(dest0, 0x101010FF);
	fillWithSingleValue(dest0, 0);

	//master value of 255 should generate maximum change in the output
	subMV!true(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);
	subMV!true(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);

	// same expectations with the alpha template flag disabled
	subMV!false(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!false(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!false(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);
	subMV!false(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);

	// NOTE(review): the following subMV!true block repeats the subMV!true run
	// above verbatim — possibly a copy-paste left over from adding the !false
	// variants; harmless, but confirm whether a different overload was intended.
	subMV!true(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV!true(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);
	subMV!true(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);

	//ditto with masks of maximum value
	subMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV(src.ptr, dest.ptr, 255, mask.ptr, uint.max);
	testArrayForValue(dest, 0x0b0b0b00);
	fillWithSingleValue(dest, 0x101010FF);
	subMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr, ubyte.max);
	testArrayForValue(dest0, 0x0b0b0b00);
	fillWithSingleValue(dest0, 0);
	subMV(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr, uint.max);
	testArrayForValue(dest0, 0x0b0b0b00);
}