1 module CPUblit.composing.diff;
2 
3 import CPUblit.composing.common;
4 
/*
 * CPUblit
 * Difference functions.
 * Author: Laszlo Szeremi
 *
 * These functions compose two images together using the following function:
 * dest0[rgba] = max(dest[rgba], src[rgba]) - min(dest[rgba], src[rgba])
 * If alpha channel is enabled in the template or mask is used, then the function will be the following:
 * dest0[rgba] = ((1.0 - mask[aaaa]) * dest[rgba]) + (mask[aaaa] * (max(dest[rgba], src[rgba]) - min(dest[rgba], src[rgba])))
 * which translates to the integer implementation:
 * dest0[rgba] = (((256 - mask[aaaa]) * dest[rgba]) + ((1 + mask[aaaa]) * (max(dest[rgba], src[rgba]) - min(dest[rgba], src[rgba])))) >>> 8
 *
 * These functions only work with 8 bit channels, and many require 32 bit values.
 * Masks can be either 8 bit per pixel, or 32 bit per pixel with the ability of processing up to 4 channels
 * independently.
 */
21 @nogc pure nothrow {
22 	/**
23 	 * 2 operator difference function without alpha
24 	 */
25 	public void diff(uint* src, uint* dest, size_t length) {
26 		while(length >= 4) {
27 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
28 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
29 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
30 			_mm_storeu_si128(cast(__m128i*)dest, destV);
31 			src += 4;
32 			dest += 4;
33 			length -= 4;
34 		}
35 		if (length >= 2) {
36 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
37 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
38 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
39 			_mm_storel_epi64(cast(__m128i*)dest, destV);
40 			src += 2;
41 			dest += 2;
42 			length -= 2;
43 		}
44 		if (length) {
45 			__m128i srcV = _mm_loadu_si32(src);
46 			__m128i destV = _mm_loadu_si32(dest);
47 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
48 			_mm_storeu_si32(dest, destV);//*cast(int*)dest = destV[0];
49 		}
50 		
51 	}
52 	/**
53 	 * 3 operator difference function with separate destination without alpha.
54 	 */
55 	public void diff(uint* src, uint* dest, uint* dest0, size_t length) {
56 		while(length >= 4) {
57 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
58 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
59 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
60 			_mm_storeu_si128(cast(__m128i*)dest0, destV);
61 			src += 4;
62 			dest += 4;
63 			dest0 += 4;
64 			length -= 4;
65 		}
66 		if (length >= 2) {
67 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
68 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
69 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
70 			_mm_storel_epi64(cast(__m128i*)dest0, destV);
71 			src += 2;
72 			dest += 2;
73 			dest0 += 2;
74 			length -= 2;
75 		}
76 		if (length) {
77 			__m128i srcV = _mm_loadu_si32(src);
78 			__m128i destV = _mm_loadu_si32(dest);
79 			destV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
80 			_mm_storeu_si32(dest0, destV);//*cast(int*)dest0 = destV[0];
81 		}
82 	}
83 	/**
84 	 * 2 operator difference function with alpha
85 	 */
86 	public void diffBl(uint* src, uint* dest, size_t length) {
87 		while(length >= 4) {
88 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
89 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
90 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
91 			version (cpublit_revalpha) {
92 				maskV |= _mm_srli_epi32(maskV, 8);
93 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
94 			} else {
95 				maskV |= _mm_slli_epi32(maskV, 8);
96 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
97 			}
98 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
99 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
100 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
101 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
102 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
103 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
104 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
105 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
106 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
107 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
108 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
109 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
110 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
111 			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
112 			src += 4;
113 			dest += 4;
114 			length -= 4;
115 		}
116 		if (length >= 2) {
117 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
118 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
119 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
120 			version (cpublit_revalpha) {
121 				maskV |= _mm_srli_epi32(maskV, 8);
122 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
123 			} else {
124 				maskV |= _mm_slli_epi32(maskV, 8);
125 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
126 			}
127 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
128 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
129 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
130 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
131 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
132 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
133 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
134 			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
135 			src += 2;
136 			dest += 2;
137 			length -= 2;
138 		}
139 		if (length) {
140 			__m128i srcV = _mm_loadu_si32(src);
141 			__m128i destV = _mm_loadu_si32(dest);
142 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
143 			version (cpublit_revalpha) {
144 				maskV |= _mm_srli_epi32(maskV, 8);
145 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
146 			} else {
147 				maskV |= _mm_slli_epi32(maskV, 8);
148 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
149 			}
150 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
151 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
152 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
153 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
154 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
155 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
156 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
157 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
158 		}
159 		
160 	}
161 	/**
162 	 * 3 operator difference function with separate destination and alpha.
163 	 */
164 	public void diffBl(uint* src, uint* dest, uint* dest0, size_t length) {
165 		while(length >= 4) {
166 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
167 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
168 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
169 			version (cpublit_revalpha) {
170 				maskV |= _mm_srli_epi32(maskV, 8);
171 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
172 			} else {
173 				maskV |= _mm_slli_epi32(maskV, 8);
174 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
175 			}
176 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
177 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
178 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
179 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
180 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
181 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
182 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
183 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
184 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
185 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
186 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
187 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
188 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
189 			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
190 			src += 4;
191 			dest += 4;
192 			dest0 += 4;
193 			length -= 4;
194 		}
195 		if (length >= 2) {
196 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
197 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
198 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
199 			version (cpublit_revalpha) {
200 				maskV |= _mm_srli_epi32(maskV, 8);
201 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
202 			} else {
203 				maskV |= _mm_slli_epi32(maskV, 8);
204 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
205 			}
206 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
207 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
208 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
209 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
210 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
211 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
212 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
213 			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
214 			src += 2;
215 			dest += 2;
216 			dest0 += 2;
217 			length -= 2;
218 		}
219 		if (length) {
220 			__m128i srcV = _mm_loadu_si32(src);
221 			__m128i destV = _mm_loadu_si32(dest);
222 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
223 			version (cpublit_revalpha) {
224 				maskV |= _mm_srli_epi32(maskV, 8);
225 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
226 			} else {
227 				maskV |= _mm_slli_epi32(maskV, 8);
228 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
229 			}
230 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
231 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
232 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
233 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
234 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
235 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
236 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
237 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
238 		}
239 	}
240 	/**
241 	 * 3 operator difference function with masking
242 	 */
243 	public void diff(M)(uint* src, uint* dest, size_t length, M* mask) {
244 		while(length >= 4) {
245 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
246 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
247 			static if (is(M == uint)) {
248 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
249 			} else static if (is(M == ubyte)) {
250 				__m128i maskV;
251 				maskV[0] = mask[0];
252 				maskV[1] = mask[1];
253 				maskV[2] = mask[2];
254 				maskV[3] = mask[3];
255 				maskV |= _mm_slli_epi32(maskV, 8);
256 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
257 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
258 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
259 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
260 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
261 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
262 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
263 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
264 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
265 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
266 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
267 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
268 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
269 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
270 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
271 			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
272 			src += 4;
273 			dest += 4;
274 			mask += 4;
275 			length -= 4;
276 		}
277 		if (length >= 2) {
278 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
279 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
280 			static if (is(M == uint)) {
281 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
282 			} else static if (is(M == ubyte)) {
283 				__m128i maskV;
284 				maskV[0] = mask[0];
285 				maskV[1] = mask[1];
286 				maskV |= _mm_slli_epi32(maskV, 8);
287 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
288 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
289 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
290 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
291 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
292 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
293 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
294 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
295 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
296 			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
297 			src += 2;
298 			dest += 2;
299 			mask += 2;
300 			length -= 2;
301 		}
302 		if (length) {
303 			__m128i srcV = _mm_loadu_si32(src);
304 			__m128i destV = _mm_loadu_si32(dest);
305 			static if (is(M == uint)) {
306 				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
307 			} else static if (is(M == ubyte)) {
308 				__m128i maskV;
309 				maskV[0] = mask[0];
310 				maskV |= _mm_slli_epi32(maskV, 8);
311 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
312 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
313 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
314 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
315 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
316 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
317 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
318 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
319 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
320 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
321 		}
322 	}
323 	/**
324 	 * 4 operator difference function with separate destination and masking.
325 	 */
326 	public void diff(M)(uint* src, uint* dest, uint* dest0, size_t length, M* mask) {
327 		while(length >= 4) {
328 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
329 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
330 			static if (is(M == uint)) {
331 				__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
332 			} else static if (is(M == ubyte)) {
333 				__m128i maskV;
334 				maskV[0] = mask[0];
335 				maskV[1] = mask[1];
336 				maskV[2] = mask[2];
337 				maskV[3] = mask[3];
338 				maskV |= _mm_slli_epi32(maskV, 8);
339 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
340 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
341 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
342 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
343 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
344 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
345 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
346 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
347 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
348 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
349 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
350 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
351 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
352 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
353 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
354 			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
355 			src += 4;
356 			dest += 4;
357 			dest0 += 4;
358 			mask += 4;
359 			length -= 4;
360 		}
361 		if (length >= 2) {
362 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
363 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
364 			static if (is(M == uint)) {
365 				__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
366 			} else static if (is(M == ubyte)) {
367 				__m128i maskV;
368 				maskV[0] = mask[0];
369 				maskV[1] = mask[1];
370 				maskV |= _mm_slli_epi32(maskV, 8);
371 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
372 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
373 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
374 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
375 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
376 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
377 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
378 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
379 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
380 			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
381 			src += 2;
382 			dest += 2;
383 			dest0 += 2;
384 			mask += 2;
385 			length -= 2;
386 		}
387 		if (length) {
388 			__m128i srcV = _mm_loadu_si32(src);
389 			__m128i destV = _mm_loadu_si32(dest);
390 			static if (is(M == uint)) {
391 				__m128i maskV = _mm_loadu_si32(cast(__m128i*)mask);
392 			} else static if (is(M == ubyte)) {
393 				__m128i maskV;
394 				maskV[0] = mask[0];
395 				maskV |= _mm_slli_epi32(maskV, 8);
396 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
397 			} else static assert (0, "Alpha mask must be either 8 or 32 bits!");
398 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
399 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
400 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
401 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
402 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
403 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
404 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
405 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
406 		}
407 	}
408 	/**
409 	 * 2 operator add function with master alpha value.
410 	 * `UseAlpha` determines whether the src's alpha channel will be used or not.
411 	 */
412 	public void diffMV(V)(uint* src, uint* dest, size_t length, V value) {
413 		__m128i masterV;
414 		static if (is(V == uint)) {
415 			masterV[0] = value;
416 			masterV[1] = value;
417 			//masterV[2] = value;
418 			//masterV[3] = value;
419 		} else static if (is(V == ubyte)) {
420 			masterV[0] = value;
421 			masterV[1] = value;
422 			//masterV[2] = value;
423 			//masterV[3] = value;
424 			masterV |= _mm_slli_epi32(masterV, 8);
425 			masterV |= _mm_slli_epi32(masterV, 16);
426 		} else static assert (0, "Value must be either 8 or 32 bits!");
427 		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
428 		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
429 		while(length >= 4) {
430 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
431 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
432 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
433 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
434 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), master_1);
435 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
436 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
437 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
438 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
439 			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
440 			src += 4;
441 			dest += 4;
442 			length -= 4;
443 		}
444 		if (length >= 2) {
445 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
446 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
447 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
448 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
449 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
450 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
451 			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
452 			src += 2;
453 			dest += 2;
454 			length -= 2;
455 		}
456 		if (length) {
457 			__m128i srcV = _mm_loadu_si32(src);
458 			__m128i destV = _mm_loadu_si32(dest);
459 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
460 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
461 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
462 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
463 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
464 		}
465 		
466 	}
467 	/**
468 	 * 3 operator difference function with separate destination and master alpha value.
469 	 */
470 	public void diffMV(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
471 		__m128i masterV;
472 		static if (is(V == uint)) {
473 			masterV[0] = value;
474 			masterV[1] = value;
475 			//masterV[2] = value;
476 			//masterV[3] = value;
477 		} else static if (is(V == ubyte)) {
478 			masterV[0] = value;
479 			masterV[1] = value;
480 			//masterV[2] = value;
481 			//masterV[3] = value;
482 			masterV |= _mm_slli_epi32(masterV, 8);
483 			masterV |= _mm_slli_epi32(masterV, 16);
484 		} else static assert (0, "Value must be either 8 or 32 bits!");
485 		__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
486 		__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
487 		while(length >= 4) {
488 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
489 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
490 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
491 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
492 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), master_1);
493 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
494 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), master_256);
495 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
496 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
497 			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
498 			dest += 4;
499 			dest0 += 4;
500 			length -= 4;
501 		}
502 		if (length >= 2) {
503 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
504 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
505 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
506 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
507 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
508 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
509 			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
510 			src += 2;
511 			dest += 2;
512 			dest0 += 2;
513 			length -= 2;
514 		}
515 		if (length) {
516 			__m128i srcV = _mm_loadu_si32(src);
517 			__m128i destV = _mm_loadu_si32(dest);
518 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
519 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), master_1);
520 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), master_256);
521 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
522 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
523 		}
524 	}
525 	/**
526 	 * 2 operator add function with master alpha value and per pixel alpha.
527 	 */
528 	public void diffMVBl(V)(uint* src, uint* dest, size_t length, V value) {
529 		__m128i masterV;
530 		static if (is(V == uint)) {
531 			masterV[0] = value;
532 			masterV[1] = value;
533 			//masterV[2] = value;
534 			//masterV[3] = value;
535 		} else static if (is(V == ubyte)) {
536 			masterV[0] = value;
537 			masterV[1] = value;
538 			//masterV[2] = value;
539 			//masterV[3] = value;
540 			masterV |= _mm_slli_epi32(masterV, 8);
541 			masterV |= _mm_slli_epi32(masterV, 16);
542 		} else static assert (0, "Value must be either 8 or 32 bits!");
543 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
544 		//__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
545 		//__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
546 		while(length >= 4) {
547 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
548 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
549 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
550 			version (cpublit_revalpha) {
551 				maskV |= _mm_srli_epi32(maskV, 8);
552 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
553 			} else {
554 				maskV |= _mm_slli_epi32(maskV, 8);
555 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
556 			}
557 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
558 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
559 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
560 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
561 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
562 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
563 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
564 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
565 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
566 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
567 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
568 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
569 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
570 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
571 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
572 			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
573 			src += 4;
574 			dest += 4;
575 			length -= 4;
576 		}
577 		if (length >= 2) {
578 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
579 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
580 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
581 			version (cpublit_revalpha) {
582 				maskV |= _mm_srli_epi32(maskV, 8);
583 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
584 			} else {
585 				maskV |= _mm_slli_epi32(maskV, 8);
586 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
587 			}
588 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
589 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
590 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
591 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
592 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
593 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
594 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
595 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
596 			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
597 			src += 2;
598 			dest += 2;
599 			length -= 2;
600 		}
601 		if (length) {
602 			__m128i srcV = _mm_loadu_si32(src);
603 			__m128i destV = _mm_loadu_si32(dest);
604 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
605 			version (cpublit_revalpha) {
606 				maskV |= _mm_srli_epi32(maskV, 8);
607 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
608 			} else {
609 				maskV |= _mm_slli_epi32(maskV, 8);
610 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
611 			}
612 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
613 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
614 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
615 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
616 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
617 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
618 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
619 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
620 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
621 		}
622 		
623 	}
624 	/**
625 	 * 3 operator difference function with separate destination and master alpha value.
626 	 */
627 	public void diffMVBl(V)(uint* src, uint* dest, uint* dest0, size_t length, V value) {
628 		__m128i masterV;
629 		static if (is(V == uint)) {
630 			masterV[0] = value;
631 			masterV[1] = value;
632 			//masterV[2] = value;
633 			//masterV[3] = value;
634 		} else static if (is(V == ubyte)) {
635 			masterV[0] = value;
636 			masterV[1] = value;
637 			//masterV[2] = value;
638 			//masterV[3] = value;
639 			masterV |= _mm_slli_epi32(masterV, 8);
640 			masterV |= _mm_slli_epi32(masterV, 16);
641 		} else static assert (0, "Value must be either 8 or 32 bits!");
642 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
643 		//__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
644 		//__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
645 		while(length >= 4) {
646 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
647 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
648 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
649 			version (cpublit_revalpha) {
650 				maskV |= _mm_srli_epi32(maskV, 8);
651 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
652 			} else {
653 				maskV |= _mm_slli_epi32(maskV, 8);
654 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
655 			}
656 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
657 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
658 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
659 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
660 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
661 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
662 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
663 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
664 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
665 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
666 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
667 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
668 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
669 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
670 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
671 			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
672 			dest += 4;
673 			dest0 += 4;
674 			length -= 4;
675 		}
676 		if (length >= 2) {
677 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
678 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
679 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
680 			version (cpublit_revalpha) {
681 				maskV |= _mm_srli_epi32(maskV, 8);
682 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
683 			} else {
684 				maskV |= _mm_slli_epi32(maskV, 8);
685 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
686 			}
687 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
688 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
689 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
690 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
691 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
692 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
693 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
694 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
695 			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
696 			src += 2;
697 			dest += 2;
698 			dest0 += 2;
699 			length -= 2;
700 		}
701 		if (length) {
702 			__m128i srcV = _mm_loadu_si32(src);
703 			__m128i destV = _mm_loadu_si32(dest);
704 			__m128i maskV = srcV & cast(__m128i)ALPHABLEND_SSE2_AMASK;
705 			version (cpublit_revalpha) {
706 				maskV |= _mm_srli_epi32(maskV, 8);
707 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
708 			} else {
709 				maskV |= _mm_slli_epi32(maskV, 8);
710 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
711 			}
712 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
713 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
714 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
715 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
716 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
717 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
718 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
719 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
720 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
721 		}
722 	}
723 	/**
724 	 * 3 operator difference function with masking, per-pixel alpha, and master alpha value.
725 	 */
726 	public void diffMV(M,V)(uint* src, uint* dest, size_t length, M* mask, V value) {
727 		__m128i masterV;
728 		static if (is(V == uint)) {
729 			masterV[0] = value;
730 			masterV[1] = value;
731 			//masterV[2] = value;
732 			//masterV[3] = value;
733 		} else static if (is(V == ubyte)) {
734 			masterV[0] = value;
735 			masterV[1] = value;
736 			//masterV[2] = value;
737 			//masterV[3] = value;
738 			masterV |= _mm_slli_epi32(masterV, 8);
739 			masterV |= _mm_slli_epi32(masterV, 16);
740 		} else static assert (0, "Value must be either 8 or 32 bits!");
741 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
742 		//__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
743 		//__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
744 		while(length >= 4) {
745 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
746 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
747 			__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
748 			version (cpublit_revalpha) {
749 				maskV |= _mm_srli_epi32(maskV, 8);
750 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
751 			} else {
752 				maskV |= _mm_slli_epi32(maskV, 8);
753 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
754 			}
755 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
756 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
757 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
758 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
759 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
760 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
761 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
762 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
763 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
764 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
765 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
766 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
767 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
768 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
769 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
770 			_mm_storeu_si128(cast(__m128i*)dest, _mm_packus_epi16(src_lo, src_hi));
771 			src += 4;
772 			dest += 4;
773 			mask += 4;
774 			length -= 4;
775 		}
776 		if (length >= 2) {
777 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
778 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
779 			__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
780 			version (cpublit_revalpha) {
781 				maskV |= _mm_srli_epi32(maskV, 8);
782 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
783 			} else {
784 				maskV |= _mm_slli_epi32(maskV, 8);
785 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
786 			}
787 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
788 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
789 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
790 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
791 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
792 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
793 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
794 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
795 			_mm_storel_epi64(cast(__m128i*)dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
796 			src += 2;
797 			dest += 2;
798 			mask += 2;
799 			length -= 2;
800 		}
801 		if (length) {
802 			__m128i srcV = _mm_loadu_si32(src);
803 			__m128i destV = _mm_loadu_si32(dest);
804 			__m128i maskV = _mm_loadu_si32(mask);
805 			version (cpublit_revalpha) {
806 				maskV |= _mm_srli_epi32(maskV, 8);
807 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
808 			} else {
809 				maskV |= _mm_slli_epi32(maskV, 8);
810 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
811 			}
812 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
813 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
814 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
815 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
816 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
817 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
818 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
819 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
820 			_mm_storeu_si32(dest, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
821 		}
822 	}
823 	/**
824 	 * 4 operator difference function with masking, separate destination, per-pixel alpha, and master alpha value.
825 	 */
826 	public void diffMV(M, V)(uint* src, uint* dest, uint* dest0, size_t length, M* mask, V value) {
827 		__m128i masterV;
828 		static if (is(V == uint)) {
829 			masterV[0] = value;
830 			masterV[1] = value;
831 			//masterV[2] = value;
832 			//masterV[3] = value;
833 		} else static if (is(V == ubyte)) {
834 			masterV[0] = value;
835 			masterV[1] = value;
836 			//masterV[2] = value;
837 			//masterV[3] = value;
838 			masterV |= _mm_slli_epi32(masterV, 8);
839 			masterV |= _mm_slli_epi32(masterV, 16);
840 		} else static assert (0, "Value must be either 8 or 32 bits!");
841 		masterV = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
842 		//__m128i master_1 = _mm_adds_epu16(_mm_unpacklo_epi8(masterV, SSE2_NULLVECT), cast(__m128i)ALPHABLEND_SSE2_CONST1);
843 		//__m128i master_256 = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, _mm_unpacklo_epi8(masterV, SSE2_NULLVECT));
844 		while(length >= 4) {
845 			__m128i srcV = _mm_loadu_si128(cast(__m128i*)src);
846 			__m128i destV = _mm_loadu_si128(cast(__m128i*)dest);
847 			__m128i maskV = _mm_loadu_si128(cast(__m128i*)mask);
848 			version (cpublit_revalpha) {
849 				maskV |= _mm_srli_epi32(maskV, 8);
850 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
851 			} else {
852 				maskV |= _mm_slli_epi32(maskV, 8);
853 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
854 			}
855 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
856 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
857 			__m128i mask_hi = _mm_unpackhi_epi8(maskV, SSE2_NULLVECT);
858 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
859 			mask_hi = _mm_srli_epi16(_mm_mullo_epi16(mask_hi, masterV), 8);
860 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
861 			__m128i mask0_hi = _mm_adds_epu16(mask_hi, cast(__m128i)ALPHABLEND_SSE2_CONST1);
862 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
863 			mask_hi = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_hi);
864 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
865 			__m128i src_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(srcV, SSE2_NULLVECT), mask0_hi);
866 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
867 			__m128i dest_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(destV, SSE2_NULLVECT), mask_hi);
868 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
869 			src_hi = _mm_srli_epi16(_mm_adds_epu16(src_hi, dest_hi), 8);
870 			_mm_storeu_si128(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, src_hi));
871 			dest += 4;
872 			dest0 += 4;
873 			mask += 4;
874 			length -= 4;
875 		}
876 		if (length >= 2) {
877 			__m128i srcV = _mm_loadl_epi64(cast(__m128i*)src);
878 			__m128i destV = _mm_loadl_epi64(cast(__m128i*)dest);
879 			__m128i maskV = _mm_loadl_epi64(cast(__m128i*)mask);
880 			version (cpublit_revalpha) {
881 				maskV |= _mm_srli_epi32(maskV, 8);
882 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
883 			} else {
884 				maskV |= _mm_slli_epi32(maskV, 8);
885 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
886 			}
887 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
888 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
889 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
890 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
891 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
892 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
893 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
894 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
895 			_mm_storel_epi64(cast(__m128i*)dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
896 			src += 2;
897 			dest += 2;
898 			dest0 += 2;
899 			mask += 2;
900 			length -= 2;
901 		}
902 		if (length) {
903 			__m128i srcV = _mm_loadu_si32(src);
904 			__m128i destV = _mm_loadu_si32(dest);
905 			__m128i maskV = _mm_loadu_si32(mask);
906 			version (cpublit_revalpha) {
907 				maskV |= _mm_srli_epi32(maskV, 8);
908 				maskV |= _mm_srli_epi32(maskV, 16);//[A,A,A,A]
909 			} else {
910 				maskV |= _mm_slli_epi32(maskV, 8);
911 				maskV |= _mm_slli_epi32(maskV, 16);//[A,A,A,A]
912 			}
913 			srcV = _mm_subs_epu8(_mm_max_epu8(destV, srcV), _mm_min_epu8(destV, srcV));
914 			__m128i mask_lo = _mm_unpacklo_epi8(maskV, SSE2_NULLVECT);
915 			mask_lo = _mm_srli_epi16(_mm_mullo_epi16(mask_lo, masterV), 8);
916 			__m128i mask0_lo = _mm_adds_epu16(mask_lo, cast(__m128i)ALPHABLEND_SSE2_CONST1);
917 			mask_lo = _mm_subs_epu16(cast(__m128i)ALPHABLEND_SSE2_CONST256, mask_lo);
918 			__m128i src_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(srcV, SSE2_NULLVECT), mask0_lo);
919 			__m128i dest_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(destV, SSE2_NULLVECT), mask_lo);
920 			src_lo = _mm_srli_epi16(_mm_adds_epu16(src_lo, dest_lo), 8);
921 			_mm_storeu_si32(dest0, _mm_packus_epi16(src_lo, SSE2_NULLVECT));
922 		}
923 	}
924 }
925 
/*
 * Smoke test for the difference composing functions of this module.
 *
 * All buffers are 255 elements long. For the functions visible in this file that means 63 iterations
 * of the 4-pixel loop followed by the 2-pixel and the 1-pixel tail, so every code path gets executed.
 * Every buffer is filled with a single repeated value, so the checks verify the per-pixel arithmetic
 * only; they cannot detect a pointer that fails to advance.
 * fillWithSingleValue and testArrayForValue are declared outside of this file; presumably they fill an
 * array with a value and assert that every element equals a value, respectively (TODO confirm).
 * NOTE(review): the expected constants appear to assume the default (non cpublit_revalpha) alpha
 * position, i.e. alpha in the lowest byte of each pixel (confirm).
 * Recurring pattern: an in-place call is checked on dest, dest is refilled when it is expected to have
 * changed, then the separate destination overload is checked on dest0 and dest0 is cleared.
 */
unittest {
	uint[] src, dest, dest0, mask;
	ubyte[] mask0;
	src.length = 255;
	dest.length = 255;
	dest0.length = 255;
	mask.length = 255;
	mask0.length = 255;
	fillWithSingleValue(src, 0x0f010fFF);
	fillWithSingleValue(dest, 0x010f01FF);

	//basic functions: plain difference of every channel, no alpha involved
	diff(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diff(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	//functions with blend, source alpha byte is 0xFF: expected result equals the plain difference
	diffBl(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffBl(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	//functions with blend, source alpha byte is 0x00: dest is expected to pass through unchanged
	fillWithSingleValue(src, 0x0f010f00);

	diffBl(src.ptr, dest.ptr, 255);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffBl(src.ptr, dest.ptr, dest0.ptr, 255);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	fillWithSingleValue(src, 0x0f010fFF);

	//functions with masking; mask and mask0 have not been filled yet and hold their initial zeroes,
	//so dest is expected to pass through unchanged
	diff(src.ptr, dest.ptr, 255, mask.ptr);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diff(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	//same with the 8 bit mask
	diff(src.ptr, dest.ptr, 255, mask0.ptr);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diff(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	//masks with every bit set: expected result equals the plain difference
	fillWithSingleValue(mask, uint.max);
	fillWithSingleValue(mask0, ubyte.max);

	diff(src.ptr, dest.ptr, 255, mask.ptr);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diff(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diff(src.ptr, dest.ptr, 255, mask0.ptr);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diff(src.ptr, dest.ptr, dest0.ptr, 255, mask0.ptr);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	//master value functions without blend: maximum master value is expected to give the plain
	//difference, minimum master value is expected to leave dest unchanged
	diffMV(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	//same with 32 bit master values
	diffMV(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	//master value functions with blend
	//source alpha byte is 0xFF: the outcome is decided by the master value alone
	diffMVBl(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);
	//source alpha byte is 0x00: dest is expected to pass through unchanged for every master value
	fillWithSingleValue(src, 0x0f010f00);

	diffMVBl(src.ptr, dest.ptr, 255, ubyte.max);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.max);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, ubyte.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, ubyte.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, uint.max);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.max);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMVBl(src.ptr, dest.ptr, 255, uint.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMVBl(src.ptr, dest.ptr, dest0.ptr, 255, uint.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	//master value functions with masking (only the 32 bit mask is exercised here)
	//mask with every bit set: the outcome is decided by the master value alone
	fillWithSingleValue(src, 0x0f010fFF);
	fillWithSingleValue(mask, uint.max);
	diffMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, ubyte.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, ubyte.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, uint.max);
	testArrayForValue(dest, 0x0e0e0e00);
	fillWithSingleValue(dest, 0x010f01FF);
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, uint.max);
	testArrayForValue(dest0, 0x0e0e0e00);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, uint.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, uint.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);
	//mask with every bit cleared: dest is expected to pass through unchanged for every master value
	fillWithSingleValue(mask, uint.min);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.max);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, ubyte.max);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, ubyte.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, ubyte.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, uint.max);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, uint.max);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);

	diffMV(src.ptr, dest.ptr, 255, mask.ptr, uint.min);
	testArrayForValue(dest, 0x010f01FF);
	//no refill of dest: the check above expects it to still hold 0x010f01FF
	diffMV(src.ptr, dest.ptr, dest0.ptr, 255, mask.ptr, uint.min);
	testArrayForValue(dest0, 0x010f01FF);
	fillWithSingleValue(dest0, 0);
}