1 module CPUblit.composing;
2 
3 import CPUblit.colorspaces;
4 
5 /**
6  * CPUblit
7  * Low-level image composing functions
8  * Author: Laszlo Szeremi
9  * Contains 2, 3, and 4 operand functions.
 * All blitters follow this formula: dest1 = (dest & mask) | src
11  * Two plus one operand blitter is done via evaluation on systems that don't support vector operations.
12  * Alpha-blending function formula: dest1 = (src * (1 + alpha) + dest * (256 - alpha)) >> 8
 * Where it was possible I implemented vector support. Due to various quirks (such as the need to load unaligned values, or to load less than 128/64 bits), I often
 * had to rely on assembly. As the functions themselves aren't too complicated it wasn't an impossible task, but it makes debugging time-consuming.
15  * See specific functions for more information. 
16  */
17 
//import core.simd;
//package immutable ubyte[16] NULLVECT_SSE2;
//Constant tables used by the SSE2 code paths below.
//NOTE(review): BLT32BITTESTER_* are not referenced anywhere in this part of the file — confirm they are used elsewhere.
package immutable uint[4] BLT32BITTESTER_SSE2 = [0x01000000,0x01000000,0x01000000,0x01000000];
//Addend base for the (1 + alpha) term of the alpha-blending formula (see module header).
package immutable ushort[8] ALPHABLEND_SSE2_CONST1 = [1,1,1,1,1,1,1,1];
//Subtrahend base for the (256 - alpha) term of the alpha-blending formula.
package immutable ushort[8] ALPHABLEND_SSE2_CONST256 = [256,256,256,256,256,256,256,256];
//Selects the lowest byte of each 32 bit pixel; used for on-the-fly alpha-mask generation.
package immutable ubyte[16] ALPHABLEND_SSE2_MASK = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0];
//package immutable ubyte[8] NULLVECT_MMX;
//64 bit (MMX) counterparts of the SSE2 constants above.
package immutable uint[2] BLT32BITTESTER_MMX = [0x01000000,0x01000000];
package immutable ushort[4] ALPHABLEND_MMX_CONST1 = [1,1,1,1];
package immutable ushort[4] ALPHABLEND_MMX_CONST256 = [256,256,256,256];
package immutable ubyte[8] ALPHABLEND_MMX_MASK = [255,0,0,0,255,0,0,0];
29 /**
30  * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula:
31  * mask = src == 0x00 ? 0xFF : 0x00
32  */
33 public @nogc void blitter8bit(ubyte* src, ubyte* dest, size_t length){
34 	version(X86){
35 		version(MMX){
36 			asm @nogc{
37 				pxor	MM7, MM7;
38 				mov		ESI, src[EBP];
39 				mov		EDI, dest[EBP];
40 				mov		ECX, length;
41 				cmp		ECX, 8;
42 				jl		fourpixel;
43 			eigthpixelloop:
44 				movq	MM0, [ESI];
45 				movq	MM1, [EDI];
46 				movq	MM2, MM7;
47 				pcmpeqb	MM2, MM0;
48 				pand	MM1, MM2;
49 				por		MM1, MM0;
50 				movq	[EDI], MM1;
51 				add		ESI, 8;
52 				add		EDI, 8;
53 				sub		ECX, 8;
54 				jge		eigthpixelloop;
55 			fourpixel:
56 				cmp		ECX, 4;
57 				jl		singlepixelloop;
58 				movd	MM0, [ESI];
59 				movd	MM1, [EDI];
60 				movq	MM2, MM7;
61 				pcmpeqb	MM2, MM0;
62 				pand	MM1, MM2;
63 				por		MM1, MM0;
64 				movd	[EDI], MM1;
65 				add		ESI, 4;
66 				add		EDI, 4;
67 				sub		ECX, 4;
68 			singlepixelloop:
69 				//cmp		ECX, 0;
70 				jecxz	end;
71 				mov		AL, [ESI];
72 				cmp		AL, 0;
73 				jz		step;
74 				mov		AL, [EDI];
75 			step:
76 				mov		[EDI], AL;
77 				cmp		ECX, 0;
78 				inc		ESI;
79 				inc		EDI;
80 				dec		ECX;
81 				jmp		singlepixelloop;
82 			end:
83 				emms;
84 			}
85 		}else{
86 			asm @nogc{
87 				pxor	XMM7, XMM7;
88 				mov		ESI, src[EBP];
89 				mov		EDI, dest[EBP];
90 				mov		ECX, length;
91 				cmp		ECX, 16;
92 				jl		eightpixel;
93 			sixteenpixelloop:
94 				movups	XMM0, [ESI];
95 				movups	XMM1, [EDI];
96 				movups	XMM2, XMM7;
97 				pcmpeqb	XMM2, XMM0;
98 				pand	XMM1, XMM2;
99 				por		XMM1, XMM0;
100 				movups	[EDI], XMM1;
101 				add		ESI, 16;
102 				add		EDI, 16;
103 				sub		ECX, 16;
104 				cmp		ECX, 16;
105 				jge		sixteenpixelloop;
106 			eightpixel:
107 				cmp		ECX, 8;
108 				jl		fourpixel;
109 				movq	XMM0, [ESI];
110 				movq	XMM1, [EDI];
111 				movups	XMM2, XMM7;
112 				pcmpeqb	XMM2, XMM0;
113 				pand	XMM1, XMM2;
114 				por		XMM1, XMM0;
115 				movq	[EDI], XMM1;
116 				add		ESI, 8;
117 				add		EDI, 8;
118 				sub		ECX, 8;
119 			fourpixel:
120 				cmp		ECX, 4;
121 				jl		singlepixelloop;
122 				movd	XMM0, [ESI];
123 				movd	XMM1, [EDI];
124 				movups	XMM2, XMM7;
125 				pcmpeqb	XMM2, XMM0;
126 				pand	XMM1, XMM2;
127 				por		XMM1, XMM0;
128 				movd	[EDI], XMM1;
129 				add		ESI, 4;
130 				add		EDI, 4;
131 				sub		ECX, 4;
132 			singlepixelloop:
133 				//cmp		ECX, 0;
134 				jecxz	end;
135 				mov		AL, [ESI];
136 				cmp		AL, 0;
137 				jz		step;
138 				mov		AL, [EDI];
139 			step:
140 				mov		[EDI], AL;
141 				cmp		ECX, 0;
142 				inc		ESI;
143 				inc		EDI;
144 				dec		ECX;
145 				jmp		singlepixelloop;
146 			end:
147 				;
148 			}
149 		}
150 	}else version(X86_64){
151 		asm @nogc{
152 			pxor	XMM7, XMM7;
153 			mov		RSI, src[RBP];
154 			mov		RDI, dest[RBP];
155 			mov		RCX, length;
156 			cmp		RCX, 16;
157 			jl		eightpixel;
158 		sixteenpixelloop:
159 			movups	XMM0, [RSI];
160 			movups	XMM1, [RDI];
161 			movups	XMM2, XMM7;
162 			pcmpeqb	XMM2, XMM0;
163 			pand	XMM1, XMM2;
164 			por		XMM1, XMM0;
165 			movups	[RDI], XMM1;
166 			add		RSI, 16;
167 			add		RDI, 16;
168 			sub		RCX, 16;
169 			cmp		RCX, 16;
170 			jge		sixteenpixelloop;
171 		eightpixel:
172 			cmp		RCX, 8;
173 			jl		fourpixel;
174 			movq	XMM0, [RSI];
175 			movq	XMM1, [RDI];
176 			movups	XMM2, XMM7;
177 			pcmpeqb	XMM2, XMM0;
178 			pand	XMM1, XMM2;
179 			por		XMM1, XMM0;
180 			movq	[RDI], XMM1;
181 			add		RSI, 8;
182 			add		RDI, 8;
183 			sub		RCX, 8;
184 		fourpixel:
185 			cmp		RCX, 4;
186 			jl		singlepixelloop;
187 			movd	XMM0, [RSI];
188 			movd	XMM1, [RDI];
189 			movups	XMM2, XMM7;
190 			pcmpeqb	XMM2, XMM0;
191 			pand	XMM1, XMM2;
192 			por		XMM1, XMM0;
193 			movd	[RDI], XMM1;
194 			add		RSI, 4;
195 			add		RDI, 4;
196 			sub		RCX, 4;
197 		singlepixelloop:
198 			cmp		RCX, 0;
199 			jz		end;
200 			mov		AL, [RSI];
201 			cmp		AL, 0;
202 			jz		step;
203 			mov		AL, [EDI];
204 		step:
205 			mov		[RDI], AL;
206 			cmp		RCX, 0;
207 			inc		RSI;
208 			inc		RDI;
209 			dec		RCX;
210 			jmp		singlepixelloop;
211 		end:
212 			;
213 		}
214 	}else{
215 		while(length){
216 			if(*src)
217 				*dest = *src;
218 			src++;
219 			dest++;
220 			length--;
221 		}
222 	}
223 }
224 /**
225  * Copies an 8bit image onto another without blitter. No transparency is used.
226  */
227 public @nogc void copy8bit(ubyte* src, ubyte* dest, size_t length){
228 	version(X86){
229 		version(MMX){
230 			asm @nogc{
231 				mov		ESI, src[EBP];
232 				mov		EDI, dest[EBP];
233 				mov		ECX, length;
234 				cmp		ECX, 8;
235 				jl		fourpixel;
236 			eigthpixelloop:
237 				movq	MM0, [ESI];
238 				movq	[EDI], MM0;
239 				add		ESI, 8;
240 				add		EDI, 8;
241 				sub		ECX, 8;
242 				jge		eigthpixelloop;
243 			fourpixel:
244 				cmp		ECX, 4;
245 				jl		singlepixelloop;
246 				movd	MM0, [ESI];
247 				movd	[EDI], MM0;
248 				add		ESI, 4;
249 				add		EDI, 4;
250 				sub		ECX, 4;
251 			singlepixelloop:
252 				//cmp		ECX, 0;
253 				jecxz		end;
254 				mov		AL, [ESI];
255 				mov		[EDI], AL;
256 				cmp		ECX, 0;
257 				inc		ESI;
258 				inc		EDI;
259 				dec		ECX;
260 				jnz		singlepixelloop;
261 			end:
262 				;
263 			}
264 		}else{
265 			asm @nogc{
266 				mov		ESI, src[EBP];
267 				mov		EDI, dest[EBP];
268 				mov		ECX, length;
269 				cmp		ECX, 16;
270 				jl		eightpixel;
271 			sixteenpixelloop:
272 				movups	XMM0, [ESI];
273 				movups	[EDI], XMM0;
274 				add		ESI, 16;
275 				add		EDI, 16;
276 				sub		ECX, 16;
277 				cmp		ECX, 16;
278 				jge		sixteenpixelloop;
279 			eightpixel:
280 				cmp		ECX, 8;
281 				jl		fourpixel;
282 				movq	XMM0, [ESI];
283 				movq	[EDI], XMM0;
284 				add		ESI, 8;
285 				add		EDI, 8;
286 				sub		ECX, 8;
287 			fourpixel:
288 				cmp		ECX, 4;
289 				jl		singlepixelloop;
290 				movd	XMM0, [ESI];
291 				movd	XMM1, [EDI];
292 				movups	XMM2, XMM7;
293 				pcmpeqb	XMM2, XMM0;
294 				pand	XMM1, XMM2;
295 				por		XMM1, XMM0;
296 				movd	[EDI], XMM1;
297 				add		ESI, 4;
298 				add		EDI, 4;
299 				sub		ECX, 4;
300 			singlepixelloop:
301 				//cmp		ECX, 0;
302 				jecxz		end;
303 				mov		AL, [ESI];
304 				mov		[EDI], AL;
305 				cmp		ECX, 0;
306 				inc		ESI;
307 				inc		EDI;
308 				dec		ECX;
309 				jnz		singlepixelloop;
310 			end:
311 				;
312 			}
313 		}
314 	}else version(X86_64){
315 		asm @nogc{
316 			mov		RSI, src[RBP];
317 			mov		RDI, dest[RBP];
318 			mov		RCX, length;
319 			cmp		RCX, 16;
320 			jl		eightpixel;
321 		sixteenpixelloop:
322 			movups	XMM0, [RSI];
323 			movups	[RDI], XMM0;
324 			add		RSI, 16;
325 			add		RDI, 16;
326 			sub		RCX, 16;
327 			cmp		RCX, 16;
328 			jge		sixteenpixelloop;
329 		eightpixel:
330 			cmp		RCX, 8;
331 			jl		fourpixel;
332 			movq	XMM0, [RSI];
333 			movq	[RDI], XMM0;
334 			add		RSI, 8;
335 			add		RDI, 8;
336 			sub		RCX, 8;
337 		fourpixel:
338 			cmp		RCX, 4;
339 			jl		singlepixelloop;
340 			movd	XMM1, [RSI];
341 			movd	[RDI], XMM1;
342 			add		RSI, 4;
343 			add		RDI, 4;
344 			sub		RCX, 4;
345 		singlepixelloop:
346 			cmp		RCX, 0;
347 			jz		end;
348 			mov		AL, [RSI];
349 			mov		[RDI], AL;
350 			cmp		RCX, 0;
351 			inc		RSI;
352 			inc		RDI;
353 			dec		RCX;
354 			jmp		singlepixelloop;
355 		end:
356 			;
357 		}
358 	}else{
359 		while(length){
360 			*dest = *src;
361 			src++;
362 			dest++;
363 			length--;
364 		}
365 	}
366 }
367 /**
368  * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula:
369  * mask = src == 0x0000 ? 0xFFFF : 0x0000
370  */
371 public @nogc void blitter16bit(ushort* src, ushort* dest, size_t length){
372 	version(X86){
373 		version(MMX){
374 			asm @nogc{
375 				pxor	MM7, MM7;
376 				mov		ESI, src[EBP];
377 				mov		EDI, dest[EBP];
378 				mov		ECX, length;
379 				cmp		ECX, 4;
380 				jl		twopixel;
381 			fourpixelloop:
382 				movq	MM0, [ESI];
383 				movq	MM1, [EDI];
384 				movq	MM2, MM0;
385 				pcmpeqw	MM2, MM7;
386 				pand	MM1, MM2;
387 				por		MM1, MM0;
388 				movq	[EDI], MM1;
389 				add		ESI, 8;
390 				add		EDI, 8;
391 				sub		ECX, 4;
392 				jge		fourpixelloop;
393 			twopixel:
394 				cmp		ECX, 4;
395 				jl		singlepixel;
396 				movd	MM0, [ESI];
397 				movd	MM1, [EDI];
398 				movq	MM2, MM7;
399 				pcmpeqw	MM2, MM0;
400 				pand	MM1, MM2;
401 				por		MM1, MM0;
402 				movd	[EDI], MM1;
403 				add		ESI, 4;
404 				add		EDI, 4;
405 				sub		ECX, 2;
406 			singlepixel:
407 				//cmp		ECX, 0;
408 				jecxz		end;
409 				mov		AX, [ESI];
410 				cmp		AX, 0;
411 				cmovz	AX, [EDI];
412 				mov		[EDI], AL;
413 			end:
414 				emms;
415 			}
416 		}else{
417 			asm @nogc{
418 				pxor	XMM7, XMM7;
419 				mov		ESI, src[EBP];
420 				mov		EDI, dest[EBP];
421 				mov		ECX, length;
422 				cmp		ECX, 8;
423 				jl		fourpixel;
424 			eigthpixelloop:
425 				movups	XMM0, [ESI];
426 				movups	XMM1, [EDI];
427 				movups	XMM2, XMM7;
428 				pcmpeqw	XMM2, XMM0;
429 				pand	XMM1, XMM2;
430 				por		XMM1, XMM0;
431 				movups	[EDI], XMM1;
432 				add		ESI, 16;
433 				add		EDI, 16;
434 				sub		ECX, 8;
435 				cmp		ECX, 8;
436 				jge		eigthpixelloop;
437 			fourpixel:
438 				cmp		ECX, 4;
439 				jl		twopixel;
440 				movq	XMM0, [ESI];
441 				movq	XMM1, [EDI];
442 				movups	XMM2, XMM7;
443 				pcmpeqw	XMM2, XMM0;
444 				pand	XMM1, XMM2;
445 				por		XMM1, XMM0;
446 				movq	[EDI], XMM1;
447 				add		ESI, 8;
448 				add		EDI, 8;
449 				sub		ECX, 4;
450 			twopixel:
451 				cmp		ECX, 2;
452 				jl		singlepixel;
453 				movd	XMM0, [ESI];
454 				movd	XMM1, [EDI];
455 				movups	XMM2, XMM7;
456 				pcmpeqw	XMM2, XMM0;
457 				pand	XMM1, XMM2;
458 				por		XMM1, XMM0;
459 				movd	[EDI], XMM1;
460 				add		ESI, 4;
461 				add		EDI, 4;
462 				sub		ECX, 2;
463 			singlepixel:
464 				//cmp		ECX, 0;
465 				jecxz		end;
466 				mov		AX, [ESI];
467 				cmp		AX, 0;
468 				cmovz	AX, [EDI];
469 				mov		[EDI], AX;
470 			end:
471 				;
472 			}
473 		}
474 	}else version(X86_64){
475 		asm @nogc{
476 			pxor	XMM7, XMM7;
477 			mov		RSI, src[RBP];
478 			mov		RDI, dest[RBP];
479 			mov		RCX, length;
480 			cmp		RCX, 8;
481 			jl		fourpixel;
482 		eigthpixelloop:
483 			movups	XMM0, [RSI];
484 			movups	XMM1, [RDI];
485 			movups	XMM2, XMM7;
486 			pcmpeqw	XMM2, XMM0;
487 			pand	XMM1, XMM2;
488 			por		XMM1, XMM0;
489 			movups	[RDI], XMM1;
490 			add		RSI, 8;
491 			add		RDI, 8;
492 			sub		RCX, 8;
493 			cmp		RCX, 8;
494 			jge		eigthpixelloop;
495 		fourpixel:
496 			cmp		RCX, 4;
497 			jl		twopixel;
498 			movq	XMM0, [RSI];
499 			movq	XMM1, [RDI];
500 			movups	XMM2, XMM7;
501 			pcmpeqw	XMM2, XMM0;
502 			pand	XMM1, XMM2;
503 			por		XMM1, XMM0;
504 			movq	[RDI], XMM1;
505 			add		RSI, 4;
506 			add		RDI, 4;
507 			sub		RCX, 4;
508 		twopixel:
509 			cmp		RCX, 2;
510 			jl		singlepixel;
511 			movd	XMM0, [RSI];
512 			movd	XMM1, [RDI];
513 			movups	XMM2, XMM7;
514 			pcmpeqw	XMM2, XMM0;
515 			pand	XMM1, XMM2;
516 			por		XMM1, XMM0;
517 			movd	[RDI], XMM1;
518 			add		RSI, 2;
519 			add		RDI, 2;
520 			sub		RCX, 2;
521 		singlepixel:
522 			cmp		RCX, 0;
523 			jz		end;
524 			mov		AX, [RSI];
525 			cmp		AX, 0;
526 			cmovz	AX, [RDI];
527 			mov		[RDI], AX;
528 		end:
529 			;
530 		}
531 	}else{
532 		while(length){
533 			if(*src)
534 				*dest = *src;
535 			src++;
536 			dest++;
537 			length--;
538 		}
539 	}
540 }
541 /**
542  * Copies a 16bit image onto another without blitter. No transparency is used.
543  */
544 public @nogc void copy16bit(ushort* src, ushort* dest, size_t length){
545 	version(X86){
546 		version(MMX){
547 				asm @nogc{
548 				mov		ESI, src[EBP];
549 				mov		EDI, dest[EBP];
550 				mov		ECX, length;
551 				cmp		ECX, 4;
552 				//pxor	MM7, MM7;
553 				jl		twopixel;
554 			fourpixelloop:
555 				movq	MM0, [ESI];
556 				movq	[EDI], MM0;
557 				add		ESI, 8;
558 				add		EDI, 8;
559 				sub		ECX, 4;
560 				jge		fourpixelloop;
561 			twopixel:
562 				cmp		ECX, 4;
563 				jl		singlepixel;
564 				movd	MM0, [ESI];
565 				movd	[EDI], MM0;
566 				add		ESI, 4;
567 				add		EDI, 4;
568 				sub		ECX, 2;
569 			singlepixel:
570 				cmp		ECX, 0;
571 				jz		end;
572 				mov		AX, [ESI];
573 				mov		[EDI], AL;
574 			end:
575 				emms;
576 			}
577 		}else{
578 			asm @nogc{
579 				mov		ESI, src[EBP];
580 				mov		EDI, dest[EBP];
581 				mov		ECX, length;
582 				cmp		ECX, 8;
583 				//pxor	XMM7, XMM7;
584 				jl		fourpixel;
585 			eigthpixelloop:
586 				movups	XMM0, [ESI];
587 				movups	[EDI], XMM0;
588 				add		ESI, 16;
589 				add		EDI, 16;
590 				sub		ECX, 8;
591 				cmp		ECX, 8;
592 				jge		eigthpixelloop;
593 			fourpixel:
594 				cmp		ECX, 4;
595 				jl		twopixel;
596 				movq	XMM0, [ESI];
597 				movq	[EDI], XMM0;
598 				add		ESI, 8;
599 				add		EDI, 8;
600 				sub		ECX, 4;
601 			twopixel:
602 				cmp		ECX, 2;
603 				jl		singlepixel;
604 				movd	XMM0, [ESI];
605 				movd	[EDI], XMM0;
606 				add		ESI, 4;
607 				add		EDI, 4;
608 				sub		ECX, 2;
609 			singlepixel:
610 				cmp		ECX, 0;
611 				jz		end;
612 				mov		AL, [ESI];
613 				mov		[EDI], AL;
614 			end:
615 				;
616 			}
617 		}
618 	}else version(X86_64){
619 		asm @nogc{
620 			mov		RSI, src[RBP];
621 			mov		RDI, dest[RBP];
622 			mov		RCX, length;
623 			cmp		RCX, 8;
624 			//pxor	XMM7, XMM7;
625 			jl		fourpixel;
626 		eigthpixelloop:
627 			movups	XMM0, [RSI];
628 			movups	[RDI], XMM0;
629 			add		RSI, 8;
630 			add		RDI, 8;
631 			sub		RCX, 8;
632 			cmp		RCX, 8;
633 			jge		eigthpixelloop;
634 		fourpixel:
635 			cmp		RCX, 4;
636 			jl		twopixel;
637 			movq	XMM0, [RSI];
638 			movq	[RDI], XMM0;
639 			add		RSI, 4;
640 			add		RDI, 4;
641 			sub		RCX, 4;
642 		twopixel:
643 			cmp		RCX, 2;
644 			jl		singlepixel;
645 			movd	XMM0, [RSI];
646 			movd	[RDI], XMM0;
647 			add		RSI, 2;
648 			add		RDI, 2;
649 			sub		RCX, 2;
650 		singlepixel:
651 			cmp		RCX, 0;
652 			jz		end;
653 			mov		AL, [RSI];;
654 			mov		[RDI], AL;
655 		end:
656 			;
657 		}
658 	}else{
659 		while(length){
660 			*dest = *src;
661 			src++;
662 			dest++;
663 			length--;
664 		}
665 	}
666 }
667 /**
668  * Two plus one operand blitter for 32 bit values. Automatic mask-generation is used from the source's alpha channel with the following formula:
669  * mask = src.alpha == 0x00 ? 0xFFFFFFFF : 0x00000000
670  */
671 public @nogc void blitter32bit(uint* src, uint* dest, size_t length){
672 	version(X86){
673 		version(MMX){
674 			asm @nogc{
675 				mov		ESI, src[EBP];
676 				mov		EDI, dest[EBP];
677 				mov		ECX, length;
678 				movq	MM6, ALPHABLEND_MMX_MASK;
679 				pxor	MM7, MM7;
680 				cmp		ECX, 2;
681 				jl		twopixel;
682 			twopixelloop:
683 				movq	MM0, [ESI];
684 				movq	MM1, [EDI];
685 				movq	MM2, MM0;
686 				pand	MM2, MM6;
687 				pcmpeqd	MM2, MM7;
688 				pand	MM1, MM2;
689 				por		MM1, MM0;
690 				movq	[EDI], MM1;
691 				add		ESI, 8;
692 				add		EDI, 8;
693 				sub		ECX, 2;
694 				jge		fourpixelloop;
695 			onepixel:
696 				cmp		ECX, 1;
697 				jl		end;
698 				movd	MM0, [ESI];
699 				movd	MM1, [EDI];
700 				movq	MM2, MM0;
701 				pand	MM2, MM6;
702 				pcmpeqd	MM2, MM7;
703 				pand	MM1, MM2;
704 				por		MM1, MM0;
705 				movd	[EDI], MM1;
706 			end:
707 				emms;
708 			}
709 		}else{
710 			asm @nogc{
711 				mov		ESI, src[EBP];
712 				mov		EDI, dest[EBP];
713 				mov		ECX, length;
714 				movups	XMM6, ALPHABLEND_SSE2_MASK;
715 				pxor	XMM7, XMM7;
716 				cmp		ECX, 8;
717 				jl		twopixel;
718 			fourpixelloop:
719 				movups	XMM0, [ESI];
720 				movups	XMM1, [EDI];
721 				movups	XMM2, XMM0;
722 				pand	XMM2, XMM6;
723 				pcmpeqd	XMM2, XMM7;
724 				pand	XMM1, XMM2;
725 				por		XMM1, XMM0;
726 				movups	[EDI], XMM1;
727 				add		ESI, 16;
728 				add		EDI, 16;
729 				sub		ECX, 4;
730 				cmp		ECX, 4;
731 				jge		fourpixelloop;
732 			twopixel:
733 				cmp		ECX, 2;
734 				jl		onepixel;
735 				movq	XMM0, [ESI];
736 				movq	XMM1, [EDI];
737 				movq	XMM2, XMM0;
738 				pand	XMM2, XMM6;
739 				pcmpeqd	XMM2, XMM7;
740 				pand	XMM1, XMM2;
741 				por		XMM1, XMM0;
742 				movq	[EDI], XMM1;
743 				add		ESI, 8;
744 				add		EDI, 8;
745 				sub		ECX, 2;
746 			onepixel:
747 				cmp		ECX, 1;
748 				jl		end;
749 				movd	XMM0, [ESI];
750 				movd	XMM1, [EDI];
751 				movq	XMM2, XMM0;
752 				pand	XMM2, XMM6;
753 				pcmpeqd	XMM2, XMM7;
754 				pand	XMM1, XMM2;
755 				por		XMM1, XMM0;
756 				movd	[EDI], XMM1;
757 			end:
758 				;
759 			}
760 		}
761 	}else version(X86_64){
762 		asm @nogc{
763 			mov		RSI, src[RBP];
764 			mov		RDI, dest[RBP];
765 			mov		RCX, length;
766 			movups	XMM6, ALPHABLEND_SSE2_MASK;
767 			pxor	XMM7, XMM7;
768 			cmp		ECX, 8;
769 			jl		twopixel;
770 		fourpixelloop:
771 			movups	XMM0, [RSI];
772 			movups	XMM1, [RDI];
773 			movups	XMM2, XMM0;
774 			pand	XMM2, XMM6;
775 			pcmpeqd	XMM2, XMM7;
776 			pand	XMM1, XMM2;
777 			por		XMM1, XMM0;
778 			movups	[RDI], XMM1;
779 			add		RSI, 4;
780 			add		RDI, 4;
781 			sub		RCX, 4;
782 			cmp		RCX, 4;
783 			jge		fourpixelloop;
784 		twopixel:
785 			cmp		RCX, 2;
786 			jl		onepixel;
787 			movq	XMM0, [RSI];
788 			movq	XMM1, [RDI];
789 			movq	XMM2, XMM0;
790 			pand	XMM2, XMM6;
791 			pcmpeqd	XMM2, XMM7;
792 			pand	XMM1, XMM2;
793 			por		XMM1, XMM0;
794 			movq	[RDI], XMM1;
795 			add		RSI, 2;
796 			add		RDI, 2;
797 			sub		RCX, 2;
798 		onepixel:
799 			cmp		RCX, 1;
800 			jl		end;
801 			movd	XMM0, [RSI];
802 			movd	XMM1, [RDI];
803 			movq	XMM2, XMM0;
804 			pand	XMM2, XMM6;
805 			pcmpeqd	XMM2, XMM7;
806 			pand	XMM1, XMM2;
807 			por		XMM1, XMM0;
808 			movd	[RDI], XMM1;
809 		end:
810 			;
811 		}
812 	}else{
813 		while(length){
814 			if(*cast(Pixel32Bit)src.ColorSpaceARGB.alpha)
815 				*dest = *src;
816 			src++;
817 			dest++;
818 			length--;
819 		}
820 	}
821 }
822 /**
823  * Implements a two plus one operand alpha-blending algorithm for 32bit bitmaps. Automatic alpha-mask generation follows this formula:
824  * src[B,G,R,A] --> mask [A,A,A,A]
825  */
826 public @nogc void alphaBlend32bit(uint* src, uint* dest, size_t length){
827 	version(X86){
828 		version(MMX){
829 			int target8 = length/8, target4 = length%2;
830 			asm @nogc {
831 				//setting up the pointer registers and the counter register
832 				//mov		EBX, alpha[EBP];
833 				mov		ESI, src[EBP];
834 				mov		EDI, dest[EBP];
835 				mov		ECX, target8;
836 				cmp		ECX, 0;
837 				jz		fourpixelblend; //skip 16 byte operations if not needed
838 				//iteration cycle entry point
839 			sixteenpixelblend:
840 				//create alpha mask on the fly
841 				movq	MM3, [ESI];
842 				movq	MM1, MM3;
843 				pand	MM1, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
844 				movq	MM0, MM1;
845 				pslld	MM0, 8;
846 				por		MM1, MM0;	//mask is ready for RA
847 				pslld	MM1, 16;
848 				por		MM0, MM1; //mask is ready for BGRA
849 				movq	MM1, MM0;
850 				punpcklbw	MM0, MM2;
851 				punpckhbw	MM1, MM2;
852 				movq	MM6, ALPHABLEND_MMX_CONST256;
853 				movq	MM7, MM6;
854 				movq	MM4, ALPHABLEND_MMX_CONST1;
855 				movq	MM5, MM4;
856 			
857 				paddusw	MM4, MM0;	//1 + alpha01
858 				paddusw	MM5, MM1; //1 + alpha23 
859 				psubusw	MM6, MM0;	//256 - alpha01
860 				psubusw	MM7, MM1; //256 - alpha23
861 			
862 				//moving the values to their destinations
863 				movq	MM0, MM3;	//src01
864 				movq	MM1, MM0; //src23
865 				punpcklbw	MM0, MM2;
866 				punpckhbw	MM1, MM2;
867 				pmullw	MM4, MM0;	//src01 * (1 + alpha01)
868 				pmullw	MM5, MM1;	//src23 * (1 + alpha23)
869 				movq	MM0, [EDI];	//dest01
870 				movq	MM1, MM0;		//dest23
871 				punpcklbw	MM0, MM2;
872 				punpckhbw	MM1, MM2;
873 				pmullw	MM6, MM0;	//dest01 * (256 - alpha)
874 				pmullw	MM7, MM1; //dest23 * (256 - alpha)
875 		
876 				paddusw	MM4, MM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
877 				paddusw	MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
878 				psrlw	MM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
879 				psrlw	MM5, 8;
880 				//moving the result to its place;
881 				//pxor	MM2, MM2;
882 				packuswb	MM4, MM5;
883 		
884 				movq	[EDI], MM4;
885 				//add		EBX, 16;
886 				add		ESI, 8;
887 				add		EDI, 8;
888 				dec		ECX;
889 				cmp		ECX, 0;
890 				jnz		sixteenpixelblend;
891 				fourpixelblend:
892 				mov		ECX, target4;
893 				cmp		ECX, 0;
894 				jz		endofalgorithm;
895 				fourpixelblendloop:
896 
897 				//movd	XMM6, [EBX];//alpha
898 			
899 
900 				movd	MM0, [EDI];
901 				movd	MM1, [ESI];
902 				punpcklbw	MM0, MM2;//dest
903 				punpcklbw	MM1, MM2;//src
904 				movups	MM6, MM1;
905 				pand	MM6, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
906 				movups	MM7, MM6;
907 				pslld	MM6, 8;
908 				por		MM7, MM6;	//mask is ready for RA
909 				pslld	MM7, 16;
910 				por		MM6, MM7; //mask is ready for GRA
911 				punpcklbw	MM7, MM2;
912 				movaps	MM4, ALPHABLEND_MMX_CONST256;
913 				movaps	MM5, ALPHABLEND_MMX_CONST1;
914 				
915 				paddusw MM5, MM6;//1+alpha
916 				psubusw	MM4, MM6;//256-alpha
917 				
918 				pmullw	MM0, MM4;//dest*(256-alpha)
919 				pmullw	MM1, MM5;//src*(1+alpha)
920 				paddusw	MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha))
921 				psrlw	MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
922 				
923 				packuswb	MM0, MM2;
924 				
925 				movd	[EDI], MM0;	
926 
927 			endofalgorithm:
928 				emms;
929 			}
930 		}else{
931 			int target16 = length/4, target4 = length%4;
932 			asm @nogc {
933 				//setting up the pointer registers and the counter register
934 				//mov		EBX, alpha[EBP];
935 				mov		ESI, src[EBP];
936 				mov		EDI, dest[EBP];
937 				mov		ECX, target16;
938 				cmp		ECX, 0;
939 				jz		fourpixelblend; //skip 16 byte operations if not needed
940 				//iteration cycle entry point
941 			sixteenpixelblend:
942 				//create alpha mask on the fly
943 				movups	XMM3, [ESI];
944 				movups	XMM1, XMM3;
945 				pand	XMM1, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
946 				movups	XMM0, XMM1;
947 				pslld	XMM0, 8;
948 				por		XMM1, XMM0;	//mask is ready for RA
949 				pslld	XMM1, 16;
950 				por		XMM0, XMM1; //mask is ready for BGRA/**/
951 				movups	XMM1, XMM0;
952 				
953 				punpcklbw	XMM0, XMM2;
954 				punpckhbw	XMM1, XMM2;
955 				movups	XMM6, ALPHABLEND_SSE2_CONST256;
956 				movups	XMM7, XMM6;
957 				movups	XMM4, ALPHABLEND_SSE2_CONST1;
958 				movups	XMM5, XMM4;
959 			
960 				paddusw	XMM4, XMM0;	//1 + alpha01
961 				paddusw	XMM5, XMM1; //1 + alpha23 
962 				psubusw	XMM6, XMM0;	//256 - alpha01
963 				psubusw	XMM7, XMM1; //256 - alpha23
964 				
965 				//moving the values to their destinations
966 
967 				movups	XMM0, XMM3;	//src01
968 				movups	XMM1, XMM0; //src23
969 				punpcklbw	XMM0, XMM2;
970 				punpckhbw	XMM1, XMM2;
971 				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
972 				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
973 				movups	XMM0, [EDI];	//dest01
974 				movups	XMM1, XMM0;		//dest23
975 				punpcklbw	XMM0, XMM2;
976 				punpckhbw	XMM1, XMM2;
977 				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
978 				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
979 			
980 				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
981 				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
982 				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
983 				psrlw	XMM5, 8;
984 				//moving the result to its place;
985 				//pxor	MM2, MM2;
986 				packuswb	XMM4, XMM5;
987 			
988 				movups	[EDI], XMM4;
989 				//add		EBX, 16;
990 				add		ESI, 16;
991 				add		EDI, 16;
992 				dec		ECX;
993 				cmp		ECX, 0;
994 				jnz		sixteenpixelblend;
995 
996 			fourpixelblend:
997 
998 				mov		ECX, target4;
999 				cmp		ECX, 0;
1000 				jz		endofalgorithm;
1001 
1002 			fourpixelblendloop:
1003 
1004 				//movd	XMM6, [EBX];//alpha
1005 				
1006 
1007 				movd	XMM0, [EDI];
1008 				movd	XMM1, [ESI];
1009 				punpcklbw	XMM0, XMM2;//dest
1010 				punpcklbw	XMM1, XMM2;//src
1011 				movups	XMM6, XMM1;
1012 				pand	XMM6, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
1013 				movups	XMM7, XMM6;
1014 				pslld	XMM6, 8;
1015 				por		XMM7, XMM6;	//mask is ready for RA
1016 				pslld	XMM7, 16;
1017 				por		XMM6, XMM7; //mask is ready for BGRA
1018 				
1019 				punpcklbw	XMM6, XMM2;
1020 				
1021 				movaps	XMM4, ALPHABLEND_SSE2_CONST256;
1022 				movaps	XMM5, ALPHABLEND_SSE2_CONST1;
1023 				
1024 				paddusw XMM5, XMM6;//1+alpha
1025 				psubusw	XMM4, XMM6;//256-alpha
1026 				
1027 				pmullw	XMM0, XMM4;//dest*(256-alpha)
1028 				pmullw	XMM1, XMM5;//src*(1+alpha)
1029 				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
1030 				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
1031 				
1032 				packuswb	XMM0, XMM2;
1033 				
1034 				movd	[EDI], XMM0;
1035 				
1036 				add		ESI, 4;
1037 				add		EDI, 4;/**/
1038 				dec		ECX;
1039 				cmp		ECX, 0;
1040 				jnz		fourpixelblendloop;
1041 
1042 			endofalgorithm:
1043 				;
1044 			}
1045 		}
1046 	}else version(X86_64){
1047 		size_t target16 = length/4, target4 = length%4;
1048 			asm @nogc {
1049 				//setting up the pointer registers and the counter register
1050 				//mov		EBX, alpha[EBP];
1051 				mov		RSI, src[RBP];
1052 				mov		RDI, dest[RBP];
1053 				mov		RCX, target16;
1054 				movups	XMM8, ALPHABLEND_SSE2_CONST256;
1055 				movups	XMM9, ALPHABLEND_SSE2_CONST1;
1056 				movups	XMM10, ALPHABLEND_SSE2_MASK;
1057 				cmp		RCX, 8;
1058 				jl		fourpixelblend; //skip 16 byte operations if not needed
1059 				//iteration cycle entry point
1060 			sixteenpixelblend:
1061 				//create alpha mask on the fly
1062 				movups	XMM3, [RSI];
1063 				movups	XMM1, XMM3;
1064 				pand	XMM1, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
1065 				movups	XMM0, XMM1;
1066 				pslld	XMM0, 8;
1067 				por		XMM1, XMM0;	//mask is ready for RA
1068 				pslld	XMM1, 16;
1069 				por		XMM0, XMM1; //mask is ready for BGRA/**/
1070 				movups	XMM1, XMM0;
1071 				
1072 				punpcklbw	XMM0, XMM2;
1073 				punpckhbw	XMM1, XMM2;
1074 				movups	XMM6, XMM8;
1075 				movups	XMM7, XMM8;
1076 				movups	XMM4, XMM9;
1077 				movups	XMM5, XMM9;
1078 			
1079 				paddusw	XMM4, XMM0;	//1 + alpha01
1080 				paddusw	XMM5, XMM1; //1 + alpha23 
1081 				psubusw	XMM6, XMM0;	//256 - alpha01
1082 				psubusw	XMM7, XMM1; //256 - alpha23
1083 				
1084 				//moving the values to their destinations
1085 
1086 				movups	XMM0, XMM3;	//src01
1087 				movups	XMM1, XMM0; //src23
1088 				punpcklbw	XMM0, XMM2;
1089 				punpckhbw	XMM1, XMM2;
1090 				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
1091 				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
1092 				movups	XMM0, [EDI];	//dest01
1093 				movups	XMM1, XMM0;		//dest23
1094 				punpcklbw	XMM0, XMM2;
1095 				punpckhbw	XMM1, XMM2;
1096 				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
1097 				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
1098 			
1099 				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
1100 				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
1101 				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
1102 				psrlw	XMM5, 8;
1103 				//moving the result to its place;
1104 				//pxor	MM2, MM2;
1105 				packuswb	XMM4, XMM5;
1106 			
1107 				movups	[RDI], XMM4;
1108 				//add		EBX, 16;
1109 				add		RSI, 16;
1110 				add		RDI, 16;
1111 				dec		RCX;
1112 				cmp		RCX, 0;
1113 				jnz		sixteenpixelblend;
1114 
1115 			fourpixelblend:
1116 
1117 				mov		RCX, target4;
1118 				cmp		RCX, 0;
1119 				jz		endofalgorithm;
1120 
1121 			fourpixelblendloop:
1122 
1123 				//movd	XMM6, [EBX];//alpha
1124 				
1125 
1126 				movd	XMM0, [RDI];
1127 				movd	XMM1, [RSI];
1128 				punpcklbw	XMM0, XMM2;//dest
1129 				punpcklbw	XMM1, XMM2;//src
1130 				movups	XMM6, XMM1;
1131 				pand	XMM6, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
1132 				movups	XMM7, XMM6;
1133 				pslld	XMM6, 8;
1134 				por		XMM7, XMM6;	//mask is ready for RA
1135 				pslld	XMM7, 16;
1136 				por		XMM6, XMM7; //mask is ready for BGRA
1137 				
1138 				punpcklbw	XMM6, XMM2;
1139 				
1140 				movaps	XMM4, XMM8;
1141 				movaps	XMM5, XMM9;
1142 				
1143 				paddusw XMM5, XMM6;//1+alpha
1144 				psubusw	XMM4, XMM6;//256-alpha
1145 				
1146 				pmullw	XMM0, XMM4;//dest*(256-alpha)
1147 				pmullw	XMM1, XMM5;//src*(1+alpha)
1148 				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
1149 				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
1150 				
1151 				packuswb	XMM0, XMM2;
1152 				
1153 				movd	[RDI], XMM0;
1154 				
1155 				add		RSI, 4;
1156 				add		RDI, 4;/**/
1157 				dec		RCX;
1158 				cmp		RCX, 0;
1159 				jnz		fourpixelblendloop;
1160 
1161 			endofalgorithm:
1162 				;
1163 		}
1164 	}else{
1165 		for(int i ; i < length ; i++){
1166 			switch(src.ColorSpaceARGB.alpha){
1167 				case 0: 
1168 					break;
1169 				case 255: 
1170 					dest = src;
1171 					break;
1172 				default:
1173 					int src1 = 1 + src.ColorSpaceARGB.alpha;
1174 					int src256 = 256 - src.ColorSpaceARGB.alpha;
1175 					dest.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8);
1176 					dest.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8);
1177 					dest.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8);
1178 					break;
1179 			}
1180 			src++;
1181 			dest++;
1182 		}
1183 	}
1184 }
1185 /**
1186  * Copies a 32bit image onto another without blitter. No transparency is used.
1187  */
1188 public @nogc void copy32bit(uint* src, uint* dest, size_t length){
1189 	version(X86){
1190 		version(MMX){
1191 			asm @nogc{
1192 				mov		ESI, src[EBP];
1193 				mov		EDI, dest[EBP];
1194 				mov		ECX, length;
1195 				movq	MM6, ALPHABLEND_MMX_MASK;
1196 				pxor	MM7, MM7;
1197 				cmp		ECX, 2;
1198 				jl		twopixel;
1199 			twopixelloop:
1200 				movq	MM0, [ESI];
1201 				movq	[EDI], MM0;
1202 				add		ESI, 8;
1203 				add		EDI, 8;
1204 				sub		ECX, 2;
1205 				jge		fourpixelloop;
1206 			onepixel:
1207 				cmp		ECX, 1;
1208 				jl		end;
1209 				movd	MM0, [ESI];;
1210 				movd	[EDI], MM0;
1211 				add		ESI, 2;
1212 				add		EDI, 2;
1213 				sub		ECX, 2;
1214 			end:
1215 				emms;
1216 			}
1217 		}else{
1218 			asm @nogc{
1219 				mov		ESI, src[EBP];
1220 				mov		EDI, dest[EBP];
1221 				mov		ECX, length;
1222 				movups	XMM6, ALPHABLEND_SSE2_MASK;
1223 				pxor	XMM7, XMM7;
1224 				cmp		ECX, 8;
1225 				jl		twopixel;
1226 			eigthpixelloop:
1227 				movups	XMM0, [ESI];
1228 				movups	[EDI], XMM0;
1229 				add		ESI, 16;
1230 				add		EDI, 16;
1231 				sub		ECX, 4;
1232 				cmp		ECX, 4;
1233 				jge		eigthpixelloop;
1234 			twopixel:
1235 				cmp		ECX, 2;
1236 				jl		onepixel;
1237 				movq	XMM0, [ESI];
1238 				movq	[EDI], XMM0;
1239 				add		ESI, 8;
1240 				add		EDI, 8;
1241 				sub		ECX, 2;
1242 			onepixel:
1243 				cmp		ECX, 1;
1244 				jl		end;
1245 				movd	XMM0, [ESI];
1246 				movd	[EDI], XMM0;
1247 			end:
1248 				;
1249 			}
1250 		}
1251 	}else version(X86_64){
1252 		asm @nogc{
1253 			mov		RSI, src[RBP];
1254 			mov		RDI, dest[RBP];
1255 			mov		RCX, length;
1256 			movups	XMM6, ALPHABLEND_SSE2_MASK;
1257 			pxor	XMM7, XMM7;
1258 			cmp		ECX, 8;
1259 			jl		twopixel;
1260 		eigthpixelloop:
1261 			movups	XMM0, [RSI];
1262 			movups	[RDI], XMM0;
1263 			add		RSI, 16;
1264 			add		RDI, 16;
1265 			sub		RCX, 4;
1266 			cmp		RCX, 4;
1267 			jge		eigthpixelloop;
1268 		twopixel:
1269 			cmp		RCX, 2;
1270 			jl		onepixel;
1271 			movq	XMM0, [RSI];
1272 			movq	[RDI], XMM0;
1273 			add		RSI, 8;
1274 			add		RDI, 8;
1275 			sub		RCX, 2;
1276 		onepixel:
1277 			cmp		RCX, 1;
1278 			jl		end;
1279 			movd	XMM0, [RSI];
1280 			movd	[RDI], XMM0;
1281 		end:
1282 			;
1283 		}
1284 	}else{
1285 		while(length){
1286 			if(*src.ColorSpaceARGB.alpha)
1287 				*dest = *src;
1288 			src++;
1289 			dest++;
1290 			length--;
1291 		}
1292 	}
1293 }
1294 /**
1295  * Three plus one operand blitter for 8 bit values. Uses an external mask.
1296  */
public @nogc void blitter8bit(ubyte* src, ubyte* dest, size_t length, ubyte* mask){
	// For every byte: dest = (dest & mask) | src, using the caller-supplied mask.
	version(X86){
		version(MMX){
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining bytes
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 8;
				jl		fourpixel;
			eigthpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDI], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 8;
				cmp		ECX, 8;		// FIX: `sub/jge` alone ran one extra iteration when length was a multiple of 8 (buffer overrun)
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movd	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDI], MM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				mov		AH, [EDI];
				and		AH, [EBX];
				or		AH, AL;
				mov		[EDI], AH;
				inc		ESI;
				inc		EDI;
				inc		EBX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				emms;
			}
		}else{
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining bytes
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 16;	// (removed dead `pxor XMM7, XMM7` — XMM7 was never used)
				jl		eightpixel;
			sixteenpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDI], XMM1;
				add		ESI, 16;
				add		EDI, 16;
				add		EBX, 16;
				sub		ECX, 16;
				cmp		ECX, 16;
				jge		sixteenpixelloop;
			eightpixel:
				cmp		ECX, 8;
				jl		fourpixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDI], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 8;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movd	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDI], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				mov		AH, [EDI];
				and		AH, [EBX];
				or		AH, AL;
				mov		[EDI], AH;
				inc		ESI;
				inc		EDI;
				inc		EBX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			// RSI = src, RDI = dest, RBX = mask, RCX = remaining bytes
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RCX, length;
			cmp		RCX, 16;
			jl		eightpixel;
		sixteenpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDI], XMM1;
			add		RSI, 16;
			add		RDI, 16;
			add		RBX, 16;
			sub		RCX, 16;
			cmp		RCX, 16;
			jge		sixteenpixelloop;
		eightpixel:
			cmp		RCX, 8;
			jl		fourpixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDI], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RBX, 8;
			sub		RCX, 8;
		fourpixel:
			cmp		RCX, 4;
			jl		singlepixelloop;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movd	XMM2, [RBX];	// FIX: was `movups` — a 16-byte read where only 4 bytes are valid (over-read past the mask)
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDI], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RBX, 4;
			sub		RCX, 4;
		singlepixelloop:
			cmp		RCX, 0;
			jz		end;
			mov		AL, [RSI];
			mov		AH, [RDI];
			and		AH, [RBX];
			or		AH, AL;
			mov		[RDI], AH;
			inc		RSI;
			inc		RDI;
			inc		RBX;
			dec		RCX;
			jmp		singlepixelloop;
		end:
			;
		}
	}else{
		// FIX: the asm paths apply the formula unconditionally; the old `if(*src)`
		// skipped mask application whenever src was zero.
		while(length){
			*dest = (*dest & *mask) | *src;
			src++;
			dest++;
			mask++;
			length--;
		}
	}
}
1488 /**
1489  * Copies an 8bit image onto another without blitter. No transparency is used. Mask is placeholder.
1490  */
public @nogc void copy8bit(ubyte* src, ubyte* dest, size_t length, ubyte* mask){
	// The mask parameter is a placeholder for signature compatibility and is ignored;
	// the work is delegated to the three-operand copy8bit (defined elsewhere in this file).
	copy8bit(src,dest,length);
}
1494 /**
1495  * Three plus one operand blitter for 8 bit values. An external mask is used for this operation.
1496  */
public @nogc void blitter16bit(ushort* src, ushort* dest, size_t length, ushort* mask){
	// For every 16-bit value: dest = (dest & mask) | src, using the caller-supplied mask.
	version(X86){
		version(MMX){
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining pixels
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 4;
				jl		twopixel;
			fourpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDI], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 4;
				cmp		ECX, 4;		// FIX: `sub/jge` alone ran one extra iteration at exact multiples of 4 (overrun)
				jge		fourpixelloop;
			twopixel:
				cmp		ECX, 2;		// FIX: compared against 4; this branch handles 2 pixels
				jl		singlepixel;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movd	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDI], MM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;		// at most one pixel can remain here
				mov		AX, [EBX];
				and		AX, [EDI];
				or		AX, [ESI];
				mov		[EDI], AX;
			end:
				emms;
			}
		}else{
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining pixels
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 8;		// (removed dead `pxor XMM7, XMM7` — XMM7 was never used)
				jl		fourpixel;
			eigthpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDI], XMM1;
				add		ESI, 16;
				add		EDI, 16;
				add		EBX, 16;
				sub		ECX, 8;
				cmp		ECX, 8;
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		twopixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDI], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 4;
			twopixel:
				cmp		ECX, 2;
				jl		singlepixel;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movd	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDI], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;
				mov		AX, [EBX];
				and		AX, [EDI];
				or		AX, [ESI];
				mov		[EDI], AX;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			// RSI = src, RDI = dest, RBX = mask, RCX = remaining pixels
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RCX, length;
			cmp		RCX, 8;
			jl		fourpixel;
		eigthpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDI], XMM1;
			add		RSI, 16;
			add		RDI, 16;
			add		RBX, 16;
			sub		RCX, 8;
			cmp		RCX, 8;
			jge		eigthpixelloop;
		fourpixel:
			cmp		RCX, 4;
			jl		twopixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDI], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RBX, 8;
			sub		RCX, 4;
		twopixel:
			cmp		RCX, 2;
			jl		singlepixel;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movd	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDI], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RBX, 4;
			sub		RCX, 2;
		singlepixel:
			cmp		RCX, 0;
			jz		end;
			mov		AX, [RBX];
			and		AX, [RDI];
			or		AX, [RSI];
			mov		[RDI], AX;
		end:
			;
		}
	}else{
		while(length){
			*dest = (*dest & *mask) | *src;
			src++;
			dest++;
			mask++;
			length--;
		}
	}
}
1670 /**
1671  * Copies a 16bit image onto another without blitter. No transparency is used. Mask is a placeholder for easy exchangeability with other functions.
1672  */
public @nogc void copy16bit(ushort* src, ushort* dest, size_t length, ushort* mask){
	// The mask parameter is a placeholder for signature compatibility and is ignored;
	// the work is delegated to the three-operand copy16bit (defined elsewhere in this file).
	copy16bit(src,dest,length);
}
1676 /**
1677  * Two plus one operand blitter for 32 bit values. A separate mask is used for the operation.
1678  */
public @nogc void blitter32bit(uint* src, uint* dest, size_t length, uint* mask){
	// For every 32-bit pixel: dest = (dest & mask) | src, using the caller-supplied mask.
	version(X86){
		version(MMX){
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining pixels
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 2;		// (removed dead MM6/MM7 loads — never used by this routine)
				jl		onepixel;
			twopixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDI], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 2;
				cmp		ECX, 2;		// FIX: jumped to nonexistent `fourpixelloop` and lacked a bound re-check
				jge		twopixelloop;
			onepixel:
				jecxz	end;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movd	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDI], MM1;
			end:
				emms;
			}
		}else{
			asm @nogc{
				// ESI = src, EDI = dest, EBX = mask, ECX = remaining pixels
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, length;
				cmp		ECX, 4;		// (removed dead XMM6/XMM7 loads — never used by this routine)
				jl		twopixel;
			fourpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDI], XMM1;
				add		ESI, 16;
				add		EDI, 16;
				add		EBX, 16;
				sub		ECX, 4;
				cmp		ECX, 4;
				jge		fourpixelloop;
			twopixel:
				cmp		ECX, 2;
				jl		onepixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDI], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				sub		ECX, 2;
			onepixel:
				jecxz	end;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movd	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDI], XMM1;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			// RSI = src, RDI = dest, RBX = mask, RCX = remaining pixels
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RCX, length;
			cmp		RCX, 4;			// FIX: compared ECX although the count was loaded into RCX
			jl		twopixel;
		fourpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDI], XMM1;
			add		RSI, 16;
			add		RDI, 16;
			add		RBX, 16;
			sub		RCX, 4;
			cmp		RCX, 4;
			jge		fourpixelloop;
		twopixel:
			cmp		RCX, 2;
			jl		onepixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDI], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RBX, 8;
			sub		RCX, 2;
		onepixel:
			cmp		RCX, 1;
			jl		end;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movd	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDI], XMM1;
		end:
			;
		}
	}else{
		// FIX: `dest.base` is not a member of uint*; apply the formula through the pointers.
		while(length){
			*dest = (*dest & *mask) | *src;
			mask++;
			src++;
			dest++;
			length--;
		}
	}
}
1820 /**
1821  * Implements a three plus one operand alpha-blending algorithm for 32bit bitmaps. For masking, use Pixel32Bit.AlphaMask from CPUblit.colorspaces.
1822  */
public @nogc void alphaBlend32bit(uint* src, uint* dest, size_t length, uint* mask){
	// Per-channel alpha blend: dest = (src*(1+alpha) + dest*(256-alpha)) >> 8,
	// where alpha comes from the separate mask buffer (one mask pixel per image pixel).
	version(X86){
		version(MMX){
			int target8 = length/2, target4 = length%2;	// FIX: loop handles 2 pixels (8 bytes) per pass; was length/8
			asm @nogc {
				// ESI = src, EDI = dest, EBX = mask, ECX = iteration counter
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, target8;
				pxor	MM2, MM2;		// FIX: zero register for punpck was never cleared
				cmp		ECX, 0;
				jz		fourpixelblend;	//skip 8 byte operations if not needed
			twopixelblend:
				movq	MM3, [ESI];		//src01
				movq	MM0, [EBX];		// FIX: alpha must come from the mask argument, not be derived from src
				movq	MM1, MM0;
				punpcklbw	MM0, MM2;
				punpckhbw	MM1, MM2;
				movq	MM6, ALPHABLEND_MMX_CONST256;
				movq	MM7, MM6;
				movq	MM4, ALPHABLEND_MMX_CONST1;
				movq	MM5, MM4;
				paddusw	MM4, MM0;	//1 + alpha0
				paddusw	MM5, MM1;	//1 + alpha1
				psubusw	MM6, MM0;	//256 - alpha0
				psubusw	MM7, MM1;	//256 - alpha1
				movq	MM0, MM3;
				movq	MM1, MM0;
				punpcklbw	MM0, MM2;
				punpckhbw	MM1, MM2;
				pmullw	MM4, MM0;	//src0 * (1 + alpha0)
				pmullw	MM5, MM1;	//src1 * (1 + alpha1)
				movq	MM0, [EDI];
				movq	MM1, MM0;
				punpcklbw	MM0, MM2;
				punpckhbw	MM1, MM2;
				pmullw	MM6, MM0;	//dest0 * (256 - alpha0)
				pmullw	MM7, MM1;	//dest1 * (256 - alpha1)
				paddusw	MM4, MM6;
				paddusw	MM5, MM7;
				psrlw	MM4, 8;		// /256
				psrlw	MM5, 8;
				packuswb	MM4, MM5;
				movq	[EDI], MM4;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				dec		ECX;
				jnz		twopixelblend;
			fourpixelblend:
				mov		ECX, target4;	// at most one pixel remains
				cmp		ECX, 0;
				jz		endofalgorithm;
				movd	MM0, [EDI];
				movd	MM1, [ESI];
				punpcklbw	MM0, MM2;	//dest
				punpcklbw	MM1, MM2;	//src
				movd	MM6, [EBX];		// FIX: was invalid movups/movaps on MM registers and src-derived alpha
				punpcklbw	MM6, MM2;
				movq	MM4, ALPHABLEND_MMX_CONST256;
				movq	MM5, ALPHABLEND_MMX_CONST1;
				paddusw MM5, MM6;	//1+alpha
				psubusw	MM4, MM6;	//256-alpha
				pmullw	MM0, MM4;	//dest*(256-alpha)
				pmullw	MM1, MM5;	//src*(1+alpha)
				paddusw	MM0, MM1;
				psrlw	MM0, 8;
				packuswb	MM0, MM2;
				movd	[EDI], MM0;
			endofalgorithm:
				emms;
			}
		}else{
			int target16 = length/4, target4 = length%4;
			asm @nogc {
				// ESI = src, EDI = dest, EBX = mask, ECX = iteration counter
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		ECX, target16;
				pxor	XMM2, XMM2;		// FIX: zero register for punpck was never cleared
				cmp		ECX, 0;
				jz		fourpixelblend;	//skip 16 byte operations if not needed
			sixteenpixelblend:
				movups	XMM3, [ESI];	//4 src pixels
				movups	XMM1, [EBX];	//per-channel alpha from the mask
				movups	XMM0, XMM1;
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				movups	XMM6, ALPHABLEND_SSE2_CONST256;
				movups	XMM7, XMM6;
				movups	XMM4, ALPHABLEND_SSE2_CONST1;
				movups	XMM5, XMM4;
				paddusw	XMM4, XMM0;	//1 + alpha01
				paddusw	XMM5, XMM1;	//1 + alpha23
				psubusw	XMM6, XMM0;	//256 - alpha01
				psubusw	XMM7, XMM1;	//256 - alpha23
				movups	XMM0, XMM3;	//src01
				movups	XMM1, XMM0;	//src23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
				movups	XMM0, [EDI];	//dest01
				movups	XMM1, XMM0;		//dest23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha01)
				pmullw	XMM7, XMM1;	//dest23 * (256 - alpha23)
				paddusw	XMM4, XMM6;	//(src * (1 + alpha)) + (dest * (256 - alpha))
				paddusw	XMM5, XMM7;
				psrlw	XMM4, 8;	// /256
				psrlw	XMM5, 8;
				packuswb	XMM4, XMM5;
				movups	[EDI], XMM4;
				add		ESI, 16;
				add		EDI, 16;
				add		EBX, 16;
				dec		ECX;
				jnz		sixteenpixelblend;
			fourpixelblend:
				mov		ECX, target4;
				cmp		ECX, 0;
				jz		endofalgorithm;
			fourpixelblendloop:
				movd	XMM0, [EDI];
				movd	XMM1, [ESI];
				punpcklbw	XMM0, XMM2;	//dest
				punpcklbw	XMM1, XMM2;	//src
				movd	XMM6, [EBX];	//alpha from the mask
				punpcklbw	XMM6, XMM2;
				movups	XMM4, ALPHABLEND_SSE2_CONST256;	// movups: no 16-byte alignment guarantee needed
				movups	XMM5, ALPHABLEND_SSE2_CONST1;
				paddusw XMM5, XMM6;	//1+alpha
				psubusw	XMM4, XMM6;	//256-alpha
				pmullw	XMM0, XMM4;	//dest*(256-alpha)
				pmullw	XMM1, XMM5;	//src*(1+alpha)
				paddusw	XMM0, XMM1;	//(src*(1+alpha))+(dest*(256-alpha))
				psrlw	XMM0, 8;	// /256
				packuswb	XMM0, XMM2;
				movd	[EDI], XMM0;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				dec		ECX;
				jnz		fourpixelblendloop;
			endofalgorithm:
				;
			}
		}
	}else version(X86_64){
		size_t target16 = length/4, target4 = length%4;
		asm @nogc {
			// RSI = src, RDI = dest, RBX = mask, RCX = iteration counter
			// XMM8 = 256-vector, XMM9 = 1-vector (constants hoisted out of the loops)
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RCX, target16;
			pxor	XMM2, XMM2;		// FIX: zero register for punpck was never cleared
			movups	XMM8, ALPHABLEND_SSE2_CONST256;
			movups	XMM9, ALPHABLEND_SSE2_CONST1;
			cmp		RCX, 0;
			jz		fourpixelblend;	//skip 16 byte operations if not needed
		sixteenpixelblend:
			movups	XMM3, [RSI];	//4 src pixels
			movups	XMM0, [RBX];	//per-channel alpha from the mask
			movups	XMM1, XMM0;
			punpcklbw	XMM0, XMM2;
			punpckhbw	XMM1, XMM2;
			movups	XMM6, XMM8;
			movups	XMM7, XMM8;
			movups	XMM4, XMM9;
			movups	XMM5, XMM9;
			paddusw	XMM4, XMM0;	//1 + alpha01
			paddusw	XMM5, XMM1;	//1 + alpha23
			psubusw	XMM6, XMM0;	//256 - alpha01
			psubusw	XMM7, XMM1;	//256 - alpha23
			movups	XMM0, XMM3;	//src01
			movups	XMM1, XMM0;	//src23
			punpcklbw	XMM0, XMM2;
			punpckhbw	XMM1, XMM2;
			pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
			pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
			movups	XMM0, [RDI];	// FIX: was [EDI] — 32-bit addressing truncates the pointer in 64-bit code
			movups	XMM1, XMM0;
			punpcklbw	XMM0, XMM2;
			punpckhbw	XMM1, XMM2;
			pmullw	XMM6, XMM0;	//dest01 * (256 - alpha01)
			pmullw	XMM7, XMM1;	//dest23 * (256 - alpha23)
			paddusw	XMM4, XMM6;	//(src * (1 + alpha)) + (dest * (256 - alpha))
			paddusw	XMM5, XMM7;
			psrlw	XMM4, 8;	// /256
			psrlw	XMM5, 8;
			packuswb	XMM4, XMM5;
			movups	[RDI], XMM4;
			add		RSI, 16;
			add		RDI, 16;
			add		RBX, 16;
			dec		RCX;
			jnz		sixteenpixelblend;
		fourpixelblend:
			mov		RCX, target4;
			cmp		RCX, 0;
			jz		endofalgorithm;
		fourpixelblendloop:
			movd	XMM0, [RDI];
			movd	XMM1, [RSI];
			punpcklbw	XMM0, XMM2;	//dest
			punpcklbw	XMM1, XMM2;	//src
			movd	XMM6, [RBX];	// FIX: was a 16-byte movups — over-read past the end of the mask buffer
			punpcklbw	XMM6, XMM2;
			movups	XMM4, XMM8;
			movups	XMM5, XMM9;
			paddusw XMM5, XMM6;	//1+alpha
			psubusw	XMM4, XMM6;	//256-alpha
			pmullw	XMM0, XMM4;	//dest*(256-alpha)
			pmullw	XMM1, XMM5;	//src*(1+alpha)
			paddusw	XMM0, XMM1;	//(src*(1+alpha))+(dest*(256-alpha))
			psrlw	XMM0, 8;	// /256
			packuswb	XMM0, XMM2;
			movd	[RDI], XMM0;
			add		RSI, 4;
			add		RDI, 4;
			add		RBX, 4;
			dec		RCX;
			jnz		fourpixelblendloop;
		endofalgorithm:
			;
		}
	}else{
		// NOTE(review): AlphaMask/ColorSpaceARGB accessors presumably come from
		// CPUblit.colorspaces — confirm they apply to uint pointers. Unlike the SIMD
		// paths this fallback uses a single alpha value per pixel.
		for(size_t i ; i < length ; i++){
			switch(mask.AlphaMask.value){
				case 0:
					break;
				case 255:
					*dest = *src;	// FIX: `dest = src` reassigned the pointer instead of copying the pixel
					break;
				default:
					int src1 = 1 + mask.AlphaMask.value;
					int src256 = 256 - mask.AlphaMask.value;
					dest.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8);
					dest.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8);
					dest.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8);
					break;
			}
			src++;
			dest++;
			mask++;
		}
	}
}
2193 /**
2194  * Copies a 32bit image onto another without blitter. No transparency is used. Mask is placeholder.
2195  */
public @nogc void copy32bit(uint* src, uint* dest, size_t length, uint* mask){
	// The mask parameter is a placeholder for signature compatibility and is ignored;
	// the work is delegated to the three-operand copy32bit (defined elsewhere in this file).
	copy32bit(src,dest,length);
}
2199 /**
2200  * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula:
2201  * mask = src == 0x00 ? 0xFF : 0x00
2202  * Final values are copied into memory location specified by dest1.
2203  */
public @nogc void blitter8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length){
	// Auto-masked blit with a separate output: for each byte,
	// dest1 = src != 0 ? src : dest. Neither src nor dest is modified.
	version(X86){
		version(MMX){
			asm @nogc{
				// ESI = src, EDI = dest, EDX = dest1, ECX = remaining bytes, MM7 = zero
				pxor	MM7, MM7;
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				cmp		ECX, 8;
				jl		fourpixel;
			eigthpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, MM7;
				pcmpeqb	MM2, MM0;		//0xFF where src == 0
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDX], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 8;
				cmp		ECX, 8;			// FIX: `sub/jge` alone ran one extra iteration at exact multiples of 8 (overrun)
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movq	MM2, MM7;
				pcmpeqb	MM2, MM0;
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDX], MM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EDX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				cmp		AL, 0;
				jnz		step;			// FIX: was `jz` — the branch sense was inverted (wrote src when 0, dest otherwise)
				mov		AL, [EDI];		//src == 0: take dest
			step:
				mov		[EDX], AL;
				inc		ESI;
				inc		EDI;
				inc		EDX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				emms;
			}
		}else{
			asm @nogc{
				// ESI = src, EDI = dest, EDX = dest1, ECX = remaining bytes, XMM7 = zero
				pxor	XMM7, XMM7;
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				cmp		ECX, 16;
				jl		eightpixel;
			sixteenpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqb	XMM2, XMM0;		//0xFF where src == 0
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDX], XMM1;
				add		ESI, 16;
				add		EDI, 16;
				add		EDX, 16;
				sub		ECX, 16;
				cmp		ECX, 16;
				jge		sixteenpixelloop;
			eightpixel:
				cmp		ECX, 8;
				jl		fourpixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqb	XMM2, XMM0;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDX], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 8;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqb	XMM2, XMM0;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDX], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EDX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				cmp		AL, 0;
				jnz		step;			// FIX: was `jz` — branch sense inverted
				mov		AL, [EDI];		//src == 0: take dest
			step:
				mov		[EDX], AL;		// (removed stray flag-clobbering `cmp ECX, 0`)
				inc		ESI;
				inc		EDI;
				inc		EDX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			// RSI = src, RDI = dest, RDX = dest1, RCX = remaining bytes, XMM7 = zero
			pxor	XMM7, XMM7;
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RDX, dest1[RBP];
			mov		RCX, length;
			cmp		RCX, 16;
			jl		eightpixel;
		sixteenpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqb	XMM2, XMM0;			//0xFF where src == 0
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDX], XMM1;
			add		RSI, 16;
			add		RDI, 16;
			add		RDX, 16;
			sub		RCX, 16;
			cmp		RCX, 16;
			jge		sixteenpixelloop;
		eightpixel:
			cmp		RCX, 8;
			jl		fourpixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqb	XMM2, XMM0;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDX], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RDX, 8;				// FIX: was a duplicated `add RDI, 8` — dest1 never advanced
			sub		RCX, 8;
		fourpixel:
			cmp		RCX, 4;
			jl		singlepixelloop;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqb	XMM2, XMM0;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDX], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RDX, 4;				// FIX: was a duplicated `add RDI, 4`
			sub		RCX, 4;
		singlepixelloop:
			cmp		RCX, 0;
			jz		end;
			mov		AL, [RSI];
			cmp		AL, 0;
			jnz		step;				// FIX: was `jz` — branch sense inverted
			mov		AL, [RDI];			// FIX: was [EDI] — 32-bit addressing truncates the pointer
		step:
			mov		[RDX], AL;			// FIX: wrote to [RDI] instead of the dest1 buffer
			inc		RSI;
			inc		RDI;
			inc		RDX;
			dec		RCX;
			jmp		singlepixelloop;
		end:
			;
		}
	}else{
		// FIX: the fallback ignored dest1 and overwrote dest instead.
		while(length){
			*dest1 = *src ? *src : *dest;
			src++;
			dest++;
			dest1++;
			length--;
		}
	}
}
2409 /**
2410  * Copies an 8bit image onto another without blitter. No transparency is used. Dest is placeholder.
2411  */
public @nogc void copy8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length){
	// The dest parameter is a placeholder for signature compatibility and is ignored;
	// src is copied straight to dest1 via the three-operand copy8bit (defined elsewhere in this file).
	copy8bit(src,dest1,length);
}
2415 /**
2416  * Three plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula:
2417  * mask = src == 0x0000 ? 0xFFFF : 0x0000
2418  * Result is copied into memory location specified by dest1.
2419  */
public @nogc void blitter16bit(ushort* src, ushort* dest, ushort* dest1, size_t length){
	// Per 16-bit pixel: dest1 = (src != 0) ? src : dest (color index 0 is
	// transparent). SIMD paths build the mask with pcmpeqw against zero and
	// compute (dest & mask) | src; the result always goes to dest1.
	version(X86){
		version(MMX){
			asm @nogc{
				pxor	MM7, MM7;			// MM7 = constant zero for the compares
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];	// FIX: output pointer must be dest1 (was dest)
				mov		ECX, length;
				cmp		ECX, 4;
				jl		twopixel;
			fourpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, MM0;
				pcmpeqw	MM2, MM7;			// mask = (src == 0) per word
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDX], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 4;
				cmp		ECX, 4;				// FIX: re-compare; bare `jge` after sub re-ran the loop on a 0..3 remainder
				jge		fourpixelloop;
			twopixel:
				cmp		ECX, 2;				// FIX: two-pixel step needs 2 remaining (was 4)
				jl		singlepixel;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movq	MM2, MM7;
				pcmpeqw	MM2, MM0;
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDX], MM1;
				add		ESI, 4;				// FIX: 2 pixels = 4 bytes (was 2)
				add		EDI, 4;
				add		EDX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;
				mov		AX, [ESI];
				cmp		AX, 0;
				cmovz	AX, [EDI];			// src == 0 -> take dest
				mov		[EDX], AX;			// FIX: store the full 16-bit pixel (was AL)
			end:
				emms;
			}
		}else{
			asm @nogc{
				pxor	XMM7, XMM7;
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];	// FIX: output pointer must be dest1 (was dest)
				mov		ECX, length;
				cmp		ECX, 8;
				jl		fourpixel;
			eigthpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqw	XMM2, XMM0;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDX], XMM1;
				add		ESI,16;
				add		EDI,16;
				add		EDX,16;
				sub		ECX, 8;
				cmp		ECX, 8;
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		twopixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqw	XMM2, XMM0;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDX], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 4;
			twopixel:
				cmp		ECX, 2;
				jl		singlepixel;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movups	XMM2, XMM7;
				pcmpeqw	XMM2, XMM0;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDX], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EDX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;
				mov		AX, [ESI];
				cmp		AX, 0;
				cmovz	AX, [EDI];
				mov		[EDX], AX;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			pxor	XMM7, XMM7;
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RDX, dest1[RBP];
			mov		RCX, length;
			cmp		RCX, 8;
			jl		fourpixel;
		eigthpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqw	XMM2, XMM0;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDX], XMM1;
			add		RSI,16;
			add		RDI,16;
			add		RDX,16;
			sub		RCX, 8;
			cmp		RCX, 8;
			jge		eigthpixelloop;
		fourpixel:
			cmp		RCX, 4;
			jl		twopixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqw	XMM2, XMM0;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDX], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RDX, 8;
			sub		RCX, 4;
		twopixel:
			cmp		RCX, 2;
			jl		singlepixel;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movups	XMM2, XMM7;
			pcmpeqw	XMM2, XMM0;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDX], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RDX, 4;
			sub		RCX, 2;
		singlepixel:
			cmp		RCX, 0;
			jz		end;
			mov		AX, [RSI];
			cmp		AX, 0;
			cmovz	AX, [RDI];
			mov		[RDX], AX;
		end:
			;
		}
	}else{
		while(length){
			if(*src)
				*dest1 = *src;
			else
				*dest1 = *dest;
			src++;
			dest++;
			dest1++;	// FIX: the output pointer was never advanced
			length--;
		}
	}
}
2603 /**
2604  * Copies a 16bit image onto another without blitter. No transparency is used. Dest is placeholder.
2605  */
public @nogc void copy16bit(ushort* src, ushort* dest, ushort* dest1, size_t length){
	// `dest` is unused — a placeholder so this overload matches the
	// three-plus-one-operand blitter signatures. Forwards to the two-operand
	// copy, which writes src straight into dest1.
	copy16bit(src,dest1,length);
}
2609 /**
2610  * Three plus one operand blitter for 32 bit values. Automatic mask-generation is used from the source's alpha channel with the following formula:
2611  * mask = src.alpha ? 0xFFFFFFFF : 0x00000000
2612  * The result is copied into the memory location specified by dest1
2613  */
public @nogc void blitter32bit(uint* src, uint* dest, uint* dest1, size_t length){
	// Per 32-bit pixel: the ALPHABLEND_*_MASK selects the alpha byte (byte 0 of
	// each pixel); a pixel with zero alpha is treated as transparent. The SIMD
	// paths compute dest1 = (dest & (alpha == 0 ? ~0 : 0)) | src.
	version(X86){
		version(MMX){
			asm @nogc{
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				movq	MM6, ALPHABLEND_MMX_MASK;
				pxor	MM7, MM7;
				cmp		ECX, 2;
				jl		onepixel;			// FIX: jumped to nonexistent label `twopixel`
			twopixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, MM0;
				pand	MM2, MM6;			// isolate the alpha byte of each pixel
				pcmpeqd	MM2, MM7;			// mask = (alpha == 0) per dword
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDX], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 2;
				cmp		ECX, 2;				// FIX: re-compare before looping
				jge		twopixelloop;		// FIX: jumped to nonexistent label `fourpixelloop`
			onepixel:
				cmp		ECX, 1;
				jl		end;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movq	MM2, MM0;
				pand	MM2, MM6;
				pcmpeqd	MM2, MM7;
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDX], MM1;
			end:
				emms;
			}
		}else{
			asm @nogc{
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				movups	XMM6, ALPHABLEND_SSE2_MASK;
				pxor	XMM7, XMM7;
				cmp		ECX, 4;				// FIX: was 8 — lengths 4..7 skipped the 4-pixel loop and lost pixels
				jl		twopixel;
			fourpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, XMM0;
				pand	XMM2, XMM6;
				pcmpeqd	XMM2, XMM7;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDX], XMM1;
				add		ESI,16;
				add		EDI,16;
				add		EDX,16;
				sub		ECX, 4;
				cmp		ECX, 4;
				jge		fourpixelloop;
			twopixel:
				cmp		ECX, 2;
				jl		onepixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, XMM0;
				pand	XMM2, XMM6;
				pcmpeqd	XMM2, XMM7;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDX], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				sub		ECX, 2;
			onepixel:
				cmp		ECX, 1;
				jl		end;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movq	XMM2, XMM0;
				pand	XMM2, XMM6;
				pcmpeqd	XMM2, XMM7;
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDX], XMM1;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RDX, dest1[RBP];
			mov		RCX, length;
			movups	XMM6, ALPHABLEND_SSE2_MASK;
			pxor	XMM7, XMM7;
			cmp		RCX, 4;					// FIX: was `cmp ECX, 8` — wrong width and wrong threshold for a 4-pixel loop
			jl		twopixel;
		fourpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, XMM0;
			pand	XMM2, XMM6;
			pcmpeqd	XMM2, XMM7;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDX], XMM1;
			add		RSI,16;
			add		RDI,16;
			add		RDX,16;
			sub		RCX, 4;
			cmp		RCX, 4;
			jge		fourpixelloop;
		twopixel:
			cmp		RCX, 2;
			jl		onepixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, XMM0;
			pand	XMM2, XMM6;
			pcmpeqd	XMM2, XMM7;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDX], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RDX, 8;
			sub		RCX, 2;
		onepixel:
			cmp		RCX, 1;
			jl		end;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movq	XMM2, XMM0;
			pand	XMM2, XMM6;
			pcmpeqd	XMM2, XMM7;
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDX], XMM1;			// FIX: result went to [RDI] (dest) instead of dest1
		end:
			;
		}
	}else{
		while(length){
			// Low byte is the alpha key, matching the SIMD paths (the mask
			// selects byte 0 of each pixel) — the old `*src.ColorSpaceARGB.alpha`
			// was invalid member access on a uint*.
			if(*src & 0xFF)
				*dest1 = *src;
			else
				*dest1 = *dest;
			src++;
			dest++;
			dest1++;
			length--;
		}
	}
}
2776 /**
2777  * Implements a three plus one operand alpha-blending algorithm for 32bit bitmaps. Automatic alpha-mask generation follows this formula:
2778  * src[B,G,R,A] --> mask [A,A,A,A]
2779  */
public @nogc void alphaBlend32bit(uint* src, uint* dest, uint* dest1, size_t length){
	// Per channel: result = (src*(1+A) + dest*(256-A)) >> 8, where A is broadcast
	// on the fly from each source pixel's alpha byte (the byte selected by
	// ALPHABLEND_*_MASK, i.e. byte 0 of each 32-bit pixel).
	// NOTE(review): the three SIMD paths disagree on where the result is written
	// (dest vs dest1) — see per-line notes below; confirm the intended contract.
	version(X86){
		version(MMX){
			int target8 = length/8, target4 = length%2; // NOTE(review): the loop below consumes 2 pixels/iteration — length/8 looks like it should be length/2; confirm
			asm @nogc {
				//setting up the pointer registers and the counter register
				//mov		EBX, alpha[EBP];
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, target8;
				cmp		ECX, 0;
				jz		fourpixelblend; //skip 16 byte operations if not needed
				//iteration cycle entry point
			sixteenpixelblend:
				//create alpha mask on the fly
				movq	MM3, [ESI];
				movq	MM1, MM3;
				pand	MM1, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movq	MM0, MM1;
				pslld	MM0, 8;
				por		MM1, MM0;	//mask is ready for RA
				pslld	MM1, 16;
				por		MM0, MM1; //mask is ready for BGRA
				movq	MM1, MM0;
				punpcklbw	MM0, MM2; // NOTE(review): MM2 is used as a zero register here but is never cleared (no pxor MM2, MM2) — confirm
				punpckhbw	MM1, MM2;
				movq	MM6, ALPHABLEND_MMX_CONST256;
				movq	MM7, MM6;
				movq	MM4, ALPHABLEND_MMX_CONST1;
				movq	MM5, MM4;
			
				paddusw	MM4, MM0;	//1 + alpha01
				paddusw	MM5, MM1; //1 + alpha23 
				psubusw	MM6, MM0;	//256 - alpha01
				psubusw	MM7, MM1; //256 - alpha23
			
				//moving the values to their destinations
				movq	MM0, MM3;	//src01
				movq	MM1, MM0; //src23
				punpcklbw	MM0, MM2;
				punpckhbw	MM1, MM2;
				pmullw	MM4, MM0;	//src01 * (1 + alpha01)
				pmullw	MM5, MM1;	//src23 * (1 + alpha23)
				movq	MM0, [EDI];	//dest01
				movq	MM1, MM0;		//dest23
				punpcklbw	MM0, MM2;
				punpckhbw	MM1, MM2;
				pmullw	MM6, MM0;	//dest01 * (256 - alpha)
				pmullw	MM7, MM1; //dest23 * (256 - alpha)
		
				paddusw	MM4, MM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
				paddusw	MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
				psrlw	MM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
				psrlw	MM5, 8;
				//moving the result to its place;
				//pxor	MM2, MM2;
				packuswb	MM4, MM5;
		
				movq	[EDX], MM4;
				//add		EBX, 16;
				add		ESI, 8;
				add		EDI, 8;
				add		EDX, 8;
				dec		ECX;
				cmp		ECX, 0;
				jnz		sixteenpixelblend;
				fourpixelblend:
				mov		ECX, target4;
				cmp		ECX, 0;
				jz		endofalgorithm;
				fourpixelblendloop:

				//movd	XMM6, [EBX];//alpha
			

				movd	MM0, [EDI];
				movd	MM1, [ESI];
				punpcklbw	MM0, MM2;//dest
				punpcklbw	MM1, MM2;//src
				movups	MM6, MM1; // NOTE(review): movups is an SSE instruction and cannot take MMX registers — this path likely fails to assemble; movq presumably intended
				pand	MM6, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movups	MM7, MM6; // NOTE(review): same movups-on-MMX issue as above
				pslld	MM6, 8;
				por		MM7, MM6;	//mask is ready for RA
				pslld	MM7, 16;
				por		MM6, MM7; //mask is ready for GRA
				punpcklbw	MM7, MM2;
				movaps	MM4, ALPHABLEND_MMX_CONST256; // NOTE(review): movaps cannot take MMX registers either — movq presumably intended
				movaps	MM5, ALPHABLEND_MMX_CONST1;
				
				paddusw MM5, MM6;//1+alpha
				psubusw	MM4, MM6;//256-alpha
				
				pmullw	MM0, MM4;//dest*(256-alpha)
				pmullw	MM1, MM5;//src*(1+alpha)
				paddusw	MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha))
				psrlw	MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
				
				packuswb	MM0, MM2;
				
				movd	[EDX], MM0;	
				// NOTE(review): no pointer advance / loop-back here — harmless only because target4 = length%2 is at most 1

			endofalgorithm:
				emms;
			}
		}else{
			int target16 = length/4, target4 = length%4;
			asm @nogc {
				//setting up the pointer registers and the counter register
				//mov		EBX, alpha[EBP];
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP]; // NOTE(review): dest1 is never loaded into EDX on this path, yet EDX is advanced below; all stores go to [EDI] (dest), so dest1 is ignored — confirm intent
				mov		ECX, target16;
				cmp		ECX, 0;
				jz		fourpixelblend; //skip 16 byte operations if not needed
				//iteration cycle entry point
			sixteenpixelblend:
				//create alpha mask on the fly
				movups	XMM3, [ESI];
				movups	XMM1, XMM3;
				pand	XMM1, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movups	XMM0, XMM1;
				pslld	XMM0, 8;
				por		XMM1, XMM0;	//mask is ready for RA
				pslld	XMM1, 16;
				por		XMM0, XMM1; //mask is ready for BGRA/**/
				movups	XMM1, XMM0;
				
				punpcklbw	XMM0, XMM2; // NOTE(review): XMM2 is used as a zero register but is never cleared in this function — confirm
				punpckhbw	XMM1, XMM2;
				movups	XMM6, ALPHABLEND_SSE2_CONST256;
				movups	XMM7, XMM6;
				movups	XMM4, ALPHABLEND_SSE2_CONST1;
				movups	XMM5, XMM4;
			
				paddusw	XMM4, XMM0;	//1 + alpha01
				paddusw	XMM5, XMM1; //1 + alpha23 
				psubusw	XMM6, XMM0;	//256 - alpha01
				psubusw	XMM7, XMM1; //256 - alpha23
				
				//moving the values to their destinations

				movups	XMM0, XMM3;	//src01
				movups	XMM1, XMM0; //src23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
				movups	XMM0, [EDI];	//dest01
				movups	XMM1, XMM0;		//dest23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
			
				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
				psrlw	XMM5, 8;
				//moving the result to its place;
				//pxor	MM2, MM2;
				packuswb	XMM4, XMM5;
			
				movups	[EDI], XMM4; // NOTE(review): main-loop result stored to dest, not dest1 — inconsistent with the MMX and x86-64 paths
				//add		EBX, 16;
				add		ESI, 16;
				add		EDI, 16;
				add		EDX, 16; // NOTE(review): EDX advanced despite never being initialized
				dec		ECX;
				cmp		ECX, 0;
				jnz		sixteenpixelblend;

			fourpixelblend:

				mov		ECX, target4;
				cmp		ECX, 0;
				jz		endofalgorithm;

			fourpixelblendloop:

				//movd	XMM6, [EBX];//alpha
				

				movd	XMM0, [EDI];
				movd	XMM1, [ESI];
				punpcklbw	XMM0, XMM2;//dest
				punpcklbw	XMM1, XMM2;//src
				movups	XMM6, XMM1;
				pand	XMM6, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movups	XMM7, XMM6;
				pslld	XMM6, 8;
				por		XMM7, XMM6;	//mask is ready for RA
				pslld	XMM7, 16;
				por		XMM6, XMM7; //mask is ready for BGRA
				
				punpcklbw	XMM6, XMM2;
				
				movaps	XMM4, ALPHABLEND_SSE2_CONST256;
				movaps	XMM5, ALPHABLEND_SSE2_CONST1;
				
				paddusw XMM5, XMM6;//1+alpha
				psubusw	XMM4, XMM6;//256-alpha
				
				pmullw	XMM0, XMM4;//dest*(256-alpha)
				pmullw	XMM1, XMM5;//src*(1+alpha)
				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
				
				packuswb	XMM0, XMM2;
				
				movd	[EDI], XMM0; // NOTE(review): remainder result also stored to dest, not dest1
				
				add		ESI, 4;
				add		EDI, 4;/**/
				add		EDX, 4;
				dec		ECX;
				cmp		ECX, 0;
				jnz		fourpixelblendloop;

			endofalgorithm:
				;
			}
		}
	}else version(X86_64){
		size_t target16 = length/4, target4 = length%4;
			asm @nogc {
				//setting up the pointer registers and the counter register
				//mov		EBX, alpha[EBP];
				mov		RSI, src[RBP];
				mov		RDI, dest[RBP];
				mov		RDX, dest1[RBP];
				mov		RCX, target16;
				cmp		RCX, 0;
				movups	XMM8, ALPHABLEND_SSE2_CONST256;
				movups	XMM9, ALPHABLEND_SSE2_CONST1;
				movups	XMM10, ALPHABLEND_SSE2_MASK;
				jz		fourpixelblend; //skip 16 byte operations if not needed
				//iteration cycle entry point
			sixteenpixelblend:
				//create alpha mask on the fly
				movups	XMM3, [RSI];
				movups	XMM1, XMM3;
				pand	XMM1, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movups	XMM0, XMM1;
				pslld	XMM0, 8;
				por		XMM1, XMM0;	//mask is ready for RA
				pslld	XMM1, 16;
				por		XMM0, XMM1; //mask is ready for BGRA/**/
				movups	XMM1, XMM0;
				
				punpcklbw	XMM0, XMM2; // NOTE(review): XMM2 used as a zero register but never cleared — confirm
				punpckhbw	XMM1, XMM2;
				movups	XMM6, XMM8;
				movups	XMM7, XMM8;
				movups	XMM4, XMM9;
				movups	XMM5, XMM9;
			
				paddusw	XMM4, XMM0;	//1 + alpha01
				paddusw	XMM5, XMM1; //1 + alpha23 
				psubusw	XMM6, XMM0;	//256 - alpha01
				psubusw	XMM7, XMM1; //256 - alpha23
				
				//moving the values to their destinations

				movups	XMM0, XMM3;	//src01
				movups	XMM1, XMM0; //src23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
				movups	XMM0, [EDI];	//dest01 // NOTE(review): 32-bit EDI used as an address in 64-bit code — presumably should be [RDI]
				movups	XMM1, XMM0;		//dest23
				punpcklbw	XMM0, XMM2;
				punpckhbw	XMM1, XMM2;
				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
			
				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
				psrlw	XMM5, 8;
				//moving the result to its place;
				//pxor	MM2, MM2;
				packuswb	XMM4, XMM5;
			
				movups	[RDX], XMM4;
				//add		EBX, 16;
				add		RSI, 16;
				add		RDI, 16;
				add		RDX, 16;
				dec		RCX;
				cmp		RCX, 0;
				jnz		sixteenpixelblend;

			fourpixelblend:

				mov		RCX, target4;
				cmp		RCX, 0;
				jz		endofalgorithm;

			fourpixelblendloop:

				//movd	XMM6, [EBX];//alpha
				

				movd	XMM0, [RDI];
				movd	XMM1, [RSI];
				punpcklbw	XMM0, XMM2;//dest
				punpcklbw	XMM1, XMM2;//src
				movups	XMM6, XMM1;
				pand	XMM6, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
				movups	XMM7, XMM6;
				pslld	XMM6, 8;
				por		XMM7, XMM6;	//mask is ready for RA
				pslld	XMM7, 16;
				por		XMM6, XMM7; //mask is ready for BGRA
				
				punpcklbw	XMM6, XMM2;
				
				movaps	XMM4, XMM8;
				movaps	XMM5, XMM9;
				
				paddusw XMM5, XMM6;//1+alpha
				psubusw	XMM4, XMM6;//256-alpha
				
				pmullw	XMM0, XMM4;//dest*(256-alpha)
				pmullw	XMM1, XMM5;//src*(1+alpha)
				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
				
				packuswb	XMM0, XMM2;
				
				movd	[RDI], XMM0; // NOTE(review): remainder store goes to dest ([RDI]) while the main loop stores to dest1 ([RDX]) — inconsistent
				
				add		RSI, 4;
				add		RDI, 4;
				add		RDX, 4;
				dec		RCX;
				cmp		RCX, 0;
				jnz		fourpixelblendloop;

			endofalgorithm:
				;
		}
	}else{
		for(int i ; i < length ; i++){
			switch(src.ColorSpaceARGB.alpha){ // NOTE(review): member access on a uint* — likely does not compile; a cast through the colorspaces struct is presumably intended
				case 0: 
					break; // NOTE(review): fully transparent pixel leaves *dest1 unwritten — confirm whether dest1 should receive *dest here
				case 255: 
					dest = src; // NOTE(review): assigns the POINTER, not the pixel — presumably *dest1 = *src was intended
					break;
				default:
					int src1 = 1 + src.ColorSpaceARGB.alpha;
					int src256 = 256 - src.ColorSpaceARGB.alpha;
					dest1.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8);
					dest1.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8);
					dest1.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8);
					break; // NOTE(review): dest1's alpha channel is never written in this branch
			}
			src++;
			dest++;
			dest1++;
		}
	}	
}
3147 /**
3148  * Copies a 32bit image onto another without blitter. No transparency is used. Dest is placeholder.
3149  */
public @nogc void copy32bit(uint* src, uint* dest, uint* dest1, size_t length){
	// `dest` is unused — a placeholder so this overload matches the
	// three-plus-one-operand blitter signatures. Forwards to the two-operand
	// copy, which writes src straight into dest1.
	copy32bit(src, dest1, length);
}
3153 /**
3154  * Four plus one operand blitter for 8 bit values. Uses an external mask. Final values are copied into memory location specified by dest1;
3155  */
public @nogc void blitter8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length, ubyte* mask){
	// Externally-masked blitter: dest1 = (dest & mask) | src, per byte.
	// Unlike the automatic-mask overloads, the mask is caller-supplied and the
	// result is written for every pixel.
	version(X86){
		version(MMX){
			asm @nogc{
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				cmp		ECX, 8;
				jl		fourpixel;
			eigthpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDX], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				add		EDX, 8;
				sub		ECX, 8;
				cmp		ECX, 8;			// FIX: re-compare; bare `jge` after sub re-ran the 8-pixel loop on a 0..7 remainder
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movd	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDX], MM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				add		EDX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				mov		AH, [EDI];
				and		AH, [EBX];
				or		AH, AL;
				mov		[EDX], AH;
				inc		ESI;
				inc		EDI;
				inc		EBX;
				inc		EDX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				emms;
			}
		}else{
			asm @nogc{
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				cmp		ECX, 16;
				pxor	XMM7, XMM7;
				jl		eightpixel;
			sixteenpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDX], XMM1;
				add		ESI, 16;
				add		EDI, 16;
				add		EBX, 16;
				add		EDX, 16;
				sub		ECX, 16;
				cmp		ECX, 16;
				jge		sixteenpixelloop;
			eightpixel:
				cmp		ECX, 8;
				jl		fourpixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDX], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				add		EDX, 8;
				sub		ECX, 8;
			fourpixel:
				cmp		ECX, 4;
				jl		singlepixelloop;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movd	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDX], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				add		EDX, 4;
				sub		ECX, 4;
			singlepixelloop:
				jecxz	end;
				mov		AL, [ESI];
				mov		AH, [EDI];
				and		AH, [EBX];
				or		AH, AL;
				mov		[EDX], AH;
				inc		ESI;
				inc		EDI;
				inc		EBX;
				inc		EDX;
				dec		ECX;
				jmp		singlepixelloop;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RDX, dest1[RBP];
			mov		RCX, length;
			cmp		RCX, 16;
			jl		eightpixel;
		sixteenpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDX], XMM1;
			add		RSI, 16;
			add		RDI, 16;
			add		RBX, 16;
			add		RDX, 16;
			sub		RCX, 16;
			cmp		RCX, 16;
			jge		sixteenpixelloop;
		eightpixel:
			cmp		RCX, 8;
			jl		fourpixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDX], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RBX, 8;
			add		RDX, 8;
			sub		RCX, 8;
		fourpixel:
			cmp		RCX, 4;
			jl		singlepixelloop;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movd	XMM2, [RBX];	// FIX: was movups — read 16 bytes of mask where only 4 are valid
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDX], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RBX, 4;
			add		RDX, 4;
			sub		RCX, 4;
		singlepixelloop:
			cmp		RCX, 0;
			jz		end;
			mov		AL, [RSI];
			mov		AH, [RDI];
			and		AH, [RBX];
			or		AH, AL;
			mov		[RDX], AH;
			inc		RSI;
			inc		RDI;
			inc		RBX;
			inc		RDX;
			dec		RCX;
			jmp		singlepixelloop;
		end:
			;
		}
	}else{
		while(length){
			// FIX: dest1 must be written for every pixel, matching the SIMD
			// paths (the old `if(*src)` left dest1 untouched on zero source).
			*dest1 = cast(ubyte)((*dest & *mask) | *src);
			src++;
			dest++;
			dest1++;
			mask++;
			length--;
		}
	}
}
3362 /**
3363  * Copies an 8bit image onto another without blitter. No transparency is used. Dest and mask are placeholders.
3364  */
public @nogc void copy8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length, ubyte* mask){
	// `dest` and `mask` are unused — placeholders so this overload matches the
	// four-plus-one-operand blitter signatures. Forwards to the two-operand
	// copy, which writes src straight into dest1.
	copy8bit(src,dest1,length);
}
3368 /**
3369  * Four plus one operand blitter for 8 bit values. Uses external mask. Copies the result to the memory location specified by dest1.
3370  */
public @nogc void blitter16bit(ushort* src, ushort* dest, ushort* dest1, size_t length, ushort* mask){
	// Externally-masked 16-bit blitter: dest1 = (dest & mask) | src, per pixel.
	// The mask is caller-supplied; the result always goes to dest1.
	version(X86){
		version(MMX){
			asm @nogc{
				pxor	MM7, MM7;
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		EDX, dest1[EBP];	// FIX: output pointer must be dest1 (was dest)
				mov		ECX, length;
				cmp		ECX, 4;
				jl		twopixel;
			fourpixelloop:
				movq	MM0, [ESI];
				movq	MM1, [EDI];
				movq	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movq	[EDX], MM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				add		EDX, 8;
				sub		ECX, 4;
				cmp		ECX, 4;				// FIX: re-compare; bare `jge` after sub re-ran the loop on a 0..3 remainder
				jge		fourpixelloop;
			twopixel:
				cmp		ECX, 2;				// FIX: two-pixel step needs 2 remaining (was 4)
				jl		singlepixel;
				movd	MM0, [ESI];
				movd	MM1, [EDI];
				movd	MM2, [EBX];
				pand	MM1, MM2;
				por		MM1, MM0;
				movd	[EDX], MM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				add		EDX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;
				mov		AX, [EBX];
				and		AX, [EDI];
				or		AX, [ESI];
				mov		[EDX], AX;
			end:
				emms;
			}
		}else{
			asm @nogc{
				pxor	XMM7, XMM7;
				mov		ESI, src[EBP];
				mov		EDI, dest[EBP];
				mov		EBX, mask[EBP];
				mov		EDX, dest1[EBP];
				mov		ECX, length;
				cmp		ECX, 8;
				jl		fourpixel;
			eigthpixelloop:
				movups	XMM0, [ESI];
				movups	XMM1, [EDI];
				movups	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movups	[EDX], XMM1;
				add		ESI,16;
				add		EDI,16;
				add		EBX,16;
				add		EDX,16;
				sub		ECX, 8;
				cmp		ECX, 8;
				jge		eigthpixelloop;
			fourpixel:
				cmp		ECX, 4;
				jl		twopixel;
				movq	XMM0, [ESI];
				movq	XMM1, [EDI];
				movq	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movq	[EDX], XMM1;
				add		ESI, 8;
				add		EDI, 8;
				add		EBX, 8;
				add		EDX, 8;
				sub		ECX, 4;
			twopixel:
				cmp		ECX, 2;
				jl		singlepixel;
				movd	XMM0, [ESI];
				movd	XMM1, [EDI];
				movd	XMM2, [EBX];
				pand	XMM1, XMM2;
				por		XMM1, XMM0;
				movd	[EDX], XMM1;
				add		ESI, 4;
				add		EDI, 4;
				add		EBX, 4;
				add		EDX, 4;
				sub		ECX, 2;
			singlepixel:
				jecxz	end;
				mov		AX, [EBX];
				and		AX, [EDI];
				or		AX, [ESI];
				mov		[EDX], AX;
			end:
				;
			}
		}
	}else version(X86_64){
		asm @nogc{
			pxor	XMM7, XMM7;
			mov		RSI, src[RBP];
			mov		RDI, dest[RBP];
			mov		RBX, mask[RBP];
			mov		RDX, dest1[RBP];
			mov		RCX, length;
			cmp		RCX, 8;
			jl		fourpixel;
		eigthpixelloop:
			movups	XMM0, [RSI];
			movups	XMM1, [RDI];
			movups	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movups	[RDX], XMM1;
			add		RSI,16;
			add		RDI,16;
			add		RBX,16;
			add		RDX,16;
			sub		RCX, 8;
			cmp		RCX, 8;
			jge		eigthpixelloop;
		fourpixel:
			cmp		RCX, 4;
			jl		twopixel;
			movq	XMM0, [RSI];
			movq	XMM1, [RDI];
			movq	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movq	[RDX], XMM1;
			add		RSI, 8;
			add		RDI, 8;
			add		RBX, 8;
			add		RDX, 8;
			sub		RCX, 4;
		twopixel:
			cmp		RCX, 2;
			jl		singlepixel;
			movd	XMM0, [RSI];
			movd	XMM1, [RDI];
			movd	XMM2, [RBX];
			pand	XMM1, XMM2;
			por		XMM1, XMM0;
			movd	[RDX], XMM1;
			add		RSI, 4;
			add		RDI, 4;
			add		RBX, 4;
			add		RDX, 4;
			sub		RCX, 2;
		singlepixel:
			cmp		RCX, 0;
			jz		end;
			mov		AX, [RBX];
			and		AX, [RDI];
			or		AX, [RSI];
			mov		[RDX], AX;
		end:
			;
		}
	}else{
		while(length){
			// FIX: the fallback ignored `mask` entirely (and never advanced it);
			// it must mirror the SIMD paths: dest1 = (dest & mask) | src.
			*dest1 = cast(ushort)((*dest & *mask) | *src);
			src++;
			dest++;
			dest1++;
			mask++;
			length--;
		}
	}
}
3558 /**
3559  * Copies a 16bit image onto another without blitter. No transparency is used. Dest and mask is placeholder.
3560  */
public @nogc void copy16bit(ushort* src, ushort* dest, ushort* dest1, size_t length, ushort* mask){
	// `dest` and `mask` are unused — placeholders so this overload matches the
	// four-plus-one-operand blitter signatures. Forwards to the two-operand
	// copy, which writes src straight into dest1.
	copy16bit(src,dest1,length);
}
3564 /**
3565  * Two plus one operand blitter for 32 bit values. Uses a separate mask. Copies the result into location specified by dest1.
3566  */
3567 public @nogc void blitter32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){
3568 	version(X86){
3569 		version(MMX){
3570 			asm @nogc{
3571 				mov		ESI, src[EBP];
3572 				mov		EDI, dest[EBP];
3573 				mov		EBX, mask[EBP];
3574 				mov		EDX, dest1[EBP];
3575 				mov		ECX, length;
3576 				movq	MM6, ALPHABLEND_MMX_MASK;
3577 				pxor	MM7, MM7;
3578 				cmp		ECX, 2;
3579 				jl		twopixel;
3580 			twopixelloop:
3581 				movq	MM0, [ESI];
3582 				movq	MM1, [EDI];
3583 				movq	MM2, [EBX];
3584 				pand	MM1, MM2;
3585 				por		MM1, MM0;
3586 				movq	[EDX], MM1;
3587 				add		ESI, 8;
3588 				add		EDI, 8;
3589 				add		EBX, 8;
3590 				add		EDX, 8;
3591 				sub		ECX, 2;
3592 				jge		fourpixelloop;
3593 			onepixel:
3594 				jecxz	end;
3595 				movd	MM0, [ESI];
3596 				movd	MM1, [EDI];
3597 				movd	MM2, [EBX];
3598 				pand	MM1, MM2;
3599 				por		MM1, MM0;
3600 				movd	[EDX], MM1;
3601 			end:
3602 				emms;
3603 			}
3604 		}else{
3605 			asm @nogc{
3606 				mov		ESI, src[EBP];
3607 				mov		EDI, dest[EBP];
3608 				mov		EBX, mask[EBP];
3609 				mov		EDX, dest1[EBP];
3610 				mov		ECX, length;
3611 				movups	XMM6, ALPHABLEND_SSE2_MASK;
3612 				pxor	XMM7, XMM7;
3613 				cmp		ECX, 4;
3614 				jl		twopixel;
3615 			fourpixelloop:
3616 				movups	XMM0, [ESI];
3617 				movups	XMM1, [EDI];
3618 				movups	XMM2, [EBX];
3619 				pand	XMM1, XMM2;
3620 				por		XMM1, XMM0;
3621 				movups	[EDX], XMM1;
3622 				add		ESI,16;
3623 				add		EDI,16;
3624 				add		EBX,16;
3625 				add		EDX,16;
3626 				sub		ECX, 4;
3627 				cmp		ECX, 4;
3628 				jge		fourpixelloop;
3629 			twopixel:
3630 				cmp		ECX, 2;
3631 				jl		onepixel;
3632 				movq	XMM0, [ESI];
3633 				movq	XMM1, [EDI];
3634 				movq	XMM2, [EBX];
3635 				pand	XMM1, XMM2;
3636 				por		XMM1, XMM0;
3637 				movq	[EDX], XMM1;
3638 				add		ESI, 8;
3639 				add		EDI, 8;
3640 				add		EBX, 8;
3641 				add		EDX, 8;
3642 				sub		ECX, 2;
3643 			onepixel:
3644 				jecxz	end;
3645 				movd	XMM0, [ESI];
3646 				movd	XMM1, [EDI];
3647 				movd	XMM2, [EBX];
3648 				pand	XMM1, XMM2;
3649 				por		XMM1, XMM0;
3650 				movd	[EDX], XMM1;
3651 			end:
3652 				;
3653 			}
3654 		}
3655 	}else version(X86_64){
3656 		asm @nogc{
3657 			mov		RSI, src[RBP];
3658 			mov		RDI, dest[RBP];
3659 			mov		RBX, mask[RBP];
3660 			mov		RDX, dest1[RBP];
3661 			mov		RCX, length;
3662 			movups	XMM6, ALPHABLEND_SSE2_MASK;
3663 			pxor	XMM7, XMM7;
3664 			cmp		ECX, 4;
3665 			jl		twopixel;
3666 		fourpixelloop:
3667 			movups	XMM0, [RSI];
3668 			movups	XMM1, [RDI];
3669 			movups	XMM2, [RBX];
3670 			pand	XMM1, XMM2;
3671 			por		XMM1, XMM0;
3672 			movups	[RDX], XMM1;
3673 			add		RSI,16;
3674 			add		RDI,16;
3675 			add		RBX,16;
3676 			add		RDX,16;
3677 			sub		RCX, 4;
3678 			cmp		RCX, 4;
3679 			jge		fourpixelloop;
3680 		twopixel:
3681 			cmp		RCX, 2;
3682 			jl		onepixel;
3683 			movq	XMM0, [RSI];
3684 			movq	XMM1, [RDI];
3685 			movq	XMM2, [RBX];
3686 			pand	XMM1, XMM2;
3687 			por		XMM1, XMM0;
3688 			movq	[RDX], XMM1;
3689 			add		RSI, 8;
3690 			add		RDI, 8;
3691 			add		RBX, 8;
3692 			add		RDX, 8;
3693 			sub		RCX, 2;
3694 		onepixel:
3695 			cmp		RCX, 1;
3696 			jl		end;
3697 			movd	XMM0, [RSI];
3698 			movd	XMM1, [RDI];
3699 			movd	XMM2, [RBX];
3700 			pand	XMM1, XMM2;
3701 			por		XMM1, XMM0;
3702 			movd	[RDX], XMM1;
3703 		end:
3704 			;
3705 		}
3706 	}else{
3707 		while(length){
3708 			dest1.base = (dest.base & mask.base) | src.base;
3709 			mask++;
3710 			src++;
3711 			dest++;
3712 			dest1++;
3713 			length--;
3714 		}
3715 	}
3716 }
3717 /**
3718  * Implements a four plus one operand alpha-blending algorithm for 32bit bitmaps. For masking, use Pixel32Bit.AlphaMask from CPUblit.colorspaces.
3719  * Output is copied into a memory location specified by dest1.
3720  */
3721 public @nogc void alphaBlend32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){
3722 	version(X86){
3723 		version(MMX){
3724 			int target8 = length/8, target4 = length%2;
3725 			asm @nogc {
3726 				//setting up the pointer registers and the counter register
3727 				//mov		EBX, alpha[EBP];
3728 				mov		ESI, src[EBP];
3729 				mov		EDI, dest[EBP];
3730 				mov		EBX, mask[EBP];
3731 				mov		EDX, dest1[EBP];
3732 				mov		ECX, target8;
3733 				cmp		ECX, 0;
3734 				jz		fourpixelblend; //skip 16 byte operations if not needed
3735 				//iteration cycle entry point
3736 			sixteenpixelblend:
3737 				//create alpha mask on the fly
3738 				movq	MM3, [ESI];
3739 				/*movq	MM1, MM3;
3740 				pand	MM1, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
3741 				movq	MM0, MM1;
3742 				pslld	MM0, 8;
3743 				por		MM1, MM0;	//mask is ready for RA
3744 				pslld	MM1, 16;
3745 				por		MM0, MM1; //mask is ready for BGRA*/
3746 				movq	MM0, [EBX];
3747 				movq	MM1, MM0;
3748 				punpcklbw	MM0, MM2;
3749 				punpckhbw	MM1, MM2;
3750 				movq	MM6, ALPHABLEND_MMX_CONST256;
3751 				movq	MM7, MM6;
3752 				movq	MM4, ALPHABLEND_MMX_CONST1;
3753 				movq	MM5, MM4;
3754 			
3755 				paddusw	MM4, MM0;	//1 + alpha01
3756 				paddusw	MM5, MM1; //1 + alpha23 
3757 				psubusw	MM6, MM0;	//256 - alpha01
3758 				psubusw	MM7, MM1; //256 - alpha23
3759 			
3760 				//moving the values to their destinations
3761 				movq	MM0, MM3;	//src01
3762 				movq	MM1, MM0; //src23
3763 				punpcklbw	MM0, MM2;
3764 				punpckhbw	MM1, MM2;
3765 				pmullw	MM4, MM0;	//src01 * (1 + alpha01)
3766 				pmullw	MM5, MM1;	//src23 * (1 + alpha23)
3767 				movq	MM0, [EDI];	//dest01
3768 				movq	MM1, MM0;		//dest23
3769 				punpcklbw	MM0, MM2;
3770 				punpckhbw	MM1, MM2;
3771 				pmullw	MM6, MM0;	//dest01 * (256 - alpha)
3772 				pmullw	MM7, MM1; //dest23 * (256 - alpha)
3773 		
3774 				paddusw	MM4, MM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
3775 				paddusw	MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
3776 				psrlw	MM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
3777 				psrlw	MM5, 8;
3778 				//moving the result to its place;
3779 				//pxor	MM2, MM2;
3780 				packuswb	MM4, MM5;
3781 		
3782 				movq	[EDX], MM4;
3783 				//add		EBX, 16;
3784 				add		ESI, 8;
3785 				add		EDI, 8;
3786 				add		EBX, 8;
3787 				add		EDX, 8;
3788 				dec		ECX;
3789 				cmp		ECX, 0;
3790 				jnz		sixteenpixelblend;
3791 				fourpixelblend:
3792 				mov		ECX, target4;
3793 				cmp		ECX, 0;
3794 				jz		endofalgorithm;
3795 				fourpixelblendloop:
3796 
3797 				//movd	XMM6, [EBX];//alpha
3798 			
3799 
3800 				movd	MM0, [EDI];
3801 				movd	MM1, [ESI];
3802 				punpcklbw	MM0, MM2;//dest
3803 				punpcklbw	MM1, MM2;//src
3804 				movups	MM6, MM1;
3805 				pand	MM6, ALPHABLEND_MMX_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
3806 				movups	MM7, MM6;
3807 				pslld	MM6, 8;
3808 				por		MM7, MM6;	//mask is ready for RA
3809 				pslld	MM7, 16;
3810 				por		MM6, MM7; //mask is ready for GRA
3811 				punpcklbw	MM7, MM2;
3812 				movaps	MM4, ALPHABLEND_MMX_CONST256;
3813 				movaps	MM5, ALPHABLEND_MMX_CONST1;
3814 				
3815 				paddusw MM5, MM6;//1+alpha
3816 				psubusw	MM4, MM6;//256-alpha
3817 				
3818 				pmullw	MM0, MM4;//dest*(256-alpha)
3819 				pmullw	MM1, MM5;//src*(1+alpha)
3820 				paddusw	MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha))
3821 				psrlw	MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
3822 				
3823 				packuswb	MM0, MM2;
3824 				
3825 				movd	[EDX], MM0;	
3826 
3827 			endofalgorithm:
3828 				emms;
3829 			}
3830 		}else{
3831 			int target16 = length/4, target4 = length%4;
3832 			asm @nogc {
3833 				//setting up the pointer registers and the counter register
3834 				//mov		EBX, alpha[EBP];
3835 				mov		ESI, src[EBP];
3836 				mov		EDI, dest[EBP];
3837 				mov		EBX, mask[EBP];
3838 				mov		EDX, dest1[EBP];
3839 				mov		ECX, target16;
3840 				cmp		ECX, 0;
3841 				jz		fourpixelblend; //skip 16 byte operations if not needed
3842 				//iteration cycle entry point
3843 			sixteenpixelblend:
3844 				//create alpha mask on the fly
3845 				movups	XMM3, [ESI];
3846 				movups	XMM1, [EBX];
3847 				//pand	XMM1, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
3848 				//movups	XMM0, XMM1;
3849 				//pslld	XMM0, 8;
3850 				//por		XMM1, XMM0;	//mask is ready for RA
3851 				//pslld	XMM1, 16;
3852 				//por		XMM0, XMM1; //mask is ready for BGRA/**/
3853 				movups	XMM0, XMM1;
3854 				
3855 				punpcklbw	XMM0, XMM2;
3856 				punpckhbw	XMM1, XMM2;
3857 				movups	XMM6, ALPHABLEND_SSE2_CONST256;
3858 				movups	XMM7, XMM6;
3859 				movups	XMM4, ALPHABLEND_SSE2_CONST1;
3860 				movups	XMM5, XMM4;
3861 			
3862 				paddusw	XMM4, XMM0;	//1 + alpha01
3863 				paddusw	XMM5, XMM1; //1 + alpha23 
3864 				psubusw	XMM6, XMM0;	//256 - alpha01
3865 				psubusw	XMM7, XMM1; //256 - alpha23
3866 				
3867 				//moving the values to their destinations
3868 
3869 				movups	XMM0, XMM3;	//src01
3870 				movups	XMM1, XMM0; //src23
3871 				punpcklbw	XMM0, XMM2;
3872 				punpckhbw	XMM1, XMM2;
3873 				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
3874 				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
3875 				movups	XMM0, [EDI];	//dest01
3876 				movups	XMM1, XMM0;		//dest23
3877 				punpcklbw	XMM0, XMM2;
3878 				punpckhbw	XMM1, XMM2;
3879 				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
3880 				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
3881 			
3882 				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
3883 				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
3884 				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
3885 				psrlw	XMM5, 8;
3886 				//moving the result to its place;
3887 				//pxor	MM2, MM2;
3888 				packuswb	XMM4, XMM5;
3889 			
3890 				movups	[EDX], XMM4;
3891 				//add		EBX, 16;
3892 				add		ESI, 16;
3893 				add		EDI, 16;
3894 				add		EBX, 16;
3895 				add		EDX, 16;
3896 				dec		ECX;
3897 				cmp		ECX, 0;
3898 				jnz		sixteenpixelblend;
3899 
3900 			fourpixelblend:
3901 
3902 				mov		ECX, target4;
3903 				cmp		ECX, 0;
3904 				jz		endofalgorithm;
3905 
3906 			fourpixelblendloop:
3907 
3908 				//movd	XMM6, [EBX];//alpha
3909 				
3910 
3911 				movd	XMM0, [EDI];
3912 				movd	XMM1, [ESI];
3913 				punpcklbw	XMM0, XMM2;//dest
3914 				punpcklbw	XMM1, XMM2;//src
3915 				movd	XMM6, [EBX];
3916 				/*pand	XMM6, ALPHABLEND_SSE2_MASK;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
3917 				movups	XMM7, XMM6;
3918 				pslld	XMM6, 8;
3919 				por		XMM7, XMM6;	//mask is ready for RA
3920 				pslld	XMM7, 16;
3921 				por		XMM6, XMM7; //mask is ready for BGRA*/
3922 				
3923 				punpcklbw	XMM6, XMM2;
3924 				
3925 				movaps	XMM4, ALPHABLEND_SSE2_CONST256;
3926 				movaps	XMM5, ALPHABLEND_SSE2_CONST1;
3927 				
3928 				paddusw XMM5, XMM6;//1+alpha
3929 				psubusw	XMM4, XMM6;//256-alpha
3930 				
3931 				pmullw	XMM0, XMM4;//dest*(256-alpha)
3932 				pmullw	XMM1, XMM5;//src*(1+alpha)
3933 				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
3934 				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
3935 				
3936 				packuswb	XMM0, XMM2;
3937 				
3938 				movd	[EDX], XMM0;
3939 				
3940 				add		ESI, 4;
3941 				add		EDI, 4;/**/
3942 				add		EBX, 4;
3943 				add		EDX, 4;
3944 				dec		ECX;
3945 				cmp		ECX, 0;
3946 				jnz		fourpixelblendloop;
3947 
3948 			endofalgorithm:
3949 				;
3950 			}
3951 		}
3952 	}else version(X86_64){
3953 		size_t target16 = length/4, target4 = length%4;
3954 			asm @nogc {
3955 				//setting up the pointer registers and the counter register
3956 				//mov		EBX, alpha[EBP];
3957 				mov		RSI, src[RBP];
3958 				mov		RDI, dest[RBP];
3959 				mov		RBX, mask[RBP];
3960 				mov		RDX, dest1[RBP];
3961 				mov		RCX, target16;
3962 				cmp		RCX, 0;
3963 				movups	XMM8, ALPHABLEND_SSE2_CONST256;
3964 				movups	XMM9, ALPHABLEND_SSE2_CONST1;
3965 				movups	XMM10, ALPHABLEND_SSE2_MASK;
3966 				jz		fourpixelblend; //skip 16 byte operations if not needed
3967 				//iteration cycle entry point
3968 			sixteenpixelblend:
3969 				//create alpha mask on the fly
3970 				movups	XMM3, [RSI];
3971 				/*movups	XMM1, XMM3;
3972 				pand	XMM1, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
3973 				movups	XMM0, XMM1;
3974 				pslld	XMM0, 8;
3975 				por		XMM1, XMM0;	//mask is ready for RA
3976 				pslld	XMM1, 16;
3977 				por		XMM0, XMM1; //mask is ready for BGRA/**/
3978 				movups	XMM0, [RBX];
3979 				movups	XMM1, XMM0;
3980 				
3981 				punpcklbw	XMM0, XMM2;
3982 				punpckhbw	XMM1, XMM2;
3983 				movups	XMM6, XMM8;
3984 				movups	XMM7, XMM8;
3985 				movups	XMM4, XMM9;
3986 				movups	XMM5, XMM9;
3987 			
3988 				paddusw	XMM4, XMM0;	//1 + alpha01
3989 				paddusw	XMM5, XMM1; //1 + alpha23 
3990 				psubusw	XMM6, XMM0;	//256 - alpha01
3991 				psubusw	XMM7, XMM1; //256 - alpha23
3992 				
3993 				//moving the values to their destinations
3994 
3995 				movups	XMM0, XMM3;	//src01
3996 				movups	XMM1, XMM0; //src23
3997 				punpcklbw	XMM0, XMM2;
3998 				punpckhbw	XMM1, XMM2;
3999 				pmullw	XMM4, XMM0;	//src01 * (1 + alpha01)
4000 				pmullw	XMM5, XMM1;	//src23 * (1 + alpha23)
4001 				movups	XMM0, [EDI];	//dest01
4002 				movups	XMM1, XMM0;		//dest23
4003 				punpcklbw	XMM0, XMM2;
4004 				punpckhbw	XMM1, XMM2;
4005 				pmullw	XMM6, XMM0;	//dest01 * (256 - alpha)
4006 				pmullw	XMM7, XMM1; //dest23 * (256 - alpha)
4007 			
4008 				paddusw	XMM4, XMM6;	//(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01))
4009 				paddusw	XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha))
4010 				psrlw	XMM4, 8;		//(src * (1 + alpha)) + (dest * (256 - alpha)) / 256
4011 				psrlw	XMM5, 8;
4012 				//moving the result to its place;
4013 				//pxor	MM2, MM2;
4014 				packuswb	XMM4, XMM5;
4015 			
4016 				movups	[RDX], XMM4;
4017 				//add		EBX, 16;
4018 				add		RSI, 16;
4019 				add		RDI, 16;
4020 				add		RBX, 16;
4021 				add		RDX, 16;
4022 				dec		RCX;
4023 				cmp		RCX, 0;
4024 				jnz		sixteenpixelblend;
4025 
4026 			fourpixelblend:
4027 
4028 				mov		RCX, target4;
4029 				cmp		RCX, 0;
4030 				jz		endofalgorithm;
4031 
4032 			fourpixelblendloop:
4033 
4034 				//movd	XMM6, [EBX];//alpha
4035 				
4036 
4037 				movd	XMM0, [RDI];
4038 				movd	XMM1, [RSI];
4039 				punpcklbw	XMM0, XMM2;//dest
4040 				punpcklbw	XMM1, XMM2;//src
4041 				movups	XMM6, [RBX];
4042 				/*pand	XMM6, XMM10;	//pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF
4043 				movups	XMM7, XMM6;
4044 				pslld	XMM6, 8;
4045 				por		XMM7, XMM6;	//mask is ready for RA
4046 				pslld	XMM7, 16;
4047 				por		XMM6, XMM7; //mask is ready for BGRA*/
4048 				
4049 				punpcklbw	XMM6, XMM2;
4050 				
4051 				movaps	XMM4, XMM8;
4052 				movaps	XMM5, XMM9;
4053 				
4054 				paddusw XMM5, XMM6;//1+alpha
4055 				psubusw	XMM4, XMM6;//256-alpha
4056 				
4057 				pmullw	XMM0, XMM4;//dest*(256-alpha)
4058 				pmullw	XMM1, XMM5;//src*(1+alpha)
4059 				paddusw	XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha))
4060 				psrlw	XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256
4061 				
4062 				packuswb	XMM0, XMM2;
4063 				
4064 				movd	[RDI], XMM0;
4065 				
4066 				add		RSI, 4;
4067 				add		RDI, 4;/**/
4068 				add		RBX, 4;
4069 				add		RDX, 4;
4070 				dec		RCX;
4071 				cmp		RCX, 0;
4072 				jnz		fourpixelblendloop;
4073 
4074 			endofalgorithm:
4075 				;
4076 		}
4077 	}else{
4078 		for(int i ; i < length ; i++){
4079 			switch(mask.AlphaMask.value){
4080 				case 0: 
4081 					break;
4082 				case 255: 
4083 					dest = src;
4084 					break;
4085 				default:
4086 					int src1 = 1 + mask.AlphaMask.value;
4087 					int src256 = 256 - mask.AlphaMask.value;
4088 					dest1.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8);
4089 					dest1.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8);
4090 					dest1.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8);
4091 					break;
4092 			}
4093 			src++;
4094 			dest++;
4095 			dest1++;
4096 			mask++;
4097 		}
4098 	}
4099 }
4100 /**
4101  * Copies a 32bit image onto another without blitter. No transparency is used. Dest and mask is placeholder.
4102  */
4103 public @nogc void copy32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){
4104 	copy32bit(src,dest1,length);
4105 }
4106 /**
4107  * 3 + 1 operand XOR blitter. 
4108  */
4109 public @nogc void xorBlitter(T)(T* src, T* dest, T* dest1, size_t length){
4110 	static if(T == "ubyte"){
4111 		version(X86){
4112 			version(MMX){
4113 				asm @nogc{
4114 					mov		ESI, src[EBP];
4115 					mov		EDI, dest[EBP];
4116 					mov		EDX, dest1[EBP];
4117 					mov		ECX, length;
4118 					cmp		ECX, 8;
4119 					jl		fourpixel;
4120 				eightpixelloop:
4121 					movq	XMM0, [ESI];
4122 					movq	XMM1, [EDI];
4123 					pxor	XMM0, XMM1;
4124 					movq	[EDX], XMM0;
4125 					add		ESI, 8;
4126 					add		EDI, 8;
4127 					add		EDX, 8;
4128 					sub		ECX, 8;
4129 					cmp		ECX, 8;
4130 					jge		eightpixelloop;
4131 				fourpixel:
4132 					cmp		ECX, 4;
4133 					jl		singlepixelloop;
4134 					movd	XMM0, [ESI];
4135 					movd	XMM1, [EDI];
4136 					pxor	XMM0, XMM1;
4137 					movd	[EDX], XMM0;
4138 					add		ESI, 4;
4139 					add		EDI, 4;
4140 					add		EDX, 4;
4141 					sub		ECX, 4;
4142 					cmp		ECX, 0;
4143 					jle		end;
4144 				singlepixelloop:
4145 					mov		AL, [ESI];
4146 					xor		AL, [EDI];
4147 					mov		[EDX], AL;
4148 					inc		ESI;
4149 					inc		EDI;
4150 					loop	singlepixelloop;
4151 				end:
4152 					emms;
4153 				}
4154 			}else{
4155 				asm @nogc{
4156 					mov		ESI, src[EBP];
4157 					mov		EDI, dest[EBP];
4158 					mov		EDX, dest1[EBP];
4159 					mov		ECX, length;
4160 					cmp		ECX, 16;
4161 					jl		eightpixel;
4162 				sixteenpixelloop:
4163 					movups	XMM0, [ESI];
4164 					movups	XMM1, [EDI];
4165 					pxor	XMM0, XMM1;
4166 					movups	[EDX], XMM0;
4167 					add		ESI, 16;
4168 					add		EDI, 16;
4169 					add		EDX, 16;
4170 					sub		ECX, 16;
4171 					cmp		ECX, 16;
4172 					jge		sixteenpixelloop;
4173 				eightpixel:
4174 					cmp		ECX, 8;
4175 					jl		fourpixel;
4176 					movq	XMM0, [ESI];
4177 					movq	XMM1, [EDI];
4178 					pxor	XMM0, XMM1;
4179 					movq	[EDX], XMM0;
4180 					add		ESI, 8;
4181 					add		EDI, 8;
4182 					add		EDX, 8;
4183 					sub		ECX, 8;
4184 				fourpixel:
4185 					cmp		ECX, 4;
4186 					jl		singlepixelloop;
4187 					movd	XMM0, [ESI];
4188 					movd	XMM1, [EDI];
4189 					pxor	XMM0, XMM1;
4190 					movd	[EDX], XMM0;
4191 					add		ESI, 4;
4192 					add		EDI, 4;
4193 					add		EDX, 4;
4194 					sub		ECX, 4;
4195 					cmp		ECX, 0;
4196 					jle		end;
4197 				singlepixelloop:
4198 					mov		AL, [ESI];
4199 					xor		AL, [EDI];
4200 					mov		[EDX], AL;
4201 					inc		ESI;
4202 					inc		EDI;
4203 					loop	singlepixelloop;
4204 				end:
4205 					;
4206 				}
4207 			}
4208 		}else version(X86_64){
4209 			asm @nogc{
4210 				mov		RSI, src[RBP];
4211 				mov		RDI, dest[RBP];
4212 				mov		RDX, dest1[RBP];
4213 				mov		RCX, length;
4214 				cmp		RCX, 16;
4215 				jl		eightpixel;
4216 			sixteenpixelloop:
4217 				movups	XMM0, [RSI];
4218 				movups	XMM1, [RDI];
4219 				pxor	XMM0, XMM1;
4220 				movups	[RDX], XMM0;
4221 				add		RSI, 16;
4222 				add		RDI, 16;
4223 				add		RDX, 16;
4224 				sub		RCX, 16;
4225 				cmp		RCX, 16;
4226 				jge		sixteenpixelloop;
4227 			eightpixel:
4228 				cmp		RCX, 8;
4229 				jl		fourpixel;
4230 				movq	XMM0, [RSI];
4231 				movq	XMM1, [RDI];
4232 				pxor	XMM0, XMM1;
4233 				movq	[RDX], XMM0;
4234 				add		RSI, 8;
4235 				add		RDI, 8;
4236 				add		RDX, 8;
4237 				sub		RCX, 8;
4238 			fourpixel:
4239 				cmp		RCX, 4;
4240 				jl		singlepixelloop;
4241 				movd	XMM0, [RSI];
4242 				movd	XMM1, [RDI];
4243 				pxor	XMM0, XMM1;
4244 				movd	[RDX], XMM0;
4245 				add		RSI, 4;
4246 				add		RDI, 4;
4247 				add		RDX, 4;
4248 				sub		RCX, 4;
4249 				cmp		RCX, 0;
4250 				jle		end;
4251 			singlepixelloop:
4252 				mov		AL, [RSI];
4253 				xor		AL, [RDI];
4254 				mov		[RDX], AL;
4255 				inc		RSI;
4256 				inc		RDI;
4257 				loop	singlepixelloop;
4258 			end:
4259 				;
4260 			}
4261 		}else{
4262 			while(lenght){
4263 				*dest1 = *src ^ *dest;
4264 				src++;
4265 				dest++;
4266 				dest1++;
4267 				length--;
4268 			}
4269 		}
4270 	}else static if(T == "ushort"){
4271 		version(X86){
4272 			version(MMX){
4273 				asm @nogc{
4274 					mov		ESI, src[EBP];
4275 					mov		EDI, dest[EBP];
4276 					mov		EDX, dest1[EBP];
4277 					mov		ECX, length;
4278 					cmp		ECX, 4;
4279 					jl		twopixel;
4280 				fourpixelloop:
4281 					movq	XMM0, [ESI];
4282 					movq	XMM1, [EDI];
4283 					pxor	XMM0, XMM1;
4284 					movq	[EDX], XMM0;
4285 					add		ESI, 8;
4286 					add		EDI, 8;
4287 					add		EDX, 8;
4288 					sub		ECX, 4;
4289 					cmp		ECX, 4;
4290 					jge		fourpixelloop;
4291 				twopixel:
4292 					cmp		ECX, 4;
4293 					jl		onepixel;
4294 					movd	XMM0, [ESI];
4295 					movd	XMM1, [EDI];
4296 					pxor	XMM0, XMM1;
4297 					movd	[EDX], XMM0;
4298 					add		ESI, 4;
4299 					add		EDI, 4;
4300 					add		EDX, 4;
4301 					sub		ECX, 2;
4302 					cmp		ECX, 0;
4303 					jle		end;
4304 				onepixel:
4305 					mov		AX, [ESI];
4306 					xor		AX, [EDI];
4307 					mov		[EDX], AX;
4308 				end:
4309 					emms;
4310 				}
4311 			}else{
4312 				asm @nogc{
4313 					mov		ESI, src[EBP];
4314 					mov		EDI, dest[EBP];
4315 					mov		EDX, dest1[EBP];
4316 					mov		ECX, length;
4317 					cmp		ECX, 8;
4318 					jl		fourpixel;
4319 				eightpixelloop:
4320 					movups	XMM0, [ESI];
4321 					movups	XMM1, [EDI];
4322 					pxor	XMM0, XMM1;
4323 					movups	[EDX], XMM0;
4324 					add		ESI,16;
4325 					add		EDI,16;
4326 					add		EDX,16;
4327 					sub		ECX, 8;
4328 					cmp		ECX, 8;
4329 					jge		eightpixelloop;
4330 				fourpixel:
4331 					cmp		ECX, 4;
4332 					jl		twopixel;
4333 					movq	XMM0, [ESI];
4334 					movq	XMM1, [EDI];
4335 					pxor	XMM0, XMM1;
4336 					movq	[EDX], XMM0;
4337 					add		ESI, 8;
4338 					add		EDI, 8;
4339 					add		EDX, 8;
4340 					sub		ECX, 4;
4341 				twopixel:
4342 					cmp		ECX, 2;
4343 					jl		onepixel;
4344 					movd	XMM0, [ESI];
4345 					movd	XMM1, [EDI];
4346 					pxor	XMM0, XMM1;
4347 					movd	[EDX], XMM0;
4348 					add		ESI, 4;
4349 					add		EDI, 4;
4350 					add		EDX, 4;
4351 					sub		ECX, 2;
4352 					cmp		ECX, 0;
4353 					jle		end;
4354 				onepixel:
4355 					mov		AX, [ESI];
4356 					xor		AX, [EDI];
4357 					mov		[EDX], AX;
4358 				end:
4359 					;
4360 				}
4361 			}
4362 		}else version(X86_64){
4363 			asm @nogc{
4364 				mov		RSI, src[RBP];
4365 				mov		RDI, dest[RBP];
4366 				mov		RDX, dest1[RBP];
4367 				mov		RCX, length;
4368 				cmp		RCX, 8;
4369 				jl		fourpixel;
4370 			eightpixelloop:
4371 				movups	XMM0, [RSI];
4372 				movups	XMM1, [RDI];
4373 				pxor	XMM0, XMM1;
4374 				movups	[RDX], XMM0;
4375 				add		RSI,16;
4376 				add		RDI,16;
4377 				add		RDX,16;
4378 				sub		RCX, 8;
4379 				cmp		RCX, 8;
4380 				jge		eightpixelloop;
4381 			fourpixel:
4382 				cmp		RCX, 4;
4383 				jl		twopixel;
4384 				movq	XMM0, [RSI];
4385 				movq	XMM1, [RDI];
4386 				pxor	XMM0, XMM1;
4387 				movq	[RDX], XMM0;
4388 				add		RSI, 8;
4389 				add		RDI, 8;
4390 				add		RDX, 8;
4391 				sub		RCX, 4;
4392 			twopixel:
4393 				cmp		RCX, 2;
4394 				jl		singlepixelloop;
4395 				movd	XMM0, [RSI];
4396 				movd	XMM1, [RDI];
4397 				pxor	XMM0, XMM1;
4398 				movd	[RDX], XMM0;
4399 				add		RSI, 4;
4400 				add		RDI, 4;
4401 				add		RDX, 4;
4402 				sub		RCX, 2;
4403 				cmp		RCX, 0;
4404 				jle		end;
4405 			onepixel:
4406 				mov		AX, [RSI];
4407 				xor		AX, [RDI];
4408 				mov		[RDX], AX;
4409 			end:
4410 				;
4411 			}
4412 		}else{
4413 			while(lenght){
4414 				*dest1 = *src ^ *dest;
4415 				src++;
4416 				dest++;
4417 				dest1++;
4418 				length--;
4419 			}
4420 		}
4421 	}
4422 	static if(T == "uint"){
4423 		version(X86){
4424 			version(MMX){
4425 				asm @nogc{
4426 					mov		ESI, src[EBP];
4427 					mov		EDI, dest[EBP];
4428 					mov		EDX, dest1[EBP];
4429 					mov		ECX, length;
4430 					cmp		ECX, 2;
4431 					jl		onepixel;
4432 				twopixelloop:
4433 					movq	XMM0, [ESI];
4434 					movq	XMM1, [EDI];
4435 					pxor	XMM0, XMM1;
4436 					movq	[EDX], XMM0;
4437 					add		ESI, 8;
4438 					add		EDI, 8;
4439 					add		EDX, 8;
4440 					sub		ECX, 2;
4441 					cmp		ECX, 2;
4442 					jge		twopixelloop;
4443 				onepixel:
4444 					cmp		ECX, 1;
4445 					jl		end;
4446 					movd	XMM0, [ESI];
4447 					movd	XMM1, [EDI];
4448 					pxor	XMM0, XMM1;
4449 					movd	[EDX], XMM0;
4450 				end:
4451 					emms;
4452 				}
4453 			}else{
4454 				asm @nogc{
4455 					mov		ESI, src[EBP];
4456 					mov		EDI, dest[EBP];
4457 					mov		EDX, dest1[EBP];
4458 					mov		ECX, length;
4459 					cmp		ECX, 4;
4460 					jl		twopixel;
4461 				fourpixelloop:
4462 					movups	XMM0, [ESI];
4463 					movups	XMM1, [EDI];
4464 					pxor	XMM0, XMM1;
4465 					movups	[EDX], XMM0;
4466 					add		ESI,16;
4467 					add		EDI,16;
4468 					add		EDX,16;
4469 					sub		ECX, 4;
4470 					cmp		ECX, 4;
4471 					jge		fourpixelloop;
4472 				twopixel:
4473 					cmp		ECX, 2;
4474 					jl		onepixel;
4475 					movq	XMM0, [ESI];
4476 					movq	XMM1, [EDI];
4477 					pxor	XMM0, XMM1;
4478 					movq	[EDX], XMM0;
4479 					add		ESI, 8;
4480 					add		EDI, 8;
4481 					add		EDX, 8;
4482 					sub		ECX, 2;
4483 				onepixel:
4484 					cmp		ECX, 1;
4485 					jl		end;
4486 					movd	XMM0, [ESI];
4487 					movd	XMM1, [EDI];
4488 					pxor	XMM0, XMM1;
4489 					movd	[EDX], XMM0;
4490 					
4491 				end:
4492 					;
4493 				}
4494 			}
4495 		}else version(X86_64){
4496 			asm @nogc{
4497 				mov		RSI, src[RBP];
4498 				mov		RDI, dest[RBP];
4499 				mov		RDX, dest1[RBP];
4500 				mov		RCX, length;
4501 				cmp		RCX, 4;
4502 				jl		twopixel;
4503 			fourpixelloop:
4504 				movups	XMM0, [RSI];
4505 				movups	XMM1, [RDI];
4506 				pxor	XMM0, XMM1;
4507 				movups	[RDX], XMM0;
4508 				add		RSI,16;
4509 				add		RDI,16;
4510 				add		RDX,16;
4511 				sub		RCX, 4;
4512 				cmp		RCX, 4;
4513 				jge		fourpixelloop;
4514 			twopixel:
4515 				cmp		RCX, 2;
4516 				jl		onepixel;
4517 				movq	XMM0, [RSI];
4518 				movq	XMM1, [RDI];
4519 				pxor	XMM0, XMM1;
4520 				movq	[RDX], XMM0;
4521 				add		RSI, 2;
4522 				add		RDI, 2;
4523 				add		RDX, 2;
4524 				sub		RCX, 2;
4525 			onepixel:
4526 				cmp		RCX, 1;
4527 				jl		end;
4528 				movd	XMM0, [RSI];
4529 				movd	XMM1, [RDI];
4530 				pxor	XMM0, XMM1;
4531 				movd	[RDX], XMM0;
4532 			end:
4533 				;
4534 			}
4535 		}else{
4536 			while(lenght){
4537 				*dest1 = *src ^ *dest;
4538 				src++;
4539 				dest++;
4540 				dest1++;
4541 				length--;
4542 			}
4543 		}
4544 	}
4545 }
4546 /**
4547  * 2 + 1 operand XOR blitter. 
4548  */
4549 public @nogc void xorBlitter(T)(T* src, T* dest, size_t length){
4550 	static if(T == "ubyte"){
4551 		version(X86){
4552 			version(MMX){
4553 				asm @nogc{
4554 					mov		ESI, src[EBP];
4555 					mov		EDI, dest[EBP];
4556 					mov		ECX, length;
4557 					cmp		ECX, 8;
4558 					jl		fourpixel;
4559 				eightpixelloop:
4560 					movq	XMM0, [ESI];
4561 					movq	XMM1, [EDI];
4562 					pxor	XMM0, XMM1;
4563 					movq	[EDI], XMM0;
4564 					add		ESI, 8;
4565 					add		EDI, 8;
4566 					sub		ECX, 8;
4567 					cmp		ECX, 8;
4568 					jge		eightpixelloop;
4569 				fourpixel:
4570 					cmp		ECX, 4;
4571 					jl		singlepixelloop;
4572 					movd	XMM0, [ESI];
4573 					movd	XMM1, [EDI];
4574 					pxor	XMM0, XMM1;
4575 					movd	[EDI], XMM0;
4576 					add		ESI, 4;
4577 					add		EDI, 4;
4578 					sub		ECX, 4;
4579 					cmp		ECX, 0;
4580 					jle		end;
4581 				singlepixelloop:
4582 					mov		AL, [ESI];
4583 					xor		AL, [EDI];
4584 					mov		[EDI], AL;
4585 					inc		ESI;
4586 					inc		EDI;
4587 					loop	singlepixelloop;
4588 				end:
4589 					emms;
4590 				}
4591 			}else{
4592 				asm @nogc{
4593 					mov		ESI, src[EBP];
4594 					mov		EDI, dest[EBP];
4595 					mov		ECX, length;
4596 					cmp		ECX, 16;
4597 					jl		eightpixel;
4598 				sixteenpixelloop:
4599 					movups	XMM0, [ESI];
4600 					movups	XMM1, [EDI];
4601 					pxor	XMM0, XMM1;
4602 					movups	[EDI], XMM0;
4603 					add		ESI, 16;
4604 					add		EDI, 16;
4605 					sub		ECX, 16;
4606 					cmp		ECX, 16;
4607 					jge		sixteenpixelloop;
4608 				eightpixel:
4609 					cmp		ECX, 8;
4610 					jl		fourpixel;
4611 					movq	XMM0, [ESI];
4612 					movq	XMM1, [EDI];
4613 					pxor	XMM0, XMM1;
4614 					movq	[EDI], XMM0;
4615 					add		ESI, 8;
4616 					add		EDI, 8;
4617 					sub		ECX, 8;
4618 				fourpixel:
4619 					cmp		ECX, 4;
4620 					jl		singlepixelloop;
4621 					movd	XMM0, [ESI];
4622 					movd	XMM1, [EDI];
4623 					pxor	XMM0, XMM1;
4624 					movd	[EDI], XMM0;
4625 					add		ESI, 4;
4626 					add		EDI, 4;
4627 					sub		ECX, 4;
4628 					cmp		ECX, 0;
4629 					jle		end;
4630 				singlepixelloop:
4631 					mov		AL, [ESI];
4632 					xor		AL, [EDI];
4633 					mov		[EDI], AL;
4634 					inc		ESI;
4635 					inc		EDI;
4636 					loop	singlepixelloop;
4637 				end:
4638 					;
4639 				}
4640 			}
4641 		}else version(X86_64){
4642 			asm @nogc{
4643 				mov		RSI, src[RBP];
4644 				mov		RDI, dest[RBP];
4645 				mov		RCX, length;
4646 				cmp		RCX, 16;
4647 				jl		eightpixel;
4648 			sixteenpixelloop:
4649 				movups	XMM0, [RSI];
4650 				movups	XMM1, [RDI];
4651 				pxor	XMM0, XMM1;
4652 				movups	[RDI], XMM0;
4653 				add		RSI, 16;
4654 				add		RDI, 16;
4655 				sub		RCX, 16;
4656 				cmp		RCX, 16;
4657 				jge		sixteenpixelloop;
4658 			eightpixel:
4659 				cmp		RCX, 8;
4660 				jl		fourpixel;
4661 				movq	XMM0, [RSI];
4662 				movq	XMM1, [RDI];
4663 				pxor	XMM0, XMM1;
4664 				movq	[RDI], XMM0;
4665 				add		RSI, 8;
4666 				add		RDI, 8;
4667 				sub		RCX, 8;
4668 			fourpixel:
4669 				cmp		RCX, 4;
4670 				jl		singlepixelloop;
4671 				movd	XMM0, [RSI];
4672 				movd	XMM1, [RDI];
4673 				pxor	XMM0, XMM1;
4674 				movd	[RDI], XMM0;
4675 				add		RSI, 4;
4676 				add		RDI, 4;
4677 				sub		RCX, 4;
4678 				cmp		RCX, 0;
4679 				jle		end;
4680 			singlepixelloop:
4681 				mov		AL, [RSI];
4682 				xor		AL, [RDI];
4683 				mov		[RDI], AL;
4684 				inc		RSI;
4685 				inc		RDI;
4686 				loop	singlepixelloop;
4687 			end:
4688 				;
4689 			}
4690 		}else{
4691 			while(lenght){
4692 				*dest1 = *src ^ *dest;
4693 				src++;
4694 				dest++;
4695 				dest1++;
4696 				length--;
4697 			}
4698 		}
4699 	}else static if(T == "ushort"){
4700 		version(X86){
4701 			version(MMX){
4702 				asm @nogc{
4703 					mov		ESI, src[EBP];
4704 					mov		EDI, dest[EBP];
4705 					mov		ECX, length;
4706 					cmp		ECX, 4;
4707 					jl		twopixel;
4708 				fourpixelloop:
4709 					movq	XMM0, [ESI];
4710 					movq	XMM1, [EDI];
4711 					pxor	XMM0, XMM1;
4712 					movq	[EDI], XMM0;
4713 					add		ESI, 8;
4714 					add		EDI, 8;
4715 					sub		ECX, 4;
4716 					cmp		ECX, 4;
4717 					jge		fourpixelloop;
4718 				twopixel:
4719 					cmp		ECX, 4;
4720 					jl		onepixel;
4721 					movd	XMM0, [ESI];
4722 					movd	XMM1, [EDI];
4723 					pxor	XMM0, XMM1;
4724 					movd	[EDX], XMM0;
4725 					add		ESI, 4;
4726 					add		EDI, 4;
4727 					sub		ECX, 2;
4728 					cmp		ECX, 0;
4729 					jle		end;
4730 				onepixel:
4731 					mov		AX, [ESI];
4732 					xor		AX, [EDI];
4733 					mov		[EDI], AX;
4734 				end:
4735 					emms;
4736 				}
4737 			}else{
4738 				asm @nogc{
4739 					mov		ESI, src[EBP];
4740 					mov		EDI, dest[EBP];
4741 					mov		EDX, dest1[EBP];
4742 					mov		ECX, length;
4743 					cmp		ECX, 8;
4744 					jl		fourpixel;
4745 				eightpixelloop:
4746 					movups	XMM0, [ESI];
4747 					movups	XMM1, [EDI];
4748 					pxor	XMM0, XMM1;
4749 					movups	[EDX], XMM0;
4750 					add		ESI, 8;
4751 					add		EDI, 8;
4752 					sub		ECX, 8;
4753 					cmp		ECX, 8;
4754 					jge		eightpixelloop;
4755 				fourpixel:
4756 					cmp		ECX, 4;
4757 					jl		twopixel;
4758 					movq	XMM0, [ESI];
4759 					movq	XMM1, [EDI];
4760 					pxor	XMM0, XMM1;
4761 					movq	[EDI], XMM0;
4762 					add		ESI, 4;
4763 					add		EDI, 4;
4764 					sub		ECX, 4;
4765 				twopixel:
4766 					cmp		ECX, 2;
4767 					jl		onepixel;
4768 					movd	XMM0, [ESI];
4769 					movd	XMM1, [EDI];
4770 					pxor	XMM0, XMM1;
4771 					movd	[EDI], XMM0;
4772 					add		ESI, 2;
4773 					add		EDI, 2;
4774 					sub		ECX, 2;
4775 					cmp		ECX, 0;
4776 					jle		end;
4777 				onepixel:
4778 					mov		AX, [ESI];
4779 					xor		AX, [EDI];
4780 					mov		[EDI], AX;
4781 				end:
4782 					;
4783 				}
4784 			}
		}else version(X86_64){
			// x86-64 SSE2 path of the 16-bit (ushort) XOR blitter.
			// NOTE(review): same pixel-vs-byte advance issue as the 32-bit path (+8 after a
			// 16-byte movups, +4 after an 8-byte movq, +2 after a 4-byte movd); additionally,
			// `jl singlepixelloop` references a label that does not exist in this block (the
			// actual label is `onepixel`), which should fail to assemble on this target.
			asm @nogc{
				mov		RSI, src[RBP];		// RSI = source pointer
				mov		RDI, dest[RBP];		// RDI = destination pointer
				mov		RCX, length;		// RCX = remaining pixel count
				cmp		RCX, 8;
				jl		fourpixel;
			eightpixelloop:
				movups	XMM0, [RSI];		// 8 ushorts = 16 bytes
				movups	XMM1, [RDI];
				pxor	XMM0, XMM1;			// dest = src XOR dest
				movups	[RDI], XMM0;
				add		RSI, 8;				// NOTE(review): 16 bytes were processed — should be +16?
				add		RDI, 8;
				sub		RCX, 8;
				cmp		RCX, 8;
				jge		eightpixelloop;
			fourpixel:
				cmp		RCX, 4;
				jl		twopixel;
				movq	XMM0, [RSI];		// 4 ushorts = 8 bytes
				movq	XMM1, [RDI];
				pxor	XMM0, XMM1;
				movq	[RDI], XMM0;
				add		RSI, 4;				// NOTE(review): 8 bytes were processed — should be +8?
				add		RDI, 4;
				sub		RCX, 4;
			twopixel:
				cmp		RCX, 2;
				jl		singlepixelloop;	// NOTE(review): undefined label — should be `onepixel`?
				movd	XMM0, [RSI];		// 2 ushorts = 4 bytes
				movd	XMM1, [RDI];
				pxor	XMM0, XMM1;
				movd	[RDI], XMM0;
				add		RSI, 2;				// NOTE(review): 4 bytes were processed — should be +4?
				add		RDI, 2;
				sub		RCX, 2;
				cmp		RCX, 0;
				jle		end;
			onepixel:
				mov		AX, [RSI];			// final single ushort
				xor		AX, [RDI];
				mov		[RDI], AX;
			end:
				;
			}
		}else{
			// Portable fallback for non-x86 targets.
			// NOTE(review): `lenght` is a typo for `length` — this branch cannot compile when
			// selected. It also writes through dest1 (three-operand form) while the asm paths
			// mostly store back to dest — confirm the intended operand count for this blitter.
			while(lenght){
				*dest1 = *src ^ *dest;
				src++;
				dest++;
				dest1++;
				length--;
			}
		}
	}
	// 32-bit (uint) element branch of the XOR blitter template. Unlike the ushort
	// branch above, the pointer advances here are consistent with the byte counts
	// actually processed (movups = 16 B / 4 px, movq = 8 B / 2 px, movd = 4 B / 1 px).
	static if(T == "uint"){
		version(X86){
			version(MMX){
				// NOTE(review): as in the ushort MMX branch, XMM registers are used but the
				// block ends with EMMS — MM0/MM1 were presumably intended for the MMX target.
				asm @nogc{
					mov		ESI, src[EBP];		// ESI = source pointer
					mov		EDI, dest[EBP];		// EDI = destination pointer
					mov		ECX, length;		// ECX = remaining pixel count
					cmp		ECX, 2;
					jl		onepixel;
				twopixelloop:
					movq	XMM0, [ESI];		// 2 uints = 8 bytes
					movq	XMM1, [EDI];
					pxor	XMM0, XMM1;			// dest = src XOR dest
					movq	[EDI], XMM0;
					add		ESI, 8;				// advance matches the 8 bytes processed
					add		EDI, 8;
					sub		ECX, 2;
					cmp		ECX, 2;
					jge		twopixelloop;
				onepixel:
					cmp		ECX, 1;
					jl		end;
					movd	XMM0, [ESI];		// final single uint
					movd	XMM1, [EDI];
					pxor	XMM0, XMM1;
					movd	[EDI], XMM0;
				end:
					emms;						// clear MMX state (only meaningful if MM regs are used)
				}
			}else{
				// SSE2 (32-bit x86) path: 4 pixels per iteration, then 2, then 1.
				asm @nogc{
					mov		ESI, src[EBP];		// ESI = source pointer
					mov		EDI, dest[EBP];		// EDI = destination pointer
					mov		ECX, length;		// ECX = remaining pixel count
					cmp		ECX, 4;
					jl		twopixel;
				fourpixelloop:
					movups	XMM0, [ESI];		// 4 uints = 16 bytes
					movups	XMM1, [EDI];
					pxor	XMM0, XMM1;
					movups	[EDI], XMM0;
					add		ESI,16;
					add		EDI,16;
					sub		ECX, 4;
					cmp		ECX, 4;
					jge		fourpixelloop;
				twopixel:
					cmp		ECX, 2;
					jl		onepixel;
					movq	XMM0, [ESI];		// 2 uints = 8 bytes
					movq	XMM1, [EDI];
					pxor	XMM0, XMM1;
					movq	[EDI], XMM0;
					add		ESI, 8;
					add		EDI, 8;
					sub		ECX, 2;
				onepixel:
					cmp		ECX, 1;
					jl		end;
					movd	XMM0, [ESI];		// final single uint
					movd	XMM1, [EDI];
					pxor	XMM0, XMM1;
					movd	[EDI], XMM0;
					
				end:
					;
				}
			}
		}else version(X86_64){
			// x86-64 SSE2 path: mirrors the 32-bit SSE2 path with 64-bit registers.
			asm @nogc{
				mov		RSI, src[RBP];		// RSI = source pointer
				mov		RDI, dest[RBP];		// RDI = destination pointer
				mov		RCX, length;		// RCX = remaining pixel count
				cmp		RCX, 4;
				jl		twopixel;
			fourpixelloop:
				movups	XMM0, [RSI];		// 4 uints = 16 bytes
				movups	XMM1, [RDI];
				pxor	XMM0, XMM1;			// dest = src XOR dest
				movups	[RDI], XMM0;
				add		RSI,16;
				add		RDI,16;
				sub		RCX, 4;
				cmp		RCX, 4;
				jge		fourpixelloop;
			twopixel:
				cmp		RCX, 2;
				jl		onepixel;
				movq	XMM0, [RSI];		// 2 uints = 8 bytes
				movq	XMM1, [RDI];
				pxor	XMM0, XMM1;
				movq	[RDI], XMM0;
				add		RSI, 8;
				add		RDI, 8;
				sub		RCX, 2;
			onepixel:
				cmp		RCX, 1;
				jl		end;
				movd	XMM0, [RSI];		// final single uint
				movd	XMM1, [RDI];
				pxor	XMM0, XMM1;
				movd	[RDI], XMM0;
			end:
				;
			}
		}else{
			// Portable fallback for non-x86 targets.
			// NOTE(review): `lenght` is a typo for `length` — this branch cannot compile when
			// selected; it also writes through dest1 while the asm paths store back to dest.
			while(lenght){
				*dest1 = *src ^ *dest;
				src++;
				dest++;
				dest1++;
				length--;
			}
		}
	// NOTE(review): `static assert("...")` passes a non-empty string as the CONDITION, which is
	// always truthy in D, so this assert can never fire; `static assert(0, "...message...")` was
	// almost certainly intended to reject unsupported template parameters at compile time.
	}else static assert("Template parameter '"~ T.stringof ~"' not supported!");
}