1 module CPUblit.composing; 2 3 import CPUblit.colorspaces; 4 5 /** 6 * CPUblit 7 * Low-level image composing functions 8 * Author: Laszlo Szeremi 9 * Contains 2, 3, and 4 operand functions. 10 * All blitter follows this formula: dest1 = (dest & mask) | src 11 * Two plus one operand blitter is done via evaluation on systems that don't support vector operations. 12 * Alpha-blending function formula: dest1 = (src * (1 + alpha) + dest * (256 - alpha)) >> 8 13 * Where it was possible I implemented vector support. Due to various quirks I needed (such as the ability of load unaligned values, and load less than 128/64bits), I often 14 * had to rely on assembly. As the functions themselves aren't too complicated it wasn't an impossible task, but makes debugging time-consuming. 15 * See specific functions for more information. 16 */ 17 18 //import core.simd; 19 //package immutable ubyte[16] NULLVECT_SSE2; 20 package immutable uint[4] BLT32BITTESTER_SSE2 = [0x01000000,0x01000000,0x01000000,0x01000000]; 21 package immutable ushort[8] ALPHABLEND_SSE2_CONST1 = [1,1,1,1,1,1,1,1]; 22 package immutable ushort[8] ALPHABLEND_SSE2_CONST256 = [256,256,256,256,256,256,256,256]; 23 package immutable ubyte[16] ALPHABLEND_SSE2_MASK = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]; 24 //package immutable ubyte[8] NULLVECT_MMX; 25 package immutable uint[2] BLT32BITTESTER_MMX = [0x01000000,0x01000000]; 26 package immutable ushort[4] ALPHABLEND_MMX_CONST1 = [1,1,1,1]; 27 package immutable ushort[4] ALPHABLEND_MMX_CONST256 = [256,256,256,256]; 28 package immutable ubyte[8] ALPHABLEND_MMX_MASK = [255,0,0,0,255,0,0,0]; 29 /** 30 * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula: 31 * mask = src == 0x00 ? 0xFF : 0x00 32 */ 33 public @nogc void blitter8bit(ubyte* src, ubyte* dest, size_t length){ 34 version(X86){ 35 version(MMX){ 36 asm @nogc{ 37 pxor MM7, MM7; 38 mov ESI, src[EBP]; 39 mov EDI, dest[EBP]; 40 mov ECX, length; 41 cmp ECX, 8; 42 jl fourpixel; 43 eigthpixelloop: 44 movq MM0, [ESI]; 45 movq MM1, [EDI]; 46 movq MM2, MM7; 47 pcmpeqb MM2, MM0; 48 pand MM1, MM2; 49 por MM1, MM0; 50 movq [EDI], MM1; 51 add ESI, 8; 52 add EDI, 8; 53 sub ECX, 8; 54 jge eigthpixelloop; 55 fourpixel: 56 cmp ECX, 4; 57 jl singlepixelloop; 58 movd MM0, [ESI]; 59 movd MM1, [EDI]; 60 movq MM2, MM7; 61 pcmpeqb MM2, MM0; 62 pand MM1, MM2; 63 por MM1, MM0; 64 movd [EDI], MM1; 65 add ESI, 4; 66 add EDI, 4; 67 sub ECX, 4; 68 singlepixelloop: 69 //cmp ECX, 0; 70 jecxz end; 71 mov AL, [ESI]; 72 cmp AL, 0; 73 jz step; 74 mov AL, [EDI]; 75 step: 76 mov [EDI], AL; 77 cmp ECX, 0; 78 inc ESI; 79 inc EDI; 80 dec ECX; 81 jmp singlepixelloop; 82 end: 83 emms; 84 } 85 }else{ 86 asm @nogc{ 87 pxor XMM7, XMM7; 88 mov ESI, src[EBP]; 89 mov EDI, dest[EBP]; 90 mov ECX, length; 91 cmp ECX, 16; 92 jl eightpixel; 93 sixteenpixelloop: 94 movups XMM0, [ESI]; 95 movups XMM1, [EDI]; 96 movups XMM2, XMM7; 97 pcmpeqb XMM2, XMM0; 98 pand XMM1, XMM2; 99 por XMM1, XMM0; 100 movups [EDI], XMM1; 101 add ESI, 16; 102 add EDI, 16; 103 sub ECX, 16; 104 cmp ECX, 16; 105 jge sixteenpixelloop; 106 eightpixel: 107 cmp ECX, 8; 108 jl fourpixel; 109 movq XMM0, [ESI]; 110 movq XMM1, [EDI]; 111 movups XMM2, XMM7; 112 pcmpeqb XMM2, XMM0; 113 pand XMM1, XMM2; 114 por XMM1, XMM0; 115 movq [EDI], XMM1; 116 add ESI, 8; 117 add EDI, 8; 118 sub ECX, 8; 119 fourpixel: 120 cmp ECX, 4; 121 jl singlepixelloop; 122 movd XMM0, [ESI]; 123 movd XMM1, [EDI]; 124 movups XMM2, XMM7; 125 pcmpeqb XMM2, XMM0; 126 pand XMM1, XMM2; 127 por XMM1, XMM0; 128 movd [EDI], XMM1; 129 add ESI, 4; 130 add EDI, 4; 131 sub ECX, 4; 132 singlepixelloop: 133 //cmp ECX, 0; 134 jecxz end; 135 mov AL, [ESI]; 136 cmp AL, 0; 137 jz step; 138 mov AL, [EDI]; 139 step: 140 mov [EDI], AL; 141 cmp ECX, 0; 142 inc ESI; 143 inc EDI; 144 dec ECX; 145 jmp singlepixelloop; 146 end: 147 ; 148 } 149 } 150 }else version(X86_64){ 151 asm @nogc{ 152 pxor XMM7, XMM7; 153 mov RSI, src[RBP]; 154 mov RDI, dest[RBP]; 155 mov RCX, length; 156 cmp RCX, 16; 157 jl eightpixel; 158 sixteenpixelloop: 159 movups XMM0, [RSI]; 160 movups XMM1, [RDI]; 161 movups XMM2, XMM7; 162 pcmpeqb XMM2, XMM0; 163 pand XMM1, XMM2; 164 por XMM1, XMM0; 165 movups [RDI], XMM1; 166 add RSI, 16; 167 add RDI, 16; 168 sub RCX, 16; 169 cmp RCX, 16; 170 jge sixteenpixelloop; 171 eightpixel: 172 cmp RCX, 8; 173 jl fourpixel; 174 movq XMM0, [RSI]; 175 movq XMM1, [RDI]; 176 movups XMM2, XMM7; 177 pcmpeqb XMM2, XMM0; 178 pand XMM1, XMM2; 179 por XMM1, XMM0; 180 movq [RDI], XMM1; 181 add RSI, 8; 182 add RDI, 8; 183 sub RCX, 8; 184 fourpixel: 185 cmp RCX, 4; 186 jl singlepixelloop; 187 movd XMM0, [RSI]; 188 movd XMM1, [RDI]; 189 movups XMM2, XMM7; 190 pcmpeqb XMM2, XMM0; 191 pand XMM1, XMM2; 192 por XMM1, XMM0; 193 movd [RDI], XMM1; 194 add RSI, 4; 195 add RDI, 4; 196 sub RCX, 4; 197 singlepixelloop: 198 cmp RCX, 0; 199 jz end; 200 mov AL, [RSI]; 201 cmp AL, 0; 202 jz step; 203 mov AL, [EDI]; 204 step: 205 mov [RDI], AL; 206 cmp RCX, 0; 207 inc RSI; 208 inc RDI; 209 dec RCX; 210 jmp singlepixelloop; 211 end: 212 ; 213 } 214 }else{ 215 while(length){ 216 if(*src) 217 *dest = *src; 218 src++; 219 dest++; 220 length--; 221 } 222 } 223 } 224 /** 225 * Copies an 8bit image onto another without blitter. No transparency is used. 226 */ 227 public @nogc void copy8bit(ubyte* src, ubyte* dest, size_t length){ 228 version(X86){ 229 version(MMX){ 230 asm @nogc{ 231 mov ESI, src[EBP]; 232 mov EDI, dest[EBP]; 233 mov ECX, length; 234 cmp ECX, 8; 235 jl fourpixel; 236 eigthpixelloop: 237 movq MM0, [ESI]; 238 movq [EDI], MM0; 239 add ESI, 8; 240 add EDI, 8; 241 sub ECX, 8; 242 jge eigthpixelloop; 243 fourpixel: 244 cmp ECX, 4; 245 jl singlepixelloop; 246 movd MM0, [ESI]; 247 movd [EDI], MM0; 248 add ESI, 4; 249 add EDI, 4; 250 sub ECX, 4; 251 singlepixelloop: 252 //cmp ECX, 0; 253 jecxz end; 254 mov AL, [ESI]; 255 mov [EDI], AL; 256 cmp ECX, 0; 257 inc ESI; 258 inc EDI; 259 dec ECX; 260 jnz singlepixelloop; 261 end: 262 ; 263 } 264 }else{ 265 asm @nogc{ 266 mov ESI, src[EBP]; 267 mov EDI, dest[EBP]; 268 mov ECX, length; 269 cmp ECX, 16; 270 jl eightpixel; 271 sixteenpixelloop: 272 movups XMM0, [ESI]; 273 movups [EDI], XMM0; 274 add ESI, 16; 275 add EDI, 16; 276 sub ECX, 16; 277 cmp ECX, 16; 278 jge sixteenpixelloop; 279 eightpixel: 280 cmp ECX, 8; 281 jl fourpixel; 282 movq XMM0, [ESI]; 283 movq [EDI], XMM0; 284 add ESI, 8; 285 add EDI, 8; 286 sub ECX, 8; 287 fourpixel: 288 cmp ECX, 4; 289 jl singlepixelloop; 290 movd XMM0, [ESI]; 291 movd XMM1, [EDI]; 292 movups XMM2, XMM7; 293 pcmpeqb XMM2, XMM0; 294 pand XMM1, XMM2; 295 por XMM1, XMM0; 296 movd [EDI], XMM1; 297 add ESI, 4; 298 add EDI, 4; 299 sub ECX, 4; 300 singlepixelloop: 301 //cmp ECX, 0; 302 jecxz end; 303 mov AL, [ESI]; 304 mov [EDI], AL; 305 cmp ECX, 0; 306 inc ESI; 307 inc EDI; 308 dec ECX; 309 jnz singlepixelloop; 310 end: 311 ; 312 } 313 } 314 }else version(X86_64){ 315 asm @nogc{ 316 mov RSI, src[RBP]; 317 mov RDI, dest[RBP]; 318 mov RCX, length; 319 cmp RCX, 16; 320 jl eightpixel; 321 sixteenpixelloop: 322 movups XMM0, [RSI]; 323 movups [RDI], XMM0; 324 add RSI, 16; 325 add RDI, 16; 326 sub RCX, 16; 327 cmp RCX, 16; 328 jge sixteenpixelloop; 329 eightpixel: 330 cmp RCX, 8; 331 jl fourpixel; 332 movq XMM0, [RSI]; 333 movq [RDI], XMM0; 334 add RSI, 8; 335 add RDI, 8; 336 sub RCX, 8; 337 fourpixel: 338 cmp RCX, 4; 339 jl singlepixelloop; 340 movd XMM1, [RSI]; 341 movd [RDI], XMM1; 342 add RSI, 4; 343 add RDI, 4; 344 sub RCX, 4; 345 singlepixelloop: 346 cmp RCX, 0; 347 jz end; 348 mov AL, [RSI]; 349 mov [RDI], AL; 350 cmp RCX, 0; 351 inc RSI; 352 inc RDI; 353 dec RCX; 354 jmp singlepixelloop; 355 end: 356 ; 357 } 358 }else{ 359 while(length){ 360 *dest = *src; 361 src++; 362 dest++; 363 length--; 364 } 365 } 366 } 367 /** 368 * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula: 369 * mask = src == 0x0000 ? 0xFFFF : 0x0000 370 */ 371 public @nogc void blitter16bit(ushort* src, ushort* dest, size_t length){ 372 version(X86){ 373 version(MMX){ 374 asm @nogc{ 375 pxor MM7, MM7; 376 mov ESI, src[EBP]; 377 mov EDI, dest[EBP]; 378 mov ECX, length; 379 cmp ECX, 4; 380 jl twopixel; 381 fourpixelloop: 382 movq MM0, [ESI]; 383 movq MM1, [EDI]; 384 movq MM2, MM0; 385 pcmpeqw MM2, MM7; 386 pand MM1, MM2; 387 por MM1, MM0; 388 movq [EDI], MM1; 389 add ESI, 8; 390 add EDI, 8; 391 sub ECX, 4; 392 jge fourpixelloop; 393 twopixel: 394 cmp ECX, 4; 395 jl singlepixel; 396 movd MM0, [ESI]; 397 movd MM1, [EDI]; 398 movq MM2, MM7; 399 pcmpeqw MM2, MM0; 400 pand MM1, MM2; 401 por MM1, MM0; 402 movd [EDI], MM1; 403 add ESI, 4; 404 add EDI, 4; 405 sub ECX, 2; 406 singlepixel: 407 //cmp ECX, 0; 408 jecxz end; 409 mov AX, [ESI]; 410 cmp AX, 0; 411 cmovz AX, [EDI]; 412 mov [EDI], AL; 413 end: 414 emms; 415 } 416 }else{ 417 asm @nogc{ 418 pxor XMM7, XMM7; 419 mov ESI, src[EBP]; 420 mov EDI, dest[EBP]; 421 mov ECX, length; 422 cmp ECX, 8; 423 jl fourpixel; 424 eigthpixelloop: 425 movups XMM0, [ESI]; 426 movups XMM1, [EDI]; 427 movups XMM2, XMM7; 428 pcmpeqw XMM2, XMM0; 429 pand XMM1, XMM2; 430 por XMM1, XMM0; 431 movups [EDI], XMM1; 432 add ESI, 16; 433 add EDI, 16; 434 sub ECX, 8; 435 cmp ECX, 8; 436 jge eigthpixelloop; 437 fourpixel: 438 cmp ECX, 4; 439 jl twopixel; 440 movq XMM0, [ESI]; 441 movq XMM1, [EDI]; 442 movups XMM2, XMM7; 443 pcmpeqw XMM2, XMM0; 444 pand XMM1, XMM2; 445 por XMM1, XMM0; 446 movq [EDI], XMM1; 447 add ESI, 8; 448 add EDI, 8; 449 sub ECX, 4; 450 twopixel: 451 cmp ECX, 2; 452 jl singlepixel; 453 movd XMM0, [ESI]; 454 movd XMM1, [EDI]; 455 movups XMM2, XMM7; 456 pcmpeqw XMM2, XMM0; 457 pand XMM1, XMM2; 458 por XMM1, XMM0; 459 movd [EDI], XMM1; 460 add ESI, 4; 461 add EDI, 4; 462 sub ECX, 2; 463 singlepixel: 464 //cmp ECX, 0; 465 jecxz end; 466 mov AX, [ESI]; 467 cmp AX, 0; 468 cmovz AX, [EDI]; 469 mov [EDI], AX; 470 end: 471 ; 472 } 473 } 474 }else version(X86_64){ 475 asm @nogc{ 476 pxor XMM7, XMM7; 477 mov RSI, src[RBP]; 478 mov RDI, dest[RBP]; 479 mov RCX, length; 480 cmp RCX, 8; 481 jl fourpixel; 482 eigthpixelloop: 483 movups XMM0, [RSI]; 484 movups XMM1, [RDI]; 485 movups XMM2, XMM7; 486 pcmpeqw XMM2, XMM0; 487 pand XMM1, XMM2; 488 por XMM1, XMM0; 489 movups [RDI], XMM1; 490 add RSI, 8; 491 add RDI, 8; 492 sub RCX, 8; 493 cmp RCX, 8; 494 jge eigthpixelloop; 495 fourpixel: 496 cmp RCX, 4; 497 jl twopixel; 498 movq XMM0, [RSI]; 499 movq XMM1, [RDI]; 500 movups XMM2, XMM7; 501 pcmpeqw XMM2, XMM0; 502 pand XMM1, XMM2; 503 por XMM1, XMM0; 504 movq [RDI], XMM1; 505 add RSI, 4; 506 add RDI, 4; 507 sub RCX, 4; 508 twopixel: 509 cmp RCX, 2; 510 jl singlepixel; 511 movd XMM0, [RSI]; 512 movd XMM1, [RDI]; 513 movups XMM2, XMM7; 514 pcmpeqw XMM2, XMM0; 515 pand XMM1, XMM2; 516 por XMM1, XMM0; 517 movd [RDI], XMM1; 518 add RSI, 2; 519 add RDI, 2; 520 sub RCX, 2; 521 singlepixel: 522 cmp RCX, 0; 523 jz end; 524 mov AX, [RSI]; 525 cmp AX, 0; 526 cmovz AX, [RDI]; 527 mov [RDI], AX; 528 end: 529 ; 530 } 531 }else{ 532 while(length){ 533 if(*src) 534 *dest = *src; 535 src++; 536 dest++; 537 length--; 538 } 539 } 540 } 541 /** 542 * Copies a 16bit image onto another without blitter. No transparency is used. 543 */ 544 public @nogc void copy16bit(ushort* src, ushort* dest, size_t length){ 545 version(X86){ 546 version(MMX){ 547 asm @nogc{ 548 mov ESI, src[EBP]; 549 mov EDI, dest[EBP]; 550 mov ECX, length; 551 cmp ECX, 4; 552 //pxor MM7, MM7; 553 jl twopixel; 554 fourpixelloop: 555 movq MM0, [ESI]; 556 movq [EDI], MM0; 557 add ESI, 8; 558 add EDI, 8; 559 sub ECX, 4; 560 jge fourpixelloop; 561 twopixel: 562 cmp ECX, 4; 563 jl singlepixel; 564 movd MM0, [ESI]; 565 movd [EDI], MM0; 566 add ESI, 4; 567 add EDI, 4; 568 sub ECX, 2; 569 singlepixel: 570 cmp ECX, 0; 571 jz end; 572 mov AX, [ESI]; 573 mov [EDI], AL; 574 end: 575 emms; 576 } 577 }else{ 578 asm @nogc{ 579 mov ESI, src[EBP]; 580 mov EDI, dest[EBP]; 581 mov ECX, length; 582 cmp ECX, 8; 583 //pxor XMM7, XMM7; 584 jl fourpixel; 585 eigthpixelloop: 586 movups XMM0, [ESI]; 587 movups [EDI], XMM0; 588 add ESI, 16; 589 add EDI, 16; 590 sub ECX, 8; 591 cmp ECX, 8; 592 jge eigthpixelloop; 593 fourpixel: 594 cmp ECX, 4; 595 jl twopixel; 596 movq XMM0, [ESI]; 597 movq [EDI], XMM0; 598 add ESI, 8; 599 add EDI, 8; 600 sub ECX, 4; 601 twopixel: 602 cmp ECX, 2; 603 jl singlepixel; 604 movd XMM0, [ESI]; 605 movd [EDI], XMM0; 606 add ESI, 4; 607 add EDI, 4; 608 sub ECX, 2; 609 singlepixel: 610 cmp ECX, 0; 611 jz end; 612 mov AL, [ESI]; 613 mov [EDI], AL; 614 end: 615 ; 616 } 617 } 618 }else version(X86_64){ 619 asm @nogc{ 620 mov RSI, src[RBP]; 621 mov RDI, dest[RBP]; 622 mov RCX, length; 623 cmp RCX, 8; 624 //pxor XMM7, XMM7; 625 jl fourpixel; 626 eigthpixelloop: 627 movups XMM0, [RSI]; 628 movups [RDI], XMM0; 629 add RSI, 8; 630 add RDI, 8; 631 sub RCX, 8; 632 cmp RCX, 8; 633 jge eigthpixelloop; 634 fourpixel: 635 cmp RCX, 4; 636 jl twopixel; 637 movq XMM0, [RSI]; 638 movq [RDI], XMM0; 639 add RSI, 4; 640 add RDI, 4; 641 sub RCX, 4; 642 twopixel: 643 cmp RCX, 2; 644 jl singlepixel; 645 movd XMM0, [RSI]; 646 movd [RDI], XMM0; 647 add RSI, 2; 648 add RDI, 2; 649 sub RCX, 2; 650 singlepixel: 651 cmp RCX, 0; 652 jz end; 653 mov AL, [RSI];; 654 mov [RDI], AL; 655 end: 656 ; 657 } 658 }else{ 659 while(length){ 660 *dest = *src; 661 src++; 662 dest++; 663 length--; 664 } 665 } 666 } 667 /** 668 * Two plus one operand blitter for 32 bit values. Automatic mask-generation is used from the source's alpha channel with the following formula: 669 * mask = src.alpha == 0x00 ? 0xFFFFFFFF : 0x00000000 670 */ 671 public @nogc void blitter32bit(uint* src, uint* dest, size_t length){ 672 version(X86){ 673 version(MMX){ 674 asm @nogc{ 675 mov ESI, src[EBP]; 676 mov EDI, dest[EBP]; 677 mov ECX, length; 678 movq MM6, ALPHABLEND_MMX_MASK; 679 pxor MM7, MM7; 680 cmp ECX, 2; 681 jl twopixel; 682 twopixelloop: 683 movq MM0, [ESI]; 684 movq MM1, [EDI]; 685 movq MM2, MM0; 686 pand MM2, MM6; 687 pcmpeqd MM2, MM7; 688 pand MM1, MM2; 689 por MM1, MM0; 690 movq [EDI], MM1; 691 add ESI, 8; 692 add EDI, 8; 693 sub ECX, 2; 694 jge fourpixelloop; 695 onepixel: 696 cmp ECX, 1; 697 jl end; 698 movd MM0, [ESI]; 699 movd MM1, [EDI]; 700 movq MM2, MM0; 701 pand MM2, MM6; 702 pcmpeqd MM2, MM7; 703 pand MM1, MM2; 704 por MM1, MM0; 705 movd [EDI], MM1; 706 end: 707 emms; 708 } 709 }else{ 710 asm @nogc{ 711 mov ESI, src[EBP]; 712 mov EDI, dest[EBP]; 713 mov ECX, length; 714 movups XMM6, ALPHABLEND_SSE2_MASK; 715 pxor XMM7, XMM7; 716 cmp ECX, 8; 717 jl twopixel; 718 fourpixelloop: 719 movups XMM0, [ESI]; 720 movups XMM1, [EDI]; 721 movups XMM2, XMM0; 722 pand XMM2, XMM6; 723 pcmpeqd XMM2, XMM7; 724 pand XMM1, XMM2; 725 por XMM1, XMM0; 726 movups [EDI], XMM1; 727 add ESI, 16; 728 add EDI, 16; 729 sub ECX, 4; 730 cmp ECX, 4; 731 jge fourpixelloop; 732 twopixel: 733 cmp ECX, 2; 734 jl onepixel; 735 movq XMM0, [ESI]; 736 movq XMM1, [EDI]; 737 movq XMM2, XMM0; 738 pand XMM2, XMM6; 739 pcmpeqd XMM2, XMM7; 740 pand XMM1, XMM2; 741 por XMM1, XMM0; 742 movq [EDI], XMM1; 743 add ESI, 8; 744 add EDI, 8; 745 sub ECX, 2; 746 onepixel: 747 cmp ECX, 1; 748 jl end; 749 movd XMM0, [ESI]; 750 movd XMM1, [EDI]; 751 movq XMM2, XMM0; 752 pand XMM2, XMM6; 753 pcmpeqd XMM2, XMM7; 754 pand XMM1, XMM2; 755 por XMM1, XMM0; 756 movd [EDI], XMM1; 757 end: 758 ; 759 } 760 } 761 }else version(X86_64){ 762 asm @nogc{ 763 mov RSI, src[RBP]; 764 mov RDI, dest[RBP]; 765 mov RCX, length; 766 movups XMM6, ALPHABLEND_SSE2_MASK; 767 pxor XMM7, XMM7; 768 cmp ECX, 8; 769 jl twopixel; 770 fourpixelloop: 771 movups XMM0, [RSI]; 772 movups XMM1, [RDI]; 773 movups XMM2, XMM0; 774 pand XMM2, XMM6; 775 pcmpeqd XMM2, XMM7; 776 pand XMM1, XMM2; 777 por XMM1, XMM0; 778 movups [RDI], XMM1; 779 add RSI, 4; 780 add RDI, 4; 781 sub RCX, 4; 782 cmp RCX, 4; 783 jge fourpixelloop; 784 twopixel: 785 cmp RCX, 2; 786 jl onepixel; 787 movq XMM0, [RSI]; 788 movq XMM1, [RDI]; 789 movq XMM2, XMM0; 790 pand XMM2, XMM6; 791 pcmpeqd XMM2, XMM7; 792 pand XMM1, XMM2; 793 por XMM1, XMM0; 794 movq [RDI], XMM1; 795 add RSI, 2; 796 add RDI, 2; 797 sub RCX, 2; 798 onepixel: 799 cmp RCX, 1; 800 jl end; 801 movd XMM0, [RSI]; 802 movd XMM1, [RDI]; 803 movq XMM2, XMM0; 804 pand XMM2, XMM6; 805 pcmpeqd XMM2, XMM7; 806 pand XMM1, XMM2; 807 por XMM1, XMM0; 808 movd [RDI], XMM1; 809 end: 810 ; 811 } 812 }else{ 813 while(length){ 814 if(*cast(Pixel32Bit)src.ColorSpaceARGB.alpha) 815 *dest = *src; 816 src++; 817 dest++; 818 length--; 819 } 820 } 821 } 822 /** 823 * Implements a two plus one operand alpha-blending algorithm for 32bit bitmaps. Automatic alpha-mask generation follows this formula: 824 * src[B,G,R,A] --> mask [A,A,A,A] 825 */ 826 public @nogc void alphaBlend32bit(uint* src, uint* dest, size_t length){ 827 version(X86){ 828 version(MMX){ 829 int target8 = length/8, target4 = length%2; 830 asm @nogc { 831 //setting up the pointer registers and the counter register 832 //mov EBX, alpha[EBP]; 833 mov ESI, src[EBP]; 834 mov EDI, dest[EBP]; 835 mov ECX, target8; 836 cmp ECX, 0; 837 jz fourpixelblend; //skip 16 byte operations if not needed 838 //iteration cycle entry point 839 sixteenpixelblend: 840 //create alpha mask on the fly 841 movq MM3, [ESI]; 842 movq MM1, MM3; 843 pand MM1, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 844 movq MM0, MM1; 845 pslld MM0, 8; 846 por MM1, MM0; //mask is ready for RA 847 pslld MM1, 16; 848 por MM0, MM1; //mask is ready for BGRA 849 movq MM1, MM0; 850 punpcklbw MM0, MM2; 851 punpckhbw MM1, MM2; 852 movq MM6, ALPHABLEND_MMX_CONST256; 853 movq MM7, MM6; 854 movq MM4, ALPHABLEND_MMX_CONST1; 855 movq MM5, MM4; 856 857 paddusw MM4, MM0; //1 + alpha01 858 paddusw MM5, MM1; //1 + alpha23 859 psubusw MM6, MM0; //256 - alpha01 860 psubusw MM7, MM1; //256 - alpha23 861 862 //moving the values to their destinations 863 movq MM0, MM3; //src01 864 movq MM1, MM0; //src23 865 punpcklbw MM0, MM2; 866 punpckhbw MM1, MM2; 867 pmullw MM4, MM0; //src01 * (1 + alpha01) 868 pmullw MM5, MM1; //src23 * (1 + alpha23) 869 movq MM0, [EDI]; //dest01 870 movq MM1, MM0; //dest23 871 punpcklbw MM0, MM2; 872 punpckhbw MM1, MM2; 873 pmullw MM6, MM0; //dest01 * (256 - alpha) 874 pmullw MM7, MM1; //dest23 * (256 - alpha) 875 876 paddusw MM4, MM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 877 paddusw MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 878 psrlw MM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 879 psrlw MM5, 8; 880 //moving the result to its place; 881 //pxor MM2, MM2; 882 packuswb MM4, MM5; 883 884 movq [EDI], MM4; 885 //add EBX, 16; 886 add ESI, 8; 887 add EDI, 8; 888 dec ECX; 889 cmp ECX, 0; 890 jnz sixteenpixelblend; 891 fourpixelblend: 892 mov ECX, target4; 893 cmp ECX, 0; 894 jz endofalgorithm; 895 fourpixelblendloop: 896 897 //movd XMM6, [EBX];//alpha 898 899 900 movd MM0, [EDI]; 901 movd MM1, [ESI]; 902 punpcklbw MM0, MM2;//dest 903 punpcklbw MM1, MM2;//src 904 movups MM6, MM1; 905 pand MM6, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 906 movups MM7, MM6; 907 pslld MM6, 8; 908 por MM7, MM6; //mask is ready for RA 909 pslld MM7, 16; 910 por MM6, MM7; //mask is ready for GRA 911 punpcklbw MM7, MM2; 912 movaps MM4, ALPHABLEND_MMX_CONST256; 913 movaps MM5, ALPHABLEND_MMX_CONST1; 914 915 paddusw MM5, MM6;//1+alpha 916 psubusw MM4, MM6;//256-alpha 917 918 pmullw MM0, MM4;//dest*(256-alpha) 919 pmullw MM1, MM5;//src*(1+alpha) 920 paddusw MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha)) 921 psrlw MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 922 923 packuswb MM0, MM2; 924 925 movd [EDI], MM0; 926 927 endofalgorithm: 928 emms; 929 } 930 }else{ 931 int target16 = length/4, target4 = length%4; 932 asm @nogc { 933 //setting up the pointer registers and the counter register 934 //mov EBX, alpha[EBP]; 935 mov ESI, src[EBP]; 936 mov EDI, dest[EBP]; 937 mov ECX, target16; 938 cmp ECX, 0; 939 jz fourpixelblend; //skip 16 byte operations if not needed 940 //iteration cycle entry point 941 sixteenpixelblend: 942 //create alpha mask on the fly 943 movups XMM3, [ESI]; 944 movups XMM1, XMM3; 945 pand XMM1, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 946 movups XMM0, XMM1; 947 pslld XMM0, 8; 948 por XMM1, XMM0; //mask is ready for RA 949 pslld XMM1, 16; 950 por XMM0, XMM1; //mask is ready for BGRA/**/ 951 movups XMM1, XMM0; 952 953 punpcklbw XMM0, XMM2; 954 punpckhbw XMM1, XMM2; 955 movups XMM6, ALPHABLEND_SSE2_CONST256; 956 movups XMM7, XMM6; 957 movups XMM4, ALPHABLEND_SSE2_CONST1; 958 movups XMM5, XMM4; 959 960 paddusw XMM4, XMM0; //1 + alpha01 961 paddusw XMM5, XMM1; //1 + alpha23 962 psubusw XMM6, XMM0; //256 - alpha01 963 psubusw XMM7, XMM1; //256 - alpha23 964 965 //moving the values to their destinations 966 967 movups XMM0, XMM3; //src01 968 movups XMM1, XMM0; //src23 969 punpcklbw XMM0, XMM2; 970 punpckhbw XMM1, XMM2; 971 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 972 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 973 movups XMM0, [EDI]; //dest01 974 movups XMM1, XMM0; //dest23 975 punpcklbw XMM0, XMM2; 976 punpckhbw XMM1, XMM2; 977 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 978 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 979 980 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 981 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 982 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 983 psrlw XMM5, 8; 984 //moving the result to its place; 985 //pxor MM2, MM2; 986 packuswb XMM4, XMM5; 987 988 movups [EDI], XMM4; 989 //add EBX, 16; 990 add ESI, 16; 991 add EDI, 16; 992 dec ECX; 993 cmp ECX, 0; 994 jnz sixteenpixelblend; 995 996 fourpixelblend: 997 998 mov ECX, target4; 999 cmp ECX, 0; 1000 jz endofalgorithm; 1001 1002 fourpixelblendloop: 1003 1004 //movd XMM6, [EBX];//alpha 1005 1006 1007 movd XMM0, [EDI]; 1008 movd XMM1, [ESI]; 1009 punpcklbw XMM0, XMM2;//dest 1010 punpcklbw XMM1, XMM2;//src 1011 movups XMM6, XMM1; 1012 pand XMM6, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1013 movups XMM7, XMM6; 1014 pslld XMM6, 8; 1015 por XMM7, XMM6; //mask is ready for RA 1016 pslld XMM7, 16; 1017 por XMM6, XMM7; //mask is ready for BGRA 1018 1019 punpcklbw XMM6, XMM2; 1020 1021 movaps XMM4, ALPHABLEND_SSE2_CONST256; 1022 movaps XMM5, ALPHABLEND_SSE2_CONST1; 1023 1024 paddusw XMM5, XMM6;//1+alpha 1025 psubusw XMM4, XMM6;//256-alpha 1026 1027 pmullw XMM0, XMM4;//dest*(256-alpha) 1028 pmullw XMM1, XMM5;//src*(1+alpha) 1029 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 1030 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 1031 1032 packuswb XMM0, XMM2; 1033 1034 movd [EDI], XMM0; 1035 1036 add ESI, 4; 1037 add EDI, 4;/**/ 1038 dec ECX; 1039 cmp ECX, 0; 1040 jnz fourpixelblendloop; 1041 1042 endofalgorithm: 1043 ; 1044 } 1045 } 1046 }else version(X86_64){ 1047 size_t target16 = length/4, target4 = length%4; 1048 asm @nogc { 1049 //setting up the pointer registers and the counter register 1050 //mov EBX, alpha[EBP]; 1051 mov RSI, src[RBP]; 1052 mov RDI, dest[RBP]; 1053 mov RCX, target16; 1054 movups XMM8, ALPHABLEND_SSE2_CONST256; 1055 movups XMM9, ALPHABLEND_SSE2_CONST1; 1056 movups XMM10, ALPHABLEND_SSE2_MASK; 1057 cmp RCX, 8; 1058 jl fourpixelblend; //skip 16 byte operations if not needed 1059 //iteration cycle entry point 1060 sixteenpixelblend: 1061 //create alpha mask on the fly 1062 movups XMM3, [RSI]; 1063 movups XMM1, XMM3; 1064 pand XMM1, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1065 movups XMM0, XMM1; 1066 pslld XMM0, 8; 1067 por XMM1, XMM0; //mask is ready for RA 1068 pslld XMM1, 16; 1069 por XMM0, XMM1; //mask is ready for BGRA/**/ 1070 movups XMM1, XMM0; 1071 1072 punpcklbw XMM0, XMM2; 1073 punpckhbw XMM1, XMM2; 1074 movups XMM6, XMM8; 1075 movups XMM7, XMM8; 1076 movups XMM4, XMM9; 1077 movups XMM5, XMM9; 1078 1079 paddusw XMM4, XMM0; //1 + alpha01 1080 paddusw XMM5, XMM1; //1 + alpha23 1081 psubusw XMM6, XMM0; //256 - alpha01 1082 psubusw XMM7, XMM1; //256 - alpha23 1083 1084 //moving the values to their destinations 1085 1086 movups XMM0, XMM3; //src01 1087 movups XMM1, XMM0; //src23 1088 punpcklbw XMM0, XMM2; 1089 punpckhbw XMM1, XMM2; 1090 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 1091 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 1092 movups XMM0, [EDI]; //dest01 1093 movups XMM1, XMM0; //dest23 1094 punpcklbw XMM0, XMM2; 1095 punpckhbw XMM1, XMM2; 1096 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 1097 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 1098 1099 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 1100 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 1101 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 1102 psrlw XMM5, 8; 1103 //moving the result to its place; 1104 //pxor MM2, MM2; 1105 packuswb XMM4, XMM5; 1106 1107 movups [RDI], XMM4; 1108 //add EBX, 16; 1109 add RSI, 16; 1110 add RDI, 16; 1111 dec RCX; 1112 cmp RCX, 0; 1113 jnz sixteenpixelblend; 1114 1115 fourpixelblend: 1116 1117 mov RCX, target4; 1118 cmp RCX, 0; 1119 jz endofalgorithm; 1120 1121 fourpixelblendloop: 1122 1123 //movd XMM6, [EBX];//alpha 1124 1125 1126 movd XMM0, [RDI]; 1127 movd XMM1, [RSI]; 1128 punpcklbw XMM0, XMM2;//dest 1129 punpcklbw XMM1, XMM2;//src 1130 movups XMM6, XMM1; 1131 pand XMM6, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1132 movups XMM7, XMM6; 1133 pslld XMM6, 8; 1134 por XMM7, XMM6; //mask is ready for RA 1135 pslld XMM7, 16; 1136 por XMM6, XMM7; //mask is ready for BGRA 1137 1138 punpcklbw XMM6, XMM2; 1139 1140 movaps XMM4, XMM8; 1141 movaps XMM5, XMM9; 1142 1143 paddusw XMM5, XMM6;//1+alpha 1144 psubusw XMM4, XMM6;//256-alpha 1145 1146 pmullw XMM0, XMM4;//dest*(256-alpha) 1147 pmullw XMM1, XMM5;//src*(1+alpha) 1148 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 1149 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 1150 1151 packuswb XMM0, XMM2; 1152 1153 movd [RDI], XMM0; 1154 1155 add RSI, 4; 1156 add RDI, 4;/**/ 1157 dec RCX; 1158 cmp RCX, 0; 1159 jnz fourpixelblendloop; 1160 1161 endofalgorithm: 1162 ; 1163 } 1164 }else{ 1165 for(int i ; i < length ; i++){ 1166 switch(src.ColorSpaceARGB.alpha){ 1167 case 0: 1168 break; 1169 case 255: 1170 dest = src; 1171 break; 1172 default: 1173 int src1 = 1 + src.ColorSpaceARGB.alpha; 1174 int src256 = 256 - src.ColorSpaceARGB.alpha; 1175 dest.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8); 1176 dest.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8); 1177 dest.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8); 1178 break; 1179 } 1180 src++; 1181 dest++; 1182 } 1183 } 1184 } 1185 /** 1186 * Copies a 32bit image onto another without blitter. No transparency is used. 1187 */ 1188 public @nogc void copy32bit(uint* src, uint* dest, size_t length){ 1189 version(X86){ 1190 version(MMX){ 1191 asm @nogc{ 1192 mov ESI, src[EBP]; 1193 mov EDI, dest[EBP]; 1194 mov ECX, length; 1195 movq MM6, ALPHABLEND_MMX_MASK; 1196 pxor MM7, MM7; 1197 cmp ECX, 2; 1198 jl twopixel; 1199 twopixelloop: 1200 movq MM0, [ESI]; 1201 movq [EDI], MM0; 1202 add ESI, 8; 1203 add EDI, 8; 1204 sub ECX, 2; 1205 jge fourpixelloop; 1206 onepixel: 1207 cmp ECX, 1; 1208 jl end; 1209 movd MM0, [ESI];; 1210 movd [EDI], MM0; 1211 add ESI, 2; 1212 add EDI, 2; 1213 sub ECX, 2; 1214 end: 1215 emms; 1216 } 1217 }else{ 1218 asm @nogc{ 1219 mov ESI, src[EBP]; 1220 mov EDI, dest[EBP]; 1221 mov ECX, length; 1222 movups XMM6, ALPHABLEND_SSE2_MASK; 1223 pxor XMM7, XMM7; 1224 cmp ECX, 8; 1225 jl twopixel; 1226 eigthpixelloop: 1227 movups XMM0, [ESI]; 1228 movups [EDI], XMM0; 1229 add ESI, 16; 1230 add EDI, 16; 1231 sub ECX, 4; 1232 cmp ECX, 4; 1233 jge eigthpixelloop; 1234 twopixel: 1235 cmp ECX, 2; 1236 jl onepixel; 1237 movq XMM0, [ESI]; 1238 movq [EDI], XMM0; 1239 add ESI, 8; 1240 add EDI, 8; 1241 sub ECX, 2; 1242 onepixel: 1243 cmp ECX, 1; 1244 jl end; 1245 movd XMM0, [ESI]; 1246 movd [EDI], XMM0; 1247 end: 1248 ; 1249 } 1250 } 1251 }else version(X86_64){ 1252 asm @nogc{ 1253 mov RSI, src[RBP]; 1254 mov RDI, dest[RBP]; 1255 mov RCX, length; 1256 movups XMM6, ALPHABLEND_SSE2_MASK; 1257 pxor XMM7, XMM7; 1258 cmp ECX, 8; 1259 jl twopixel; 1260 eigthpixelloop: 1261 movups XMM0, [RSI]; 1262 movups [RDI], XMM0; 1263 add RSI, 16; 1264 add RDI, 16; 1265 sub RCX, 4; 1266 cmp RCX, 4; 1267 jge eigthpixelloop; 1268 twopixel: 1269 cmp RCX, 2; 1270 jl onepixel; 1271 movq XMM0, [RSI]; 1272 movq [RDI], XMM0; 1273 add RSI, 8; 1274 add RDI, 8; 1275 sub RCX, 2; 1276 onepixel: 1277 cmp RCX, 1; 1278 jl end; 1279 movd XMM0, [RSI]; 1280 movd [RDI], XMM0; 1281 end: 1282 ; 1283 } 1284 }else{ 1285 while(length){ 1286 if(*src.ColorSpaceARGB.alpha) 1287 *dest = *src; 1288 src++; 1289 dest++; 1290 length--; 1291 } 1292 } 1293 } 1294 /** 1295 * Three plus one operand blitter for 8 bit values. Uses an external mask. 1296 */ 1297 public @nogc void blitter8bit(ubyte* src, ubyte* dest, size_t length, ubyte* mask){ 1298 version(X86){ 1299 version(MMX){ 1300 asm @nogc{ 1301 mov ESI, src[EBP]; 1302 mov EDI, dest[EBP]; 1303 mov EBX, mask[EBP]; 1304 mov ECX, length; 1305 cmp ECX, 8; 1306 jl fourpixel; 1307 eigthpixelloop: 1308 movq MM0, [ESI]; 1309 movq MM1, [EDI]; 1310 movq MM2, [EBX]; 1311 pand MM1, MM2; 1312 por MM1, MM0; 1313 movq [EDI], MM1; 1314 add ESI, 8; 1315 add EDI, 8; 1316 add EBX, 8; 1317 sub ECX, 8; 1318 jge eigthpixelloop; 1319 fourpixel: 1320 cmp ECX, 4; 1321 jl singlepixelloop; 1322 movd MM0, [ESI]; 1323 movd MM1, [EDI]; 1324 movd MM2, [EBX]; 1325 pand MM1, MM2; 1326 por MM1, MM0; 1327 movd [EDI], MM1; 1328 add ESI, 4; 1329 add EDI, 4; 1330 add EBX, 4; 1331 sub ECX, 4; 1332 singlepixelloop: 1333 //cmp ECX, 0; 1334 jecxz end; 1335 mov AL, [ESI]; 1336 mov AH, [EDI]; 1337 and AH, [EBX]; 1338 or AH, AL; 1339 mov [EDI], AH; 1340 inc ESI; 1341 inc EDI; 1342 inc EBX; 1343 dec ECX; 1344 jmp singlepixelloop; 1345 end: 1346 emms; 1347 } 1348 }else{ 1349 asm @nogc{ 1350 mov ESI, src[EBP]; 1351 mov EDI, dest[EBP]; 1352 mov EBX, mask[EBP]; 1353 mov ECX, length; 1354 cmp ECX, 16; 1355 pxor XMM7, XMM7; 1356 jl eightpixel; 1357 sixteenpixelloop: 1358 movups XMM0, [ESI]; 1359 movups XMM1, [EDI]; 1360 movups XMM2, [EBX]; 1361 pand XMM1, XMM2; 1362 por XMM1, XMM0; 1363 movups [EDI], XMM1; 1364 add ESI, 16; 1365 add EDI, 16; 1366 add EBX, 16; 1367 sub ECX, 16; 1368 cmp ECX, 16; 1369 jge sixteenpixelloop; 1370 eightpixel: 1371 cmp ECX, 8; 1372 jl fourpixel; 1373 movq XMM0, [ESI]; 1374 movq XMM1, [EDI]; 1375 movq XMM2, [EBX]; 1376 pand XMM1, XMM2; 1377 por XMM1, XMM0; 1378 movq [EDI], XMM1; 1379 add ESI, 8; 1380 add EDI, 8; 1381 add EBX, 8; 1382 sub ECX, 8; 1383 fourpixel: 1384 cmp ECX, 4; 1385 jl singlepixelloop; 1386 movd XMM0, [ESI]; 1387 movd XMM1, [EDI]; 1388 movd XMM2, [EBX]; 1389 pand XMM1, XMM2; 1390 por XMM1, XMM0; 1391 movd [EDI], XMM1; 1392 add ESI, 4; 1393 add EDI, 4; 1394 add EBX, 4; 1395 sub ECX, 4; 1396 singlepixelloop: 1397 //cmp ECX, 0; 1398 jecxz end; 1399 mov AL, [ESI]; 1400 mov AH, [EDI]; 1401 and AH, [EBX]; 1402 or AH, AL; 1403 mov [EDI], AH; 1404 inc ESI; 1405 inc EDI; 1406 inc EBX; 1407 dec ECX; 1408 jmp singlepixelloop; 1409 end: 1410 ; 1411 } 1412 } 1413 }else version(X86_64){ 1414 asm @nogc{ 1415 mov RSI, src[RBP]; 1416 mov RDI, dest[RBP]; 1417 mov RBX, mask[RBP]; 1418 mov RCX, length; 1419 cmp RCX, 16; 1420 //pxor XMM7, XMM7; 1421 jl eightpixel; 1422 sixteenpixelloop: 1423 movups XMM0, [RSI]; 1424 movups XMM1, [RDI]; 1425 movups XMM2, [RBX]; 1426 pand XMM1, XMM2; 1427 por XMM1, XMM0; 1428 movups [RDI], XMM1; 1429 add RSI, 16; 1430 add RDI, 16; 1431 add RBX, 16; 1432 sub RCX, 16; 1433 cmp RCX, 16; 1434 jge sixteenpixelloop; 1435 eightpixel: 1436 cmp RCX, 8; 1437 jl fourpixel; 1438 movq XMM0, [RSI]; 1439 movq XMM1, [RDI]; 1440 movq XMM2, [RBX]; 1441 pand XMM1, XMM2; 1442 por XMM1, XMM0; 1443 movq [RDI], XMM1; 1444 add RSI, 8; 1445 add RDI, 8; 1446 add RBX, 8; 1447 sub RCX, 8; 1448 fourpixel: 1449 cmp RCX, 4; 1450 jl singlepixelloop; 1451 movd XMM0, [RSI]; 1452 movd XMM1, [RDI]; 1453 movups XMM2, [RBX]; 1454 pand XMM1, XMM2; 1455 por XMM1, XMM0; 1456 movd [RDI], XMM1; 1457 add RSI, 4; 1458 add RDI, 4; 1459 add RBX, 4; 1460 sub RCX, 4; 1461 singlepixelloop: 1462 cmp RCX, 0; 1463 jz end; 1464 mov AL, [RSI]; 1465 mov AH, [RDI]; 1466 and AH, [RBX]; 1467 or AH, AL; 1468 mov [RDI], AH; 1469 inc RSI; 1470 inc RDI; 1471 inc RBX; 1472 dec RCX; 1473 jmp singlepixelloop; 1474 end: 1475 ; 1476 } 1477 }else{ 1478 while(length){ 1479 if(*src) 1480 *dest = (*dest & *mask) | *src; 1481 src++; 1482 dest++; 1483 mask++; 1484 length--; 1485 } 1486 } 1487 } 1488 /** 1489 * Copies an 8bit image onto another without blitter. No transparency is used. Mask is placeholder. 1490 */ 1491 public @nogc void copy8bit(ubyte* src, ubyte* dest, size_t length, ubyte* mask){ 1492 copy8bit(src,dest,length); 1493 } 1494 /** 1495 * Three plus one operand blitter for 8 bit values. An external mask is used for this operation. 1496 */ 1497 public @nogc void blitter16bit(ushort* src, ushort* dest, size_t length, ushort* mask){ 1498 version(X86){ 1499 version(MMX){ 1500 asm @nogc{ 1501 pxor MM7, MM7; 1502 mov ESI, src[EBP]; 1503 mov EDI, dest[EBP]; 1504 mov EBX, mask[EBP]; 1505 mov ECX, length; 1506 cmp ECX, 4; 1507 jl twopixel; 1508 fourpixelloop: 1509 movq MM0, [ESI]; 1510 movq MM1, [EDI]; 1511 movq MM2, [EBX]; 1512 pand MM1, MM2; 1513 por MM1, MM0; 1514 movq [EDI], MM1; 1515 add ESI, 8; 1516 add EDI, 8; 1517 add EBX, 8; 1518 sub ECX, 4; 1519 jge fourpixelloop; 1520 twopixel: 1521 cmp ECX, 4; 1522 jl singlepixel; 1523 movd MM0, [ESI]; 1524 movd MM1, [EDI]; 1525 movd MM2, [EBX]; 1526 pand MM1, MM2; 1527 por MM1, MM0; 1528 movd [EDI], MM1; 1529 add ESI, 4; 1530 add EDI, 4; 1531 add EBX, 4; 1532 sub ECX, 2; 1533 singlepixel: 1534 //cmp ECX, 0; 1535 jecxz end; 1536 mov AX, [EBX]; 1537 and AX, [EDI]; 1538 or AX, [ESI]; 1539 mov [EDI], AX; 1540 end: 1541 emms; 1542 } 1543 }else{ 1544 asm @nogc{ 1545 pxor XMM7, XMM7; 1546 mov ESI, src[EBP]; 1547 mov EDI, dest[EBP]; 1548 mov EBX, mask[EBP]; 1549 mov ECX, length; 1550 cmp ECX, 8; 1551 jl fourpixel; 1552 eigthpixelloop: 1553 movups XMM0, [ESI]; 1554 movups XMM1, [EDI]; 1555 movups XMM2, [EBX]; 1556 pand XMM1, XMM2; 1557 por XMM1, XMM0; 1558 movups [EDI], XMM1; 1559 add ESI, 16; 1560 add EDI, 16; 1561 add EBX, 16; 1562 sub ECX, 8; 1563 cmp ECX, 8; 1564 jge eigthpixelloop; 1565 fourpixel: 1566 cmp ECX, 4; 1567 jl twopixel; 1568 movq XMM0, [ESI]; 1569 movq XMM1, [EDI]; 1570 movq XMM2, [EBX]; 1571 pand XMM1, XMM2; 1572 por XMM1, XMM0; 1573 movq [EDI], XMM1; 1574 add ESI, 8; 1575 add EDI, 8; 1576 add EBX, 8; 1577 sub ECX, 4; 1578 twopixel: 1579 cmp ECX, 2; 1580 jl singlepixel; 1581 movd XMM0, [ESI]; 1582 movd XMM1, [EDI]; 1583 movd XMM2, [EBX]; 1584 pand XMM1, XMM2; 1585 por XMM1, XMM0; 1586 movd [EDI], XMM1; 1587 add ESI, 4; 1588 add EDI, 4; 1589 add EBX, 4; 1590 sub ECX, 2; 1591 singlepixel: 1592 //cmp ECX, 0; 1593 jecxz end; 1594 mov AX, [EBX]; 1595 and AX, [EDI]; 1596 or AX, [ESI]; 1597 mov [EDI], AX; 1598 end: 1599 ; 1600 } 1601 } 1602 }else version(X86_64){ 1603 asm @nogc{ 1604 pxor XMM7, XMM7; 1605 mov RSI, src[RBP]; 1606 mov RDI, dest[RBP]; 1607 mov RBX, mask[RBP]; 1608 mov RCX, length; 1609 cmp RCX, 8; 1610 jl fourpixel; 1611 eigthpixelloop: 1612 movups XMM0, [RSI]; 1613 movups XMM1, [RDI]; 1614 movups XMM2, [RBX]; 1615 pand XMM1, XMM2; 1616 por XMM1, XMM0; 1617 movups [RDI], XMM1; 1618 add RSI, 16; 1619 add RDI, 16; 1620 add RBX, 16; 1621 sub RCX, 8; 1622 cmp RCX, 8; 1623 jge eigthpixelloop; 1624 fourpixel: 1625 cmp RCX, 4; 1626 jl twopixel; 1627 movq XMM0, [RSI]; 1628 movq XMM1, [RDI]; 1629 movq XMM2, [RBX]; 1630 pand XMM1, XMM2; 1631 por XMM1, XMM0; 1632 movq [RDI], XMM1; 1633 add RSI, 8; 1634 add RDI, 8; 1635 add RBX, 8; 1636 sub RCX, 4; 1637 twopixel: 1638 cmp RCX, 2; 1639 jl singlepixel; 1640 movd XMM0, [RSI]; 1641 movd XMM1, [RDI]; 1642 movd XMM2, [RBX]; 1643 pand XMM1, XMM2; 1644 por XMM1, XMM0; 1645 movd [RDI], XMM1; 1646 add RSI, 4; 1647 add RDI, 4; 1648 add RBX, 4; 1649 sub RCX, 2; 1650 singlepixel: 1651 cmp RCX, 0; 1652 jz end; 1653 mov AX, [RBX]; 1654 and AX, [RDI]; 1655 or AX, [RSI]; 1656 mov [RDI], AX; 1657 end: 1658 ; 1659 } 1660 }else{ 1661 while(length){ 1662 *dest = (*dest & *mask) | *src; 1663 src++; 1664 dest++; 1665 mask++; 1666 length--; 1667 } 1668 } 1669 } 1670 /** 1671 * Copies a 16bit image onto another without blitter. No transparency is used. Mask is a placeholder for easy exchangeability with other functions. 1672 */ 1673 public @nogc void copy16bit(ushort* src, ushort* dest, size_t length, ushort* mask){ 1674 copy16bit(src,dest,length); 1675 } 1676 /** 1677 * Two plus one operand blitter for 32 bit values. A separate mask is used for the operation. 1678 */ 1679 public @nogc void blitter32bit(uint* src, uint* dest, size_t length, uint* mask){ 1680 version(X86){ 1681 version(MMX){ 1682 asm @nogc{ 1683 mov ESI, src[EBP]; 1684 mov EDI, dest[EBP]; 1685 mov EBX, mask[EBP]; 1686 mov ECX, length; 1687 movq MM6, ALPHABLEND_MMX_MASK; 1688 pxor MM7, MM7; 1689 cmp ECX, 2; 1690 jl twopixel; 1691 twopixelloop: 1692 movq MM0, [ESI]; 1693 movq MM1, [EDI]; 1694 movq MM2, [EBX]; 1695 pand MM1, MM2; 1696 por MM1, MM0; 1697 movq [EDI], MM1; 1698 add ESI, 8; 1699 add EDI, 8; 1700 add EBX, 8; 1701 sub ECX, 2; 1702 jge fourpixelloop; 1703 onepixel: 1704 jecxz end; 1705 movd MM0, [ESI]; 1706 movd MM1, [EDI]; 1707 movd MM2, [EBX]; 1708 pand MM1, MM2; 1709 por MM1, MM0; 1710 movd [EDI], MM1; 1711 end: 1712 emms; 1713 } 1714 }else{ 1715 asm @nogc{ 1716 mov ESI, src[EBP]; 1717 mov EDI, dest[EBP]; 1718 mov EBX, mask[EBP]; 1719 mov ECX, length; 1720 movups XMM6, ALPHABLEND_SSE2_MASK; 1721 pxor XMM7, XMM7; 1722 cmp ECX, 4; 1723 jl twopixel; 1724 fourpixelloop: 1725 movups XMM0, [ESI]; 1726 movups XMM1, [EDI]; 1727 movups XMM2, [EBX]; 1728 pand XMM1, XMM2; 1729 por XMM1, XMM0; 1730 movups [EDI], XMM1; 1731 add ESI, 16; 1732 add EDI, 16; 1733 add EBX, 16; 1734 sub ECX, 4; 1735 cmp ECX, 4; 1736 jge fourpixelloop; 1737 twopixel: 1738 cmp ECX, 2; 1739 jl onepixel; 1740 movq XMM0, [ESI]; 1741 movq XMM1, [EDI]; 1742 movq XMM2, [EBX]; 1743 pand XMM1, XMM2; 1744 por XMM1, XMM0; 1745 movq [EDI], XMM1; 1746 add ESI, 8; 1747 add EDI, 8; 1748 add EBX, 8; 1749 sub ECX, 2; 1750 onepixel: 1751 jecxz end; 1752 movd XMM0, [ESI]; 1753 movd XMM1, [EDI]; 1754 movd XMM2, [EBX]; 1755 pand XMM1, XMM2; 1756 por XMM1, XMM0; 1757 movd [EDI], XMM1; 1758 end: 1759 ; 1760 } 1761 } 1762 }else version(X86_64){ 1763 asm @nogc{ 1764 mov RSI, src[RBP]; 1765 mov RDI, dest[RBP]; 1766 mov RBX, mask[RBP]; 1767 mov RCX, length; 1768 movups XMM6, ALPHABLEND_SSE2_MASK; 1769 pxor XMM7, XMM7; 1770 cmp ECX, 4; 1771 jl twopixel; 1772 fourpixelloop: 1773 movups XMM0, [RSI]; 1774 movups XMM1, [RDI]; 1775 movups XMM2, [RBX]; 1776 pand XMM1, XMM2; 1777 por XMM1, XMM0; 1778 movups [RDI], XMM1; 1779 add RSI, 16; 1780 add RDI, 16; 1781 add RBX, 16; 1782 sub RCX, 4; 1783 cmp RCX, 4; 1784 jge fourpixelloop; 1785 twopixel: 1786 cmp RCX, 2; 1787 jl onepixel; 1788 movq XMM0, [RSI]; 1789 movq XMM1, [RDI]; 1790 movq XMM2, [RBX]; 1791 pand XMM1, XMM2; 1792 por XMM1, XMM0; 1793 movq [RDI], XMM1; 1794 add RSI, 8; 1795 add RDI, 8; 1796 add RBX, 8; 1797 sub RCX, 2; 1798 onepixel: 1799 cmp RCX, 1; 1800 jl end; 1801 movd XMM0, [RSI]; 1802 movd XMM1, [RDI]; 1803 movd XMM2, [RBX]; 1804 pand XMM1, XMM2; 1805 por XMM1, XMM0; 1806 movd [RDI], XMM1; 1807 end: 1808 ; 1809 } 1810 }else{ 1811 while(length){ 1812 dest.base = (dest.base & mask.base) | src.base; 1813 mask++; 1814 src++; 1815 dest++; 1816 length--; 1817 } 1818 } 1819 } 1820 /** 1821 * Implements a three plus one operand alpha-blending algorithm for 32bit bitmaps. For masking, use Pixel32Bit.AlphaMask from CPUblit.colorspaces. 1822 */ 1823 public @nogc void alphaBlend32bit(uint* src, uint* dest, size_t length, uint* mask){ 1824 version(X86){ 1825 version(MMX){ 1826 int target8 = length/8, target4 = length%2; 1827 asm @nogc { 1828 //setting up the pointer registers and the counter register 1829 //mov EBX, alpha[EBP]; 1830 mov ESI, src[EBP]; 1831 mov EDI, dest[EBP]; 1832 mov EBX, mask[EBP]; 1833 mov ECX, target8; 1834 cmp ECX, 0; 1835 jz fourpixelblend; //skip 16 byte operations if not needed 1836 //iteration cycle entry point 1837 sixteenpixelblend: 1838 //create alpha mask on the fly 1839 movq MM3, [ESI]; 1840 /*movq MM1, MM3; 1841 pand MM1, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1842 movq MM0, MM1; 1843 pslld MM0, 8; 1844 por MM1, MM0; //mask is ready for RA 1845 pslld MM1, 16; 1846 por MM0, MM1; //mask is ready for BGRA*/ 1847 movq MM0, [EBX]; 1848 movq MM1, MM0; 1849 punpcklbw MM0, MM2; 1850 punpckhbw MM1, MM2; 1851 movq MM6, ALPHABLEND_MMX_CONST256; 1852 movq MM7, MM6; 1853 movq MM4, ALPHABLEND_MMX_CONST1; 1854 movq MM5, MM4; 1855 1856 paddusw MM4, MM0; //1 + alpha01 1857 paddusw MM5, MM1; //1 + alpha23 1858 psubusw MM6, MM0; //256 - alpha01 1859 psubusw MM7, MM1; //256 - alpha23 1860 1861 //moving the values to their destinations 1862 movq MM0, MM3; //src01 1863 movq MM1, MM0; //src23 1864 punpcklbw MM0, MM2; 1865 punpckhbw MM1, MM2; 1866 pmullw MM4, MM0; //src01 * (1 + alpha01) 1867 pmullw MM5, MM1; //src23 * (1 + alpha23) 1868 movq MM0, [EDI]; //dest01 1869 movq MM1, MM0; //dest23 1870 punpcklbw MM0, MM2; 1871 punpckhbw MM1, MM2; 1872 pmullw MM6, MM0; //dest01 * (256 - alpha) 1873 pmullw MM7, MM1; //dest23 * (256 - alpha) 1874 1875 paddusw MM4, MM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 1876 paddusw MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 1877 psrlw MM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 1878 psrlw MM5, 8; 1879 //moving the result to its place; 1880 //pxor MM2, MM2; 1881 packuswb MM4, MM5; 1882 1883 movq [EDI], MM4; 1884 //add EBX, 16; 1885 add ESI, 8; 1886 add EDI, 8; 1887 add EBX, 8; 1888 dec ECX; 1889 cmp ECX, 0; 1890 jnz sixteenpixelblend; 1891 fourpixelblend: 1892 mov ECX, target4; 1893 cmp ECX, 0; 1894 jz endofalgorithm; 1895 fourpixelblendloop: 1896 1897 //movd XMM6, [EBX];//alpha 1898 1899 1900 movd MM0, [EDI]; 1901 movd MM1, [ESI]; 1902 punpcklbw MM0, MM2;//dest 1903 punpcklbw MM1, MM2;//src 1904 movups MM6, MM1; 1905 pand MM6, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1906 movups MM7, MM6; 1907 pslld MM6, 8; 1908 por MM7, MM6; //mask is ready for RA 1909 pslld MM7, 16; 1910 por MM6, MM7; //mask is ready for GRA 1911 punpcklbw MM7, MM2; 1912 movaps MM4, ALPHABLEND_MMX_CONST256; 1913 movaps MM5, ALPHABLEND_MMX_CONST1; 1914 1915 paddusw MM5, MM6;//1+alpha 1916 psubusw MM4, MM6;//256-alpha 1917 1918 pmullw MM0, MM4;//dest*(256-alpha) 1919 pmullw MM1, MM5;//src*(1+alpha) 1920 paddusw MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha)) 1921 psrlw MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 1922 1923 packuswb MM0, MM2; 1924 1925 movd [EDI], MM0; 1926 1927 endofalgorithm: 1928 emms; 1929 } 1930 }else{ 1931 int target16 = length/4, target4 = length%4; 1932 asm @nogc { 1933 //setting up the pointer registers and the counter register 1934 //mov EBX, alpha[EBP]; 1935 mov ESI, src[EBP]; 1936 mov EDI, dest[EBP]; 1937 mov EBX, mask[EBP]; 1938 mov ECX, target16; 1939 cmp ECX, 0; 1940 jz fourpixelblend; //skip 16 byte operations if not needed 1941 //iteration cycle entry point 1942 sixteenpixelblend: 1943 //create alpha mask on the fly 1944 movups XMM3, [ESI]; 1945 movups XMM1, [EBX]; 1946 //pand XMM1, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 1947 //movups XMM0, XMM1; 1948 //pslld XMM0, 8; 1949 //por XMM1, XMM0; //mask is ready for RA 1950 //pslld XMM1, 16; 1951 //por XMM0, XMM1; //mask is ready for BGRA/**/ 1952 movups XMM0, XMM1; 1953 1954 punpcklbw XMM0, XMM2; 1955 punpckhbw XMM1, XMM2; 1956 movups XMM6, ALPHABLEND_SSE2_CONST256; 1957 movups XMM7, XMM6; 1958 movups XMM4, ALPHABLEND_SSE2_CONST1; 1959 movups XMM5, XMM4; 1960 1961 paddusw XMM4, XMM0; //1 + alpha01 1962 paddusw XMM5, XMM1; //1 + alpha23 1963 psubusw XMM6, XMM0; //256 - alpha01 1964 psubusw XMM7, XMM1; //256 - alpha23 1965 1966 //moving the values to their destinations 1967 1968 movups XMM0, XMM3; //src01 1969 movups XMM1, XMM0; //src23 1970 punpcklbw XMM0, XMM2; 1971 punpckhbw XMM1, XMM2; 1972 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 1973 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 1974 movups XMM0, [EDI]; //dest01 1975 movups XMM1, XMM0; //dest23 1976 punpcklbw XMM0, XMM2; 1977 punpckhbw XMM1, XMM2; 1978 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 1979 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 1980 1981 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 1982 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 1983 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 1984 psrlw XMM5, 8; 1985 //moving the result to its place; 1986 //pxor MM2, MM2; 1987 packuswb XMM4, XMM5; 1988 1989 movups [EDI], XMM4; 1990 //add EBX, 16; 1991 add ESI, 16; 1992 add EDI, 16; 1993 add EBX, 16; 1994 dec ECX; 1995 cmp ECX, 0; 1996 jnz sixteenpixelblend; 1997 1998 fourpixelblend: 1999 2000 mov ECX, target4; 2001 cmp ECX, 0; 2002 jz endofalgorithm; 2003 2004 fourpixelblendloop: 2005 2006 //movd XMM6, [EBX];//alpha 2007 2008 2009 movd XMM0, [EDI]; 2010 movd XMM1, [ESI]; 2011 punpcklbw XMM0, XMM2;//dest 2012 punpcklbw XMM1, XMM2;//src 2013 movd XMM6, [EBX]; 2014 /*pand XMM6, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2015 movups XMM7, XMM6; 2016 pslld XMM6, 8; 2017 por XMM7, XMM6; //mask is ready for RA 2018 pslld XMM7, 16; 2019 por XMM6, XMM7; //mask is ready for BGRA*/ 2020 2021 punpcklbw XMM6, XMM2; 2022 2023 movaps XMM4, ALPHABLEND_SSE2_CONST256; 2024 movaps XMM5, ALPHABLEND_SSE2_CONST1; 2025 2026 paddusw XMM5, XMM6;//1+alpha 2027 psubusw XMM4, XMM6;//256-alpha 2028 2029 pmullw XMM0, XMM4;//dest*(256-alpha) 2030 pmullw XMM1, XMM5;//src*(1+alpha) 2031 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 2032 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 2033 2034 packuswb XMM0, XMM2; 2035 2036 movd [EDI], XMM0; 2037 2038 add ESI, 4; 2039 add EDI, 4;/**/ 2040 add EBX, 4; 2041 dec ECX; 2042 cmp ECX, 0; 2043 jnz fourpixelblendloop; 2044 2045 endofalgorithm: 2046 ; 2047 } 2048 } 2049 }else version(X86_64){ 2050 size_t target16 = length/4, target4 = length%4; 2051 asm @nogc { 2052 //setting up the pointer registers and the counter register 2053 //mov EBX, alpha[EBP]; 2054 mov RSI, src[RBP]; 2055 mov RDI, dest[RBP]; 2056 mov RBX, mask[RBP]; 2057 mov RCX, target16; 2058 cmp RCX, 0; 2059 movups XMM8, ALPHABLEND_SSE2_CONST256; 2060 movups XMM9, ALPHABLEND_SSE2_CONST1; 2061 movups XMM10, ALPHABLEND_SSE2_MASK; 2062 jz fourpixelblend; //skip 16 byte operations if not needed 2063 //iteration cycle entry point 2064 sixteenpixelblend: 2065 //create alpha mask on the fly 2066 movups XMM3, [RSI]; 2067 /*movups XMM1, XMM3; 2068 pand XMM1, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2069 movups XMM0, XMM1; 2070 pslld XMM0, 8; 2071 por XMM1, XMM0; //mask is ready for RA 2072 pslld XMM1, 16; 2073 por XMM0, XMM1; //mask is ready for BGRA/**/ 2074 movups XMM0, [RBX]; 2075 movups XMM1, XMM0; 2076 2077 punpcklbw XMM0, XMM2; 2078 punpckhbw XMM1, XMM2; 2079 movups XMM6, XMM8; 2080 movups XMM7, XMM8; 2081 movups XMM4, XMM9; 2082 movups XMM5, XMM9; 2083 2084 paddusw XMM4, XMM0; //1 + alpha01 2085 paddusw XMM5, XMM1; //1 + alpha23 2086 psubusw XMM6, XMM0; //256 - alpha01 2087 psubusw XMM7, XMM1; //256 - alpha23 2088 2089 //moving the values to their destinations 2090 2091 movups XMM0, XMM3; //src01 2092 movups XMM1, XMM0; //src23 2093 punpcklbw XMM0, XMM2; 2094 punpckhbw XMM1, XMM2; 2095 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 2096 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 2097 movups XMM0, [EDI]; //dest01 2098 movups XMM1, XMM0; //dest23 2099 punpcklbw XMM0, XMM2; 2100 punpckhbw XMM1, XMM2; 2101 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 2102 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 2103 2104 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 2105 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 2106 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 2107 psrlw XMM5, 8; 2108 //moving the result to its place; 2109 //pxor MM2, MM2; 2110 packuswb XMM4, XMM5; 2111 2112 movups [RDI], XMM4; 2113 //add EBX, 16; 2114 add RSI, 16; 2115 add RDI, 16; 2116 add RBX, 16; 2117 dec RCX; 2118 cmp RCX, 0; 2119 jnz sixteenpixelblend; 2120 2121 fourpixelblend: 2122 2123 mov RCX, target4; 2124 cmp RCX, 0; 2125 jz endofalgorithm; 2126 2127 fourpixelblendloop: 2128 2129 //movd XMM6, [EBX];//alpha 2130 2131 2132 movd XMM0, [RDI]; 2133 movd XMM1, [RSI]; 2134 punpcklbw XMM0, XMM2;//dest 2135 punpcklbw XMM1, XMM2;//src 2136 movups XMM6, [RBX]; 2137 /*pand XMM6, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2138 movups XMM7, XMM6; 2139 pslld XMM6, 8; 2140 por XMM7, XMM6; //mask is ready for RA 2141 pslld XMM7, 16; 2142 por XMM6, XMM7; //mask is ready for BGRA*/ 2143 2144 punpcklbw XMM6, XMM2; 2145 2146 movaps XMM4, XMM8; 2147 movaps XMM5, XMM9; 2148 2149 paddusw XMM5, XMM6;//1+alpha 2150 psubusw XMM4, XMM6;//256-alpha 2151 2152 pmullw XMM0, XMM4;//dest*(256-alpha) 2153 pmullw XMM1, XMM5;//src*(1+alpha) 2154 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 2155 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 2156 2157 packuswb XMM0, XMM2; 2158 2159 movd [RDI], XMM0; 2160 2161 add RSI, 4; 2162 add RDI, 4;/**/ 2163 add RBX, 4; 2164 dec RCX; 2165 cmp RCX, 0; 2166 jnz fourpixelblendloop; 2167 2168 endofalgorithm: 2169 ; 2170 } 2171 }else{ 2172 for(int i ; i < length ; i++){ 2173 switch(mask.AlphaMask.value){ 2174 case 0: 2175 break; 2176 case 255: 2177 dest = src; 2178 break; 2179 default: 2180 int src1 = 1 + mask.AlphaMask.value; 2181 int src256 = 256 - mask.AlphaMask.value; 2182 dest.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8); 2183 dest.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8); 2184 dest.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8); 2185 break; 2186 } 2187 src++; 2188 dest++; 2189 mask++; 2190 } 2191 } 2192 } 2193 /** 2194 * Copies a 32bit image onto another without blitter. No transparency is used. Mask is placeholder. 2195 */ 2196 public @nogc void copy32bit(uint* src, uint* dest, size_t length, uint* mask){ 2197 copy32bit(src,dest,length); 2198 } 2199 /** 2200 * Two plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula: 2201 * mask = src == 0x00 ? 0xFF : 0x00 2202 * Final values are copied into memory location specified by dest1. 2203 */ 2204 public @nogc void blitter8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length){ 2205 version(X86){ 2206 version(MMX){ 2207 asm @nogc{ 2208 pxor MM7, MM7; 2209 mov ESI, src[EBP]; 2210 mov EDI, dest[EBP]; 2211 mov EDX, dest1[EBP]; 2212 mov ECX, length; 2213 cmp ECX, 8; 2214 jl fourpixel; 2215 eigthpixelloop: 2216 movq MM0, [ESI]; 2217 movq MM1, [EDI]; 2218 movq MM2, MM7; 2219 pcmpeqb MM2, MM0; 2220 pand MM1, MM2; 2221 por MM1, MM0; 2222 movq [EDX], MM1; 2223 add ESI, 8; 2224 add EDI, 8; 2225 add EDX, 8; 2226 sub ECX, 8; 2227 jge eigthpixelloop; 2228 fourpixel: 2229 cmp ECX, 4; 2230 jl singlepixelloop; 2231 movd MM0, [ESI]; 2232 movd MM1, [EDI]; 2233 movq MM2, MM7; 2234 pcmpeqb MM2, MM0; 2235 pand MM1, MM2; 2236 por MM1, MM0; 2237 movd [EDX], MM1; 2238 add ESI, 4; 2239 add EDI, 4; 2240 add EDX, 4; 2241 sub ECX, 4; 2242 singlepixelloop: 2243 //cmp ECX, 0; 2244 jecxz end; 2245 mov AL, [ESI]; 2246 cmp AL, 0; 2247 jz step; 2248 mov AL, [EDI]; 2249 step: 2250 mov [EDX], AL; 2251 cmp ECX, 0; 2252 inc ESI; 2253 inc EDI; 2254 inc EDX; 2255 dec ECX; 2256 jmp singlepixelloop; 2257 end: 2258 emms; 2259 } 2260 }else{ 2261 asm @nogc{ 2262 pxor XMM7, XMM7; 2263 mov ESI, src[EBP]; 2264 mov EDI, dest[EBP]; 2265 mov EDX, dest1[EBP]; 2266 mov ECX, length; 2267 cmp ECX, 16; 2268 jl eightpixel; 2269 sixteenpixelloop: 2270 movups XMM0, [ESI]; 2271 movups XMM1, [EDI]; 2272 movups XMM2, XMM7; 2273 pcmpeqb XMM2, XMM0; 2274 pand XMM1, XMM2; 2275 por XMM1, XMM0; 2276 movups [EDX], XMM1; 2277 add ESI, 16; 2278 add EDI, 16; 2279 add EDX, 16; 2280 sub ECX, 16; 2281 cmp ECX, 16; 2282 jge sixteenpixelloop; 2283 eightpixel: 2284 cmp ECX, 8; 2285 jl fourpixel; 2286 movq XMM0, [ESI]; 2287 movq XMM1, [EDI]; 2288 movups XMM2, XMM7; 2289 pcmpeqb XMM2, XMM0; 2290 pand XMM1, XMM2; 2291 por XMM1, XMM0; 2292 movq [EDX], XMM1; 2293 add ESI, 8; 2294 add EDI, 8; 2295 add EDX, 8; 2296 sub ECX, 8; 2297 fourpixel: 2298 cmp ECX, 4; 2299 jl singlepixelloop; 2300 movd XMM0, [ESI]; 2301 movd XMM1, [EDI]; 2302 movups XMM2, XMM7; 2303 pcmpeqb XMM2, XMM0; 2304 pand XMM1, XMM2; 2305 por XMM1, XMM0; 2306 movd [EDX], XMM1; 2307 add ESI, 4; 2308 add EDI, 4; 2309 add EDX, 4; 2310 sub ECX, 4; 2311 singlepixelloop: 2312 //cmp ECX, 0; 2313 jecxz end; 2314 mov AL, [ESI]; 2315 cmp AL, 0; 2316 jz step; 2317 mov AL, [EDI]; 2318 step: 2319 mov [EDX], AL; 2320 cmp ECX, 0; 2321 inc ESI; 2322 inc EDI; 2323 inc EDX; 2324 dec ECX; 2325 jmp singlepixelloop; 2326 end: 2327 ; 2328 } 2329 } 2330 }else version(X86_64){ 2331 asm @nogc{ 2332 pxor XMM7, XMM7; 2333 mov RSI, src[RBP]; 2334 mov RDI, dest[RBP]; 2335 mov RDX, dest1[RBP]; 2336 mov RCX, length; 2337 cmp RCX, 16; 2338 jl eightpixel; 2339 sixteenpixelloop: 2340 movups XMM0, [RSI]; 2341 movups XMM1, [RDI]; 2342 movups XMM2, XMM7; 2343 pcmpeqb XMM2, XMM0; 2344 pand XMM1, XMM2; 2345 por XMM1, XMM0; 2346 movups [RDX], XMM1; 2347 add RSI, 16; 2348 add RDI, 16; 2349 add RDX, 16; 2350 sub RCX, 16; 2351 cmp RCX, 16; 2352 jge sixteenpixelloop; 2353 eightpixel: 2354 cmp RCX, 8; 2355 jl fourpixel; 2356 movq XMM0, [RSI]; 2357 movq XMM1, [RDI]; 2358 movups XMM2, XMM7; 2359 pcmpeqb XMM2, XMM0; 2360 pand XMM1, XMM2; 2361 por XMM1, XMM0; 2362 movq [RDX], XMM1; 2363 add RSI, 8; 2364 add RDI, 8; 2365 add RDI, 8; 2366 sub RCX, 8; 2367 fourpixel: 2368 cmp RCX, 4; 2369 jl singlepixelloop; 2370 movd XMM0, [RSI]; 2371 movd XMM1, [RDI]; 2372 movups XMM2, XMM7; 2373 pcmpeqb XMM2, XMM0; 2374 pand XMM1, XMM2; 2375 por XMM1, XMM0; 2376 movd [RDX], XMM1; 2377 add RSI, 4; 2378 add RDI, 4; 2379 add RDI, 4; 2380 sub RCX, 4; 2381 singlepixelloop: 2382 cmp RCX, 0; 2383 jz end; 2384 mov AL, [RSI]; 2385 cmp AL, 0; 2386 jz step; 2387 mov AL, [EDI]; 2388 step: 2389 mov [RDI], AL; 2390 cmp RCX, 0; 2391 inc RSI; 2392 inc RDI; 2393 inc RDX; 2394 dec RCX; 2395 jmp singlepixelloop; 2396 end: 2397 ; 2398 } 2399 }else{ 2400 while(length){ 2401 if(*src) 2402 *dest = *src; 2403 src++; 2404 dest++; 2405 length--; 2406 } 2407 } 2408 } 2409 /** 2410 * Copies an 8bit image onto another without blitter. No transparency is used. Dest is placeholder. 2411 */ 2412 public @nogc void copy8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length){ 2413 copy8bit(src,dest1,length); 2414 } 2415 /** 2416 * Three plus one operand blitter for 8 bit values. Automatic mask-generation is used from the source's color index with the following formula: 2417 * mask = src == 0x0000 ? 0xFFFF : 0x0000 2418 * Result is copied into memory location specified by dest1. 2419 */ 2420 public @nogc void blitter16bit(ushort* src, ushort* dest, ushort* dest1, size_t length){ 2421 version(X86){ 2422 version(MMX){ 2423 asm @nogc{ 2424 pxor MM7, MM7; 2425 mov ESI, src[EBP]; 2426 mov EDI, dest[EBP]; 2427 mov EDX, dest[EBP]; 2428 mov ECX, length; 2429 cmp ECX, 4; 2430 jl twopixel; 2431 fourpixelloop: 2432 movq MM0, [ESI]; 2433 movq MM1, [EDI]; 2434 movq MM2, MM0; 2435 pcmpeqw MM2, MM7; 2436 pand MM1, MM2; 2437 por MM1, MM0; 2438 movq [EDX], MM1; 2439 add ESI, 8; 2440 add EDI, 8; 2441 add EDX, 8; 2442 sub ECX, 4; 2443 jge fourpixelloop; 2444 twopixel: 2445 cmp ECX, 4; 2446 jl singlepixel; 2447 movd MM0, [ESI]; 2448 movd MM1, [EDI]; 2449 movq MM2, MM7; 2450 pcmpeqw MM2, MM0; 2451 pand MM1, MM2; 2452 por MM1, MM0; 2453 movd [EDX], MM1; 2454 add ESI, 2; 2455 add EDI, 2; 2456 add EDX, 2; 2457 sub ECX, 2; 2458 singlepixel: 2459 //cmp ECX, 0; 2460 jecxz end; 2461 mov AX, [ESI]; 2462 cmp AX, 0; 2463 cmovz AX, [EDI]; 2464 mov [EDX], AL; 2465 end: 2466 emms; 2467 } 2468 }else{ 2469 asm @nogc{ 2470 pxor XMM7, XMM7; 2471 mov ESI, src[EBP]; 2472 mov EDI, dest[EBP]; 2473 mov EDX, dest[EBP]; 2474 mov ECX, length; 2475 cmp ECX, 8; 2476 jl fourpixel; 2477 eigthpixelloop: 2478 movups XMM0, [ESI]; 2479 movups XMM1, [EDI]; 2480 movups XMM2, XMM7; 2481 pcmpeqw XMM2, XMM0; 2482 pand XMM1, XMM2; 2483 por XMM1, XMM0; 2484 movups [EDX], XMM1; 2485 add ESI,16; 2486 add EDI,16; 2487 add EDX,16; 2488 sub ECX, 8; 2489 cmp ECX, 8; 2490 jge eigthpixelloop; 2491 fourpixel: 2492 cmp ECX, 4; 2493 jl twopixel; 2494 movq XMM0, [ESI]; 2495 movq XMM1, [EDI]; 2496 movups XMM2, XMM7; 2497 pcmpeqw XMM2, XMM0; 2498 pand XMM1, XMM2; 2499 por XMM1, XMM0; 2500 movq [EDX], XMM1; 2501 add ESI, 8; 2502 add EDI, 8; 2503 add EDX, 8; 2504 sub ECX, 4; 2505 twopixel: 2506 cmp ECX, 2; 2507 jl singlepixel; 2508 movd XMM0, [ESI]; 2509 movd XMM1, [EDI]; 2510 movups XMM2, XMM7; 2511 pcmpeqw XMM2, XMM0; 2512 pand XMM1, XMM2; 2513 por XMM1, XMM0; 2514 movd [EDX], XMM1; 2515 add ESI, 4; 2516 add EDI, 4; 2517 add EDX, 4; 2518 sub ECX, 2; 2519 singlepixel: 2520 //cmp ECX, 0; 2521 jecxz end; 2522 mov AX, [ESI]; 2523 cmp AX, 0; 2524 cmovz AX, [EDI]; 2525 mov [EDX], AX; 2526 end: 2527 ; 2528 } 2529 } 2530 }else version(X86_64){ 2531 asm @nogc{ 2532 pxor XMM7, XMM7; 2533 mov RSI, src[RBP]; 2534 mov RDI, dest[RBP]; 2535 mov RDX, dest1[RBP]; 2536 mov RCX, length; 2537 cmp RCX, 8; 2538 jl fourpixel; 2539 eigthpixelloop: 2540 movups XMM0, [RSI]; 2541 movups XMM1, [RDI]; 2542 movups XMM2, XMM7; 2543 pcmpeqw XMM2, XMM0; 2544 pand XMM1, XMM2; 2545 por XMM1, XMM0; 2546 movups [RDX], XMM1; 2547 add RSI,16; 2548 add RDI,16; 2549 add RDX,16; 2550 sub RCX, 8; 2551 cmp RCX, 8; 2552 jge eigthpixelloop; 2553 fourpixel: 2554 cmp RCX, 4; 2555 jl twopixel; 2556 movq XMM0, [RSI]; 2557 movq XMM1, [RDI]; 2558 movups XMM2, XMM7; 2559 pcmpeqw XMM2, XMM0; 2560 pand XMM1, XMM2; 2561 por XMM1, XMM0; 2562 movq [RDX], XMM1; 2563 add RSI, 8; 2564 add RDI, 8; 2565 add RDX, 8; 2566 sub RCX, 4; 2567 twopixel: 2568 cmp RCX, 2; 2569 jl singlepixel; 2570 movd XMM0, [RSI]; 2571 movd XMM1, [RDI]; 2572 movups XMM2, XMM7; 2573 pcmpeqw XMM2, XMM0; 2574 pand XMM1, XMM2; 2575 por XMM1, XMM0; 2576 movd [RDX], XMM1; 2577 add RSI, 4; 2578 add RDI, 4; 2579 add RDX, 4; 2580 sub RCX, 2; 2581 singlepixel: 2582 cmp RCX, 0; 2583 jz end; 2584 mov AX, [RSI]; 2585 cmp AX, 0; 2586 cmovz AX, [RDI]; 2587 mov [RDX], AX; 2588 end: 2589 ; 2590 } 2591 }else{ 2592 while(length){ 2593 if(*src) 2594 *dest1 = *src; 2595 else 2596 *dest1 = *dest; 2597 src++; 2598 dest++; 2599 length--; 2600 } 2601 } 2602 } 2603 /** 2604 * Copies a 16bit image onto another without blitter. No transparency is used. Dest is placeholder. 2605 */ 2606 public @nogc void copy16bit(ushort* src, ushort* dest, ushort* dest1, size_t length){ 2607 copy16bit(src,dest1,length); 2608 } 2609 /** 2610 * Three plus one operand blitter for 32 bit values. Automatic mask-generation is used from the source's alpha channel with the following formula: 2611 * mask = src.alpha ? 0xFFFFFFFF : 0x00000000 2612 * The result is copied into the memory location specified by dest1 2613 */ 2614 public @nogc void blitter32bit(uint* src, uint* dest, uint* dest1, size_t length){ 2615 version(X86){ 2616 version(MMX){ 2617 asm @nogc{ 2618 mov ESI, src[EBP]; 2619 mov EDI, dest[EBP]; 2620 mov EDX, dest1[EBP]; 2621 mov ECX, length; 2622 movq MM6, ALPHABLEND_MMX_MASK; 2623 pxor MM7, MM7; 2624 cmp ECX, 2; 2625 jl twopixel; 2626 twopixelloop: 2627 movq MM0, [ESI]; 2628 movq MM1, [EDI]; 2629 movq MM2, MM0; 2630 pand MM2, MM6; 2631 pcmpeqd MM2, MM7; 2632 pand MM1, MM2; 2633 por MM1, MM0; 2634 movq [EDX], MM1; 2635 add ESI, 8; 2636 add EDI, 8; 2637 add EDX, 8; 2638 sub ECX, 2; 2639 jge fourpixelloop; 2640 onepixel: 2641 cmp ECX, 1; 2642 jl end; 2643 movd MM0, [ESI]; 2644 movd MM1, [EDI]; 2645 movq MM2, MM0; 2646 pand MM2, MM6; 2647 pcmpeqd MM2, MM7; 2648 pand MM1, MM2; 2649 por MM1, MM0; 2650 movd [EDX], MM1; 2651 end: 2652 emms; 2653 } 2654 }else{ 2655 asm @nogc{ 2656 mov ESI, src[EBP]; 2657 mov EDI, dest[EBP]; 2658 mov EDX, dest1[EBP]; 2659 mov ECX, length; 2660 movups XMM6, ALPHABLEND_SSE2_MASK; 2661 pxor XMM7, XMM7; 2662 cmp ECX, 8; 2663 jl twopixel; 2664 fourpixelloop: 2665 movups XMM0, [ESI]; 2666 movups XMM1, [EDI]; 2667 movups XMM2, XMM0; 2668 pand XMM2, XMM6; 2669 pcmpeqd XMM2, XMM7; 2670 pand XMM1, XMM2; 2671 por XMM1, XMM0; 2672 movups [EDX], XMM1; 2673 add ESI,16; 2674 add EDI,16; 2675 add EDX,16; 2676 sub ECX, 4; 2677 cmp ECX, 4; 2678 jge fourpixelloop; 2679 twopixel: 2680 cmp ECX, 2; 2681 jl onepixel; 2682 movq XMM0, [ESI]; 2683 movq XMM1, [EDI]; 2684 movq XMM2, XMM0; 2685 pand XMM2, XMM6; 2686 pcmpeqd XMM2, XMM7; 2687 pand XMM1, XMM2; 2688 por XMM1, XMM0; 2689 movq [EDX], XMM1; 2690 add ESI, 8; 2691 add EDI, 8; 2692 add EDX, 8; 2693 sub ECX, 2; 2694 onepixel: 2695 cmp ECX, 1; 2696 jl end; 2697 movd XMM0, [ESI]; 2698 movd XMM1, [EDI]; 2699 movq XMM2, XMM0; 2700 pand XMM2, XMM6; 2701 pcmpeqd XMM2, XMM7; 2702 pand XMM1, XMM2; 2703 por XMM1, XMM0; 2704 movd [EDX], XMM1; 2705 end: 2706 ; 2707 } 2708 } 2709 }else version(X86_64){ 2710 asm @nogc{ 2711 mov RSI, src[RBP]; 2712 mov RDI, dest[RBP]; 2713 mov RDX, dest1[RBP]; 2714 mov RCX, length; 2715 movups XMM6, ALPHABLEND_SSE2_MASK; 2716 pxor XMM7, XMM7; 2717 cmp ECX, 8; 2718 jl twopixel; 2719 fourpixelloop: 2720 movups XMM0, [RSI]; 2721 movups XMM1, [RDI]; 2722 movups XMM2, XMM0; 2723 pand XMM2, XMM6; 2724 pcmpeqd XMM2, XMM7; 2725 pand XMM1, XMM2; 2726 por XMM1, XMM0; 2727 movups [RDX], XMM1; 2728 add RSI,16; 2729 add RDI,16; 2730 add RDX,16; 2731 sub RCX, 4; 2732 cmp RCX, 4; 2733 jge fourpixelloop; 2734 twopixel: 2735 cmp RCX, 2; 2736 jl onepixel; 2737 movq XMM0, [RSI]; 2738 movq XMM1, [RDI]; 2739 movq XMM2, XMM0; 2740 pand XMM2, XMM6; 2741 pcmpeqd XMM2, XMM7; 2742 pand XMM1, XMM2; 2743 por XMM1, XMM0; 2744 movq [RDX], XMM1; 2745 add RSI, 8; 2746 add RDI, 8; 2747 add RDX, 8; 2748 sub RCX, 2; 2749 onepixel: 2750 cmp RCX, 1; 2751 jl end; 2752 movd XMM0, [RSI]; 2753 movd XMM1, [RDI]; 2754 movq XMM2, XMM0; 2755 pand XMM2, XMM6; 2756 pcmpeqd XMM2, XMM7; 2757 pand XMM1, XMM2; 2758 por XMM1, XMM0; 2759 movd [RDI], XMM1; 2760 end: 2761 ; 2762 } 2763 }else{ 2764 while(length){ 2765 if(*src.ColorSpaceARGB.alpha) 2766 *dest1 = *src; 2767 else 2768 *dest1 = *dest; 2769 src++; 2770 dest++; 2771 dest1++; 2772 length--; 2773 } 2774 } 2775 } 2776 /** 2777 * Implements a three plus one operand alpha-blending algorithm for 32bit bitmaps. Automatic alpha-mask generation follows this formula: 2778 * src[B,G,R,A] --> mask [A,A,A,A] 2779 */ 2780 public @nogc void alphaBlend32bit(uint* src, uint* dest, uint* dest1, size_t length){ 2781 version(X86){ 2782 version(MMX){ 2783 int target8 = length/8, target4 = length%2; 2784 asm @nogc { 2785 //setting up the pointer registers and the counter register 2786 //mov EBX, alpha[EBP]; 2787 mov ESI, src[EBP]; 2788 mov EDI, dest[EBP]; 2789 mov EDX, dest1[EBP]; 2790 mov ECX, target8; 2791 cmp ECX, 0; 2792 jz fourpixelblend; //skip 16 byte operations if not needed 2793 //iteration cycle entry point 2794 sixteenpixelblend: 2795 //create alpha mask on the fly 2796 movq MM3, [ESI]; 2797 movq MM1, MM3; 2798 pand MM1, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2799 movq MM0, MM1; 2800 pslld MM0, 8; 2801 por MM1, MM0; //mask is ready for RA 2802 pslld MM1, 16; 2803 por MM0, MM1; //mask is ready for BGRA 2804 movq MM1, MM0; 2805 punpcklbw MM0, MM2; 2806 punpckhbw MM1, MM2; 2807 movq MM6, ALPHABLEND_MMX_CONST256; 2808 movq MM7, MM6; 2809 movq MM4, ALPHABLEND_MMX_CONST1; 2810 movq MM5, MM4; 2811 2812 paddusw MM4, MM0; //1 + alpha01 2813 paddusw MM5, MM1; //1 + alpha23 2814 psubusw MM6, MM0; //256 - alpha01 2815 psubusw MM7, MM1; //256 - alpha23 2816 2817 //moving the values to their destinations 2818 movq MM0, MM3; //src01 2819 movq MM1, MM0; //src23 2820 punpcklbw MM0, MM2; 2821 punpckhbw MM1, MM2; 2822 pmullw MM4, MM0; //src01 * (1 + alpha01) 2823 pmullw MM5, MM1; //src23 * (1 + alpha23) 2824 movq MM0, [EDI]; //dest01 2825 movq MM1, MM0; //dest23 2826 punpcklbw MM0, MM2; 2827 punpckhbw MM1, MM2; 2828 pmullw MM6, MM0; //dest01 * (256 - alpha) 2829 pmullw MM7, MM1; //dest23 * (256 - alpha) 2830 2831 paddusw MM4, MM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 2832 paddusw MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 2833 psrlw MM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 2834 psrlw MM5, 8; 2835 //moving the result to its place; 2836 //pxor MM2, MM2; 2837 packuswb MM4, MM5; 2838 2839 movq [EDX], MM4; 2840 //add EBX, 16; 2841 add ESI, 8; 2842 add EDI, 8; 2843 add EDX, 8; 2844 dec ECX; 2845 cmp ECX, 0; 2846 jnz sixteenpixelblend; 2847 fourpixelblend: 2848 mov ECX, target4; 2849 cmp ECX, 0; 2850 jz endofalgorithm; 2851 fourpixelblendloop: 2852 2853 //movd XMM6, [EBX];//alpha 2854 2855 2856 movd MM0, [EDI]; 2857 movd MM1, [ESI]; 2858 punpcklbw MM0, MM2;//dest 2859 punpcklbw MM1, MM2;//src 2860 movups MM6, MM1; 2861 pand MM6, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2862 movups MM7, MM6; 2863 pslld MM6, 8; 2864 por MM7, MM6; //mask is ready for RA 2865 pslld MM7, 16; 2866 por MM6, MM7; //mask is ready for GRA 2867 punpcklbw MM7, MM2; 2868 movaps MM4, ALPHABLEND_MMX_CONST256; 2869 movaps MM5, ALPHABLEND_MMX_CONST1; 2870 2871 paddusw MM5, MM6;//1+alpha 2872 psubusw MM4, MM6;//256-alpha 2873 2874 pmullw MM0, MM4;//dest*(256-alpha) 2875 pmullw MM1, MM5;//src*(1+alpha) 2876 paddusw MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha)) 2877 psrlw MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 2878 2879 packuswb MM0, MM2; 2880 2881 movd [EDX], MM0; 2882 2883 endofalgorithm: 2884 emms; 2885 } 2886 }else{ 2887 int target16 = length/4, target4 = length%4; 2888 asm @nogc { 2889 //setting up the pointer registers and the counter register 2890 //mov EBX, alpha[EBP]; 2891 mov ESI, src[EBP]; 2892 mov EDI, dest[EBP]; 2893 mov ECX, target16; 2894 cmp ECX, 0; 2895 jz fourpixelblend; //skip 16 byte operations if not needed 2896 //iteration cycle entry point 2897 sixteenpixelblend: 2898 //create alpha mask on the fly 2899 movups XMM3, [ESI]; 2900 movups XMM1, XMM3; 2901 pand XMM1, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2902 movups XMM0, XMM1; 2903 pslld XMM0, 8; 2904 por XMM1, XMM0; //mask is ready for RA 2905 pslld XMM1, 16; 2906 por XMM0, XMM1; //mask is ready for BGRA/**/ 2907 movups XMM1, XMM0; 2908 2909 punpcklbw XMM0, XMM2; 2910 punpckhbw XMM1, XMM2; 2911 movups XMM6, ALPHABLEND_SSE2_CONST256; 2912 movups XMM7, XMM6; 2913 movups XMM4, ALPHABLEND_SSE2_CONST1; 2914 movups XMM5, XMM4; 2915 2916 paddusw XMM4, XMM0; //1 + alpha01 2917 paddusw XMM5, XMM1; //1 + alpha23 2918 psubusw XMM6, XMM0; //256 - alpha01 2919 psubusw XMM7, XMM1; //256 - alpha23 2920 2921 //moving the values to their destinations 2922 2923 movups XMM0, XMM3; //src01 2924 movups XMM1, XMM0; //src23 2925 punpcklbw XMM0, XMM2; 2926 punpckhbw XMM1, XMM2; 2927 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 2928 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 2929 movups XMM0, [EDI]; //dest01 2930 movups XMM1, XMM0; //dest23 2931 punpcklbw XMM0, XMM2; 2932 punpckhbw XMM1, XMM2; 2933 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 2934 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 2935 2936 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 2937 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 2938 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 2939 psrlw XMM5, 8; 2940 //moving the result to its place; 2941 //pxor MM2, MM2; 2942 packuswb XMM4, XMM5; 2943 2944 movups [EDI], XMM4; 2945 //add EBX, 16; 2946 add ESI, 16; 2947 add EDI, 16; 2948 add EDX, 16; 2949 dec ECX; 2950 cmp ECX, 0; 2951 jnz sixteenpixelblend; 2952 2953 fourpixelblend: 2954 2955 mov ECX, target4; 2956 cmp ECX, 0; 2957 jz endofalgorithm; 2958 2959 fourpixelblendloop: 2960 2961 //movd XMM6, [EBX];//alpha 2962 2963 2964 movd XMM0, [EDI]; 2965 movd XMM1, [ESI]; 2966 punpcklbw XMM0, XMM2;//dest 2967 punpcklbw XMM1, XMM2;//src 2968 movups XMM6, XMM1; 2969 pand XMM6, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 2970 movups XMM7, XMM6; 2971 pslld XMM6, 8; 2972 por XMM7, XMM6; //mask is ready for RA 2973 pslld XMM7, 16; 2974 por XMM6, XMM7; //mask is ready for BGRA 2975 2976 punpcklbw XMM6, XMM2; 2977 2978 movaps XMM4, ALPHABLEND_SSE2_CONST256; 2979 movaps XMM5, ALPHABLEND_SSE2_CONST1; 2980 2981 paddusw XMM5, XMM6;//1+alpha 2982 psubusw XMM4, XMM6;//256-alpha 2983 2984 pmullw XMM0, XMM4;//dest*(256-alpha) 2985 pmullw XMM1, XMM5;//src*(1+alpha) 2986 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 2987 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 2988 2989 packuswb XMM0, XMM2; 2990 2991 movd [EDI], XMM0; 2992 2993 add ESI, 4; 2994 add EDI, 4;/**/ 2995 add EDX, 4; 2996 dec ECX; 2997 cmp ECX, 0; 2998 jnz fourpixelblendloop; 2999 3000 endofalgorithm: 3001 ; 3002 } 3003 } 3004 }else version(X86_64){ 3005 size_t target16 = length/4, target4 = length%4; 3006 asm @nogc { 3007 //setting up the pointer registers and the counter register 3008 //mov EBX, alpha[EBP]; 3009 mov RSI, src[RBP]; 3010 mov RDI, dest[RBP]; 3011 mov RDX, dest1[RBP]; 3012 mov RCX, target16; 3013 cmp RCX, 0; 3014 movups XMM8, ALPHABLEND_SSE2_CONST256; 3015 movups XMM9, ALPHABLEND_SSE2_CONST1; 3016 movups XMM10, ALPHABLEND_SSE2_MASK; 3017 jz fourpixelblend; //skip 16 byte operations if not needed 3018 //iteration cycle entry point 3019 sixteenpixelblend: 3020 //create alpha mask on the fly 3021 movups XMM3, [RSI]; 3022 movups XMM1, XMM3; 3023 pand XMM1, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3024 movups XMM0, XMM1; 3025 pslld XMM0, 8; 3026 por XMM1, XMM0; //mask is ready for RA 3027 pslld XMM1, 16; 3028 por XMM0, XMM1; //mask is ready for BGRA/**/ 3029 movups XMM1, XMM0; 3030 3031 punpcklbw XMM0, XMM2; 3032 punpckhbw XMM1, XMM2; 3033 movups XMM6, XMM8; 3034 movups XMM7, XMM8; 3035 movups XMM4, XMM9; 3036 movups XMM5, XMM9; 3037 3038 paddusw XMM4, XMM0; //1 + alpha01 3039 paddusw XMM5, XMM1; //1 + alpha23 3040 psubusw XMM6, XMM0; //256 - alpha01 3041 psubusw XMM7, XMM1; //256 - alpha23 3042 3043 //moving the values to their destinations 3044 3045 movups XMM0, XMM3; //src01 3046 movups XMM1, XMM0; //src23 3047 punpcklbw XMM0, XMM2; 3048 punpckhbw XMM1, XMM2; 3049 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 3050 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 3051 movups XMM0, [EDI]; //dest01 3052 movups XMM1, XMM0; //dest23 3053 punpcklbw XMM0, XMM2; 3054 punpckhbw XMM1, XMM2; 3055 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 3056 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 3057 3058 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 3059 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 3060 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 3061 psrlw XMM5, 8; 3062 //moving the result to its place; 3063 //pxor MM2, MM2; 3064 packuswb XMM4, XMM5; 3065 3066 movups [RDX], XMM4; 3067 //add EBX, 16; 3068 add RSI, 16; 3069 add RDI, 16; 3070 add RDX, 16; 3071 dec RCX; 3072 cmp RCX, 0; 3073 jnz sixteenpixelblend; 3074 3075 fourpixelblend: 3076 3077 mov RCX, target4; 3078 cmp RCX, 0; 3079 jz endofalgorithm; 3080 3081 fourpixelblendloop: 3082 3083 //movd XMM6, [EBX];//alpha 3084 3085 3086 movd XMM0, [RDI]; 3087 movd XMM1, [RSI]; 3088 punpcklbw XMM0, XMM2;//dest 3089 punpcklbw XMM1, XMM2;//src 3090 movups XMM6, XMM1; 3091 pand XMM6, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3092 movups XMM7, XMM6; 3093 pslld XMM6, 8; 3094 por XMM7, XMM6; //mask is ready for RA 3095 pslld XMM7, 16; 3096 por XMM6, XMM7; //mask is ready for BGRA 3097 3098 punpcklbw XMM6, XMM2; 3099 3100 movaps XMM4, XMM8; 3101 movaps XMM5, XMM9; 3102 3103 paddusw XMM5, XMM6;//1+alpha 3104 psubusw XMM4, XMM6;//256-alpha 3105 3106 pmullw XMM0, XMM4;//dest*(256-alpha) 3107 pmullw XMM1, XMM5;//src*(1+alpha) 3108 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 3109 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 3110 3111 packuswb XMM0, XMM2; 3112 3113 movd [RDI], XMM0; 3114 3115 add RSI, 4; 3116 add RDI, 4; 3117 add RDX, 4; 3118 dec RCX; 3119 cmp RCX, 0; 3120 jnz fourpixelblendloop; 3121 3122 endofalgorithm: 3123 ; 3124 } 3125 }else{ 3126 for(int i ; i < length ; i++){ 3127 switch(src.ColorSpaceARGB.alpha){ 3128 case 0: 3129 break; 3130 case 255: 3131 dest = src; 3132 break; 3133 default: 3134 int src1 = 1 + src.ColorSpaceARGB.alpha; 3135 int src256 = 256 - src.ColorSpaceARGB.alpha; 3136 dest1.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8); 3137 dest1.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8); 3138 dest1.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8); 3139 break; 3140 } 3141 src++; 3142 dest++; 3143 dest1++; 3144 } 3145 } 3146 } 3147 /** 3148 * Copies a 32bit image onto another without blitter. No transparency is used. Dest is placeholder. 3149 */ 3150 public @nogc void copy32bit(uint* src, uint* dest, uint* dest1, size_t length){ 3151 copy32bit(src, dest1, length); 3152 } 3153 /** 3154 * Four plus one operand blitter for 8 bit values. Uses an external mask. Final values are copied into memory location specified by dest1; 3155 */ 3156 public @nogc void blitter8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length, ubyte* mask){ 3157 version(X86){ 3158 version(MMX){ 3159 asm @nogc{ 3160 mov ESI, src[EBP]; 3161 mov EDI, dest[EBP]; 3162 mov EBX, mask[EBP]; 3163 mov EDX, dest1[EBP]; 3164 mov ECX, length; 3165 cmp ECX, 8; 3166 jl fourpixel; 3167 eigthpixelloop: 3168 movq MM0, [ESI]; 3169 movq MM1, [EDI]; 3170 movq MM2, [EBX]; 3171 pand MM1, MM2; 3172 por MM1, MM0; 3173 movq [EDX], MM1; 3174 add ESI, 8; 3175 add EDI, 8; 3176 add EBX, 8; 3177 add EDX, 8; 3178 sub ECX, 8; 3179 jge eigthpixelloop; 3180 fourpixel: 3181 cmp ECX, 4; 3182 jl singlepixelloop; 3183 movd MM0, [ESI]; 3184 movd MM1, [EDI]; 3185 movd MM2, [EBX]; 3186 pand MM1, MM2; 3187 por MM1, MM0; 3188 movd [EDX], MM1; 3189 add ESI, 4; 3190 add EDI, 4; 3191 add EBX, 4; 3192 add EDX, 4; 3193 sub ECX, 4; 3194 singlepixelloop: 3195 //cmp ECX, 0; 3196 jecxz end; 3197 mov AL, [ESI]; 3198 mov AH, [EDI]; 3199 and AH, [EBX]; 3200 or AH, AL; 3201 mov [EDX], AH; 3202 inc ESI; 3203 inc EDI; 3204 inc EBX; 3205 inc EDX; 3206 dec ECX; 3207 jmp singlepixelloop; 3208 end: 3209 emms; 3210 } 3211 }else{ 3212 asm @nogc{ 3213 mov ESI, src[EBP]; 3214 mov EDI, dest[EBP]; 3215 mov EBX, mask[EBP]; 3216 mov EDX, dest1[EBP]; 3217 mov ECX, length; 3218 cmp ECX, 16; 3219 pxor XMM7, XMM7; 3220 jl eightpixel; 3221 sixteenpixelloop: 3222 movups XMM0, [ESI]; 3223 movups XMM1, [EDI]; 3224 movups XMM2, [EBX]; 3225 pand XMM1, XMM2; 3226 por XMM1, XMM0; 3227 movups [EDX], XMM1; 3228 add ESI, 16; 3229 add EDI, 16; 3230 add EBX, 16; 3231 add EDX, 16; 3232 sub ECX, 16; 3233 cmp ECX, 16; 3234 jge sixteenpixelloop; 3235 eightpixel: 3236 cmp ECX, 8; 3237 jl fourpixel; 3238 movq XMM0, [ESI]; 3239 movq XMM1, [EDI]; 3240 movq XMM2, [EBX]; 3241 pand XMM1, XMM2; 3242 por XMM1, XMM0; 3243 movq [EDX], XMM1; 3244 add ESI, 8; 3245 add EDI, 8; 3246 add EBX, 8; 3247 add EDX, 8; 3248 sub ECX, 8; 3249 fourpixel: 3250 cmp ECX, 4; 3251 jl singlepixelloop; 3252 movd XMM0, [ESI]; 3253 movd XMM1, [EDI]; 3254 movd XMM2, [EBX]; 3255 pand XMM1, XMM2; 3256 por XMM1, XMM0; 3257 movd [EDX], XMM1; 3258 add ESI, 4; 3259 add EDI, 4; 3260 add EBX, 4; 3261 add EDX, 4; 3262 sub ECX, 4; 3263 singlepixelloop: 3264 //cmp ECX, 0; 3265 jecxz end; 3266 mov AL, [ESI]; 3267 mov AH, [EDI]; 3268 and AH, [EBX]; 3269 or AH, AL; 3270 mov [EDX], AH; 3271 inc ESI; 3272 inc EDI; 3273 inc EBX; 3274 inc EDX; 3275 dec ECX; 3276 jmp singlepixelloop; 3277 end: 3278 ; 3279 } 3280 } 3281 }else version(X86_64){ 3282 asm @nogc{ 3283 mov RSI, src[RBP]; 3284 mov RDI, dest[RBP]; 3285 mov RBX, mask[RBP]; 3286 mov RDX, dest1[RBP]; 3287 mov RCX, length; 3288 cmp RCX, 16; 3289 //pxor XMM7, XMM7; 3290 jl eightpixel; 3291 sixteenpixelloop: 3292 movups XMM0, [RSI]; 3293 movups XMM1, [RDI]; 3294 movups XMM2, [RBX]; 3295 pand XMM1, XMM2; 3296 por XMM1, XMM0; 3297 movups [RDX], XMM1; 3298 add RSI, 16; 3299 add RDI, 16; 3300 add RBX, 16; 3301 add RDX, 16; 3302 sub RCX, 16; 3303 cmp RCX, 16; 3304 jge sixteenpixelloop; 3305 eightpixel: 3306 cmp RCX, 8; 3307 jl fourpixel; 3308 movq XMM0, [RSI]; 3309 movq XMM1, [RDI]; 3310 movq XMM2, [RBX]; 3311 pand XMM1, XMM2; 3312 por XMM1, XMM0; 3313 movq [RDX], XMM1; 3314 add RSI, 8; 3315 add RDI, 8; 3316 add RBX, 8; 3317 add RDX, 8; 3318 sub RCX, 8; 3319 fourpixel: 3320 cmp RCX, 4; 3321 jl singlepixelloop; 3322 movd XMM0, [RSI]; 3323 movd XMM1, [RDI]; 3324 movups XMM2, [RBX]; 3325 pand XMM1, XMM2; 3326 por XMM1, XMM0; 3327 movd [RDX], XMM1; 3328 add RSI, 4; 3329 add RDI, 4; 3330 add RBX, 4; 3331 add RDX, 4; 3332 sub RCX, 4; 3333 singlepixelloop: 3334 cmp RCX, 0; 3335 jz end; 3336 mov AL, [RSI]; 3337 mov AH, [RDI]; 3338 and AH, [RBX]; 3339 or AH, AL; 3340 mov [RDX], AH; 3341 inc RSI; 3342 inc RDI; 3343 inc RBX; 3344 inc RDX; 3345 dec RCX; 3346 jmp singlepixelloop; 3347 end: 3348 ; 3349 } 3350 }else{ 3351 while(length){ 3352 if(*src) 3353 *dest1 = (*dest & *mask) | *src; 3354 src++; 3355 dest++; 3356 dest1++; 3357 mask++; 3358 length--; 3359 } 3360 } 3361 } 3362 /** 3363 * Copies an 8bit image onto another without blitter. No transparency is used. Dest and mask are placeholders. 3364 */ 3365 public @nogc void copy8bit(ubyte* src, ubyte* dest, ubyte* dest1, size_t length, ubyte* mask){ 3366 copy8bit(src,dest1,length); 3367 } 3368 /** 3369 * Four plus one operand blitter for 8 bit values. Uses external mask. Copies the result to the memory location specified by dest1. 3370 */ 3371 public @nogc void blitter16bit(ushort* src, ushort* dest, ushort* dest1, size_t length, ushort* mask){ 3372 version(X86){ 3373 version(MMX){ 3374 asm @nogc{ 3375 pxor MM7, MM7; 3376 mov ESI, src[EBP]; 3377 mov EDI, dest[EBP]; 3378 mov EBX, mask[EBP]; 3379 mov EDX, dest[EBP]; 3380 mov ECX, length; 3381 cmp ECX, 4; 3382 jl twopixel; 3383 fourpixelloop: 3384 movq MM0, [ESI]; 3385 movq MM1, [EDI]; 3386 movq MM2, [EBX]; 3387 pand MM1, MM2; 3388 por MM1, MM0; 3389 movq [EDX], MM1; 3390 add ESI, 8; 3391 add EDI, 8; 3392 add EBX, 8; 3393 add EDX, 8; 3394 sub ECX, 4; 3395 jge fourpixelloop; 3396 twopixel: 3397 cmp ECX, 4; 3398 jl singlepixel; 3399 movd MM0, [ESI]; 3400 movd MM1, [EDI]; 3401 movd MM2, [EBX]; 3402 pand MM1, MM2; 3403 por MM1, MM0; 3404 movd [EDX], MM1; 3405 add ESI, 4; 3406 add EDI, 4; 3407 add EBX, 4; 3408 add EDX, 4; 3409 sub ECX, 2; 3410 singlepixel: 3411 //cmp ECX, 0; 3412 jecxz end; 3413 mov AX, [EBX]; 3414 and AX, [EDI]; 3415 or AX, [ESI]; 3416 mov [EDX], AX; 3417 end: 3418 emms; 3419 } 3420 }else{ 3421 asm @nogc{ 3422 pxor XMM7, XMM7; 3423 mov ESI, src[EBP]; 3424 mov EDI, dest[EBP]; 3425 mov EBX, mask[EBP]; 3426 mov EDX, dest1[EBP]; 3427 mov ECX, length; 3428 cmp ECX, 8; 3429 jl fourpixel; 3430 eigthpixelloop: 3431 movups XMM0, [ESI]; 3432 movups XMM1, [EDI]; 3433 movups XMM2, [EBX]; 3434 pand XMM1, XMM2; 3435 por XMM1, XMM0; 3436 movups [EDX], XMM1; 3437 add ESI,16; 3438 add EDI,16; 3439 add EBX,16; 3440 add EDX,16; 3441 sub ECX, 8; 3442 cmp ECX, 8; 3443 jge eigthpixelloop; 3444 fourpixel: 3445 cmp ECX, 4; 3446 jl twopixel; 3447 movq XMM0, [ESI]; 3448 movq XMM1, [EDI]; 3449 movq XMM2, [EBX]; 3450 pand XMM1, XMM2; 3451 por XMM1, XMM0; 3452 movq [EDX], XMM1; 3453 add ESI, 8; 3454 add EDI, 8; 3455 add EBX, 8; 3456 add EDX, 8; 3457 sub ECX, 4; 3458 twopixel: 3459 cmp ECX, 2; 3460 jl singlepixel; 3461 movd XMM0, [ESI]; 3462 movd XMM1, [EDI]; 3463 movd XMM2, [EBX]; 3464 pand XMM1, XMM2; 3465 por XMM1, XMM0; 3466 movd [EDX], XMM1; 3467 add ESI, 4; 3468 add EDI, 4; 3469 add EBX, 4; 3470 add EDX, 4; 3471 sub ECX, 2; 3472 singlepixel: 3473 //cmp ECX, 0; 3474 jecxz end; 3475 mov AX, [EBX]; 3476 and AX, [EDI]; 3477 or AX, [ESI]; 3478 mov [EDX], AX; 3479 end: 3480 ; 3481 } 3482 } 3483 }else version(X86_64){ 3484 asm @nogc{ 3485 pxor XMM7, XMM7; 3486 mov RSI, src[RBP]; 3487 mov RDI, dest[RBP]; 3488 mov RBX, mask[RBP]; 3489 mov RDX, dest1[RBP]; 3490 mov RCX, length; 3491 cmp RCX, 8; 3492 jl fourpixel; 3493 eigthpixelloop: 3494 movups XMM0, [RSI]; 3495 movups XMM1, [RDI]; 3496 movups XMM2, [RBX]; 3497 pand XMM1, XMM2; 3498 por XMM1, XMM0; 3499 movups [RDX], XMM1; 3500 add RSI,16; 3501 add RDI,16; 3502 add RBX,16; 3503 add RDX,16; 3504 sub RCX, 8; 3505 cmp RCX, 8; 3506 jge eigthpixelloop; 3507 fourpixel: 3508 cmp RCX, 4; 3509 jl twopixel; 3510 movq XMM0, [RSI]; 3511 movq XMM1, [RDI]; 3512 movq XMM2, [RBX]; 3513 pand XMM1, XMM2; 3514 por XMM1, XMM0; 3515 movq [RDX], XMM1; 3516 add RSI, 8; 3517 add RDI, 8; 3518 add RBX, 8; 3519 add RDX, 8; 3520 sub RCX, 4; 3521 twopixel: 3522 cmp RCX, 2; 3523 jl singlepixel; 3524 movd XMM0, [RSI]; 3525 movd XMM1, [RDI]; 3526 movd XMM2, [RBX]; 3527 pand XMM1, XMM2; 3528 por XMM1, XMM0; 3529 movd [RDX], XMM1; 3530 add RSI, 4; 3531 add RDI, 4; 3532 add RBX, 4; 3533 add RDX, 4; 3534 sub RCX, 2; 3535 singlepixel: 3536 cmp RCX, 0; 3537 jz end; 3538 mov AX, [RBX]; 3539 and AX, [RDI]; 3540 or AX, [RSI]; 3541 mov [RDX], AX; 3542 end: 3543 ; 3544 } 3545 }else{ 3546 while(length){ 3547 if(*src) 3548 *dest1 = *src; 3549 else 3550 *dest1 = *dest; 3551 src++; 3552 dest++; 3553 dest1++; 3554 length--; 3555 } 3556 } 3557 } 3558 /** 3559 * Copies a 16bit image onto another without blitter. No transparency is used. Dest and mask is placeholder. 3560 */ 3561 public @nogc void copy16bit(ushort* src, ushort* dest, ushort* dest1, size_t length, ushort* mask){ 3562 copy16bit(src,dest1,length); 3563 } 3564 /** 3565 * Two plus one operand blitter for 32 bit values. Uses a separate mask. Copies the result into location specified by dest1. 3566 */ 3567 public @nogc void blitter32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){ 3568 version(X86){ 3569 version(MMX){ 3570 asm @nogc{ 3571 mov ESI, src[EBP]; 3572 mov EDI, dest[EBP]; 3573 mov EBX, mask[EBP]; 3574 mov EDX, dest1[EBP]; 3575 mov ECX, length; 3576 movq MM6, ALPHABLEND_MMX_MASK; 3577 pxor MM7, MM7; 3578 cmp ECX, 2; 3579 jl twopixel; 3580 twopixelloop: 3581 movq MM0, [ESI]; 3582 movq MM1, [EDI]; 3583 movq MM2, [EBX]; 3584 pand MM1, MM2; 3585 por MM1, MM0; 3586 movq [EDX], MM1; 3587 add ESI, 8; 3588 add EDI, 8; 3589 add EBX, 8; 3590 add EDX, 8; 3591 sub ECX, 2; 3592 jge fourpixelloop; 3593 onepixel: 3594 jecxz end; 3595 movd MM0, [ESI]; 3596 movd MM1, [EDI]; 3597 movd MM2, [EBX]; 3598 pand MM1, MM2; 3599 por MM1, MM0; 3600 movd [EDX], MM1; 3601 end: 3602 emms; 3603 } 3604 }else{ 3605 asm @nogc{ 3606 mov ESI, src[EBP]; 3607 mov EDI, dest[EBP]; 3608 mov EBX, mask[EBP]; 3609 mov EDX, dest1[EBP]; 3610 mov ECX, length; 3611 movups XMM6, ALPHABLEND_SSE2_MASK; 3612 pxor XMM7, XMM7; 3613 cmp ECX, 4; 3614 jl twopixel; 3615 fourpixelloop: 3616 movups XMM0, [ESI]; 3617 movups XMM1, [EDI]; 3618 movups XMM2, [EBX]; 3619 pand XMM1, XMM2; 3620 por XMM1, XMM0; 3621 movups [EDX], XMM1; 3622 add ESI,16; 3623 add EDI,16; 3624 add EBX,16; 3625 add EDX,16; 3626 sub ECX, 4; 3627 cmp ECX, 4; 3628 jge fourpixelloop; 3629 twopixel: 3630 cmp ECX, 2; 3631 jl onepixel; 3632 movq XMM0, [ESI]; 3633 movq XMM1, [EDI]; 3634 movq XMM2, [EBX]; 3635 pand XMM1, XMM2; 3636 por XMM1, XMM0; 3637 movq [EDX], XMM1; 3638 add ESI, 8; 3639 add EDI, 8; 3640 add EBX, 8; 3641 add EDX, 8; 3642 sub ECX, 2; 3643 onepixel: 3644 jecxz end; 3645 movd XMM0, [ESI]; 3646 movd XMM1, [EDI]; 3647 movd XMM2, [EBX]; 3648 pand XMM1, XMM2; 3649 por XMM1, XMM0; 3650 movd [EDX], XMM1; 3651 end: 3652 ; 3653 } 3654 } 3655 }else version(X86_64){ 3656 asm @nogc{ 3657 mov RSI, src[RBP]; 3658 mov RDI, dest[RBP]; 3659 mov RBX, mask[RBP]; 3660 mov RDX, dest1[RBP]; 3661 mov RCX, length; 3662 movups XMM6, ALPHABLEND_SSE2_MASK; 3663 pxor XMM7, XMM7; 3664 cmp ECX, 4; 3665 jl twopixel; 3666 fourpixelloop: 3667 movups XMM0, [RSI]; 3668 movups XMM1, [RDI]; 3669 movups XMM2, [RBX]; 3670 pand XMM1, XMM2; 3671 por XMM1, XMM0; 3672 movups [RDX], XMM1; 3673 add RSI,16; 3674 add RDI,16; 3675 add RBX,16; 3676 add RDX,16; 3677 sub RCX, 4; 3678 cmp RCX, 4; 3679 jge fourpixelloop; 3680 twopixel: 3681 cmp RCX, 2; 3682 jl onepixel; 3683 movq XMM0, [RSI]; 3684 movq XMM1, [RDI]; 3685 movq XMM2, [RBX]; 3686 pand XMM1, XMM2; 3687 por XMM1, XMM0; 3688 movq [RDX], XMM1; 3689 add RSI, 8; 3690 add RDI, 8; 3691 add RBX, 8; 3692 add RDX, 8; 3693 sub RCX, 2; 3694 onepixel: 3695 cmp RCX, 1; 3696 jl end; 3697 movd XMM0, [RSI]; 3698 movd XMM1, [RDI]; 3699 movd XMM2, [RBX]; 3700 pand XMM1, XMM2; 3701 por XMM1, XMM0; 3702 movd [RDX], XMM1; 3703 end: 3704 ; 3705 } 3706 }else{ 3707 while(length){ 3708 dest1.base = (dest.base & mask.base) | src.base; 3709 mask++; 3710 src++; 3711 dest++; 3712 dest1++; 3713 length--; 3714 } 3715 } 3716 } 3717 /** 3718 * Implements a four plus one operand alpha-blending algorithm for 32bit bitmaps. For masking, use Pixel32Bit.AlphaMask from CPUblit.colorspaces. 3719 * Output is copied into a memory location specified by dest1. 3720 */ 3721 public @nogc void alphaBlend32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){ 3722 version(X86){ 3723 version(MMX){ 3724 int target8 = length/8, target4 = length%2; 3725 asm @nogc { 3726 //setting up the pointer registers and the counter register 3727 //mov EBX, alpha[EBP]; 3728 mov ESI, src[EBP]; 3729 mov EDI, dest[EBP]; 3730 mov EBX, mask[EBP]; 3731 mov EDX, dest1[EBP]; 3732 mov ECX, target8; 3733 cmp ECX, 0; 3734 jz fourpixelblend; //skip 16 byte operations if not needed 3735 //iteration cycle entry point 3736 sixteenpixelblend: 3737 //create alpha mask on the fly 3738 movq MM3, [ESI]; 3739 /*movq MM1, MM3; 3740 pand MM1, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3741 movq MM0, MM1; 3742 pslld MM0, 8; 3743 por MM1, MM0; //mask is ready for RA 3744 pslld MM1, 16; 3745 por MM0, MM1; //mask is ready for BGRA*/ 3746 movq MM0, [EBX]; 3747 movq MM1, MM0; 3748 punpcklbw MM0, MM2; 3749 punpckhbw MM1, MM2; 3750 movq MM6, ALPHABLEND_MMX_CONST256; 3751 movq MM7, MM6; 3752 movq MM4, ALPHABLEND_MMX_CONST1; 3753 movq MM5, MM4; 3754 3755 paddusw MM4, MM0; //1 + alpha01 3756 paddusw MM5, MM1; //1 + alpha23 3757 psubusw MM6, MM0; //256 - alpha01 3758 psubusw MM7, MM1; //256 - alpha23 3759 3760 //moving the values to their destinations 3761 movq MM0, MM3; //src01 3762 movq MM1, MM0; //src23 3763 punpcklbw MM0, MM2; 3764 punpckhbw MM1, MM2; 3765 pmullw MM4, MM0; //src01 * (1 + alpha01) 3766 pmullw MM5, MM1; //src23 * (1 + alpha23) 3767 movq MM0, [EDI]; //dest01 3768 movq MM1, MM0; //dest23 3769 punpcklbw MM0, MM2; 3770 punpckhbw MM1, MM2; 3771 pmullw MM6, MM0; //dest01 * (256 - alpha) 3772 pmullw MM7, MM1; //dest23 * (256 - alpha) 3773 3774 paddusw MM4, MM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 3775 paddusw MM5, MM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 3776 psrlw MM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 3777 psrlw MM5, 8; 3778 //moving the result to its place; 3779 //pxor MM2, MM2; 3780 packuswb MM4, MM5; 3781 3782 movq [EDX], MM4; 3783 //add EBX, 16; 3784 add ESI, 8; 3785 add EDI, 8; 3786 add EBX, 8; 3787 add EDX, 8; 3788 dec ECX; 3789 cmp ECX, 0; 3790 jnz sixteenpixelblend; 3791 fourpixelblend: 3792 mov ECX, target4; 3793 cmp ECX, 0; 3794 jz endofalgorithm; 3795 fourpixelblendloop: 3796 3797 //movd XMM6, [EBX];//alpha 3798 3799 3800 movd MM0, [EDI]; 3801 movd MM1, [ESI]; 3802 punpcklbw MM0, MM2;//dest 3803 punpcklbw MM1, MM2;//src 3804 movups MM6, MM1; 3805 pand MM6, ALPHABLEND_MMX_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3806 movups MM7, MM6; 3807 pslld MM6, 8; 3808 por MM7, MM6; //mask is ready for RA 3809 pslld MM7, 16; 3810 por MM6, MM7; //mask is ready for GRA 3811 punpcklbw MM7, MM2; 3812 movaps MM4, ALPHABLEND_MMX_CONST256; 3813 movaps MM5, ALPHABLEND_MMX_CONST1; 3814 3815 paddusw MM5, MM6;//1+alpha 3816 psubusw MM4, MM6;//256-alpha 3817 3818 pmullw MM0, MM4;//dest*(256-alpha) 3819 pmullw MM1, MM5;//src*(1+alpha) 3820 paddusw MM0, MM1;//(src*(1+alpha))+(dest*(256-alpha)) 3821 psrlw MM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 3822 3823 packuswb MM0, MM2; 3824 3825 movd [EDX], MM0; 3826 3827 endofalgorithm: 3828 emms; 3829 } 3830 }else{ 3831 int target16 = length/4, target4 = length%4; 3832 asm @nogc { 3833 //setting up the pointer registers and the counter register 3834 //mov EBX, alpha[EBP]; 3835 mov ESI, src[EBP]; 3836 mov EDI, dest[EBP]; 3837 mov EBX, mask[EBP]; 3838 mov EDX, dest1[EBP]; 3839 mov ECX, target16; 3840 cmp ECX, 0; 3841 jz fourpixelblend; //skip 16 byte operations if not needed 3842 //iteration cycle entry point 3843 sixteenpixelblend: 3844 //create alpha mask on the fly 3845 movups XMM3, [ESI]; 3846 movups XMM1, [EBX]; 3847 //pand XMM1, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3848 //movups XMM0, XMM1; 3849 //pslld XMM0, 8; 3850 //por XMM1, XMM0; //mask is ready for RA 3851 //pslld XMM1, 16; 3852 //por XMM0, XMM1; //mask is ready for BGRA/**/ 3853 movups XMM0, XMM1; 3854 3855 punpcklbw XMM0, XMM2; 3856 punpckhbw XMM1, XMM2; 3857 movups XMM6, ALPHABLEND_SSE2_CONST256; 3858 movups XMM7, XMM6; 3859 movups XMM4, ALPHABLEND_SSE2_CONST1; 3860 movups XMM5, XMM4; 3861 3862 paddusw XMM4, XMM0; //1 + alpha01 3863 paddusw XMM5, XMM1; //1 + alpha23 3864 psubusw XMM6, XMM0; //256 - alpha01 3865 psubusw XMM7, XMM1; //256 - alpha23 3866 3867 //moving the values to their destinations 3868 3869 movups XMM0, XMM3; //src01 3870 movups XMM1, XMM0; //src23 3871 punpcklbw XMM0, XMM2; 3872 punpckhbw XMM1, XMM2; 3873 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 3874 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 3875 movups XMM0, [EDI]; //dest01 3876 movups XMM1, XMM0; //dest23 3877 punpcklbw XMM0, XMM2; 3878 punpckhbw XMM1, XMM2; 3879 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 3880 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 3881 3882 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 3883 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 3884 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 3885 psrlw XMM5, 8; 3886 //moving the result to its place; 3887 //pxor MM2, MM2; 3888 packuswb XMM4, XMM5; 3889 3890 movups [EDX], XMM4; 3891 //add EBX, 16; 3892 add ESI, 16; 3893 add EDI, 16; 3894 add EBX, 16; 3895 add EDX, 16; 3896 dec ECX; 3897 cmp ECX, 0; 3898 jnz sixteenpixelblend; 3899 3900 fourpixelblend: 3901 3902 mov ECX, target4; 3903 cmp ECX, 0; 3904 jz endofalgorithm; 3905 3906 fourpixelblendloop: 3907 3908 //movd XMM6, [EBX];//alpha 3909 3910 3911 movd XMM0, [EDI]; 3912 movd XMM1, [ESI]; 3913 punpcklbw XMM0, XMM2;//dest 3914 punpcklbw XMM1, XMM2;//src 3915 movd XMM6, [EBX]; 3916 /*pand XMM6, ALPHABLEND_SSE2_MASK; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3917 movups XMM7, XMM6; 3918 pslld XMM6, 8; 3919 por XMM7, XMM6; //mask is ready for RA 3920 pslld XMM7, 16; 3921 por XMM6, XMM7; //mask is ready for BGRA*/ 3922 3923 punpcklbw XMM6, XMM2; 3924 3925 movaps XMM4, ALPHABLEND_SSE2_CONST256; 3926 movaps XMM5, ALPHABLEND_SSE2_CONST1; 3927 3928 paddusw XMM5, XMM6;//1+alpha 3929 psubusw XMM4, XMM6;//256-alpha 3930 3931 pmullw XMM0, XMM4;//dest*(256-alpha) 3932 pmullw XMM1, XMM5;//src*(1+alpha) 3933 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 3934 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 3935 3936 packuswb XMM0, XMM2; 3937 3938 movd [EDX], XMM0; 3939 3940 add ESI, 4; 3941 add EDI, 4;/**/ 3942 add EBX, 4; 3943 add EDX, 4; 3944 dec ECX; 3945 cmp ECX, 0; 3946 jnz fourpixelblendloop; 3947 3948 endofalgorithm: 3949 ; 3950 } 3951 } 3952 }else version(X86_64){ 3953 size_t target16 = length/4, target4 = length%4; 3954 asm @nogc { 3955 //setting up the pointer registers and the counter register 3956 //mov EBX, alpha[EBP]; 3957 mov RSI, src[RBP]; 3958 mov RDI, dest[RBP]; 3959 mov RBX, mask[RBP]; 3960 mov RDX, dest1[RBP]; 3961 mov RCX, target16; 3962 cmp RCX, 0; 3963 movups XMM8, ALPHABLEND_SSE2_CONST256; 3964 movups XMM9, ALPHABLEND_SSE2_CONST1; 3965 movups XMM10, ALPHABLEND_SSE2_MASK; 3966 jz fourpixelblend; //skip 16 byte operations if not needed 3967 //iteration cycle entry point 3968 sixteenpixelblend: 3969 //create alpha mask on the fly 3970 movups XMM3, [RSI]; 3971 /*movups XMM1, XMM3; 3972 pand XMM1, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 3973 movups XMM0, XMM1; 3974 pslld XMM0, 8; 3975 por XMM1, XMM0; //mask is ready for RA 3976 pslld XMM1, 16; 3977 por XMM0, XMM1; //mask is ready for BGRA/**/ 3978 movups XMM0, [RBX]; 3979 movups XMM1, XMM0; 3980 3981 punpcklbw XMM0, XMM2; 3982 punpckhbw XMM1, XMM2; 3983 movups XMM6, XMM8; 3984 movups XMM7, XMM8; 3985 movups XMM4, XMM9; 3986 movups XMM5, XMM9; 3987 3988 paddusw XMM4, XMM0; //1 + alpha01 3989 paddusw XMM5, XMM1; //1 + alpha23 3990 psubusw XMM6, XMM0; //256 - alpha01 3991 psubusw XMM7, XMM1; //256 - alpha23 3992 3993 //moving the values to their destinations 3994 3995 movups XMM0, XMM3; //src01 3996 movups XMM1, XMM0; //src23 3997 punpcklbw XMM0, XMM2; 3998 punpckhbw XMM1, XMM2; 3999 pmullw XMM4, XMM0; //src01 * (1 + alpha01) 4000 pmullw XMM5, XMM1; //src23 * (1 + alpha23) 4001 movups XMM0, [EDI]; //dest01 4002 movups XMM1, XMM0; //dest23 4003 punpcklbw XMM0, XMM2; 4004 punpckhbw XMM1, XMM2; 4005 pmullw XMM6, XMM0; //dest01 * (256 - alpha) 4006 pmullw XMM7, XMM1; //dest23 * (256 - alpha) 4007 4008 paddusw XMM4, XMM6; //(src01 * (1 + alpha01)) + (dest01 * (256 - alpha01)) 4009 paddusw XMM5, XMM7; //(src * (1 + alpha)) + (dest * (256 - alpha)) 4010 psrlw XMM4, 8; //(src * (1 + alpha)) + (dest * (256 - alpha)) / 256 4011 psrlw XMM5, 8; 4012 //moving the result to its place; 4013 //pxor MM2, MM2; 4014 packuswb XMM4, XMM5; 4015 4016 movups [RDX], XMM4; 4017 //add EBX, 16; 4018 add RSI, 16; 4019 add RDI, 16; 4020 add RBX, 16; 4021 add RDX, 16; 4022 dec RCX; 4023 cmp RCX, 0; 4024 jnz sixteenpixelblend; 4025 4026 fourpixelblend: 4027 4028 mov RCX, target4; 4029 cmp RCX, 0; 4030 jz endofalgorithm; 4031 4032 fourpixelblendloop: 4033 4034 //movd XMM6, [EBX];//alpha 4035 4036 4037 movd XMM0, [RDI]; 4038 movd XMM1, [RSI]; 4039 punpcklbw XMM0, XMM2;//dest 4040 punpcklbw XMM1, XMM2;//src 4041 movups XMM6, [RBX]; 4042 /*pand XMM6, XMM10; //pixel & 0x000000FF,0x000000FF,0x000000FF,0x000000FF 4043 movups XMM7, XMM6; 4044 pslld XMM6, 8; 4045 por XMM7, XMM6; //mask is ready for RA 4046 pslld XMM7, 16; 4047 por XMM6, XMM7; //mask is ready for BGRA*/ 4048 4049 punpcklbw XMM6, XMM2; 4050 4051 movaps XMM4, XMM8; 4052 movaps XMM5, XMM9; 4053 4054 paddusw XMM5, XMM6;//1+alpha 4055 psubusw XMM4, XMM6;//256-alpha 4056 4057 pmullw XMM0, XMM4;//dest*(256-alpha) 4058 pmullw XMM1, XMM5;//src*(1+alpha) 4059 paddusw XMM0, XMM1;//(src*(1+alpha))+(dest*(256-alpha)) 4060 psrlw XMM0, 8;//(src*(1+alpha))+(dest*(256-alpha))/256 4061 4062 packuswb XMM0, XMM2; 4063 4064 movd [RDI], XMM0; 4065 4066 add RSI, 4; 4067 add RDI, 4;/**/ 4068 add RBX, 4; 4069 add RDX, 4; 4070 dec RCX; 4071 cmp RCX, 0; 4072 jnz fourpixelblendloop; 4073 4074 endofalgorithm: 4075 ; 4076 } 4077 }else{ 4078 for(int i ; i < length ; i++){ 4079 switch(mask.AlphaMask.value){ 4080 case 0: 4081 break; 4082 case 255: 4083 dest = src; 4084 break; 4085 default: 4086 int src1 = 1 + mask.AlphaMask.value; 4087 int src256 = 256 - mask.AlphaMask.value; 4088 dest1.ColorSpaceARGB.red = cast(ubyte)((src.ColorSpaceARGB.red * src1 + dest.ColorSpaceARGB.red * src256)>>8); 4089 dest1.ColorSpaceARGB.green = cast(ubyte)((src.ColorSpaceARGB.green * src1 + dest.ColorSpaceARGB.green * src256)>>8); 4090 dest1.ColorSpaceARGB.blue = cast(ubyte)((src.ColorSpaceARGB.blue * src1 + dest.ColorSpaceARGB.blue * src256)>>8); 4091 break; 4092 } 4093 src++; 4094 dest++; 4095 dest1++; 4096 mask++; 4097 } 4098 } 4099 } 4100 /** 4101 * Copies a 32bit image onto another without blitter. No transparency is used. Dest and mask is placeholder. 4102 */ 4103 public @nogc void copy32bit(uint* src, uint* dest, uint* dest1, size_t length, uint* mask){ 4104 copy32bit(src,dest1,length); 4105 } 4106 /** 4107 * 3 + 1 operand XOR blitter. 4108 */ 4109 public @nogc void xorBlitter(T)(T* src, T* dest, T* dest1, size_t length){ 4110 static if(T == "ubyte"){ 4111 version(X86){ 4112 version(MMX){ 4113 asm @nogc{ 4114 mov ESI, src[EBP]; 4115 mov EDI, dest[EBP]; 4116 mov EDX, dest1[EBP]; 4117 mov ECX, length; 4118 cmp ECX, 8; 4119 jl fourpixel; 4120 eightpixelloop: 4121 movq XMM0, [ESI]; 4122 movq XMM1, [EDI]; 4123 pxor XMM0, XMM1; 4124 movq [EDX], XMM0; 4125 add ESI, 8; 4126 add EDI, 8; 4127 add EDX, 8; 4128 sub ECX, 8; 4129 cmp ECX, 8; 4130 jge eightpixelloop; 4131 fourpixel: 4132 cmp ECX, 4; 4133 jl singlepixelloop; 4134 movd XMM0, [ESI]; 4135 movd XMM1, [EDI]; 4136 pxor XMM0, XMM1; 4137 movd [EDX], XMM0; 4138 add ESI, 4; 4139 add EDI, 4; 4140 add EDX, 4; 4141 sub ECX, 4; 4142 cmp ECX, 0; 4143 jle end; 4144 singlepixelloop: 4145 mov AL, [ESI]; 4146 xor AL, [EDI]; 4147 mov [EDX], AL; 4148 inc ESI; 4149 inc EDI; 4150 loop singlepixelloop; 4151 end: 4152 emms; 4153 } 4154 }else{ 4155 asm @nogc{ 4156 mov ESI, src[EBP]; 4157 mov EDI, dest[EBP]; 4158 mov EDX, dest1[EBP]; 4159 mov ECX, length; 4160 cmp ECX, 16; 4161 jl eightpixel; 4162 sixteenpixelloop: 4163 movups XMM0, [ESI]; 4164 movups XMM1, [EDI]; 4165 pxor XMM0, XMM1; 4166 movups [EDX], XMM0; 4167 add ESI, 16; 4168 add EDI, 16; 4169 add EDX, 16; 4170 sub ECX, 16; 4171 cmp ECX, 16; 4172 jge sixteenpixelloop; 4173 eightpixel: 4174 cmp ECX, 8; 4175 jl fourpixel; 4176 movq XMM0, [ESI]; 4177 movq XMM1, [EDI]; 4178 pxor XMM0, XMM1; 4179 movq [EDX], XMM0; 4180 add ESI, 8; 4181 add EDI, 8; 4182 add EDX, 8; 4183 sub ECX, 8; 4184 fourpixel: 4185 cmp ECX, 4; 4186 jl singlepixelloop; 4187 movd XMM0, [ESI]; 4188 movd XMM1, [EDI]; 4189 pxor XMM0, XMM1; 4190 movd [EDX], XMM0; 4191 add ESI, 4; 4192 add EDI, 4; 4193 add EDX, 4; 4194 sub ECX, 4; 4195 cmp ECX, 0; 4196 jle end; 4197 singlepixelloop: 4198 mov AL, [ESI]; 4199 xor AL, [EDI]; 4200 mov [EDX], AL; 4201 inc ESI; 4202 inc EDI; 4203 loop singlepixelloop; 4204 end: 4205 ; 4206 } 4207 } 4208 }else version(X86_64){ 4209 asm @nogc{ 4210 mov RSI, src[RBP]; 4211 mov RDI, dest[RBP]; 4212 mov RDX, dest1[RBP]; 4213 mov RCX, length; 4214 cmp RCX, 16; 4215 jl eightpixel; 4216 sixteenpixelloop: 4217 movups XMM0, [RSI]; 4218 movups XMM1, [RDI]; 4219 pxor XMM0, XMM1; 4220 movups [RDX], XMM0; 4221 add RSI, 16; 4222 add RDI, 16; 4223 add RDX, 16; 4224 sub RCX, 16; 4225 cmp RCX, 16; 4226 jge sixteenpixelloop; 4227 eightpixel: 4228 cmp RCX, 8; 4229 jl fourpixel; 4230 movq XMM0, [RSI]; 4231 movq XMM1, [RDI]; 4232 pxor XMM0, XMM1; 4233 movq [RDX], XMM0; 4234 add RSI, 8; 4235 add RDI, 8; 4236 add RDX, 8; 4237 sub RCX, 8; 4238 fourpixel: 4239 cmp RCX, 4; 4240 jl singlepixelloop; 4241 movd XMM0, [RSI]; 4242 movd XMM1, [RDI]; 4243 pxor XMM0, XMM1; 4244 movd [RDX], XMM0; 4245 add RSI, 4; 4246 add RDI, 4; 4247 add RDX, 4; 4248 sub RCX, 4; 4249 cmp RCX, 0; 4250 jle end; 4251 singlepixelloop: 4252 mov AL, [RSI]; 4253 xor AL, [RDI]; 4254 mov [RDX], AL; 4255 inc RSI; 4256 inc RDI; 4257 loop singlepixelloop; 4258 end: 4259 ; 4260 } 4261 }else{ 4262 while(lenght){ 4263 *dest1 = *src ^ *dest; 4264 src++; 4265 dest++; 4266 dest1++; 4267 length--; 4268 } 4269 } 4270 }else static if(T == "ushort"){ 4271 version(X86){ 4272 version(MMX){ 4273 asm @nogc{ 4274 mov ESI, src[EBP]; 4275 mov EDI, dest[EBP]; 4276 mov EDX, dest1[EBP]; 4277 mov ECX, length; 4278 cmp ECX, 4; 4279 jl twopixel; 4280 fourpixelloop: 4281 movq XMM0, [ESI]; 4282 movq XMM1, [EDI]; 4283 pxor XMM0, XMM1; 4284 movq [EDX], XMM0; 4285 add ESI, 8; 4286 add EDI, 8; 4287 add EDX, 8; 4288 sub ECX, 4; 4289 cmp ECX, 4; 4290 jge fourpixelloop; 4291 twopixel: 4292 cmp ECX, 4; 4293 jl onepixel; 4294 movd XMM0, [ESI]; 4295 movd XMM1, [EDI]; 4296 pxor XMM0, XMM1; 4297 movd [EDX], XMM0; 4298 add ESI, 4; 4299 add EDI, 4; 4300 add EDX, 4; 4301 sub ECX, 2; 4302 cmp ECX, 0; 4303 jle end; 4304 onepixel: 4305 mov AX, [ESI]; 4306 xor AX, [EDI]; 4307 mov [EDX], AX; 4308 end: 4309 emms; 4310 } 4311 }else{ 4312 asm @nogc{ 4313 mov ESI, src[EBP]; 4314 mov EDI, dest[EBP]; 4315 mov EDX, dest1[EBP]; 4316 mov ECX, length; 4317 cmp ECX, 8; 4318 jl fourpixel; 4319 eightpixelloop: 4320 movups XMM0, [ESI]; 4321 movups XMM1, [EDI]; 4322 pxor XMM0, XMM1; 4323 movups [EDX], XMM0; 4324 add ESI,16; 4325 add EDI,16; 4326 add EDX,16; 4327 sub ECX, 8; 4328 cmp ECX, 8; 4329 jge eightpixelloop; 4330 fourpixel: 4331 cmp ECX, 4; 4332 jl twopixel; 4333 movq XMM0, [ESI]; 4334 movq XMM1, [EDI]; 4335 pxor XMM0, XMM1; 4336 movq [EDX], XMM0; 4337 add ESI, 8; 4338 add EDI, 8; 4339 add EDX, 8; 4340 sub ECX, 4; 4341 twopixel: 4342 cmp ECX, 2; 4343 jl onepixel; 4344 movd XMM0, [ESI]; 4345 movd XMM1, [EDI]; 4346 pxor XMM0, XMM1; 4347 movd [EDX], XMM0; 4348 add ESI, 4; 4349 add EDI, 4; 4350 add EDX, 4; 4351 sub ECX, 2; 4352 cmp ECX, 0; 4353 jle end; 4354 onepixel: 4355 mov AX, [ESI]; 4356 xor AX, [EDI]; 4357 mov [EDX], AX; 4358 end: 4359 ; 4360 } 4361 } 4362 }else version(X86_64){ 4363 asm @nogc{ 4364 mov RSI, src[RBP]; 4365 mov RDI, dest[RBP]; 4366 mov RDX, dest1[RBP]; 4367 mov RCX, length; 4368 cmp RCX, 8; 4369 jl fourpixel; 4370 eightpixelloop: 4371 movups XMM0, [RSI]; 4372 movups XMM1, [RDI]; 4373 pxor XMM0, XMM1; 4374 movups [RDX], XMM0; 4375 add RSI,16; 4376 add RDI,16; 4377 add RDX,16; 4378 sub RCX, 8; 4379 cmp RCX, 8; 4380 jge eightpixelloop; 4381 fourpixel: 4382 cmp RCX, 4; 4383 jl twopixel; 4384 movq XMM0, [RSI]; 4385 movq XMM1, [RDI]; 4386 pxor XMM0, XMM1; 4387 movq [RDX], XMM0; 4388 add RSI, 8; 4389 add RDI, 8; 4390 add RDX, 8; 4391 sub RCX, 4; 4392 twopixel: 4393 cmp RCX, 2; 4394 jl singlepixelloop; 4395 movd XMM0, [RSI]; 4396 movd XMM1, [RDI]; 4397 pxor XMM0, XMM1; 4398 movd [RDX], XMM0; 4399 add RSI, 4; 4400 add RDI, 4; 4401 add RDX, 4; 4402 sub RCX, 2; 4403 cmp RCX, 0; 4404 jle end; 4405 onepixel: 4406 mov AX, [RSI]; 4407 xor AX, [RDI]; 4408 mov [RDX], AX; 4409 end: 4410 ; 4411 } 4412 }else{ 4413 while(lenght){ 4414 *dest1 = *src ^ *dest; 4415 src++; 4416 dest++; 4417 dest1++; 4418 length--; 4419 } 4420 } 4421 } 4422 static if(T == "uint"){ 4423 version(X86){ 4424 version(MMX){ 4425 asm @nogc{ 4426 mov ESI, src[EBP]; 4427 mov EDI, dest[EBP]; 4428 mov EDX, dest1[EBP]; 4429 mov ECX, length; 4430 cmp ECX, 2; 4431 jl onepixel; 4432 twopixelloop: 4433 movq XMM0, [ESI]; 4434 movq XMM1, [EDI]; 4435 pxor XMM0, XMM1; 4436 movq [EDX], XMM0; 4437 add ESI, 8; 4438 add EDI, 8; 4439 add EDX, 8; 4440 sub ECX, 2; 4441 cmp ECX, 2; 4442 jge twopixelloop; 4443 onepixel: 4444 cmp ECX, 1; 4445 jl end; 4446 movd XMM0, [ESI]; 4447 movd XMM1, [EDI]; 4448 pxor XMM0, XMM1; 4449 movd [EDX], XMM0; 4450 end: 4451 emms; 4452 } 4453 }else{ 4454 asm @nogc{ 4455 mov ESI, src[EBP]; 4456 mov EDI, dest[EBP]; 4457 mov EDX, dest1[EBP]; 4458 mov ECX, length; 4459 cmp ECX, 4; 4460 jl twopixel; 4461 fourpixelloop: 4462 movups XMM0, [ESI]; 4463 movups XMM1, [EDI]; 4464 pxor XMM0, XMM1; 4465 movups [EDX], XMM0; 4466 add ESI,16; 4467 add EDI,16; 4468 add EDX,16; 4469 sub ECX, 4; 4470 cmp ECX, 4; 4471 jge fourpixelloop; 4472 twopixel: 4473 cmp ECX, 2; 4474 jl onepixel; 4475 movq XMM0, [ESI]; 4476 movq XMM1, [EDI]; 4477 pxor XMM0, XMM1; 4478 movq [EDX], XMM0; 4479 add ESI, 8; 4480 add EDI, 8; 4481 add EDX, 8; 4482 sub ECX, 2; 4483 onepixel: 4484 cmp ECX, 1; 4485 jl end; 4486 movd XMM0, [ESI]; 4487 movd XMM1, [EDI]; 4488 pxor XMM0, XMM1; 4489 movd [EDX], XMM0; 4490 4491 end: 4492 ; 4493 } 4494 } 4495 }else version(X86_64){ 4496 asm @nogc{ 4497 mov RSI, src[RBP]; 4498 mov RDI, dest[RBP]; 4499 mov RDX, dest1[RBP]; 4500 mov RCX, length; 4501 cmp RCX, 4; 4502 jl twopixel; 4503 fourpixelloop: 4504 movups XMM0, [RSI]; 4505 movups XMM1, [RDI]; 4506 pxor XMM0, XMM1; 4507 movups [RDX], XMM0; 4508 add RSI,16; 4509 add RDI,16; 4510 add RDX,16; 4511 sub RCX, 4; 4512 cmp RCX, 4; 4513 jge fourpixelloop; 4514 twopixel: 4515 cmp RCX, 2; 4516 jl onepixel; 4517 movq XMM0, [RSI]; 4518 movq XMM1, [RDI]; 4519 pxor XMM0, XMM1; 4520 movq [RDX], XMM0; 4521 add RSI, 2; 4522 add RDI, 2; 4523 add RDX, 2; 4524 sub RCX, 2; 4525 onepixel: 4526 cmp RCX, 1; 4527 jl end; 4528 movd XMM0, [RSI]; 4529 movd XMM1, [RDI]; 4530 pxor XMM0, XMM1; 4531 movd [RDX], XMM0; 4532 end: 4533 ; 4534 } 4535 }else{ 4536 while(lenght){ 4537 *dest1 = *src ^ *dest; 4538 src++; 4539 dest++; 4540 dest1++; 4541 length--; 4542 } 4543 } 4544 } 4545 } 4546 /** 4547 * 2 + 1 operand XOR blitter. 4548 */ 4549 public @nogc void xorBlitter(T)(T* src, T* dest, size_t length){ 4550 static if(T == "ubyte"){ 4551 version(X86){ 4552 version(MMX){ 4553 asm @nogc{ 4554 mov ESI, src[EBP]; 4555 mov EDI, dest[EBP]; 4556 mov ECX, length; 4557 cmp ECX, 8; 4558 jl fourpixel; 4559 eightpixelloop: 4560 movq XMM0, [ESI]; 4561 movq XMM1, [EDI]; 4562 pxor XMM0, XMM1; 4563 movq [EDI], XMM0; 4564 add ESI, 8; 4565 add EDI, 8; 4566 sub ECX, 8; 4567 cmp ECX, 8; 4568 jge eightpixelloop; 4569 fourpixel: 4570 cmp ECX, 4; 4571 jl singlepixelloop; 4572 movd XMM0, [ESI]; 4573 movd XMM1, [EDI]; 4574 pxor XMM0, XMM1; 4575 movd [EDI], XMM0; 4576 add ESI, 4; 4577 add EDI, 4; 4578 sub ECX, 4; 4579 cmp ECX, 0; 4580 jle end; 4581 singlepixelloop: 4582 mov AL, [ESI]; 4583 xor AL, [EDI]; 4584 mov [EDI], AL; 4585 inc ESI; 4586 inc EDI; 4587 loop singlepixelloop; 4588 end: 4589 emms; 4590 } 4591 }else{ 4592 asm @nogc{ 4593 mov ESI, src[EBP]; 4594 mov EDI, dest[EBP]; 4595 mov ECX, length; 4596 cmp ECX, 16; 4597 jl eightpixel; 4598 sixteenpixelloop: 4599 movups XMM0, [ESI]; 4600 movups XMM1, [EDI]; 4601 pxor XMM0, XMM1; 4602 movups [EDI], XMM0; 4603 add ESI, 16; 4604 add EDI, 16; 4605 sub ECX, 16; 4606 cmp ECX, 16; 4607 jge sixteenpixelloop; 4608 eightpixel: 4609 cmp ECX, 8; 4610 jl fourpixel; 4611 movq XMM0, [ESI]; 4612 movq XMM1, [EDI]; 4613 pxor XMM0, XMM1; 4614 movq [EDI], XMM0; 4615 add ESI, 8; 4616 add EDI, 8; 4617 sub ECX, 8; 4618 fourpixel: 4619 cmp ECX, 4; 4620 jl singlepixelloop; 4621 movd XMM0, [ESI]; 4622 movd XMM1, [EDI]; 4623 pxor XMM0, XMM1; 4624 movd [EDI], XMM0; 4625 add ESI, 4; 4626 add EDI, 4; 4627 sub ECX, 4; 4628 cmp ECX, 0; 4629 jle end; 4630 singlepixelloop: 4631 mov AL, [ESI]; 4632 xor AL, [EDI]; 4633 mov [EDI], AL; 4634 inc ESI; 4635 inc EDI; 4636 loop singlepixelloop; 4637 end: 4638 ; 4639 } 4640 } 4641 }else version(X86_64){ 4642 asm @nogc{ 4643 mov RSI, src[RBP]; 4644 mov RDI, dest[RBP]; 4645 mov RCX, length; 4646 cmp RCX, 16; 4647 jl eightpixel; 4648 sixteenpixelloop: 4649 movups XMM0, [RSI]; 4650 movups XMM1, [RDI]; 4651 pxor XMM0, XMM1; 4652 movups [RDI], XMM0; 4653 add RSI, 16; 4654 add RDI, 16; 4655 sub RCX, 16; 4656 cmp RCX, 16; 4657 jge sixteenpixelloop; 4658 eightpixel: 4659 cmp RCX, 8; 4660 jl fourpixel; 4661 movq XMM0, [RSI]; 4662 movq XMM1, [RDI]; 4663 pxor XMM0, XMM1; 4664 movq [RDI], XMM0; 4665 add RSI, 8; 4666 add RDI, 8; 4667 sub RCX, 8; 4668 fourpixel: 4669 cmp RCX, 4; 4670 jl singlepixelloop; 4671 movd XMM0, [RSI]; 4672 movd XMM1, [RDI]; 4673 pxor XMM0, XMM1; 4674 movd [RDI], XMM0; 4675 add RSI, 4; 4676 add RDI, 4; 4677 sub RCX, 4; 4678 cmp RCX, 0; 4679 jle end; 4680 singlepixelloop: 4681 mov AL, [RSI]; 4682 xor AL, [RDI]; 4683 mov [RDI], AL; 4684 inc RSI; 4685 inc RDI; 4686 loop singlepixelloop; 4687 end: 4688 ; 4689 } 4690 }else{ 4691 while(lenght){ 4692 *dest1 = *src ^ *dest; 4693 src++; 4694 dest++; 4695 dest1++; 4696 length--; 4697 } 4698 } 4699 }else static if(T == "ushort"){ 4700 version(X86){ 4701 version(MMX){ 4702 asm @nogc{ 4703 mov ESI, src[EBP]; 4704 mov EDI, dest[EBP]; 4705 mov ECX, length; 4706 cmp ECX, 4; 4707 jl twopixel; 4708 fourpixelloop: 4709 movq XMM0, [ESI]; 4710 movq XMM1, [EDI]; 4711 pxor XMM0, XMM1; 4712 movq [EDI], XMM0; 4713 add ESI, 8; 4714 add EDI, 8; 4715 sub ECX, 4; 4716 cmp ECX, 4; 4717 jge fourpixelloop; 4718 twopixel: 4719 cmp ECX, 4; 4720 jl onepixel; 4721 movd XMM0, [ESI]; 4722 movd XMM1, [EDI]; 4723 pxor XMM0, XMM1; 4724 movd [EDX], XMM0; 4725 add ESI, 4; 4726 add EDI, 4; 4727 sub ECX, 2; 4728 cmp ECX, 0; 4729 jle end; 4730 onepixel: 4731 mov AX, [ESI]; 4732 xor AX, [EDI]; 4733 mov [EDI], AX; 4734 end: 4735 emms; 4736 } 4737 }else{ 4738 asm @nogc{ 4739 mov ESI, src[EBP]; 4740 mov EDI, dest[EBP]; 4741 mov EDX, dest1[EBP]; 4742 mov ECX, length; 4743 cmp ECX, 8; 4744 jl fourpixel; 4745 eightpixelloop: 4746 movups XMM0, [ESI]; 4747 movups XMM1, [EDI]; 4748 pxor XMM0, XMM1; 4749 movups [EDX], XMM0; 4750 add ESI, 8; 4751 add EDI, 8; 4752 sub ECX, 8; 4753 cmp ECX, 8; 4754 jge eightpixelloop; 4755 fourpixel: 4756 cmp ECX, 4; 4757 jl twopixel; 4758 movq XMM0, [ESI]; 4759 movq XMM1, [EDI]; 4760 pxor XMM0, XMM1; 4761 movq [EDI], XMM0; 4762 add ESI, 4; 4763 add EDI, 4; 4764 sub ECX, 4; 4765 twopixel: 4766 cmp ECX, 2; 4767 jl onepixel; 4768 movd XMM0, [ESI]; 4769 movd XMM1, [EDI]; 4770 pxor XMM0, XMM1; 4771 movd [EDI], XMM0; 4772 add ESI, 2; 4773 add EDI, 2; 4774 sub ECX, 2; 4775 cmp ECX, 0; 4776 jle end; 4777 onepixel: 4778 mov AX, [ESI]; 4779 xor AX, [EDI]; 4780 mov [EDI], AX; 4781 end: 4782 ; 4783 } 4784 } 4785 }else version(X86_64){ 4786 asm @nogc{ 4787 mov RSI, src[RBP]; 4788 mov RDI, dest[RBP]; 4789 mov RCX, length; 4790 cmp RCX, 8; 4791 jl fourpixel; 4792 eightpixelloop: 4793 movups XMM0, [RSI]; 4794 movups XMM1, [RDI]; 4795 pxor XMM0, XMM1; 4796 movups [RDI], XMM0; 4797 add RSI, 8; 4798 add RDI, 8; 4799 sub RCX, 8; 4800 cmp RCX, 8; 4801 jge eightpixelloop; 4802 fourpixel: 4803 cmp RCX, 4; 4804 jl twopixel; 4805 movq XMM0, [RSI]; 4806 movq XMM1, [RDI]; 4807 pxor XMM0, XMM1; 4808 movq [RDI], XMM0; 4809 add RSI, 4; 4810 add RDI, 4; 4811 sub RCX, 4; 4812 twopixel: 4813 cmp RCX, 2; 4814 jl singlepixelloop; 4815 movd XMM0, [RSI]; 4816 movd XMM1, [RDI]; 4817 pxor XMM0, XMM1; 4818 movd [RDI], XMM0; 4819 add RSI, 2; 4820 add RDI, 2; 4821 sub RCX, 2; 4822 cmp RCX, 0; 4823 jle end; 4824 onepixel: 4825 mov AX, [RSI]; 4826 xor AX, [RDI]; 4827 mov [RDI], AX; 4828 end: 4829 ; 4830 } 4831 }else{ 4832 while(lenght){ 4833 *dest1 = *src ^ *dest; 4834 src++; 4835 dest++; 4836 dest1++; 4837 length--; 4838 } 4839 } 4840 } 4841 static if(T == "uint"){ 4842 version(X86){ 4843 version(MMX){ 4844 asm @nogc{ 4845 mov ESI, src[EBP]; 4846 mov EDI, dest[EBP]; 4847 mov ECX, length; 4848 cmp ECX, 2; 4849 jl onepixel; 4850 twopixelloop: 4851 movq XMM0, [ESI]; 4852 movq XMM1, [EDI]; 4853 pxor XMM0, XMM1; 4854 movq [EDI], XMM0; 4855 add ESI, 8; 4856 add EDI, 8; 4857 sub ECX, 2; 4858 cmp ECX, 2; 4859 jge twopixelloop; 4860 onepixel: 4861 cmp ECX, 1; 4862 jl end; 4863 movd XMM0, [ESI]; 4864 movd XMM1, [EDI]; 4865 pxor XMM0, XMM1; 4866 movd [EDI], XMM0; 4867 end: 4868 emms; 4869 } 4870 }else{ 4871 asm @nogc{ 4872 mov ESI, src[EBP]; 4873 mov EDI, dest[EBP]; 4874 mov ECX, length; 4875 cmp ECX, 4; 4876 jl twopixel; 4877 fourpixelloop: 4878 movups XMM0, [ESI]; 4879 movups XMM1, [EDI]; 4880 pxor XMM0, XMM1; 4881 movups [EDI], XMM0; 4882 add ESI,16; 4883 add EDI,16; 4884 sub ECX, 4; 4885 cmp ECX, 4; 4886 jge fourpixelloop; 4887 twopixel: 4888 cmp ECX, 2; 4889 jl onepixel; 4890 movq XMM0, [ESI]; 4891 movq XMM1, [EDI]; 4892 pxor XMM0, XMM1; 4893 movq [EDI], XMM0; 4894 add ESI, 8; 4895 add EDI, 8; 4896 sub ECX, 2; 4897 onepixel: 4898 cmp ECX, 1; 4899 jl end; 4900 movd XMM0, [ESI]; 4901 movd XMM1, [EDI]; 4902 pxor XMM0, XMM1; 4903 movd [EDI], XMM0; 4904 4905 end: 4906 ; 4907 } 4908 } 4909 }else version(X86_64){ 4910 asm @nogc{ 4911 mov RSI, src[RBP]; 4912 mov RDI, dest[RBP]; 4913 mov RCX, length; 4914 cmp RCX, 4; 4915 jl twopixel; 4916 fourpixelloop: 4917 movups XMM0, [RSI]; 4918 movups XMM1, [RDI]; 4919 pxor XMM0, XMM1; 4920 movups [RDI], XMM0; 4921 add RSI,16; 4922 add RDI,16; 4923 sub RCX, 4; 4924 cmp RCX, 4; 4925 jge fourpixelloop; 4926 twopixel: 4927 cmp RCX, 2; 4928 jl onepixel; 4929 movq XMM0, [RSI]; 4930 movq XMM1, [RDI]; 4931 pxor XMM0, XMM1; 4932 movq [RDI], XMM0; 4933 add RSI, 8; 4934 add RDI, 8; 4935 sub RCX, 2; 4936 onepixel: 4937 cmp RCX, 1; 4938 jl end; 4939 movd XMM0, [RSI]; 4940 movd XMM1, [RDI]; 4941 pxor XMM0, XMM1; 4942 movd [RDI], XMM0; 4943 end: 4944 ; 4945 } 4946 }else{ 4947 while(lenght){ 4948 *dest1 = *src ^ *dest; 4949 src++; 4950 dest++; 4951 dest1++; 4952 length--; 4953 } 4954 } 4955 }else static assert("Template parameter '"~ T.stringof ~"' not supported!"); 4956 }