// B8G8R8 to B8G8R8A8 convsersion-code benchmark // copyright(C) 2000 XELF. All rights reserved. // http://www.cyborg.ne.jp/~xelf/ // You can use this source code for any purpose without permission. // Notes that this source code do not supports the processing of fraction pixels. #include #include #pragma comment(lib,"winmm.lib") #include #include #ifdef _DEBUG #define VERIFY // for verifying pointers #endif void CopyPixelImageBGRtoBGRA( BYTE* d, const BYTE* s, int width, int height ) { #ifdef VERIFY const BYTE* se=s+width*height*3; BYTE *de=d+width*height*4; #endif BYTE a=0xff; for(int y=height;--y>=0;) { for(int x=width;--x>=0;) { *d++=a; *d++=*s++; *d++=*s++; *d++=*s++; } } #ifdef VERIFY if (se!=s) { printf("program internal error.\r\n"); } if (de!=d) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAword( BYTE* d, const BYTE* s, int width, int height ) { #ifdef VERIFY const BYTE* se=s+width*height*3; BYTE *de=d+width*height*4; #endif BYTE a=0xff; for(int y=height;--y>=0;) { for(int x=width;--x>=0;) { *d++=a; *d++=*s++; *(WORD*)d=*(WORD*)s; d+=2; s+=2; } } #ifdef VERIFY if (se!=s) { printf("program internal error.\r\n"); } if (de!=d) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAword2( BYTE* d, const BYTE* s, int width, int height ) { #ifdef VERIFY const BYTE* se=s+width*height*3; BYTE *de=d+width*height*4; #endif BYTE a=0xff; for(int y=height;--y>=0;) { for(int x=width;--x>=0;) { *d=a; *(d+1)=*s; *(WORD*)(d+2)=*(WORD*)(s+1); d+=4; s+=3; } } #ifdef VERIFY if (se!=s) { printf("program internal error.\r\n"); } if (de!=d) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRA3( BYTE* d, const BYTE* s, int width, int height ) { #ifdef VERIFY const BYTE* se=s+width*height*3; BYTE *de=d+width*height*4; #endif BYTE a=0xff; int w=width/4; DWORD p0,p1,p2; for(int y=height;--y>=0;) { for(int x=w;--x>=0;) { p0=*(DWORD*)(s ); p1=*(DWORD*)(s+ 4); p2=*(DWORD*)(s+ 8); *(DWORD*)(d )=a|(p0<< 8); *(DWORD*)(d+ 4)=a|(p0>>16)|(p1<<16); *(DWORD*)(d+ 8)=a|(p1>> 8)|(p2<<24); *(DWORD*)(d+12)=a|(p2 ); s+=12; d+=16; } } #ifdef VERIFY if (se!=s) { printf("program internal error.\r\n"); } if (de!=d) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsm3dwordor( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov ecx,height; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov eax,[esi]; // R1 B0 G0 R0 mov edx,eax; // shl eax,8; // B0 G0 R0 00 or eax,0xff; // B0 G0 R0 AA mov [edi],eax; // shr edx,16; // 00 00 R1 00 mov eax,[esi+4]; // G2 R2 B1 G1 mov ebx,eax; // shl eax,16; // B1 G1 00 00 add edx,eax; // B1 G1 R1 00 or edx,0xff; // B1 G1 R1 AA mov [edi+4],edx; // shr ebx,8; // 00 G2 R2 00 mov eax,[esi+8]; // B3 G3 R3 B2 mov edx,eax; // shl eax,24; // B2 00 00 00 add eax,ebx; // B2 G2 R2 00 or eax,0xff; // B2 G2 R2 AA mov [edi+8],eax; // or edx,0xff; // B3 G3 R3 AA add esi,12; mov [edi+12],edx; add edi,16; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsm3dwordorPrefetch( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov ecx,height; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov eax,[esi]; // R1 B0 G0 R0 prefetcht0 [esi+48]; mov edx,eax; // shl eax,8; // B0 G0 R0 00 or eax,0xff; // B0 G0 R0 AA mov [edi],eax; // shr edx,16; // 00 00 R1 00 mov eax,[esi+4]; // G2 R2 B1 G1 mov ebx,eax; // shl eax,16; // B1 G1 00 00 add edx,eax; // B1 G1 R1 00 or edx,0xff; // B1 G1 R1 AA mov [edi+4],edx; // shr ebx,8; // 00 G2 R2 00 mov eax,[esi+8]; // B3 G3 R3 B2 mov edx,eax; // shl eax,24; // B2 00 00 00 add eax,ebx; // B2 G2 R2 00 or eax,0xff; // B2 G2 R2 AA mov [edi+8],eax; // or edx,0xff; // B3 G3 R3 AA add esi,12; mov [edi+12],edx; add edi,16; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAMMXPrefetch( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; static const __int64 _000000ff000000ff=0x000000ff000000ff; static const __int64 _00000000ffffffff=0x00000000ffffffff; static const __int64 _ffffffff00000000=0xffffffff00000000; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov eax,height; movq mm6,_000000ff000000ff; movq mm5,_00000000ffffffff; movq mm4,_ffffffff00000000; ly: mov ecx,w; lx: movq mm0,[esi]; // G2 R2 B1 G1 R1 B0 G0 R0 movd mm1,[esi+8]; // 00 00 00 00 B3 G3 R3 B2 prefetcht0 [esi+48]; movq mm2,mm0; movq mm7,mm0; psllq mm0,8; // R2 B1 G1 R1 B0 G0 R0 00 movq mm3,mm1; psllq mm2,16; // B1 G1 R1 B0 G0 R0 00 00 pand mm0,mm5; // 00 00 00 00 B0 G0 R0 00 psrlq mm7,40; // 00 00 00 00 00 G2 R2 B1 pand mm2,mm4; // B1 G1 R1 B0 00 00 00 00 psllq mm1,24; // 00 B3 G3 R3 B2 00 00 00 paddb mm1,mm7; // 00 B3 G3 R3 B2 G2 R2 B1 paddb mm0,mm2; // B1 G1 R1 B0 B0 G0 R0 00 psllq mm3,32; // R3 G3 R3 AA 00 00 00 00 pand mm1,mm5; // 00 00 00 00 B2 G2 R2 B1 por mm0,mm6; // B1 G1 R1 AA B0 G0 R0 AA por mm3,mm1; // R3 G3 R3 B2 B2 G2 R2 B1 movq [edi],mm0; por mm3,mm6; // R3 G3 R3 AA B2 G2 R2 AA add esi,12; movq [edi+8],mm3; add edi,16; dec ecx; jnz lx; dec eax; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif emms; } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAMMXNT( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; static const __int64 _000000ff000000ff=0x000000ff000000ff; static const __int64 _00000000ffffffff=0x00000000ffffffff; static const __int64 _ffffffff00000000=0xffffffff00000000; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov eax,height; movq mm6,_000000ff000000ff; movq mm5,_00000000ffffffff; movq mm4,_ffffffff00000000; ly: mov ecx,w; lx: movq mm0,[esi]; // G2 R2 B1 G1 R1 B0 G0 R0 movd mm1,[esi+8]; // 00 00 00 00 B3 G3 R3 B2 movq mm2,mm0; movq mm7,mm0; psllq mm0,8; // R2 B1 G1 R1 B0 G0 R0 00 movq mm3,mm1; psllq mm2,16; // B1 G1 R1 B0 G0 R0 00 00 pand mm0,mm5; // 00 00 00 00 B0 G0 R0 00 psrlq mm7,40; // 00 00 00 00 00 G2 R2 B1 pand mm2,mm4; // B1 G1 R1 B0 00 00 00 00 psllq mm1,24; // 00 B3 G3 R3 B2 00 00 00 paddb mm1,mm7; // 00 B3 G3 R3 B2 G2 R2 B1 paddb mm0,mm2; // B1 G1 R1 B0 B0 G0 R0 00 psllq mm3,32; // R3 G3 R3 AA 00 00 00 00 pand mm1,mm5; // 00 00 00 00 B2 G2 R2 B1 por mm0,mm6; // B1 G1 R1 AA B0 G0 R0 AA por mm3,mm1; // R3 G3 R3 B2 B2 G2 R2 B1 movntq [edi],mm0; por mm3,mm6; // R3 G3 R3 AA B2 G2 R2 AA add esi,12; movntq [edi+8],mm3; add edi,16; dec ecx; jnz lx; dec eax; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif emms; } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAMMXPrefetchNT( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; static const __int64 _000000ff000000ff=0x000000ff000000ff; static const __int64 _00000000ffffffff=0x00000000ffffffff; static const __int64 _ffffffff00000000=0xffffffff00000000; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov eax,height; movq mm6,_000000ff000000ff; movq mm5,_00000000ffffffff; movq mm4,_ffffffff00000000; ly: mov ecx,w; lx: movq mm0,[esi]; // G2 R2 B1 G1 R1 B0 G0 R0 movd mm1,[esi+8]; // 00 00 00 00 B3 G3 R3 B2 prefetcht0 [esi+48]; movq mm2,mm0; movq mm7,mm0; psllq mm0,8; // R2 B1 G1 R1 B0 G0 R0 00 movq mm3,mm1; psllq mm2,16; // B1 G1 R1 B0 G0 R0 00 00 pand mm0,mm5; // 00 00 00 00 B0 G0 R0 00 psrlq mm7,40; // 00 00 00 00 00 G2 R2 B1 pand mm2,mm4; // B1 G1 R1 B0 00 00 00 00 psllq mm1,24; // 00 B3 G3 R3 B2 00 00 00 paddb mm1,mm7; // 00 B3 G3 R3 B2 G2 R2 B1 paddb mm0,mm2; // B1 G1 R1 B0 B0 G0 R0 00 psllq mm3,32; // R3 G3 R3 AA 00 00 00 00 pand mm1,mm5; // 00 00 00 00 B2 G2 R2 B1 por mm0,mm6; // B1 G1 R1 AA B0 G0 R0 AA por mm3,mm1; // R3 G3 R3 B2 B2 G2 R2 B1 movntq [edi],mm0; por mm3,mm6; // R3 G3 R3 AA B2 G2 R2 AA add esi,12; movntq [edi+8],mm3; add edi,16; dec ecx; jnz lx; dec eax; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif emms; } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAMMX( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; static const __int64 _000000ff000000ff=0x000000ff000000ff; static const __int64 _00000000ffffffff=0x00000000ffffffff; static const __int64 _ffffffff00000000=0xffffffff00000000; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov eax,height; movq mm6,_000000ff000000ff; movq mm5,_00000000ffffffff; movq mm4,_ffffffff00000000; ly: mov ecx,w; lx: movq mm0,[esi]; // G2 R2 B1 G1 R1 B0 G0 R0 movd mm1,[esi+8]; // 00 00 00 00 B3 G3 R3 B2 movq mm2,mm0; movq mm7,mm0; psllq mm0,8; // R2 B1 G1 R1 B0 G0 R0 00 movq mm3,mm1; psllq mm2,16; // B1 G1 R1 B0 G0 R0 00 00 pand mm0,mm5; // 00 00 00 00 B0 G0 R0 00 psrlq mm7,40; // 00 00 00 00 00 G2 R2 B1 pand mm2,mm4; // B1 G1 R1 B0 00 00 00 00 psllq mm1,24; // 00 B3 G3 R3 B2 00 00 00 paddb mm1,mm7; // 00 B3 G3 R3 B2 G2 R2 B1 paddb mm0,mm2; // B1 G1 R1 B0 B0 G0 R0 00 psllq mm3,32; // R3 G3 R3 AA 00 00 00 00 pand mm1,mm5; // 00 00 00 00 B2 G2 R2 B1 por mm0,mm6; // B1 G1 R1 AA B0 G0 R0 AA por mm3,mm1; // R3 G3 R3 B2 B2 G2 R2 B1 movq [edi],mm0; por mm3,mm6; // R3 G3 R3 AA B2 G2 R2 AA add esi,12; movq [edi+8],mm3; add edi,16; dec ecx; jnz lx; dec eax; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif emms; } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsm3( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width/4; __asm { // 3 pixels/lx mov esi,s; mov edi,d; mov ecx,height; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov eax,[esi]; // R1 B0 G0 R0 mov edx,eax; // shl eax,8; // B0 G0 R0 00 or al,0xff; // B0 G0 R0 AA mov [edi],eax; // shr edx,16; // 00 00 R1 00 mov eax,[esi+4]; // G2 R2 B1 G1 mov ebx,eax; // shl eax,16; // B1 G1 00 00 add edx,eax; // B1 G1 R1 00 or dl,0xff; // B1 G1 R1 AA mov [edi+4],edx; // shr ebx,8; // 00 G2 R2 00 mov eax,[esi+8]; // B3 G3 R3 B2 mov edx,eax; // shl eax,24; // B2 00 00 00 add eax,ebx; // B2 G2 R2 00 or al,0xff; // B2 G2 R2 AA mov [edi+8],eax; // // or dl,0xff; // B3 G3 R3 AA mov dl,al; // B3 G3 R3 AA add esi,12; mov [edi+12],edx; add edi,16; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsm1( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width; __asm { mov esi,s; mov edi,d; mov ecx,height; mov ebx,0xff; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov ax,[esi+1]; // 00 00 B0 G0 shl eax,16; // B0 G0 00 00 mov ah,[esi]; // B0 G0 R0 00 or al,bl; // B0 G0 R0 AA add esi,3; mov [edi],eax; add edi,4; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsmbytebyteword( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width; __asm { mov esi,s; mov edi,d; mov ecx,height; mov ebx,0xff; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov bh,[esi ]; mov ax,[esi+1]; // prefetcht0 [esi+96]; mov [edi ],bl; add esi,3; mov [edi+1],bh; mov [edi+2],ax; add edi,4; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } void CopyPixelImageBGRtoBGRAAsm1dwor( BYTE* d, const BYTE* s, int width, int height ) { // (A=255) #ifdef VERIFY BYTE* de; const BYTE *se; #endif int w=width; __asm { mov esi,s; mov edi,d; mov ecx,height; mov ebx,0xff; ly: push ecx; mov ecx,w; lx: // h- -- -- -l mov ax,[esi+1]; // 00 00 B0 G0 shl eax,16; // B0 G0 00 00 mov ah,[esi]; // B0 G0 R0 00 or eax,ebx; // B0 G0 R0 AA add esi,3; mov [edi],eax; add edi,4; dec ecx; jnz lx; pop ecx; dec ecx; jnz ly; #ifdef VERIFY mov [se],esi; mov [de],edi; #endif } #ifdef VERIFY if (se!=s+width*height*3) { printf("program internal error.\r\n"); } if (de!=d+width*height*4) { printf("program internal error.\r\n"); } #endif } enum { loop=100, }; class Benchmark { DWORD time; BYTE* d; public: Benchmark( LPCTSTR title, BYTE* _d ) { d=_d; printf("%s:",title); time=timeGetTime(); } ~Benchmark() { DWORD span=timeGetTime()-time; for(int i=0;i<16;i++) { printf(" %02X",d[i]); } printf("\r\n %6.2f [ms]\r\n",span/float(loop)); } }; enum { cpu_legacy=0, cpu_mmx=(1<<0), cpu_3dnow=(1<<1), cpu_e3dnow=(1<<2), cpu_sse=(1<<3), cpu_sse2=(1<<4), }; int CPU() { int flags=0,type=0; __asm { xor eax,eax; cpuid; or eax,eax; jz quit; mov eax,1; cpuid; mov flags,edx; quit: } if (flags&(1<<23)) { type|=cpu_mmx; if (flags&(1<<25)) { type|=cpu_sse; if (flags&(1<<26)) { type|=cpu_sse2; } } } return type; } void benchmark_main( BYTE* d, const BYTE* s, int width, int height ) { int cpu=CPU(); if (cpu&cpu_sse) { Benchmark bm("MMX +Prefetch +nt",d); for(int i=0;i