// memory copy code benchmark // source code for Visual C++ 6.0 + Service Pack 4 + Processor Pack // copyright(C) 2001 XELF. All rights reserved. // http://www.cyborg.ne.jp/~xelf/ // You can use this source code for any purpose without permission. // Notes that this source code is not supported the processing of fraction bytes. #include #include #include #include #pragma comment(lib,"winmm.lib") enum { size = 1024*1024*16, }; void mem( LPBYTE d, const BYTE* s, int _size ) { memcpy(d,s,_size); } void memc( LPBYTE d, const BYTE* s, int _size ) { for(int i=_size>>2;--i>=0;) { *(LPDWORD)d=*(const DWORD*)s; d+=4; s+=4; } } void mem2( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,2; lx: mov eax,[esi]; add esi,4; mov [edi],eax; add edi,4; dec ecx; jnz lx; } } void mem3( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,3; lx: mov eax,[esi]; mov eax,[esi+4]; add esi,8; mov [edi],eax; mov [edi+4],eax; add edi,8; dec ecx; jnz lx; } } void mem4( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,3; lx: movq mm0,[esi]; add esi,8; movq [edi],mm0; add edi,8; dec ecx; jnz lx; } } void mem5( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,4; lx: movq mm0,[esi]; movq mm1,[esi+8]; lea esi,[esi+16]; movq [edi],mm0; movq [edi+8],mm1; lea edi,[edi+16]; dec ecx; jnz lx; } } void mem8( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,3; lx: movq mm0,[esi]; lea esi,[esi+8]; movntq [edi],mm0; lea edi,[edi+8]; dec ecx; jnz lx; } } void mem9( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,4; lx: movq mm0,[esi]; movq mm1,[esi+8]; lea esi,[esi+16]; movntq [edi],mm0; movntq [edi+8],mm1; lea edi,[edi+16]; dec ecx; jnz lx; } } void mem10( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,4; lx: movq mm0,[esi]; movq mm1,[esi+8]; lea esi,[esi+16]; movntq [edi],mm0; prefetcht0 [esi+768]; movntq [edi+8],mm1; lea edi,[edi+16]; dec ecx; jnz lx; } } void mem6( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,4; lx: movaps xmm0,[esi]; lea esi,[esi+16]; movaps [edi],xmm0; lea edi,[edi+16]; dec ecx; jnz lx; } } void mem7( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,4; lx: movaps xmm0,[esi]; lea esi,[esi+16]; movntps [edi],xmm0; lea edi,[edi+16]; dec ecx; jnz lx; } } void mem7pre( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,5; lx: movaps xmm0,[esi]; movaps xmm1,[esi+16]; lea esi,[esi+32]; movntps [edi],xmm0; prefetcht0 [esi+1024]; movntps [edi+16],xmm1; lea edi,[edi+32]; dec ecx; jnz lx; } } void memfpu( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,3; lx: fld double ptr [esi]; lea esi,[esi+8]; fstp double ptr [edi]; lea edi,[edi+8]; dec ecx; jnz lx; } } void memrep( LPBYTE d, const BYTE* s, int _size ) { _asm { mov edi,d; mov esi,s; mov ecx,_size; shr ecx,2; rep movsd; } } void begin( LPBYTE mem1, LPBYTE mem2, int size, const char* text ) { memset(mem1,0x55,size); memset(mem2,0xAA,size); printf(text); } void end( LPBYTE mem1, LPBYTE mem2, int size, int time ) { _asm emms; printf("%d.%d [ms]",time/10,time%10); if (memcmp(mem1,mem2,size)) { printf("error!"); } printf("\r\n"); } enum { cpu_legacy=0, cpu_mmx=(1<<0), cpu_3dnow=(1<<1), cpu_e3dnow=(1<<2), cpu_sse=(1<<3), cpu_sse2=(1<<4), }; int CPU() { int flags=0,type=0; __asm { xor eax,eax; cpuid; or eax,eax; jz quit; mov eax,1; cpuid; mov flags,edx; quit: } if (flags&(1<<31)) { type|=cpu_3dnow; if (flags&(1<<30)) { type|=cpu_e3dnow; } } if (flags&(1<<23)) { type|=cpu_mmx; if (flags&(1<<25)) { type|=cpu_sse; if (flags&(1<<26)) { type|=cpu_sse2; } } } return type; } void benchmark( LPBYTE a, LPBYTE b, int size ) { int cpu=CPU(); { begin(a,b,size,"memcpy: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } { begin(a,b,size,"rep movsd: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { memrep(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } { begin(a,b,size,"FPU 8bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { memfpu(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"MMX movntq pre 16bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem10(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"MMX movntq 16bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem9(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"MMX movntq 8bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem8(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"SSE movntps pre 32bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem7pre(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"SSE movntps 16bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem7(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_sse) { begin(a,b,size,"SSE 16bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem6(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_mmx) { begin(a,b,size,"MMX 16bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem5(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } if (cpu&cpu_mmx) { begin(a,b,size,"MMX 8bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem4(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } { begin(a,b,size,"asm 8bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem3(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } { begin(a,b,size,"asm 4bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { mem2(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } { begin(a,b,size,"C++ 4bytes: "); DWORD t0=timeGetTime(); for(int i=0;i<10;i++) { memc(a,b,size); } DWORD t1=timeGetTime(); end(a,b,size,t1-t0); } } int main() { timeBeginPeriod(1); printf("memory copy code benchmark VER.2001-01-29 by (C)2001 XELF.\r\n"); printf("copy size: %d [bytes]\r\n\r\n",size); LPBYTE a=(LPBYTE)_aligned_malloc(size,16); LPBYTE b=(LPBYTE)_aligned_malloc(size,16); benchmark(a,b,size); _aligned_free(a); _aligned_free(b); printf("completed.\r\n"); timeEndPeriod(1); return 0; }