Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // memcpy_fast() & memset32()
- // - lifted from http://www.stereopsis.com/memcpy.html, though I wrote similar ones back in the day
- // - approx. twice faster than their CRT counterparts on a 2009 netbook with 1.66GHz Intel Atom CPU
- // - combined writes (MOVNTQ) perform favorably for most usage patterns
- void memcpy_fast(void *pDest, const void *pSrc, size_t numBytes)
- {
- // numBytes must be a multiple of 64 -- use memcpy() for general purpose
- VIZ_ASSERT(!(numBytes & 63));
- // an 8-byte boundary gaurantees correctly aligned reads and writes
- VIZ_ASSERT( 0 == (int(pSrc) & 7) );
- VIZ_ASSERT( 0 == (int(pDest) & 7) );
- __asm
- {
- mov edi, pDest
- mov esi, pSrc
- mov ecx, numBytes
- shr ecx, 6
- _loop:
- movq mm1, [esi]
- movq mm2, [esi+8]
- movq mm3, [esi+16]
- movq mm4, [esi+24]
- movq mm5, [esi+32]
- movq mm6, [esi+40]
- movq mm7, [esi+48]
- movq mm0, [esi+56]
- movntq [edi], mm1
- movntq [edi+8], mm2
- movntq [edi+16], mm3
- movntq [edi+24], mm4
- movntq [edi+32], mm5
- movntq [edi+40], mm6
- movntq [edi+48], mm7
- movntq [edi+56], mm0
- add esi, 64
- add edi, 64
- dec ecx
- jnz _loop
- }
- _mm_empty();
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement