Advertisement
Guest User

old skool uncached memcpy x86

a guest
Aug 24th, 2016
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.18 KB | None | 0 0
  1. // memcpy_fast() & memset32()
  2. // - lifted from http://www.stereopsis.com/memcpy.html, though I wrote similar ones back in the day
  3. // - roughly twice as fast as their CRT counterparts on a 2009 netbook with a 1.66GHz Intel Atom CPU
  4. // - write-combining (non-temporal) stores via MOVNTQ perform favorably for most usage patterns
  5.  
  6. void memcpy_fast(void *pDest, const void *pSrc, size_t numBytes)
  7. {
  8. // numBytes must be a multiple of 64 -- use memcpy() for general purpose
  9. VIZ_ASSERT(!(numBytes & 63));
  10.  
  11. // an 8-byte boundary gaurantees correctly aligned reads and writes
  12. VIZ_ASSERT( 0 == (int(pSrc) & 7) );
  13. VIZ_ASSERT( 0 == (int(pDest) & 7) );
  14.  
  15. __asm
  16. {
  17. mov edi, pDest
  18. mov esi, pSrc
  19. mov ecx, numBytes
  20. shr ecx, 6
  21. _loop:
  22. movq mm1, [esi]
  23. movq mm2, [esi+8]
  24. movq mm3, [esi+16]
  25. movq mm4, [esi+24]
  26. movq mm5, [esi+32]
  27. movq mm6, [esi+40]
  28. movq mm7, [esi+48]
  29. movq mm0, [esi+56]
  30. movntq [edi], mm1
  31. movntq [edi+8], mm2
  32. movntq [edi+16], mm3
  33. movntq [edi+24], mm4
  34. movntq [edi+32], mm5
  35. movntq [edi+40], mm6
  36. movntq [edi+48], mm7
  37. movntq [edi+56], mm0
  38. add esi, 64
  39. add edi, 64
  40. dec ecx
  41. jnz _loop
  42. }
  43.  
  44. _mm_empty();
  45. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement