
/*
 * Untitled
 * By: a guest on Apr 24th, 2012 | syntax: None | size: 1.73 KB | hits: 17 | expires: Never
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
#define TIMES SHRT_MAX
/*
 * Sample the x86 time-stamp counter and return it as one 64-bit value.
 *
 * A CPUID with EAX = 0 is issued first; CPUID is a serializing
 * instruction, so it is presumably there to keep earlier instructions
 * from overlapping with the counter read. Its results are discarded,
 * which is why the four registers it writes are listed as clobbers.
 * RDTSC then delivers the low 32 bits in EAX and the high 32 bits in EDX.
 *
 * The clobber list names 64-bit registers, so this builds for x86-64 only.
 */
uint64_t rdtsc(void)
{
    uint32_t tsc_low;
    uint32_t tsc_high;

    __asm__ __volatile__ (
        "xorl %%eax,%%eax\n"
        "cpuid"
        :
        :
        : "%rax", "%rbx", "%rcx", "%rdx");

    __asm__ __volatile__ ("rdtsc" : "=a"(tsc_low), "=d"(tsc_high));

    /* Glue the two 32-bit halves together: EDX:EAX -> 64-bit tick count. */
    uint64_t ticks = tsc_high;
    ticks <<= 32;
    ticks |= tsc_low;
    return ticks;
}
/*
 * Copy cnt bytes from src to dst with the x86 "rep movsb" string
 * instruction and return dst (same contract as memcpy for
 * non-overlapping regions). A cnt of 0 copies nothing.
 *
 * Background: "rep movs" is implemented in microcode on modern Intel
 * CPUs. According to the discussion linked below it performs best when
 * src and dst are equally aligned (mod 64), but it is correct for any
 * alignment. The claim that it beats SSE copy routines comes from that
 * discussion and is what the surrounding benchmark is meant to check.
 *
 * See: https://lkml.org/lkml/2011/9/1/229
 *
 * The function is noinline so that every call in the benchmark loop is a
 * real call.
 */
void __attribute__ ((noinline)) *rep_movsb(void *dst, const void *src, size_t cnt)
{
    /*
     * "rep movsb" advances RDI and RSI and counts RCX down to zero, so all
     * three must be read-write ("+") operands. Declaring them as plain
     * inputs lets the compiler assume the registers still hold the
     * original values, e.g. it could return the advanced RDI instead of
     * dst. A scratch copy of dst takes the modification so the return
     * value is unaffected.
     */
    void *cursor = dst;

    /*
     * "cld" selects ascending addresses. The "memory" clobber tells the
     * compiler that the asm reads and writes memory it cannot see through
     * the operands, so surrounding loads/stores are not cached or moved
     * across the copy.
     */
    __asm__ __volatile__("cld; rep movsb"
        : "+D"(cursor), "+S"(src), "+c"(cnt)
        :
        : "memory");

    return dst;
}
/*
 * No-op sink for a pointer: accepts the argument and does nothing with it.
 *
 * Marked noinline so the call is not inlined at the call site; it is
 * presumably used by the benchmark to make the copied buffers look "used".
 */
void __attribute__ ((noinline)) eat_it(void* eat)
{
    /* Explicitly discard the argument to silence unused-parameter warnings. */
    (void)eat;
}
/*
 * Benchmark driver: repeatedly times TIMES round-trip copies of a
 * user-chosen size, first with rep_movsb() and then with memcpy(), and
 * prints the elapsed time-stamp-counter ticks for each.
 *
 * Usage: <program> <size-in-bytes>
 *
 * The measurement loop runs forever by design; stop the program with a
 * signal (e.g. Ctrl-C). Returns EXIT_FAILURE only when the argument is
 * missing or invalid, or when the buffers cannot be allocated.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <size-in-bytes>\n",
                argc > 0 ? argv[0] : "bench");
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi so garbage and negative sizes are rejected. */
    char *end = NULL;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested < 0) {
        fprintf(stderr, "invalid size: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    size_t size = (size_t)requested;

    /*
     * Four distinct buffers. Sharing one allocation between all pointers
     * would make every copy a self-copy, and memcpy() on overlapping
     * regions is undefined behavior. Allocate at least one byte so a size
     * of 0 does not depend on what malloc(0) returns.
     */
    size_t alloc_size = size ? size : 1;
    char *buf1 = malloc(alloc_size);
    char *buf2 = malloc(alloc_size);
    char *buf3 = malloc(alloc_size);
    char *buf4 = malloc(alloc_size);
    if (!buf1 || !buf2 || !buf3 || !buf4) {
        fprintf(stderr, "out of memory allocating 4 x %zu bytes\n", alloc_size);
        free(buf1);
        free(buf2);
        free(buf3);
        free(buf4);
        return EXIT_FAILURE;
    }

    /*
     * Give the buffers defined contents and touch every page up front so
     * first-access page faults are not charged to the first measurement.
     */
    memset(buf1, 0, alloc_size);
    memset(buf2, 0, alloc_size);
    memset(buf3, 0, alloc_size);
    memset(buf4, 0, alloc_size);

    for (;;) {
        uint64_t start = rdtsc();
        for (int i = 0; i < TIMES; i++) {
            rep_movsb(buf1, buf2, size);
            rep_movsb(buf2, buf1, size);
            /* Hand the buffers that were just copied to the opaque sink. */
            eat_it(buf1);
            eat_it(buf2);
        }
        /* PRIu64 matches uint64_t; "%ld" was a format/argument mismatch. */
        printf("movsb: %" PRIu64 " (cpu cycles)\n", rdtsc() - start);

        start = rdtsc();
        for (int i = 0; i < TIMES; i++) {
            memcpy(buf3, buf4, size);
            memcpy(buf4, buf3, size);
            eat_it(buf3);
            eat_it(buf4);
        }
        printf("memcpy: %" PRIu64 " (cpu cycles)\n", rdtsc() - start);
    }
}