// MemRead.cpp
//
// This program is meant to test memory read performance under various
// kinds of situations.
//
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <windows.h>
#include <mmsystem.h>
// the size of a single fetch over the bus, in bytes. The test loops space
// their paired reads and their strides in multiples of this value.
#define MEM_FETCH_SIZE 128
// the maximum size of any processor cache. i7 has an 8MB cache.
// FlushCache() touches this many bytes past the end of the test region.
#define MAX_CACHE_SIZE (8*1024*1024)
// the test buffer size (1GB) and half of it (512MB). Each test pass walks
// HALF_BUFFER_SIZE bytes; TestSparse additionally reads at an offset of
// HALF_BUFFER_SIZE, so it needs the full TEST_BUFFER_SIZE.
#define TEST_BUFFER_SIZE (1024*1024*1024)
#define HALF_BUFFER_SIZE (512*1024*1024)
// forward declarations.
// TestLinear: read loop whose two loads per iteration are MEM_FETCH_SIZE apart.
void TestLinear( void* buffer );
// TestSparse: read loop whose two loads per iteration are HALF_BUFFER_SIZE apart.
void TestSparse( void* buffer );
// FlushCache: rewrites the MAX_CACHE_SIZE bytes that follow the test region.
void FlushCache( void* buffer );
// main app.
void main()
{
// begin timing.
timeBeginPeriod( 1 );
// allocate a several megabyte buffer. Note that we intentially allocate
// a little extra to ensure that the cache is not poluted with data.
char* buffer = ( char* )malloc( TEST_BUFFER_SIZE + MAX_CACHE_SIZE );
// begin timing tests.
unsigned int start = 0;
unsigned int linearTotal = 0;
unsigned int sparseTotal = 0;
// flush the cache for a test.
FlushCache( buffer );
// time the current operation.
start = timeGetTime();
// perform a test of non-temporal reads.
TestLinear( buffer );
// calculate the total time needed.
linearTotal = timeGetTime() - start;
// flush the cache for the next test.
FlushCache( buffer );
// time the current operation.
start = timeGetTime();
// perform a test of non-temporal reads.
TestSparse( buffer );
// calculate the total time needed.
sparseTotal = timeGetTime() - start;
// print out our statistics.
printf( "Total linear access time: %d\n", linearTotal );
printf( "Total sparse access time: %d\n", sparseTotal );
// spin for a bit.
_getch();
// end the time period.
timeEndPeriod( 1 );
}
// Linear read test: makes 512 passes over the first HALF_BUFFER_SIZE bytes of
// buffer. Every inner-loop iteration issues two 16-byte movntdqa loads that
// are MEM_FETCH_SIZE bytes apart and then advances by 2 * MEM_FETCH_SIZE, so
// the two loads of an iteration are always neighbours in memory. The loaded
// values are discarded; only the memory traffic matters.
//
// buffer must be 16-byte aligned (movntdqa faults on unaligned addresses) and
// at least HALF_BUFFER_SIZE bytes long.
void TestLinear( void* buffer )
{
// repeat the full sweep 512 times.
for ( unsigned int i = 0; i < 512; ++i )
{
__asm
{
// esi = read cursor, ecx = bytes left in this pass.
mov esi, buffer
mov ecx, HALF_BUFFER_SIZE
loop_start:
; pull in data with a non-temporal read.
movntdqa xmm0, [ esi + 0 ]
movntdqa xmm1, [ esi + MEM_FETCH_SIZE ]
; loop.
// advance the cursor and the remaining count by the same amount; the loop
// ends when ecx reaches exactly zero, which relies on HALF_BUFFER_SIZE being
// a multiple of 2 * MEM_FETCH_SIZE.
add esi, MEM_FETCH_SIZE + MEM_FETCH_SIZE
sub ecx, MEM_FETCH_SIZE + MEM_FETCH_SIZE
jnz loop_start
}
}
}
// Sparse read test: makes 512 passes in which every inner-loop iteration
// issues two 16-byte movntdqa loads that are HALF_BUFFER_SIZE bytes apart,
// one from the first half of the buffer and one from the second half. The
// cursor advances by 2 * MEM_FETCH_SIZE per iteration, so the number of loads
// per pass matches TestLinear. The loaded values are discarded.
//
// buffer must be 16-byte aligned (movntdqa faults on unaligned addresses) and
// at least 2 * HALF_BUFFER_SIZE bytes long.
void TestSparse( void* buffer )
{
// repeat the full sweep 512 times.
for ( unsigned int i = 0; i < 512; ++i )
{
__asm
{
// esi = read cursor into the first half, ecx = bytes left in this pass.
mov esi, buffer
mov ecx, HALF_BUFFER_SIZE
loop_start:
; pull in data with a non-temporal read.
movntdqa xmm0, [ esi + 0 ]
movntdqa xmm1, [ esi + HALF_BUFFER_SIZE ]
; loop.
// advance the cursor and the remaining count by the same amount; the loop
// ends when ecx reaches exactly zero, which relies on HALF_BUFFER_SIZE being
// a multiple of 2 * MEM_FETCH_SIZE.
add esi, MEM_FETCH_SIZE + MEM_FETCH_SIZE
sub ecx, MEM_FETCH_SIZE + MEM_FETCH_SIZE
jnz loop_start
}
}
}
// Rewrites the MAX_CACHE_SIZE bytes that follow the TEST_BUFFER_SIZE test
// region, with the aim of pushing any cached test data out before a timed
// run. buffer must therefore be at least TEST_BUFFER_SIZE + MAX_CACHE_SIZE
// bytes long.
void FlushCache( void* buffer )
{
    // the scratch area starts right after the region the tests read from.
    char* cursor = ( char* )buffer + TEST_BUFFER_SIZE;
    char* const scratchEnd = cursor + MAX_CACHE_SIZE;
    unsigned int offset = 0;

    // Each 4-byte step is XORed with its own offset. A read-modify-write is
    // used on purpose (rather than a plain store) so that the operation
    // cannot be turned into non-temporal stores that bypass the cache.
    while ( cursor < scratchEnd )
    {
        *( int* )cursor ^= offset;
        cursor += 4;
        offset += 4;
    }
}