Advertisement
lightxx

Shared and Global Memory Atomics

Aug 19th, 2014
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 5.87 KB | None | 0 0
  /// <summary>
  /// Sample program that builds a 256-bin histogram of a random byte buffer on
  /// the GPU, using shared-memory atomics per thread block followed by
  /// global-memory atomics to merge the per-block results, and then checks the
  /// result against a histogram computed on the CPU.
  /// </summary>
  public class Program {
      /// <summary>
      /// The size of the image data in bytes.
      /// </summary>
      public const int Size = 100 * 1024 * 1024;

      /// <summary>
      /// The number of histogram bins: one per possible byte value.
      /// </summary>
      public const int BinCount = 256;

      /// <summary>
      /// The number of threads per block used by <see cref="Main"/> when
      /// launching the kernel.
      /// </summary>
      public const int ThreadsPerBlock = 256;

      /// <summary>
      /// Kernel that accumulates a histogram of <paramref name="buffer"/> into
      /// <paramref name="histo"/>. Each block first counts into its own
      /// shared-memory buffer and then adds those counts to the global
      /// histogram.
      /// </summary>
      /// <param name="thread">
      /// The thread context supplied by the launcher.
      /// </param>
      /// <param name="buffer">
      /// The buffer containing the image data.
      /// </param>
      /// <param name="size">
      /// The number of elements of <paramref name="buffer"/> to process.
      /// </param>
      /// <param name="histo">
      /// The histogram data; must hold at least <see cref="BinCount"/> counters.
      /// The kernel only adds to it, so the caller is responsible for zeroing it.
      /// </param>
      [Cudafy]
      public static void CalculateHistogramAsKernel(GThread thread, byte[] buffer, int size, uint[] histo) {
          // allocate a shared memory block called "temp" with one counter
          // per bin
          uint[] temp = thread.AllocateShared<uint>("temp", BinCount);

          // zero out every counter. The loop strides by the block size so the
          // kernel stays correct for any number of threads per block; with
          // 256 threads it is exactly one write per thread.
          for (int bin = thread.threadIdx.x; bin < BinCount; bin += thread.blockDim.x) {
              temp[bin] = 0;
          }

          // make sure the whole buffer is zeroed before anyone counts into it
          thread.SyncThreads();

          // calculate the starting index and the offset to the next
          // element that each thread will be processing
          int i = thread.threadIdx.x + (thread.blockIdx.x * thread.blockDim.x);
          int stride = thread.blockDim.x * thread.gridDim.x;

          while (i < size) {
              // atomically increment the block's accumulation buffer
              thread.atomicAdd(ref temp[buffer[i]], 1);
              i += stride;
          }

          // wait until every thread of the block has finished writing to
          // shared memory, then add the block's counts to the counts of the
          // other thread blocks using global memory atomic adds. With 256
          // threads this is again just one write per thread.
          thread.SyncThreads();

          for (int bin = thread.threadIdx.x; bin < BinCount; bin += thread.blockDim.x) {
              thread.atomicAdd(ref histo[bin], temp[bin]);
          }
      }

      /// <summary>
      /// The main method: generates random data, computes its histogram on the
      /// GPU, prints timing information and verifies the result on the CPU.
      /// </summary>
      /// <returns>
      /// <c>-1</c> if the device does not support the required atomics;
      /// otherwise <c>0</c>.
      /// </returns>
      public static int Main() {
          // set the target to CUDA
          CudafyModes.Target = eGPUType.Cuda;

          // set the architecture to compute capability 3.5
          CudafyModes.Architecture = eArchitecture.sm_35;

          var cudafyModule = CudafyTranslator.Cudafy();

          var gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

          // query the device properties once, up front, so that the
          // capability check below goes through the same fallback as the
          // rest of the program
          GPGPUProperties prop;
          try {
              prop = gpu.GetDeviceProperties();
          } catch (DllNotFoundException) {
              // thrown if the CUDA library cannot be found
              prop = gpu.GetDeviceProperties(false);
          }

          // check if atomic operations are supported on the device
          if (gpu is CudaGPU && prop.Capability < new Version(1, 2)) {
              Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
              return -1;
          }

          gpu.LoadModule(cudafyModule);

          // define a buffer to hold the random data
          var buffer = new byte[Size];

          // fill the buffer with random bytes from the Xorshift RNG
          var rndXorshift = new RndXorshift();
          rndXorshift.NextBytes(buffer);

          var histogram = new uint[BinCount];

          try {
              // starting the timer here so that the costs of all
              // operations on the GPU are included. According to the original
              // author's measurements, timing only the kernel execution with
              // the data already on the GPU would drop the runtime from
              // roughly 74 ms to 15 ms.
              gpu.StartTimer();

              // copy the image data to the GPU and allocate the histogram
              var devBuffer = gpu.CopyToDevice(buffer);
              var deviceHistogram = gpu.Allocate<uint>(BinCount);

              // empty the array
              gpu.Set(deviceHistogram);

              // kernel launch - 2x the number of processor count seems
              // reasonable
              var processorCount = prop.MultiProcessorCount;

              // in case we're running on the emulator
              if (processorCount == 0) {
                  processorCount = 16;
              }

              Console.WriteLine("Processors: {0}", processorCount);

              // dynamically invoke the CalculateHistogramAsKernel method
              gpu.Launch(processorCount * 2, ThreadsPerBlock).CalculateHistogramAsKernel(devBuffer, Size, deviceHistogram);

              // get the histogram data from the device
              gpu.CopyFromDevice(deviceHistogram, histogram);

              // stop the timer and display the timing results
              var elapsedTime = gpu.StopTimer();
              Console.WriteLine("Time to generate: {0} ms", elapsedTime);
          } finally {
              // free the device's memory, even if a GPU operation failed
              gpu.FreeAll();
          }

          long histogramCount = 0;
          for (int i = 0; i < BinCount; i++) {
              histogramCount += histogram[i];
          }

          Console.WriteLine("Histogram Sum:  {0}", histogramCount);

          // verify that the GPU and CPU histogram counts match. The CPU
          // histogram is built separately instead of decrementing the GPU
          // counters in place, so a wrong GPU count cannot underflow an
          // unsigned counter.
          var expected = new uint[BinCount];
          foreach (byte value in buffer) {
              expected[value]++;
          }

          for (int i = 0; i < BinCount; i++) {
              // signal the user that something went wrong on the GPU
              if (histogram[i] != expected[i]) {
                  Console.WriteLine("Failure at {0}!", i);
              }
          }

          Console.ReadLine();
          return 0;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement