Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// Builds a 256-bin histogram of random byte data on the GPU through CUDAfy
/// and checks the result against the host copy of the data.
/// </summary>
public class Program {
    /// <summary>
    /// The size of the image data, in bytes.
    /// </summary>
    public const int Size = 100 * 1024 * 1024;

    /// <summary>
    /// Kernel that accumulates a histogram of the byte values in
    /// <paramref name="buffer"/> into <paramref name="histo"/>.
    /// </summary>
    /// <remarks>
    /// Each block counts into its own 256-entry scratch array obtained from
    /// <c>AllocateShared</c> and then adds its counts to
    /// <paramref name="histo"/>. The scratch array is cleared and merged with
    /// loops that step by the block size, so the result does not depend on the
    /// kernel being launched with exactly 256 threads per block.
    /// </remarks>
    /// <param name="thread">
    /// The CUDAfy thread context (thread/block indices and dimensions).
    /// </param>
    /// <param name="buffer">
    /// The buffer containing the image data.
    /// </param>
    /// <param name="size">
    /// The number of elements of <paramref name="buffer"/> to process.
    /// </param>
    /// <param name="histo">
    /// The histogram to add to; it is indexed by byte value, so it needs 256
    /// entries. Counts are added to the existing contents, so the caller must
    /// clear it beforehand.
    /// </param>
    [Cudafy]
    public static void CalculateHistogramAsKernel(GThread thread, byte[] buffer, int size, uint[] histo) {
        // per-block scratch histogram called "temp" with one counter per
        // possible byte value
        var temp = thread.AllocateShared<uint>("temp", 256);

        // zero out every counter; with 256 threads per block this is exactly
        // one write per thread, other block sizes take more or fewer steps
        for (int bin = thread.threadIdx.x; bin < 256; bin += thread.blockDim.x) {
            temp[bin] = 0;
        }

        // make sure the whole scratch histogram is cleared before counting
        thread.SyncThreads();

        // first element handled by this thread and the distance to the next
        // one: the total number of threads in the launch
        var i = thread.threadIdx.x + (thread.blockIdx.x * thread.blockDim.x);
        var stride = thread.blockDim.x * thread.gridDim.x;
        while (i < size) {
            // atomically increment the counter for this byte value
            thread.atomicAdd(ref temp[buffer[i]], 1);
            i += stride;
        }

        // wait until every thread of the block has finished counting, then
        // add the block's counts to the histogram shared by all blocks using
        // atomic adds
        thread.SyncThreads();
        for (int bin = thread.threadIdx.x; bin < 256; bin += thread.blockDim.x) {
            thread.atomicAdd(ref histo[bin], temp[bin]);
        }
    }

    /// <summary>
    /// Generates random data, computes its histogram on the GPU, prints the
    /// timing and the histogram sum, and verifies the histogram on the host.
    /// </summary>
    /// <returns>
    /// <c>-1</c> if the CUDA device does not support the required atomic
    /// operations; otherwise <c>0</c>.
    /// </returns>
    public static int Main() {
        const int HistogramBins = 256;
        const int ThreadsPerBlock = 256;

        // set the target to CUDA
        CudafyModes.Target = eGPUType.Cuda;

        // set the architecture to compute capability 3.5
        CudafyModes.Architecture = eArchitecture.sm_35;

        var cudafyModule = CudafyTranslator.Cudafy();
        var gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

        // query the device properties once, up front, so that the capability
        // check below is covered by the same fallback as the rest of the code
        GPGPUProperties prop;
        try {
            prop = gpu.GetDeviceProperties();
        } catch (DllNotFoundException) {
            // thrown if the CUDA library cannot be found
            // NOTE(review): assumes the reduced property set still reports
            // Capability - confirm against the CUDAfy version in use
            prop = gpu.GetDeviceProperties(false);
        }

        // check if atomic operations are supported on the device
        if (gpu is CudaGPU && prop.Capability < new Version(1, 2)) {
            Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
            return -1;
        }

        gpu.LoadModule(cudafyModule);

        // define a buffer to hold the random data
        var buffer = new byte[Size];

        // fill the buffer with random bytes from the Xorshift RNG
        var rndXorshift = new RndXorshift();
        rndXorshift.NextBytes(buffer);

        try {
            // starting the timer here so that the costs of all
            // operations on the GPU are included. The original author noted
            // that if the data were already on the GPU and just the kernel
            // execution itself was timed, the runtime would drop from
            // ~74 ms to 15 ms.
            gpu.StartTimer();

            // copy the image data to the GPU and allocate the histogram
            var devBuffer = gpu.CopyToDevice(buffer);
            var deviceHistogram = gpu.Allocate<uint>(HistogramBins);

            // empty the array
            gpu.Set(deviceHistogram);

            // kernel launch - 2x the number of processor count seems
            // reasonable
            var processorCount = prop.MultiProcessorCount;

            // in case we're running on the emulator
            if (processorCount == 0) {
                processorCount = 16;
            }

            Console.WriteLine("Processors: {0}", processorCount);

            // dynamically invoke the CalculateHistogramAsKernel method
            gpu.Launch(processorCount * 2, ThreadsPerBlock).CalculateHistogramAsKernel(devBuffer, Size, deviceHistogram);

            // get the histogram data from the device
            var histogram = new uint[HistogramBins];
            gpu.CopyFromDevice(deviceHistogram, histogram);

            // stop the timer and display the timing results
            var elapsedTime = gpu.StopTimer();
            Console.WriteLine("Time to generate: {0} ms", elapsedTime);

            long histogramCount = 0;
            for (int i = 0; i < HistogramBins; i++) {
                histogramCount += histogram[i];
            }

            Console.WriteLine("Histogram Sum: {0}", histogramCount);

            // verify that the GPU and CPU histogram counts match: removing
            // one count per input byte must leave every bin at zero
            for (int i = 0; i < Size; i++) {
                histogram[buffer[i]]--;
            }

            for (int i = 0; i < HistogramBins; i++) {
                // signal the user that something went wrong on the GPU
                if (histogram[i] != 0) {
                    Console.WriteLine("Failure at {0}!", i);
                }
            }
        } finally {
            // free the device's memory, even if a GPU call above failed
            gpu.FreeAll();
        }

        Console.ReadLine();
        return 0;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement