Shared and Global Memory Atomics

/// <summary>
    /// Sample program that builds a 256-bin histogram of random byte data on
    /// the GPU, using atomic adds on per-block shared memory followed by atomic
    /// adds on global memory, and then verifies the result against the CPU.
    /// </summary>
    public class Program {
        /// <summary>
        /// The size of the image data.
        /// </summary>
        public const int Size = 100 * 1024 * 1024;

        /// <summary>
        /// The number of histogram bins: one per possible byte value.
        /// </summary>
        private const int BinCount = 256;

        /// <summary>
        /// The number of threads per block used when launching the kernel.
        /// </summary>
        private const int ThreadsPerBlock = 256;

        /// <summary>
        /// Kernel that accumulates a histogram of <paramref name="buffer"/>
        /// into <paramref name="histo"/>. Each block first counts into its own
        /// "temp" array, then adds those counts to <paramref name="histo"/>.
        /// </summary>
        /// <param name="thread">
        /// The thread, used for the thread/block indices, synchronization
        /// and atomic operations.
        /// </param>
        /// <param name="buffer">
        /// The buffer containing the image data.
        /// </param>
        /// <param name="size">
        /// The number of elements of <paramref name="buffer"/> to process.
        /// </param>
        /// <param name="histo">
        /// The histogram data. Must hold at least 256 elements; the counts
        /// are added to whatever values it already contains.
        /// </param>
        [Cudafy]
        public static void CalculateHistogramAsKernel(GThread thread, byte[] buffer, int size, uint[] histo) {
            // allocate a shared memory block called "temp" with one
            // element per histogram bin
            var temp = thread.AllocateShared<uint>("temp", BinCount);

            // zero out every bin. The bins are strided over the threads of
            // the block, so this is exactly one write per thread when the
            // kernel is launched with 256 threads, and it stays in bounds
            // and complete for any other block size.
            for (var bin = thread.threadIdx.x; bin < BinCount; bin += thread.blockDim.x) {
                temp[bin] = 0;
            }

            // make sure every bin is zeroed before any thread starts counting
            thread.SyncThreads();

            // calculate the starting index and the offset to the next
            // element that each thread will be processing
            var stride = thread.blockDim.x * thread.gridDim.x;

            for (var i = thread.threadIdx.x + (thread.blockIdx.x * thread.blockDim.x); i < size; i += stride) {
                // atomically increment the bin for this byte value
                thread.atomicAdd(ref temp[buffer[i]], 1);
            }

            // wait until all threads of the block have finished counting,
            // then add the block's counts to the values from the other
            // thread blocks using atomic adds on the histogram array.
            thread.SyncThreads();

            for (var bin = thread.threadIdx.x; bin < BinCount; bin += thread.blockDim.x) {
                thread.atomicAdd(ref histo[bin], temp[bin]);
            }
        }

        /// <summary>
        /// The main method: generates random data, computes its histogram
        /// on the GPU, prints the timing and compares the result with a
        /// histogram computed on the CPU.
        /// </summary>
        /// <returns>
        /// -1 when the CUDA device does not meet the compute capability
        /// required for atomics; otherwise 0.
        /// </returns>
        public static int Main() {
            // set the target to CUDA
            CudafyModes.Target = eGPUType.Cuda;

            // set the architecture to compute capability 3.5
            CudafyModes.Architecture = eArchitecture.sm_35;

            var cudafyModule = CudafyTranslator.Cudafy();

            var gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

            // query the device properties once, up front, so that the
            // fallback below also protects the capability check
            GPGPUProperties prop;
            try {
                prop = gpu.GetDeviceProperties();
            } catch (DllNotFoundException) {
                // thrown if the CUDA library cannot be found; retry with
                // the flag disabled (NOTE(review): presumably this skips the
                // properties that need that library - confirm)
                prop = gpu.GetDeviceProperties(false);
            }

            // check if atomic operations are supported on the device
            if (gpu is CudaGPU && prop.Capability != null && prop.Capability < new Version(1, 2)) {
                Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
                return -1;
            }

            gpu.LoadModule(cudafyModule);

            // define a buffer to hold the random data
            var buffer = new byte[Size];

            // fill the buffer with random bytes from the Xorshift RNG
            var rndXorshift = new RndXorshift();
            rndXorshift.NextBytes(buffer);

            // kernel launch - 2x the number of processor count seems
            // reasonable
            var processorCount = prop.MultiProcessorCount;

            // in case we're running on the emulator
            if (processorCount == 0) {
                processorCount = 16;
            }

            Console.WriteLine("Processors: {0}", processorCount);

            var histogram = new uint[BinCount];

            try {
                // starting the timer here so that the costs of all
                // operations on the GPU are included.  If the data were
                // already on the GPU and just the kernel execution itself
                // was timed the runtime would drop from ~74 ms to 15 ms.
                gpu.StartTimer();

                // allocate memory on the GPU for the image data
                var devBuffer = gpu.CopyToDevice(buffer);
                var deviceHistogram = gpu.Allocate<uint>(BinCount);

                // empty the array
                gpu.Set(deviceHistogram);

                // dynamically invoke the CalculateHistogramAsKernel method
                gpu.Launch(processorCount * 2, ThreadsPerBlock).CalculateHistogramAsKernel(devBuffer, Size, deviceHistogram);

                // get the histogram data from the device
                gpu.CopyFromDevice(deviceHistogram, histogram);

                // stop the timer and display the timing results
                var elapsedTime = gpu.StopTimer();
                Console.WriteLine("Time to generate: {0} ms", elapsedTime);
            } finally {
                // free the device's memory, even if a GPU call failed
                gpu.FreeAll();
            }

            long histogramCount = 0;
            foreach (var count in histogram) {
                histogramCount += count;
            }

            Console.WriteLine("Histogram Sum:  {0}", histogramCount);

            // verify that the GPU and CPU histogram counts match. The CPU
            // histogram is built separately so that the GPU result is left
            // intact and no unsigned underflow is needed to detect a mismatch.
            var expected = CalculateHistogramOnCpu(buffer);

            for (int i = 0; i < BinCount; i++) {
                // signal the user that something went wrong on the GPU
                if (histogram[i] != expected[i]) {
                    Console.WriteLine("Failure at {0}!", i);
                }
            }

            Console.ReadLine();
            return 0;
        }

        /// <summary>
        /// Counts how often each byte value occurs in
        /// <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">
        /// The data to count.
        /// </param>
        /// <returns>
        /// An array of 256 counts, indexed by byte value.
        /// </returns>
        private static uint[] CalculateHistogramOnCpu(byte[] buffer) {
            var counts = new uint[BinCount];

            foreach (var value in buffer) {
                counts[value]++;
            }

            return counts;
        }
    }