Global Memory Atomics

    /// <summary>
    /// The program.
    /// </summary>
    public class Program {
        /// <summary>
        /// The size of the image data.
        /// </summary>
        public const int Size = 100 * 1024 * 1024;

        /// <summary>
        /// The CalculateHistogramAsKernel method.
        /// </summary>
        /// <param name="thread">
        /// The thread.
        /// </param>
        /// <param name="buffer">
        /// The buffer containing the image data.
        /// </param>
        /// <param name="size">
        /// The size of the image data.
        /// </param>
        /// <param name="histo">
        /// The histogram data.
        /// </param>
        [Cudafy]
        public static void CalculateHistogramAsKernel(GThread thread, byte[] buffer, int size, uint[] histo) {
            // calculate the starting index and the offset to the next
            // block that each thread will be processing
            var i = thread.threadIdx.x + (thread.blockIdx.x * thread.blockDim.x);
            var stride = thread.blockDim.x * thread.gridDim.x;

            while (i < size) {
                thread.atomicAdd(ref histo[buffer[i]], 1);
                i += stride;
            }
        }

        /// <summary>
        /// The main method.
        /// </summary>
        /// <returns>
        /// The <see cref="int"/>.
        /// </returns>
        public static int Main() {
            // Set the Target to Cuda
            CudafyModes.Target = eGPUType.Cuda;

            var cudafyModule = CudafyTranslator.Cudafy();

            var gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
            if (gpu is CudaGPU && gpu.GetDeviceProperties().Capability < new Version(1, 2)) {
                Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
                return -1;
            }

            gpu.LoadModule(cudafyModule);

            // define a buffer to hold the random data
            var buffer = new byte[Size];

            // fill the buffer with random bytes from the Xorshift RNG
            var rndXorshift = new RndXorshift();
            rndXorshift.NextBytes(buffer);

            GPGPUProperties prop;
            try {
                prop = gpu.GetDeviceProperties();
            } catch (DllNotFoundException) {
                // thrown if the CUDA library cannot be found
                prop = gpu.GetDeviceProperties(false);
            }

            // capture the start time
            // starting the timer here so that we include the cost of
            // all of the operations on the GPU.  if the data were
            // already on the GPU and we just timed the kernel
            // the timing would drop from 74 ms to 15 ms.  Very fast.
            gpu.StartTimer();

            // allocate memory on the GPU for the file's data
            byte[] devBuffer = gpu.CopyToDevice(buffer);
            uint[] devHisto = gpu.Allocate<uint>(256);
            gpu.Set(devHisto);

            // kernel launch - 2x the number of processor count seems reasonable
            var processorCount = prop.MultiProcessorCount;

            // in case we're running on the emulator
            if (processorCount == 0)
                processorCount = 16;

            Console.WriteLine("Processors: {0}", processorCount);

            // dynamically invoke the CalculateHistogramAsKernel method
            gpu.Launch(processorCount * 2, 256).CalculateHistogramAsKernel(devBuffer, Size, devHisto);

            var histo = new uint[256];
            gpu.CopyFromDevice(devHisto, histo);

            // get stop time, and display the timing results
            var elapsedTime = gpu.StopTimer();
            Console.WriteLine("Time to generate: {0} ms", elapsedTime);

            long histoCount = 0;
            for (int i = 0; i < 256; i++) {
                histoCount += histo[i];
            }

            Console.WriteLine("Histogram Sum:  {0}", histoCount);

            // verify that we have the same counts via CPU
            for (int i = 0; i < Size; i++)
                histo[buffer[i]]--;

            for (int i = 0; i < 256; i++) {
                if (histo[i] != 0)
                    Console.WriteLine("Failure at {0}!", i);
            }

            gpu.FreeAll();

            Console.ReadLine();
            return 0;
        }
    }