Advertisement
Guest User

mp1-part1.cu

a guest
Aug 27th, 2011
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 5.02 KB | None | 0 0
  1. /* This is machine problem 1, part 1, shift cypher
  2.  *
  3.  * The problem is to take in a string of unsigned ints and an int,
  4.  * the shift amount, and add the number to each element of
  5.  * the string, effectively "shifting" each element in the
  6.  * string.
  7.  * SUBMISSION GUIDELINES:
  8.  * You should copy the complete shift_cypher function from your solution
  9.  * into a file called mp1-part1-solution-kernel.cu and submit that file.
  10.  * The function needs to have exactly the same interface (including __global__)
  11.  * as the empty shift_cypher function given below.
  12.  */
  13.  
  14.  
  15. #include <stdlib.h>
  16. #include <stdio.h>
  17. #include <ctime>
  18.  
  19. #include "mp1-util.h"
  20.  
  21.  
  22. // Repeating from the tutorial, just in case you haven't looked at it.
  23.  
  24. // "kernels" or __global__ functions are the entry points to code that executes on the GPU
  25. // The keyword __global__ indicates to the compiler that this function is a GPU entry point.
  26. // __global__ functions must return void, and may only be called or "launched" from code that
  27. // executes on the CPU.
  28.  
  // CPU reference implementation of the shift cypher.
  //
  // For each of the first array_length elements, adds shift_amount to
  // input_array[i] and stores the sum, reduced modulo (alphabet_max + 1), in
  // output_array[i].
  //
  //   input_array   - source elements (at least array_length entries)
  //   output_array  - destination (at least array_length entries)
  //   shift_amount  - value added to every element
  //   alphabet_max  - largest value allowed in the output
  //   array_length  - number of elements to process
  //
  // All arithmetic is done in unsigned int, so the addition wraps on overflow
  // before the reduction; no signed temporaries are used, which avoids
  // implementation-defined narrowing and signed/unsigned comparisons.
  void host_shift_cypher(unsigned int *input_array, unsigned int *output_array, unsigned int shift_amount, unsigned int alphabet_max, unsigned int array_length)
  {
    // Wraps to 0 when alphabet_max is the largest unsigned int; in that case
    // no sum can exceed alphabet_max, so the modulo below is never reached.
    const unsigned int alphabet_size = alphabet_max + 1;

    for(unsigned int i = 0; i < array_length; i++)
    {
      unsigned int shifted = input_array[i] + shift_amount;
      if(shifted > alphabet_max)
      {
        shifted %= alphabet_size;
      }
      output_array[i] = shifted;
    }
  }
  42.  
  43.  
  44. // This kernel implements a per element shift
  // GPU kernel: per-element shift cypher.
  //
  // Each thread handles at most one element, selected by its flat 1D index
  // blockIdx.x * blockDim.x + threadIdx.x, and writes
  //   output_array[gid] = (input_array[gid] + shift_amount) mod (alphabet_max + 1).
  //
  // Launch requirements: a 1D grid of 1D blocks supplying at least
  // array_length threads in total. Threads whose index is >= array_length do
  // nothing, so the grid does not have to divide the array evenly.
  // No shared memory is used.
  //
  //   input_array   - device pointer to the source elements
  //   output_array  - device pointer to the destination elements
  //   shift_amount  - value added to every element
  //   alphabet_max  - largest value allowed in the output
  //   array_length  - number of valid elements in both arrays
  __global__ void shift_cypher(unsigned int *input_array, unsigned int *output_array,
      unsigned int shift_amount, unsigned int alphabet_max, unsigned int array_length)
  {
      const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;

      // Guard the tail of the grid: surplus threads must not touch memory.
      if(gid >= array_length)
      {
          return;
      }

      // alphabet_max + 1 wraps to 0 when alphabet_max is the largest unsigned
      // int; every value is already in range then, so skip the modulo instead
      // of dividing by zero.
      const unsigned int alphabet_size = alphabet_max + 1;
      const unsigned int shifted = input_array[gid] + shift_amount;

      output_array[gid] = (alphabet_size != 0) ? (shifted % alphabet_size) : shifted;
  }
  51.  
  52.  
  // Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool cuda_call_ok(cudaError_t status, const char *what)
  {
    if(status == cudaSuccess)
    {
      return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
  }

  // Releases the three host buffers and the two device buffers.
  // Null pointers are accepted by both free() and cudaFree().
  static void release_buffers(unsigned int *host_in, unsigned int *host_out, unsigned int *host_ref,
                              unsigned int *device_in, unsigned int *device_out)
  {
    free(host_in);
    free(host_out);
    free(host_ref);
    cudaFree(device_in);
    cudaFree(device_out);
  }

  // Test driver: fills a 16M-element array with random values, runs the shift
  // cypher on the GPU and on the CPU, and reports whether the two outputs agree.
  // Returns 1 when memory cannot be allocated or a copy fails, 0 otherwise.
  // event_pair, start_timer, stop_timer and check_launch come from mp1-util.h,
  // which is not visible here; they are called exactly as in the original driver.
  int main(void)
  {
    // initialize
    srand(time(NULL));

    // create arrays of 16M elements
    const int num_elements = 1 << 24;

    const unsigned int alphabet_max = 45647;

    // compute the size of the arrays in bytes (size_t: this is a byte count)
    const size_t num_bytes = (size_t)num_elements * sizeof(unsigned int);

    // pointers to host & device arrays
    unsigned int *host_input_array = 0;
    unsigned int *host_output_array = 0;
    unsigned int *host_output_checker_array = 0;
    unsigned int *device_input_array = 0;
    unsigned int *device_output_array = 0;

    event_pair timer;

    // malloc host arrays
    host_input_array = (unsigned int*)malloc(num_bytes);
    host_output_array = (unsigned int*)malloc(num_bytes);
    host_output_checker_array = (unsigned int*)malloc(num_bytes);

    // cudaMalloc device arrays
    const cudaError_t input_alloc = cudaMalloc((void**)&device_input_array, num_bytes);
    const cudaError_t output_alloc = cudaMalloc((void**)&device_output_array, num_bytes);

    // if any memory allocation failed, report an error message and release
    // whatever was successfully allocated before leaving
    if(host_input_array == 0 || host_output_array == 0 || host_output_checker_array == 0 ||
       input_alloc != cudaSuccess || output_alloc != cudaSuccess ||
       device_input_array == 0 || device_output_array == 0)
    {
      printf("couldn't allocate memory\n");
      release_buffers(host_input_array, host_output_array, host_output_checker_array,
                      device_input_array, device_output_array);
      return 1;
    }

    // generate random input string
    unsigned int shift_amount = rand();

    for(int i = 0; i < num_elements; i++)
    {
      host_input_array[i] = (unsigned int)rand();
    }

    // do copies to and from gpu once to get rid of timing weirdness
    // on first time accesses due to driver
    // (the copy back reads device memory that has not been written yet; its
    // contents are discarded and overwritten by the real download below)
    if(!cuda_call_ok(cudaMemcpy(device_input_array, host_input_array, num_bytes, cudaMemcpyHostToDevice), "warm-up copy to gpu") ||
       !cuda_call_ok(cudaMemcpy(host_output_array, device_output_array, num_bytes, cudaMemcpyDeviceToHost), "warm-up copy from gpu"))
    {
      release_buffers(host_input_array, host_output_array, host_output_checker_array,
                      device_input_array, device_output_array);
      return 1;
    }

    start_timer(&timer);
    // copy input to GPU
    if(!cuda_call_ok(cudaMemcpy(device_input_array, host_input_array, num_bytes, cudaMemcpyHostToDevice), "copy to gpu"))
    {
      release_buffers(host_input_array, host_output_array, host_output_checker_array,
                      device_input_array, device_output_array);
      return 1;
    }
    check_launch("copy to gpu");
    stop_timer(&timer,"copy to gpu");

    // choose a number of threads per block
    // we use 512 threads here
    const int block_size = 512;

    // round up so every element is covered by a thread
    const int grid_size = (num_elements + block_size - 1) / block_size;

    start_timer(&timer);
    // launch kernel
    shift_cypher<<<grid_size,block_size>>>(device_input_array, device_output_array, shift_amount, alphabet_max, num_elements);
    check_launch("gpu shift cypher");
    stop_timer(&timer,"gpu shift cypher");

    start_timer(&timer);
    // download and inspect the result on the host:
    if(!cuda_call_ok(cudaMemcpy(host_output_array, device_output_array, num_bytes, cudaMemcpyDeviceToHost), "copy from gpu"))
    {
      release_buffers(host_input_array, host_output_array, host_output_checker_array,
                      device_input_array, device_output_array);
      return 1;
    }
    check_launch("copy from gpu");
    stop_timer(&timer,"copy from gpu");

    start_timer(&timer);
    // generate reference output
    host_shift_cypher(host_input_array, host_output_checker_array, shift_amount, alphabet_max, num_elements);
    stop_timer(&timer,"host shift cypher");

    // check CUDA output versus reference output; one mismatch is enough
    int error = 0;
    for(int i = 0; i < num_elements; i++)
    {
      if(host_output_array[i] != host_output_checker_array[i])
      {
        error = 1;
        break;
      }
    }

    if(error)
    {
      printf("Output of CUDA version and normal version didn't match! \n");
    }
    else
    {
      printf("Worked! CUDA and reference output match. \n");
    }

    // deallocate memory
    release_buffers(host_input_array, host_output_array, host_output_checker_array,
                    device_input_array, device_output_array);

    return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement