Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /* This is machine problem 1, part 1, shift cypher
- *
- * The problem is to take in a string of unsigned ints and an int,
- * the shift amount, and add the number to each element of
- * the string, effectively "shifting" each element in the
- * string.
- * SUBMISSION GUIDELINES:
- * You should copy the complete shift_cyper function from your solution
- * into a file called mp1-part1-solution-kernel.cu and submit that file.
- * The function needs to have exactly the same interface (including __global__)
- * as the empty shift_cypher function given below.
- */
- #include <stdlib.h>
- #include <stdio.h>
- #include <ctime>
- #include "mp1-util.h"
- // Repeating from the tutorial, just in case you haven't looked at it.
- // "kernels" or __global__ functions are the entry points to code that executes on the GPU
- // The keyword __global__ indicates to the compiler that this function is a GPU entry point.
- // __global__ functions must return void, and may only be called or "launched" from code that
- // executes on the CPU.
- void host_shift_cypher(unsigned int *input_array, unsigned int *output_array, unsigned int shift_amount, unsigned int alphabet_max, unsigned int array_length)
- {
- for(unsigned int i=0;i<array_length;i++)
- {
- int element = input_array[i];
- int shifted = element + shift_amount;
- if(shifted > alphabet_max)
- {
- shifted = shifted % (alphabet_max + 1);
- }
- output_array[i] = shifted;
- }
- }
- // This kernel implements a per element shift
- __global__ void shift_cypher(unsigned int *input_array, unsigned int *output_array,
- unsigned int shift_amount, unsigned int alphabet_max, unsigned int array_length)
- {
- int gid = blockIdx.x * blockDim.x + threadIdx.x;
- output_array[gid] = (input_array[gid] + shift_amount)%(alphabet_max+1);
- }
- int main(void)
- {
- // initialize
- srand(time(NULL));
- // create arrays of 16M elements
- int num_elements = 1 << 24;
- unsigned int alphabet_max = 45647;
- // compute the size of the arrays in bytes
- int num_bytes = num_elements * sizeof(unsigned int);
- // pointers to host & device arrays
- unsigned int *host_input_array = 0;
- unsigned int *host_output_array = 0;
- unsigned int *host_output_checker_array = 0;
- unsigned int *device_input_array = 0;
- unsigned int *device_output_array = 0;
- event_pair timer;
- // malloc host arrays
- host_input_array = (unsigned int*)malloc(num_bytes);
- host_output_array = (unsigned int*)malloc(num_bytes);
- host_output_checker_array = (unsigned int*)malloc(num_bytes);
- // cudaMalloc device arrays
- cudaMalloc((void**)&device_input_array, num_bytes);
- cudaMalloc((void**)&device_output_array, num_bytes);
- // if either memory allocation failed, report an error message
- if(host_input_array == 0 || host_output_array == 0 || host_output_checker_array == 0 ||
- device_input_array == 0 || device_output_array == 0)
- {
- printf("couldn't allocate memory\n");
- return 1;
- }
- // generate random input string
- unsigned int shift_amount = rand();
- for(int i=0;i< num_elements;i++)
- {
- host_input_array[i] = (unsigned int)rand();
- }
- // do copies to and from gpu once to get rid of timing weirdness
- // on first time accesses due to driver
- cudaMemcpy(device_input_array, host_input_array, num_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(host_output_array, device_output_array, num_bytes, cudaMemcpyDeviceToHost);
- start_timer(&timer);
- // copy input to GPU
- cudaMemcpy(device_input_array, host_input_array, num_bytes, cudaMemcpyHostToDevice);
- check_launch("copy to gpu");
- stop_timer(&timer,"copy to gpu");
- // choose a number of threads per block
- // we use 512 threads here
- int block_size = 512;
- int grid_size = (num_elements + block_size - 1) / block_size;
- start_timer(&timer);
- // launch kernel
- shift_cypher<<<grid_size,block_size>>>(device_input_array, device_output_array, shift_amount, alphabet_max, num_elements);
- check_launch("gpu shift cypher");
- stop_timer(&timer,"gpu shift cypher");
- start_timer(&timer);
- // download and inspect the result on the host:
- cudaMemcpy(host_output_array, device_output_array, num_bytes, cudaMemcpyDeviceToHost);
- check_launch("copy from gpu");
- stop_timer(&timer,"copy from gpu");
- start_timer(&timer);
- // generate reference output
- host_shift_cypher(host_input_array, host_output_checker_array, shift_amount, alphabet_max, num_elements);
- stop_timer(&timer,"host shift cypher");
- // check CUDA output versus reference output
- int error = 0;
- for(int i=0;i<num_elements;i++)
- {
- if(host_output_array[i] != host_output_checker_array[i])
- {
- error = 1;
- }
- }
- if(error)
- {
- printf("Output of CUDA version and normal version didn't match! \n");
- }
- else
- {
- printf("Worked! CUDA and reference output match. \n");
- }
- // deallocate memory
- free(host_input_array);
- free(host_output_array);
- free(host_output_checker_array);
- cudaFree(device_input_array);
- cudaFree(device_output_array);
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement