#include <cfloat>
#include <chrono>
#include <cstdio>   // printf in the error-checking macros
#include <cstdlib>  // rand
#include <cuda_profiler_api.h>
#include <iostream>

using namespace std;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Wraps a CUDA runtime API call and aborts if it returns an error.
#define SAFE_CALL( CallInstruction ) { \
    cudaError_t cuerr = CallInstruction; \
    if(cuerr != cudaSuccess) { \
        printf("CUDA error: %s at call \"" #CallInstruction "\"\n", cudaGetErrorString(cuerr)); \
        throw "error in CUDA API function, aborting..."; \
    } \
}
// Launches a kernel, checks the launch for errors, then synchronizes and
// checks the execution; control returns to the host only after the kernel
// has finished.
#define SAFE_KERNEL_CALL( KernelCallInstruction ){ \
    KernelCallInstruction; \
    cudaError_t cuerr = cudaGetLastError(); \
    if(cuerr != cudaSuccess) { \
        printf("CUDA error in kernel launch: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
        throw "error in CUDA kernel launch, aborting..."; \
    } \
    cuerr = cudaDeviceSynchronize(); \
    if(cuerr != cudaSuccess) { \
        printf("CUDA error in kernel execution: %s at kernel \"" #KernelCallInstruction "\"\n", cudaGetErrorString(cuerr)); \
        throw "error in CUDA kernel execution, aborting..."; \
    } \
}
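// Usage sketch: every CUDA runtime call below goes through these macros, e.g.
//   SAFE_CALL(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
//   SAFE_KERNEL_CALL((kernel <<< blocks, threads >>> (args)));
// The extra parentheses around the kernel launch keep the commas inside
// <<< >>> and the argument list from splitting the macro argument.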
///////////////////////////////////////////////////////////////////////////////////////////////////////////
void __global__ gather(int *ptrs, int *connections, int *outgoing_ids, int vertices_count, int *data, int *result)
{
    // one warp (32 threads) per source vertex
    const long long src_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    if (src_id < vertices_count)
    {
        const int first_edge_ptr = ptrs[src_id];
        const int connections_count = connections[src_id];
        //connections_count = ptrs[src_id + 1] - ptrs[src_id];
        for(int cur_edge = threadIdx.x % 32; cur_edge < connections_count; cur_edge += 32)
        {
            // first_edge_ptr + cur_edge is the index of the current edge in the edge arrays
            int dst_id = outgoing_ids[first_edge_ptr + cur_edge];
            int val = data[dst_id];
            result[first_edge_ptr + cur_edge] = val;

            // this program can easily be adapted into many graph algorithms, for example:
            /* BFS
            int src_level = data[src_id];
            int dst_level = data[dst_id];
            if((src_level == current_level) && (dst_level == UNVISITED_VERTEX))
            {
                data[dst_id] = current_level + 1;
            }
            */

            /* SSSP
            float weight = outgoing_weights[first_edge_ptr + cur_edge];
            float src_weight = data[src_id];
            float dst_weight = data[dst_id];
            if(dst_weight > src_weight + weight)
            {
                data[dst_id] = src_weight + weight;
            }
            */
        }
    }
}
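// Access-pattern note: lane k of a vertex's warp handles edges k, k+32, k+64, ...
// so consecutive lanes touch consecutive elements of outgoing_ids and result
// (coalesced), while the data[dst_id] reads are scattered and depend on the
// random graph structure.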
- ///////////////////////////////////////////////////////////////////////////////////////////////////////////
- int main()
- {
- int vertices_count = 1024*1024;
- int *ptrs = new int[vertices_count];
- int *data = new int[vertices_count];
- int *connections = new int[vertices_count];
- int pos = 0;
- for(int i = 0; i < vertices_count; i++) // TODO (bonus) граф с несколькими "большими" вершинами
- {
- ptrs[i] = pos;
- connections[i] = 16 + rand()%32;
- pos += connections[i];
- data[i] = rand();
- }
- int edges_count = pos;
- int *outgoing_ids = new int[edges_count];
- int *result = new int[edges_count];
- for(int i = 0; i < edges_count; i++)
- {
- outgoing_ids[i] = rand()%vertices_count;
- }
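    // The arrays above form a CSR-like layout: ptrs[i] is the offset of vertex i's
    // adjacency list inside outgoing_ids, connections[i] is its degree (16..47 here),
    // and data[i] is an arbitrary per-vertex value to gather.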
    int *dev_ptrs; int *dev_connections; int *dev_outgoing_ids; int *dev_data; int *dev_result;
    SAFE_CALL(cudaMalloc((void**)&dev_ptrs, vertices_count*sizeof(int)));
    SAFE_CALL(cudaMalloc((void**)&dev_connections, vertices_count*sizeof(int)));
    SAFE_CALL(cudaMalloc((void**)&dev_data, vertices_count*sizeof(int)));
    SAFE_CALL(cudaMalloc((void**)&dev_outgoing_ids, edges_count*sizeof(int)));
    SAFE_CALL(cudaMalloc((void**)&dev_result, edges_count*sizeof(int)));

    SAFE_CALL(cudaMemcpy(dev_ptrs, ptrs, vertices_count * sizeof(int), cudaMemcpyHostToDevice));
    SAFE_CALL(cudaMemcpy(dev_connections, connections, vertices_count * sizeof(int), cudaMemcpyHostToDevice));
    SAFE_CALL(cudaMemcpy(dev_data, data, vertices_count * sizeof(int), cudaMemcpyHostToDevice));
    SAFE_CALL(cudaMemcpy(dev_outgoing_ids, outgoing_ids, edges_count * sizeof(int), cudaMemcpyHostToDevice));

    dim3 compute_threads(1024);
    dim3 compute_blocks(32*((vertices_count - 1) / compute_threads.x + 1));
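    // Grid sizing: each block has 1024 threads = 32 warps, and one warp serves one
    // vertex, so a block covers 32 vertices; 32*ceil(vertices_count/1024) blocks
    // give the 32*vertices_count threads needed overall.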
    for(int i = 0; i < 5; i++)
    {
        auto start = std::chrono::steady_clock::now();
        SAFE_KERNEL_CALL((gather <<< compute_blocks, compute_threads >>> (dev_ptrs, dev_connections, dev_outgoing_ids, vertices_count, dev_data, dev_result)));
        auto end = std::chrono::steady_clock::now(); // TODO why does this time measurement work?
        std::chrono::duration<double> elapsed_seconds = end-start;
        cout << "time: " << (elapsed_seconds.count())*1000.0 << " ms" << endl;
        cout << "bandwidth: " << 3.0*sizeof(int)*edges_count/((elapsed_seconds.count())*1e9) << " GB/s" << endl << endl;
    }
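    // Measurement notes: host-side steady_clock timing is meaningful here only
    // because SAFE_KERNEL_CALL ends with cudaDeviceSynchronize(); a bare <<<...>>>
    // launch is asynchronous. The bandwidth estimate counts 3 int accesses per
    // edge: reading outgoing_ids, reading data[dst_id], and writing result.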
    int *copy_device_result = new int[edges_count];
    SAFE_CALL(cudaMemcpy(copy_device_result, dev_result, edges_count * sizeof(int), cudaMemcpyDeviceToHost));

    // CPU reference: repeat the gather serially to verify the GPU result
    for(int src_id = 0; src_id < vertices_count; src_id++)
    {
        const int first_edge_ptr = ptrs[src_id];
        const int connections_count = connections[src_id];
        for(int cur_edge = 0; cur_edge < connections_count; cur_edge++)
        {
            int dst_id = outgoing_ids[first_edge_ptr + cur_edge];
            int val = data[dst_id];
            result[first_edge_ptr + cur_edge] = val;
        }
    }
    // TODO check
    int errors_count = 0;
    for(int i = 0; i < edges_count; i++)
    {
        if(result[i] != copy_device_result[i])
        {
            errors_count++;
        }
    }
    cout << errors_count << endl; // expected to print 0

    // TODO what are 3 drawbacks of the current kernel version?
    // TODO profile the current version and draw conclusions about its performance
    // TODO write an optimized version of the kernel
    // TODO (bonus) implement a basic version of the BFS algorithm (define the data structures and implement the algorithm itself)
    cudaFree(dev_data);
    cudaFree(dev_ptrs);
    cudaFree(dev_connections);
    cudaFree(dev_result);
    cudaFree(dev_outgoing_ids);

    delete[] result;
    delete[] copy_device_result;
    delete[] data;
    delete[] ptrs;
    delete[] outgoing_ids;
    delete[] connections;

    return 0;
}
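///////////////////////////////////////////////////////////////////////////////////////////////////////////
// A minimal sketch of the bonus BFS, built from the commented block inside gather().
// Everything below is an assumption, not part of the original assignment code:
// UNVISITED_VERTEX, the levels/current_level naming, and the changed flag are
// hypothetical choices for a level-synchronous BFS.
#define UNVISITED_VERTEX -1

void __global__ bfs_step(int *ptrs, int *connections, int *outgoing_ids, int vertices_count,
                         int *levels, int current_level, int *changed)
{
    // same warp-per-vertex mapping as gather()
    const long long src_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    if (src_id < vertices_count && levels[src_id] == current_level)
    {
        const int first_edge_ptr = ptrs[src_id];
        const int connections_count = connections[src_id];
        for(int cur_edge = threadIdx.x % 32; cur_edge < connections_count; cur_edge += 32)
        {
            int dst_id = outgoing_ids[first_edge_ptr + cur_edge];
            if(levels[dst_id] == UNVISITED_VERTEX)
            {
                levels[dst_id] = current_level + 1; // benign race: all writers store the same level
                *changed = 1;
            }
        }
    }
}

// Host driver sketch: fill levels with UNVISITED_VERTEX, set levels[source] = 0,
// then launch bfs_step for current_level = 0, 1, 2, ... (resetting *changed to 0
// before each launch) until a pass leaves *changed at 0.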