Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
double t_start = omp_get_wtime();

// Copy the CSR graph to both GPUs in parallel: one OpenMP thread per device.
GraphCSR gpu_graph[2];
t1 = omp_get_wtime();
#pragma omp parallel num_threads(2)
{
    int tid = omp_get_thread_num();
    cudaSetDevice(tid);
    user_copy_graph_to_device(csr_graph, gpu_graph[tid]);
}
t2 = omp_get_wtime();
// FIX: this measures a host->device copy; the old message said "Device->host".
cout << "Host->device copy time: " << t2 - t1 << " sec" << endl;

// Run the algorithm, alternating iterations between the two GPUs.
// FIX: cudaDeviceSynchronize() only syncs the calling thread's CURRENT device,
// so with two GPUs we must sync each device explicitly before timing.
for (int device = 0; device < 2; device++)
{
    cudaSetDevice(device);
    cudaDeviceSynchronize();
}
t1 = omp_get_wtime();
int last_source = 0;
cout << "will do " << iterations << " iterations" << endl;
#pragma omp parallel shared(last_source)
{
    int tid = omp_get_thread_num();
    int current_gpu = tid % 2;
    // Each thread binds to one GPU once; the binding is sticky per thread.
    cudaSetDevice(current_gpu);
    // FIX: allocate the per-thread result buffer once, not on every iteration.
    int *local_result = new int[graph.vertices_count];
    #pragma omp for
    for (int i = 0; i < iterations; i++)
    {
        // NOTE(review): rand() is not guaranteed thread-safe; consider a
        // per-thread generator if reproducibility across runs matters.
        int source_vertex = rand() % graph.vertices_count;
        user_algorithm(gpu_graph[current_gpu], local_result, source_vertex);
        if (i == (iterations - 1))
        {
            // FIX: old size expression was n * sizeof(int) * vertices_count,
            // an n-fold overrun of user_result; copy exactly vertices_count ints.
            memcpy(user_result, local_result, graph.vertices_count * sizeof(int));
            last_source = source_vertex;
        }
    }
    delete [] local_result;
}
// Sync both devices so the timed region covers all launched kernels.
for (int device = 0; device < 2; device++)
{
    cudaSetDevice(device);
    cudaDeviceSynchronize();
}
t2 = omp_get_wtime();
double t_end = omp_get_wtime(); // kept: t_start/t_end may be consumed below this chunk
cout << "BFS wall time: " << t2 - t1 << " sec" << endl;
// NOTE(review): free_memory's signature is not visible here; confirm that
// passing &gpu_graph (pointer to array of 2) releases allocations on BOTH devices.
free_memory(&gpu_graph);
Advertisement
Add Comment
Please sign in to add a comment
Advertisement