// gpu-cpu-example.cpp

#include <algorithm>
#include <atomic>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <deque>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <boost/compute/core.hpp>
#include <boost/compute/types/struct.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/container/vector.hpp>


namespace compute = boost::compute;

// Stream insertion for an OpenCL device: writes "<device name> (platform: <platform name>)"
// to `os` and returns the same stream so that insertions can be chained.
std::ostream &operator<<(std::ostream &os, const compute::device & device)
{
    os << device.name();
    os << " (platform: " << device.platform().name() << ")";
    return os;
}

// Maze storage: a grid of 21 rows by 31 columns flattened row by row into a
// single array, so cell (r, c) lives at index r * 31 + c (see pos2int).
// Each entry is either WALL (a wall cell) or the number of times the bug has
// visited that free cell so far (EMPTY, i.e. 0, before the walk starts).
// The same struct is registered with Boost.Compute below and is copied to and
// from the device, so its single member must stay in sync with that registration.
struct Field
{
    unsigned nVis[21 * 31];
};

// Cell encoding helpers and construction of the initial maze state.
// pos2int maps a (row, column) pair to the flat index used by Field::nVis.
#define pos2int(r, c) ((r) * 31 + (c))
// Value stored in wall cells.
#define WALL (UINT_MAX)
// Initial visit counter of a free cell.
#define EMPTY (0)
// Builds a Field from the boolean wall map `f` (true = wall): every wall cell
// becomes WALL and every free cell starts with EMPTY visits.
// `f` must provide at least 21 rows of at least 31 entries each; no bounds
// checking is performed.
Field setField(const std::vector<std::vector<bool>> & f)
{
    Field field;
    for (int row = 0; row < 21; ++row)
    {
        const std::vector<bool> &line = f[row];
        for (int col = 0; col < 31; ++col)
        {
            if (line[col])
                field.nVis[pos2int(row, col)] = WALL;
            else
                field.nVis[pos2int(row, col)] = EMPTY;
        }
    }
    return field;
}

// accumulate the answer for calculated maze (sum of number of visits for each cell)
long long accumulateTheAnswer(const Field &f)
{
    long long res = 0;
    for (int i = 0; i < 21 * 31; i++)
        if (f.nVis[i] != WALL)
            res += f.nVis[i];
    return res;
}

// CPU implementation of the bug's walk through the maze, updating `a` in place.
// The bug starts in cell 1 * 31 + 1 heading towards +31 (one row down) and
// walks until it stands on cell 618. On every step it increments the visit
// counter of its current cell and then moves to the neighbour with the
// smallest counter: the cell straight ahead wins ties, the remaining
// candidates are probed in the order +31, +1, -31, -1 and only replace the
// current choice when strictly smaller.
// The loop performs no bounds checks and only ends when cell 618 is reached.
__attribute__((noinline))
void bugMovingCPU(Field &a)
{
    const int exitCell = 618;
    int cell = 1 * 31 + 1;
    int step = +31;
    while (cell != exitCell)
    {
        ++a.nVis[cell];
        unsigned fewest = a.nVis[cell + step];
        // Switch to the neighbour at offset `candidate` if it has been visited less often.
        const auto consider = [&](const int candidate)
        {
            const unsigned visits = a.nVis[cell + candidate];
            if (visits < fewest)
            {
                step = candidate;
                fewest = visits;
            }
        };
        consider(+31);
        consider(+1);
        consider(-31);
        consider(-1);
        cell += step;
    }
}

// Register Field with Boost.Compute (device-side type name `Field`, single
// member `nVis`) so that it can serve as the element type of
// compute::vector<Field> and as the argument/return type of bugMovingGPU.
BOOST_COMPUTE_ADAPT_STRUCT(Field, Field, (nVis))

// Device-side counterpart of bugMovingCPU, declared through Boost.Compute as a
// function named bugMovingGPU that takes a Field by value and returns the
// updated copy. The body is the same walk as on the CPU: start in cell
// 1 * 31 + 1 heading towards +31, increment the current cell's counter, move
// to the least-visited neighbour (cell ahead wins ties, then +31, +1, -31, -1
// are probed with a strict comparison) and stop on cell 618.
// main passes this function to compute::transform, which applies it to every
// maze stored in the device vector.
// NOTE(review): like the CPU version, the loop has no bounds checks and only
// terminates once cell 618 is reached -- presumably the input maze is expected
// to be walled in with a reachable exit; confirm against the input files.
BOOST_COMPUTE_FUNCTION(Field, bugMovingGPU, (Field a),
{
    int pos = 1 * 31 + 1, dir = +31;
    while (pos != 618)
    {
        ++a.nVis[pos];
        unsigned val = a.nVis[pos + dir];
        { unsigned next = a.nVis[pos + 31]; if (next < val) dir = +31, val = next; }
        { unsigned next = a.nVis[pos +  1]; if (next < val) dir =  +1, val = next; }
        { unsigned next = a.nVis[pos - 31]; if (next < val) dir = -31, val = next; }
        { unsigned next = a.nVis[pos -  1]; if (next < val) dir =  -1, val = next; }
        pos += dir;
    }
    return a;
});

// Returns a monotonic timestamp in nanoseconds.
// The value is measured from the (unspecified) epoch of std::chrono::steady_clock,
// so it is only meaningful as a difference between two calls -- which is how
// the benchmark uses it.
// steady_clock is used because it never jumps backwards, and the explicit
// duration_cast guarantees the nanosecond unit regardless of the clock's
// native tick period (the raw count() of a clock is not necessarily in ns).
long long currTimeInNanos()
{
    using namespace std::chrono;
    return duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()).count();
}

// Formats a number with '.' as thousands separator, e.g. 1234567890 -> "1.234.567.890".
// The value is converted with std::to_string and its characters are grouped in
// threes counted from the right. A leading minus sign is kept in front of the
// digits and is not counted as part of a group, so -1234 yields "-1.234"
// (grouping the sign as if it were a digit would produce "-.123" for -123).
// Intended for integer types; x is whatever std::to_string accepts.
template<typename T> std::string printWithDots(T x)
{
    const std::string digits = std::to_string(x);
    const std::size_t signLen = (!digits.empty() && digits.front() == '-') ? 1 : 0;
    const std::size_t nDigits = digits.size() - signLen;

    std::string out;
    out.reserve(digits.size() + nDigits / 3);
    out.append(digits, 0, signLen);
    for (std::size_t i = 0; i < nDigits; ++i)
    {
        // a separator goes in front of every character that starts a full group of three
        if (i != 0 && (nDigits - i) % 3 == 0)
            out += '.';
        out += digits[signLen + i];
    }
    return out;
}

int main(int argc, const char *argv[])
{
    if (argc != 3)
    {
        std::cout << "Usage: ./main <num copies> <input file>" << std::endl;
        return 0;
    }
    const int nFields = std::atoi(argv[1]);
    std::cout << "   nFields: " << nFields << std::endl;
    const char *inputFileName = argv[2];
    std::cout << "input file: '" << inputFileName << "'" << std::endl;
    // reading of the maze from the input file:
    std::ifstream fin(inputFileName);
    if (!fin)
    {
        std::cout << "Can't open the file '" << inputFileName << "'" << std::endl;
        return 0;
    }
    std::vector<std::vector<bool>> f(21, std::vector<bool>(31, 1));
    for (int i = 0; i < 21; i++)
        for (int j = 0; j < 31; j++)
        {
            char ch; fin >> ch;
            f[i][j] = (ch == '#');
        }

    // cycle over all of the devices:
    std::cout << "\nList of available devices:\n";
    for (int i = 0; auto device : compute::system::devices())
        std::cout << std::setw(4) << i++ << ":\t" << device << std::endl;

    // start CPU jobs by creating vector of threads, each of them will do some work independently
    const int nCPUThreads = std::thread::hardware_concurrency();
    std::atomic<bool> gpuFinished{false};
    std::vector<std::thread> threads;
    std::vector<std::deque<Field>> cpuResults(nCPUThreads);
    for (int id = 0; id < nCPUThreads; id++)
        threads.emplace_back([threadId = id, &gpuFinished, &f, &cpuResults]()
        {
            while (!gpuFinished)
            {
                // requesting new field:
                Field currField = setField(f);
                // calculating new field:
                bugMovingCPU(currField);
                // adding this field in a vector of results:
                cpuResults[threadId].push_back(currField);
            }
        });

    std::cout << "\nNumber of CPU threads: " << nCPUThreads << std::endl;

    // get the default device
    compute::device device = compute::system::default_device();
    std::cout << "\nDefault device will be used:\n\t" << device << std::endl;
    compute::context context(device);
    compute::command_queue queue(context, device);

    // start the measuring of runtime:
    long long start = currTimeInNanos();

    // create vector of `nFields` copies of input field:
    std::vector<Field> host_vector(nFields, setField(f));

    // create a vector on the device
    compute::vector<Field> device_vector(host_vector.size(), context);

    // transfer data from the host to the device
    compute::copy(host_vector.begin(), host_vector.end(), device_vector.begin(), queue);

    // calculate the answer for each field on GPU:
    compute::transform(
        device_vector.begin(), device_vector.end(), // input range
        device_vector.begin(), // begin of output range
        bugMovingGPU, queue // the lambda function which will be applied to each maze, and device's queue
    );

    // copy calculated mazes back to the host
    compute::copy(device_vector.begin(), device_vector.end(), host_vector.begin(), queue);

    // GPU has been finished here:
    gpuFinished = true;

    // waiting while threads are not finished:
    for (auto &t : threads)
        if (t.joinable())
            t.join();

    // finish the measuring of runtime:
    long long finish = currTimeInNanos();
    double runtime = (finish - start) * 1e-9;

    // calculating number of completed ops on GPU:
    long long totalGPU = 0;
    for (const auto &it : host_vector)
        totalGPU += accumulateTheAnswer(it);

    // calculating number of completed ops on CPU:
    long long totalCPU = 0;
    for (const auto &results : cpuResults)
        for (const auto &it : results)
            totalCPU += accumulateTheAnswer(it);

    // the speed of GPU and CPU:
    long long speedGPU = totalGPU / runtime;
    long long speedCPU = totalCPU / runtime;

    // print the statistics:
    std::cout << "\nCompleted during " << runtime << " seconds:\n";
    std::cout << "  GPU ops: " << std::setw(18) << printWithDots(totalGPU) << std::endl;
    std::cout << "  CPU ops: " << std::setw(18) << printWithDots(totalCPU) << std::endl;
    std::cout << "  Sum ops: " << std::setw(18) << printWithDots(totalCPU+totalGPU) << std::endl;
    std::cout << "\nProductivity per 1 second:\n";
    std::cout << "GPU Speed: " << std::setw(18) << printWithDots(speedGPU) << "/s" << std::endl;
    std::cout << "CPU Speed: " << std::setw(18) << printWithDots(speedCPU) << "/s" << std::endl;
    std::cout << "Sum Speed: " << std::setw(18) << printWithDots(speedGPU+speedCPU) << "/s" << std::endl;
    return 0;
}