Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <sys/time.h>

#include <cassert>
#include <climits>
#include <iostream>
#include <string>
#include <vector>

#include <boost/bind.hpp>
#include <boost/thread.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/timer.hpp>
- boost::mutex print_mutex;
- void print(const std::string &s) {
- boost::mutex::scoped_lock lock(print_mutex);
- std::cout << s << std::endl;
- }
- void print(const char *s) {
- boost::mutex::scoped_lock lock(print_mutex);
- std::cout << s << std::endl;
- }
// Returns the interval (a - b) in microseconds.
//
// The seconds delta is widened to long long before the multiply by
// 1,000,000: on platforms where time_t/suseconds_t are 32-bit, the
// original expression overflowed for intervals longer than ~35 minutes.
// The final value is narrowed back to suseconds_t to keep the caller-
// visible signature unchanged.
suseconds_t time_diff(timeval a, timeval b)
{
    long long usec = (long long)(a.tv_sec - b.tv_sec) * 1000000LL
                   + (long long)(a.tv_usec - b.tv_usec);
    return (suseconds_t)usec;
}
// Kahan (compensated) summation over data[0..count).  Wall-clock time
// for the loop is measured with gettimeofday and reported on std::cout;
// the compensated sum is returned.  The compensation term recovers the
// low-order bits that a plain `sum += data[i]` would discard.
template <typename T>
T do_sum(T* data, size_t count) {
    timeval t_begin, t_finish;
    gettimeofday(&t_begin, 0);

    T total = 0.0;   // running compensated sum
    T carry = 0.0;   // accumulated rounding error from prior additions
    for (size_t idx = 0; idx < count; idx++) {
        T adjusted  = data[idx] - carry;   // fold the correction into the next term
        T candidate = total + adjusted;
        carry = (candidate - total) - adjusted; // bits lost by the addition above
        total = candidate;
    }

    gettimeofday(&t_finish, 0);
    // Elapsed microseconds, computed inline (sec delta scaled + usec delta).
    suseconds_t elapsed_us = (t_finish.tv_sec - t_begin.tv_sec) * 1000000
                           + (t_finish.tv_usec - t_begin.tv_usec);
    std::cout << "sum operation completed in " << elapsed_us/1000000.0 << " for " << count << std::endl;
    return total;
}
- template <typename T>
- class device_matrix {
- public:
- enum method_invocation {
- NO_METHOD = 0,
- SUM_METHOD = 1,
- SQUARE_METHOD = 2,
- NOOP_METHOD = 3
- };
- private:
- bool m_stop;
- method_invocation m_method; // indicate which method to run
- boost::condition m_cond_workorder_ready; // condition is that work has been ordered, via m_method
- boost::condition m_cond_workorder_finished; // condition that the last work order has been completed, and results can be utilized further
- boost::mutex m_mutex;
- int m_elements;
- T* m_data;
- T m_result;
- boost::thread m_thread;
- public:
- typedef boost::mutex::scoped_lock scoped_lock;
- device_matrix(T* data, unsigned int elements) :
- m_method(NO_METHOD),
- m_data(data),
- m_elements(elements),
- m_result(-1),
- m_stop(false),
- m_thread( boost::bind(&device_matrix::thread_loop, this) )
- {
- }
- ~device_matrix() {
- {
- scoped_lock lock(m_mutex);
- // and a dummy method to get the thread to do nothing
- m_method = NOOP_METHOD;
- // we fake a work order to wake the thread up
- m_cond_workorder_ready.notify_all();
- m_stop = true;
- } // drop the mutex, and then wait for the device thread to die off
- m_thread.join(); // main host waits for device thread to stop here.
- }
- void sum() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SUM_METHOD;
- m_cond_workorder_ready.notify_all();
- }
- void square() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SQUARE_METHOD;
- m_cond_workorder_ready.notify_all();
- }
- T result() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- print("waiting for result");
- m_cond_workorder_finished.wait(lock);
- }
- return m_result;
- }
- private:
- void do_square() {
- m_result = 2.0;
- }
- void initialize_cuda() {
- scoped_lock lock(m_mutex);
- // this is where we can initialize cuda
- }
- void thread_loop() {
- initialize_cuda();
- while (1) {
- scoped_lock lock(m_mutex);
- while (m_method == NO_METHOD) m_cond_workorder_ready.wait(lock);
- switch (m_method) {
- case SUM_METHOD:
- m_result = do_sum(m_data, m_elements);
- break;
- case SQUARE_METHOD:
- do_square();
- break;
- default:
- case NOOP_METHOD:
- case NO_METHOD:
- break;
- }
- m_method = NO_METHOD;
- m_cond_workorder_finished.notify_all(); // alert waiting threads, that we are done
- if (m_stop) {
- return; // ends thread
- }
- }
- }
- };
// Pseudo-random float in [add, add + 1], derived from rand()/RAND_MAX.
float random_float(float add = 0.0) {
    const float unit = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    return unit + add;
}
- int main() {
- // these are powers of two, for easy splits for now
- unsigned int elements = 2 << 27;
- unsigned int number_of_splits = 4;
- typedef float element_type;
- element_type *data = new element_type[elements];
- std::vector<device_matrix<element_type>*> matrices;
- boost::timer timer;
- for (int i = 0; i < elements; i++) {
- data[i] = random_float();
- }
- double time_to_init = timer.elapsed();
- // populate and calculate reference result
- // uses kahan summation algorithm.
- timer.restart();
- element_type sum = do_sum(data, elements);
- double time_to_sum_single = timer.elapsed();
- for (int i = 0; i < number_of_splits; i ++) {
- unsigned int data_size = elements/number_of_splits;
- matrices.push_back(new device_matrix<element_type>(data+(i*data_size), data_size));
- }
- std::cout << "starting computation" << std::endl;
- timer.restart();
- // start up the matrices
- for (int i = 0; i < number_of_splits; i ++) {
- matrices[i]->sum();
- }
- // add up results
- element_type result = 0.0;
- for (int i = 0; i < number_of_splits; i ++) {
- result += matrices[i]->result();
- }
- double time_to_sum_device = timer.elapsed();
- std::cout << "device_matrix count " << matrices.size() << std::endl;
- std::cout << "elements " << elements << std::endl;
- std::cout << "UINT_MAX " << UINT_MAX << std::endl;
- std::cout << "data size total " << elements*sizeof(float)/1024/1024 << " mb" << std::endl;
- std::cout << "size per device_matrix " << elements*sizeof(float)/1024/1024/number_of_splits << " mb" << std::endl;
- std::cout.precision(5);
- std::cout.setf(std::ios_base::fixed);
- std::cout << "reference " << sum << std::endl;
- std::cout << "result " << result << std::endl;
- std::cout.precision(3);
- std::cout << "time taken (init) " << time_to_init << " secs" << std::endl;
- std::cout << "time taken (single) " << time_to_sum_single << " secs" << std::endl;
- std::cout << "time taken (threaded) " << time_to_sum_device << " secs" << std::endl;
- // dealloc shit
- for (int i = 0; i < matrices.size(); i++) {
- delete matrices[i];
- }
- delete[] data;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement