// Untitled: multi-threaded Kahan summation benchmark built on Boost.Thread.

#include <boost/bind.hpp>
#include <boost/thread.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/timer.hpp>

#include <cassert>
#include <climits>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>


boost::mutex print_mutex;
void print(std::string s) {
    boost::mutex::scoped_lock lock(print_mutex);
    std::cout << s << std::endl << std::flush;
}

template <typename T>
class device_matrix {
    public:
        enum method_invocation {
            NO_METHOD = 0,
            SUM_METHOD = 1,
            SQUARE_METHOD = 2,
            NOOP_METHOD = 3
        };

    private:
        bool m_stop;
        method_invocation m_method; // indicate which method to run
        boost::thread m_thread;
        boost::condition m_cond_workorder_ready; // condition is that work has been ordered, via m_method
        boost::condition m_cond_workorder_finished; // condition that the last work order has been completed, and results can be utilized further
        boost::mutex m_mutex;
        int m_elements;
        T* m_data;
        T m_result;

    public:
        typedef boost::mutex::scoped_lock scoped_lock;

        device_matrix(T* data, unsigned int elements) :
            m_method(NO_METHOD), m_data(data),
            m_elements(elements), m_result(-1), m_stop(false) {
            m_thread = boost::thread(&device_matrix::thread_loop, this);
        }

        ~device_matrix() {
            {
                scoped_lock lock(m_mutex);
                // and a dummy method to get the thread to do nothing
                m_method = NOOP_METHOD;
                // we fake a work order to wake the thread up
                m_cond_workorder_ready.notify_one();
                m_stop = true;
            } // drop the mutex, and then wait for the device thread to die off

            m_thread.join(); // main host waits for device thread to stop here.
        }

        void sum() {
            scoped_lock lock(m_mutex);
            while (m_method != NO_METHOD) {
                m_cond_workorder_finished.wait(lock);
            }
            m_method = SUM_METHOD;
            m_cond_workorder_ready.notify_one();
        }

        void square() {
            scoped_lock lock(m_mutex);
            while (m_method != NO_METHOD) {
                m_cond_workorder_finished.wait(lock);
            }
            m_method = SQUARE_METHOD;
            m_cond_workorder_ready.notify_one();
        }

        T result() {
            scoped_lock lock(m_mutex);
            while (m_method != NO_METHOD) {
                m_cond_workorder_finished.wait(lock);
            }
            return m_result;
        }

    private:

        void do_sum() {
            float sum = 0.0;
            float c = 0.0;
            float y, t;
            for (int i = 0; i < m_elements; i++) {
                y = m_data[i] - c;
                t = sum + y;
                c = (t - sum) - y;
                sum = t;
            }
            m_result = sum;
        }

        void do_square() {
            m_result = 2.0;
        }

        void initialize_cuda() {
            scoped_lock lock(m_mutex);
            // this is where we can initialize cuda
        }

        void thread_loop() {
            initialize_cuda();

            while (1) {
                scoped_lock lock(m_mutex);
                while (m_method == NO_METHOD) m_cond_workorder_ready.wait(lock);
                switch (m_method) {
                    case SUM_METHOD:
                        do_sum();
                        break;
                    case SQUARE_METHOD:
                        do_square();
                        break;
                    default:
                    case NOOP_METHOD:
                    case NO_METHOD:
                        break;
                }
                m_method = NO_METHOD;
                m_cond_workorder_finished.notify_all(); // alert waiting threads, that we are done

                if (m_stop) {
                    return; // ends thread
                }
            }
        }
};

// Returns rand() scaled by RAND_MAX (a value between 0 and 1 inclusive),
// shifted by `add`.
float random_float(float add = 0.0) {
    const float numerator = static_cast<float>(rand());
    const float denominator = static_cast<float>(RAND_MAX);
    const float unit = numerator / denominator;
    return unit + add;
}

int main() {
    // these are powers of two, for easy splits for now
    unsigned int elements = 2 << 27;
    unsigned int number_of_splits = 8;
    float *data = new float[elements];

    std::vector<device_matrix<float>*> matrices;

    boost::timer timer;
    for (int i = 0; i < elements; i++) {
        data[i] = random_float();
    }
    double time_to_init = timer.elapsed();

    // populate and calculate reference result
    // uses kahan summation algorithm.
    float sum = 0.0;
    float c = 0.0;
    float y, t;
    timer.restart();
    for (int i = 0; i < elements; i++) {
        y = data[i] - c;
        t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }

    double time_to_sum_single = timer.elapsed();

    for (int i = 0; i < number_of_splits; i ++) {
        unsigned int data_size = elements/number_of_splits;
        matrices.push_back(new device_matrix<float>(data+(i*data_size), data_size));
    }

    std::cout << "starting computation" << std::endl;

    timer.restart();
    // start up the matrices
    for (int i = 0; i < number_of_splits; i ++) {
        matrices[i]->sum();
    }

    // add up results
    float result = 0.0;
    for (int i = 0; i < number_of_splits; i ++) {
        result += matrices[i]->result();
    }

    double time_to_sum_device = timer.elapsed();

    std::cout << "device_matrix count       " << matrices.size() << std::endl;
    std::cout << "elements                  " << elements << std::endl;
    std::cout << "UINT_MAX                  " << UINT_MAX << std::endl;
    std::cout << "data size total           " << elements*sizeof(float)/1024/1024 << " mb" << std::endl;
    std::cout << "size per device_matrix    " << elements*sizeof(float)/1024/1024/number_of_splits << " mb" << std::endl;

    std::cout.precision(5);
    std::cout.setf(std::ios_base::fixed);
    std::cout << "reference                 " << sum << std::endl;
    std::cout << "result                    " << result << std::endl;
    std::cout.precision(3);
    std::cout << "time taken (init)         " << time_to_init << " secs" << std::endl;
    std::cout << "time taken (single)       " << time_to_sum_single << " secs" << std::endl;
    std::cout << "time taken (threaded)     " << time_to_sum_device << " secs" << std::endl;

    // dealloc shit
    for (int i = 0; i < matrices.size(); i++) {
        delete matrices[i];
    }
    delete[] data;

    return 0;
}