Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cassert>
#include <climits>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <boost/bind.hpp>
#include <boost/thread.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/timer.hpp>
- boost::mutex print_mutex;
- void print(std::string s) {
- boost::mutex::scoped_lock lock(print_mutex);
- std::cout << s << std::endl << std::flush;
- }
- template <typename T>
- class device_matrix {
- public:
- enum method_invocation {
- NO_METHOD = 0,
- SUM_METHOD = 1,
- SQUARE_METHOD = 2,
- NOOP_METHOD = 3
- };
- private:
- bool m_stop;
- method_invocation m_method; // indicate which method to run
- boost::thread m_thread;
- boost::condition m_cond_workorder_ready; // condition is that work has been ordered, via m_method
- boost::condition m_cond_workorder_finished; // condition that the last work order has been completed, and results can be utilized further
- boost::mutex m_mutex;
- int m_elements;
- T* m_data;
- T m_result;
- public:
- typedef boost::mutex::scoped_lock scoped_lock;
- device_matrix(T* data, unsigned int elements) :
- m_method(NO_METHOD), m_data(data),
- m_elements(elements), m_result(-1), m_stop(false) {
- m_thread = boost::thread(&device_matrix::thread_loop, this);
- }
- ~device_matrix() {
- {
- scoped_lock lock(m_mutex);
- // and a dummy method to get the thread to do nothing
- m_method = NOOP_METHOD;
- // we fake a work order to wake the thread up
- m_cond_workorder_ready.notify_one();
- m_stop = true;
- } // drop the mutex, and then wait for the device thread to die off
- m_thread.join(); // main host waits for device thread to stop here.
- }
- void sum() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SUM_METHOD;
- m_cond_workorder_ready.notify_one();
- }
- void square() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SQUARE_METHOD;
- m_cond_workorder_ready.notify_one();
- }
- T result() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- return m_result;
- }
- private:
- void do_sum() {
- float sum = 0.0;
- float c = 0.0;
- float y, t;
- for (int i = 0; i < m_elements; i++) {
- y = m_data[i] - c;
- t = sum + y;
- c = (t - sum) - y;
- sum = t;
- }
- m_result = sum;
- }
- void do_square() {
- m_result = 2.0;
- }
- void initialize_cuda() {
- scoped_lock lock(m_mutex);
- // this is where we can initialize cuda
- }
- void thread_loop() {
- initialize_cuda();
- while (1) {
- scoped_lock lock(m_mutex);
- while (m_method == NO_METHOD) m_cond_workorder_ready.wait(lock);
- switch (m_method) {
- case SUM_METHOD:
- do_sum();
- break;
- case SQUARE_METHOD:
- do_square();
- break;
- default:
- case NOOP_METHOD:
- case NO_METHOD:
- break;
- }
- m_method = NO_METHOD;
- m_cond_workorder_finished.notify_all(); // alert waiting threads, that we are done
- if (m_stop) {
- return; // ends thread
- }
- }
- }
- };
// Returns a pseudo-random value rand()/RAND_MAX, i.e. within [0, 1],
// shifted by `add`.
float random_float(float add = 0.0) {
    const float numerator = static_cast<float>(rand());
    const float denominator = static_cast<float>(RAND_MAX);
    const float unit = numerator / denominator;
    return unit + add;
}
- int main() {
- // these are powers of two, for easy splits for now
- unsigned int elements = 2 << 27;
- unsigned int number_of_splits = 8;
- float *data = new float[elements];
- std::vector<device_matrix<float>*> matrices;
- boost::timer timer;
- for (int i = 0; i < elements; i++) {
- data[i] = random_float();
- }
- double time_to_init = timer.elapsed();
- // populate and calculate reference result
- // uses kahan summation algorithm.
- float sum = 0.0;
- float c = 0.0;
- float y, t;
- timer.restart();
- for (int i = 0; i < elements; i++) {
- y = data[i] - c;
- t = sum + y;
- c = (t - sum) - y;
- sum = t;
- }
- double time_to_sum_single = timer.elapsed();
- for (int i = 0; i < number_of_splits; i ++) {
- unsigned int data_size = elements/number_of_splits;
- matrices.push_back(new device_matrix<float>(data+(i*data_size), data_size));
- }
- std::cout << "starting computation" << std::endl;
- timer.restart();
- // start up the matrices
- for (int i = 0; i < number_of_splits; i ++) {
- matrices[i]->sum();
- }
- // add up results
- float result = 0.0;
- for (int i = 0; i < number_of_splits; i ++) {
- result += matrices[i]->result();
- }
- double time_to_sum_device = timer.elapsed();
- std::cout << "device_matrix count " << matrices.size() << std::endl;
- std::cout << "elements " << elements << std::endl;
- std::cout << "UINT_MAX " << UINT_MAX << std::endl;
- std::cout << "data size total " << elements*sizeof(float)/1024/1024 << " mb" << std::endl;
- std::cout << "size per device_matrix " << elements*sizeof(float)/1024/1024/number_of_splits << " mb" << std::endl;
- std::cout.precision(5);
- std::cout.setf(std::ios_base::fixed);
- std::cout << "reference " << sum << std::endl;
- std::cout << "result " << result << std::endl;
- std::cout.precision(3);
- std::cout << "time taken (init) " << time_to_init << " secs" << std::endl;
- std::cout << "time taken (single) " << time_to_sum_single << " secs" << std::endl;
- std::cout << "time taken (threaded) " << time_to_sum_device << " secs" << std::endl;
- // dealloc shit
- for (int i = 0; i < matrices.size(); i++) {
- delete matrices[i];
- }
- delete[] data;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement