Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <sys/time.h>

#include <cassert>
#include <climits>
#include <iostream>
#include <string>
#include <vector>

#include <boost/bind.hpp>
#include <boost/thread.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/timer.hpp>
- boost::mutex print_mutex;
- void print(const std::string &s) {
- boost::mutex::scoped_lock lock(print_mutex);
- std::cout << s << std::endl;
- }
- void print(const char *s) {
- boost::mutex::scoped_lock lock(print_mutex);
- std::cout << s << std::endl;
- }
// Returns the interval (a - b) in microseconds.
//
// The seconds delta is widened to long long before the multiply by
// 1,000,000: on platforms where time_t/suseconds_t are 32-bit, the
// original expression overflowed for intervals longer than ~35 minutes.
// The final value is narrowed back to suseconds_t to keep the caller-
// visible signature unchanged.
suseconds_t time_diff(timeval a, timeval b)
{
    long long usec = (long long)(a.tv_sec - b.tv_sec) * 1000000LL
                   + (long long)(a.tv_usec - b.tv_usec);
    return (suseconds_t)usec;
}
// Kahan (compensated) summation over data[0..count).  Wall-clock time
// for the loop is measured with gettimeofday and reported on std::cout;
// the compensated sum is returned.  The compensation term recovers the
// low-order bits that a plain `sum += data[i]` would discard.
template <typename T>
T do_sum(T* data, size_t count) {
    timeval t_begin, t_finish;
    gettimeofday(&t_begin, 0);

    T total = 0.0;   // running compensated sum
    T carry = 0.0;   // accumulated rounding error from prior additions
    for (size_t idx = 0; idx < count; idx++) {
        T adjusted  = data[idx] - carry;   // fold the correction into the next term
        T candidate = total + adjusted;
        carry = (candidate - total) - adjusted; // bits lost by the addition above
        total = candidate;
    }

    gettimeofday(&t_finish, 0);
    // Elapsed microseconds, computed inline (sec delta scaled + usec delta).
    suseconds_t elapsed_us = (t_finish.tv_sec - t_begin.tv_sec) * 1000000
                           + (t_finish.tv_usec - t_begin.tv_usec);
    std::cout << "sum operation completed in " << elapsed_us/1000000.0 << " for " << count << std::endl;
    return total;
}
- template <typename T>
- class device_matrix {
- public:
- enum method_invocation {
- NO_METHOD = 0,
- SUM_METHOD = 1,
- SQUARE_METHOD = 2,
- NOOP_METHOD = 3
- };
- private:
- bool m_stop;
- method_invocation m_method; // indicate which method to run
- boost::condition m_cond_workorder_ready; // condition is that work has been ordered, via m_method
- boost::condition m_cond_workorder_finished; // condition that the last work order has been completed, and results can be utilized further
- boost::mutex m_mutex;
- int m_elements;
- T* m_data;
- T m_result;
- boost::thread m_thread;
- public:
- typedef boost::mutex::scoped_lock scoped_lock;
- device_matrix(T* data, unsigned int elements) :
- m_method(NO_METHOD),
- m_data(data),
- m_elements(elements),
- m_result(-1),
- m_stop(false),
- m_thread( boost::bind(&device_matrix::thread_loop, this) )
- {
- }
- ~device_matrix() {
- {
- scoped_lock lock(m_mutex);
- // and a dummy method to get the thread to do nothing
- m_method = NOOP_METHOD;
- // we fake a work order to wake the thread up
- m_cond_workorder_ready.notify_all();
- m_stop = true;
- } // drop the mutex, and then wait for the device thread to die off
- m_thread.join(); // main host waits for device thread to stop here.
- }
- void sum() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SUM_METHOD;
- m_cond_workorder_ready.notify_all();
- }
- void square() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- m_cond_workorder_finished.wait(lock);
- }
- m_method = SQUARE_METHOD;
- m_cond_workorder_ready.notify_all();
- }
- T result() {
- scoped_lock lock(m_mutex);
- while (m_method != NO_METHOD) {
- print("waiting for result");
- m_cond_workorder_finished.wait(lock);
- }
- return m_result;
- }
- private:
- void do_square() {
- m_result = 2.0;
- }
- void initialize_cuda() {
- scoped_lock lock(m_mutex);
- // this is where we can initialize cuda
- }
- void thread_loop() {
- initialize_cuda();
- while (1) {
- scoped_lock lock(m_mutex);
- while (m_method == NO_METHOD) m_cond_workorder_ready.wait(lock);
- switch (m_method) {
- case SUM_METHOD:
- m_result = do_sum(m_data, m_elements);
- break;
- case SQUARE_METHOD:
- do_square();
- break;
- default:
- case NOOP_METHOD:
- case NO_METHOD:
- break;
- }
- m_method = NO_METHOD;
- m_cond_workorder_finished.notify_all(); // alert waiting threads, that we are done
- if (m_stop) {
- return; // ends thread
- }
- }
- }
- };
// Pseudo-random float in [add, add + 1], derived from rand()/RAND_MAX.
float random_float(float add = 0.0) {
    const float unit = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    return unit + add;
}
- int main() {
- // these are powers of two, for easy splits for now
- unsigned int elements = 2 << 27;
- unsigned int number_of_splits = 4;
- typedef float element_type;
- element_type *data = new element_type[elements];
- std::vector<device_matrix<element_type>*> matrices;
- boost::timer timer;
- for (int i = 0; i < elements; i++) {
- data[i] = random_float();
- }
- double time_to_init = timer.elapsed();
- // populate and calculate reference result
- // uses kahan summation algorithm.
- timer.restart();
- element_type sum = do_sum(data, elements);
- double time_to_sum_single = timer.elapsed();
- for (int i = 0; i < number_of_splits; i ++) {
- unsigned int data_size = elements/number_of_splits;
- matrices.push_back(new device_matrix<element_type>(data+(i*data_size), data_size));
- }
- std::cout << "starting computation" << std::endl;
- timer.restart();
- // start up the matrices
- for (int i = 0; i < number_of_splits; i ++) {
- matrices[i]->sum();
- }
- // add up results
- element_type result = 0.0;
- for (int i = 0; i < number_of_splits; i ++) {
- result += matrices[i]->result();
- }
- double time_to_sum_device = timer.elapsed();
- std::cout << "device_matrix count " << matrices.size() << std::endl;
- std::cout << "elements " << elements << std::endl;
- std::cout << "UINT_MAX " << UINT_MAX << std::endl;
- std::cout << "data size total " << elements*sizeof(float)/1024/1024 << " mb" << std::endl;
- std::cout << "size per device_matrix " << elements*sizeof(float)/1024/1024/number_of_splits << " mb" << std::endl;
- std::cout.precision(5);
- std::cout.setf(std::ios_base::fixed);
- std::cout << "reference " << sum << std::endl;
- std::cout << "result " << result << std::endl;
- std::cout.precision(3);
- std::cout << "time taken (init) " << time_to_init << " secs" << std::endl;
- std::cout << "time taken (single) " << time_to_sum_single << " secs" << std::endl;
- std::cout << "time taken (threaded) " << time_to_sum_device << " secs" << std::endl;
- // dealloc shit
- for (int i = 0; i < matrices.size(); i++) {
- delete matrices[i];
- }
- delete[] data;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement