Advertisement
Guest User

Untitled

a guest
Feb 17th, 2011
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.91 KB | None | 0 0
#include <boost/bind.hpp>
#include <boost/thread.hpp>
#include <boost/thread/condition.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/timer.hpp>

#include <cassert>
#include <climits>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <sys/time.h>
  12.  
  13. boost::mutex print_mutex;
  14. void print(const std::string &s) {
  15.     boost::mutex::scoped_lock lock(print_mutex);
  16.     std::cout << s << std::endl;
  17. }
  18. void print(const char *s) {
  19.     boost::mutex::scoped_lock lock(print_mutex);
  20.     std::cout << s << std::endl;
  21. }
  22.  
  23. suseconds_t time_diff(timeval a, timeval b)
  24. {
  25.     suseconds_t r;
  26.     r = (a.tv_sec - b.tv_sec) * 1000000 + (a.tv_usec - b.tv_usec);
  27.     return r;
  28. }
  29.  
  30. template <typename T>
  31. T do_sum(T* data, size_t count) {
  32.     timeval tv_start, tv_end;
  33.     gettimeofday(&tv_start, 0);
  34.     T sum = 0.0;
  35.     T c = 0.0;
  36.     T y, t;
  37.     for (size_t i = 0; i < count; i++) {
  38.         y = data[i] - c;
  39.         t = sum + y;
  40.         c = (t - sum) - y;
  41.         sum = t;
  42.     }
  43.     gettimeofday(&tv_end, 0);
  44.     std::cout << "sum operation completed in " << time_diff(tv_end, tv_start)/1000000.0 << " for " << count << std::endl;
  45.     return sum;
  46. }
  47.  
  48. template <typename T>
  49. class device_matrix {
  50.     public:
  51.         enum method_invocation {
  52.             NO_METHOD = 0,
  53.             SUM_METHOD = 1,
  54.             SQUARE_METHOD = 2,
  55.             NOOP_METHOD = 3
  56.         };
  57.  
  58.     private:
  59.         bool m_stop;
  60.         method_invocation m_method; // indicate which method to run
  61.         boost::condition m_cond_workorder_ready; // condition is that work has been ordered, via m_method
  62.         boost::condition m_cond_workorder_finished; // condition that the last work order has been completed, and results can be utilized further
  63.         boost::mutex m_mutex;
  64.         int m_elements;
  65.         T* m_data;
  66.         T m_result;
  67.         boost::thread m_thread;
  68.  
  69.     public:
  70.         typedef boost::mutex::scoped_lock scoped_lock;
  71.  
  72.         device_matrix(T* data, unsigned int elements) :
  73.             m_method(NO_METHOD),
  74.             m_data(data),
  75.             m_elements(elements),
  76.             m_result(-1),
  77.             m_stop(false),
  78.             m_thread( boost::bind(&device_matrix::thread_loop, this) )
  79.         {
  80.         }
  81.  
  82.         ~device_matrix() {
  83.             {
  84.                 scoped_lock lock(m_mutex);
  85.                 // and a dummy method to get the thread to do nothing
  86.                 m_method = NOOP_METHOD;
  87.                 // we fake a work order to wake the thread up
  88.                 m_cond_workorder_ready.notify_all();
  89.                 m_stop = true;
  90.             } // drop the mutex, and then wait for the device thread to die off
  91.  
  92.             m_thread.join(); // main host waits for device thread to stop here.
  93.         }
  94.  
  95.         void sum() {
  96.             scoped_lock lock(m_mutex);
  97.             while (m_method != NO_METHOD) {
  98.                 m_cond_workorder_finished.wait(lock);
  99.             }
  100.             m_method = SUM_METHOD;
  101.             m_cond_workorder_ready.notify_all();
  102.         }
  103.  
  104.         void square() {
  105.             scoped_lock lock(m_mutex);
  106.             while (m_method != NO_METHOD) {
  107.                 m_cond_workorder_finished.wait(lock);
  108.             }
  109.             m_method = SQUARE_METHOD;
  110.             m_cond_workorder_ready.notify_all();
  111.         }
  112.  
  113.         T result() {
  114.             scoped_lock lock(m_mutex);
  115.             while (m_method != NO_METHOD) {
  116.                 print("waiting for result");
  117.                 m_cond_workorder_finished.wait(lock);
  118.             }
  119.             return m_result;
  120.         }
  121.  
  122.     private:
  123.  
  124.         void do_square() {
  125.             m_result = 2.0;
  126.         }
  127.  
  128.         void initialize_cuda() {
  129.             scoped_lock lock(m_mutex);
  130.             // this is where we can initialize cuda
  131.         }
  132.  
  133.         void thread_loop() {
  134.             initialize_cuda();
  135.  
  136.             while (1) {
  137.                 scoped_lock lock(m_mutex);
  138.                 while (m_method == NO_METHOD) m_cond_workorder_ready.wait(lock);
  139.                 switch (m_method) {
  140.                     case SUM_METHOD:
  141.                         m_result = do_sum(m_data, m_elements);
  142.                         break;
  143.                     case SQUARE_METHOD:
  144.                         do_square();
  145.                         break;
  146.                     default:
  147.                     case NOOP_METHOD:
  148.                     case NO_METHOD:
  149.                         break;
  150.                 }
  151.                 m_method = NO_METHOD;
  152.                 m_cond_workorder_finished.notify_all(); // alert waiting threads, that we are done
  153.                 if (m_stop) {
  154.                     return; // ends thread
  155.                 }
  156.             }
  157.         }
  158. };
  159.  
  160. float random_float(float add = 0.0) {
  161.     return ((float)rand()/(float)RAND_MAX) + add;
  162. }
  163.  
  164. int main() {
  165.     // these are powers of two, for easy splits for now
  166.     unsigned int elements = 2 << 27;
  167.     unsigned int number_of_splits = 4;
  168.     typedef float element_type;
  169.     element_type *data = new element_type[elements];
  170.  
  171.     std::vector<device_matrix<element_type>*> matrices;
  172.  
  173.     boost::timer timer;
  174.     for (int i = 0; i < elements; i++) {
  175.         data[i] = random_float();
  176.     }
  177.     double time_to_init = timer.elapsed();
  178.  
  179.     // populate and calculate reference result
  180.     // uses kahan summation algorithm.
  181.     timer.restart();
  182.     element_type sum = do_sum(data, elements);
  183.     double time_to_sum_single = timer.elapsed();
  184.  
  185.     for (int i = 0; i < number_of_splits; i ++) {
  186.         unsigned int data_size = elements/number_of_splits;
  187.         matrices.push_back(new device_matrix<element_type>(data+(i*data_size), data_size));
  188.     }
  189.  
  190.     std::cout << "starting computation" << std::endl;
  191.  
  192.     timer.restart();
  193.     // start up the matrices
  194.     for (int i = 0; i < number_of_splits; i ++) {
  195.         matrices[i]->sum();
  196.     }
  197.  
  198.     // add up results
  199.     element_type result = 0.0;
  200.     for (int i = 0; i < number_of_splits; i ++) {
  201.         result += matrices[i]->result();
  202.     }
  203.  
  204.     double time_to_sum_device = timer.elapsed();
  205.  
  206.     std::cout << "device_matrix count       " << matrices.size() << std::endl;
  207.     std::cout << "elements                  " << elements << std::endl;
  208.     std::cout << "UINT_MAX                  " << UINT_MAX << std::endl;
  209.     std::cout << "data size total           " << elements*sizeof(float)/1024/1024 << " mb" << std::endl;
  210.     std::cout << "size per device_matrix    " << elements*sizeof(float)/1024/1024/number_of_splits << " mb" << std::endl;
  211.  
  212.     std::cout.precision(5);
  213.     std::cout.setf(std::ios_base::fixed);
  214.     std::cout << "reference                 " << sum << std::endl;
  215.     std::cout << "result                    " << result << std::endl;
  216.     std::cout.precision(3);
  217.     std::cout << "time taken (init)         " << time_to_init << " secs" << std::endl;
  218.     std::cout << "time taken (single)       " << time_to_sum_single << " secs" << std::endl;
  219.     std::cout << "time taken (threaded)     " << time_to_sum_device << " secs" << std::endl;
  220.  
  221.     // dealloc shit
  222.     for (int i = 0; i < matrices.size(); i++) {
  223.         delete matrices[i];
  224.     }
  225.     delete[] data;
  226.  
  227.     return 0;
  228. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement