Advertisement
Guest User

Untitled

a guest
Dec 29th, 2017
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.37 KB | None | 0 0
  1. // Either remove "stdafx.h" on non-Windows platforms, or create an empty file
  2. // with that name.
  3. #include "stdafx.h"
  4.  
  5. #ifdef WIN32
  6. #include <windows.h>
  7. #else
  8. #include <pthread.h>
  9. #include <numa.h>
  10. #endif
  11.  
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iomanip>
#include <iostream>
#include <memory>
#include <mutex>
#include <new>
#include <set>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
  27.  
  28. using namespace std;
  29.  
  30. // SMT on x86, tell the CPU-thread to relax a bit while busy waiting
  31. #ifdef WIN32
  32. #define cpu_relax() __asm { pause }
  33. #else
  34. #if defined(__i386__) || defined(__x86_64)
  35. #define cpu_relax() asm volatile("pause" ::: "memory")
  36. #else
  37. #define cpu_relax() asm volatile ("" ::: "memory")
  38. #endif
  39. #endif
  40.  
  41. typedef uint32_t num_t;
  42. typedef uint32_t cpu_id;
  43. typedef enum tse
  44.     {
  45.     THR_READY,
  46.     THR_STEADY,
  47.     THR_GO
  48.     } thr_state_e;
  49.  
  50. // Global state to implement a rendevouz for the two communicating threads
  51. mutex                   mtx;
  52. condition_variable      cv;
  53. thr_state_e             thr_state;
  54.  
  55. static void rendevouz(void)
  56.     {
  57.     unique_lock<mutex> lock(mtx);
  58.  
  59.     if (thr_state == THR_READY)
  60.         {
  61.         thr_state = THR_STEADY;
  62.         cv.wait(lock, [] { return thr_state == THR_GO; });
  63.         }
  64.     else
  65.         {
  66.         thr_state = THR_GO;
  67.         cv.notify_all();
  68.         }
  69.     }
  70.  
  71. [[noreturn]] static void panic(const char * desc)
  72.     {
  73.     perror(desc);
  74.     exit(EXIT_FAILURE);
  75.     }
  76.  
  77. // Binds a software thread to a specific CPU-thread
  78. static void cpu_bind
  79.     (
  80.     cpu_id c
  81.     )
  82.     {
  83. #ifdef WIN32
  84.     if (SetThreadAffinityMask(GetCurrentThread(), 1UL << c) == 0)
  85.         {
  86.         panic("SetThreadAffinityMask() failed");
  87.         }
  88. #else
  89.     cpu_set_t cpu;
  90.  
  91.     CPU_ZERO(&cpu);
  92.     CPU_SET(c, &cpu);
  93.     if (pthread_setaffinity_np(pthread_self(), sizeof cpu, &cpu) != 0)
  94.         {
  95.         panic("pthread_setaffinity_np() failed");
  96.         }
  97. #endif
  98.     }
  99.  
  100. void worker
  101.     (
  102.     atomic<num_t> &ch,
  103.     atomic<bool> &is_done,
  104.     cpu_id cpu_thread,
  105.     uint32_t *latency
  106.     )
  107.     {
  108.     num_t cnt = (latency == nullptr ? 1 : 0);
  109.     cpu_bind(cpu_thread);
  110.     rendevouz();
  111.  
  112.     auto startClk = chrono::steady_clock::now();
  113.     while (!is_done.load(memory_order_relaxed))
  114.         {
  115.         if (ch.load(memory_order_relaxed) == cnt)
  116.             {
  117.             cnt = ch.fetch_add(1, memory_order_relaxed) + 2;
  118.             }
  119.         else
  120.             {
  121.             cpu_relax();
  122.             }
  123.         }
  124.  
  125.     if (latency != nullptr)
  126.         {
  127.         auto endClk = chrono::steady_clock::now();
  128.         auto duration = chrono::duration_cast<chrono::nanoseconds>(endClk - startClk);
  129.         *latency = duration.count() / cnt;
  130.         }
  131.     };
  132.  
  133. static void measure
  134.     (
  135.     cpu_id cpu_threads,
  136.     const set<cpu_id> &cpu_filter,
  137.     int numa_node = 0
  138.     )
  139.     {
  140.     for (cpu_id ping_id = 0; ping_id < cpu_threads - 1; ping_id++)
  141.         {
  142.         for (cpu_id pong_id = ping_id + 1; pong_id < cpu_threads; pong_id++)
  143.             {
  144.             if (cpu_filter.count(ping_id) + cpu_filter.count(pong_id) != 2)
  145.                 {
  146.                 continue;
  147.                 }
  148.  
  149.             cout << setw(2) << ping_id << " <-> "
  150.                  << setw(2) << left << pong_id << right << flush;
  151.  
  152.             atomic<num_t> *channel;
  153.  
  154.             channel = static_cast<atomic<num_t>*>(numa_alloc_onnode(sizeof *channel, numa_node));
  155.  
  156.             atomic<bool> is_done{ false };
  157.             uint32_t latency;
  158.             thread ping(worker, ref(*channel), ref(is_done), ping_id, &latency);
  159.             thread pong(worker, ref(*channel), ref(is_done), pong_id, nullptr);
  160.  
  161.             this_thread::sleep_for(chrono::seconds(4));
  162.             is_done.store(true);
  163.             ping.join();
  164.             pong.join();
  165.  
  166.             cout << " : " << setw(3) << latency << " ns" << endl;
  167.  
  168.             numa_free(channel, sizeof *channel);
  169.             }
  170.         }
  171.     }
  172.  
  173. static set<cpu_id> parse_args
  174.     (
  175.     const vector<string> &args,
  176.     cpu_id cpu_cnt
  177.     )
  178.     {
  179.     set<cpu_id> filter;
  180.  
  181.     if (args.size() == 0)
  182.         {
  183.         for (int c = 0; c < cpu_cnt; c++)
  184.             {
  185.             filter.insert(c);
  186.             }
  187.         }
  188.     else
  189.         {
  190.         for (auto cpu_thr: args)
  191.             {
  192.             filter.insert(stoi(cpu_thr));
  193.             }
  194.         }
  195.  
  196.     return filter;
  197.     }
  198.  
  199. int main
  200.     (
  201.     int argc,
  202.     char *argv[]
  203.     )
  204.     {
  205.     auto num_numa_nodes = numa_num_task_nodes();
  206.  
  207.     try
  208.         {
  209.         vector<string> args(argv + 1, argv + argc);
  210.         cpu_id cpu_threads = thread::hardware_concurrency();
  211.         auto cpu_filter = parse_args(args, cpu_threads);
  212.  
  213.         cout << "This system got " << cpu_threads << " CPU-threads and "
  214.              << num_numa_nodes << " NUMA nodes." << endl;
  215.         for (auto numa_node = 0; numa_node < num_numa_nodes; numa_node++)
  216.             {
  217.             cout << "Allocate from NUMA node " << numa_node << endl;
  218.             measure(cpu_threads, cpu_filter);
  219.             }
  220.         }
  221.     catch (...)
  222.         {
  223.         cerr << "Usage: " << argv[0] << " [CPUTHR0 CPUTHR1 ...]" << endl;
  224.         }
  225.     }
  226.  
  227. // Local Variables:
  228. // compile-command: "make CXXFLAGS=\"-std=c++11 -O2 -pthread\" LDLIBS=\"-lnuma\" th_ping"
  229. // End:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement