Want more features on Pastebin? Sign Up, it's FREE!
Guest

nodegpu01 gmond.conf

By: a guest on Oct 22nd, 2010  |  syntax: None  |  size: 8.13 KB  |  views: 32  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. nodegpu gmond.conf
  2.  
  3.  
  4. /* This configuration is as close to 2.5.x default behavior as possible
  5.    The values closely match ./gmond/metric.h definitions in 2.5.x */
  6. globals {                    
  7.   daemonize = yes              
  8.   setuid = yes            
  9.   user = ganglia              
  10.   debug_level = 0              
  11.   max_udp_msg_len = 1472        
  12.   mute = no            
  13.   deaf = no            
  14.   host_dmax = 0 /* secs */
  15.   cleanup_threshold = 300 /* secs */
  16.   gexec = no            
  17.   send_metadata_interval = 0    
  18. }
  19.  
  20. /* If a cluster attribute is specified, then all gmond hosts are wrapped inside
  21.  * of a <CLUSTER> tag.  If you do not specify a cluster tag, then all <HOSTS> will
  22.  * NOT be wrapped inside of a <CLUSTER> tag. */
  23. cluster {
  24.   name = "GPU cluster"
  25.   owner = "Propietat de FBM"
  26.   latlong = "unspecified"
  27.   url = "unspecified"
  28. }
  29.  
  30. /* The host section describes attributes of the host, like the location */
  31. host {
  32.   location = "suport.informatica@example.org"
  33. }
  34.  
  35. /* Feel free to specify as many udp_send_channels as you like.  Gmond
  36.    used to only support having a single channel */
  37. udp_send_channel {
  38.   mcast_join = 239.2.11.72
  39.   port = 8649
  40.   ttl = 1
  41. }
  42.  
  43. /* You can specify as many udp_recv_channels as you like as well. */
  44. udp_recv_channel {
  45.   mcast_join = 239.2.11.72
  46.   port = 8649
  47.   bind = 239.2.11.72
  48. }
  49.  
  50. /* You can specify as many tcp_accept_channels as you like to share
  51.    an xml description of the state of the cluster */
  52. tcp_accept_channel {
  53.   port = 8649
  54. }
  55.  
  56. /* Each metrics module that is referenced by gmond must be specified and
  57.    loaded. If the module has been statically linked with gmond, it does not
  58.    require a load path. However all dynamically loadable modules must include
  59.    a load path. */
  60. modules {
  61.   module {
  62.     name = "core_metrics"
  63.   }
  64.   module {
  65.     name = "cpu_module"
  66.     path = "/usr/lib/ganglia/modcpu.so"
  67.   }
  68.   module {
  69.     name = "disk_module"
  70.     path = "/usr/lib/ganglia/moddisk.so"
  71.   }
  72.   module {
  73.     name = "load_module"
  74.     path = "/usr/lib/ganglia/modload.so"
  75.   }
  76.   module {
  77.     name = "mem_module"
  78.     path = "/usr/lib/ganglia/modmem.so"
  79.   }
  80.   module {
  81.     name = "net_module"
  82.     path = "/usr/lib/ganglia/modnet.so"
  83.   }
  84.   module {
  85.     name = "proc_module"
  86.     path = "/usr/lib/ganglia/modproc.so"
  87.   }
  88.   module {
  89.     name = "sys_module"
  90.     path = "/usr/lib/ganglia/modsys.so"
  91.   }
  92. }
  93.  
  94. include ('/etc/ganglia/conf.d/*.conf')
  95.  
  96.  
  97. /* The old internal 2.5.x metric array has been replaced by the following
  98.    collection_group directives.  What follows is the default behavior for
  99.    collecting and sending metrics that is as close to 2.5.x behavior as
  100.    possible. */
  101.  
  102. /* This collection group will cause a heartbeat (or beacon) to be sent every
  103.    20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses
  104.    the age of the running gmond. */
  105. collection_group {
  106.   collect_once = yes
  107.   time_threshold = 20
  108.   metric {
  109.     name = "heartbeat"
  110.   }
  111. }
  112.  
  113. /* This collection group will send general info about this host every 1200 secs.
  114.    This information doesn't change between reboots and is only collected once. */
  115. collection_group {
  116.   collect_once = yes
  117.   time_threshold = 1200
  118.   metric {
  119.     name = "cpu_num"
  120.     title = "CPU Count"
  121.   }
  122.   metric {
  123.     name = "cpu_speed"
  124.     title = "CPU Speed"
  125.   }
  126.   metric {
  127.     name = "mem_total"
  128.     title = "Memory Total"
  129.   }
  130.   /* Should this be here? Swap can be added/removed between reboots. */
  131.   metric {
  132.     name = "swap_total"
  133.     title = "Swap Space Total"
  134.   }
  135.   metric {
  136.     name = "boottime"
  137.     title = "Last Boot Time"
  138.   }
  139.   metric {
  140.     name = "machine_type"
  141.     title = "Machine Type"
  142.   }
  143.   metric {
  144.     name = "os_name"
  145.     title = "Operating System"
  146.   }
  147.   metric {
  148.     name = "os_release"
  149.     title = "Operating System Release"
  150.   }
  151.   metric {
  152.     name = "location"
  153.     title = "Location"
  154.   }
  155. }
  156.  
  157. /* This collection group will send the status of gexecd for this host every 300 secs */
  158. /* Unlike 2.5.x the default behavior is to report gexecd OFF.  */
  159. collection_group {
  160.   collect_once = yes
  161.   time_threshold = 300
  162.   metric {
  163.     name = "gexec"
  164.     title = "Gexec Status"
  165.   }
  166. }
  167.  
  168. /* This collection group will collect the CPU status info every 20 secs.
  169.    The time threshold is set to 90 seconds.  In honesty, this time_threshold could be
  170.    set significantly higher to reduce unnecessary network chatter. */
  171. collection_group {
  172.   collect_every = 20
  173.   time_threshold = 90
  174.   /* CPU status */
  175.   metric {
  176.     name = "cpu_user"  
  177.     value_threshold = "1.0"
  178.     title = "CPU User"
  179.   }
  180.   metric {
  181.     name = "cpu_system"  
  182.     value_threshold = "1.0"
  183.     title = "CPU System"
  184.   }
  185.   metric {
  186.     name = "cpu_idle"  
  187.     value_threshold = "5.0"
  188.     title = "CPU Idle"
  189.   }
  190.   metric {
  191.     name = "cpu_nice"  
  192.     value_threshold = "1.0"
  193.     title = "CPU Nice"
  194.   }
  195.   metric {
  196.     name = "cpu_aidle"
  197.     value_threshold = "5.0"
  198.     title = "CPU aidle"
  199.   }
  200.   metric {
  201.     name = "cpu_wio"
  202.     value_threshold = "1.0"
  203.     title = "CPU wio"
  204.   }
  205.   /* The next two metrics are optional if you want more detail...
  206.      ... since they are accounted for in cpu_system.  
  207.   metric {
  208.     name = "cpu_intr"
  209.     value_threshold = "1.0"
  210.     title = "CPU intr"
  211.   }
  212.   metric {
  213.     name = "cpu_sintr"
  214.     value_threshold = "1.0"
  215.     title = "CPU sintr"
  216.   }
  217.   */
  218. }
  219.  
  220. collection_group {
  221.   collect_every = 20
  222.   time_threshold = 90
  223.   /* Load Averages */
  224.   metric {
  225.     name = "load_one"
  226.     value_threshold = "1.0"
  227.     title = "One Minute Load Average"
  228.   }
  229.   metric {
  230.     name = "load_five"
  231.     value_threshold = "1.0"
  232.     title = "Five Minute Load Average"
  233.   }
  234.   metric {
  235.     name = "load_fifteen"
  236.     value_threshold = "1.0"
  237.     title = "Fifteen Minute Load Average"
  238.   }
  239. }
  240.  
  241. /* This group collects the number of running and total processes */
  242. collection_group {
  243.   collect_every = 80
  244.   time_threshold = 950
  245.   metric {
  246.     name = "proc_run"
  247.     value_threshold = "1.0"
  248.     title = "Total Running Processes"
  249.   }
  250.   metric {
  251.     name = "proc_total"
  252.     value_threshold = "1.0"
  253.     title = "Total Processes"
  254.   }
  255. }
  256.  
  257. /* This collection group grabs the volatile memory metrics every 40 secs and
  258.    sends them at least every 180 secs.  This time_threshold can be increased
  259.    significantly to reduce unneeded network traffic. */
  260. collection_group {
  261.   collect_every = 40
  262.   time_threshold = 180
  263.   metric {
  264.     name = "mem_free"
  265.     value_threshold = "1024.0"
  266.     title = "Free Memory"
  267.   }
  268.   metric {
  269.     name = "mem_shared"
  270.     value_threshold = "1024.0"
  271.     title = "Shared Memory"
  272.   }
  273.   metric {
  274.     name = "mem_buffers"
  275.     value_threshold = "1024.0"
  276.     title = "Memory Buffers"
  277.   }
  278.   metric {
  279.     name = "mem_cached"
  280.     value_threshold = "1024.0"
  281.     title = "Cached Memory"
  282.   }
  283.   metric {
  284.     name = "swap_free"
  285.     value_threshold = "1024.0"
  286.     title = "Free Swap Space"
  287.   }
  288. }
  289.  
  290. collection_group {
  291.   collect_every = 40
  292.   time_threshold = 300
  293.   metric {
  294.     name = "bytes_out"
  295.     value_threshold = 4096
  296.     title = "Bytes Sent"
  297.   }
  298.   metric {
  299.     name = "bytes_in"
  300.     value_threshold = 4096
  301.     title = "Bytes Received"
  302.   }
  303.   metric {
  304.     name = "pkts_in"
  305.     value_threshold = 256
  306.     title = "Packets Received"
  307.   }
  308.   metric {
  309.     name = "pkts_out"
  310.     value_threshold = 256
  311.     title = "Packets Sent"
  312.   }
  313. }
  314.  
  315. /* Different than 2.5.x default since the old config made no sense */
  316. collection_group {
  317.   collect_every = 1800
  318.   time_threshold = 3600
  319.   metric {
  320.     name = "disk_total"
  321.     value_threshold = 1.0
  322.     title = "Total Disk Space"
  323.   }
  324. }
  325.  
  326. collection_group {
  327.   collect_every = 40
  328.   time_threshold = 180
  329.   metric {
  330.     name = "disk_free"
  331.     value_threshold = 1.0
  332.     title = "Disk Space Available"
  333.   }
  334.   metric {
  335.     name = "part_max_used"
  336.     value_threshold = 1.0
  337.     title = "Maximum Disk Space Used"
  338.   }
  339. }
clone this paste RAW Paste Data