Advertisement
Guest User

nodegpu01 gmond.conf

a guest
Oct 22nd, 2010
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.13 KB | None | 0 0
/* nodegpu gmond.conf */
  2.  
  3.  
  4. /* This configuration is as close to 2.5.x default behavior as possible
  5. The values closely match ./gmond/metric.h definitions in 2.5.x */
  6. globals {
  7. daemonize = yes
  8. setuid = yes
  9. user = ganglia
  10. debug_level = 0
  11. max_udp_msg_len = 1472
  12. mute = no
  13. deaf = no
  14. host_dmax = 0 /*secs */
  15. cleanup_threshold = 300 /*secs */
  16. gexec = no
  17. send_metadata_interval = 0
  18. }
  19.  
  20. /* If a cluster attribute is specified, then all gmond hosts are wrapped inside
  21. * of a <CLUSTER> tag. If you do not specify a cluster tag, then all <HOSTS> will
  22. * NOT be wrapped inside of a <CLUSTER> tag. */
  23. cluster {
  24. name = "GPU cluster"
  25. owner = "Propietat de FBM"
  26. latlong = "unspecified"
  27. url = "unspecified"
  28. }
  29.  
  30. /* The host section describes attributes of the host, like the location */
  31. host {
  32. location = "suport.informatica@example.org"
  33. }
  34.  
  35. /* Feel free to specify as many udp_send_channels as you like. Gmond
  36. used to only support having a single channel */
  37. udp_send_channel {
  38. mcast_join = 239.2.11.72
  39. port = 8649
  40. ttl = 1
  41. }
  42.  
  43. /* You can specify as many udp_recv_channels as you like as well. */
  44. udp_recv_channel {
  45. mcast_join = 239.2.11.72
  46. port = 8649
  47. bind = 239.2.11.72
  48. }
  49.  
  50. /* You can specify as many tcp_accept_channels as you like to share
  51. an xml description of the state of the cluster */
  52. tcp_accept_channel {
  53. port = 8649
  54. }
  55.  
  56. /* Each metrics module that is referenced by gmond must be specified and
  57. loaded. If the module has been statically linked with gmond, it does not
  58. require a load path. However all dynamically loadable modules must include
  59. a load path. */
  60. modules {
  61. module {
  62. name = "core_metrics"
  63. }
  64. module {
  65. name = "cpu_module"
  66. path = "/usr/lib/ganglia/modcpu.so"
  67. }
  68. module {
  69. name = "disk_module"
  70. path = "/usr/lib/ganglia/moddisk.so"
  71. }
  72. module {
  73. name = "load_module"
  74. path = "/usr/lib/ganglia/modload.so"
  75. }
  76. module {
  77. name = "mem_module"
  78. path = "/usr/lib/ganglia/modmem.so"
  79. }
  80. module {
  81. name = "net_module"
  82. path = "/usr/lib/ganglia/modnet.so"
  83. }
  84. module {
  85. name = "proc_module"
  86. path = "/usr/lib/ganglia/modproc.so"
  87. }
  88. module {
  89. name = "sys_module"
  90. path = "/usr/lib/ganglia/modsys.so"
  91. }
  92. }
  93.  
  94. include ('/etc/ganglia/conf.d/*.conf')
  95.  
  96.  
  97. /* The old internal 2.5.x metric array has been replaced by the following
  98. collection_group directives. What follows is the default behavior for
  99. collecting and sending metrics that is as close to 2.5.x behavior as
  100. possible. */
  101.  
  102. /* This collection group will cause a heartbeat (or beacon) to be sent every
  103. 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
  104. the age of the running gmond. */
  105. collection_group {
  106. collect_once = yes
  107. time_threshold = 20
  108. metric {
  109. name = "heartbeat"
  110. }
  111. }
  112.  
  113. /* This collection group will send general info about this host every 1200 secs.
  114. This information doesn't change between reboots and is only collected once. */
  115. collection_group {
  116. collect_once = yes
  117. time_threshold = 1200
  118. metric {
  119. name = "cpu_num"
  120. title = "CPU Count"
  121. }
  122. metric {
  123. name = "cpu_speed"
  124. title = "CPU Speed"
  125. }
  126. metric {
  127. name = "mem_total"
  128. title = "Memory Total"
  129. }
  130. /* Should this be here? Swap can be added/removed between reboots. */
  131. metric {
  132. name = "swap_total"
  133. title = "Swap Space Total"
  134. }
  135. metric {
  136. name = "boottime"
  137. title = "Last Boot Time"
  138. }
  139. metric {
  140. name = "machine_type"
  141. title = "Machine Type"
  142. }
  143. metric {
  144. name = "os_name"
  145. title = "Operating System"
  146. }
  147. metric {
  148. name = "os_release"
  149. title = "Operating System Release"
  150. }
  151. metric {
  152. name = "location"
  153. title = "Location"
  154. }
  155. }
  156.  
  157. /* This collection group will send the status of gexecd for this host every 300 secs */
  158. /* Unlike 2.5.x the default behavior is to report gexecd OFF. */
  159. collection_group {
  160. collect_once = yes
  161. time_threshold = 300
  162. metric {
  163. name = "gexec"
  164. title = "Gexec Status"
  165. }
  166. }
  167.  
  168. /* This collection group will collect the CPU status info every 20 secs.
  169. The time threshold is set to 90 seconds. In honesty, this time_threshold could be
  170. set significantly higher to reduce unneccessary network chatter. */
  171. collection_group {
  172. collect_every = 20
  173. time_threshold = 90
  174. /* CPU status */
  175. metric {
  176. name = "cpu_user"
  177. value_threshold = "1.0"
  178. title = "CPU User"
  179. }
  180. metric {
  181. name = "cpu_system"
  182. value_threshold = "1.0"
  183. title = "CPU System"
  184. }
  185. metric {
  186. name = "cpu_idle"
  187. value_threshold = "5.0"
  188. title = "CPU Idle"
  189. }
  190. metric {
  191. name = "cpu_nice"
  192. value_threshold = "1.0"
  193. title = "CPU Nice"
  194. }
  195. metric {
  196. name = "cpu_aidle"
  197. value_threshold = "5.0"
  198. title = "CPU aidle"
  199. }
  200. metric {
  201. name = "cpu_wio"
  202. value_threshold = "1.0"
  203. title = "CPU wio"
  204. }
  205. /* The next two metrics are optional if you want more detail...
  206. ... since they are accounted for in cpu_system.
  207. metric {
  208. name = "cpu_intr"
  209. value_threshold = "1.0"
  210. title = "CPU intr"
  211. }
  212. metric {
  213. name = "cpu_sintr"
  214. value_threshold = "1.0"
  215. title = "CPU sintr"
  216. }
  217. */
  218. }
  219.  
  220. collection_group {
  221. collect_every = 20
  222. time_threshold = 90
  223. /* Load Averages */
  224. metric {
  225. name = "load_one"
  226. value_threshold = "1.0"
  227. title = "One Minute Load Average"
  228. }
  229. metric {
  230. name = "load_five"
  231. value_threshold = "1.0"
  232. title = "Five Minute Load Average"
  233. }
  234. metric {
  235. name = "load_fifteen"
  236. value_threshold = "1.0"
  237. title = "Fifteen Minute Load Average"
  238. }
  239. }
  240.  
  241. /* This group collects the number of running and total processes */
  242. collection_group {
  243. collect_every = 80
  244. time_threshold = 950
  245. metric {
  246. name = "proc_run"
  247. value_threshold = "1.0"
  248. title = "Total Running Processes"
  249. }
  250. metric {
  251. name = "proc_total"
  252. value_threshold = "1.0"
  253. title = "Total Processes"
  254. }
  255. }
  256.  
  257. /* This collection group grabs the volatile memory metrics every 40 secs and
  258. sends them at least every 180 secs. This time_threshold can be increased
  259. significantly to reduce unneeded network traffic. */
  260. collection_group {
  261. collect_every = 40
  262. time_threshold = 180
  263. metric {
  264. name = "mem_free"
  265. value_threshold = "1024.0"
  266. title = "Free Memory"
  267. }
  268. metric {
  269. name = "mem_shared"
  270. value_threshold = "1024.0"
  271. title = "Shared Memory"
  272. }
  273. metric {
  274. name = "mem_buffers"
  275. value_threshold = "1024.0"
  276. title = "Memory Buffers"
  277. }
  278. metric {
  279. name = "mem_cached"
  280. value_threshold = "1024.0"
  281. title = "Cached Memory"
  282. }
  283. metric {
  284. name = "swap_free"
  285. value_threshold = "1024.0"
  286. title = "Free Swap Space"
  287. }
  288. }
  289.  
  290. collection_group {
  291. collect_every = 40
  292. time_threshold = 300
  293. metric {
  294. name = "bytes_out"
  295. value_threshold = 4096
  296. title = "Bytes Sent"
  297. }
  298. metric {
  299. name = "bytes_in"
  300. value_threshold = 4096
  301. title = "Bytes Received"
  302. }
  303. metric {
  304. name = "pkts_in"
  305. value_threshold = 256
  306. title = "Packets Received"
  307. }
  308. metric {
  309. name = "pkts_out"
  310. value_threshold = 256
  311. title = "Packets Sent"
  312. }
  313. }
  314.  
  315. /* Different than 2.5.x default since the old config made no sense */
  316. collection_group {
  317. collect_every = 1800
  318. time_threshold = 3600
  319. metric {
  320. name = "disk_total"
  321. value_threshold = 1.0
  322. title = "Total Disk Space"
  323. }
  324. }
  325.  
  326. collection_group {
  327. collect_every = 40
  328. time_threshold = 180
  329. metric {
  330. name = "disk_free"
  331. value_threshold = 1.0
  332. title = "Disk Space Available"
  333. }
  334. metric {
  335. name = "part_max_used"
  336. value_threshold = 1.0
  337. title = "Maximum Disk Space Used"
  338. }
  339. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement