Advertisement
Guest User

dmtcp_coordinator (debug mode)

a guest
Nov 17th, 2014
195
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.16 KB | None | 0 0
  1. [root@slurm-master dmtcp-trunk]# dmtcp_coordinator
  2. dmtcp_coordinator (DMTCP) 2.3.1
  3. License LGPLv3+: GNU LGPL version 3 or later
  4. <http://gnu.org/licenses/lgpl.html>.
  5. This program comes with ABSOLUTELY NO WARRANTY.
  6. This is free software, and you are welcome to redistribute it
  7. under certain conditions; see COPYING file for details.
  8. (Use flag "-q" to hide this message.)
  9.  
  10. [6364] TRACE at dmtcp_coordinator.cpp:1795 in main; REASON='New DMTCP coordinator starting.'
  11. dmtcp::UniquePid::ThisProcess() = 6db90f3d5a9dd200-6364-546a1de3
  12. dmtcp_coordinator starting...
  13. Host: slurm-master (192.168.122.11)
  14. Port: 7779
  15. Checkpoint Interval: disabled (checkpoint manually instead)
  16. Exit on last client: 0
  17. Type '?' for help.
  18.  
  19.  
  20.  
  21.  
  22.  
  23.  
  24.  
  25.  
  26.  
  27.  
  28.  
  29.  
  30.  
  31.  
  32.  
  33.  
  34.  
  35.  
  36.  
  37.  
  38.  
  39.  
  40.  
  41.  
  42.  
  43.  
  44. [6364] TRACE at dmtcp_coordinator.cpp:923 in onConnect; REASON='accepting new connection'
  45. remote.sockfd() = 5
  46. (strerror((*__errno_location ()))) = Success
  47. [6364] TRACE at dmtcp_coordinator.cpp:932 in onConnect; REASON='Reading from incoming connection...'
  48. [6364] TRACE at dmtcp_coordinator.cpp:1225 in validateNewWorkerProcess; REASON='First process connected. Creating new computation group'
  49. compId = 6db90f3d5a9dd200-40000-546a1de8
  50. [6364] NOTE at dmtcp_coordinator.cpp:1040 in onConnect; REASON='worker connected'
  51. hello_remote.from = 6db90f3d5a9dd200-6365-546a1de8
  52. [6364] TRACE at dmtcp_coordinator.cpp:1045 in onConnect; REASON='END'
  53. clients.size() = 1
  54. [6364] NOTE at dmtcp_coordinator.cpp:825 in onData; REASON='Updating process Information after exec()'
  55. progname = srun
  56. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  57. client->identity() = 6db90f3d5a9dd200-6365-546a1de8
  58. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  59. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  60. msg.state = WorkerState::RUNNING
  61. oldState = WorkerState::RUNNING
  62. newState = WorkerState::RUNNING
  63. [6364] TRACE at dmtcp_coordinator.cpp:923 in onConnect; REASON='accepting new connection'
  64. remote.sockfd() = 6
  65. (strerror((*__errno_location ()))) = Success
  66. [6364] TRACE at dmtcp_coordinator.cpp:932 in onConnect; REASON='Reading from incoming connection...'
  67. [6364] TRACE at dmtcp_coordinator.cpp:1228 in validateNewWorkerProcess; REASON='New process connected'
  68. hello_remote.from = 6db90f3d5a9dd200-40000-546a1de8
  69. client->prefixDir() =
  70. client->virtualPid() = 41000
  71. [6364] NOTE at dmtcp_coordinator.cpp:1040 in onConnect; REASON='worker connected'
  72. hello_remote.from = 6db90f3d5a9dd200-40000-546a1de8
  73. [6364] TRACE at dmtcp_coordinator.cpp:1045 in onConnect; REASON='END'
  74. clients.size() = 2
  75. [6364] NOTE at dmtcp_coordinator.cpp:816 in onData; REASON='Updating process Information after fork()'
  76. client->hostname() = slurm-master
  77. client->progname() = srun_(forked)
  78. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  79. client->identity() = 6db90f3d5a9dd200-40000-546a1de8
  80.  
  81. (start checkpoint)
  82.  
  83. c
  84. [6364] TRACE at dmtcp_coordinator.cpp:516 in handleUserCommand; REASON='checkpointing...'
  85. [6364] NOTE at dmtcp_coordinator.cpp:1271 in startCheckpoint; REASON='starting checkpoint, suspending all nodes'
  86. s.numPeers = 2
  87. [6364] NOTE at dmtcp_coordinator.cpp:1273 in startCheckpoint; REASON='Incremented Generation'
  88. compId.generation() = 1
  89. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  90. type = DMT_DO_SUSPEND
  91. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  92. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  93. msg.state = WorkerState::SUSPENDED
  94. oldState = WorkerState::RUNNING
  95. newState = WorkerState::RUNNING
  96. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  97. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  98. msg.state = WorkerState::SUSPENDED
  99. oldState = WorkerState::RUNNING
  100. newState = WorkerState::SUSPENDED
  101. [6364] NOTE at dmtcp_coordinator.cpp:615 in updateMinimumState; REASON='locking all nodes'
  102. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  103. type = DMT_DO_FD_LEADER_ELECTION
  104. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  105. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  106. msg.state = WorkerState::FD_LEADER_ELECTION
  107. oldState = WorkerState::SUSPENDED
  108. newState = WorkerState::SUSPENDED
  109. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  110. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  111. msg.state = WorkerState::FD_LEADER_ELECTION
  112. oldState = WorkerState::SUSPENDED
  113. newState = WorkerState::FD_LEADER_ELECTION
  114. [6364] NOTE at dmtcp_coordinator.cpp:621 in updateMinimumState; REASON='draining all nodes'
  115. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  116. type = DMT_DO_DRAIN
  117. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  118. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  119. msg.state = WorkerState::DRAINED
  120. oldState = WorkerState::FD_LEADER_ELECTION
  121. newState = WorkerState::FD_LEADER_ELECTION
  122.  
  123.  
  124.  
  125.  
  126.  
  127. (it hangs here for a while)
  128.  
  129.  
  130.  
  131.  
  132. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  133. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  134. msg.state = WorkerState::DRAINED
  135. oldState = WorkerState::FD_LEADER_ELECTION
  136. newState = WorkerState::DRAINED
  137. [6364] NOTE at dmtcp_coordinator.cpp:627 in updateMinimumState; REASON='checkpointing all nodes'
  138. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  139. type = DMT_DO_CHECKPOINT
  140. [6364] TRACE at dmtcp_coordinator.cpp:765 in onData; REASON='recording restart info'
  141. ckptFilename = /home/slurm/ckpt_srun_6db90f3d5a9dd200-41000-546a1de8.dmtcp
  142. hostname = slurm-master
  143. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  144. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  145. msg.state = WorkerState::CHECKPOINTED
  146. oldState = WorkerState::DRAINED
  147. newState = WorkerState::DRAINED
  148. [6364] TRACE at dmtcp_coordinator.cpp:765 in onData; REASON='recording restart info'
  149. ckptFilename = /home/slurm/ckpt_srun_6db90f3d5a9dd200-40000-546a1de8.dmtcp
  150. hostname = slurm-master
  151. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  152. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  153. msg.state = WorkerState::CHECKPOINTED
  154. oldState = WorkerState::DRAINED
  155. newState = WorkerState::CHECKPOINTED
  156. [6364] TRACE at dmtcp_coordinator.cpp:1370 in writeRestartScript; REASON='writing restart script'
  157. uniqueFilename = ./dmtcp_restart_script_6db90f3d5a9dd200-40000-546a1de8.sh
  158. [6364] TRACE at dmtcp_coordinator.cpp:1419 in writeRestartScript; REASON='Single HOST'
  159. [6364] TRACE at dmtcp_coordinator.cpp:1522 in writeRestartScript; REASON='linking "dmtcp_restart_script.sh" filename to uniqueFilename'
  160. filename = dmtcp_restart_script.sh
  161. dirname = .
  162. uniqueFilename = ./dmtcp_restart_script_6db90f3d5a9dd200-40000-546a1de8.sh
  163. [6364] NOTE at dmtcp_coordinator.cpp:641 in updateMinimumState; REASON='building name service database'
  164. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  165. type = DMT_DO_REGISTER_NAME_SERVICE_DATA
  166. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  167. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  168. msg.state = WorkerState::NAME_SERVICE_DATA_REGISTERED
  169. oldState = WorkerState::CHECKPOINTED
  170. newState = WorkerState::CHECKPOINTED
  171. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  172. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  173. msg.state = WorkerState::NAME_SERVICE_DATA_REGISTERED
  174. oldState = WorkerState::CHECKPOINTED
  175. newState = WorkerState::NAME_SERVICE_DATA_REGISTERED
  176. [6364] NOTE at dmtcp_coordinator.cpp:657 in updateMinimumState; REASON='entertaining queries now'
  177. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  178. type = DMT_DO_SEND_QUERIES
  179. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  180. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  181. msg.state = WorkerState::DONE_QUERYING
  182. oldState = WorkerState::NAME_SERVICE_DATA_REGISTERED
  183. newState = WorkerState::NAME_SERVICE_DATA_REGISTERED
  184. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  185. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  186. msg.state = WorkerState::DONE_QUERYING
  187. oldState = WorkerState::NAME_SERVICE_DATA_REGISTERED
  188. newState = WorkerState::DONE_QUERYING
  189. [6364] NOTE at dmtcp_coordinator.cpp:662 in updateMinimumState; REASON='refilling all nodes'
  190. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  191. type = DMT_DO_REFILL
  192. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  193. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  194. msg.state = WorkerState::REFILLED
  195. oldState = WorkerState::DONE_QUERYING
  196. newState = WorkerState::DONE_QUERYING
  197. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  198. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  199. msg.state = WorkerState::REFILLED
  200. oldState = WorkerState::DONE_QUERYING
  201. newState = WorkerState::REFILLED
  202. [6364] NOTE at dmtcp_coordinator.cpp:693 in updateMinimumState; REASON='restarting all nodes'
  203. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  204. type = DMT_DO_RESUME
  205. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  206. msg.from = 6db90f3d5a9dd200-41000-546a1de8
  207. msg.state = WorkerState::RUNNING
  208. oldState = WorkerState::REFILLED
  209. newState = WorkerState::REFILLED
  210. [6364] TRACE at dmtcp_coordinator.cpp:747 in onData; REASON='got DMT_OK message'
  211. msg.from = 6db90f3d5a9dd200-40000-546a1de8
  212. msg.state = WorkerState::RUNNING
  213. oldState = WorkerState::REFILLED
  214. newState = WorkerState::RUNNING
  215. [6364] NOTE at dmtcp_coordinator.cpp:875 in onDisconnect; REASON='client disconnected'
  216. client->identity() = 6db90f3d5a9dd200-41000-546a1de8
  217. [6364] NOTE at dmtcp_coordinator.cpp:875 in onDisconnect; REASON='client disconnected'
  218. client->identity() = 6db90f3d5a9dd200-40000-546a1de8
  219. [6364] TRACE at dmtcp_coordinator.cpp:850 in removeStaleSharedAreaFile; REASON='Removing sharedArea file.'
  220. o.str() = /tmp/dmtcp-root@slurm-master/dmtcpSharedArea.6db90f3d5a9dd200-40000-546a1de8.546a1de87
  221.  
  222.  
  223.  
  224. ^C[6364] NOTE at dmtcp_coordinator.cpp:556 in handleUserCommand; REASON='killing all connected peers and quitting ...'
  225. [6364] TRACE at dmtcp_coordinator.cpp:1312 in broadcastMessage; REASON='sending message'
  226. type = DMT_KILL_PEER
  227. DMTCP coordinator exiting... (per request)
  228. [6364] TRACE at dmtcp_coordinator.cpp:850 in removeStaleSharedAreaFile; REASON='Removing sharedArea file.'
  229. o.str() = /tmp/dmtcp-root@slurm-master/dmtcpSharedArea.6db90f3d5a9dd200-40000-546a1de8.546a1de87
  230. [6364] TRACE at dmtcp_coordinator.cpp:857 in preExitCleanup; REASON='Removing port-file'
  231. thePortFile =
  232. [6364] TRACE at dmtcp_coordinator.cpp:564 in handleUserCommand; REASON='Exiting ...'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement