Guest User

Untitled

a guest
Jan 18th, 2019
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.14 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
  3. import sys, os
  4. import time
  5. import random
  6. import getpass
  7. import subprocess
  8.  
  9. if not os.path.exists("/tmp/waitgpu"):
  10. os.mkdir("/tmp/waitgpu")
  11.  
  12. def is_gpu_locked():
  13. if not os.path.exists("/tmp/waitgpu/lock"):
  14. open("/tmp/waitgpu/lock", "w").close()
  15. return time.time() - os.path.getmtime("/tmp/waitgpu/lock") < 60
  16.  
  17. def lock_gpu():
  18. open("/tmp/waitgpu/lock", "w").close()
  19.  
  20.  
  21. def check_first_task():
  22. fnames = os.listdir("/tmp/waitgpu")
  23. tasks = []
  24. for fname in fnames:
  25. fpath = "/tmp/waitgpu/{}".format(fname)
  26. pairs = open(fpath).read().strip().split("\t")
  27. if len(pairs) != 5:
  28. continue
  29. id, user, create_time, heartbeat_time, command = pairs
  30. if time.time() - float(heartbeat_time) > 30:
  31. # Validate the task
  32. try:
  33. os.remove(fpath)
  34. except:
  35. pass
  36. else:
  37. # Add to list
  38. tasks.append((float(create_time), int(id), command))
  39. tasks.sort()
  40. if not tasks:
  41. return None
  42. else:
  43. return tasks[0][1]
  44.  
  45. def is_gpu_available():
  46. ret = subprocess.check_output("nvidia-smi", shell=True)
  47. return "No running processes found" in str(ret)
  48.  
  49. def print_all_guys():
  50. fnames = os.listdir("/tmp/waitgpu")
  51. tasks = []
  52. for fname in fnames:
  53. fpath = "/tmp/waitgpu/{}".format(fname)
  54. pairs = open(fpath).read().strip().split("\t")
  55. if len(pairs) != 5:
  56. continue
  57. id, user, create_time, heartbeat_time, command = pairs
  58. if time.time() - float(heartbeat_time) < 30:
  59. tasks.append((float(create_time), user, command))
  60. tasks.sort()
  61. c = 1
  62. print("-------")
  63. if is_gpu_available():
  64. print("Status: GPU is not being used now")
  65. else:
  66. print("Status: GPU is occupied now")
  67. print("-------")
  68. print("Wating list:")
  69. print("rank\tuser\tcommand")
  70. for ct, user, command in tasks:
  71. print("{}\t{}\t{}".format(c, user, command))
  72. c += 1
  73. print("-------")
  74.  
  75. if __name__ == '__main__':
  76. command = " ".join(sys.argv[1:])
  77. if command == "list":
  78. print_all_guys()
  79. elif command.strip() == "":
  80. print("Usage:")
  81. print("1. waitgpu python xxx.py <- run a command")
  82. print("2. waitgpu list <- see the waiting list")
  83. else:
  84. print("[waitgpu] waiting:", command)
  85. myid = random.randint(0, 999999)
  86. myhandle = "/tmp/waitgpu/{}.event".format(myid)
  87. myuser = getpass.getuser()
  88. create_time = time.time()
  89. while True:
  90. # Write event file
  91. with open(myhandle, "w") as fhandle:
  92. fhandle.write("\t".join([
  93. str(myid), myuser, str(create_time),
  94. str(time.time()), command
  95. ]))
  96. # Checking the gpus status, and run
  97. if is_gpu_available() and not is_gpu_locked() and check_first_task() == myid:
  98. print("[waitgpu] execute:", command)
  99. os.remove(myhandle)
  100. lock_gpu()
  101. os.system(command)
  102. sys.exit()
  103. break
  104. else:
  105. time.sleep(10)
Add Comment
Please, Sign In to add comment