Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- import sys, os
- import time
- import random
- import getpass
- import subprocess
# Make sure the shared queue directory exists before anything reads or writes
# it.  makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair, which
# raised FileExistsError when two waitgpu processes started at the same time.
os.makedirs("/tmp/waitgpu", exist_ok=True)
def is_gpu_locked(lock_path="/tmp/waitgpu/lock", ttl=60):
    """Report whether the GPU lock file was refreshed less than ``ttl`` seconds ago.

    The lock is just a file whose modification time records the last time a
    waiter claimed the GPU.  This function only reads that timestamp; it never
    creates or touches the file, so merely asking the question cannot lock the
    GPU for other waiters.

    Args:
        lock_path: Path of the lock file to inspect.
        ttl: Number of seconds a lock stays valid after it was last written.

    Returns:
        True if the lock file exists and is younger than ``ttl`` seconds,
        False if it is older or does not exist.
    """
    try:
        age = time.time() - os.path.getmtime(lock_path)
    except FileNotFoundError:
        # No lock file (or no queue directory) yet: nobody has claimed the GPU.
        return False
    return age < ttl
def lock_gpu():
    """Claim the GPU by creating (or truncating) ``/tmp/waitgpu/lock``.

    Opening the file for writing leaves it empty and gives it a fresh
    modification time; nothing is written into it.
    """
    with open("/tmp/waitgpu/lock", "w"):
        pass
def check_first_task(queue_dir="/tmp/waitgpu", stale_after=30):
    """Return the id of the oldest live task in the queue, or ``None``.

    Every file in ``queue_dir`` is expected to hold five tab-separated fields:
    ``id, user, create_time, heartbeat_time, command``.  Files that do not
    have that shape (for example the empty ``lock`` file), that cannot be
    read, or whose numeric fields do not parse are ignored.  A task whose
    heartbeat is more than ``stale_after`` seconds old is considered abandoned
    and its file is deleted on a best-effort basis.

    Args:
        queue_dir: Directory holding the event files.
        stale_after: Heartbeat age in seconds beyond which a task is dropped.

    Returns:
        The integer id of the live task with the smallest ``create_time``
        (ties are broken by id, then command), or ``None`` when no live task
        exists.
    """
    now = time.time()
    live = []
    for fname in os.listdir(queue_dir):
        fpath = os.path.join(queue_dir, fname)
        try:
            with open(fpath) as fh:
                # maxsplit=4 keeps a command that itself contains a tab intact
                # instead of making the whole record look malformed.
                fields = fh.read().strip().split("\t", 4)
        except (OSError, UnicodeDecodeError):
            # The owner may have removed its file between listdir() and
            # open(); directories and unreadable files are not tasks either.
            continue
        if len(fields) != 5:
            continue
        task_id, _user, created, heartbeat, command = fields
        try:
            entry = (float(created), int(task_id), command)
            heartbeat_at = float(heartbeat)
        except ValueError:
            # Not a well-formed event record; leave it alone.
            continue
        if now - heartbeat_at > stale_after:
            # The waiter stopped refreshing its heartbeat: purge the entry.
            try:
                os.remove(fpath)
            except OSError:
                # Someone else removed it first, or we lack permission.
                pass
        else:
            live.append(entry)
    if not live:
        return None
    return min(live)[1]
def is_gpu_available():
    """Return True when ``nvidia-smi`` reports no running GPU processes.

    Runs ``nvidia-smi`` and looks for the literal text
    "No running processes found" in its standard output.  The output is
    searched as bytes, so no decoding (and no ``str()`` of a bytes object,
    which only yields its repr) is involved.

    Returns:
        True if the marker text is present in the output, False otherwise.

    Raises:
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a
            non-zero status.
        OSError: If ``nvidia-smi`` cannot be started (e.g. it is not on PATH).
    """
    # An argument list avoids spawning an intermediate shell for a fixed command.
    output = subprocess.check_output(["nvidia-smi"])
    return b"No running processes found" in output
def print_all_guys():
    """Print the GPU status followed by the live waiting list, oldest first.

    Reads every file in ``/tmp/waitgpu`` and keeps those that hold five
    tab-separated fields (``id, user, create_time, heartbeat_time, command``)
    and whose heartbeat is less than 30 seconds old.  Files that cannot be
    read or parsed are skipped.  The surviving tasks are printed ranked by
    creation time, after a status line obtained from ``is_gpu_available()``.
    """
    now = time.time()
    waiting = []
    for fname in os.listdir("/tmp/waitgpu"):
        fpath = os.path.join("/tmp/waitgpu", fname)
        try:
            with open(fpath) as fh:
                # maxsplit=4 keeps a command containing a tab in one piece.
                fields = fh.read().strip().split("\t", 4)
        except (OSError, UnicodeDecodeError):
            # The file vanished between listdir() and open(), or is not a
            # readable text file; either way it is not a task to show.
            continue
        if len(fields) != 5:
            continue
        _task_id, user, created, heartbeat, command = fields
        try:
            is_live = now - float(heartbeat) < 30
            created_at = float(created)
        except ValueError:
            # Malformed record: ignore it rather than abort the listing.
            continue
        if is_live:
            waiting.append((created_at, user, command))
    waiting.sort()

    print("-------")
    if is_gpu_available():
        print("Status: GPU is not being used now")
    else:
        print("Status: GPU is occupied now")
    print("-------")
    print("Waiting list:")
    print("rank\tuser\tcommand")
    for rank, (_created_at, user, command) in enumerate(waiting, start=1):
        print("{}\t{}\t{}".format(rank, user, command))
    print("-------")
# Command-line entry point.
#   waitgpu list        -> print the waiting list
#   waitgpu <command>   -> queue up, wait for the GPU, then run <command>
if __name__ == '__main__':
    command = " ".join(sys.argv[1:])
    if command == "list":
        print_all_guys()
    elif command.strip() == "":
        print("Usage:")
        print("1. waitgpu python xxx.py <- run a command")
        print("2. waitgpu list <- see the waiting list")
    else:
        print("[waitgpu] waiting:", command)
        myid = random.randint(0, 999999)
        myhandle = "/tmp/waitgpu/{}.event".format(myid)
        myuser = getpass.getuser()
        create_time = time.time()
        try:
            while True:
                # Rewrite the event file on every pass: the fourth field is the
                # heartbeat other waiters use to tell this task is still alive.
                with open(myhandle, "w") as fhandle:
                    fhandle.write("\t".join([
                        str(myid), myuser, str(create_time),
                        str(time.time()), command
                    ]))
                # Run only when the GPU is idle, nobody claimed it recently,
                # and this task is at the head of the queue.
                if is_gpu_available() and not is_gpu_locked() and check_first_task() == myid:
                    print("[waitgpu] execute:", command)
                    # Take the lock while still listed in the queue, so the
                    # next waiter cannot see an unlocked GPU in between.
                    lock_gpu()
                    break
                time.sleep(10)
        finally:
            # Leave the queue whether we are about to run, were interrupted
            # (Ctrl-C), or failed; the file may already have been purged.
            try:
                os.remove(myhandle)
            except FileNotFoundError:
                pass
        status = os.system(command)
        # Propagate the command's result instead of always exiting with 0.
        if os.WIFSIGNALED(status):
            exit_code = 128 + os.WTERMSIG(status)
        else:
            exit_code = os.WEXITSTATUS(status)
        sys.exit(exit_code)
Add Comment
Please, Sign In to add comment