Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
"""
hadd output of crab jobs, utilising the fact that the output files are here at DESY,
so we can skip the painful crab getoutput
"""
import glob
import os
import re
import subprocess
from collections import namedtuple
from itertools import chain

from retrieve_jobs import TaskDictionaryNameUnordered
from submit_jobs import (
    MCBackgroundsSampleDictionaryUnordered,
    SignalMCSampleDictionaryUnordered,
    DataDictionaryMuonChannelUnordered,
    DataDictionaryElectronChannelUnordered,
)
# Location of crab output files
NAF_DIR = "/pnfs/desy.de/cms/tier2/store/user/raggleto/"

# Where you want the hadded files to go
OUTPUT_DIR = "/nfs/dust/cms/user/aggleton/aTGCsAnalysis/Samples_80X_Working_21_7_17_NewBtagSFNewVtagSF"

# Last component of each crab project directory name (crab_<task>_<channel>_<FEATURE_NAME>)
FEATURE_NAME = "NewBtagSFNewVtagSF"

# Whether to submit the hadd jobs with qsub (bird cluster) instead of running hadd here
RUN_ON_BIRD = True

# Minimum % of jobs that must have completed successfully to generate a hadded file
MIN_PERCENT_FINISHED = 100.

# One crab task: its task name, the dataset it ran over, and the stem of the hadded output file
Sample = namedtuple("Sample", ["taskname", "dataset", "outname"])
class bcolors:
    """ANSI escape sequences used to colour terminal output."""
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'  # appended after coloured text to end the colouring
# A task outcome: the terminal colour to print it in, plus a human-readable label
Status = namedtuple("Status", ["colour", "name"])


class JobStatus:
    """The outcomes do_one_task() can report for a crab task."""
    # Hadded output file exists (already present, or hadd was run locally)
    FINISHED = Status(colour=bcolors.GREEN, name="Finished")
    # Too few crab jobs have finished to hadd
    UNFINISHED = Status(colour=bcolors.RED, name="Still running")
    # hadd was handed to the batch system via qsub
    JOBSUBMITTED = Status(colour=bcolors.BLUE, name="Hadd job submitted")
def create_sample_listing():
    """Go through submission lists and retrieval list, and match based on task name to form one larger listing.

    The (task name, dataset) pairs come from MCBackgroundsSampleDictionaryUnordered
    and DataDictionaryMuonChannelUnordered; the output name for each task is the
    second element of the TaskDictionaryNameUnordered entry whose first element
    equals the task name. Tasks with no matching entry are reported and skipped.

    Returns
    -------
    list of Sample
        One Sample(taskname, dataset, outname) per matched task.

    Raises
    ------
    RuntimeError
        If a task name matches more than one retrieval entry.
    """
    entries = []
    submitted = chain(MCBackgroundsSampleDictionaryUnordered, DataDictionaryMuonChannelUnordered)
    for task_name, dataset in submitted:
        matching_output = [x for x in TaskDictionaryNameUnordered if x[0] == task_name]
        if len(matching_output) > 1:
            raise RuntimeError("More than 1 match for %s" % task_name)
        if not matching_output:
            print("No match for task %s" % task_name)
            continue
        output_name = matching_output[0][1]
        entries.append(Sample(task_name, dataset, output_name))
    return entries
def _parse_crab_status(status_text):
    """Pull the finished percentage and the task name out of `crab status` output."""
    finished_match = re.search(r"finished +([0-9.]+)", status_text)
    name_match = re.search(r"Task name:[ \t]+([0-9a-zA-Z_:-]+)", status_text)
    if name_match is None:
        raise RuntimeError("Cannot find task name in crab status output")
    return {
        # No "finished <number>" entry means no jobs are reported as finished yet
        "finished": float(finished_match.group(1)) if finished_match else 0.0,
        "task_name": name_match.group(1),
    }


def get_job_status(job_dir):
    """Run `crab status` on one crab project directory and summarise the result.

    Parameters
    ----------
    job_dir : str
        Name of the project directory inside crab_projects/.

    Returns
    -------
    dict
        "finished" : float, the number following "finished" in the status
        output (0.0 if there is none); "task_name" : str, the value of the
        "Task name:" field.

    Raises
    ------
    subprocess.CalledProcessError
        If the crab command exits with a non-zero status.
    RuntimeError
        If the status output has no "Task name:" field.
    """
    print("Checking job status of %s" % job_dir)
    # Argument list instead of a shell string: job_dir needs no quoting
    cmd = ["crab", "status", "-d", "crab_projects/%s" % job_dir]
    # universal_newlines=True makes check_output return text on Python 3 as well
    status_text = subprocess.check_output(cmd, universal_newlines=True)
    return _parse_crab_status(status_text)
def do_one_task(entry, channel, min_percent_finished=90.):
    """Do checking & hadding for one crab task.

    If output file already exists, skip this task.

    Parameters
    ----------
    entry : Sample
        Task to process; its taskname, dataset and outname fields are used.
    channel : {"mu", "ele"}
    min_percent_finished : float
        Minimum % of jobs that must be finished to perform hadd job

    Returns
    -------
    Status
        JobStatus.FINISHED if the output file already exists or hadd was run
        here, JobStatus.UNFINISHED if too few crab jobs have finished,
        JobStatus.JOBSUBMITTED if the hadd was submitted with qsub.

    Raises
    ------
    RuntimeError
        If channel is invalid, or (local hadd only) no input file matches.
    subprocess.CalledProcessError
        If qsub or hadd exits with a non-zero status.
    """
    if channel not in ["mu", "ele"]:
        raise RuntimeError("channel arg must be mu or ele")

    output_file = os.path.join(OUTPUT_DIR, entry.outname + "_%s.root" % channel)
    if os.path.isfile(output_file):
        print(bcolors.YELLOW + "! Output file already exists - skipping this task" + bcolors.ENDC)
        return JobStatus.FINISHED

    crab_dir = "crab_%s_%s_%s" % (entry.taskname, channel, FEATURE_NAME)
    status_dict = get_job_status(crab_dir)
    print(status_dict)
    if status_dict['finished'] < min_percent_finished:
        print(bcolors.RED + "crab jobs not finished - skipping" + bcolors.ENDC)
        return JobStatus.UNFINISHED

    # Input files live under <NAF_DIR>/<2nd "/"-field of dataset>/<crab dir>/<task name before ":">/0000/
    # NOTE(review): only the "0000" subdirectory is read - confirm no task writes further subdirectories
    sample_dir = entry.dataset.split("/")[1]
    date_str = status_dict['task_name'].split(":")[0]
    input_str = os.path.join(NAF_DIR, sample_dir, crab_dir, date_str, "0000", "tree_%s_*.root" % channel)

    # actually do the hadding
    if RUN_ON_BIRD:
        # The wildcard is passed on unexpanded; qsub_hadd.sh receives it as INPUTF
        qsub_command = [
            "qsub",
            "-N", entry.taskname,
            "-v", "OUTPUTF=%s,INPUTF=%s" % (output_file, input_str),
            "qsub_hadd.sh",
        ]
        subprocess.check_call(qsub_command)
        print(bcolors.GREEN + "hadd job submitted" + bcolors.ENDC)
        return JobStatus.JOBSUBMITTED

    # Expand the wildcard here (sorted, as a shell would) so that no shell is needed
    input_files = sorted(glob.glob(input_str))
    if not input_files:
        raise RuntimeError("No input files match %s" % input_str)
    print("hadd %s %s" % (output_file, input_str))
    subprocess.check_output(["hadd", output_file] + input_files)
    return JobStatus.FINISHED
if __name__ == "__main__":
    # List every matched sample, then hadd the selected ones for the muon channel
    samples = create_sample_listing()
    for x in samples:
        print(x)

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # NOTE(review): only samples[7:9] are processed - looks like a leftover
    # from testing; confirm whether the full list is intended.
    selected = samples[7:9]

    results = []
    for ind, entry in enumerate(selected, 1):
        # Progress is counted against the samples actually processed
        print(bcolors.BLUE + ">>> [%d/%d] %s" % (ind, len(selected), entry.taskname) + bcolors.ENDC)
        result = do_one_task(entry, "mu", MIN_PERCENT_FINISHED)
        results.append((entry.taskname, result))

    # Print nice summary
    n_submitted = sum(1 for (name, res) in results if res == JobStatus.JOBSUBMITTED)
    n_finished = sum(1 for (name, res) in results if res == JobStatus.FINISHED)
    n_unfinished = sum(1 for (name, res) in results if res == JobStatus.UNFINISHED)
    n_total = str(len(results))

    print("")
    print("=" * 80)
    print("SUMMARY:")
    print("-" * 80)
    for (name, res) in results:
        print(res.colour + name + " (" + res.name + ")" + bcolors.ENDC)
    print("-" * 80)
    print(JobStatus.JOBSUBMITTED.colour + str(n_submitted) + " jobs submitted" + bcolors.ENDC)
    print(JobStatus.FINISHED.colour + str(n_finished) + " / " + n_total + " jobs finished" + bcolors.ENDC)
    print(JobStatus.UNFINISHED.colour + str(n_unfinished) + " / " + n_total + " jobs still running" + bcolors.ENDC)
    print("=" * 80)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement