Untitled

#!/usr/bin/env python


"""
hadd output of crab jobs utilising the fact that the output files are here at DESY,
so we can skip the painful crab getoutput
"""


import os
import subprocess
from collections import namedtuple
from itertools import chain
import re

from retrieve_jobs import TaskDictionaryNameUnordered
from submit_jobs import MCBackgroundsSampleDictionaryUnordered, SignalMCSampleDictionaryUnordered, DataDictionaryMuonChannelUnordered, DataDictionaryElectronChannelUnordered


# Location of crab output files; used as the root of the hadd input path
NAF_DIR = "/pnfs/desy.de/cms/tier2/store/user/raggleto/"

# Where you want the hadded files to go (the main script creates it if missing)
OUTPUT_DIR = "/nfs/dust/cms/user/aggleton/aTGCsAnalysis/Samples_80X_Working_21_7_17_NewBtagSFNewVtagSF"

# Suffix of the crab project directory name: "crab_<taskname>_<channel>_<FEATURE_NAME>"
FEATURE_NAME = "NewBtagSFNewVtagSF"

# whether to run hadd jobs on bird cluster instead
# (True: submit qsub_hadd.sh via qsub; False: run hadd directly from this script)
RUN_ON_BIRD = True

# minimum % of jobs that must have completed successfully to generate a hadded file
# (passed to do_one_task() by the main script, overriding that function's default)
MIN_PERCENT_FINISHED = 100.

# One entry of the combined listing: crab task name, dataset string, and the
# base name used for the hadded output file
Sample = namedtuple("Sample", ["taskname", "dataset", "outname"])


class bcolors:
    """ANSI escape sequences used to colour terminal output.

    Wrap a message as ``bcolors.<COLOUR> + message + bcolors.ENDC`` so the
    colour is switched off again after the message.
    """
    # Bright foreground colours
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'

    # Reset all attributes back to the terminal default
    ENDC = '\033[0m'


# Pairs a terminal colour escape code with a human-readable label
Status = namedtuple("Status", ["colour", "name"])


class JobStatus:
    """Possible outcomes of processing one crab task, as returned by do_one_task()."""
    # Hadded file is available (already existed, or was produced locally)
    FINISHED = Status(bcolors.GREEN, "Finished")
    # Not enough crab jobs have finished yet, so no hadd was attempted
    UNFINISHED = Status(bcolors.RED, "Still running")
    # A batch job performing the hadd has been submitted
    JOBSUBMITTED = Status(bcolors.BLUE, "Hadd job submitted")


def create_sample_listing():
    """Combine the submission lists with the retrieval list into one listing.

    Every (task name, dataset) pair from the MC background and muon-channel
    data submission lists is matched, by task name, against the entries of
    TaskDictionaryNameUnordered.

    Returns
    -------
    list of Sample
        One Sample(taskname, dataset, outname) per matched task, in
        submission-list order. Tasks without a match are reported on stdout
        and left out.

    Raises
    ------
    RuntimeError
        If a task name matches more than one retrieval entry.
    """
    listing = []
    submitted = chain(MCBackgroundsSampleDictionaryUnordered, DataDictionaryMuonChannelUnordered)
    for task_name, dataset in submitted:
        # Retrieval entries are indexed as (task name, output name, ...)
        candidates = [item for item in TaskDictionaryNameUnordered if item[0] == task_name]
        if not candidates:
            print("No match for task %s" % task_name)
            continue
        if len(candidates) > 1:
            raise RuntimeError("More than 1 match for %s" % task_name)
        listing.append(Sample(task_name, dataset, candidates[0][1]))

    return listing


def _parse_crab_status(status_text):
    """Extract the finished percentage and task name from `crab status` output.

    Parameters
    ----------
    status_text : str
        Text printed by `crab status`.

    Returns
    -------
    dict
        "finished": float, the number following the word "finished"
        (0.0 when no such entry is present);
        "task_name": str, the value following "Task name:".

    Raises
    ------
    RuntimeError
        If no "Task name:" entry can be found in the text.
    """
    finished_match = re.search(r"finished +([0-9.]+)", status_text)
    task_match = re.search(r"Task name:[ \t]+([0-9a-zA-Z_:-]+)", status_text)
    if task_match is None:
        # Fail with a clear message rather than an AttributeError on None
        raise RuntimeError("Could not find task name in crab status output")
    return {
        # No "finished" entry means no job has been reported as finished yet
        "finished": float(finished_match.group(1)) if finished_match else 0.0,
        "task_name": task_match.group(1),
    }


def get_job_status(job_dir):
    """Run `crab status` on a crab project directory and summarise the result.

    Parameters
    ----------
    job_dir : str
        Name of the project directory inside crab_projects/.

    Returns
    -------
    dict
        "finished" (float) and "task_name" (str), see _parse_crab_status().

    Raises
    ------
    subprocess.CalledProcessError
        If the crab command exits with a non-zero status.
    RuntimeError
        If the task name cannot be found in the command output.
    """
    print("Checking job status of %s" % job_dir)
    cmd = "crab status -d crab_projects/%s" % job_dir
    status_text = subprocess.check_output(cmd, shell=True)
    return _parse_crab_status(status_text)


def do_one_task(entry, channel, min_percent_finished=90.):
    """Check one crab task and, if it is complete enough, hadd its output.

    Nothing is done when the hadded output file is already present.

    Parameters
    ----------
    entry : Sample
        Task to process; its taskname, dataset and outname fields are used.
    channel : {"mu", "ele"}
        Lepton channel, used in the crab directory name and the file names.
    min_percent_finished : float
        Minimum % of jobs that must be finished to perform hadd job

    Returns
    -------
    Status
        JobStatus.FINISHED if the output already exists or a local hadd was
        run, JobStatus.UNFINISHED if too few crab jobs have finished,
        JobStatus.JOBSUBMITTED if the hadd was submitted with qsub.

    Raises
    ------
    RuntimeError
        If channel is neither "mu" nor "ele".
    """
    if channel not in ("mu", "ele"):
        raise RuntimeError("channel arg must be mu or ele")

    hadded_name = entry.outname + "_" + channel + ".root"
    output_file = os.path.join(OUTPUT_DIR, hadded_name)
    if os.path.isfile(output_file):
        print(bcolors.YELLOW + "! Output file already exists - skipping this task" + bcolors.ENDC)
        return JobStatus.FINISHED

    crab_dir = "crab_%s_%s_%s" % (entry.taskname, channel, FEATURE_NAME)
    status_dict = get_job_status(crab_dir)
    print(status_dict)
    enough_finished = status_dict['finished'] >= min_percent_finished
    if not enough_finished:
        print(bcolors.RED + "crab jobs not finished - skipping" + bcolors.ENDC)
        return JobStatus.UNFINISHED

    # Input path: <NAF_DIR>/<2nd "/"-separated part of dataset>/<crab dir>/
    #             <task name up to the first ":">/0000/tree_<channel>_*.root
    dataset_parts = entry.dataset.split("/")
    sample_dir = dataset_parts[1]
    date_str = status_dict['task_name'].partition(":")[0]
    tree_pattern = "tree_%s_*.root" % channel
    input_str = os.path.join(NAF_DIR, sample_dir, crab_dir, date_str, "0000", tree_pattern)

    # actually do the hadding
    if not RUN_ON_BIRD:
        hadd_cmd = "hadd %s %s" % (output_file, input_str)
        print(hadd_cmd)
        # The shell expands the wildcard; hadd's captured output is not used
        subprocess.check_output(hadd_cmd, shell=True)
        return JobStatus.FINISHED

    # Output and input paths are handed to qsub_hadd.sh as environment variables
    qsub_command = """qsub -N %s -v OUTPUTF="%s",INPUTF="%s" qsub_hadd.sh""" % (entry.taskname, output_file, input_str)
    # print(qsub_command)  # Uncomment this line when testing to view the qsub command
    subprocess.check_call(qsub_command, shell=True)
    print(bcolors.GREEN + "hadd job submitted" + bcolors.ENDC)
    return JobStatus.JOBSUBMITTED


if __name__ == "__main__":
    # Build and show the full listing of known samples
    samples = create_sample_listing()
    for sample in samples:
        print(sample)

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # NOTE(review): only this hard-coded slice of the listing is processed -
    # looks like a leftover from testing; confirm before relying on it.
    selected = samples[7:9]
    n_selected = len(selected)

    results = []
    for ind, entry in enumerate(selected, 1):
        # Progress is counted against the tasks actually processed, not the full listing
        print(bcolors.BLUE + ">>> [%d/%d] %s" % (ind, n_selected, entry.taskname) + bcolors.ENDC)
        result = do_one_task(entry, "mu", MIN_PERCENT_FINISHED)
        results.append((entry.taskname, result))

    # Tally each outcome once for the summary
    n_total = len(results)
    n_submitted = sum(res == JobStatus.JOBSUBMITTED for (name, res) in results)
    n_finished = sum(res == JobStatus.FINISHED for (name, res) in results)
    n_unfinished = sum(res == JobStatus.UNFINISHED for (name, res) in results)

    # Print nice summary
    print("")
    print("=" * 80)
    print("SUMMARY:")
    print("-" * 80)
    for (name, res) in results:
        print(res.colour + name + "  (" + res.name + ")" + bcolors.ENDC)
    print("-" * 80)
    print(JobStatus.JOBSUBMITTED.colour + str(n_submitted) + " jobs submitted" + bcolors.ENDC)
    print(JobStatus.FINISHED.colour + str(n_finished) + " / " + str(n_total) + " jobs finished" + bcolors.ENDC)
    print(JobStatus.UNFINISHED.colour + str(n_unfinished) + " / " + str(n_total) + " jobs still running" + bcolors.ENDC)
    print("=" * 80)