# Untitled

#!/usr/bin/env python
import sys
import os
from datetime import datetime
from time import time
import re
# Matches the 13-digit epoch-millisecond start time in a job-list row.
START_TIME_RE = re.compile(r'\d{13}')

############################
# format from hadoop job -list expected to look like this...
# job_201106020849_0019 1       1307126449025   cloudera        NORMAL  NA
############################


def elapsed_job_line(jobline, now_ms):
    """Rewrite one `hadoop job -list` row to show elapsed seconds.

    Args:
        jobline: one raw output line, trailing newline included.
        now_ms: current time in milliseconds since the epoch.

    Returns:
        A ``(job_id, line)`` tuple where ``line`` is ``jobline`` with every
        13-digit run replaced by the whole number of seconds between the
        row's start time (third field) and ``now_ms``, and with trailing
        newlines stripped.  Returns ``None`` for the header rows and for any
        line that does not carry an integer third field, so stray output
        (blank lines, warnings) is skipped instead of crashing the script.
    """
    if 'JobId' in jobline or 'currently' in jobline:  # get rid of headers
        return None
    words = jobline.split()  # split on whitespace; words[0] is the job id
    if len(words) < 3:
        return None
    try:
        start_ms = int(words[2])  # pull out the start time
    except ValueError:
        return None
    # Floor division keeps whole seconds on both Python 2 and Python 3.
    run_time = (int(now_ms) - start_ms) // 1000
    newjobline = START_TIME_RE.sub(str(run_time), jobline)
    return words[0], newjobline.rstrip("\n")  # strip extra newline


def count_running_attempts(job_id, task_type):
    """Return how many lines `hadoop job -list-attempt-ids` prints.

    Args:
        job_id: job identifier taken from the first field of a job row.
        task_type: task type passed to the command ("map" or "reduce").

    The command is asked for attempts in the ``running`` state; each output
    line is counted as one attempt.  The pipe is closed before returning.
    """
    command = ('hadoop job -list-attempt-ids ' + job_id + ' ' + task_type
               + ' running')
    with os.popen(command) as attempts:
        return len(attempts.readlines())


def main(argv=None):
    """Print every listed job with its elapsed time and running task counts.

    Args:
        argv: argument vector; defaults to ``sys.argv``.  When it holds
            exactly one argument and that argument is ``-task``, jobs are
            printed after the scan, ordered by running task count
            (descending).  Otherwise each job is printed as soon as it is
            read, i.e. in the order `hadoop job -list` reports them.
    """
    argv = sys.argv if argv is None else argv
    arg = argv[1] if len(argv) == 2 else ""
    sort_by_tasks = arg == "-task"

    now_ms = int(time() * 1000)  # milliseconds, to match the job start times
    task_list = []  # (total running tasks, job text) tuples for -task order

    with os.popen('hadoop job -list') as job_list:  # job list from jobtracker
        for jobline in job_list:
            parsed = elapsed_job_line(jobline, now_ms)
            if parsed is None:
                continue
            job_id, newjobline = parsed
            nummaps = count_running_attempts(job_id, 'map')
            numreducers = count_running_attempts(job_id, 'reduce')
            taskline = (" MAPPERS:" + str(nummaps)
                        + "  REDUCERS:" + str(numreducers))
            jobinfo = newjobline + "\n" + taskline
            if sort_by_tasks:
                task_list.append((nummaps + numreducers, jobinfo))
            else:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(jobinfo)

    if sort_by_tasks:
        task_list.sort(reverse=True)  # busiest jobs first
        for _, jobinfo in task_list:
            print(jobinfo)


if __name__ == "__main__":
    main()