import argparse
import datetime
import re
from collections import defaultdict

import requests
from genologics.lims import Lims

# Connection details: fill these in before running
baseuri = ''
username = ''
password = ''
apiurl = ''
lims = Lims(baseuri=baseuri, username=username, password=password)

def get_lims_samples(sample_name, lims):
    samples = lims.get_samples(name=sample_name)
    # FIXME: remove this fallback once we're sure our sample ids don't contain colons
    if len(samples) == 0:
        sample_name_sub = re.sub(r"_(\d{2})$", r":\g<1>", sample_name)
        samples = lims.get_samples(name=sample_name_sub)
    if len(samples) == 0:
        sample_name_sub = re.sub(r"__(\w)_(\d{2})", r" _\g<1>:\g<2>", sample_name)
        samples = lims.get_samples(name=sample_name_sub)
    return samples
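
# Illustrative effect of the two fallback substitutions above (sample names are
# hypothetical):
#   'X99999_01'    -> 'X99999:01'
#   'X99999__A_01' -> 'X99999 _A:01'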


def get_lims_sample(sample_name):
    samples = get_lims_samples(sample_name, lims)
    if len(samples) != 1:
        return None
    return samples[0]


def get_expected_yield_for_sample(sample_name):
    """
    Query the LIMS and return the number of bases expected for a sample.
    :param sample_name: the sample name
    :return: number of bases
    """
    sample = get_lims_sample(sample_name)
    if sample:
        # Fall back to 95 Gb if the UDF is not set on the sample
        nb_gb = sample.udf.get('Yield for Quoted Coverage (Gb)') or 95
    else:
        nb_gb = 95
    return int(nb_gb) * 1000000000


def new_sample_passing_threshold(sample_ids_to_info):
    sample_passing_threshold = []
    for sample_id in sample_ids_to_info:
        threshold = get_expected_yield_for_sample(sample_id)
        if sample_ids_to_info[sample_id].get('yield_q30', 0) > threshold and \
                sample_ids_to_info[sample_id].get('ready', 'no') == 'no':
            # Flag the sample so it is only reported as newly passing once
            sample_ids_to_info[sample_id]['ready'] = 'yes'
            sample_passing_threshold.append(sample_id)
    return sample_passing_threshold

def depaginate(url, query, extra):
    r = requests.get(url + query + extra)
    json = r.json()
    samples = json.get('data')
    # Follow 'next' links until the last page has been fetched
    while 'next' in json['_links']:
        query = json['_links']['next']['href']
        r = requests.get(url + query + extra)
        json = r.json()
        samples.extend(json.get('data'))
    return samples
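
# For reference, a minimal sketch of the paginated payload depaginate() expects,
# inferred from the lookups above rather than from a documented schema:
# {
#     "data": [{"run_id": "160718_...", "sample_id": "...", "useable": "yes"}],
#     "_links": {"next": {"href": "run_elements?max_results=100&page=2"}}
# }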

INTERVAL_WEEK = 'week'
INTERVAL_MONTH = 'month'


def aggregate_run_per_week(run_elements):
    data_per_week = defaultdict(list)
    weeks = set()
    for run_element in run_elements:
        # Run ids are assumed to start with the run date in yymmdd format
        date = datetime.datetime.strptime(run_element.get('run_id')[:6], '%y%m%d')
        year, week_of_the_year, day_of_week = date.isocalendar()
        start = date - datetime.timedelta(days=date.weekday())
        end = start + datetime.timedelta(days=6)
        week = '%s week %02d (%s - %s)' % (year, week_of_the_year, start.strftime('%d/%m'), end.strftime('%d/%m'))
        data_per_week[week].append(run_element)
        weeks.add(week)
    return sorted(weeks), data_per_week

def aggregate_run_per_month(run_elements):
    data_per_month = defaultdict(list)
    months = set()
    for run_element in run_elements:
        date = datetime.datetime.strptime(run_element.get('run_id')[:6], '%y%m%d')
        # Zero-pad the month number so the labels sort chronologically
        month = '%s month %02d (%s)' % (date.year, date.month, date.strftime('%B'))
        data_per_month[month].append(run_element)
        months.add(month)
    return sorted(months), data_per_month

def aggregate(run_elements, sample_ids_to_info=None):
    if sample_ids_to_info is None:
        sample_ids_to_info = defaultdict(dict)
    aggregated_data = {}
    clean_bases_r1 = sum(int(e.get('clean_bases_r1', '0')) for e in run_elements if e.get('useable') == 'yes')
    clean_bases_r2 = sum(int(e.get('clean_bases_r2', '0')) for e in run_elements if e.get('useable') == 'yes')
    clean_q30_bases_r1 = sum(int(e.get('clean_q30_bases_r1', '0')) for e in run_elements if e.get('useable') == 'yes')
    clean_q30_bases_r2 = sum(int(e.get('clean_q30_bases_r2', '0')) for e in run_elements if e.get('useable') == 'yes')
    # Yields are reported in Gb
    aggregated_data['Yield'] = '%.2f' % ((clean_bases_r1 + clean_bases_r2) / 1000000000.0)
    aggregated_data['Yield Q30'] = '%.2f' % ((clean_q30_bases_r1 + clean_q30_bases_r2) / 1000000000.0)
    run_ids = set(e.get('run_id') for e in run_elements if e.get('useable') == 'yes' and e.get('reviewed') == 'pass')
    aggregated_data['Runs successful'] = '%d' % len(run_ids)
    run_ids = set(e.get('run_id') for e in run_elements if e.get('useable') == 'yes' and e.get('reviewed') == 'fail')
    aggregated_data['Runs failed'] = '%d' % len(run_ids)
    sample_ids = set(e.get('sample_id') for e in run_elements if e.get('useable') == 'yes')
    sample_ids.discard('Undetermined')
    # Record which samples have not been seen in a previous interval before the
    # loop below adds every current sample to sample_ids_to_info
    new_samples = sample_ids.difference(set(sample_ids_to_info))
    for sample_id in sample_ids:
        # Accumulate this sample's Q30 yield from its own usable run elements
        new_yield = sum(int(e.get('clean_q30_bases_r1', '0')) + int(e.get('clean_q30_bases_r2', '0'))
                        for e in run_elements if e.get('useable') == 'yes' and e.get('sample_id') == sample_id)
        if 'yield_q30' in sample_ids_to_info[sample_id]:
            sample_ids_to_info[sample_id]['yield_q30'] += new_yield
        else:
            sample_ids_to_info[sample_id]['yield_q30'] = new_yield
    sample_passing_threshold = new_sample_passing_threshold(sample_ids_to_info)

    aggregated_data['Samples sequenced'] = '%d' % len(sample_ids)
    aggregated_data['New Samples'] = '%d' % len(new_samples)
    aggregated_data['Samples passing threshold'] = '%d' % len(sample_passing_threshold)

    return aggregated_data, sample_ids_to_info
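
# Sketch of the first return value for one interval (figures hypothetical):
# {'Yield': '123.45', 'Yield Q30': '98.76', 'Runs successful': '3', 'Runs failed': '1',
#  'Samples sequenced': '42', 'New Samples': '10', 'Samples passing threshold': '7'}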


def aggregate_per_time(run_elements, interval_type=INTERVAL_WEEK):
    if interval_type == INTERVAL_WEEK:
        list_intervals, data_per_interval = aggregate_run_per_week(run_elements)
    elif interval_type == INTERVAL_MONTH:
        list_intervals, data_per_interval = aggregate_run_per_month(run_elements)
    else:
        raise ValueError('Unknown interval type: %s' % interval_type)
    headers = [interval_type, 'Yield', 'Yield Q30', 'Runs successful', 'Runs failed', 'Samples sequenced',
               'New Samples', 'Samples passing threshold']
    sample_ids_to_info = defaultdict(dict)
    aggregate_per_interval = {}
    # Aggregate intervals in chronological order so per-sample yields accumulate
    # from oldest to newest
    for interval in sorted(list_intervals)[:6]:
        aggregated_data, sample_ids_to_info = aggregate(data_per_interval.get(interval), sample_ids_to_info)
        aggregate_per_interval[interval] = aggregated_data
    # Print one tab-separated row per interval, most recent first
    print('\t'.join(headers))
    for interval in sorted(aggregate_per_interval, reverse=True):
        out = [interval]
        aggregated_data = aggregate_per_interval.get(interval)
        for header in headers[1:]:
            out.append(aggregated_data.get(header))
        print('\t'.join(out))


def get_run_qc(interval_type):
    query = 'run_elements?max_results=100'
    extra = ''
    run_elements = depaginate(apiurl, query, extra)
    aggregate_per_time(run_elements, interval_type)

def main():
    p = argparse.ArgumentParser()
    p.add_argument('--interval', type=str, default=INTERVAL_WEEK,
                   help='Set the interval over which the metrics will be calculated '
                        '(%s or %s)' % (INTERVAL_WEEK, INTERVAL_MONTH))
    args = p.parse_args()
    get_run_qc(args.interval)


if __name__ == '__main__':
    main()
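
# Example invocation and output sketch (file name and figures are hypothetical):
#   $ python run_qc_per_interval.py --interval week
#   week    Yield   Yield Q30   Runs successful   Runs failed   Samples sequenced   New Samples   Samples passing threshold
#   2016 week 28 (11/07 - 17/07)    102.34    95.67    3    1    42    10    7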