Guest User

Untitled

a guest
Feb 21st, 2018
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.52 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding=utf-8 -*-
  3. import os
  4. import glob
  5. import commands
  6. import argparse
  7.  
  8. from collections import OrderedDict
  9.  
  10.  
# Module metadata: the version is handed to the argument parser's version
# option, and the author name is used to build the contact line shown in the
# command-line help epilog.
__version__ = '1.0'
__author__ = 'suqingdong'
  13.  
  14.  
  15. def human_readable_size(size):
  16.  
  17. if size > 1024 * 1024 * 1024:
  18. size = str(round(size / 1024. / 1024. / 1024., 1)) + 'G'
  19. elif size > 1024 * 1024:
  20. size = str(round(size / 1024. / 1024., 1)) + 'M'
  21. elif size > 1024:
  22. size = str(round(size / 1024., 1)) + 'K'
  23. elif 0 < size < 1024:
  24. size = str(size)
  25. else:
  26. print 'size must greater then zero'
  27.  
  28. return size
  29.  
  30.  
  31. def get_size(file, human_readable=True):
  32.  
  33. if not os.path.exists(file):
  34. size = '\033[31;5mError\033[0m'
  35. elif os.path.getsize(file) == 0:
  36. size = '\033[31;5m0\033[0m'
  37. else:
  38. size = human_readable_size(os.path.getsize(file)) if human_readable else os.path.getsize(file)
  39.  
  40. return size
  41.  
  42.  
  43. def print_glob_size(path):
  44.  
  45. path_list = [path] if type(path) != list else path
  46.  
  47. for path in path_list:
  48. for f in glob.glob(path):
  49. f = f.lstrip('./')
  50. size = get_size(f)
  51. print '| {:6s}\t{}'.format(size, f)
  52.  
  53.  
  54. def check_raw_clean(name, data_path='.'):
  55.  
  56. print '\033[33m|{:=^100s} \033[0m'.format(' Checking %s ' % name)
  57. sample_nums = commands.getoutput('ls {}/{} | wc -l'.format(data_path, name))
  58. print '\033[323m|-- {}样本数: {}\033[0m'.format(name, sample_nums)
  59.  
  60. # fq_nums = commands.getoutput('ls %s/%s/*/*.fq.gz | wc -l' % (data_path, name))
  61. fq_nums = len(glob.glob('%s/%s/*/*.fq.gz' % (data_path, name)))
  62.  
  63. cmd = 'wc -l %s/%s/*/MD5.txt | grep -vE "(total)|(总用量)" | awk \'{print $1}\'' % (data_path, name)
  64. # print cmd
  65. md5_nums = sum(int(n) for n in commands.getoutput(cmd).split('\n'))
  66.  
  67. print '|-- fq.gz文件数:', fq_nums
  68. print '|-- MD5总行数:', md5_nums
  69.  
  70. if fq_nums != md5_nums:
  71. print '\033[31m|-- fq.gz文件数和MD5总行数不相等,请检查!!!\033[0m'
  72. elif fq_nums == 0:
  73. print '\033[31m|-- fq.gz文件不存在,请检查!!!\033[0m'
  74.  
  75. print_glob_size('{}/{}/*/*.fq.gz'.format(data_path, name))
  76.  
  77. for md5 in glob.glob('{}/{}/*/MD5.txt'.format(data_path, name)):
  78. md5 = md5.lstrip('./')
  79. md5_num = commands.getoutput('wc -l %s | awk \'{print $1}\'' % md5)
  80. md5_num = '\033[31;5m{}\033[0m'.format(md5_num) if md5_num == '0' else md5_num
  81. print '| {:6s}\t{}'.format(md5_num, md5)
  82.  
  83.  
  84. def check_bam(name='Mapping', data_path='Mapping'):
  85.  
  86. print '\033[33m|{:=^100s} \033[0m'.format(' Checking %s ' % name)
  87. sample_nums = commands.getoutput('ls -l {} | grep "^d" | wc -l'.format(data_path))
  88. print '\033[323m|-- Mapping样本数: {}\033[0m'.format(sample_nums)
  89.  
  90. print_glob_size('{}/*/*.bam*'.format(data_path))
  91.  
  92.  
  93. def check_variation(name='Variation', data_path='Variation'):
  94.  
  95. print '\033[33m|{:=^100s} \033[0m'.format(' Checking %s ' % name)
  96. sample_nums = commands.getoutput('ls -l {} | grep "^d" | wc -l'.format(data_path))
  97. print '\033[323m|-- Variation样本数: {}\033[0m'.format(sample_nums)
  98.  
  99. for each in ('SNP', 'InDel', 'SV', 'CNV'):
  100. if glob.glob('{}/*/{}'.format(data_path, each)):
  101. print '|-- checking %s ...' % each
  102. print_glob_size([
  103. '{}/*/{}/*.*'.format(data_path, each),
  104. '{}/*/{}/Circos/*'.format(data_path, each),
  105. ])
  106.  
  107.  
  108. def check_advance(name='Advance', data_path='Advance'):
  109.  
  110. print '\033[33m|{:=^100s} \033[0m'.format(' Checking %s ' % name)
  111.  
  112. analysis_items = sorted(str(d) for d in os.listdir(data_path) if 'Total' not in d)
  113.  
  114. print '|-- analysis items:\n| ' + '\n| '.join(analysis_items)
  115.  
  116. # Other analysis check needs to add here
  117. analysis_map = OrderedDict()
  118. analysis_map['FilterDB'] = [
  119. data_path + '/*FilterDB*/VCF/*',
  120. data_path + '/*FilterDB*/Filter/*/*'
  121. ]
  122. for each in ('ACMG', 'FilterCNV_SV', 'ModelF'):
  123. analysis_map[each] = data_path + '/*{}*/*/*'.format(each)
  124. analysis_map['Denovo'] = [
  125. data_path + '/*Denovo*/SNP_INDEL/Denovo*/*/*/*',
  126. data_path + '/*Denovo*/SNP_INDEL/IntersectResult/*xls',
  127. data_path + '/*Denovo*/SNP_INDEL/IntersectResult/*/*',
  128. data_path + '/*Denovo*/CNV_SV/*/*/*'
  129. ]
  130. for each in ('Noncoding', 'Network', 'PPI', 'Share'):
  131. analysis_map[each] = data_path + '/*{}*/*'.format(each)
  132. analysis_map['Pathway'] = [
  133. data_path + '/*Pathway*/*.*',
  134. data_path + '/*Pathway*/KEGG_maps/*.*',
  135. data_path + '/*Pathway*/KEGG_maps/png/*'
  136. ]
  137. analysis_map['Customiezd'] = data_path + '/*Customiezd*/*.*'
  138.  
  139. for analysis_detail in analysis_items:
  140. for analysis in analysis_map:
  141. if analysis in analysis_detail:
  142. print '\033[33m|-- checking %s ...\033[0m' % analysis_detail
  143. print_glob_size(analysis_map[analysis])
  144.  
  145.  
  146. def check_46(check_all=False):
  147.  
  148. print '\033[36m|=== Check result for disease pipeline 4.6 ...\033[0m'
  149.  
  150. # RawData and CleanData
  151. for each in ('RawData', 'CleanData'):
  152. if each in os.listdir('ReleaseResult/Data'):
  153. check_raw_clean(each, 'ReleaseResult/Data')
  154.  
  155. # BamData
  156. if 'BamData' in os.listdir('ReleaseResult/Data'):
  157. check_bam('BamData', 'ReleaseResult/Data/BamData')
  158.  
  159. # SampleVariation
  160. if 'SampleVariation' in os.listdir('ReleaseResult/PrimaryAnalysis'):
  161. check_variation('SampleVariation', 'ReleaseResult/PrimaryAnalysis/SampleVariation')
  162.  
  163. # FilterAnalysis
  164. if 'FilterAnalysis' in os.listdir('ReleaseResult/PrimaryAnalysis'):
  165. print '\033[33m|{:=^100s} \033[0m'.format(' Checking FilterAnalysis ')
  166. print_glob_size('ReleaseResult/PrimaryAnalysis/FilterAnalysis/*/*/*')
  167.  
  168. # FinalResult
  169. if 'FinalResult' in os.listdir('ReleaseResult'):
  170.  
  171. check_advance('FinalResult', 'ReleaseResult/FinalResult')
  172.  
  173. print '\033[33m|-- checking Total.candidate_gene.xls ...\033[0m'
  174. print_glob_size('ReleaseResult/FinalResult/*xls')
  175.  
  176. # 4.6的Advance目录只用于信息自查,不释放
  177. if check_all:
  178. print '|' + '-' * 100
  179. print '|\033[33m{:-^100s}\033[0m'.format(' The below directories will not be released, just used for self checking! ')
  180. if 'Advance' in os.listdir('.'):
  181. check_advance()
  182.  
  183.  
  184. def check_45():
  185.  
  186. print '\033[36m|=== Check result for disease pipeline 4.5 ...\033[0m'
  187.  
  188. directories = os.listdir('.')
  189.  
  190. # RawData and QC
  191. for each in ('RawData', 'QC'):
  192. if each in directories:
  193. check_raw_clean(each)
  194.  
  195. # Mapping
  196. if 'Mapping' in directories:
  197. check_bam()
  198.  
  199. # Variation
  200. if 'Variation' in directories:
  201. check_variation()
  202.  
  203. # Advance
  204. if 'Advance' in directories:
  205. check_advance()
  206.  
  207.  
  208. def main():
  209.  
  210. result_path = os.path.abspath(args.get('dir'))
  211. check_all = args.get('all')
  212.  
  213. os.chdir(result_path)
  214. print '\033[32m|=== Release Directory: %s\033[0m' % os.getcwd()
  215.  
  216. if 'ReleaseResult' in os.listdir('.'):
  217. check_46(check_all)
  218. else:
  219. check_45()
  220.  
  221.  
  222. if __name__ == "__main__":
  223.  
  224. parser = argparse.ArgumentParser(
  225. prog='Check',
  226. description='Check Result directory for disease pipeline',
  227. epilog='Contact: {0} <{0}@novogene.com>'.format(__author__),
  228. formatter_class=argparse.RawTextHelpFormatter,
  229. version=__version__
  230. )
  231.  
  232. parser.add_argument('-d', '--dir', help='The data release path to check [default="%(default)s"]', default='.')
  233. parser.add_argument('-a', '--all', help='Whether check Advance directory or not for 4.6 pipeline', action='store_true')
  234.  
  235. args = vars(parser.parse_args())
  236.  
  237. main()
Add Comment
Please, Sign In to add comment