Advertisement
Guest User

Untitled

a guest
Apr 30th, 2017
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.42 KB | None | 0 0
  1. import time
  2. import argparse
  3. from collections import defaultdict
  4. from parsers import parse
  5. from tqdm import tqdm
  6. import numpy as np
  7. import matplotlib as mpl
  8. mpl.use('Agg')
  9. import matplotlib.pyplot as plt
  10.  
  11.  
  12. def main():
  13. _time_start = time.time()
  14.  
  15. print('[*] Parsing args...')
  16. parser = argparse.ArgumentParser(description='Genome coverage', epilog='Example flags: <..> -i out/align.sam -r data/Ecoli.fasta -k 100')
  17. parser.add_argument('-i', '--input', default='alignment.sam', help='Alignment filename')
  18. parser.add_argument('-r', '--reference', default='reference.fasta', help='Reference filename')
  19. parser.add_argument('-o', '--output', default='graph_genome_coverage', help='Output filename without extension')
  20. parser.add_argument('-k', '--bucket', type=int, default=1000, help='Bucket size [default: %(default)s]')
  21. parser.add_argument('-n', '--dry-run', action='store_true', help='Dry run')
  22.  
  23. args = parser.parse_args()
  24. alignment_filename = args.input
  25. reference_filename = args.reference
  26. output_filename = args.output
  27. k = args.bucket
  28. dry_run = args.dry_run
  29.  
  30. alignment = parse(alignment_filename)
  31. reference = parse(reference_filename)
  32.  
  33. print('[*] Calculating genome length...')
  34. # genome_length = len(next(reference).seq)
  35. genome_length = 15441380
  36. print('[+] Reference length: <{}>'.format(genome_length))
  37.  
  38. print('[*] Working with k = <{}>...'.format(k))
  39. _time_start_calc = time.time()
  40. # n = genome_length // k + 1
  41. data = defaultdict(lambda: defaultdict(int))
  42. for item in tqdm(alignment, desc='Aligned reads', total=31014224):
  43. left = int(item.pos)
  44. if left > 0: # Skip unaligned read (pos=0)
  45. tlen = int(item.tlen)
  46. if 0 < tlen < 1000:
  47. right = left + tlen - 1
  48. a = left // k
  49. b = right // k
  50. for i in range(a, b + 1):
  51. data[item.rname][i] += 1
  52. print('[+] Calculated in {:.3f} seconds'.format(time.time() - _time_start_calc))
  53.  
  54. print('[*] Refining data (<{}> chromosomes)...'.format(len(data)))
  55. data = {chromo: np.array([meta[i] for i in range(1, max(meta) + 1)]) for chromo, meta in data.items()}
  56. for chromo in data:
  57. print(' > len({}) = {}'.format(chromo, len(data[chromo])))
  58.  
  59. print('[*] Dumping chromosomes into <{}_{{chromo}}.txt>...'.format(output_filename))
  60. # for chromo in data:
  61. # chromo_filename = '{}_{}.txt'.format(output_filename, chromo)
  62. # print(' > <{}>...'.format(chromo_filename))
  63. # with open(chromo_filename, 'w') as f:
  64. # for x in data[chromo]:
  65. # f.write('{}\n'.format(x))
  66. print(' > This was a joke :D')
  67.  
  68. print('[*] Plotting chromosomes...')
  69. for chromo in data:
  70. n = len(data[chromo])
  71. chromo_filename = '{}_{}.png'.format(output_filename, chromo)
  72. # print(' > <{}>...'.format(chromo_filename))
  73.  
  74. plt.clf()
  75. plt.plot([i * k for i in range(n)], data[chromo]) # '-', marker='o', markerfacecolor='red')
  76. plt.title('{} coverage'.format(chromo))
  77. plt.grid(True)
  78. plt.xlim(xmin=0)
  79. plt.ylim(0, np.mean(data[chromo]) + 5 * np.std(data[chromo]))
  80. plt.tight_layout()
  81.  
  82. print(' > Saving to <{}.png>...'.format(chromo_filename))
  83. plt.savefig(chromo_filename, dpi=300)
  84.  
  85. print('[+] Done in {:.3f} seconds.'.format(time.time() - _time_start))
  86. # plt.show()
  87.  
  88. if __name__ == '__main__':
  89. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement