Advertisement
Guest User

Untitled

a guest
Apr 30th, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.17 KB | None | 0 0
  1. import time
  2. import argparse
  3. from collections import defaultdict
  4. from parsers import parse
  5. from tqdm import tqdm
  6. import numpy as np
  7. import matplotlib as mpl
  8. mpl.use('Agg')
  9. import matplotlib.pyplot as plt
  10.  
  11.  
  12. def main():
  13. _time_start = time.time()
  14.  
  15. print('[*] Parsing args...')
  16. parser = argparse.ArgumentParser(description='Genome coverage', epilog='Example flags: <..> -i out/align.sam -r data/ref.fasta -o graph_isize -b 50')
  17. parser.add_argument('-i', '--input', default='alignment.sam', help='Alignment filename')
  18. parser.add_argument('-r', '--reference', default='reference.fasta', help='Reference filename')
  19. parser.add_argument('-o', '--output', default='graph_insertion_length', help='Output filename without extension')
  20. parser.add_argument('-b', '--bins', default=100, type=int, help='Histogram bins [default: %(default)s]')
  21. parser.add_argument('-n', '--dry-run', action='store_true', help='Dry run')
  22.  
  23. args = parser.parse_args()
  24. alignment_filename = args.input
  25. reference_filename = args.reference
  26. output_filename = args.output
  27. bins = args.bins
  28. dry_run = args.dry_run
  29.  
  30. alignment = parse(alignment_filename)
  31. # reference = parse(reference_filename)
  32.  
  33. print('[*] Calculating genome length...')
  34. # genome_length = len(next(reference).seq)
  35. genome_length = 15441380
  36. print('[+] Reference length: <{}>'.format(genome_length))
  37.  
  38. print('[*] Working...')
  39. _time_start_calc = time.time()
  40. meta = defaultdict(lambda: defaultdict(int))
  41. # for item in tqdm(alignment, desc='Aligned reads'):
  42. for item in tqdm(alignment, desc='Aligned reads', total=31014224):
  43. chromo = item.rname
  44. tlen = int(item.tlen)
  45. if 0 < tlen < 1000:
  46. meta[chromo][tlen] += 1
  47. print('[+] Calculated in {:.3f} seconds'.format(time.time() - _time_start_calc))
  48.  
  49. print('[*] Refining meta (<{}> chromosomes)...'.format(len(meta)))
  50. meta = {chromo: [(i, data[i]) for i in range(1, max(data.keys()) + 1)] for chromo, data in meta.items()}
  51.  
  52. for chromo, data in meta.items():
  53. chromo_filename = '{}_{}'.format(output_filename, chromo)
  54.  
  55. print('[*] Statistics for <{}>:'.format(chromo))
  56. sizes = [k for k, v in data]
  57. weights = [v for k, v in data]
  58. avg = np.average(sizes, weights=weights)
  59. std = np.sqrt(np.average((sizes - avg)**2, weights=weights))
  60. low = avg - 2 * std
  61. high = avg + 2 * std
  62. print(' > Mean: {:.1f}'.format(avg))
  63. print(' > Std: {:.1f}'.format(std))
  64. print(' > 95%CI: {:.1f} -- {:.1f}'.format(low, high))
  65.  
  66. print('[*] Plotting <{}>...'.format(chromo))
  67. plt.clf()
  68. plt.hist(sizes, weights=weights, bins=bins)
  69. # plt.plot(sizes, weights)
  70. plt.title('{} insertion size distribution\n$\\mu={:.1f},$ $\\sigma^2={:.1f},$ 95%CI: ${:.1f} \\ldots {:.1f}$'.format(chromo, avg, std**2, low, high))
  71. plt.xlim(xmin=0, xmax=max(sizes))
  72. plt.grid(True)
  73. plt.tight_layout()
  74.  
  75. print('[*] Saving to <{}.png>...'.format(chromo_filename))
  76. plt.savefig('{}.png'.format(chromo_filename), dpi=300)
  77.  
  78. print('[+] Done in {:.3f} seconds.'.format(time.time() - _time_start))
  79.  
  80. if __name__ == '__main__':
  81. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement