Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import argparse
- from collections import defaultdict
- from parsers import parse
- from tqdm import tqdm
- import numpy as np
- import matplotlib as mpl
- mpl.use('Agg')
- import matplotlib.pyplot as plt
- def main():
- _time_start = time.time()
- print('[*] Parsing args...')
- parser = argparse.ArgumentParser(description='Genome coverage', epilog='Example flags: <..> -i out/align.sam -r data/ref.fasta -o graph_isize -b 50')
- parser.add_argument('-i', '--input', default='alignment.sam', help='Alignment filename')
- parser.add_argument('-r', '--reference', default='reference.fasta', help='Reference filename')
- parser.add_argument('-o', '--output', default='graph_insertion_length', help='Output filename without extension')
- parser.add_argument('-b', '--bins', default=100, type=int, help='Histogram bins [default: %(default)s]')
- parser.add_argument('-n', '--dry-run', action='store_true', help='Dry run')
- args = parser.parse_args()
- alignment_filename = args.input
- reference_filename = args.reference
- output_filename = args.output
- bins = args.bins
- dry_run = args.dry_run
- alignment = parse(alignment_filename)
- # reference = parse(reference_filename)
- print('[*] Calculating genome length...')
- # genome_length = len(next(reference).seq)
- genome_length = 15441380
- print('[+] Reference length: <{}>'.format(genome_length))
- print('[*] Working...')
- _time_start_calc = time.time()
- meta = defaultdict(lambda: defaultdict(int))
- # for item in tqdm(alignment, desc='Aligned reads'):
- for item in tqdm(alignment, desc='Aligned reads', total=31014224):
- chromo = item.rname
- tlen = int(item.tlen)
- if 0 < tlen < 1000:
- meta[chromo][tlen] += 1
- print('[+] Calculated in {:.3f} seconds'.format(time.time() - _time_start_calc))
- print('[*] Refining meta (<{}> chromosomes)...'.format(len(meta)))
- meta = {chromo: [(i, data[i]) for i in range(1, max(data.keys()) + 1)] for chromo, data in meta.items()}
- for chromo, data in meta.items():
- chromo_filename = '{}_{}'.format(output_filename, chromo)
- print('[*] Statistics for <{}>:'.format(chromo))
- sizes = [k for k, v in data]
- weights = [v for k, v in data]
- avg = np.average(sizes, weights=weights)
- std = np.sqrt(np.average((sizes - avg)**2, weights=weights))
- low = avg - 2 * std
- high = avg + 2 * std
- print(' > Mean: {:.1f}'.format(avg))
- print(' > Std: {:.1f}'.format(std))
- print(' > 95%CI: {:.1f} -- {:.1f}'.format(low, high))
- print('[*] Plotting <{}>...'.format(chromo))
- plt.clf()
- plt.hist(sizes, weights=weights, bins=bins)
- # plt.plot(sizes, weights)
- plt.title('{} insertion size distribution\n$\\mu={:.1f},$ $\\sigma^2={:.1f},$ 95%CI: ${:.1f} \\ldots {:.1f}$'.format(chromo, avg, std**2, low, high))
- plt.xlim(xmin=0, xmax=max(sizes))
- plt.grid(True)
- plt.tight_layout()
- print('[*] Saving to <{}.png>...'.format(chromo_filename))
- plt.savefig('{}.png'.format(chromo_filename), dpi=300)
- print('[+] Done in {:.3f} seconds.'.format(time.time() - _time_start))
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement