Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import time
- import argparse
- from collections import defaultdict
- from parsers import parse
- from tqdm import tqdm
- import numpy as np
- import matplotlib as mpl
- mpl.use('Agg')
- import matplotlib.pyplot as plt
- def main():
- _time_start = time.time()
- print('[*] Parsing args...')
- parser = argparse.ArgumentParser(description='Genome coverage', epilog='Example flags: <..> -i out/align.sam -r data/Ecoli.fasta -k 100')
- parser.add_argument('-i', '--input', default='alignment.sam', help='Alignment filename')
- parser.add_argument('-r', '--reference', default='reference.fasta', help='Reference filename')
- parser.add_argument('-o', '--output', default='graph_genome_coverage', help='Output filename without extension')
- parser.add_argument('-k', '--bucket', type=int, default=1000, help='Bucket size [default: %(default)s]')
- parser.add_argument('-n', '--dry-run', action='store_true', help='Dry run')
- args = parser.parse_args()
- alignment_filename = args.input
- reference_filename = args.reference
- output_filename = args.output
- k = args.bucket
- dry_run = args.dry_run
- alignment = parse(alignment_filename)
- reference = parse(reference_filename)
- print('[*] Calculating genome length...')
- # genome_length = len(next(reference).seq)
- genome_length = 15441380
- print('[+] Reference length: <{}>'.format(genome_length))
- print('[*] Working with k = <{}>...'.format(k))
- _time_start_calc = time.time()
- # n = genome_length // k + 1
- data = defaultdict(lambda: defaultdict(int))
- for item in tqdm(alignment, desc='Aligned reads', total=31014224):
- left = int(item.pos)
- if left > 0: # Skip unaligned read (pos=0)
- tlen = int(item.tlen)
- if 0 < tlen < 1000:
- right = left + tlen - 1
- a = left // k
- b = right // k
- for i in range(a, b + 1):
- data[item.rname][i] += 1
- print('[+] Calculated in {:.3f} seconds'.format(time.time() - _time_start_calc))
- print('[*] Refining data (<{}> chromosomes)...'.format(len(data)))
- data = {chromo: np.array([meta[i] for i in range(1, max(meta) + 1)]) for chromo, meta in data.items()}
- for chromo in data:
- print(' > len({}) = {}'.format(chromo, len(data[chromo])))
- print('[*] Dumping chromosomes into <{}_{{chromo}}.txt>...'.format(output_filename))
- # for chromo in data:
- # chromo_filename = '{}_{}.txt'.format(output_filename, chromo)
- # print(' > <{}>...'.format(chromo_filename))
- # with open(chromo_filename, 'w') as f:
- # for x in data[chromo]:
- # f.write('{}\n'.format(x))
- print(' > This was a joke :D')
- print('[*] Plotting chromosomes...')
- for chromo in data:
- n = len(data[chromo])
- chromo_filename = '{}_{}.png'.format(output_filename, chromo)
- # print(' > <{}>...'.format(chromo_filename))
- plt.clf()
- plt.plot([i * k for i in range(n)], data[chromo]) # '-', marker='o', markerfacecolor='red')
- plt.title('{} coverage'.format(chromo))
- plt.grid(True)
- plt.xlim(xmin=0)
- plt.ylim(0, np.mean(data[chromo]) + 5 * np.std(data[chromo]))
- plt.tight_layout()
- print(' > Saving to <{}.png>...'.format(chromo_filename))
- plt.savefig(chromo_filename, dpi=300)
- print('[+] Done in {:.3f} seconds.'.format(time.time() - _time_start))
- # plt.show()
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement