Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from collections import defaultdict
- from tqdm import tqdm
- from Bio import SeqIO
- alignment_filename = 'out/HTC10499.sam'
- reference_filename = 'data4/P.stipitis/Pichia_stipitis_Sanger_reference.fa'
- # alignment_filename = 'out/head_HTC10499.sam'
- # reference_filename = 'data/Pichia_stipitis_Sanger_reference.fa'
- def main():
- matrix = defaultdict(lambda: defaultdict(int))
- with open(reference_filename) as f:
- reference = next(SeqIO.parse(reference_filename, 'fasta')).seq._data.upper()
- with open(alignment_filename) as f:
- for line in tqdm(f):
- if line.startswith('@'):
- continue
- raw = line.split('\t')
- pos, cigar, read = int(raw[3]), raw[5], raw[9].upper()
- if 'M' in cigar and 'I' not in cigar and 'D' not in cigar:
- k = int(cigar[:-1])
- ref_read = reference[pos - 1: pos + k - 1]
- for c, refc in zip(read, ref_read):
- if c != refc:
- matrix[refc][c] += 1
- LETTERS = sorted(matrix.keys())
- print('[*] Substitution matrix:')
- k = 9
- s = '{}{}'.format(' ' * (k + 4), ' |{}'.format(' ' * (k - 1)).join(LETTERS))
- print(s)
- print(' ' + '-' * len(s[3:]))
- for refc in LETTERS:
- print(' {} | {}'.format(refc, ' |'.join(map(lambda x: '{1: >{0}}'.format(k, x), (matrix[refc][c] for c in LETTERS)))))
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement