Guest User

Untitled

a guest
Mar 17th, 2018
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.54 KB | None | 0 0
  1. from Bio.Seq import Seq
  2. from Bio.Seq import reverse_complement
  3.  
  4. dna_bases = ('A', 'C', 'G', 'T')
  5. purines = ('A', 'G')
  6. pyrimidines = ('C', 'T')
  7.  
  8. transitions = ('A>T', 'C>G', 'G>C', 'T>A')
  9. transversions = ('A>C', 'A>G', 'C>A', 'C>T', 'G>A', 'G>T', 'T>C', 'T>G')
  10.  
  11.  
  12. class Mutation(object):
  13. def __init__(self, ref, base, chrom, position, context):
  14. self.ref = ref
  15. self.base = base
  16. self.chrom = chrom
  17. self.position = int(position)
  18. self.context = context
  19.  
  20. @property
  21. def substitution(self):
  22. return Seq(''.join([self.ref, '>', self.base]))
  23.  
  24. @property
  25. def context(self):
  26. return Seq(''.join([self._5prime, self.ref, self._3prime]))
  27.  
  28. @context.setter
  29. def context(self, context):
  30. self._5prime = context[:len(context) - 2]
  31. self._3prime = context[len(context) - 1:]
  32.  
  33. @property
  34. def label(self):
  35. return ''.join([self._5prime.upper(),
  36. '[',
  37. str(self.substitution).upper(),
  38. ']',
  39. self._3prime.upper()])
  40.  
  41. def __eq__(self, other):
  42. return ''.join([str(self.chrom),
  43. str(self.position),
  44. str(self.substitution),
  45. str(self.context)]) == other
  46.  
  47. def __hash__(self):
  48. return hash(''.join([str(self.chrom),
  49. str(self.position),
  50. str(self.substitution),
  51. str(self.context)]))
  52.  
  53. def __lt__(self, other):
  54. return self.position < other.position
  55.  
  56. def __repr__(self):
  57.  
  58. return 'Mutation: {} in {}\nPosition: {}:{:,}\n'.format(
  59. str(self.substitution),
  60. str(self.context),
  61. self.chrom,
  62. self.position)
  63.  
  64.  
  65. def mutpos_depths(infile, chromosome=None, start=0, end=None, fmt='essigmann'):
  66. depths = {}
  67.  
  68. with open(infile, 'r') as handle:
  69. for line in handle:
  70. # Strip newline characters and split on tabs
  71. line = line.strip().split('\t')
  72.  
  73. # Unpack line and cast to proper data type
  74. chrom = str(line[0])
  75. position, depth = int(line[2]) - 1, int(line[3])
  76. if chromosome is not None and chrom != chromosome:
  77. continue
  78. if position < start:
  79. continue
  80. if end is not None and position >= end:
  81. continue
  82.  
  83. if fmt is 'essigmann':
  84. A, C, G, T, N = map(int, line[4:9])
  85. elif fmt is 'loeb':
  86. T, C, G, A = map(int, line[5:9])
  87. N = int(line[11])
  88. else:
  89. raise ValueError('Format must be essigmann or loeb')
  90.  
  91. depths[position] = depth - N
  92. result = []
  93. for position in range(start, end):
  94. if position in depths.keys():
  95. result.append(depths[position])
  96. else:
  97. result.append(0)
  98. return result
  99.  
  100.  
  101. def fasta_to_dict(ref_file):
  102. from Bio import SeqIO
  103. with open(ref_file, 'r') as handle:
  104. return SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
  105.  
  106.  
  107. def from_mutpos(mutpos_file, ref_file, clonality=(0, 1), min_depth=100, kmer=3,
  108. chromosome=None, start=0, end=None, notation='pyrimidine',
  109. fmt='essigmann', verbose=True):
  110. mutations = []
  111. record_dict = fasta_to_dict(ref_file)
  112.  
  113. # Open the mutpos_file and read data line by line
  114. with open(mutpos_file, 'r') as handle:
  115. for line in handle:
  116. # Strip newline characters and split on tabs
  117. line = line.strip().split('\t')
  118.  
  119. # Unpack line and cast to proper data type
  120. chrom, ref = str(line[0]), str(line[1]).upper()
  121. position, depth = int(line[2]) - 1, int(line[3])
  122.  
  123. if chromosome is not None and chrom != chromosome:
  124. continue
  125. if position < start:
  126. continue
  127. if end is not None and position >= end:
  128. continue
  129.  
  130. if fmt is 'essigmann':
  131. A, C, G, T, N = map(int, line[4:9])
  132. elif fmt is 'loeb':
  133. T, C, G, A = map(int, line[5:9])
  134. N = int(line[11])
  135. else:
  136. raise ValueError('Format must be essigmann or loeb')
  137.  
  138. # If we see no observed base substitutions, skip this loop
  139. if sum([A, C, G, T]) == 0:
  140. continue
  141.  
  142. # Read mapped depth (minus Ns) at this position must be greater
  143. # than min_depth, if not, skip this loop
  144. if min_depth > depth:
  145. continue
  146.  
  147. # Get the kmer context at the given position in the given
  148. # chromosome as looked up in record_dict. If we encounter an
  149. # edge case (return of None) we'll skip this loop
  150. context = str(get_kmer(record_dict, chrom, position, kmer)).upper()
  151.  
  152. if context is None:
  153. continue
  154.  
  155. # If the base is not in our intended labeling set: let's get
  156. # the complement of the reference base, the reverse complement
  157. # of the trinucleotide context, and the mutation counts for the
  158. # complement of the base substitution observed
  159. if (
  160. (notation == 'pyrimidine' and ref in purines) or
  161. (notation == 'purine' and ref in pyrimidines)
  162. ):
  163. ref = reverse_complement(ref)
  164. context = reverse_complement(context)
  165. A, G, C, T = T, C, G, A
  166.  
  167. for base, num_mutations in zip(dna_bases, [A, C, G, T]):
  168. # Clonality is defined as the frequency of any base
  169. # substitution at this one genomic position. For example, if
  170. # there are 5% T, 10% C, and 100% G in a reference position of
  171. # A, we can selet a clonality filter of (0.1, 0.5) to
  172. # eliminate the rare T mutations and clonal G mutations.
  173. base_clonality = num_mutations / (depth - N)
  174. if not min(clonality) < base_clonality < max(clonality):
  175. continue
  176.  
  177. for _ in range(num_mutations):
  178. mutation = Mutation(ref, base, chrom, position, context)
  179. mutation.depth = depth
  180. mutation.observed = num_mutations
  181. mutation.VAF = base_clonality
  182. mutations.append(mutation)
  183.  
  184. if verbose is True:
  185. print('Found {} Mutations'.format(len(mutations)))
  186. return mutations
  187.  
  188.  
  189. def read_mutations(infile, gene_filter=None):
  190. mutations = []
  191. with open(infile) as handle:
  192. for line in handle:
  193. sample, chromosome, position, gene, substitution, \
  194. context, observed, depth, VAF = line.strip().split(',')
  195. position = int(position)
  196.  
  197. if (gene_filter is not None and
  198. gene != gene_filter.name):
  199. continue
  200.  
  201. if (gene_filter is not None and
  202. position < gene_filter.start):
  203. continue
  204.  
  205. if (gene_filter is not None and
  206. position > gene_filter.end):
  207. continue
  208.  
  209. mutation = Mutation(substitution[0],
  210. substitution[2],
  211. chromosome,
  212. position,
  213. context)
  214.  
  215. mutation.VAF = float(VAF)
  216. mutation.sample, mutation.gene = sample, gene
  217. mutation.depth, mutation.observed = int(depth), int(observed)
  218.  
  219. mutations.append(mutation)
  220. return mutations
Add Comment
Please, Sign In to add comment