Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Mini project #3:
- ---------------
- This project focuses on finding the minimum k-mer length k for which a unique cyclic genome can be reconstructed.
- The implementation is based on constructing a Hamiltonian cycle that recovers the genome.
- Good source of knowledge: https://www.youtube.com/watch?v=0JlUy_l-RTk&t=2s
- """
- from typing import Dict, List, Tuple
- from collections import defaultdict
def read_file(path: str) -> Dict:
    """Read FASTA-style genome records from a text file.

    Lines starting with ">" are headers: the text after the ">" (with
    surrounding whitespace removed) becomes the record name. Every other
    non-blank line is sequence data for the most recent header, and
    consecutive sequence lines of one record are concatenated. Sequence
    lines that appear before any header are stored under the empty-string
    key. A header that is not followed by sequence data creates no entry,
    and a repeated header name replaces the earlier record.

    :param path: path of the file to read
    :return genomes: dictionary mapping record name -> genome string
    """
    genomes = dict()
    name = ""
    # True until the first sequence line of the current record is stored,
    # so a repeated header name replaces the old record instead of
    # extending it.
    starts_record = True
    with open(path, 'r') as file:
        for raw_line in file:
            # rstrip() removes "\n" / "\r\n" only when present, so a final
            # line without a trailing newline keeps its last nucleotide.
            line = raw_line.rstrip()
            if not line:
                # Blank lines carry no data and must not clobber a genome.
                continue
            if line.startswith(">"):
                # Accept both ">name" and "> name".
                name = line[1:].strip()
                starts_record = True
            elif starts_record:
                genomes[name] = line
                starts_record = False
            else:
                genomes[name] += line
    return genomes
def composition(genomes: str, k_len: int) -> List[str]:
    """
    Split a circular genome into all of its k-mers, one starting at every
    position, including the k-mers that wrap around the end of the string.

    For example:
    in: composition("ATACGGTC", 3)
    out: ["ATA", "TAC", "ACG", "CGG", "GGT", "GTC", "TCA", "CAT"]

    The wrap-around repeats the genome as often as needed, so every
    fragment has exactly ``k_len`` characters even when ``k_len`` is larger
    than the genome. An empty genome yields an empty list.

    :param genomes: the circular genome string to split
    :param k_len: size of each k-mer (fragment); must not be negative
    :return fragments: list of k-mers, in order of their start position
    :raises ValueError: if ``k_len`` is negative
    """
    if k_len < 0:
        raise ValueError("k_len must not be negative")
    length = len(genomes)
    if length == 0:
        return []
    # The last k-mer starts at index length - 1 and needs k_len - 1 more
    # characters taken from the start of the (circular) genome.
    overhang = max(k_len - 1, 0)
    repeats = -(-overhang // length)  # ceiling division
    extended = genomes + (genomes * repeats)[:overhang]
    return [extended[i:i + k_len] for i in range(length)]
def suffix_prefix(kmers: List[str]) -> Dict:
    """
    Group the prefixes of the given k-mers by their suffix.

    For each k-mer the prefix is the k-mer without its last character and
    the suffix is the k-mer without its first character. A trace line is
    printed for every k-mer, followed by one separator line per call.

    :param kmers: iterable of k-mers (any iterable of strings works)
    :return result: defaultdict(list) where result[suffix] is the list of
        prefixes, in input order, of all k-mers that have that suffix
    """
    result = defaultdict(list)
    for kmer in kmers:
        prefix = kmer[:-1]
        suffix = kmer[1:]
        result[suffix].append(prefix)
        # Trace output: shows the prefixes collected so far for this suffix.
        print("suffix:", suffix, "prefiks: ", result[suffix])
    print("__________")
    return result
def simple_reconstruction(kmers: List[str]) -> str:
    """
    Rebuild a circular string from k-mers that are already in overlap order.

    The first k-mer is taken whole. Each later k-mer at position ``p``
    contributes only its last character, and only while
    ``p <= len(kmers) - len(kmer)``; the remaining k-mers are skipped
    because they wrap around onto the start of the string.

    For example:
    ["AC", "CT", "TA"] -> "ACT"

    :param kmers: list of k-mers, ordered so that consecutive entries overlap
    :return result: the assembled string ("" for an empty list)
    """
    pieces = []
    for position, fragment in enumerate(kmers):
        if position == 0:
            pieces.append(fragment)
        elif position <= len(kmers) - len(fragment):
            pieces.append(fragment[-1])
    return "".join(pieces)
def main():
    """Print every genome read from "genomes.txt", then run two demo calls.

    The genomes are only printed; ``result`` receives a ``None`` placeholder
    per genome name and is not used afterwards. The demo calls build the
    suffix -> prefixes mapping for a hard-coded example genome and for a
    hard-coded list of 3-mers; their return values are not used further.
    """
    result = dict()
    genomes = read_file("genomes.txt")
    for key in genomes:
        print("____________________________________")
        print("Name:", key)
        genome = genomes[key]
        print("genome:", genome)
        # Placeholder entry; nothing is computed for the genome yet.
        result[key] = None
    # NOTE(review): the indentation of this file was lost; the demo calls
    # below are assumed to run once, after the loop - confirm.
    kmers = composition('ATACGGTC', 3)
    # simple_reconstruction(kmers)
    pref_suf0 = suffix_prefix(kmers)
    pref_suf = suffix_prefix(["CAG", "AGT", "GTT", "TTT", "TTG", "TGG", "GGC", "GCG", "CGT", "GTT", "TTC", "TCA", "CAA", "AAT", "ATT", "TTC", "TCA"])
    # example_distinctive = is_ambiguous(pref_suf)
    # distinctive(pref_suf)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement