Advertisement
Guest User

basic_functions

a guest
Jan 18th, 2020
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.68 KB | None | 0 0
  1. """
  2. Mini project #3:
  3. ---------------
  4. The is focused on finding the minimum length k of k-mers for which it is possible to obtain unique cyclic genome.
  5. Implementation of a whole method is based on constructing Hamiltionian cycle which recovers the genome.
  6.  
  7. Good source of knowledge: https://www.youtube.com/watch?v=0JlUy_l-RTk&t=2s
  8. """
  9.  
  10. from typing import Dict, List, Tuple
  11. from collections import defaultdict
  12.  
  13. def read_file(path: str) -> Dict:
  14.     """ Reading genes from a file into a dictionary
  15.    in which keys represent a genome name,
  16.    and a values - the genomes. """
  17.     genomes = dict()
  18.     dict_hlpr = ""
  19.     with open(path, 'r') as file:
  20.         for line in file:
  21.             if line.startswith(">"):
  22.                 dict_hlpr = line[2:-1]
  23.                 # print(dict_hlpr)
  24.             else:
  25.                 genomes[dict_hlpr] = line[:-1]
  26.                 # print(line[:-1])
  27.  
  28.     return(genomes)
  29.  
  30.  
  31. def composition(genomes: str, k_len: int) -> List[str]:
  32.     """
  33.    Takes a circular genes and divide it for all possible k-mers (fragments) of a size of "k-mer" parameter
  34.    and returns a list of k-mers that "wrap around" the end.
  35.    For example:
  36.        in: composition(“ATACGGTC”, 3)
  37.        out: [“ATA”, “TAC”, “ACG”, “CGG”, “GGT”, “GTC”, “TCA”, “CAT”]
  38.    :param gene: gene for reconstruction
  39.    :param k_len: size of each divided kmer (fragment)
  40.    :return fragments: list of kmers """
  41.  
  42.     fragments = list()
  43.     length = len(genomes)
  44.  
  45.     hlpr = ""
  46.     for i in range(length):
  47.         hlpr = genomes[i : i+k_len]
  48.         len2 = len(hlpr)
  49.         if len2 < k_len:
  50.             hlpr = hlpr + genomes[: k_len-len2]
  51.         fragments.append(hlpr)
  52.  
  53.     return(fragments)
  54.  
  55.  
  56. def suffix_prefix(kmers: List[str]) ->Dict:
  57.     """
  58.    Returns a prefix and a suffix of each k-mer from the list.
  59.    :param kmers: A list of error-free DNA k-mers taken from the strand of circular chromosome
  60.    :return result: result[suffix] = prefix
  61.    """
  62.  
  63.     kmer_len = len(kmers)
  64.  
  65.     result = defaultdict(list)
  66.     for kmer in kmers:
  67.         prefix = kmer[:-1]
  68.         suffix = kmer[1:]
  69.         result[suffix].append(prefix)
  70.         print("suffix:", suffix, "prefiks: ", result[suffix])
  71.         print("__________")
  72.  
  73.     return(result)
  74.  
  75. def simple_reconstruction(kmers: List[str]) ->str: #->List[str]:
  76.     """
  77.    Reconstruction of a circular string from a k-mers using de Bruijn graph.
  78.    http://rosalind.info/problems/grep/
  79.    For example:
  80.        circular string assembled from the cycle "AC" -> "CT" -> "TA" -> "AC" is simply (ACT)
  81.    :param kmers: A list of error-free DNA (k+1) mers taken from the strand of circular chromosome
  82.    :return result: one of the circular strings assembled by complete cycles in the Bruijn graph. """
  83.  
  84.     result = ""
  85.     i = 0
  86.     for kmer in kmers:
  87.         if i == 0:
  88.             result = kmer
  89.         elif i < len(kmers) - len(kmer) + 1:
  90.             result += kmer[-1]
  91.         i += 1
  92.  
  93.     # print(result)
  94.     return(result)
  95.  
  96.  
  97. def main():
  98.     result = dict()
  99.  
  100.     genomes = read_file("genomes.txt")
  101.     for key in genomes:
  102.         print("____________________________________")
  103.         print("Name:", key)
  104.         genome = genomes[key]
  105.         print("genome:", genome)
  106.  
  107.         result[key] = None
  108.  
  109.     kmers = composition('ATACGGTC', 3)
  110.     # simple_reconstruction(kmers)
  111.     pref_suf0 = suffix_prefix(kmers)
  112.     pref_suf = suffix_prefix(["CAG", "AGT", "GTT", "TTT", "TTG", "TGG", "GGC", "GCG", "CGT", "GTT", "TTC", "TCA", "CAA", "AAT", "ATT", "TTC", "TCA"])
  113.     # example_distinctive = is_ambiguous(pref_suf)
  114.     # distinctive(pref_suf)
  115.  
  116. if __name__ == "__main__":
  117.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement