# basic_functions

"""
Mini project #3:
---------------
This project focuses on finding the minimum k-mer length k for which a unique cyclic genome can be reconstructed.
The whole method is based on constructing a Hamiltonian cycle that recovers the genome.

Good source of knowledge: https://www.youtube.com/watch?v=0JlUy_l-RTk&t=2s
"""

from typing import Dict, List, Tuple
from collections import defaultdict

def read_file(path: str) -> Dict:
    """
    Read named genomes from a FASTA-like text file.

    A line starting with ">" is a header: the text after the ">" (with
    surrounding whitespace removed) becomes the genome name. Every other
    non-blank line is sequence data for the most recent header; consecutive
    sequence lines of one record are concatenated, so wrapped sequences are
    read in full. Blank lines are ignored. Sequence lines that appear before
    any header are stored under the empty-string key. If the same header
    occurs twice, the later record replaces the earlier one.

    :param path: path of the text file to read
    :return genomes: genomes[name] = sequence
    """
    genomes = dict()
    name = ""
    fresh_record = True  # True until the current record received its first sequence line
    with open(path, 'r') as file:
        for raw_line in file:
            # Strip only the line terminator: the last line of a file may
            # not have one, so blindly cutting the final character would
            # drop a base.
            line = raw_line.rstrip("\n")
            if not line.strip():
                # A blank line must not overwrite a stored sequence.
                continue
            if line.startswith(">"):
                # Accept both "> name" and ">name".
                name = line[1:].strip()
                fresh_record = True
            elif fresh_record:
                genomes[name] = line
                fresh_record = False
            else:
                genomes[name] += line

    return genomes


def composition(genomes: str, k_len: int) -> List[str]:
    """
    Split a circular genome into all of its k-mers, one starting at every
    position, including the k-mers that "wrap around" the end.

    For example:
        in: composition("ATACGGTC", 3)
        out: ["ATA", "TAC", "ACG", "CGG", "GGT", "GTC", "TCA", "CAT"]

    The wrap-around works for any k_len, also when k_len is larger than the
    genome itself (the genome is then traversed more than once). An empty
    genome gives an empty list; a non-positive k_len gives one empty
    fragment per position.

    :param genomes: the circular genome to split
    :param k_len: size of each k-mer (fragment)
    :return fragments: list of k-mers, in order of their start position
    """
    length = len(genomes)
    if length == 0:
        return []

    size = max(k_len, 0)
    # Unroll the circle often enough that every window starting inside the
    # first copy fits without any manual wrap-around handling:
    # length * (size // length + 2) >= (length - 1) + size.
    unrolled = genomes * (size // length + 2)

    return [unrolled[start:start + size] for start in range(length)]


def suffix_prefix(kmers: List[str]) ->Dict:
    """
    Group the prefixes of the k-mers by the suffix of the same k-mer.

    The prefix of a k-mer is the k-mer without its last character, the
    suffix is the k-mer without its first character. A suffix shared by
    several k-mers collects all of their prefixes, in input order
    (duplicates are kept).

    After each k-mer a trace of the suffix and the prefixes collected for it
    so far is printed to standard output.

    :param kmers: error-free DNA k-mers taken from the strand of a circular
        chromosome; any iterable of strings is accepted
    :return result: defaultdict(list) with result[suffix] = [prefix, ...]
    """
    result = defaultdict(list)
    for kmer in kmers:
        prefix = kmer[:-1]
        suffix = kmer[1:]
        result[suffix].append(prefix)
        print("suffix:", suffix, "prefiks: ", result[suffix])
        print("__________")

    return result

def simple_reconstruction(kmers: List[str]) -> str:
    """
    Rebuild a circular string from k-mers that are given in their order
    along the circle.

    The first k-mer is taken whole; each of the following k-mers then
    contributes its last character, until the result is as long as the
    number of k-mers. The remaining k-mers only cover the wrap-around and
    add nothing.
    See also: http://rosalind.info/problems/grep/

    For example:
        the cycle "AC" -> "CT" -> "TA" -> "AC" is assembled into "ACT"

    :param kmers: ordered list of error-free DNA k-mers taken from the
        strand of a circular chromosome
    :return: the assembled circular string ("" for an empty list)
    """
    if not kmers:
        return ""

    total = len(kmers)
    pieces = [kmers[0]]
    for position, fragment in enumerate(kmers[1:], start=1):
        # Stop extending once the assembled text has one character per k-mer.
        if position <= total - len(fragment):
            pieces.append(fragment[-1])

    return "".join(pieces)


def main():
    """
    Scratch driver: list the genomes stored in "genomes.txt", then run the
    k-mer helpers on two hard-coded examples.
    """
    genomes = read_file("genomes.txt")
    for name, genome in genomes.items():
        print("____________________________________")
        print("Name:", name)
        print("genome:", genome)

    # One (still empty) result slot per genome; nothing fills it in yet.
    result = dict.fromkeys(genomes)

    # Example 1: k-mers cut from a small circular genome.
    sample_kmers = composition('ATACGGTC', 3)
    sample_overlaps = suffix_prefix(sample_kmers)

    # Example 2: a fixed list of 3-mers that contains repeats.
    repeated_overlaps = suffix_prefix([
        "CAG", "AGT", "GTT", "TTT", "TTG", "TGG", "GGC", "GCG", "CGT",
        "GTT", "TTC", "TCA", "CAA", "AAT", "ATT", "TTC", "TCA",
    ])
    # Not wired in yet: simple_reconstruction(sample_kmers) and the
    # ambiguity checks on the overlap tables.


# Run the driver only when the file is executed as a script.
if __name__ == "__main__":
    main()