Untitled

module Chapter1 where

--import Data.Map
import Control.Monad
import Data.List
import Data.Char

data Base = A | G | C | T  deriving(Eq, Show)
bases = [A,G,C,T]
type DNA = [Base]

complement A = T
complement T = A
complement G = C
complement C = G

charToBase 'A' = A
charToBase 'G' = G
charToBase 'C' = C
charToBase 'T' = T

stringToDna = map charToBase
dnaToString :: DNA -> String
dnaToString = foldl (\x y -> x ++ (show y)) ""

boolAsInt :: Bool -> Int
boolAsInt True = 1
boolAsInt False = 0

-- how many occurrences of pt are in strand?
patternCount :: [Base] -> [Base] -> Int
patternCount strand pt =
    if length strand < length pt
        then 0
    else
        (boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt

kmers :: Int -> [DNA]
kmers 1 = map (\x -> [x]) [A,G,C,T]
kmers n = [b : kmer |  kmer <- kmers (n-1), b <- bases]

kmersIn :: Int -> DNA -> [DNA]
kmersIn n s =
    if (length s) < n
        then []
    else
        (take n s) : (kmersIn n (tail s))

mostFrequentKmers :: Int -> DNA -> [DNA]
mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
    where
        m = foldl max 0 (map (patternCount strand) nmers)
        nmers = nub $ kmersIn n strand

kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
kmerWithIndex length i strand = ((take length strand),i)

kmer :: Int -> Int -> DNA -> DNA
kmer len i strand = fst $ kmerWithIndex len i strand

kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
kmersWithIndices n = kmersWithIndicesTail n 0
    where
        kmersWithIndicesTail n i strand =
            if (length strand) < n
                then []
            else
                (kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))

reverseComplement = (map complement . reverse)

clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux
    where
    aux = kmersWithIndices k strand
    kmersInWindow lst = result
        where
        hd = head lst
        inWindow p = (abs ((snd p) - (snd hd)) <= l)
        result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
    allFormingWindows acc [] = acc
    allFormingWindows acc xs =
        if t <= (length $ kmersInWindow xs)
            then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
        else
            allFormingWindows acc (tail xs)
{-
    WEEK 1 CS
-}

baseToInt A = 0
baseToInt C = 1
baseToInt G = 2
baseToInt T = 3

patternToNumber = helper . reverse
    where
    helper [] = 0
    helper (b : bs) = 4*(helper bs) + (baseToInt b)


{-
    WEEK 2
-}

skew :: DNA -> [Int]
skew = skewTail 0
    where
        skewTail n [] = []
        skewTail n (C : rest) = n : (skewTail (n-1) rest)
        skewTail n (G : rest) = n :  (skewTail (n+1) rest)
        skewTail n (ch : rest) = n : (skewTail n rest)

minimumSkew :: DNA -> [Int]
minimumSkew [] = []
minimumSkew strand = minimalIndices
    where
        helper :: Int -> Int -> [Int] -> [Int]
        helper i m [] = []
        helper i m (x : xs) =
            if (m == x)
                then i : (helper (i+1) m xs)
            else
                helper (i+1) m xs
        m = foldr min (head $ skew strand) (skew strand)
        minimalIndices = helper 0 m (skew strand)

-- these are without replication
comb :: Int -> [a] -> [[a]]
comb 0 as = [[]]
comb n as =
    if length as < n
        then []
    else
       (map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))

combWithRep :: Int -> [a] -> [[a]]
combWithRep 0 as = [[]]
combWithRep 1 as = [[a] | a <- as]
combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as]

dist [] _ = 0
dist _ [] = 0
dist (x : xs) (y : ys) =
    (boolAsInt (x /= y)) + dist xs ys

approximateOccurrences :: DNA -> Int -> DNA -> [Int]
approximateOccurrences pt mismatch = helper 0 []
    where
        n = length pt
        helper i acc strand =
            if (length strand) < (length pt)
                then acc
            else
                let
                    substr = take n strand
                in
                    if  ((dist substr pt) <= mismatch)
                        then helper (i + 1) (acc ++ [i]) (tail strand)
                    else
                        helper (i + 1) acc (tail strand)

countPatternWithMismatches :: DNA -> Int -> DNA -> Int
countPatternWithMismatches pt mismatch = helper 0
    where
        helper i strand =
            if (length strand) < (length pt)
                then i
            else
                if ((dist strand pt) <= mismatch)
                then helper (i + 1) (tail strand)
                else helper i (tail strand)

{-
frequentWithMismatches :: DNA -> Int -> Int -> DNA
frequentWithMismatches strand length mismatches =
    where occurring = kmers length strand
    -- it seems that we need to find all mismatched words and then choose the which occurs the most
-}

-- helpers
f = clumpFinding 9 500 3
clumpsInEcoli = do
    contents <- readFile "E-coli.txt"
    let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents
        in
        writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)

-- write to repl
-- printList $ map dnaToString $ f $ stringToDna parsed

printList :: [String] -> IO [()]
printList l = sequence $ map putStrLn l

handleFile inName outName f = do
    contents <- readFile inName
    let processed = take ((length contents) - 1) contents
        in writeFile outName $ (dnaToString . f . stringToDna) processed