Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- module Chapter1 where
- --import Data.Map
- import Control.Monad
- import Data.List
- import Data.Char
- data Base = A | G | C | T deriving(Eq, Show)
- bases = [A,G,C,T]
- type DNA = [Base]
- complement A = T
- complement T = A
- complement G = C
- complement C = G
- charToBase 'A' = A
- charToBase 'G' = G
- charToBase 'C' = C
- charToBase 'T' = T
- stringToDna = map charToBase
- dnaToString :: DNA -> String
- dnaToString = foldl (\x y -> x ++ (show y)) ""
- boolAsInt :: Bool -> Int
- boolAsInt True = 1
- boolAsInt False = 0
- -- how many occurrences of pt are in strand?
- patternCount :: [Base] -> [Base] -> Int
- patternCount strand pt =
- if length strand < length pt
- then 0
- else
- (boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt
- kmers :: Int -> [DNA]
- kmers 1 = map (\x -> [x]) [A,G,C,T]
- kmers n = [b : kmer | kmer <- kmers (n-1), b <- bases]
- kmersIn :: Int -> DNA -> [DNA]
- kmersIn n s =
- if (length s) < n
- then []
- else
- (take n s) : (kmersIn n (tail s))
- mostFrequentKmers :: Int -> DNA -> [DNA]
- mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
- where
- m = foldl max 0 (map (patternCount strand) nmers)
- nmers = nub $ kmersIn n strand
- kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
- kmerWithIndex length i strand = ((take length strand),i)
- kmer :: Int -> Int -> DNA -> DNA
- kmer len i strand = fst $ kmerWithIndex len i strand
- kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
- kmersWithIndices n = kmersWithIndicesTail n 0
- where
- kmersWithIndicesTail n i strand =
- if (length strand) < n
- then []
- else
- (kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))
- reverseComplement = (map complement . reverse)
- clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
- clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux
- where
- aux = kmersWithIndices k strand
- kmersInWindow lst = result
- where
- hd = head lst
- inWindow p = (abs ((snd p) - (snd hd)) <= l)
- result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
- allFormingWindows acc [] = acc
- allFormingWindows acc xs =
- if t <= (length $ kmersInWindow xs)
- then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
- else
- allFormingWindows acc (tail xs)
- {-
- WEEK 1 CS
- -}
- baseToInt A = 0
- baseToInt C = 1
- baseToInt G = 2
- baseToInt T = 3
- patternToNumber = helper . reverse
- where
- helper [] = 0
- helper (b : bs) = 4*(helper bs) + (baseToInt b)
- {-
- WEEK 2
- -}
- skew :: DNA -> [Int]
- skew = skewTail 0
- where
- skewTail n [] = []
- skewTail n (C : rest) = n : (skewTail (n-1) rest)
- skewTail n (G : rest) = n : (skewTail (n+1) rest)
- skewTail n (ch : rest) = n : (skewTail n rest)
- minimumSkew :: DNA -> [Int]
- minimumSkew [] = []
- minimumSkew strand = minimalIndices
- where
- helper :: Int -> Int -> [Int] -> [Int]
- helper i m [] = []
- helper i m (x : xs) =
- if (m == x)
- then i : (helper (i+1) m xs)
- else
- helper (i+1) m xs
- m = foldr min (head $ skew strand) (skew strand)
- minimalIndices = helper 0 m (skew strand)
- -- these are without replication
- comb :: Int -> [a] -> [[a]]
- comb 0 as = [[]]
- comb n as =
- if length as < n
- then []
- else
- (map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))
- combWithRep :: Int -> [a] -> [[a]]
- combWithRep 0 as = [[]]
- combWithRep 1 as = [[a] | a <- as]
- combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as]
- dist [] _ = 0
- dist _ [] = 0
- dist (x : xs) (y : ys) =
- (boolAsInt (x /= y)) + dist xs ys
- approximateOccurrences :: DNA -> Int -> DNA -> [Int]
- approximateOccurrences pt mismatch = helper 0 []
- where
- n = length pt
- helper i acc strand =
- if (length strand) < (length pt)
- then acc
- else
- let
- substr = take n strand
- in
- if ((dist substr pt) <= mismatch)
- then helper (i + 1) (acc ++ [i]) (tail strand)
- else
- helper (i + 1) acc (tail strand)
- countPatternWithMismatches :: DNA -> Int -> DNA -> Int
- countPatternWithMismatches pt mismatch = helper 0
- where
- helper i strand =
- if (length strand) < (length pt)
- then i
- else
- if ((dist strand pt) <= mismatch)
- then helper (i + 1) (tail strand)
- else helper i (tail strand)
- {-
- frequentWithMismatches :: DNA -> Int -> Int -> DNA
- frequentWithMismatches strand length mismatches =
- where occurring = kmers length strand
- -- it seems that we need to find all mismatched words and then choose the which occurs the most
- -}
- -- helpers
- f = clumpFinding 9 500 3
- clumpsInEcoli = do
- contents <- readFile "E-coli.txt"
- let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents
- in
- writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)
- -- write to repl
- -- printList $ map dnaToString $ f $ stringToDna parsed
- printList :: [String] -> IO [()]
- printList l = sequence $ map putStrLn l
- handleFile inName outName f = do
- contents <- readFile inName
- let processed = take ((length contents) - 1) contents
- in writeFile outName $ (dnaToString . f . stringToDna) processed
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement