Advertisement
Guest User

Untitled

a guest
Aug 2nd, 2015
229
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.43 KB | None | 0 0
  1. module Chapter1 where
  2.  
  3. --import Data.Map
  4. import Control.Monad
  5. import Data.List
  6. import Data.Char
  7.  
  8. data Base = A | G | C | T deriving(Eq, Show)
  9. bases = [A,G,C,T]
  10. type DNA = [Base]
  11.  
  12. complement A = T
  13. complement T = A
  14. complement G = C
  15. complement C = G
  16.  
  17. charToBase 'A' = A
  18. charToBase 'G' = G
  19. charToBase 'C' = C
  20. charToBase 'T' = T
  21.  
  22. stringToDna = map charToBase
  23. dnaToString :: DNA -> String
  24. dnaToString = foldl (\x y -> x ++ (show y)) ""
  25.  
  26. boolAsInt :: Bool -> Int
  27. boolAsInt True = 1
  28. boolAsInt False = 0
  29.  
  30. -- how many occurrences of pt are in strand?
  31. patternCount :: [Base] -> [Base] -> Int
  32. patternCount strand pt =
  33. if length strand < length pt
  34. then 0
  35. else
  36. (boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt
  37.  
  38. kmers :: Int -> [DNA]
  39. kmers 1 = map (\x -> [x]) [A,G,C,T]
  40. kmers n = [b : kmer | kmer <- kmers (n-1), b <- bases]
  41.  
  42. kmersIn :: Int -> DNA -> [DNA]
  43. kmersIn n s =
  44. if (length s) < n
  45. then []
  46. else
  47. (take n s) : (kmersIn n (tail s))
  48.  
  49. mostFrequentKmers :: Int -> DNA -> [DNA]
  50. mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
  51. where
  52. m = foldl max 0 (map (patternCount strand) nmers)
  53. nmers = nub $ kmersIn n strand
  54.  
  55. kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
  56. kmerWithIndex length i strand = ((take length strand),i)
  57.  
  58. kmer :: Int -> Int -> DNA -> DNA
  59. kmer len i strand = fst $ kmerWithIndex len i strand
  60.  
  61. kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
  62. kmersWithIndices n = kmersWithIndicesTail n 0
  63. where
  64. kmersWithIndicesTail n i strand =
  65. if (length strand) < n
  66. then []
  67. else
  68. (kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))
  69.  
  70. reverseComplement = (map complement . reverse)
  71.  
  72. clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
  73. clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux
  74. where
  75. aux = kmersWithIndices k strand
  76. kmersInWindow lst = result
  77. where
  78. hd = head lst
  79. inWindow p = (abs ((snd p) - (snd hd)) <= l)
  80. result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
  81. allFormingWindows acc [] = acc
  82. allFormingWindows acc xs =
  83. if t <= (length $ kmersInWindow xs)
  84. then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
  85. else
  86. allFormingWindows acc (tail xs)
  87. {-
  88. WEEK 1 CS
  89. -}
  90.  
  91. baseToInt A = 0
  92. baseToInt C = 1
  93. baseToInt G = 2
  94. baseToInt T = 3
  95.  
  96. patternToNumber = helper . reverse
  97. where
  98. helper [] = 0
  99. helper (b : bs) = 4*(helper bs) + (baseToInt b)
  100.  
  101.  
  102. {-
  103. WEEK 2
  104. -}
  105.  
  106. skew :: DNA -> [Int]
  107. skew = skewTail 0
  108. where
  109. skewTail n [] = []
  110. skewTail n (C : rest) = n : (skewTail (n-1) rest)
  111. skewTail n (G : rest) = n : (skewTail (n+1) rest)
  112. skewTail n (ch : rest) = n : (skewTail n rest)
  113.  
  114. minimumSkew :: DNA -> [Int]
  115. minimumSkew [] = []
  116. minimumSkew strand = minimalIndices
  117. where
  118. helper :: Int -> Int -> [Int] -> [Int]
  119. helper i m [] = []
  120. helper i m (x : xs) =
  121. if (m == x)
  122. then i : (helper (i+1) m xs)
  123. else
  124. helper (i+1) m xs
  125. m = foldr min (head $ skew strand) (skew strand)
  126. minimalIndices = helper 0 m (skew strand)
  127.  
  128. -- these are without replication
  129. comb :: Int -> [a] -> [[a]]
  130. comb 0 as = [[]]
  131. comb n as =
  132. if length as < n
  133. then []
  134. else
  135. (map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))
  136.  
  137. combWithRep :: Int -> [a] -> [[a]]
  138. combWithRep 0 as = [[]]
  139. combWithRep 1 as = [[a] | a <- as]
  140. combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as]
  141.  
  142. dist [] _ = 0
  143. dist _ [] = 0
  144. dist (x : xs) (y : ys) =
  145. (boolAsInt (x /= y)) + dist xs ys
  146.  
  147. approximateOccurrences :: DNA -> Int -> DNA -> [Int]
  148. approximateOccurrences pt mismatch = helper 0 []
  149. where
  150. n = length pt
  151. helper i acc strand =
  152. if (length strand) < (length pt)
  153. then acc
  154. else
  155. let
  156. substr = take n strand
  157. in
  158. if ((dist substr pt) <= mismatch)
  159. then helper (i + 1) (acc ++ [i]) (tail strand)
  160. else
  161. helper (i + 1) acc (tail strand)
  162.  
  163. countPatternWithMismatches :: DNA -> Int -> DNA -> Int
  164. countPatternWithMismatches pt mismatch = helper 0
  165. where
  166. helper i strand =
  167. if (length strand) < (length pt)
  168. then i
  169. else
  170. if ((dist strand pt) <= mismatch)
  171. then helper (i + 1) (tail strand)
  172. else helper i (tail strand)
  173.  
  174. {-
  175. frequentWithMismatches :: DNA -> Int -> Int -> DNA
  176. frequentWithMismatches strand length mismatches =
  177. where occurring = kmers length strand
  178. -- it seems that we need to find all mismatched words and then choose the which occurs the most
  179. -}
  180.  
  181. -- helpers
  182. f = clumpFinding 9 500 3
  183. clumpsInEcoli = do
  184. contents <- readFile "E-coli.txt"
  185. let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents
  186. in
  187. writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)
  188.  
  189. -- write to repl
  190. -- printList $ map dnaToString $ f $ stringToDna parsed
  191.  
  192. printList :: [String] -> IO [()]
  193. printList l = sequence $ map putStrLn l
  194.  
  195. handleFile inName outName f = do
  196. contents <- readFile inName
  197. let processed = take ((length contents) - 1) contents
  198. in writeFile outName $ (dnaToString . f . stringToDna) processed
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement