Guest User

General markov chain generator

a guest
Aug 9th, 2013
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Haskell 10.74 KB | None | 0 0
  1. {-# LANGUAGE TypeSynonymInstances, FlexibleInstances, MultiParamTypeClasses, FlexibleContexts #-}
  2. module Text.Markov(
  3.     One,
  4.     Two,
  5.     Three,
  6.     Four,
  7.     Five,
  8.     Six,
  9.     Seven,
  10.     OneWord,
  11.     TwoWord,
  12.     ThreeWord,
  13.     FourWord,
  14.     one,
  15.     two,
  16.     three,
  17.     four,
  18.     five,
  19.     six,
  20.     seven,
  21.     oneWord,
  22.     twoWord,
  23.     threeWord,
  24.     fourWord,
  25.     MarkovModel,
  26.     MarkovArrow,
  27.     Freq,
  28.     Order(..),
  29.     analyzeOrder,
  30.     gAnalyzeOrder,
  31.     analyzeFromUrls,
  32.     generateString,
  33.     chainString,
  34.     generateMarkovArrow,
  35.     runMarkovArrow,
  36.     liftMarkovModel
  37. ) where
  38.  
  39. import qualified Data.HashMap.Strict as S
  40. import qualified Data.List as L
  41. import Control.Monad
  42. import Data.Monoid
  43. import Data.Hashable
  44. import Data.Maybe
  45. import Control.Applicative
  46. import Control.Category
  47. import Control.Arrow
  48. import System.Random
  49. import System.IO
  50. import Network.HTTP
  51. import Prelude hiding ((.), id)
  52.  
  53.  
  54. type One = Char
  55. type Two = (Char, Char)
  56. type Three = (Char, Char, Char)
  57. type Four = (Char, Char, Char, Char)
  58. type Five = (Char, Char, Char, Char, Char)
  59. type Six = (Char, Char, Char, Char, Char,Char)
  60. type Seven = (Char,Char,Char,Char,Char,Char,Char)
  61. type Freq = Integer
  62.  
  63. type OneWord = String
  64. type TwoWord = (String,String)
  65. type ThreeWord = (String,String, String)
  66. type FourWord = (String,String, String, String)
  67.  
  68. -- | This is the model of the markov chain  
  69. --  The nth order markov model is a mapping:
  70. --   n x a -> [(b, Freq)]
  71. --
  72. --   It makes one wonder of this could be cast into an arrow. The arrow would
  73. --   than build a more complicated model from other models. It would not be an easy one. Especially because I use frequencies instead of probabilities.  
  74. --  
  75. newtype MarkovModel order out = MM {
  76.     unMM :: S.HashMap order [(out, Freq)]
  77.     } deriving Show
  78.  
  79. -- | It would look like this
  80. newtype MarkovArrow order out = MA {
  81.         unMA :: order -> [(out, Freq)]
  82.     }
  83.  
  84. -- | Run a markov arrow as a mapping from order to [(out, freq)]
  85. runMarkovArrow :: MarkovArrow order out -> order -> [(out, Freq)]
  86. runMarkovArrow m o = unMA m o
  87.  
  88. -- | Generate a string of out from a markov arrow
  89. generateMarkovArrow :: (Order order out, RandomGen g) => MarkovArrow order out ->  g -> order -> [out]
  90. generateMarkovArrow m seed o = let xs = unMA m o
  91.                    (p,g) = freq xs seed
  92.                    in generateMarkovArrow m seed (shiftOrder p o)
  93.  
  94. -- | And I can lift an MarkovModel into the MarkovArrow category
  95. liftMarkovModel :: Order order out => MarkovModel order out -> MarkovArrow order out
  96. liftMarkovModel (MM s) = MA (\order -> case S.lookup order s of
  97.                         Nothing -> []
  98.                         Just ts -> ts )
  99.  
  100.  
  101. -- | The identity function is rather trivial
  102. idMA :: MarkovArrow a a  
  103. idMA = MA $ \i -> [(i, 1)]
  104.  
  105. -- | But composition is a problem
  106. -- I would have to convert the frequency into probabilities and the compose them
  107. compMA :: MarkovArrow b c -> MarkovArrow a b -> MarkovArrow a c
  108. compMA (MA f) (MA g) = MA $ compRaw f g
  109.  
  110. -- | Which of course is entirely possible
  111. compRaw :: (b -> [(c, Freq)]) ->  (a -> [(b, Freq)]) -> a -> [(c, Freq)]
  112. compRaw f g a = let xs = toProb $ g a  
  113.         in fromProb $ foldr step [] xs
  114.     where step (x,p) z = let xs = toProb (f x)
  115.                  in fmap (second (*p)) xs ++ z
  116.  
  117. toProb :: [(a, Freq)] -> [(a, Double)]
  118. toProb xs =  let tot = fromInteger $ sum (snd <$> xs)
  119.          in fmap (second (\x -> fromInteger x / tot)) xs
  120.  
  121. fromProb :: [(a, Double)] -> [(a, Freq)]
  122. fromProb xs = let min = minimum (fmap snd xs)
  123.           reversed = 1 / min
  124.           in fmap (second (round.(*reversed))) xs  
  125.  
  126. -- | The category is quite neat, but the arrow is probably not an arrow
  127. instance Category MarkovArrow where
  128.     id = idMA
  129.     (.) = compMA  
  130.  
  131. -- | Now lets instantiate a functor
  132. instance Functor (MarkovArrow a) where
  133.     fmap f (MA g) = MA $ \i -> fmap (first f) $ g i
  134.  
  135. -- | And a applicative instance, which has a lot in common with the arrow interface.
  136. --   We would want to take the cross product instead of the zipping functor  
  137. --   This looks rather sane, we take both probabilities and multiply them
  138. instance Applicative (MarkovArrow a) where
  139.     pure f = MA $ \i -> [(f, 1)]
  140.     (<*>) (MA f) (MA g) = MA $ \i -> let xs = toProb $ f i
  141.                          ns = toProb $ g i
  142.                      in fromProb $  [ (h a, p * p') | (h,p) <- xs, (a,p') <- ns]  
  143.  
  144. -- | Create the arrow interface, this doesn't look to good unfortunetaly, I bet this thing isn't an arrow  
  145. --  but the arrow is probably sane
  146. instance Arrow MarkovArrow where
  147.     arr f= MA $ \i -> [(f i, 1)]
  148.     first (MA f) = MA $ \(i,s) -> let xs = f i
  149.                       in fmap (first (\x -> (x,s))) xs  
  150.  
  151.  
  152. -- | This is a class which specify the relation between out and order,
  153. class (Hashable order, Eq order) => Order order out where
  154.     takeOrder :: [out] -> [(order, out)]
  155.     shiftOrder :: out -> order -> order
  156.    
  157.  
  158. instance Order One Char where
  159.     takeOrder (x:y:_) = return (x,y)
  160.     takeOrder _ = mzero
  161.     shiftOrder c _ = c
  162.  
  163. instance Order Two Char where
  164.     takeOrder (x:y:z:_) = return ((x,y),z)
  165.     takeOrder _ = mzero
  166.     shiftOrder c (a,b) = (b,c)
  167.  
  168. instance Order Three Char where
  169.     takeOrder (x:y:z:p:_) = return ((x,y,z),p)
  170.     takeOrder _ = mzero
  171.     shiftOrder c (a,b,d) = (b,d,c)
  172.  
  173. instance Order Four Char where
  174.     takeOrder (x:y:z:p:q:_) = return ((x,y,z,p),q)
  175.     takeOrder _ = mzero
  176.     shiftOrder q (a,b,c,d) = (b,c,d,q)
  177.  
  178. instance Order Five Char where
  179.     takeOrder (x:y:z:p:q:r:_) = return ((x,y,z,p,q),r)
  180.     takeOrder _ = mzero
  181.     shiftOrder q (a,b,c,d,e) = (b,c,d,e,q)
  182. instance Order Six Char where
  183.     takeOrder (x:y:z:p:q:r:s:_) = return ((x,y,z,p,q,r),s)
  184.     takeOrder _ = mzero
  185.     shiftOrder q (a,b,c,d,e,f) = (b,c,d,e,f,q)
  186. instance Order Seven Char where
  187.     takeOrder (x:y:z:p:q:r:s:t:_) = return ((x,y,z,p,q,r,s),t)
  188.     takeOrder _ = mzero
  189.     shiftOrder q (a,b,c,d,e,f,g) = (b,c,d,e,f,g,q)
  190.  
  191. instance Order OneWord String where
  192.     takeOrder (x:y:_) = return (x,y)
  193.     takeOrder _ = mzero
  194.     shiftOrder q a = q
  195.  
  196. instance Order TwoWord String where
  197.     takeOrder (x:y:z:_) = return ((x,y),z)
  198.     takeOrder _ = mzero
  199.     shiftOrder q (a,b) = (b,q)
  200.  
  201. instance Order ThreeWord String where
  202.     takeOrder (x:y:z:p:_) = return ((x,y,z),p)
  203.     takeOrder _ = mzero
  204.     shiftOrder q (a,b,c) = (b,c,q)
  205.  
  206. instance Order FourWord String where
  207.     takeOrder (x:y:z:p:q:_) = return ((x,y,z,p),q)
  208.     takeOrder _ = mzero
  209.     shiftOrder q (a,b,c,d) = (b,c,d,q)
  210.  
  211. one :: One
  212. one = undefined
  213. two :: Two
  214. two = undefined
  215. three :: Three
  216. three = undefined
  217. four :: Four
  218. four = undefined
  219. five :: Five
  220. five = undefined  
  221. six :: Six
  222. six = undefined
  223. seven :: Seven
  224. seven = undefined
  225. oneWord :: OneWord
  226. oneWord = undefined
  227. twoWord :: TwoWord
  228. twoWord = undefined
  229. threeWord :: ThreeWord
  230. threeWord = undefined
  231. fourWord :: FourWord
  232. fourWord = undefined
  233.  
  234.  
  235. testntOrder :: Order o Char => o -> IO ()
  236. testntOrder o = do
  237.     xs <- analyzeFromUrls o [
  238.         "http://www.textfiles.com/sex/808-lust.txt",
  239.         "http://www.ewtn.com/library/liturgy/womord.txt",
  240.         "http://www.textfiles.com/sex/808-next.txt",
  241.         "http://www.textfiles.com/sex/a_friend.txt",
  242.         "http://www.textfiles.com/sex/camptrip.txt",
  243.         "http://www.textfiles.com/sex/clothes.pin",
  244.         "http://www.brothermike.com/Outlines/2002text/s120802.txt",
  245.         "http://www.rfc-editor.org/rfc/rfc1127.txt",
  246.         "http://www.rfc-editor.org/rfc/rfc1141.txt"
  247.         ]
  248.     c <- newStdGen
  249.     writeFile "generated" $  take 5000 $  generateString c xs
  250.  
  251. -- | Analyze and build a markov model from specified urls
  252. -- o is the order of the markov model
  253. analyzeFromUrls :: Order order Char => order -> [String] -> IO (MarkovModel order Char)
  254. analyzeFromUrls o xs = do
  255.             reqs <- forM xs $ \s -> simpleHTTP (getRequest s)
  256.             body <- forM reqs $ getResponseBody
  257.             return $ analyzeOrder o (concat body)
  258.  
  259. -- | Analyze a markov model from a given string
  260. analyzeOrder :: Order o Char => o -> String -> MarkovModel o Char
  261. analyzeOrder _ xs = createPairs (collapseSpaces $ filterShit xs) takeOrder
  262.  
  263.     where filterShit (x:xs) = case x of
  264.                     '\n' -> ' ' : filterShit xs
  265.                     '\r' -> filterShit xs
  266.                     '"' -> filterShit xs
  267.                     '\'' -> filterShit xs
  268.                     '-' -> filterShit xs
  269.                     c -> c : filterShit xs  
  270.           filterShit [] = []
  271. -- | Analyze an markov model for more general types
  272. gAnalyzeOrder :: Eq out => Order order out => order -> [out] -> MarkovModel order out
  273. gAnalyzeOrder _ xs = createPairs xs takeOrder
  274.  
  275. testGAnalyze o  = do
  276.         let urls = [    "http://www.textfiles.com/sex/808-lust.txt",
  277.             "http://www.ewtn.com/library/liturgy/womord.txt",
  278.             "http://www.textfiles.com/sex/808-next.txt",
  279.             "http://www.textfiles.com/sex/a_friend.txt",
  280.             "http://www.textfiles.com/sex/camptrip.txt",
  281.             "http://www.textfiles.com/sex/clothes.pin",
  282.             "http://www.brothermike.com/Outlines/2002text/s120802.txt",
  283.             "http://www.rfc-editor.org/rfc/rfc1127.txt",
  284.             "http://www.rfc-editor.org/rfc/rfc1141.txt"  ]
  285.  
  286.         xs <- forM urls $ \s -> simpleHTTP (getRequest s)
  287.         body <- forM xs $ \p -> getResponseBody p  
  288.         let mos = gAnalyzeOrder o (words $ collapseSpaces $ concat body)
  289.         n <- newStdGen
  290.         let bs =  generateString n mos
  291.         writeFile "generated-words" $ unwords (take 50 bs)
  292.  
  293. collapseSpaces :: String -> String
  294. collapseSpaces (' ': ' ': xs) = collapseSpaces (' ':xs)
  295. collapseSpaces (x:xs) = x : collapseSpaces xs
  296. collapseSpaces [] = []
  297.  
  298. -- | Generate a string from a markov model with a given seed
  299. generateString :: (Order order a, RandomGen g) => g -> MarkovModel order a -> [a]
  300. generateString stdg m = let (k,g) = choose stdg (S.keys $ unMM m)
  301.             in chainString k m g
  302.  
  303. -- | Generate a string from a starting seed and a markov model
  304. chainString :: (Order a b, RandomGen g) => a -> MarkovModel a b -> g -> [b]
  305. chainString a m g = case S.lookup a (unMM m) of
  306.                 Nothing -> []
  307.                 Just str -> let (n, g') = freq str g
  308.                         in n : chainString (shiftOrder n a) m g'
  309.  
  310.  
  311. freq :: RandomGen g => [(a, Freq)] -> g -> (a, g)
  312. freq xs g = let (p,g') = randomR (1, tot) g
  313.             tot = sum (fmap snd xs)
  314.       in (pick p xs,g')
  315.  
  316. pick :: Freq -> [(a,Freq)] -> a
  317. pick n ((a,k):xs) | n <= k = a
  318.           | otherwise = pick (n - k) xs
  319.  
  320.    
  321. choose :: RandomGen g => g -> [a] -> (a, g)
  322. choose std xs = let (a, g) = randomR (0, length xs - 1) std in (xs !! a, g)
  323.  
  324.  
  325. -- | This looks supiciously comonadic to me
  326. createPairs :: (Eq a, Hashable a, Eq b) => [b] -> ([b] -> [(a,b)]) -> MarkovModel a b
  327. createPairs [] _ = MM mempty  
  328. createPairs xs f =  f xs `squashProb` (createPairs (tail xs) f)
  329.     where squashProb xs m = MM $ foldr step (unMM m) xs
  330.             where step x z = updateProb (fst x) (snd x) z  
  331.  
  332. updateProb :: (Eq a, Hashable a, Eq b) => a -> b -> S.HashMap a [(b,Freq)] -> S.HashMap a [(b, Freq)]
  333. updateProb a c m = case (S.lookup a m) of
  334.             Nothing -> S.insert a [(c,1)] m
  335.             Just ts -> S.insert a (intoList c ts) m  
  336.         where intoList c (x:xs) | c == fst x = second (+1) x : xs  
  337.                     | otherwise = x : intoList c xs
  338.               intoList c [] = [(c,1)]
Advertisement
Add Comment
Please, Sign In to add comment