Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- class CorpusModel m where
- tags :: m -> [Tag]
- tag_name :: m -> Tag -> String
- tag_count_u :: m -> Tag -> Int
- tag_count_b :: m -> (Tag, Tag) -> Int
- tag_count_t :: m -> (Tag, Tag, Tag) -> Int
- word_tag_count :: m -> Word -> Tag -> Int
- viterbi :: CorpusModel m => m -> [Word] -> [Tag]
- viterbi m s = let -- list all possible ends tag pairs, and find the maximum one.
- candidates = [(taccum (len+1) t1 t2 TagStop, (t1, t2)) | t1 <- validtags, t2 <- validtags]
- (_, (t1, t2)) = maximumBy value_compare candidates
- -- utility step function that extract the previous tag
- -- from tag pair (t1,t2) at step n.
- step (n,t1,t2) = fromJust $ snd $ pi_cached !# n !# (t1,t2)
- -- repeat the above step utils the full path of tags are extracted,
- -- note that this path is reversed.
- path = t2 : t1 : map step (zip3 (reverse [3..len]) (tail path) path)
- in reverse path
- where
- validtags = tags m
- len = length s
- min_tag = TagStart
- max_tag = maximum validtags
- tag_range = ((min_tag,min_tag),(max_tag,max_tag))
- sentence = listArray (1,len) s
- -- word probability
- wprob n = let cnt = word_tag_count m (sentence !# n)
- in \t -> (fromIntegral (cnt t) / fromIntegral (tag_count_u m t)) :: Double
- -- one-step transition probability
- tprob t1 t2 t3 = (fromIntegral (tag_count_t m (t1,t2,t3)) / fromIntegral (tag_count_b m (t1,t2))):: Double
- -- accumulated transition probability
- taccum 1 t1 t2 t3 = tprob t1 t2 t3
- taccum n t1 t2 t3 = get_value (pi_cached !# (n-1) !# (t1, t2)) * tprob t1 t2 t3
- pi :: Int -> (Tag,Tag) -> (Double, Maybe Tag)
- pi n (t2,t3) | t3 `elem` [TagStart, TagStop] = (0, Nothing)
- | n == 1 && t2 /= TagStart = (0, Nothing)
- | n > 1 && t2 `elem` [TagStart, TagStop] = (0, Nothing)
- -- (n = 1 ==> t2 = TagStart) && (n > 1 ==> t2 /= TagStart,TagStop)
- | otherwise = let wprobn = wprob n
- in if n <= 2 then
- (taccum n TagStart t2 t3 * wprobn t3, Just TagStart)
- else
- let candidates = [(taccum n t1 t2 t3 * wprobn t3, Just t1) | t1 <- validtags]
- in maximumBy value_compare candidates
- -- cached pi values
- -- by the lazy semantics, each value is computed only when it is necessary and always computed only once.
- pi_cached :: Array Int (Array (Tag,Tag) (Double, Maybe Tag))
- pi_cached = listArray (1,len)
- ( flip map [1..len] (\n ->
- listArray tag_range $ map (pi n) (range tag_range)))
- get_value (v, _) = v
- value_compare a b = compare (get_value a) (get_value b)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement