Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {-# LANGUAGE OverloadedStrings #-}
- module Text.MFKC ( distance ) where
- ----------------------------------------------------------------------------
- -- |
- -- Module : MKFC
- -- Author : (C) Andrew Fatkulin
- -- License : BSD-3-Clause (see the file LICENSE)
- --
- --
- -- Computing the most frequent k characters distance of two strings
- ----------------------------------------------------------------------------
- import Data.List (sortOn)
- import Data.Text (Text)
- import Data.HashMap.Strict((!), HashMap)
- import qualified Data.Text as Tx
- import qualified Data.HashMap.Strict as Hm
- infixr 1 ?
- (?) :: Bool -> a -> a -> a
- (?) True x _ = x
- (?) False _ y = y
- type MkHash = HashMap Char Int
- toHash :: Text -> MkHash
- toHash text = Tx.foldl fillMap emptyMap text
- where
- emptyMap = Hm.empty :: MkHash
- fillMap = \acc ch ->
- let value = (Hm.member ch acc) ? (acc ! ch) + 1 $ 1
- in Hm.insert ch value acc
- takeMostFrequentK :: Int -> MkHash -> MkHash
- takeMostFrequentK k hash = Hm.fromList . (take k) . reverse . (sortOn snd) . Hm.toList $ hash
- calcSimilarity :: (Int, Int, Int) -> Int
- calcSimilarity (sim, v1, v2) = v1 == v2 ? (sim + v2) $ (sim + v1 + v2)
- similarity :: MkHash -> MkHash -> Int
- similarity h1 h2 = Hm.foldlWithKey' calc 0 h2
- where
- calc acc k v = (Hm.member k h1)
- ? calcSimilarity (acc, v, ( h1 ! k ))
- $ acc
- distance :: Int -> Int -> Text -> Text -> Int
- distance k max text text2 = max - (similarity hash1 hash2)
- where
- hash1 = ((takeMostFrequentK k) . toHash) text
- hash2 = ((takeMostFrequentK k) . toHash) text2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement