{-# LANGUAGE OverloadedStrings #-} module Text.MFKC ( distance ) where ---------------------------------------------------------------------------- -- | -- Module : MKFC -- Author : (C) Andrew Fatkulin -- License : BSD-3-Clause (see the file LICENSE) -- -- -- Computing the most frequent k characters distance of two strings ---------------------------------------------------------------------------- import Data.List (sortOn) import Data.Text (Text) import Data.HashMap.Strict((!), HashMap) import qualified Data.Text as Tx import qualified Data.HashMap.Strict as Hm infixr 1 ? (?) :: Bool -> a -> a -> a (?) True x _ = x (?) False _ y = y type MkHash = HashMap Char Int toHash :: Text -> MkHash toHash text = Tx.foldl fillMap emptyMap text where emptyMap = Hm.empty :: MkHash fillMap = \acc ch -> let value = (Hm.member ch acc) ? (acc ! ch) + 1 $ 1 in Hm.insert ch value acc takeMostFrequentK :: Int -> MkHash -> MkHash takeMostFrequentK k hash = Hm.fromList . (take k) . reverse . (sortOn snd) . Hm.toList $ hash calcSimilarity :: (Int, Int, Int) -> Int calcSimilarity (sim, v1, v2) = v1 == v2 ? (sim + v2) $ (sim + v1 + v2) similarity :: MkHash -> MkHash -> Int similarity h1 h2 = Hm.foldlWithKey' calc 0 h2 where calc acc k v = (Hm.member k h1) ? calcSimilarity (acc, v, ( h1 ! k )) $ acc distance :: Int -> Int -> Text -> Text -> Int distance k max text text2 = max - (similarity hash1 hash2) where hash1 = ((takeMostFrequentK k) . toHash) text hash2 = ((takeMostFrequentK k) . toHash) text2