Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- module TFIDF
- open Types
- open Text
- open NgramBag
- open FSharpPlus
// Corpus-level weight per term. Despite the "Tf" in the name, the `corpusTf`
// function in this file fills it with inverse-document-frequency values.
- type CorpusTf = Map<Term, float>
// Raw occurrence count of each term within a single document
// (as produced by `getTermFreqs`).
- type DocumentTf = Map<Term, int>
// Weighted score per term for a single document (the result type of `tfIdfDocument`).
- type DocumentTfIdf = Map<Term, float>
/// Document frequency of `term`: the number of documents in `corpus`
/// whose term-count map has `term` as a key. Returns 0 for an empty corpus.
let documentFrequency (corpus: seq<DocumentTf>) term =
    corpus
    |> Seq.sumBy (fun docTf -> if Map.containsKey term docTf then 1 else 0)
/// Smoothed inverse document frequency: ln(sampleCount / (docFrequency + 1)).
/// The +1 in the denominator keeps the ratio finite when `docFrequency` is 0;
/// as a consequence the result is negative when `docFrequency >= sampleCount`.
let inverseDocumentFrequency sampleCount docFrequency =
    let totalDocs = float sampleCount
    let smoothedDocFreq = float docFrequency + 1.0
    log (totalDocs / smoothedDocFreq)
/// Scores one term of one document: the term's count in the document
/// (`docFreq`) multiplied by the corpus-level weight stored for `docTerm`.
/// A term missing from `corpusTf` is given a neutral weight of 1.0,
/// so its score is just its count.
let tfdf (corpusTf: CorpusTf) docTerm docFreq =
    let weight =
        match Map.tryFind docTerm corpusTf with
        | Some idf -> idf
        | None -> 1.0
    float docFreq * weight
/// Converts a document's term counts into weighted scores by applying
/// `tfdf` to every (term, count) entry; the key set is unchanged.
let tfIdfDocument (corpusTf: CorpusTf) (documentTf: DocumentTf) : DocumentTfIdf =
    documentTf
    |> Map.map (fun term count -> tfdf corpusTf term count)
/// Tallies how many times each distinct item occurs in `words`
/// and returns the tallies as a term -> count map.
let getTermFreqs words : DocumentTf =
    words
    |> Seq.groupBy id
    |> Seq.map (fun (term, occurrences) -> term, Seq.length occurrences)
    |> Map.ofSeq
/// Computes the term counts of one tokenized document: every token is
/// translated through `corpusNgramBag` and the resulting values are tallied.
/// The lookup uses `Map.find`, so a token absent from the bag raises
/// `KeyNotFoundException`.
let tfDocument (corpusNgramBag: NgramBag) (document: TokenizedDocument) =
    let termOf token = Map.find token corpusNgramBag
    document
    |> Seq.map termOf
    |> getTermFreqs
/// Smoothed inverse document frequency: ln(sampleCount / (docFrequency + 1)).
/// This is the same formula as `inverseDocumentFrequency`; it delegates to that
/// function so the two cannot drift apart, and the name is kept for any
/// existing callers.
let calcInverseDocFreq sampleCount docFrequency =
    inverseDocumentFrequency sampleCount docFrequency
/// Builds the corpus-level weight table: for every value stored in
/// `corpusNgramBag`, the smoothed inverse document frequency of that term
/// across `corpus` (an array of per-document term counts).
///
/// Document frequencies are computed in a single pass over the corpus and
/// then looked up per term, instead of rescanning every document for every
/// term. Terms that appear in no document get a document frequency of 0.
let corpusTf corpus (corpusNgramBag: NgramBag) : CorpusTf =
    let sampleCount = Array.length corpus
    // Each document map lists a term at most once, so counting keys across
    // all documents yields the number of documents containing each term.
    let docFrequencies =
        corpus
        |> Seq.collect (fun (docTf: DocumentTf) -> docTf |> Map.toSeq |> Seq.map fst)
        |> Seq.countBy id
        |> Map.ofSeq
    let idf term =
        let docFrequency =
            docFrequencies |> Map.tryFind term |> Option.defaultValue 0
        inverseDocumentFrequency sampleCount docFrequency
    corpusNgramBag
    |> Map.toSeq
    |> Seq.map (fun (_, term) -> term, idf term)
    |> Map.ofSeq
/// End-to-end pipeline over an array of raw documents:
/// tokenize each document, build one n-gram bag from all tokens, count terms
/// per document, derive the corpus-level weight table, and weight every
/// document's counts with it.
/// The result is a lazy sequence (`Seq.map`) with one weighted map per
/// input document, in input order.
let documentsTfIdf corpus =
    let tokenized = corpus |> Array.map tokenize
    let ngramBag = tokenized |> Seq.collect id |> createNgramBag
    let termCounts = tokenized |> Array.map (tfDocument ngramBag)
    let weightTable = corpusTf termCounts ngramBag
    termCounts |> Seq.map (tfIdfDocument weightTable)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement