Advertisement
Guest User

Untitled

a guest
Mar 15th, 2019
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
F# 1.93 KB | None | 0 0
  1. module TFIDF
  2.     open Types
  3.     open Text
  4.     open NgramBag
  5.     open FSharpPlus
  6.  
  7.     type CorpusTf = Map<Term, float>
  8.     type DocumentTf = Map<Term, int>
  9.     type DocumentTfIdf = Map<Term, float>  
  10.  
  11.     let documentFrequency (corpus:seq<DocumentTf>) term  =
  12.         corpus |> Seq.filter (Map.containsKey term) |> Seq.length
  13.    
  14.     let inverseDocumentFrequency sampleCount docFrequency =
  15.         System.Math.Log (float sampleCount / (float docFrequency + 1.0))
  16.  
  17.     let tfdf (corpusTf: CorpusTf) docTerm docFreq =
  18.         let idf = corpusTf.TryFind docTerm |> Option.defaultValue 1.0
  19.         float docFreq * idf
  20.  
  21.     let tfIdfDocument  (corpusTf: CorpusTf) (documentTf:DocumentTf) : DocumentTfIdf =
  22.         Map.map (tfdf corpusTf) documentTf
  23.  
  24.     let getTermFreqs words : DocumentTf = words |> Seq.countBy id |> Map.ofSeq
  25.  
  26.     let tfDocument (corpusNgramBag:NgramBag) (document:TokenizedDocument) =
  27.         document
  28.         |> Seq.map ((flip Map.find) corpusNgramBag)
  29.         |> getTermFreqs
  30.  
  31.     let calcInverseDocFreq sampleCount docFrequency =
  32.         System.Math.Log (float sampleCount / (float docFrequency + 1.0))
  33.  
  34.     let corpusTf corpus (corpusNgramBag: NgramBag) : CorpusTf =
  35.         let uniqueTerms = Map.toSeq corpusNgramBag |> Seq.map snd
  36.         let corpusFreq = documentFrequency corpus
  37.         let sampleCount = Array.length corpus
  38.         let idf = corpusFreq >> inverseDocumentFrequency sampleCount
  39.         uniqueTerms
  40.         |> Seq.map (fun term -> term, idf term)
  41.         |> Map.ofSeq
  42.    
  43.  
  44.     let documentsTfIdf corpus =
  45.         let tokenizedDocunemts = Array.map tokenize corpus
  46.         let corpusNgramBag = Seq.collect id tokenizedDocunemts |> createNgramBag
  47.         let documentsTf = Array.map (tfDocument corpusNgramBag) tokenizedDocunemts
  48.         let corpusTf = corpusTf documentsTf corpusNgramBag
  49.         let documentsTfIdf = Seq.map (tfIdfDocument corpusTf) documentsTf
  50.         documentsTfIdf
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement