Advertisement
Guest User

Untitled

a guest
Feb 17th, 2014
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
F# 2.63 KB | None | 0 0
  1. let doc1 = "The Eiffel Tower was built for the International Exhibition of Paris of 1889 commemorating the centenary of the French Revolution. Of the 700 proposals submitted in a design competition, Gustave Eiffel’s was unanimously chosen." // Sample text 1: origin of the Eiffel Tower.
  2. let doc2 = "A signature of the Las Vegas skyline, the replica Eiffel Tower at Paris Las Vegas is an exact reproduction of one of Europe’s highest towers." // Sample text 2: the Las Vegas replica.
  3. let doc3 = "Alexandre Gustave Eiffel was born in Dijon, France, in 1832" // Sample text 3: the only sample that does not contain the word "tower".
  4. let doc4 = "Another undisputed great engineering feat is the Eiffel Tower, a huge wrought-iron skeleton in Paris. Alexandre Gustave Eiffel designed the 984-foot tower for the exposition of 1889. The tower contains about 7,000 short tons of iron and steel." // Sample text 4: mentions "tower" three times.
  5.  
  6. /// A corpus entry: its raw text, a float Length (not read anywhere in this file - meaning to be confirmed) and an index.
  7. type Document = { Text: string; Length: float; Index: int }
  8.  
  9. // The corpus: the four sample texts, each paired with its Length value and numbered 1..4 in order.
  10. let docs =
  11.     [ (doc1, 0.51); (doc2, 0.87); (doc3, 0.58); (doc4, 0.70) ]
  12.     |> List.mapi (fun i (text, len) -> { Text = text; Length = len; Index = i + 1 })
  13.  
  14. /// A document together with its term table: each term mapped to an integer count (filled in by tokenize).
  15. type TokenizedDoc = { Doc: Document; Tokens: Map<string, int> }
  16.  
  17. /// Breaks a document's text into lower-cased terms and counts the occurrences of each.
  18. /// Periods and commas are replaced by spaces before splitting on spaces; any other
  19. /// character (apostrophes, hyphens, digits, ...) remains part of its term.
  20. let tokenize (doc:Document) =
  21.     let normalized = doc.Text.ToLower().Replace(".", " ").Replace(",", " ")
  22.     // RemoveEmptyEntries drops the empty strings produced by consecutive separators.
  23.     let words = normalized.Split ([|" "|], System.StringSplitOptions.RemoveEmptyEntries)
  24.     let frequencies =
  25.         words
  26.         |> Seq.countBy id
  27.         |> Map.ofSeq
  28.     {Doc = doc; Tokens = frequencies}
  29.  
  30. // Every corpus document, tokenized once up front; tdidf and the report below read this list.
  31. let tokenized = List.map tokenize docs
  32. // Corpus-wide term counts: the per-document token maps merged into one map,
  33. // summing the counts of terms that occur in several documents.
  34. let totalCount =
  35.     // Adds n occurrences of term to the running totals.
  36.     let addTerm (totals: Map<string, int>) term n =
  37.         let soFar = defaultArg (Map.tryFind term totals) 0
  38.         Map.add term (soFar + n) totals
  39.     tokenized
  40.     |> List.fold (fun totals t -> Map.fold addTerm totals t.Tokens) Map.empty
  41.  
  42. /// Formats the tf-idf statistics of `term` for one tokenized document, where
  43. /// tf = occurrences of the term in `doc`, df = number of documents in `tokenized`
  44. /// containing it, idf = log10 (N / df) and w = (1 + log10 tf) * idf.
  45. /// A term absent from `doc` gets w = 0 and a term absent from every document gets
  46. /// idf = 0, instead of the non-finite values that log10 0 and division by zero yield.
  47. let tdidf doc term =
  48.     let tf_t_d = defaultArg (Map.tryFind term doc.Tokens) 0 |> float
  49.     let df_t =
  50.         tokenized
  51.         |> List.filter (fun t -> Map.containsKey term t.Tokens)
  52.         |> List.length |> float
  53.     // Guard the division: df = 0 would otherwise make idf infinite (or NaN for an empty corpus).
  54.     let idf_t = if df_t > 0.0 then log10 ((List.length tokenized |> float) / df_t) else 0.0
  55.     // log10 0 is -infinity, so the log-scaled weight only applies when tf > 0.
  56.     let w_t_d = if tf_t_d > 0.0 then (1.0 + (log10 tf_t_d)) * idf_t else 0.0
  57.     sprintf "Doc[%d]: tf=%d; df=%d; idf=%f; w=%f" doc.Doc.Index (tf_t_d |> int) (df_t |> int) idf_t w_t_d
  58.  
  59. // Print one tf-idf line per document for the term "tower", framed by a leading and a trailing newline.
  60. tokenized
  61. |> List.map (fun d -> tdidf d "tower")
  62. |> String.concat "\n" |> fun lines -> printf "%s" ("\n" + lines + "\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement