Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Sample corpus: four short documents about the Eiffel Tower, used below
// to demonstrate tf-idf term weighting.
let doc1 = "The Eiffel Tower was built for the International Exhibition of Paris of 1889 commemorating the centenary of the French Revolution. Of the 700 proposals submitted in a design competition, Gustave Eiffel’s was unanimously chosen."
let doc2 = "A signature of the Las Vegas skyline, the replica Eiffel Tower at Paris Las Vegas is an exact reproduction of one of Europe’s highest towers."
let doc3 = "Alexandre Gustave Eiffel was born in Dijon, France, in 1832"
let doc4 = "Another undisputed great engineering feat is the Eiffel Tower, a huge wrought-iron skeleton in Paris. Alexandre Gustave Eiffel designed the 984-foot tower for the exposition of 1889. The tower contains about 7,000 short tons of iron and steel."
/// A document in the corpus.
/// Text   – the raw document text.
/// Length – a precomputed per-document value; not read anywhere in this
///          file (presumably a normalized document length — TODO confirm).
/// Index  – 1-based position of the document in the corpus.
type Document =
    { Text: string; Length: float; Index: int }
// The corpus as Document records; Index values are 1-based and the list
// order matches the doc1..doc4 numbering.
let docs = [{Text = doc1; Length = 0.51; Index = 1};
            {Text = doc2; Length = 0.87; Index = 2};
            {Text = doc3; Length = 0.58; Index = 3};
            {Text = doc4; Length = 0.70; Index = 4}]
/// A document paired with its bag-of-words representation:
/// Tokens maps each term to its occurrence count within Doc.
type TokenizedDoc =
    { Doc: Document; Tokens: Map<string, int> }
/// Tokenizes a document: lower-cases the text, replaces '.' and ',' with
/// spaces, splits on single spaces (dropping empties), and builds a
/// term -> occurrence-count map.
let tokenize (doc: Document) =
    let cleaned = doc.Text.ToLower().Replace(".", " ").Replace(",", " ")
    let terms =
        cleaned.Split([|" "|], System.StringSplitOptions.RemoveEmptyEntries)
        |> List.ofArray
    // countBy yields one (term, count) pair per distinct term — the same
    // result as folding tryFind/add by hand.
    let counts = terms |> List.countBy id |> Map.ofList
    { Doc = doc; Tokens = counts }
// Tokenize every document in the corpus.
let tokenized = docs |> List.map tokenize

/// Corpus-wide term frequencies: each term mapped to the total number of
/// occurrences summed over all documents. NOTE(review): not referenced
/// anywhere else in this file.
let totalCount =
    tokenized
    |> List.collect (fun t -> Map.toList t.Tokens)
    |> List.fold
        (fun acc (term, count) ->
            let prev = acc |> Map.tryFind term |> Option.defaultValue 0
            acc |> Map.add term (prev + count))
        Map.empty
/// Computes the tf-idf weight of [term] in [doc] (a TokenizedDoc) and
/// returns a one-line report string.
///   tf  – raw occurrence count of the term in the document.
///   df  – number of corpus documents containing the term.
///   idf – log10(N / df); defined as 0.0 when df = 0 so a term absent
///         from the whole corpus does not divide by zero.
///   w   – (1 + log10 tf) * idf; defined as 0.0 when tf = 0, the standard
///         tf-idf convention — the original code computed log10 0.0 here,
///         which is -infinity in .NET and garbled the report for any
///         document lacking the term (e.g. doc3 with "tower").
let tdidf doc term =
    let tf_t_d =
        doc.Tokens
        |> Map.tryFind term
        |> Option.defaultValue 0
        |> float
    let df_t =
        tokenized
        |> List.sumBy (fun t -> if Map.containsKey term t.Tokens then 1 else 0)
        |> float
    let idf_t =
        if df_t > 0.0
        then log10 ((List.length tokenized |> float) / df_t)
        else 0.0
    let w_t_d =
        if tf_t_d > 0.0
        then (1.0 + log10 tf_t_d) * idf_t
        else 0.0
    sprintf "Doc[%d]: tf=%d; df=%d; idf=%f; w=%f" doc.Doc.Index (tf_t_d |> int) (df_t |> int) idf_t w_t_d
// Print the tf-idf report for the term "tower" over every document,
// one line per document, framed by a leading and trailing newline.
tokenized
|> List.map (fun doc -> tdidf doc "tower")
|> String.concat "\n"
|> fun body -> printf "%s" ("\n" + body + "\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement