Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- namespace DecisionTree
- module DecisionTree =
- open System
/// Class label attached to every training row and produced by classification.
type Fish =
    | Yes
    | No
    | Maybe
/// A decision tree over integer-valued attributes.
type Tree =
    /// Leaf node: the class assigned to anything that reaches it.
    | Conclusion of Fish
    /// Inner node: the name of the attribute being tested, paired with one
    /// (attribute value, sub-tree) entry per known value of that attribute.
    | Choice of string * (int * Tree) array
/// One labelled training row: attribute values plus the class they map to.
type Datum =
    { attrs: int array
      fish: Fish }
/// Base-2 logarithm of `x`.
let log2 (x: float) : float =
    Math.Log(x, 2.0)
/// Entropy, in bits, of the class labels (`fish`) found in `dataset`:
/// the sum over every class of `-p * log2 p`, where `p` is the fraction of
/// rows carrying that class. An empty dataset yields 0.0.
let calcEntropy (dataset: Datum[]) =
    let total = Array.length dataset

    // Contribution of a single class to the overall entropy.
    let term (_, members) =
        let p = float (Array.length members) / float total
        -p * log2 p

    dataset
    |> Array.groupBy (fun row -> row.fish)
    |> Array.sumBy term
/// Returns a copy of `arr` without the element at index `i`.
/// Throws when `i` is not a valid index of `arr`.
let remove i arr =
    let before, fromIndex = Array.splitAt i arr
    // `fromIndex` starts with the element being dropped.
    Array.append before (Array.skip 1 fromIndex)
/// Keeps the rows of `dataset` whose attribute at index `axis` equals `value`,
/// and strips that attribute from each row that is kept.
let splitDataSet (dataset: Datum[]) axis value =
    dataset
    |> Array.choose (fun row ->
        if row.attrs.[axis] = value then
            Some { row with attrs = remove axis row.attrs }
        else
            None)
/// Sample training data: a pair of (rows, attribute names), where the i-th
/// name describes the i-th entry of every row's `attrs`.
let createDataSet =
    let row attrs fish = { attrs = attrs; fish = fish }
    let labels = [| "no surfacing"; "flippers"; "num legs" |]

    let ds =
        [| row [| 1; 1; 0 |] Yes
           row [| 1; 1; 0 |] Yes
           row [| 1; 0; 0 |] No
           row [| 0; 1; 2 |] Maybe
           row [| 0; 1; 0 |] No |]

    ds, labels
/// Values of the attribute at index `i`, one per row of `ds`, in row order.
let column (ds: Datum[]) i =
    [| for row in ds -> row.attrs.[i] |]
/// Most frequent class label in `ds`. When several classes tie, the one that
/// appears first in `ds` wins. Throws on an empty dataset.
let majority (ds: Datum[]) =
    ds
    |> Array.groupBy (fun row -> row.fish)
    |> Array.maxBy (fun (_, members) -> Array.length members)
    |> fst
/// Picks the attribute index whose split yields the largest information gain
/// (entropy of `ds` minus the size-weighted entropy of the subsets obtained by
/// grouping on that attribute). Ties go to the lowest index.
///
/// Returns `None` when there is nothing to split on: the dataset is empty or
/// its rows carry no attributes. The attribute count is taken from the first
/// row; every row is assumed to have the same number of attributes.
let chooseBestFeatureToSplit (ds: Datum[]) =
    if Array.isEmpty ds then
        // Previously `ds.[0]` threw here; an empty dataset simply has no split.
        None
    else
        let numFeatures = Array.length ds.[0].attrs

        if numFeatures <= 0 then
            None
        else
            let baseEntropy = calcEntropy ds
            let datasetLength = float (Array.length ds)

            // Size-weighted entropy of the subsets produced by splitting on
            // attribute `i`. Entropy only looks at the class label, so the
            // rows are grouped as they are instead of being rebuilt without
            // the attribute.
            let conditionalEntropy i =
                ds
                |> Array.groupBy (fun row -> row.attrs.[i])
                |> Array.sumBy (fun (_, subset) ->
                    float (Array.length subset) / datasetLength * calcEntropy subset)

            seq { 0 .. numFeatures - 1 }
            |> Seq.maxBy (fun i -> baseEntropy - conditionalEntropy i)
            |> Some
/// Builds a decision tree from `dataset`, a pair of (rows, attribute names)
/// where the i-th name describes the i-th attribute of every row.
///
/// Recursion stops with a `Conclusion` when either
///   * every row in the node has the same class, or
///   * no attribute is left to split on (the majority class is used).
/// Otherwise the node becomes a `Choice` on the attribute selected by
/// `chooseBestFeatureToSplit`, with one branch per value seen in the rows.
///
/// The dataset must be non-empty; an empty one still ends in an exception.
let rec buildTree dataset =
    let (ds: Datum[]), (labels: string[]) = dataset

    match ds |> Array.map (fun row -> row.fish) |> Array.distinct with
    | [| onlyClass |] ->
        // Pure node: splitting further would only add single-branch chains
        // that make classification fail on attribute values it never needed.
        Conclusion onlyClass
    | _ ->
        match chooseBestFeatureToSplit ds with
        | None -> Conclusion(majority ds)
        | Some featureIndex ->
            let label = labels.[featureIndex]
            let splitLabels = remove featureIndex labels

            let trees =
                ds
                |> Array.groupBy (fun row -> row.attrs.[featureIndex])
                |> Array.map (fun (value, group) ->
                    // The chosen attribute is consumed: drop it from the rows
                    // so indices keep lining up with `splitLabels`.
                    let subset =
                        group
                        |> Array.map (fun row -> { row with attrs = remove featureIndex row.attrs })

                    value, buildTree (subset, splitLabels))

            Choice(label, trees)
/// Hand-written tree: test "no surfacing" first, then "flippers".
let manualTree =
    let flippers =
        Choice("flippers", [| (1, Conclusion Yes); (0, Conclusion No) |])

    Choice("no surfacing", [| (0, flippers); (1, Conclusion Maybe) |])
/// Walks `tree` using the attribute values in `subject`, a sequence of
/// (attribute name, value) pairs, and returns the class at the leaf reached.
///
/// Returns `None` when no conclusion can be reached: the subject does not
/// provide the attribute a node tests, or the node has no branch for the
/// subject's value. When a name occurs more than once in `subject`, the
/// first occurrence is used.
let rec classify subject tree =
    match tree with
    | Conclusion c -> Some c
    | Choice (label, options) ->
        subject
        // tryFind rather than find: a missing attribute means "unknown",
        // not an exception.
        |> Seq.tryFind (fun (key, _) -> key = label)
        |> Option.bind (fun (_, state) ->
            options |> Array.tryFind (fun (branchValue, _) -> branchValue = state))
        |> Option.bind (fun (_, subtree) -> classify subject subtree)
/// Smoke run over the sample data: prints the entropy, the best attribute to
/// split on, the classification from the hand-written tree, the generated
/// tree and its classification of the same subject. Evaluates to 0.
let test =
    let sampleData = createDataSet
    let rows, _ = sampleData
    let entropy = calcEntropy rows
    let subject = [| ("no surfacing", 0); ("flippers", 1); ("num legs", 0) |]

    printfn "entropy: %A" entropy
    //printfn "split: %A" (splitDataSet rows 0 1)
    printfn "best feature: %A" (chooseBestFeatureToSplit rows)
    printfn "manual classify: %A" (classify subject manualTree)

    let generated = buildTree sampleData
    printfn "tree: %A" generated

    match classify subject generated with
    | Some verdict -> printfn "auto classify: %A" verdict
    | None -> printfn "auto classify: Unknown"

    0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement