Advertisement
evilbloodydemon

Untitled

Jul 23rd, 2015
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
F# 4.55 KB | None | 0 0
  1. namespace DecisionTree
  2.  
  3. module DecisionTree =
  4.     open System
  5.  
  6.     type Fish = Yes | No | Maybe
  7.  
  8.     type Tree =
  9.         | Conclusion of Fish
  10.         | Choice of string * (int * Tree)[]
  11.  
  12.     type Datum = {attrs: int[]; fish: Fish }
  13.  
  14.     let log2 x = Math.Log(x, 2.0)
  15.  
  16.     let calcEntropy (dataset: Datum[]) =
  17.         let numEntries = dataset |> Array.length
  18.  
  19.         dataset
  20.         |> Array.groupBy (fun x -> x.fish)
  21.         |> Array.sumBy (fun g ->
  22.             let count = g |> snd |> Seq.length
  23.             let p = (float count) / (float numEntries)
  24.             -p * log2 p)
  25.  
  26.     let remove i arr =
  27.         let aa = Array.splitAt i arr
  28.         Array.append (fst aa) (snd aa |> Array.skip 1)
  29.  
  30.     let splitDataSet (dataset: Datum[]) axis value =
  31.         dataset
  32.         |> Array.filter (fun x -> x.attrs.[axis] = value)
  33.         |> Array.map (fun x -> {x with attrs = (remove axis x.attrs) })
  34.  
  35.     let createDataSet =
  36.         let ds = [|
  37.             {attrs = [|1; 1; 0|]; fish = Yes};
  38.             {attrs = [|1; 1; 0|]; fish = Yes};
  39.             {attrs = [|1; 0; 0|]; fish = No};
  40.             {attrs = [|0; 1; 2|]; fish = Maybe};
  41.             {attrs = [|0; 1; 0|]; fish = No}; |]
  42.  
  43.         let labels = [| "no surfacing"; "flippers"; "num legs" |]
  44.  
  45.         ds, labels
  46.  
  47.     let column (ds: Datum[]) i =
  48.         ds |> Array.map (fun row -> row.attrs.[i])
  49.  
  50.     let majority (ds: Datum[]) =
  51.         ds
  52.         |> Array.map (fun x -> x.fish)
  53.         |> Array.groupBy (fun x -> x)
  54.         |> Array.maxBy (fun g -> snd g |> Array.length)
  55.         |> fst
  56.  
  57.     let chooseBestFeatureToSplit (ds: Datum[]) =
  58.         let numFeatures = ds.[0].attrs |> Array.length
  59.         let baseEntropy = calcEntropy ds
  60.         let datasetLength = Array.length ds
  61.  
  62.         if numFeatures <= 0 then
  63.             None
  64.         else
  65.             [0..numFeatures-1]
  66.             |> Seq.map (fun i ->
  67.                 let newEntropy =
  68.                     column ds i
  69.                     |> Array.distinct
  70.                     |> Array.map (fun value ->
  71.                         let split = splitDataSet ds i value
  72.                         let prob = float (Array.length split) / (float datasetLength)
  73.                         calcEntropy split * prob)
  74.                     |> Array.sum
  75.                
  76.                 newEntropy, i)
  77.             |> Seq.maxBy (fun newEntropy -> baseEntropy - (fst newEntropy))
  78.             |> snd
  79.             |> Some
  80.  
  81.     let rec buildTree dataset =
  82.         let ds, (labels: string[]) = dataset
  83.  
  84.         match chooseBestFeatureToSplit ds with
  85.         | None -> Conclusion (majority ds)
  86.         | Some featureIndex ->
  87.             let label = labels.[featureIndex]
  88.             let splitLabels = remove featureIndex labels
  89.  
  90.             let trees =
  91.                 ds
  92.                 |> Array.groupBy (fun row -> row.attrs.[featureIndex])
  93.                 |> Array.map (fun (value, group) ->
  94.                     let newds =
  95.                         group
  96.                         |> Array.map (fun x -> {x with attrs = (remove featureIndex x.attrs) })
  97.                     value, newds)
  98.                 |> Array.map (fun (value, newds) -> value, buildTree(newds, splitLabels))
  99.  
  100.             Choice(label, trees)
  101.  
  102.     let manualTree =
  103.         Choice
  104.             ("no surfacing",
  105.             [|  0,
  106.                 Choice
  107.                     ("flippers", [|1, Conclusion Yes; 0, Conclusion No|]);
  108.                 1, Conclusion Maybe
  109.             |])
  110.  
  111.     let rec classify subject tree =
  112.         match tree with
  113.         | Conclusion (c) -> Some c
  114.         | Choice (label, options) ->
  115.             let subjectState =
  116.                 subject
  117.                 |> Seq.find(fun (key, value) -> key = label)
  118.                 |> snd
  119.  
  120.             match options |> Array.tryFind (fun (option, tree) -> option = subjectState) with
  121.             | None -> None
  122.             | Some x -> snd x |> classify subject
  123.  
  124.     let test =
  125.         let ds = createDataSet
  126.         let dataset = fst ds
  127.         let ent = calcEntropy dataset
  128.         let test = [| "no surfacing", 0; "flippers", 1; "num legs", 0 |]
  129.  
  130.         printfn "entropy: %A" ent
  131.         //printfn "split: %A" (splitDataSet dataset 0 1)
  132.         printfn "best feature: %A" (chooseBestFeatureToSplit dataset)
  133.         printfn "manual classify: %A" (classify test manualTree)
  134.  
  135.         let tree = buildTree ds
  136.  
  137.         printfn "tree: %A" tree
  138.  
  139.         match classify test tree with
  140.         | None -> printfn "auto classify: Unknown"
  141.         | Some x -> printfn "auto classify: %A" x
  142.  
  143.         0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement