Advertisement
nanorocks

first_exam_2017_decision_tree

May 26th, 2018
857
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 15.31 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3.  
  4. #Дадено е податочно множество од риби кое ги содржи следните видови:
  5.  
  6. #    Code Finnish  Swedish    English        Latin
  7.  
  8.    #  1   Lahna    Braxen     Bream          Abramis brama
  9.  #    2   Siika    Iiden      Whitewish      Leusiscus idus
  10.    #  3   Saerki   Moerten    Roach          Leuciscus rutilus
  11.  #    4   Parkki   Bjoerknan  ?              Abramis bjrkna
  12.   #   5   Norssi   Norssen    Smelt          Osmerus eperlanus
  13.    #  6   Hauki    Jaedda     Pike           Esox lucius
  14.    #  7   Ahven    Abborre    Perch          Perca fluviatilis
  15. #Дадени се следните атрибути:
  16.  
  17.   #  0  Weight      Weight of the fish (in grams)
  18.   #  1  Length1     Length from the nose to the beginning of the tail (in cm)
  19.   #  2  Length2     Length from the nose to the notch of the tail (in cm)
  20.   #  3  Length3     Length from the nose to the end of the tail (in cm)
  21.    # 4  Height%     Maximal height as % of Length3
  22.    # 5  Width%      Maximal width as % of Length3
  23. #Класата е дадена во последната колона.
  24.  
  25. #Да се направи модел за класификација за даденото податочно множество. За тренинг да се земат
  26. #само првите 5 примероци од секоја од класите во множеството. Притоа ова да се направи во програмата,
  27. # а не со рачно копирање! Да се класифицира елементот од податочното множество даден на влез и да се
  28. # испечати предвидувањето. Елементот е даден со индексот од податочното множество.
  29.  
  30.  
  31.  
# Fish Catch data set, one row per fish:
#   [Weight(g), Length1(cm), Length2(cm), Length3(cm), Height%, Width%, species]
# Height% and Width% are percentages of Length3; the species code (last
# column) is the class label: 1=Bream, 2=Whitefish, 3=Roach, 4=Parkki,
# 5=Smelt, 6=Pike, 7=Perch (per the header comments above).
# NOTE(review): one Roach row has Weight 0.0 — appears to be a quirk carried
# over from the source data set; confirm before relying on the Weight column.
data = [[242.0, 23.2, 25.4, 30.0, 38.4, 13.4, 1],
        [290.0, 24.0, 26.3, 31.2, 40.0, 13.8, 1],
        [340.0, 23.9, 26.5, 31.1, 39.8, 15.1, 1],
        [363.0, 26.3, 29.0, 33.5, 38.0, 13.3, 1],
        [430.0, 26.5, 29.0, 34.0, 36.6, 15.1, 1],
        [450.0, 26.8, 29.7, 34.7, 39.2, 14.2, 1],
        [500.0, 26.8, 29.7, 34.5, 41.1, 15.3, 1],
        [390.0, 27.6, 30.0, 35.0, 36.2, 13.4, 1],
        [450.0, 27.6, 30.0, 35.1, 39.9, 13.8, 1],
        [500.0, 28.5, 30.7, 36.2, 39.3, 13.7, 1],
        [475.0, 28.4, 31.0, 36.2, 39.4, 14.1, 1],
        [500.0, 28.7, 31.0, 36.2, 39.7, 13.3, 1],
        [500.0, 29.1, 31.5, 36.4, 37.8, 12.0, 1],
        [500.0, 29.5, 32.0, 37.3, 37.3, 13.6, 1],
        [600.0, 29.4, 32.0, 37.2, 40.2, 13.9, 1],
        [600.0, 29.4, 32.0, 37.2, 41.5, 15.0, 1],
        [700.0, 30.4, 33.0, 38.3, 38.8, 13.8, 1],
        [700.0, 30.4, 33.0, 38.5, 38.8, 13.5, 1],
        [610.0, 30.9, 33.5, 38.6, 40.5, 13.3, 1],
        [650.0, 31.0, 33.5, 38.7, 37.4, 14.8, 1],
        [575.0, 31.3, 34.0, 39.5, 38.3, 14.1, 1],
        [685.0, 31.4, 34.0, 39.2, 40.8, 13.7, 1],
        [620.0, 31.5, 34.5, 39.7, 39.1, 13.3, 1],
        [680.0, 31.8, 35.0, 40.6, 38.1, 15.1, 1],
        [700.0, 31.9, 35.0, 40.5, 40.1, 13.8, 1],
        [725.0, 31.8, 35.0, 40.9, 40.0, 14.8, 1],
        [720.0, 32.0, 35.0, 40.6, 40.3, 15.0, 1],
        [714.0, 32.7, 36.0, 41.5, 39.8, 14.1, 1],
        [850.0, 32.8, 36.0, 41.6, 40.6, 14.9, 1],
        [1000.0, 33.5, 37.0, 42.6, 44.5, 15.5, 1],
        [920.0, 35.0, 38.5, 44.1, 40.9, 14.3, 1],
        [955.0, 35.0, 38.5, 44.0, 41.1, 14.3, 1],
        [925.0, 36.2, 39.5, 45.3, 41.4, 14.9, 1],
        [975.0, 37.4, 41.0, 45.9, 40.6, 14.7, 1],
        [950.0, 38.0, 41.0, 46.5, 37.9, 13.7, 1],
        [270.0, 23.6, 26.0, 28.7, 29.2, 14.8, 2],
        [270.0, 24.1, 26.5, 29.3, 27.8, 14.5, 2],
        [306.0, 25.6, 28.0, 30.8, 28.5, 15.2, 2],
        [540.0, 28.5, 31.0, 34.0, 31.6, 19.3, 2],
        [800.0, 33.7, 36.4, 39.6, 29.7, 16.6, 2],
        [1000.0, 37.3, 40.0, 43.5, 28.4, 15.0, 2],
        [40.0, 12.9, 14.1, 16.2, 25.6, 14.0, 3],
        [69.0, 16.5, 18.2, 20.3, 26.1, 13.9, 3],
        [78.0, 17.5, 18.8, 21.2, 26.3, 13.7, 3],
        [87.0, 18.2, 19.8, 22.2, 25.3, 14.3, 3],
        [120.0, 18.6, 20.0, 22.2, 28.0, 16.1, 3],
        [0.0, 19.0, 20.5, 22.8, 28.4, 14.7, 3],
        [110.0, 19.1, 20.8, 23.1, 26.7, 14.7, 3],
        [120.0, 19.4, 21.0, 23.7, 25.8, 13.9, 3],
        [150.0, 20.4, 22.0, 24.7, 23.5, 15.2, 3],
        [145.0, 20.5, 22.0, 24.3, 27.3, 14.6, 3],
        [160.0, 20.5, 22.5, 25.3, 27.8, 15.1, 3],
        [140.0, 21.0, 22.5, 25.0, 26.2, 13.3, 3],
        [160.0, 21.1, 22.5, 25.0, 25.6, 15.2, 3],
        [169.0, 22.0, 24.0, 27.2, 27.7, 14.1, 3],
        [161.0, 22.0, 23.4, 26.7, 25.9, 13.6, 3],
        [200.0, 22.1, 23.5, 26.8, 27.6, 15.4, 3],
        [180.0, 23.6, 25.2, 27.9, 25.4, 14.0, 3],
        [290.0, 24.0, 26.0, 29.2, 30.4, 15.4, 3],
        [272.0, 25.0, 27.0, 30.6, 28.0, 15.6, 3],
        [390.0, 29.5, 31.7, 35.0, 27.1, 15.3, 3],
        [55.0, 13.5, 14.7, 16.5, 41.5, 14.1, 4],
        [60.0, 14.3, 15.5, 17.4, 37.8, 13.3, 4],
        [90.0, 16.3, 17.7, 19.8, 37.4, 13.5, 4],
        [120.0, 17.5, 19.0, 21.3, 39.4, 13.7, 4],
        [150.0, 18.4, 20.0, 22.4, 39.7, 14.7, 4],
        [140.0, 19.0, 20.7, 23.2, 36.8, 14.2, 4],
        [170.0, 19.0, 20.7, 23.2, 40.5, 14.7, 4],
        [145.0, 19.8, 21.5, 24.1, 40.4, 13.1, 4],
        [200.0, 21.2, 23.0, 25.8, 40.1, 14.2, 4],
        [273.0, 23.0, 25.0, 28.0, 39.6, 14.8, 4],
        [300.0, 24.0, 26.0, 29.0, 39.2, 14.6, 4],
        [6.7, 9.3, 9.8, 10.8, 16.1, 9.7, 5],
        [7.5, 10.0, 10.5, 11.6, 17.0, 10.0, 5],
        [7.0, 10.1, 10.6, 11.6, 14.9, 9.9, 5],
        [9.7, 10.4, 11.0, 12.0, 18.3, 11.5, 5],
        [9.8, 10.7, 11.2, 12.4, 16.8, 10.3, 5],
        [8.7, 10.8, 11.3, 12.6, 15.7, 10.2, 5],
        [10.0, 11.3, 11.8, 13.1, 16.9, 9.8, 5],
        [9.9, 11.3, 11.8, 13.1, 16.9, 8.9, 5],
        [9.8, 11.4, 12.0, 13.2, 16.7, 8.7, 5],
        [12.2, 11.5, 12.2, 13.4, 15.6, 10.4, 5],
        [13.4, 11.7, 12.4, 13.5, 18.0, 9.4, 5],
        [12.2, 12.1, 13.0, 13.8, 16.5, 9.1, 5],
        [19.7, 13.2, 14.3, 15.2, 18.9, 13.6, 5],
        [19.9, 13.8, 15.0, 16.2, 18.1, 11.6, 5],
        [200.0, 30.0, 32.3, 34.8, 16.0, 9.7, 6],
        [300.0, 31.7, 34.0, 37.8, 15.1, 11.0, 6],
        [300.0, 32.7, 35.0, 38.8, 15.3, 11.3, 6],
        [300.0, 34.8, 37.3, 39.8, 15.8, 10.1, 6],
        [430.0, 35.5, 38.0, 40.5, 18.0, 11.3, 6],
        [345.0, 36.0, 38.5, 41.0, 15.6, 9.7, 6],
        [456.0, 40.0, 42.5, 45.5, 16.0, 9.5, 6],
        [510.0, 40.0, 42.5, 45.5, 15.0, 9.8, 6],
        [540.0, 40.1, 43.0, 45.8, 17.0, 11.2, 6],
        [500.0, 42.0, 45.0, 48.0, 14.5, 10.2, 6],
        [567.0, 43.2, 46.0, 48.7, 16.0, 10.0, 6],
        [770.0, 44.8, 48.0, 51.2, 15.0, 10.5, 6],
        [950.0, 48.3, 51.7, 55.1, 16.2, 11.2, 6],
        [1250.0, 52.0, 56.0, 59.7, 17.9, 11.7, 6],
        [1600.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
        [1550.0, 56.0, 60.0, 64.0, 15.0, 9.6, 6],
        [1650.0, 59.0, 63.4, 68.0, 15.9, 11.0, 6],
        [5.9, 7.5, 8.4, 8.8, 24.0, 16.0, 7],
        [32.0, 12.5, 13.7, 14.7, 24.0, 13.6, 7],
        [40.0, 13.8, 15.0, 16.0, 23.9, 15.2, 7],
        [51.5, 15.0, 16.2, 17.2, 26.7, 15.3, 7],
        [70.0, 15.7, 17.4, 18.5, 24.8, 15.9, 7],
        [100.0, 16.2, 18.0, 19.2, 27.2, 17.3, 7],
        [78.0, 16.8, 18.7, 19.4, 26.8, 16.1, 7],
        [80.0, 17.2, 19.0, 20.2, 27.9, 15.1, 7],
        [85.0, 17.8, 19.6, 20.8, 24.7, 14.6, 7],
        [85.0, 18.2, 20.0, 21.0, 24.2, 13.2, 7],
        [110.0, 19.0, 21.0, 22.5, 25.3, 15.8, 7],
        [115.0, 19.0, 21.0, 22.5, 26.3, 14.7, 7],
        [125.0, 19.0, 21.0, 22.5, 25.3, 16.3, 7],
        [130.0, 19.3, 21.3, 22.8, 28.0, 15.5, 7],
        [120.0, 20.0, 22.0, 23.5, 26.0, 14.5, 7],
        [120.0, 20.0, 22.0, 23.5, 24.0, 15.0, 7],
        [130.0, 20.0, 22.0, 23.5, 26.0, 15.0, 7],
        [135.0, 20.0, 22.0, 23.5, 25.0, 15.0, 7],
        [110.0, 20.0, 22.0, 23.5, 23.5, 17.0, 7],
        [130.0, 20.5, 22.5, 24.0, 24.4, 15.1, 7],
        [150.0, 20.5, 22.5, 24.0, 28.3, 15.1, 7],
        [145.0, 20.7, 22.7, 24.2, 24.6, 15.0, 7],
        [150.0, 21.0, 23.0, 24.5, 21.3, 14.8, 7],
        [170.0, 21.5, 23.5, 25.0, 25.1, 14.9, 7],
        [225.0, 22.0, 24.0, 25.5, 28.6, 14.6, 7],
        [145.0, 22.0, 24.0, 25.5, 25.0, 15.0, 7],
        [188.0, 22.6, 24.6, 26.2, 25.7, 15.9, 7],
        [180.0, 23.0, 25.0, 26.5, 24.3, 13.9, 7],
        [197.0, 23.5, 25.6, 27.0, 24.3, 15.7, 7],
        [218.0, 25.0, 26.5, 28.0, 25.6, 14.8, 7],
        [300.0, 25.2, 27.3, 28.7, 29.0, 17.9, 7],
        [260.0, 25.4, 27.5, 28.9, 24.8, 15.0, 7],
        [265.0, 25.4, 27.5, 28.9, 24.4, 15.0, 7],
        [250.0, 25.4, 27.5, 28.9, 25.2, 15.8, 7],
        [250.0, 25.9, 28.0, 29.4, 26.6, 14.3, 7],
        [300.0, 26.9, 28.7, 30.1, 25.2, 15.4, 7],
        [320.0, 27.8, 30.0, 31.6, 24.1, 15.1, 7],
        [514.0, 30.5, 32.8, 34.0, 29.5, 17.7, 7],
        [556.0, 32.0, 34.5, 36.5, 28.1, 17.5, 7],
        [840.0, 32.5, 35.0, 37.3, 30.8, 20.9, 7],
        [685.0, 34.0, 36.5, 39.0, 27.9, 17.6, 7],
        [700.0, 34.0, 36.0, 38.3, 27.7, 17.6, 7],
        [700.0, 34.5, 37.0, 39.4, 27.5, 15.9, 7],
        [690.0, 34.6, 37.0, 39.3, 26.9, 16.2, 7],
        [900.0, 36.5, 39.0, 41.4, 26.9, 18.1, 7],
        [650.0, 36.5, 39.0, 41.4, 26.9, 14.5, 7],
        [820.0, 36.6, 39.0, 41.3, 30.1, 17.8, 7],
        [850.0, 36.9, 40.0, 42.3, 28.2, 16.8, 7],
        [900.0, 37.0, 40.0, 42.5, 27.6, 17.0, 7],
        [1015.0, 37.0, 40.0, 42.4, 29.2, 17.6, 7],
        [820.0, 37.1, 40.0, 42.5, 26.2, 15.6, 7],
        [1100.0, 39.0, 42.0, 44.6, 28.7, 15.4, 7],
        [1000.0, 39.8, 43.0, 45.2, 26.4, 16.1, 7],
        [1100.0, 40.1, 43.0, 45.5, 27.5, 16.3, 7],
        [1000.0, 40.2, 43.5, 46.0, 27.4, 17.7, 7],
        [1000.0, 41.1, 44.0, 46.6, 26.8, 16.3, 7]]
  191.  
  192.  
  193.  
  194. class decisionnode:
  195.     def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
  196.         self.col = col
  197.         self.value = value
  198.         self.results = results
  199.         self.tb = tb
  200.         self.fb = fb
  201.  
  202.  
  203. def sporedi_broj(row, column, value):
  204.     return row[column] >= value
  205.  
  206.  
  207. def sporedi_string(row, column, value):
  208.     return row[column] == value
  209.  
  210.  
  211. # Divides a set on a specific column. Can handle numeric
  212. # or nominal values
  213. def divideset(rows, column, value):
  214.     # Make a function that tells us if a row is in
  215.     # the first group (true) or the second group (false)
  216.     split_function = None
  217.     if isinstance(value, int) or isinstance(value, float):  # ako vrednosta so koja sporeduvame e od tip int ili float
  218.         # split_function=lambda row:row[column]>=value # togas vrati funkcija cij argument e row i vrakja vrednost true ili false
  219.         split_function = sporedi_broj
  220.     else:
  221.         # split_function=lambda row:row[column]==value # ako vrednosta so koja sporeduvame e od drug tip (string)
  222.         split_function = sporedi_string
  223.  
  224.     # Divide the rows into two sets and return them
  225.     set_false = []
  226.     set_true = []
  227.     for row in rows:
  228.         if split_function(row, column, value):
  229.             set_true.append(row)
  230.         else:
  231.             set_false.append(row)
  232.     set1 = [row for row in rows if
  233.             split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja true
  234.     set2 = [row for row in rows if
  235.             not split_function(row, column, value)]  # za sekoj row od rows za koj split_function vrakja false
  236.     # return (set1, set2)
  237.     return (set_true, set_false)
  238.  
  239.  
  240.  
  241.  
  242. # Create counts of possible results (the last column of
  243. # each row is the result)
  244. def uniquecounts(rows):
  245.     results = {}
  246.     for row in rows:
  247.         # The result is the last column
  248.         r = row[-1]
  249.         results.setdefault(r, 0)
  250.         results[r] += 1
  251.  
  252.     return results
  253.  
  254.  
  255. # Probability that a randomly placed item will
  256. # be in the wrong category
  257.  
  258. def log2(x):
  259.     from math import log
  260.     l2 = log(x) / log(2)
  261.     return l2
  262.  
  263.  
  264. # Entropy is the sum of p(x)log(p(x)) across all
  265. # the different possible results
  266. def entropy(rows):
  267.     results = uniquecounts(rows)
  268.     # Now calculate the entropy
  269.     ent = 0.0
  270.     for r in results.keys():
  271.         p = float(results[r]) / len(rows)
  272.         ent = ent - p * log2(p)
  273.     return ent
  274.  
  275.  
  276. def buildtree(rows, scoref=entropy):
  277.     if len(rows) == 0: return decisionnode()
  278.     current_score = scoref(rows)
  279.  
  280.     # Set up some variables to track the best criteria
  281.     best_gain = 0.0
  282.     best_column = -1
  283.     best_value = None
  284.     best_subsetf = None
  285.     best_subsett = None
  286.  
  287.     column_count = len(rows[0]) - 1
  288.     for col in range(column_count):
  289.         # Generate the list of different values in
  290.         # this column
  291.         column_values = set()
  292.         for row in rows:
  293.             column_values.add(row[col])
  294.         # Now try dividing the rows up for each value
  295.         # in this column
  296.         for value in column_values:
  297.             (set1, set2) = divideset(rows, col, value)
  298.  
  299.             # Information gain
  300.             p = float(len(set1)) / len(rows)
  301.             gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
  302.             if gain > best_gain and len(set1) > 0 and len(set2) > 0:
  303.                 best_gain = gain
  304.                 best_column = col
  305.                 best_value = value
  306.                 best_subsett = set1
  307.                 best_subsetf = set2
  308.                 # best_criteria = (col, value)
  309.                 # best_sets = (set1, set2)
  310.  
  311.     # Create the subbranches
  312.     if best_gain > 0:
  313.         trueBranch = buildtree(best_subsett, scoref)
  314.         falseBranch = buildtree(best_subsetf, scoref)
  315.         return decisionnode(col=best_column, value=best_value,
  316.                             tb=trueBranch, fb=falseBranch)
  317.     else:
  318.         return decisionnode(results=uniquecounts(rows))
  319.  
  320.  
  321.  
  322. def printtree(tree, indent=''):
  323.     # Is this a leaf node?
  324.     if tree.results != None:
  325.         print(indent + str(sorted(tree.results.items())))
  326.     else:
  327.         # Print the criteria
  328.         print(indent + str(tree.col) + ':' + str(tree.value) + '? ')
  329.         # Print the branches
  330.         print(indent + 'T->')
  331.         printtree(tree.tb, indent + '  ')
  332.         print(indent + 'F->')
  333.         printtree(tree.fb, indent + '  ')
  334.  
  335.  
  336.  
  337.  
  338. def classify(observation, tree):
  339.     if tree.results != None:
  340.         return tree.results
  341.     else:
  342.         vrednost = observation[tree.col]
  343.         branch = None
  344.  
  345.         if isinstance(vrednost, int) or isinstance(vrednost, float):
  346.             if vrednost >= tree.value:
  347.                 branch = tree.tb
  348.             else:
  349.                 branch = tree.fb
  350.         else:
  351.             if vrednost == tree.value:
  352.                 branch = tree.tb
  353.             else:
  354.                 branch = tree.fb
  355.  
  356.         return classify(observation, branch)
  357.  
  358.  
  359.  
  360. def new_data_set_fu(data):
  361.     classes = {}
  362.     for i in data:
  363.         # print(i)
  364.         classes.setdefault(i[-1])
  365.  
  366.     classes_data = list(classes)
  367.  
  368.     flag = 0
  369.     tmp = 0
  370.     data_set = []
  371.     for i in data:
  372.  
  373.         if flag == 5:
  374.             flag = 0
  375.             tmp += 1
  376.             if tmp == classes_data[-1]:
  377.                 break
  378.         elif classes_data[tmp] == i[-1]:
  379.             data_set.append(i)
  380.             # print(i)
  381.             flag += 1
  382.  
  383.     return data_set
  384.  
if __name__ == '__main__':

    # Index (into the full data set) of the element to classify.
    my_index = 5

    # Train on the first five examples of each class only, as the
    # exercise requires; the reduction is done programmatically.
    data_set = new_data_set_fu(data)
    t = buildtree(data_set)

    #printtree(t)

    # Classify the chosen element and print the resulting leaf's
    # class counts, e.g. {1: 5} for species 1.
    c = classify(data[my_index],t)

    print(c)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement