Advertisement
Guest User

Untitled

a guest
Jul 27th, 2017
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.46 KB | None | 0 0
  1. /*
  2. * To change this template, choose Tools | Templates
  3. * and open the template in the editor.
  4. */
  5.  
  6. package id3;
  7.  
  8. import java.io.BufferedReader;
  9. import java.io.File;
  10. import java.io.FileReader;
  11. import java.util.LinkedList;
  12. import java.util.List;
  13.  
  14. /**
  15. *
  16. * @author Student
  17. */
  18. public class Main {
  19.  
  20. /**
  21. * @param args the command line arguments
  22. */
  23. public static List<int[]> dataset;
  24.  
  25. private static double log2(double x) {
  26. if (x == 0) return 0;
  27. return Math.log(x) / Math.log(2);
  28. }
  29.  
  30. public static double frequency(int attr, int value, List<int[]> dataset) {
  31. int total = dataset.size();
  32. int occurences = 0;
  33. for (int i=0; i<total; i++) {
  34. int[] line = dataset.get(i);
  35. if (line[attr] == value) occurences++;
  36. }
  37.  
  38. return (double)occurences / (double)total;
  39. }
  40.  
  41. public static List<int[]> subtable(List<int[]> dataset, int attr, int value) {
  42. List<int[]> result = new LinkedList<int[]>();
  43. for (int i=0; i<dataset.size(); i++) {
  44. int[] line = dataset.get(i);
  45. if (line[attr] == value) {
  46. line[attr] = -1;
  47. result.add(line);
  48. }
  49. }
  50.  
  51. return result;
  52. }
  53.  
  54. public static double avg_entropy(int attr1, int attr2, List<int[]> dataset) {
  55. List values = new LinkedList();
  56. double result = 0;
  57. for (int i=0; i<dataset.size(); i++) {
  58. int[] line = dataset.get(i);
  59. if (!values.contains(line[attr1])) {
  60. values.add(line[attr1]);
  61. result = result + frequency(attr1, line[attr1], dataset) * entropy(attr2, subtable(dataset, attr1, line[attr1]));
  62. }
  63. }
  64.  
  65. return result;
  66. }
  67.  
  68. public static double entropy(int attr, List<int[]> dataset) {
  69. double e = 0;
  70. List values = new LinkedList();
  71.  
  72. for (int i=0; i<dataset.size(); i++) {
  73. int[] line = dataset.get(i);
  74. if (!values.contains(line[attr])) {
  75. values.add(line[attr]);
  76. double f = frequency(attr, line[attr], dataset);
  77. //System.out.println("F("+line[attr]+") = " + f + ", log2 = " + log2(f));
  78. e = e + f * log2(f);
  79. }
  80. }
  81.  
  82. return -e;
  83. }
  84.  
  85. private static class Node {
  86. String label;
  87. List<Node> nodes;
  88. }
  89.  
  90. private static int most_common_value(int attr, List<int[]> dataset) {
  91. double max_f = 0;
  92. int most_common = 0;
  93. for (int i=0; i<dataset.size(); i++) {
  94. int[] line = dataset.get(i);
  95. if ((frequency(attr, line[attr], dataset)) > max_f) {
  96. max_f = frequency(attr, line[attr], dataset);
  97. most_common = line[attr];
  98. }
  99. }
  100.  
  101. return most_common;
  102. }
  103.  
  104. public static Node id3(List<int[]> dataset, int attr) {
  105. if (dataset.isEmpty()) return null;
  106. Node n = new Node();
  107.  
  108. boolean empty = true;
  109. int[] sample = dataset.get(0);
  110. for (int i=0; i<4; i++) if (i!=attr) {
  111. if (sample[i] != -1) {
  112. empty = false;
  113. break;
  114. }
  115. }
  116.  
  117. if (empty) {
  118. int most_common = most_common_value(attr, dataset);
  119.  
  120. n.label = "ATTR[" + attr + "] = " + most_common;
  121. } else {
  122. boolean same = true;
  123. int[] first = dataset.get(0);
  124. for (int i=1; i<dataset.size(); i++) {
  125. int[] line = dataset.get(i);
  126. if (line[attr] != first[attr]) {
  127. same = false;
  128. break;
  129. }
  130. }
  131.  
  132. if (same) {
  133. n.label = "ATTR[" + attr + "] = " + first[attr] + " with probability 1";
  134. } else {
  135. double min_avg_e = 1000000;
  136. int min_attr = 0;
  137. for (int a=0; a<4; a++) {
  138. double avg_e = avg_entropy(a, attr, dataset);
  139. if (avg_e < min_avg_e) {
  140. min_attr = a;
  141. min_avg_e = avg_e;
  142. }
  143. }
  144.  
  145. if (min_avg_e > entropy(attr, dataset) / 2) {
  146.  
  147.  
  148. } else {
  149.  
  150.  
  151. }
  152. }
  153. }
  154. }
  155.  
  156. public static List<int[]> readFile(String filename) {
  157. List<int[]> result = new LinkedList<int[]>();
  158. try {
  159. File myFile = new File (filename);
  160. BufferedReader newFile = new BufferedReader(new FileReader(myFile));
  161. while (true) {
  162. if (!newFile.ready()) break;
  163. String next_line = newFile.readLine();
  164. System.out.println(next_line);
  165. int[] line = new int[4];
  166. line[0] = (next_line.charAt(0) == 'Y') ? 1 : 0;
  167. line[1] = (next_line.charAt(1) == 'Y') ? 1 : 0;
  168. line[2] = (next_line.charAt(2) == 'Y') ? 1 : 0;
  169. line[3] = Integer.parseInt(""+next_line.charAt(3));
  170. result.add(line);
  171. }
  172. } catch (Exception e) {
  173. System.out.println("Exception: " + e.toString());
  174. }
  175.  
  176. return result;
  177. }
  178.  
  179. public static void main(String[] args) {
  180. dataset = readFile("input.txt");
  181. System.out.println(entropy(3, dataset));
  182. }
  183. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement