Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // // The sum of the specific conditional entropies * the probability of
- // each
- // // value for each value in ix_attr.
- public static double conditionalEntropy(List<Instance> instances,
- Integer ix_attr) {
- double sum = 0.0;
- Map<Integer, List<Instance>> instancesByAttrVal = binInstancesByVal(
- instances, ix_attr);
- for (List<Instance> filteredInstances : instancesByAttrVal.values()) {
- // probability of being in this set is equal to the number of items
- // in the set divided by the total number of items
- double probability = filteredInstances.size()
- / (double) instances.size();
- // specific conditional entropy
- sum += entropy(filteredInstances) * probability;
- }
- return sum;
- }
- public static Map<Integer, List<Instance>> binInstancesByVal(
- List<Instance> instances, Integer ix_attr) {
- if (instances == null || ix_attr < 0)
- throw new IllegalArgumentException();
- Map<Integer, List<Instance>> instancesByVal = new HashMap<Integer, List<Instance>>();
- for (Instance i : instances) {
- int val = i.attributes.get(ix_attr);
- instancesByVal.putIfAbsent(val, new ArrayList<Instance>());
- List<Instance> l = instancesByVal.get(val);
- l.add(i);
- }
- return instancesByVal;
- }
- // Based on diversity of label
- public static double entropy(List<Instance> instances) {
- if (instances == null)
- throw new IllegalArgumentException();
- double sum = 0.0;
- Map<Integer, Integer> numInstancesByLabel = countInstancesByLabel(instances);
- for (int num : numInstancesByLabel.values()) {
- double probability = (double) num / instances.size();
- // add Pr(i) * lg_2(Pr(i))
- sum -= probability * Math.log(probability) / Math.log(2);
- }
- return sum;
- }
- public static Map<Integer, Integer> countInstancesByLabel(
- List<Instance> instances) {
- if (instances == null)
- throw new IllegalArgumentException();
- Map<Integer, Integer> numInstancesByLabel = new HashMap<Integer, Integer>();
- for (Instance i : instances) {
- int num = 0;
- if (numInstancesByLabel.containsKey(i.label))
- num = numInstancesByLabel.get(i.label);
- numInstancesByLabel.put(i.label, num + 1);
- }
- return numInstancesByLabel;
- }
- public void rootInfoGain(DataSet train) {
- this.labels = train.labels;
- this.attributes = train.attributes;
- this.attributeValues = train.attributeValues;
- for (int i = 0; i < attributes.size(); i++) {
- System.out
- .printf("%s %.5f\n", attributes.get(i),
- 1.0 - DecisionTreeImpl.conditionalEntropy(
- train.instances, i));
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement