Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- import sys
def load_features(train_f):
    """Read a tab-separated table of integers into per-column lists.

    The first line of ``train_f`` is treated as the header: each
    tab-separated name becomes a key of the returned dict.  Every following
    non-blank line contributes one integer per cell, assigned to columns by
    position.

    Args:
        train_f: An open text file (anything with ``readline`` that can be
            iterated line by line), positioned at the header line.

    Returns:
        A dict mapping each header name to the list of ``int`` values read
        for that column, in file order.

    Raises:
        ValueError: If a cell cannot be parsed by ``int``.
        KeyError: If a data row has more cells than the header has names.
    """
    # Strip both CR and LF so files with Windows line endings do not end up
    # with a trailing '\r' glued to the last header name.
    header = train_f.readline().rstrip('\r\n').split('\t')
    features = {name: [] for name in header}
    # Position -> column name; kept as a dict so an over-long row still
    # surfaces as KeyError, as it always has.
    feature_for_index = dict(enumerate(header))

    for line in train_f:
        # Blank lines (typically a trailing one) carry no data; without this
        # guard int('') would abort the whole load.
        if not line.strip():
            continue
        cells = line.rstrip('\r\n').split('\t')
        # NOTE: a row with fewer cells than the header only extends the
        # leading columns, leaving the columns of unequal length.
        for i, cell in enumerate(cells):
            features[feature_for_index[i]].append(int(cell))
    return features
def frequency_table(values):
    """Count how many times each distinct value occurs in ``values``.

    Args:
        values: An iterable of hashable items.

    Returns:
        A dict mapping every distinct item to its number of occurrences.
    """
    tally = {}
    for item in values:
        # Missing keys start from zero, so first and repeat sightings share
        # one code path.
        tally[item] = tally.get(item, 0) + 1
    return tally
def filter_feature_for_class(features, clazz, feature):
    """Return the values of one column restricted to rows of a given class.

    Args:
        features: Dict of column name -> list of values; must contain a
            ``'class'`` column that is walked row by row.
        clazz: The class label to keep.
        feature: Name of the column whose values are collected.

    Returns:
        A list with ``features[feature][i]`` for every row index ``i`` whose
        ``'class'`` entry equals ``clazz``, in row order.
    """
    return [
        features[feature][row]
        for row, label in enumerate(features['class'])
        if label == clazz
    ]
def print_learning(features):
    """Print the estimated probabilities and return them as one flat list.

    For each class label (ascending) one output line is printed: the class
    prior followed by, for every non-``'class'`` column in dict order, the
    conditional probability of each value of that column observed within
    the class (values ascending).  The same numbers are appended, in the
    same order, to the returned list.

    Only values that actually occur within a class get an entry; a value
    never seen for a class is omitted from both the output and the list.

    Args:
        features: Dict of column name -> list of values, including a
            ``'class'`` column.

    Returns:
        The flat list of probabilities in print order.
    """
    class_counts = frequency_table(features['class'])
    row_count = sum(class_counts.values())
    feature_names = [name for name in features if name != 'class']

    probabilities = []
    for clazz in sorted(class_counts):
        rows_in_class = class_counts[clazz]
        prior = rows_in_class / row_count
        print('P(class=' + str(clazz) + ')=%.2f ' % prior, end='')
        probabilities.append(prior)

        for feature in feature_names:
            value_counts = frequency_table(
                filter_feature_for_class(features, clazz, feature)
            )
            for feature_value in sorted(value_counts):
                conditional = value_counts[feature_value] / rows_in_class
                print('P(' + feature + '=' + str(feature_value) + '|' + str(clazz) + ')=%.2f ' % conditional, end='')
                probabilities.append(conditional)

        # End the output line for this class.
        print()
    return probabilities
def accuracy(list, file, features=None):
    """Classify every row of ``file`` and report the fraction predicted right.

    Each non-blank line after the header is split on whitespace into
    integers; the last integer is the true label and the preceding ones are
    feature values.  For both labels a base-2 log score is accumulated from
    ``list`` and the higher score wins.  On an exact tie, label 0 is
    predicted when ``list[0] >= 0.5`` and label 1 otherwise.

    The indexing assumes ``list`` holds, for label 0 and then label 1, one
    prior followed by exactly two entries (value 0, value 1) per feature.
    NOTE(review): this looks like the list produced by ``print_learning``
    for binary features where every value occurs in every class -- confirm.

    Args:
        list: Flat sequence of probabilities laid out as described above.
            (The name shadows the builtin but is kept for compatibility.)
        file: Path of the whitespace-separated data file to score.
        features: Optional and unused; accepted so that three-argument calls
            keep working.

    Returns:
        A tuple ``(fraction_correct, row_count)`` with the fraction rounded
        to 4 decimals.  ``(0.0, 0)`` when the file has no data rows.

    Raises:
        ValueError: If a cell is not an integer, or a probability that is
            looked up is zero (``math.log`` domain error).
        IndexError: If a row needs an index beyond the end of ``list``.
    """
    probs = list  # local alias so the body does not read like the builtin

    rows = []
    # The context manager closes the file even if parsing a row fails.
    with open(file, 'r') as data_f:
        data_f.readline()  # skip the header line
        for line in data_f:
            row = [int(cell) for cell in line.split()]
            if row:  # ignore blank lines
                rows.append(row)

    # Without data rows there is nothing to divide by.
    if not rows:
        return (0.0, 0)

    right = 0
    for row in rows:
        width = len(row)  # number of features + 1 (the label)
        label = row[-1]

        # Start from the class priors; label 1's section begins right after
        # label 0's prior and its two-entries-per-feature block.
        classzero = math.log(float(probs[0]), 2)
        classone = math.log(float(probs[width * 2 - 1]), 2)
        for val, value in enumerate(row[:-1]):
            classzero += math.log(float(probs[val * 2 + value + 1]), 2)
            classone += math.log(float(probs[width * 2 + val * 2 + value]), 2)

        if classzero > classone:
            predicted = 0
        elif classone > classzero:
            predicted = 1
        else:
            # Exact tie: fall back on the prior of label 0.
            predicted = 0 if probs[0] >= .5 else 1

        if label == predicted:
            right += 1

    total = len(rows)
    return (round(float(right) / float(total), 4), total)
# Script entry: learn probabilities from the training file given as the first
# argument, print them, then report accuracy on the training file and on the
# test file given as the second argument.
if len(sys.argv) != 3:
    print('You must specify only a training data file and test data file in the program parameters; nothing more or less.')
else:
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    # Close the training file as soon as it has been parsed; accuracy()
    # reopens files by path on its own.
    with open(train_file, 'r') as train_f:
        features = load_features(train_f)

    probabilities = print_learning(features)
    print()

    # accuracy() is defined with (probabilities, path); pass exactly those.
    acc = accuracy(probabilities, train_file)
    # Round the percentage so float noise such as 57.13999999999999 is not
    # printed; acc[0] already carries at most 4 decimals.
    print("Accuracy on training set (" + str(acc[1]) + " instances): " + str(round(100 * acc[0], 2)) + "%")
    print()

    acc = accuracy(probabilities, test_file)
    print("Accuracy on test set (" + str(acc[1]) + " instances): " + str(round(100 * acc[0], 2)) + "%")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement