Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """Print the most heavily weighted features for one pair of adjacent classes.

    A run counter is persisted in ``counter.json`` (current working directory)
    and advanced by one on every call; it selects which pair of adjacent class
    labels is printed.  For a counter value ``k`` with
    ``1 <= k < len(classifier.classes_)`` the ``n`` lowest coefficients are
    printed next to ``classes_[k - 1]``, then a blank line, then the ``n``
    highest coefficients (largest first) next to ``classes_[k]``.  For any
    other counter value a "run again" notice is printed instead.

    Features listed in the whitespace-separated file ``..stopwords.txt`` are
    left out of the ranking.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        classifier: Fitted classifier exposing ``classes_`` and ``coef_``.
        n: Number of weighted features to show on each side.

    Side effects:
        Reads ``counter.json`` and ``..stopwords.txt``, prints to stdout and
        registers an ``atexit`` hook that writes the counter back to
        ``counter.json`` when the interpreter exits.
    """

    def read_counter():
        # No file yet -> start at 0 (which only prints the "run again" notice);
        # otherwise continue from the stored value plus one.
        if not path.exists("counter.json"):
            return 0
        with open("counter.json", "r") as f:
            return loads(f.read()) + 1

    def write_counter():
        # Runs at interpreter exit; `counter` is looked up in the enclosing
        # scope at that time, so the wrapped-around value is what gets stored.
        with open("counter.json", "w") as f:
            f.write(dumps(counter))

    counter = read_counter()
    atexit.register(write_counter)

    class_labels = classifier.classes_
    # Wrap once the counter runs past the last class pair.  This used to be a
    # hard-coded 7, which only fits a 7-class model and raised IndexError for
    # models with fewer classes.
    if counter >= len(class_labels):
        counter = 0

    # Additional stopwords to be removed from the ranking.
    # NOTE(review): the path has no separator after ".." - possibly meant
    # "../stopwords.txt"; left untouched, confirm against the project layout.
    with open('..stopwords.txt') as stopwords_file:
        additional_stopwords = set(stopwords_file.read().split())

    # scikit-learn 1.2 removed get_feature_names(); prefer the replacement and
    # fall back to the old name for older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    # Pair each coefficient with its feature BEFORE dropping stopwords;
    # filtering the names first would shift every later name onto the wrong
    # coefficient.
    # NOTE(review): row 0 of coef_ is used for every class pair - confirm
    # whether the row should follow `counter` for multi-class models.
    ranked = sorted(
        (coef, feat)
        for coef, feat in zip(classifier.coef_[0], feature_names)
        if feat not in additional_stopwords
    )
    lowest = ranked[:n]
    # ranked[-0:] would be the whole list, so n == 0 is handled explicitly.
    highest = ranked[-n:] if n else []

    # class_labels = category
    # coef = coefficient
    # feat = most informative feature
    if 1 <= counter < len(class_labels):
        for coef, feat in lowest:
            print(class_labels[counter - 1], coef, feat)
        print()
        for coef, feat in reversed(highest):
            print(class_labels[counter], coef, feat)
    else:
        print("=== PLEASE RUN PROGRAM AGAIN TO VIEW THE CO-EFFICIENT FOR THE CHOSEN MODEL ===")
- 2 -8.322094697329087 aaa
- 2 -8.322094697329087 aaa cm
- 2 -8.322094697329087 aaa cm underwent
- 2 -8.322094697329087 aaa free
- 2 -8.322094697329087 aaa free ivc
- 3 -8.010764835561018 assymetry imp giddiness
- 3 -8.144858449457846 admitted feb year
- 3 -8.164330364141858 agreeable dre brown
- 3 -8.172447581146958 aerobic anaerobic labeled
- 3 -8.180391164585233 actually body
- 3 -8.322580751462969 aaa
- 3 -8.322580751462969 aaa cm
- 3 -8.322580751462969 aaa cm underwent
- 3 -8.322580751462969 aaa free
- 3 -8.322580751462969 aaa free ivc
- 4 -8.0112508896949 assymetry moving
- 4 -8.145344503591728 admitted feb year
- 4 -8.16481641827574 agreeable dre brown
- 4 -8.17293363528084 aerobic anaerobic labeled
- 4 -8.180877218719115 actually body
- 4 -8.322337753927105 aaa
- 4 -8.322337753927105 aaa cm
- 4 -8.322337753927105 aaa cm underwent
- 4 -8.322337753927105 aaa free
- 4 -8.322337753927105 aaa free ivc
- 5 -8.011007892159036 assymetry imp
- 5 -8.145101506055864 admitted frequent falls
- 5 -8.164573420739876 agreeable early review
- 5 -8.172690637744976 af anticoagulation
- 5 -8.18063422118325 actually body
Add Comment
Please, Sign In to add comment