Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import ast
- from matplotlib.patches import Patch
- from scipy.spatial.distance import euclidean
- from sklearn.feature_extraction.text import TfidfVectorizer
# Legend: one colored patch per legal-domain category.
legend_labels = ['civil law', 'administrative law', 'pharmaceutical law', 'labor law', 'medical law', 'criminal law', 'international law', 'tax law', 'constitutional law', 'other']
# Pair each color with its label directly instead of re-indexing
# `colors[index]` (the original unpacked `color` and then ignored it,
# and would raise IndexError if `colors` were longer than the labels).
legend_elements = [
    Patch(facecolor=c, edgecolor='black', label=l)
    for c, l in zip(colors, legend_labels)
]
plt.figure(figsize=(20, 20))
texts = []  # matplotlib Text artists placed on the SOM grid below
reset_number_of_documents_in_category()
def calculate_class_based_tfidf(allWordsInClass, allClasses):
    """Compute per-class TF-IDF scores.

    allWordsInClass: list of lists; sublist i holds every word observed
        for class allClasses[i].
    allClasses: list of class names, aligned with allWordsInClass.

    Returns a dict {class_name: {word: score}} that contains, for each
    class, only the words with a non-zero TF-IDF score in that class's
    aggregated document.
    """
    # Treat each class as a single "document": all its words joined.
    documents = [' '.join(words) for words in allWordsInClass]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    tfidf_scores = {class_name: {} for class_name in allClasses}
    for row, class_name in enumerate(allClasses):
        # Only the non-zero columns of this class's row carry a score.
        for col in tfidf_matrix[row, :].nonzero()[1]:
            tfidf_scores[class_name][feature_names[col]] = tfidf_matrix[row, col]
    return tfidf_scores
# One word-accumulator bucket per category; filled during the SOM pass.
allWordsInClass = [[] for _ in tags]

# Sanity-check sizes before the main loop.
for label, seq in (
    ("Number of classes:", tags),
    ("Length of allWordsInClass:", allWordsInClass),
    ("Length of vectorizedSentences:", vectorizedSentences),
    ("Length of allWords:", allWords),
):
    print(label, len(seq))
threshold = 5.0  # max BMU-grid distance for an intra-class pair to count as "close"
class_distances = {}

# Hoist the per-sentence category and SOM winner lookups: the original
# pairwise pass recomputed get_category_index over the whole corpus for
# every i and som.winner for every (i, j) pair — O(n^2) redundant calls.
# NOTE(review): assumes get_category_index and som.winner are
# deterministic for a fixed input — confirm before relying on this.
category_of = [get_category_index(vec) for vec in vectorizedSentences]
winner_of = [som.winner(vec) for vec in vectorizedSentences]
# Ascending sentence indices per category (same order the original
# list-comprehension scan produced).
members_of = {}
for j, cat in enumerate(category_of):
    members_of.setdefault(cat, []).append(j)

for i, vec in enumerate(vectorizedSentences):
    affairIndex = category_of[i]
    allWordsInClass[affairIndex].extend(allWordsFromSentences[i])
    update_number_of_documents_in_category(affairIndex)
    c = colors[affairIndex]
    winning_position = winner_of[i]

    # Place the sentence's best words near its BMU; random jitter keeps
    # labels on the same cell from overlapping exactly, modulo wraps at
    # the map edge.
    text = ''
    if len(bestWordsFromSentences[i]) > 0:
        text = bestWordsFromSentences[i]
    if text != "":
        texts.append(plt.text(
            (winning_position[0] + np.random.rand() * .9) % map_dim,
            (winning_position[1] + np.random.rand() * .9) % map_dim,
            text, color=c))

    # Collect intra-class BMU distances below the threshold. Each
    # unordered pair is counted twice (once from each endpoint), exactly
    # as the original quadratic scan did, so downstream means match.
    class_distances.setdefault(affairIndex, [])
    for j in members_of[affairIndex]:
        if i != j:
            distance = euclidean(winning_position, winner_of[j])
            if distance < threshold:
                class_distances[affairIndex].append(distance)
print_number_of_documents_in_category()

# Per-class clustering quality: 1 - mean(close-pair distance)/threshold.
# 1.0 means same-class points coincide on the map; values approach 0.0
# as close pairs spread out toward the threshold.
class_quality = {}
for affairIndex, distances in class_distances.items():
    if distances:
        quality = 1.0 - (np.mean(distances) / threshold)
    else:
        # np.mean([]) would emit a RuntimeWarning and yield nan; a class
        # with no close pairs gets the worst quality instead.
        quality = 0.0
    class_name = tags[affairIndex]
    class_quality[class_name] = quality
    print(f"Jakość dla klasy {class_name}: {quality}")
tfidf_scores = calculate_class_based_tfidf(allWordsInClass, tags)

# Sanity check: dump the TF-IDF scores of the first class.
first_class = tags[0]
print("TF-IDF scores for class '{}':".format(first_class))
for word, score in tfidf_scores[first_class].items():
    print(f"{word}: {score}")
# Legend in the upper-left corner; large font to suit the 20x20 figure.
plt.legend(handles=legend_elements, loc='upper left', prop={'size': 15})
# Tick/grid decoration intentionally disabled:
# plt.xticks(range(map_dim))
# plt.yticks(range(map_dim))
# plt.grid()
# Clamp both axes to the SOM grid extent.
plt.xlim([0, map_dim])
plt.ylim([0, map_dim])
# NOTE(review): plt.plot() with no arguments draws nothing — presumably
# plt.show() or plt.savefig(...) was intended; confirm with the author.
plt.plot()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement