Advertisement
lisachu

tfidf

Jan 14th, 2024
411
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.92 KB | None | 0 0
  1. import ast
  2. from matplotlib.patches import Patch
  3. from scipy.spatial.distance import euclidean
  4. from sklearn.feature_extraction.text import TfidfVectorizer
  # Human-readable names of the categories, in the same order as `colors`.
  legend_labels = [
      'civil law',
      'administrative law',
      'pharmaceutical law',
      'labor law',
      'medical law',
      'criminal law',
      'international law',
      'tax law',
      'constitutional law',
      'other',
  ]

  # One black-edged colour swatch per entry of `colors`, labelled with the
  # category name found at the same position in `legend_labels`.
  legend_elements = []
  for slot, _ in enumerate(colors):
      swatch = Patch(
          facecolor=colors[slot],
          edgecolor='black',
          label=legend_labels[slot],
      )
      legend_elements.append(swatch)

  # Open a 20x20 figure and start with an empty list of text artists and
  # freshly reset per-category document counters.
  plt.figure(figsize=(20, 20))
  texts = []
  reset_number_of_documents_in_category()
  14.  
  def calculate_class_based_tfidf(allWordsInClass, allClasses):
      """Compute TF-IDF scores per class, treating each class as one document.

      Args:
          allWordsInClass: list of lists; the sublist at position ``k`` holds
              all words collected for the class at position ``k`` of
              ``allClasses``.
          allClasses: list of class names.

      Returns:
          dict mapping each class name to a ``{word: score}`` dict containing
          only the words with a non-zero TF-IDF score in that class.

      Raises:
          IndexError: if ``allClasses`` has more entries than
              ``allWordsInClass``.

      Note:
          The words of a class are joined with spaces and then re-tokenized by
          ``TfidfVectorizer`` with its default settings, so the keys of the
          result are the vectorizer's tokens, not necessarily the input words
          verbatim.
      """
      # Collapse every class into a single space-separated document.
      documents = [' '.join(words) for words in allWordsInClass]

      vectorizer = TfidfVectorizer()
      tfidf_matrix = vectorizer.fit_transform(documents).tocsr()
      feature_names = vectorizer.get_feature_names_out()

      # Read the stored entries of each row straight from the CSR arrays
      # instead of doing one scalar sparse lookup per (row, column) pair.
      row_starts = tfidf_matrix.indptr
      columns = tfidf_matrix.indices
      values = tfidf_matrix.data

      tfidf_scores = {class_name: {} for class_name in allClasses}
      for class_index, class_name in enumerate(allClasses):
          # Indexing row_starts[class_index + 1] raises IndexError when there
          # is no document for this class.
          start, stop = row_starts[class_index], row_starts[class_index + 1]
          class_scores = tfidf_scores[class_name]
          for column, score in zip(columns[start:stop], values[start:stop]):
              # Skip explicitly stored zeros so only non-zero scores are kept.
              if score != 0:
                  class_scores[feature_names[column]] = score

      return tfidf_scores
  41.  
  # One word bucket per class; filled while the sentences are placed below.
  allWordsInClass = [[] for _ in tags]

  print("Number of classes:", len(tags))
  print("Length of allWordsInClass:", len(allWordsInClass))

  print("Length of vectorizedSentences:", len(vectorizedSentences))
  print("Length of allWords:", len(allWords))

  # Only same-class pairs whose winning nodes are closer than this are recorded.
  threshold = 5.0
  class_distances = {}

  # Resolve the category and the winning node of every sentence once, instead
  # of recomputing them for every pair inside the loop.
  # NOTE(review): assumes get_category_index and som.winner are deterministic
  # and free of side effects - confirm against their definitions.
  sentence_categories = [get_category_index(vec) for vec in vectorizedSentences]
  winning_positions = [som.winner(vec) for vec in vectorizedSentences]

  # Sentence indices grouped by category, in ascending index order.
  members_by_category = {}
  for sentence_index, category in enumerate(sentence_categories):
      members_by_category.setdefault(category, []).append(sentence_index)

  for i, affairIndex in enumerate(sentence_categories):
      allWordsInClass[affairIndex].extend(allWordsFromSentences[i])
      update_number_of_documents_in_category(affairIndex)
      c = colors[affairIndex]
      winnin_position = winning_positions[i]

      text = ''
      if len(bestWordsFromSentences[i]) > 0:
          text = bestWordsFromSentences[i]
      if text == "":
          # Nothing to draw for this sentence; it contributes no distances.
          continue

      # Jitter the label inside its cell so labels on one node do not overlap
      # exactly; the modulo keeps the coordinates inside the map.
      x_position = (winnin_position[0] + np.random.rand() * .9) % map_dim
      y_position = (winnin_position[1] + np.random.rand() * .9) % map_dim
      texts.append(plt.text(x_position, y_position, text, color=c))

      # Record distances to the winning nodes of all other sentences of the
      # same category that fall below the threshold.
      distances = class_distances.setdefault(affairIndex, [])
      for j in members_by_category[affairIndex]:
          if j == i:
              continue
          distance = euclidean(winnin_position, winning_positions[j])
          if distance < threshold:
              distances.append(distance)

  print_number_of_documents_in_category()
  74.  
  # Per-class quality: 1 - (mean recorded distance / threshold). Higher values
  # mean the recorded same-class distances are small relative to the threshold.
  class_quality = {}

  for affairIndex, distances in class_distances.items():
      if len(distances) > 0:
          quality = 1.0 - (np.mean(distances) / threshold)
      else:
          # No distance was recorded for this class: the mean is undefined.
          # Report NaN explicitly instead of letting np.mean warn about an
          # empty input.
          quality = float('nan')
      class_name = tags[affairIndex]
      class_quality[class_name] = quality
      # The message is Polish for "Quality for class <name>: <value>".
      print(f"Jakość dla klasy {class_name}: {quality}")
  82.    
  tfidf_scores = calculate_class_based_tfidf(allWordsInClass, tags)

  # Print every word/score pair of the first class; the scores of any other
  # class are available the same way via tfidf_scores[<class name>].
  first_class = tags[0]
  print("TF-IDF scores for class '{}':".format(first_class))
  first_class_scores = tfidf_scores[first_class]
  for word, score in first_class_scores.items():
      print(f"{word}: {score}")

  # Attach the legend and clamp both axes to the map dimension.
  legend_font = {'size': 15}
  plt.legend(handles=legend_elements, loc='upper left', prop=legend_font)
  # Optional tick/grid overlay, left disabled:
  # plt.xticks(range(map_dim))
  # plt.yticks(range(map_dim))
  # plt.grid()
  plt.xlim([0, map_dim])
  plt.ylim([0, map_dim])
  plt.plot()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement