# TF-IDF

import ast
from matplotlib.patches import Patch
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfVectorizer
# Legend captions, matched to the entries of `colors` by position.
legend_labels = ['civil law', 'administrative law', 'pharmaceutical law', 'labor law', 'medical law', 'criminal law', 'international law', 'tax law', 'constitutional law', 'other']

# One black-edged swatch per colour; a colour without a caption at the same
# position raises IndexError.
legend_elements = [
    Patch(facecolor=swatch, edgecolor='black', label=legend_labels[position])
    for position, swatch in enumerate(colors)
]

# Text artists added to the figure are collected here.
texts = []
plt.figure(figsize=(20, 20))
reset_number_of_documents_in_category()

def calculate_class_based_tfidf(allWordsInClass, allClasses):
    """Compute TF-IDF scores per class, treating each class as one document.

    The words of every class are joined with spaces into a single document
    and the documents are vectorized together with a default
    ``TfidfVectorizer``.

    Args:
        allWordsInClass: list of lists; sublist ``i`` holds all words
            collected for class ``i``.
        allClasses: list of class names; entry ``i`` is paired with
            ``allWordsInClass[i]`` (row ``i`` of the TF-IDF matrix).

    Returns:
        dict mapping each class name to a ``{feature: score}`` dict with the
        non-zero TF-IDF weights of that class's document. Features are the
        vectorizer's feature names. If no class contains any text, every
        class maps to an empty dict.

    Raises:
        ValueError: propagated from ``TfidfVectorizer`` when the documents
            contain text but the vectorizer extracts no vocabulary from it.
    """
    # One document per class: all of its words joined by spaces.
    documents = [' '.join(words) for words in allWordsInClass]

    # Every class gets an entry, even if it ends up with no scored words.
    tfidf_scores = {class_name: {} for class_name in allClasses}

    # With nothing but empty/whitespace documents the vectorizer would fail
    # with an "empty vocabulary" error; there is simply nothing to score.
    if not any(document.strip() for document in documents):
        return tfidf_scores

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    for class_index, class_name in enumerate(allClasses):
        # Read the row's stored (column, value) pairs in one pass instead of
        # issuing a separate sparse scalar lookup for every non-zero feature.
        row = tfidf_matrix[class_index, :].tocoo()
        class_scores = tfidf_scores[class_name]
        for feature_index, score in zip(row.col, row.data):
            # Skip explicitly stored zeros so only non-zero weights are kept.
            if score != 0:
                class_scores[feature_names[feature_index]] = score

    return tfidf_scores

# One word bucket per class, filled while walking the sentences below.
allWordsInClass = [[] for _ in tags]

print("Number of classes:", len(tags))
print("Length of allWordsInClass:", len(allWordsInClass))

print("Length of vectorizedSentences:", len(vectorizedSentences))
print("Length of allWords:", len(allWords))

# Only winner-to-winner distances below this value are recorded.
threshold = 5.0
# class index -> list of recorded distances between same-class sentences.
class_distances = {}

# Classify every sentence and look up its winning map position once, up
# front, instead of repeating both calls for every pair of sentences.
# NOTE(review): assumes get_category_index and som.winner are pure (same
# input -> same output, no side effects) - confirm.
sentence_classes = [get_category_index(vec) for vec in vectorizedSentences]
sentence_winners = [som.winner(vec) for vec in vectorizedSentences]

# class index -> sentence indices of that class, in ascending order.
members_by_class = {}
for sentence_index, sentence_class in enumerate(sentence_classes):
    members_by_class.setdefault(sentence_class, []).append(sentence_index)

for i, (affairIndex, winnin_position) in enumerate(zip(sentence_classes, sentence_winners)):
    allWordsInClass[affairIndex].extend(allWordsFromSentences[i])
    update_number_of_documents_in_category(affairIndex)
    c = colors[affairIndex]

    text = bestWordsFromSentences[i]
    if len(text) == 0:
        # No label for this sentence: nothing is drawn and, as before, it
        # does not contribute distances of its own.
        continue

    # Jitter the label inside its cell (x first, then y) so labels sharing a
    # winner do not overlap exactly; wrap around the map edge.
    x = (winnin_position[0] + np.random.rand() * .9) % map_dim
    y = (winnin_position[1] + np.random.rand() * .9) % map_dim
    texts.append(plt.text(x, y, text, color=c))

    # Record how close this sentence's winner is to every other sentence of
    # the same class, keeping only distances under the threshold.
    nearby_distances = class_distances.setdefault(affairIndex, [])
    for j in members_by_class[affairIndex]:
        if j == i:
            continue
        distance = euclidean(winnin_position, sentence_winners[j])
        if distance < threshold:
            nearby_distances.append(distance)

print_number_of_documents_in_category()

# class name -> quality score derived from the recorded same-class distances.
class_quality = {}

for affairIndex, distances in class_distances.items():
    class_name = tags[affairIndex]
    if distances:
        # 1.0 when all recorded distances are zero, approaching 0.0 as the
        # mean recorded distance approaches the threshold.
        quality = 1.0 - (np.mean(distances) / threshold)
    else:
        # No same-class neighbour under the threshold: the mean is undefined.
        # Report NaN explicitly instead of letting np.mean warn about an
        # empty slice.
        quality = float('nan')
    class_quality[class_name] = quality
    print(f"Jakość dla klasy {class_name}: {quality}")

# Score every class's accumulated words against the other classes.
tfidf_scores = calculate_class_based_tfidf(allWordsInClass, tags)

# tfidf_scores[class_name][word] holds the score of `word` in that class;
# as a sample, dump everything scored for the first class.
first_class = tags[0]
first_class_scores = tfidf_scores[first_class]
print("TF-IDF scores for class '{}':".format(first_class))
for word in first_class_scores:
    print(f"{word}: {first_class_scores[word]}")

# Finish the figure: attach the class legend and clamp both axes to the map.
plt.legend(
    handles=legend_elements,
    loc='upper left',
    prop={'size': 15},
)
# Per-cell ticks and grid are intentionally left off:
# plt.xticks(range(map_dim))
# plt.yticks(range(map_dim))
# plt.grid()
axis_limits = [0, map_dim]
plt.xlim(axis_limits)
plt.ylim(axis_limits)
# Called without data, so no line is drawn.
plt.plot()