Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_tfidf_idf_df_feature(collection_document_text, dict_dataset):
    """Attach per-token "tf", "idf" and "tfidf" columns to each document's DataFrame.

    For every document id in ``dict_dataset`` the corresponding DataFrame
    (indexed by token) gains three new columns computed over the whole
    collection with scikit-learn's vectorizers, restricted to that
    document's own tokens.

    Parameters
    ----------
    collection_document_text : mapping doc_id -> str
        Raw text of each document (one string per document).
    dict_dataset : mapping doc_id -> pandas.DataFrame
        Per-document DataFrames whose index holds the document's tokens.
        Mutated in place.

    Returns
    -------
    The same ``dict_dataset`` mapping, with "tfidf", "idf" and "tf"
    columns added to every document's DataFrame.
    """
    doc_vocabulary = []
    list_doc = []
    order_doc_id = []
    for doc_id in dict_dataset:
        doc_df = dict_dataset[doc_id]
        doc_vocabulary.append(np.array(doc_df.index.array).tolist())
        # One string per document; positions align with order_doc_id,
        # which is the row order the vectorizers will produce.
        list_doc.append(collection_document_text[doc_id])
        order_doc_id.append(doc_id)

    # Flatten the per-document token lists and deduplicate while keeping
    # first-seen order (dict.fromkeys preserves insertion order).
    doc_vocabulary = list(dict.fromkeys(
        token for sublist in doc_vocabulary for token in sublist))

    tfidf = TfidfVectorizer(vocabulary=doc_vocabulary, lowercase=True)
    tf = CountVectorizer(vocabulary=doc_vocabulary, lowercase=True)
    tf_matrix = tf.fit_transform(list_doc)
    tf_idf_matrix = tfidf.fit_transform(list_doc)

    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # switch to get_feature_names_out() if the project upgrades. Both
    # vectorizers share the same fixed vocabulary, so tfidf's feature
    # names are also valid column labels for the tf matrix.
    feature_names = tfidf.get_feature_names()
    feature_idf = dict(zip(feature_names, tfidf.idf_))
    df_idf = pd.DataFrame.from_dict(feature_idf, orient="index")
    df_tfidf = pd.DataFrame(tf_idf_matrix.toarray(), columns=feature_names)
    df_tf = pd.DataFrame(tf_matrix.toarray(), columns=feature_names)

    for index, doc_id in enumerate(order_doc_id):
        token_document_list = dict_dataset.get(doc_id).index.tolist()
        df_tfidf_doc = df_tfidf.iloc[index]
        df_tf_doc = df_tf.iloc[index]
        df_idf_doc = df_idf.loc[token_document_list]
        # Restrict the collection-wide rows to this document's own tokens
        # so the new columns align with the DataFrame's token index.
        df_filtered_tfidf = df_tfidf_doc[token_document_list]
        df_filtered_tf = df_tf_doc[token_document_list]
        dict_dataset.get(doc_id)["tfidf"] = df_filtered_tfidf
        dict_dataset.get(doc_id)["idf"] = df_idf_doc
        dict_dataset.get(doc_id)["tf"] = df_filtered_tf
    return dict_dataset
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement