Advertisement
pri_yeahyeahyeha

Untitled

Dec 6th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.94 KB | None | 0 0
  1. def get_tfidf_idf_df_feature(collection_document_text, dict_dataset):
  2.     doc_vocabulary = []
  3.     list_doc = []
  4.     order_doc_id = []
  5.     for doc_id in dict_dataset:
  6.         print(doc_id)
  7.         doc_df = dict_dataset[doc_id]
  8.         doc_vocabulary.append(np.array(doc_df.index.array).tolist())
  9.         list_doc.append(collection_document_text[doc_id]) #Lista com os documentos. EM cada posição temos o documento  numa string
  10.         order_doc_id.append(doc_id) # Ordem que vai ser utilizado pelo TFIDFVectorizer
  11.     doc_vocabulary = [item for sublist in doc_vocabulary for item in sublist]
  12.     doc_vocabulary = list(dict.fromkeys(doc_vocabulary))
  13.     tfidf = TfidfVectorizer(vocabulary=doc_vocabulary,lowercase=True)
  14.     tf = CountVectorizer(vocabulary=doc_vocabulary,lowercase=True)
  15.     tf_matrix = tf.fit_transform(list_doc)
  16.  
  17.     tf_idf_matrix = tfidf.fit_transform(list_doc)
  18.  
  19.     feature_idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
  20.     df_idf = pd.DataFrame.from_dict(feature_idf, orient = "index")
  21.  
  22.     df_tfidf = pd.DataFrame(tf_idf_matrix.toarray(), columns=tfidf.get_feature_names())
  23.  
  24.     df_tf =pd.DataFrame(tf_matrix.toarray(), columns=tfidf.get_feature_names())
  25.  
  26.     for index,doc_id in enumerate(order_doc_id):
  27.         token_document_list = dict_dataset.get(doc_id).index.tolist()
  28.  
  29.         df_tfidf_doc = df_tfidf.iloc[index]
  30.         df_tf_doc = df_tf.iloc[index]
  31.         df_idf_doc = df_idf.loc[token_document_list]
  32.  
  33.         if doc_id == '2':
  34.             print("td",df_tf_doc["services"])
  35.             print("tfidf", df_tfidf_doc.loc["services"])
  36.             print("idf", df_idf_doc.loc["services"])
  37.  
  38.         df_filtered_tfidf = df_tfidf_doc[token_document_list]
  39.         df_filtered_tf = df_tf_doc[token_document_list]
  40.         dict_dataset.get(doc_id)["tfidf"] = df_filtered_tfidf
  41.         dict_dataset.get(doc_id)["idf"] = df_idf_doc
  42.         dict_dataset.get(doc_id)["tf"] = df_filtered_tf
  43.  
  44.     return dict_dataset
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement