import os
import re
from string import punctuation

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
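
# NOTE: sent_tokenize/word_tokenize and stopwords.words('english') rely on
# the NLTK 'punkt' and 'stopwords' resources; if they are not installed yet,
# fetch them once before running:
# nltk.download('punkt')
# nltk.download('stopwords')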

# input: collect every file found under the given directory
path = input("Enter the directory path ")
list_path = []

for root, dirs, files in os.walk(path):
    for file in files:
        list_path.append(root + "/" + file)
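# e.g. path='corpus' holding a.html and sub/b.html would yield
# list_path == ['corpus/a.html', 'corpus/sub/b.html'] (hypothetical layout)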


def get_text(file):
    # read the whole file into a single string
    with open(file, "r") as read_file:
        text = ' '.join(read_file.readlines())
    return text


# strip the HTML tags from the file content
def clean_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
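# e.g. clean_html('<p>Hello <b>world</b></p>') returns 'Hello world'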


datalist = []


def remove_special_character(text):
    # drop punctuation/special characters, then collapse runs of whitespace
    string = re.sub(r'[^\w\s]', '', text)
    string = re.sub(r'\s+', ' ', string)
    string = string.strip()
    return string
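# e.g. remove_special_character('Hello,   world!') returns 'Hello world'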


my_stopwords = set(stopwords.words('english') + list(punctuation))

for file_path in list_path:
    # read, strip HTML, and split each document into sentences
    text = get_text(file_path)
    text_cleaned = clean_html(text)
    sents = sent_tokenize(text_cleaned)
    sents_cleaned = [remove_special_character(s) for s in sents]
    text_sents_join = ' '.join(sents_cleaned)
    # tokenize, lowercase, and drop stopwords/punctuation
    words = word_tokenize(text_sents_join)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in my_stopwords]
    words = ' '.join(words)
    datalist.append(words)
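# datalist now holds one cleaned, space-separated token string per file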

print("Choose a method (1, 2)")
phuongphap = int(input())
if phuongphap == 1:
    # method 1: Bag of Words (raw term counts per document)
    result = CountVectorizer()
    x = result.fit_transform(datalist).toarray()
    # write the count matrix to file
    file = open('BoW.txt', 'w')
    file.write(str(x))
    file.close()

    # pairwise cosine similarity between the count vectors
    file = open('BoW_CosSim.txt', 'w')
    x = cosine_similarity(x)
    file.write(str(x))
    file.close()
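    # cosine_similarity returns an n_files x n_files matrix whose (i, j)
    # entry is dot(v_i, v_j) / (||v_i|| * ||v_j||), so the diagonal is 1.0
    # for every non-empty document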
elif phuongphap == 2:
    # method 2: TF-IDF over word 1- to 3-grams
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf_idf_matrix = tf.fit_transform(datalist)
    # on scikit-learn < 1.0 this method is named get_feature_names()
    feature_names = tf.get_feature_names_out()
    dense = tf_idf_matrix.toarray()
    # write the vocabulary and the TF-IDF matrix to file
    file = open('tfidf.txt', 'w')
    file.write('\n'.join(feature_names) + "\n")
    file.write(str(tf_idf_matrix) + "\n")
    file.write(str(dense))
    file.close()

    # pairwise cosine similarity between the TF-IDF vectors
    file = open('Tfidf_CosSim.txt', 'w')
    x = cosine_similarity(dense)
    file.write(str(x))
    file.close()

else:
    print("Invalid choice!")
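
# Usage sketch (illustrative; the script and folder names are hypothetical):
#   $ python doc_similarity.py
#   Enter the directory path ./corpus
#   Choose a method (1, 2)
#   1
# -> method 1 writes BoW.txt and BoW_CosSim.txt to the working directory;
#    method 2 writes tfidf.txt and Tfidf_CosSim.txt instead.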