# (Pastebin page chrome — scraping artifact, not part of the script)
# Standard library
import os
import re
from collections import Counter
from string import punctuation

# Third-party
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English stopwords plus punctuation characters, used to filter tokens below.
my_stopwords = set(stopwords.words('english') + list(punctuation))
# Input: ask the user for a directory path and collect every file beneath it.
path = input("Nhập thông tin đường dẫn ")
list_path = []
for root, _dirs, files in os.walk(path):
    for file in files:
        # os.path.join is portable; the original concatenated with a raw "/".
        list_path.append(os.path.join(root, file))
def get_text(file):
    """Read *file* and return its contents as one string.

    Lines are joined with a single space (each kept line still carries its
    own trailing newline, matching the original behavior).
    """
    # Use a context manager: the original opened the file and never closed it.
    with open(file, "r") as read_file:
        text = read_file.readlines()
    return ' '.join(text)
def clean_html(text):
    """Strip all HTML tags from *text* and return only the visible text."""
    return BeautifulSoup(text, 'html.parser').get_text()
# One cleaned, space-joined token string per input file.
datalist = []

def remove_special_character(text):
    """Remove punctuation, collapse whitespace runs, and trim both ends.

    Keeps word characters (including underscore) and whitespace only.
    """
    string = re.sub(r'[^\w\s]', '', text)   # drop everything but \w and \s
    string = re.sub(r'\s+', ' ', string)    # collapse runs of whitespace
    # BUG FIX: the original assigned the stripped result to a typo name
    # ('strign'), discarding it and returning the untrimmed string.
    return string.strip()
# Build datalist: for each file, strip HTML, split into sentences, remove
# special characters, tokenize, lowercase, and drop stopwords/punctuation.
# (my_stopwords is already defined once at the top of the file — the original
# recomputed it here redundantly.)
for file_path in list_path:
    text_cleaned = clean_html(get_text(file_path))
    sents = sent_tokenize(text_cleaned)
    sents_cleaned = [remove_special_character(s) for s in sents]
    # BUG FIX: join with a space — the original used ''.join, which fused the
    # last word of each sentence with the first word of the next.
    text_sents_join = ' '.join(sents_cleaned)
    words = word_tokenize(text_sents_join)
    words = [w.lower() for w in words]
    words = [w for w in words if w not in my_stopwords]
    datalist.append(' '.join(words))
# Choose the vectorization method: 1 = Bag-of-Words, 2 = TF-IDF.
print("chon cach phuong phap (1,2)")
phuongphap = int(input())
if phuongphap == 1:
    result = CountVectorizer()
    x = result.fit_transform(datalist).todense()
    # Write the dense bag-of-words matrix.
    # BUG FIX: the original filename was 'WoB.txt' — inconsistent with the
    # 'BoW_CosSim.txt' companion file below; corrected to 'BoW.txt'.
    with open('BoW.txt', 'w') as file:
        file.write(str(x))
    # Pairwise cosine similarity between the document vectors.
    x = cosine_similarity(x)
    with open('BoW_CosSim.txt', 'w') as file:
        file.write(str(x))
elif phuongphap == 2:
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf_idf_matrix = tf.fit_transform(datalist)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer versions require get_feature_names_out() — confirm the installed
    # version before upgrading this call.
    feature_names = tf.get_feature_names()
    dense = tf_idf_matrix.todense()
    # Write the vocabulary, the sparse matrix repr, and the dense matrix.
    with open('tfidf.txt', 'w') as file:
        file.write('\n'.join(feature_names) + "\n")
        file.write(str(tf_idf_matrix) + "\n")
        file.write(str(dense))
    with open('Tfidf_CosSim.txt', 'w') as file:
        file.write(str(cosine_similarity(dense)))
else:
    print("Nhap sai !!!!")
# (Pastebin page chrome — scraping artifact, not part of the script)