# (Pastebin page chrome — scraping artifact, not part of the script)
# Standard library
import os
import re
from collections import Counter
from string import punctuation

# Third-party
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English stopwords plus punctuation characters, used to filter tokens below.
my_stopwords = set(stopwords.words('english') + list(punctuation))
# Input: ask the user for a directory path and collect every file beneath it.
path = input("Nhập thông tin đường dẫn ")
list_path = []
for root, _dirs, files in os.walk(path):
    for file in files:
        # os.path.join is portable; the original concatenated with a raw "/".
        list_path.append(os.path.join(root, file))
def get_text(file):
    """Read *file* and return its contents as one string.

    Lines are joined with a single space (each kept line still carries its
    own trailing newline, matching the original behavior).
    """
    # Use a context manager: the original opened the file and never closed it.
    with open(file, "r") as read_file:
        text = read_file.readlines()
    return ' '.join(text)
def clean_html(text):
    """Strip all HTML tags from *text* and return only the visible text."""
    return BeautifulSoup(text, 'html.parser').get_text()
# One cleaned, space-joined token string per input file.
datalist = []

def remove_special_character(text):
    """Remove punctuation, collapse whitespace runs, and trim both ends.

    Keeps word characters (including underscore) and whitespace only.
    """
    string = re.sub(r'[^\w\s]', '', text)   # drop everything but \w and \s
    string = re.sub(r'\s+', ' ', string)    # collapse runs of whitespace
    # BUG FIX: the original assigned the stripped result to a typo name
    # ('strign'), discarding it and returning the untrimmed string.
    return string.strip()
# Build datalist: for each file, strip HTML, split into sentences, remove
# special characters, tokenize, lowercase, and drop stopwords/punctuation.
# (my_stopwords is already defined once at the top of the file — the original
# recomputed it here redundantly.)
for file_path in list_path:
    text_cleaned = clean_html(get_text(file_path))
    sents = sent_tokenize(text_cleaned)
    sents_cleaned = [remove_special_character(s) for s in sents]
    # BUG FIX: join with a space — the original used ''.join, which fused the
    # last word of each sentence with the first word of the next.
    text_sents_join = ' '.join(sents_cleaned)
    words = word_tokenize(text_sents_join)
    words = [w.lower() for w in words]
    words = [w for w in words if w not in my_stopwords]
    datalist.append(' '.join(words))
# Choose the vectorization method: 1 = Bag-of-Words, 2 = TF-IDF.
print("chon cach phuong phap (1,2)")
phuongphap = int(input())
if phuongphap == 1:
    result = CountVectorizer()
    x = result.fit_transform(datalist).todense()
    # Write the dense bag-of-words matrix.
    # BUG FIX: the original filename was 'WoB.txt' — inconsistent with the
    # 'BoW_CosSim.txt' companion file below; corrected to 'BoW.txt'.
    with open('BoW.txt', 'w') as file:
        file.write(str(x))
    # Pairwise cosine similarity between the document vectors.
    x = cosine_similarity(x)
    with open('BoW_CosSim.txt', 'w') as file:
        file.write(str(x))
elif phuongphap == 2:
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tf_idf_matrix = tf.fit_transform(datalist)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer versions require get_feature_names_out() — confirm the installed
    # version before upgrading this call.
    feature_names = tf.get_feature_names()
    dense = tf_idf_matrix.todense()
    # Write the vocabulary, the sparse matrix repr, and the dense matrix.
    with open('tfidf.txt', 'w') as file:
        file.write('\n'.join(feature_names) + "\n")
        file.write(str(tf_idf_matrix) + "\n")
        file.write(str(dense))
    with open('Tfidf_CosSim.txt', 'w') as file:
        file.write(str(cosine_similarity(dense)))
else:
    print("Nhap sai !!!!")
# (Pastebin page chrome — scraping artifact, not part of the script)