from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import jieba
import jieba.analyse
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re

jieba.load_userdict('dict.txt')  # load the user-defined dictionary
- """
- 建立停用詞
- """
- def stopwordslist(filepath):
- stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
- return stopwords
- """
- 斷詞
- """
- def seg_article(init_article):
- article_seged = jieba.cut(init_article, cut_all = False)
- stopwords = stopwordslist('stoped.txt') # 這裡加載停用詞的路徑
- outstr = ''
- for word in article_seged:
- if word not in stopwords:
- outstr += word #outstr = outstr + word
- outstr += " "
- return outstr
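# A minimal usage sketch (not in the original; assumes 'dict.txt' and
# 'stoped.txt' exist and the sample sentence is illustrative only):
#
#   print(seg_article("台積電今日股價上漲"))
#   # -> something like "台積電 今日 股價 上漲 " (space-delimited tokens)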
- """
- 文字雲
- """
- def cloud(text):
- wc = WordCloud(font_path="msjh.ttc", #設置字體
- background_color="white", #背景顏色
- max_words = 50, #文字雲顯示最大詞數
- width = 1600,
- height =900
- )
- #停用字詞
- wc.generate(text)
- plt.imshow(wc)
- plt.axis("off")
- plt.savefig('WordCloud.png')
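# Note: WordCloud.generate() splits its input on whitespace, which is why
# seg_article() joins tokens with spaces; raw Chinese text would otherwise
# arrive as one unbroken string. Optionally (not in the original), a sharper
# PNG can be saved with:
#
#   plt.savefig('WordCloud.png', dpi=200, bbox_inches='tight')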
- """
- 降維
- """
- def PCAtool(date,weight,word):
- pcatool = PCA(n_components = 2)
- dimensionlist = pcatool.fit_transform(weight)
- dimension_df = pd.DataFrame(dimensionlist, index = date,columns = ["Xdimension","Ydimension"])
- dimension_df.to_csv("dimension.csv", encoding = 'utf_8_sig')
- x = dimensionlist[:,0]
- y = dimensionlist[:,1]
- fig, ax = plt.subplots()
- plt.title("PCA_COMPONENTS")
- plt.xlabel("x component")
- plt.ylabel("y compoment")
- plt.plot(x,y,'o')
- for i in range(len(date)):
- ax.annotate("news-{}".format(i+1),(x[i],y[i]))
- plt.savefig("COMPONENTS.png")
- pcatool.components_
- comp_df = pd.DataFrame(pcatool.components_, columns = word)
- comp_df.to_csv("comp.csv", encoding = 'utf_8_sig')
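# Optional check (a sketch, not in the original): inside PCAtool one can
# report how much of the TF-IDF variance the two components retain; values
# well below 1.0 mean the 2-D scatter is only a rough summary of the corpus.
#
#   print(pcatool.explained_variance_ratio_)        # per-component fraction
#   print(pcatool.explained_variance_ratio_.sum())  # total retained in 2-D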
- """
- tfidf
- """
- def tfidf(article,date):
- corpus = article
- vectorizer = TfidfVectorizer()
- tfidf = vectorizer.fit_transform(corpus)
- word = vectorizer.get_feature_names()
- weight = tfidf.toarray()
- tfidf_df = pd.DataFrame(weight, index = date,columns = word)
- tfidf_df.to_csv("tfidf.csv", encoding = 'utf_8_sig')
- PCAtool(date,weight,word) #PCAtool(date,weight,words) 將tf-idf矩陣降維
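# Note on Chinese input (an assumption about intent, not a change above):
# TfidfVectorizer's default token_pattern r"(?u)\b\w\w+\b" silently drops
# one-character tokens, which jieba produces often. To keep them, build the
# vectorizer as:
#
#   vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")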
- """
- jiebaTFIDF
- """
- def jiebaTFIDF(text,n):
- keywordlist = [] # 儲存前幾名的關鍵詞
- tags = jieba.analyse.extract_tags(text, topK=n, withWeight=True) # topK代表要取的關鍵字次數
- for i in range(len(tags)):
- eachword = tags[i][0]
- keywordlist.append(eachword)
- return keywordlist
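# extract_tags() uses jieba's built-in stopword list, not the one loaded by
# stopwordslist(). If the same stopwords should apply here too, jieba exposes
# a hook for that:
#
#   jieba.analyse.set_stop_words('stoped.txt')  # call once before extract_tags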
- """
- 將文字內容轉為0,1矩陣,使其能用於作為機器學習之特徵
- """
- def one_hot(dataframe,keyword,words,date):
- df_zeros = np.zeros((len(dataframe),), dtype=int)
- tempdic = pd.DataFrame({'temp':df_zeros}, index = date) #建立空dataframe
- for eachkey in keyword:
- tempdic[eachkey] = 0 # 將欄位預先新增
- OneHot_df = tempdic.drop('temp', axis=1)
- for j in range(len(words)): #j為第幾篇新聞
- for k in range(len(words[j])): # 第j篇新聞第k個字
- thisword = words[j][k]
- if thisword in keyword:
- OneHot_df[thisword][j] = 1
- OneHot_df.to_csv("OneHot.csv",index=1,header=1,encoding='utf_8_sig')
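# Equivalent one-step construction (a sketch; builds the same 0/1 matrix as
# the nested loops above, with set lookups instead of list scans):
#
#   OneHot_df = pd.DataFrame(
#       [{k: int(k in set(tokens)) for k in keyword} for tokens in words],
#       index=date)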
- """
- 情緒
- """
- def sentiment(words,positive,negative,date):
- scorelist = []
- for i in range(len(words)):
- score = 0
- for word in words[i]:
- totalWord = len(words[i])
- if word in positive:
- score += 1
- if word in negative:
- score += -1
- sentimentWeight = score/totalWord
- scorelist.append(sentimentWeight)
- scoreSeries = pd.Series(scorelist, index = date)
- sentiment_df = pd.DataFrame(scoreSeries, index = date, columns = ["sentiment"])
- sentiment_df.to_csv("sentiment.csv",encoding='utf_8_sig')
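# Worked example of the score formula (illustrative numbers only): an article
# of 10 tokens with 2 positive and 1 negative hits scores (2 - 1) / 10 = 0.1.
# Scores therefore fall in [-1, 1], the extremes meaning every token matched
# one lexicon.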
- """
- 匯入檔案
- """
- def readfile(filename):
- df = pd.read_csv(filename,encoding='utf-8',dtype=str) #讀檔
- positive = [line.strip() for line in open("Positive.valid.txt", 'r', encoding='utf-8').readlines()]
- negative= [line.strip() for line in open("Negative.valid.txt", 'r', encoding='utf-8').readlines()]
- textstr = '' #創建總分詞字串,儲存斷詞好的字串
- textlist = [] #創建總分詞典,儲存斷詞好的詞 list
- onelist = []
- date = []
- for i in range(len(df)):
- pattern = '[a-zA-Z0-9〔/〕().?:@,!/、\n]+'
- orginaltext = re.sub(pattern,'', df['mine_idea'][i])
- date.append(df["Date"][i])
- segtext = seg_article(orginaltext) #將每一篇新聞斷詞後定義為segtext
- textstr += segtext #將每一篇斷詞好的新聞加總成一個總字串
- textlist.append(segtext) #將每一篇新聞存入串列中
- onelist.append(segtext.split()) #將字串中的字以空格分隔為字串存入串列
- cloud(textstr) #cloud(text) 文字雲
- tfidf(textlist,date) #tfidf(article,date) tf-idf計算
- keywordlist = jiebaTFIDF(textstr,40) #jiebaTFIDF(textstr,n) 依jiebaTFIDF排序 n = 取幾個
- one_hot(df,keywordlist,onelist,date) #one_hot(dataframe,keyword,words,date)
- sentiment(onelist,positive,negative,date) #sentiment(words,positive,negative,date)
- readfile("2379.csv")