from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import jieba
import jieba.analyse
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re

jieba.load_userdict('dict.txt')  # load the user-defined jieba dictionary
  11. """
  12. 建立停用詞
  13. """
  14. def stopwordslist(filepath):
  15.     stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
  16.     return stopwords                          
  17. """
  18. 斷詞
  19. """  
  20. def seg_article(init_article):  
  21.     article_seged = jieba.cut(init_article, cut_all = False)  
  22.     stopwords = stopwordslist('stoped.txt')  # 這裡加載停用詞的路徑
  23.     outstr = ''  
  24.     for word in article_seged:  
  25.         if word not in stopwords:  
  26.             outstr += word  #outstr = outstr + word
  27.             outstr += " "  
  28.     return outstr
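# Aside: `word not in stopwords` scans the whole list for every token. A minimal
# alternative sketch (not called below; same stoped.txt assumed) that turns the
# list into a set for O(1) lookups:
def seg_article_fast(init_article):
    stopwords = set(stopwordslist('stoped.txt'))
    return ' '.join(w for w in jieba.cut(init_article, cut_all=False) if w not in stopwords)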
  29. """  
  30. 文字雲
  31. """
  32. def cloud(text):
  33.     wc = WordCloud(font_path="msjh.ttc", #設置字體
  34.                    background_color="white", #背景顏色
  35.                    max_words = 50, #文字雲顯示最大詞數
  36.                    width = 1600,
  37.                    height =900
  38.                    )        
  39.                          #停用字詞
  40.     wc.generate(text)
  41.     plt.imshow(wc)
  42.     plt.axis("off")
  43.     plt.savefig('WordCloud.png')
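# Side note: the wordcloud package can also write the image directly, with no
# matplotlib round-trip; a one-line sketch of that variant:
#     WordCloud(font_path="msjh.ttc").generate(text).to_file("WordCloud.png")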
  44. """
  45. 降維
  46. """
  47. def PCAtool(date,weight,word):
  48.     pcatool = PCA(n_components = 2)
  49.     dimensionlist = pcatool.fit_transform(weight)
  50.     dimension_df = pd.DataFrame(dimensionlist, index = date,columns = ["Xdimension","Ydimension"])
  51.     dimension_df.to_csv("dimension.csv", encoding = 'utf_8_sig')
  52.     x = dimensionlist[:,0]
  53.     y = dimensionlist[:,1]
  54.     fig, ax = plt.subplots()
  55.     plt.title("PCA_COMPONENTS")
  56.     plt.xlabel("x component")
  57.     plt.ylabel("y compoment")
  58.     plt.plot(x,y,'o')
  59.     for i in range(len(date)):
  60.         ax.annotate("news-{}".format(i+1),(x[i],y[i]))
  61.     plt.savefig("COMPONENTS.png")
  62.     pcatool.components_
  63.     comp_df = pd.DataFrame(pcatool.components_, columns = word)
  64.     comp_df.to_csv("comp.csv", encoding = 'utf_8_sig')
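# Hedged diagnostic (an addition, not part of the original pipeline): before
# trusting the 2-D scatter it is worth checking how much variance the two
# components actually capture.
def pca_variance_check(weight):
    p = PCA(n_components=2)
    p.fit(weight)
    return p.explained_variance_ratio_  # fraction of total variance per component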
  65. """
  66. tfidf
  67. """
  68. def tfidf(article,date):
  69.     corpus = article
  70.     vectorizer = TfidfVectorizer()
  71.     tfidf = vectorizer.fit_transform(corpus)
  72.     word = vectorizer.get_feature_names()
  73.     weight = tfidf.toarray()
  74.     tfidf_df = pd.DataFrame(weight, index = date,columns = word)
  75.     tfidf_df.to_csv("tfidf.csv", encoding = 'utf_8_sig')
  76.     PCAtool(date,weight,word)    #PCAtool(date,weight,words) 將tf-idf矩陣降維
  77.    
  78. """
  79. jiebaTFIDF
  80. """
  81. def jiebaTFIDF(text,n):
  82.     keywordlist = [] # 儲存前幾名的關鍵詞
  83.     tags = jieba.analyse.extract_tags(text, topK=n, withWeight=True)  # topK代表要取的關鍵字次數  
  84.     for i in range(len(tags)):
  85.         eachword = tags[i][0]
  86.         keywordlist.append(eachword)
  87.     return keywordlist
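# Equivalent shortcut: with withWeight=False (the default) extract_tags already
# returns plain words, so the unpacking loop above collapses to:
#     keywordlist = jieba.analyse.extract_tags(text, topK=n)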
  88. """
  89. 將文字內容轉為0,1矩陣,使其能用於作為機器學習之特徵
  90. """
  91. def one_hot(dataframe,keyword,words,date):
  92.     df_zeros = np.zeros((len(dataframe),), dtype=int)
  93.     tempdic = pd.DataFrame({'temp':df_zeros}, index = date) #建立空dataframe
  94.     for eachkey in keyword:
  95.         tempdic[eachkey] = 0 # 將欄位預先新增    
  96.     OneHot_df = tempdic.drop('temp', axis=1)
  97.     for j in range(len(words)):  #j為第幾篇新聞
  98.         for k in range(len(words[j])): # 第j篇新聞第k個字
  99.             thisword = words[j][k]
  100.             if thisword in keyword:
  101.                 OneHot_df[thisword][j] = 1  
  102.     OneHot_df.to_csv("OneHot.csv",index=1,header=1,encoding='utf_8_sig')
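# A vectorized sketch of the same matrix (assumes `words` is a list of token
# lists, as built in readfile below), avoiding the Python double loop:
def one_hot_fast(keyword, words, date):
    rows = [{k: int(k in toks) for k in keyword} for toks in map(set, words)]
    return pd.DataFrame(rows, index=date, columns=keyword)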
  103. """
  104. 情緒
  105. """
  106. def sentiment(words,positive,negative,date):
  107.     scorelist = []
  108.     for i in range(len(words)):
  109.         score = 0
  110.         for word in words[i]:
  111.             totalWord = len(words[i])
  112.             if word in positive:
  113.                 score += 1
  114.             if word in negative:
  115.                 score += -1      
  116.         sentimentWeight = score/totalWord
  117.         scorelist.append(sentimentWeight)
  118.     scoreSeries = pd.Series(scorelist, index = date)
  119.     sentiment_df = pd.DataFrame(scoreSeries, index = date, columns = ["sentiment"])
  120.     sentiment_df.to_csv("sentiment.csv",encoding='utf_8_sig')
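# Worked example (hypothetical word lists): with positive = ['上漲'] and
# negative = ['下跌'], the tokens ['股價', '上漲', '上漲', '下跌'] score
# (2 - 1) / 4 = 0.25.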
  121. """
  122. 匯入檔案
  123. """
  124. def readfile(filename):
  125.     df = pd.read_csv(filename,encoding='utf-8',dtype=str)  #讀檔
  126.     positive = [line.strip() for line in open("Positive.valid.txt", 'r', encoding='utf-8').readlines()]
  127.     negative= [line.strip() for line in open("Negative.valid.txt", 'r', encoding='utf-8').readlines()]
  128.     textstr = ''    #創建總分詞字串,儲存斷詞好的字串
  129.     textlist = []    #創建總分詞典,儲存斷詞好的詞 list
  130.     onelist = []
  131.     date = []
  132.     for i in range(len(df)):
  133.         pattern = '[a-zA-Z0-9〔/〕().?:@,!/、\n]+'
  134.         orginaltext = re.sub(pattern,'', df['mine_idea'][i])
  135.         date.append(df["Date"][i])
  136.         segtext = seg_article(orginaltext)    #將每一篇新聞斷詞後定義為segtext
  137.         textstr += segtext    #將每一篇斷詞好的新聞加總成一個總字串
  138.         textlist.append(segtext)    #將每一篇新聞存入串列中
  139.         onelist.append(segtext.split())    #將字串中的字以空格分隔為字串存入串列
  140.     cloud(textstr)    #cloud(text) 文字雲
  141.     tfidf(textlist,date)    #tfidf(article,date) tf-idf計算
  142.     keywordlist = jiebaTFIDF(textstr,40)    #jiebaTFIDF(textstr,n) 依jiebaTFIDF排序 n = 取幾個
  143.     one_hot(df,keywordlist,onelist,date)    #one_hot(dataframe,keyword,words,date)
  144.     sentiment(onelist,positive,negative,date)    #sentiment(words,positive,negative,date)
  145. readfile("2379.csv")
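# Files the script expects alongside it (all referenced above): dict.txt (jieba
# user dictionary), stoped.txt (stopwords), msjh.ttc (font), Positive.valid.txt /
# Negative.valid.txt (sentiment word lists), and 2379.csv with at least the
# columns "Date" and "mine_idea".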