from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import jieba
import jieba.analyse
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re

jieba.load_userdict('dict.txt')  # load the user-defined dictionary
- """
- 建立停用詞
- """
- def stopwordslist(filepath):
- stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
- return stopwords
- """
- 斷詞
- """
- def seg_article(init_article):
- article_seged = jieba.cut(init_article, cut_all = False)
- stopwords = stopwordslist('stoped.txt') # 這裡加載停用詞的路徑
- outstr = ''
- for word in article_seged:
- if word not in stopwords:
- outstr += word #outstr = outstr + word
- outstr += " "
- return outstr
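# A minimal usage sketch (not in the original; assumes 'dict.txt' and
# 'stoped.txt' exist and the sample sentence is illustrative only):
#
#   print(seg_article("台積電今日股價上漲"))
#   # -> something like "台積電 今日 股價 上漲 " (space-delimited tokens)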
- """
- 文字雲
- """
- def cloud(text):
- wc = WordCloud(font_path="msjh.ttc", #設置字體
- background_color="white", #背景顏色
- max_words = 50, #文字雲顯示最大詞數
- width = 1600,
- height =900
- )
- #停用字詞
- wc.generate(text)
- plt.imshow(wc)
- plt.axis("off")
- plt.savefig('WordCloud.png')
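# Note: WordCloud.generate() splits its input on whitespace, which is why
# seg_article() joins tokens with spaces; raw Chinese text would otherwise
# arrive as one unbroken string. Optionally (not in the original), a sharper
# PNG can be saved with:
#
#   plt.savefig('WordCloud.png', dpi=200, bbox_inches='tight')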
- """
- 降維
- """
- def PCAtool(date,weight,word):
- pcatool = PCA(n_components = 2)
- dimensionlist = pcatool.fit_transform(weight)
- dimension_df = pd.DataFrame(dimensionlist, index = date,columns = ["Xdimension","Ydimension"])
- dimension_df.to_csv("dimension.csv", encoding = 'utf_8_sig')
- x = dimensionlist[:,0]
- y = dimensionlist[:,1]
- fig, ax = plt.subplots()
- plt.title("PCA_COMPONENTS")
- plt.xlabel("x component")
- plt.ylabel("y compoment")
- plt.plot(x,y,'o')
- for i in range(len(date)):
- ax.annotate("news-{}".format(i+1),(x[i],y[i]))
- plt.savefig("COMPONENTS.png")
- pcatool.components_
- comp_df = pd.DataFrame(pcatool.components_, columns = word)
- comp_df.to_csv("comp.csv", encoding = 'utf_8_sig')
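# Optional check (a sketch, not in the original): inside PCAtool one can
# report how much of the TF-IDF variance the two components retain; values
# well below 1.0 mean the 2-D scatter is only a rough summary of the corpus.
#
#   print(pcatool.explained_variance_ratio_)        # per-component fraction
#   print(pcatool.explained_variance_ratio_.sum())  # total retained in 2-D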
- """
- tfidf
- """
- def tfidf(article,date):
- corpus = article
- vectorizer = TfidfVectorizer()
- tfidf = vectorizer.fit_transform(corpus)
- word = vectorizer.get_feature_names()
- weight = tfidf.toarray()
- tfidf_df = pd.DataFrame(weight, index = date,columns = word)
- tfidf_df.to_csv("tfidf.csv", encoding = 'utf_8_sig')
- PCAtool(date,weight,word) #PCAtool(date,weight,words) 將tf-idf矩陣降維
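# Note on Chinese input (an assumption about intent, not a change above):
# TfidfVectorizer's default token_pattern r"(?u)\b\w\w+\b" silently drops
# one-character tokens, which jieba produces often. To keep them, build the
# vectorizer as:
#
#   vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")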
- """
- jiebaTFIDF
- """
- def jiebaTFIDF(text,n):
- keywordlist = [] # 儲存前幾名的關鍵詞
- tags = jieba.analyse.extract_tags(text, topK=n, withWeight=True) # topK代表要取的關鍵字次數
- for i in range(len(tags)):
- eachword = tags[i][0]
- keywordlist.append(eachword)
- return keywordlist
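# extract_tags() uses jieba's built-in stopword list, not the one loaded by
# stopwordslist(). If the same stopwords should apply here too, jieba exposes
# a hook for that:
#
#   jieba.analyse.set_stop_words('stoped.txt')  # call once before extract_tags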
- """
- 將文字內容轉為0,1矩陣,使其能用於作為機器學習之特徵
- """
- def one_hot(dataframe,keyword,words,date):
- df_zeros = np.zeros((len(dataframe),), dtype=int)
- tempdic = pd.DataFrame({'temp':df_zeros}, index = date) #建立空dataframe
- for eachkey in keyword:
- tempdic[eachkey] = 0 # 將欄位預先新增
- OneHot_df = tempdic.drop('temp', axis=1)
- for j in range(len(words)): #j為第幾篇新聞
- for k in range(len(words[j])): # 第j篇新聞第k個字
- thisword = words[j][k]
- if thisword in keyword:
- OneHot_df[thisword][j] = 1
- OneHot_df.to_csv("OneHot.csv",index=1,header=1,encoding='utf_8_sig')
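# Equivalent one-step construction (a sketch; builds the same 0/1 matrix as
# the nested loops above, with set lookups instead of list scans):
#
#   OneHot_df = pd.DataFrame(
#       [{k: int(k in set(tokens)) for k in keyword} for tokens in words],
#       index=date)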
- """
- 情緒
- """
- def sentiment(words,positive,negative,date):
- scorelist = []
- for i in range(len(words)):
- score = 0
- for word in words[i]:
- totalWord = len(words[i])
- if word in positive:
- score += 1
- if word in negative:
- score += -1
- sentimentWeight = score/totalWord
- scorelist.append(sentimentWeight)
- scoreSeries = pd.Series(scorelist, index = date)
- sentiment_df = pd.DataFrame(scoreSeries, index = date, columns = ["sentiment"])
- sentiment_df.to_csv("sentiment.csv",encoding='utf_8_sig')
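# Worked example of the score formula (illustrative numbers only): an article
# of 10 tokens with 2 positive and 1 negative hits scores (2 - 1) / 10 = 0.1.
# Scores therefore fall in [-1, 1], the extremes meaning every token matched
# one lexicon.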
- """
- 匯入檔案
- """
- def readfile(filename):
- df = pd.read_csv(filename,encoding='utf-8',dtype=str) #讀檔
- positive = [line.strip() for line in open("Positive.valid.txt", 'r', encoding='utf-8').readlines()]
- negative= [line.strip() for line in open("Negative.valid.txt", 'r', encoding='utf-8').readlines()]
- textstr = '' #創建總分詞字串,儲存斷詞好的字串
- textlist = [] #創建總分詞典,儲存斷詞好的詞 list
- onelist = []
- date = []
- for i in range(len(df)):
- pattern = '[a-zA-Z0-9〔/〕().?:@,!/、\n]+'
- orginaltext = re.sub(pattern,'', df['mine_idea'][i])
- date.append(df["Date"][i])
- segtext = seg_article(orginaltext) #將每一篇新聞斷詞後定義為segtext
- textstr += segtext #將每一篇斷詞好的新聞加總成一個總字串
- textlist.append(segtext) #將每一篇新聞存入串列中
- onelist.append(segtext.split()) #將字串中的字以空格分隔為字串存入串列
- cloud(textstr) #cloud(text) 文字雲
- tfidf(textlist,date) #tfidf(article,date) tf-idf計算
- keywordlist = jiebaTFIDF(textstr,40) #jiebaTFIDF(textstr,n) 依jiebaTFIDF排序 n = 取幾個
- one_hot(df,keywordlist,onelist,date) #one_hot(dataframe,keyword,words,date)
- sentiment(onelist,positive,negative,date) #sentiment(words,positive,negative,date)
- readfile("2379.csv")