Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- import csv
- import pprint
- import MeCab
- import collections
- import sys
- from sys import argv
# Options handed to MeCab.Tagger for morphological analysis. The dictionary
# path is specific to the machine this script was written on.
MECAB_OPTIONS = '-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'
ec = ['Twitter', 'Imgur']  # specific strings whose per-post rate is reported
target = ['名詞']  # parts of speech included in the statistics

# A line containing this long dash run starts a thread; the next non-empty
# line is skipped.
THREAD_MARKER = '--------------------------------------------'
# A line containing this short dash run means the next non-empty line is a
# post header. It must be tested after THREAD_MARKER, which also contains it.
HEADER_MARKER = '---'


def collect_posts(lines):
    """Group post bodies by the key taken from each post header.

    The input is scanned with a small state machine over stripped,
    non-empty lines:

    * a line containing THREAD_MARKER -> the next line is skipped;
    * a line containing HEADER_MARKER -> the next line is a post header;
    * a header line yields the key: the first 9 characters of its 4th
      whitespace-separated field (presumably a poster ID -- confirm against
      the dump format);
    * the line after a header is that post's body, and the line after the
      body is treated as the next header.

    Args:
        lines: iterable of text lines (trailing newlines are stripped).

    Returns:
        dict mapping each key (in first-seen order) to
        ``{'text': str, 'count': int, 'ec': dict}`` where ``text`` is the
        key's post bodies joined by newlines, ``count`` is the number of
        bodies and ``ec`` is an empty dict to be filled in later.

    Raises:
        IndexError: a header line has fewer than 4 fields.
    """
    bodies = collections.defaultdict(list)
    mode = 'none'
    key = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if THREAD_MARKER in line:
            mode = 'thread'
        elif HEADER_MARKER in line:
            mode = 'title'
        elif mode == 'thread':
            mode = 'none'
        elif mode == 'title':
            fields = line.split()
            if len(fields) < 4:
                # Same exception type as a bare index error, but it names
                # the offending line so bad input can be located.
                raise IndexError(
                    'post header has fewer than 4 fields: {!r}'.format(line))
            key = fields[3][:9]
            mode = 'content'
        elif mode == 'content':
            bodies[key].append(line)
            mode = 'title'
    # Join with a newline rather than nothing, so the last word of one post
    # and the first word of the next cannot fuse into a single token.
    return {
        poster: {'text': '\n'.join(parts), 'count': len(parts), 'ec': {}}
        for poster, parts in bodies.items()
    }


def count_words(tagger, text):
    """Count the base forms of words in *text* whose part of speech is in ``target``.

    Args:
        tagger: object with a ``parseToNode(text)`` method returning the
            first node of a linked list (``feature``, ``surface``, ``next``),
            as MeCab.Tagger does.
        text: the text to analyse.

    Returns:
        collections.Counter mapping word -> number of occurrences.
    """
    words = []
    node = tagger.parseToNode(text)
    while node:
        features = node.feature.split(',')
        if features[0] in target:
            base = features[6] if len(features) > 6 else '*'
            if base == '*':
                # IPADIC-format dictionaries put '*' in the base-form slot
                # for words not in the dictionary (verify for the installed
                # dictionary); use the surface form so such words do not all
                # collapse into a single '*' entry.
                base = node.surface
            words.append(base)
        node = node.next
    return collections.Counter(words)


def keyword_rates(counts, post_count):
    """Return occurrences per post for each counted word listed in ``ec``.

    Args:
        counts: mapping word -> occurrence count.
        post_count: number of posts the counts were gathered from.

    Returns:
        dict mapping word -> rate formatted with two decimals; empty when
        *post_count* is not positive.
    """
    if post_count <= 0:
        return {}
    return {
        word: '{:.2f}'.format(counts[word] / post_count)
        for word in counts
        if word in ec
    }


def main():
    """Read the dump named on the command line and print per-key statistics.

    For every key this prints the key, its post count and the per-post rate
    of each ``ec`` keyword.
    """
    if len(sys.argv) < 2:
        # Passing a string to sys.exit prints it to stderr and exits with a
        # failure status, instead of stopping silently with status 0.
        sys.exit('usage: {} INPUT_FILE'.format(sys.argv[0]))
    input_file = sys.argv[1]
    tagger = MeCab.Tagger(MECAB_OPTIONS)
    with open(input_file, encoding='utf8') as file:
        wcs = collect_posts(file)
    for key, entry in wcs.items():
        entry['result'] = count_words(tagger, entry['text'])
        entry['ec'] = keyword_rates(entry['result'], entry['count'])
        print(key, entry['count'], entry['ec'])


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment