Guest User

Untitled

a guest
Jul 24th, 2020
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.20 KB | None | 0 0
  1. #!/usr/bin/python3
  2. import csv
  3. import pprint
  4. import MeCab
  5. import collections
  6. import sys
  7. from sys import argv
  8.  
  9. m = MeCab.Tagger('-Ochasen  -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')    #形態素解析
  10.  
  11. mode = 'none'
  12. wcs = {}
  13. ec = ['Twitter', 'Imgur'] #特定文字列
  14. target = ['名詞']    #統計対象品詞                  
  15.  
  16. if len(sys.argv) < 2:
  17.         sys.exit()
  18. input_file = sys.argv[1]
  19.  
  20. with open(input_file, encoding='utf8') as file:
  21.         lines = file.readlines()
  22. lines_strip = [line.strip() for line in lines]
  23.  
  24. for line in lines_strip:
  25.         if line == '':
  26.                 continue
  27.         if '--------------------------------------------' in line:
  28.                 mode = 'thread'
  29.                 continue
  30.         if '---' in line:
  31.                 mode = 'title'
  32.                 continue
  33.  
  34.         if mode == 'thread':
  35.                 mode = 'none'
  36.                 continue
  37.         if mode == 'title':
  38.                 wc = line.split()[3][:9]
  39.                 mode = 'content'
  40.                 continue
  41.         if mode == 'content':
  42.                 if wc not in wcs.keys():
  43.                         wcs[wc] = dict()
  44.                         wcs[wc]['text'] = ''
  45.                         wcs[wc]['count'] = 0
  46.                         wcs[wc]['ec'] = dict()
  47.                 wcs[wc]['text'] += line
  48.                 wcs[wc]['count'] += 1
  49.                 mode = 'title'
  50.                 continue
  51.  
  52. for key in wcs:
  53.         text = wcs[key]['text']
  54.         node = m.parseToNode(text)
  55.         words=[]
  56.         while node:
  57.                 hinshi = node.feature.split(",")[0]
  58.                 if hinshi in target:
  59.                         origin = node.feature.split(",")[6]
  60.                         words.append(origin)
  61.                 node = node.next
  62.                
  63.         wcs[key]['result'] = collections.Counter(words)
  64.  
  65.         if wcs[key]['count'] > 0:
  66.                 for k in wcs[key]['result']:
  67.                         if k in ec:
  68.                                 wcs[key]['ec'][k] = '{:.2f}'.format(wcs[key]['result'][k]/wcs[key]['count'])
  69. #        print(key, wcs[key]['count'],  c.most_common(20))
  70.         print(key, wcs[key]['count'], wcs[key]['ec'])
  71.  
  72.  
Advertisement
Add Comment
Please, Sign In to add comment