Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # mystem.py
- # coding: utf8
- from os import walk
- from pymystem3 import Mystem
- import json
- import io
- from multiprocessing import Process
- import time
# Directory that is scanned for the material files to process.
path = 'db/materials/'

# Analyzer instance created once at import time and used by parse_lemmas.
mystem = Mystem()

# walk() yields the top directory first; stopping after that first entry
# collects only the file names directly inside `path`, not in subdirectories.
files = []
for (dirpath, dirnames, filenames) in walk(path):
    files += filenames
    break
def _collect_lemmas(tokens):
    """Return the ``lex`` values of analyzed tokens that pass the tag filter.

    *tokens* is the list returned by ``mystem.analyze``. A token contributes
    the ``lex`` of the first entry in its ``analysis`` list when that entry
    has both ``lex`` and ``gr`` keys and its ``gr`` string does not start
    with ``CONJ``, ``PR`` or ``PART``.
    """
    # presumably conjunction / preposition / particle tags -- confirm against
    # the Mystem grammar reference.
    skipped_prefixes = ('CONJ', 'PR', 'PART')
    lemmas = []
    for token in tokens:
        variants = token.get('analysis')
        if not variants:
            # No analysis at all (or an empty one): nothing to record.
            continue
        first = variants[0]
        if 'lex' not in first or 'gr' not in first:
            continue
        # str.startswith accepts a tuple, so one call checks every prefix.
        if first['gr'].startswith(skipped_prefixes):
            continue
        lemmas.append(first['lex'])
    return lemmas


def parse_lemmas(_file):
    """Add a ``lemmas`` list to one JSON file and rewrite the file in place.

    Args:
        _file: File name inside the module-level ``path`` directory.

    The ``header`` and ``body`` values of the loaded JSON object are passed
    to the module-level ``mystem`` analyzer; the filtered lemmas are stored
    under the ``lemmas`` key and the whole object is written back as
    UTF-8 JSON with 4-space indentation.

    Raises:
        KeyError: If the JSON object has no ``header`` or ``body`` key.
    """
    # 'r+' lets the same handle be used for reading and for the rewrite.
    with io.open(path + _file, 'r+', encoding='utf8') as data_file:
        data = json.load(data_file)
        tokens = mystem.analyze(data['header'])
        tokens.extend(mystem.analyze(data['body']))
        data['lemmas'] = _collect_lemmas(tokens)
        upd_data = json.dumps(data, sort_keys=False, indent=4, ensure_ascii=False)
        # Rewind, overwrite from the start, then drop any leftover tail of
        # the previous content.
        data_file.seek(0)
        data_file.write(upd_data)
        data_file.truncate()
def parsing_proc(files):
    """Call parse_lemmas on each file name in *files*, one after another."""
    for file_name in files:
        parse_lemmas(file_name)
# Guard the launcher: with the "spawn" start method each child process
# re-imports this module, and without the guard it would try to start
# workers of its own instead of just running parsing_proc.
if __name__ == '__main__':
    time_start = time.time()
    # NOTE(review): only the first 1000 entries of `files` are dispatched
    # (two fixed slices of 500); any further files are not processed --
    # confirm this limit is intended.
    p1 = Process(target=parsing_proc, args=(files[0:500],))
    p2 = Process(target=parsing_proc, args=(files[500:1000],))
    p1.start()
    p2.start()
    # Wait for both workers before reporting the elapsed wall-clock time.
    p1.join()
    p2.join()
    print(time.time() - time_start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement