Advertisement
Guest User

Mystem.py

a guest
May 25th, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.33 KB | None | 0 0
  1. # mystem.py
  2. # coding: utf8
  3.  
  4. from os import walk
  5. from pymystem3 import Mystem
  6. import json
  7. import io
  8. from multiprocessing import Process
  9. import time
  10.  
  11. path = 'db/materials/'
  12. files = []
  13. mystem = Mystem()
  14.  
  15. for (dirpath, dirnames, filenames) in walk(path):
  16. files.extend(filenames)
  17. break
  18.  
  19. def parse_lemmas(_file):
  20. with io.open('db/materials/' + _file, 'r+', encoding='utf8') as data_file:
  21. data = json.load(data_file)
  22. lemmas = mystem.analyze(data['header'])
  23. lemmas.extend(mystem.analyze(data['body']))
  24.  
  25. words = []
  26. for lemma in lemmas:
  27. if ('analysis' in lemma):
  28. an = lemma['analysis']
  29. if ((len(an) > 0) and ('lex' in an[0]) and ('gr' in an[0])):
  30. if (not an[0]['gr'].startswith('CONJ')) and (not an[0]['gr'].startswith('PR')) and (not an[0]['gr'].startswith('PART')):
  31. words.append(an[0]['lex'])
  32.  
  33. data['lemmas'] = words
  34. # del data['body']
  35. upd_data = json.dumps(data, sort_keys = False, indent = 4, ensure_ascii=False)
  36. data_file.seek(0)
  37. data_file.write(upd_data)
  38. data_file.truncate()
  39.  
  40. def parsing_proc(files):
  41. for _file in files:
  42. parse_lemmas(_file)
  43.  
  44. time_start = time.time()
  45.  
  46. p1 = Process(target=parsing_proc, args=(files[0:500],))
  47. p2 = Process(target=parsing_proc, args=(files[500:1000],))
  48. p1.start()
  49. p2.start()
  50. p1.join()
  51. p2.join()
  52.  
  53. print(time.time() - time_start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement