Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""Extract the "установил: ... решил:" section from HTML stored in orders.json.

Reads ``orders.json`` as JSON lines (one object per line). For each record,
the entries of its ``fields`` list are scanned for a string ``value`` that
contains ``<HTML>``; the text between the two markers is pulled out of the
first such field that matches. Results are written to
``cases_parsed_lemmatized.json`` as a JSON object keyed by the zero-based
line index of the record.
"""
import json
import re

from bs4 import BeautifulSoup as Soup
from pymystem3 import Mystem

# NOTE(review): the analyzer is instantiated but never used in this script,
# so the output is not lemmatized despite the output file name. Kept as-is
# because constructing it is an existing side effect — confirm or remove.
m = Mystem(entire_input=False)

# Captures everything between the two markers; DOTALL lets '.' span newlines.
patt = re.compile(r'установил:(.*)решил:', re.DOTALL)


def _extract_description(html_text):
    """Return the lower-cased text between the markers, or None if absent."""
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = Soup(html_text, 'html.parser')
    # html.parser does not synthesize a <body>; fall back to the whole tree.
    container = soup.body if soup.body is not None else soup
    raw = container.get_text().lower()
    match = patt.search(raw)
    return match.group(1) if match else None


results = {}
with open('orders.json', 'r', encoding='utf8') as f:
    for n, line in enumerate(f):
        # Tolerate blank lines (e.g. a trailing newline) in the JSON-lines file.
        if not line.strip():
            continue
        element = json.loads(line)
        for field in element['fields']:
            text = field['value']
            # Only string values can hold the HTML document we are after.
            if not (isinstance(text, str) and '<HTML>' in text):
                continue
            descr = _extract_description(text)
            if descr is not None:
                results[n] = descr
                print(n)  # progress: index of the record just captured
                break  # one description per record is enough

json_el = json.dumps(results, ensure_ascii=False)
# Use a context manager so the output is flushed and closed deterministically.
with open('cases_parsed_lemmatized.json', 'w', encoding='utf-8') as out:
    out.write(json_el)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement