Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import asyncio
- import time
- import bs4
- import requests
- from pymongo import MongoClient
- # vocabulary.com parser
class VocElement:
    """One vocabulary entry: a word plus its tip, usage example and definition.

    The four constructor arguments are stored unchanged as attributes of the
    same name.
    """

    # Attribute names, in the order they appear in the exported dict.
    _FIELDS = ('word', 'tip', 'usage', 'definition')

    def __init__(self, word, tip, usage, definition):
        self.word, self.tip = word, tip
        self.usage, self.definition = usage, definition

    def get_voc_json(self):
        """Return the entry as a plain dict keyed by attribute name."""
        return {field: getattr(self, field) for field in self._FIELDS}
def get_links_to_parse(timeout=10):
    """Fetch the vocabulary.com list index and return the linked list ids.

    The index page is downloaded synchronously and every element matching
    the ``.readMore`` CSS selector is inspected; the last path segment of
    its ``href`` is taken as the list id.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of id strings on success, or the dict
        ``{'error': 'connection not established'}`` when the request fails
        (network error, timeout, or an HTTP status of 400 and above).
    """
    def cut_id_out(link):
        # Strip a trailing slash first so '/lists/123/' yields '123'
        # instead of an empty string.
        return link.rstrip('/').split('/')[-1]

    q = 'https://www.vocabulary.com/lists/'
    try:
        # Without a timeout requests may block indefinitely.
        req = requests.get(q, timeout=timeout)
    except requests.RequestException:
        return {
            'error': 'connection not established'
        }
    if req.status_code >= 400:
        return {
            'error': 'connection not established'
        }
    scrap = bs4.BeautifulSoup(req.content, 'html.parser')
    # Skip matched elements that carry no href rather than raising KeyError.
    return [
        cut_id_out(a['href'])
        for a in scrap.select('.readMore')
        if a.get('href')
    ]
async def get_definition(link):
    """Download a dictionary page and return the text of its ``.short`` element.

    The HTTP request runs in the event loop's default executor so that the
    blocking ``requests`` call does not stall other coroutines.

    Args:
        link: Full URL of the page to fetch.

    Returns:
        The text of the first element matching ``.short``; an empty string
        when the page has no such element; or the dict
        ``{'error': 'connection not established'}`` when the request fails
        (network error, timeout, or an HTTP status of 400 and above).
    """
    loop = asyncio.get_event_loop()  # inside a coroutine: the running loop
    try:
        req = await loop.run_in_executor(
            None, lambda: requests.get(link, timeout=10)
        )
    except requests.RequestException:
        return {
            'error': 'connection not established'
        }
    if req.status_code >= 400:
        return {
            'error': 'connection not established'
        }
    scrap = bs4.BeautifulSoup(req.content, 'html.parser')
    matches = scrap.select('.short')
    # A page without a short blurb must not abort the whole crawl.
    return matches[0].text if matches else ''
async def get_single_word(link):
    """Scrape one word-list page and return its entries as dicts.

    Every element matching ``.learnable`` contributes one entry built from
    its ``.word``, ``.definition`` (stored as the tip) and ``.example``
    children, plus the value returned by ``get_definition`` for the word's
    dictionary page. Progress is printed to the console.

    Args:
        link: Full URL of the list page.

    Returns:
        A list of dicts with the keys ``word``, ``tip``, ``usage`` and
        ``definition``, or the dict
        ``{'error': 'connection not established'}`` when the list page
        itself cannot be fetched.
    """
    loop = asyncio.get_event_loop()  # inside a coroutine: the running loop
    try:
        # Run the blocking HTTP call in a worker thread so the loop stays free.
        req = await loop.run_in_executor(
            None, lambda: requests.get(link, timeout=10)
        )
    except requests.RequestException:
        return {
            'error': 'connection not established'
        }
    if req.status_code >= 400:
        return {
            'error': 'connection not established'
        }
    scrap = bs4.BeautifulSoup(req.content, 'html.parser')
    print('Currently parsing {link}'.format(link=link))

    def first_text(node, selector):
        # Text of the first match, or '' when the element is missing.
        found = node.select(selector)
        return found[0].text if found else ''

    entries = []
    for idx, word in enumerate(scrap.select('.learnable')):
        v_word = first_text(word, '.word')
        if not v_word:
            continue  # nothing to look up for an entry without a word
        print('\t[{id}] Word: {word}'.format(id=idx, word=v_word)) # easy to follow progress in console DEV ONLY
        entries.append(
            (v_word, first_text(word, '.definition'), first_text(word, '.example'))
        )

    # Request all definitions together; gather keeps results in input order.
    # NOTE: get_definition may hand back an error dict, which is stored
    # as-is in the 'definition' field.
    definitions = await asyncio.gather(*[
        get_definition('https://www.vocabulary.com/dictionary/{word}'.format(word=v_word))
        for v_word, _, _ in entries
    ])
    return [
        VocElement(v_word, v_tip, v_usage, v_def).get_voc_json()
        for (v_word, v_tip, v_usage), v_def in zip(entries, definitions)
    ]
async def init_dictionary(limit=1):
    """Scrape up to ``limit`` word lists and return all their entries.

    Args:
        limit: Maximum number of lists from the index to scrape; ``None``
            scrapes every list found. Defaults to 1.

    Returns:
        A flat list with the entry dicts of every list that could be
        fetched. The list is empty when the index itself could not be
        fetched; lists whose page failed to load are skipped.
    """
    list_ids = get_links_to_parse()
    if isinstance(list_ids, dict):
        # get_links_to_parse signals failure with an error dict, which
        # cannot be sliced like the list returned on success.
        print('Could not fetch list index: {error}'.format(error=list_ids.get('error')))
        return []

    links = ['https://www.vocabulary.com/lists/' + x for x in list_ids[:limit]]
    out = []
    for f in asyncio.as_completed([get_single_word(link) for link in links]):
        result = await f
        if isinstance(result, dict):
            # An error dict here would otherwise be extended key by key,
            # leaving the string 'error' among the documents.
            print('Skipping list: {error}'.format(error=result.get('error')))
            continue
        out.extend(result)
    return out
def create_dictionary():
    """Run ``init_dictionary`` to completion and return its result.

    A dedicated event loop is created and closed for the call, instead of
    relying on ``asyncio.get_event_loop()``, which is deprecated when no
    loop is running.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(init_dictionary())
    finally:
        loop.close()
- # that's mine, ignore pls
def main():
    """Scrape the dictionary, store it in MongoDB and report the duration.

    The entries are inserted into the ``dict`` collection of the ``vocuiz``
    database using a default-configured ``MongoClient``. When scraping
    produced nothing, the insert is skipped.
    """
    # perf_counter is monotonic, so the duration is immune to clock changes.
    init_time = time.perf_counter()
    data = create_dictionary()
    if data:
        client = MongoClient()
        try:
            db = client.vocuiz
            print('Inserting to database')
            result = db.dict.insert_many(data)
            print('Data status: {status}'.format(status=result))
        finally:
            # Release the connection pool even if the insert fails.
            client.close()
    else:
        # insert_many rejects an empty document list.
        print('Nothing to insert')
    print('Duration: {time}'.format(time=time.perf_counter() - init_time))
    print('D O N E')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement