Advertisement
emesten

Untitled

May 5th, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.80 KB | None | 0 0
  1. import asyncio
  2. import time
  3.  
  4. import bs4
  5. import requests
  6. from pymongo import MongoClient
  7.  
  8.  
  9. # vocabulary.com parser
  10. class VocElement:
  11.     def __init__(self, word, tip, usage, definition):
  12.         self.word = word
  13.         self.tip = tip
  14.         self.usage = usage
  15.         self.definition = definition
  16.  
  17.     def get_voc_json(self):
  18.         return {
  19.             'word': self.word,
  20.             'tip': self.tip,
  21.             'usage': self.usage,
  22.             'definition': self.definition
  23.         }
  24.  
  25.  
  26. def get_links_to_parse():
  27.     def cut_id_out(link):
  28.         return link.split('/')[-1]
  29.     q = 'https://www.vocabulary.com/lists/'
  30.     req = requests.get(q)
  31.     if req.status_code >= 400:
  32.         return {
  33.             'error': 'connection not established'
  34.         }
  35.     scrap = bs4.BeautifulSoup(req.content, 'html.parser')
  36.     return [cut_id_out(a['href']) for a in scrap.select('.readMore')]
  37.  
  38.  
  39. async def get_definition(link):
  40.     req = requests.get(link)
  41.     if req.status_code >= 400:
  42.         return {
  43.             'error': 'connection not established'
  44.         }
  45.     scrap = bs4.BeautifulSoup(req.content, 'html.parser')
  46.     return scrap.select('.short')[0].text
  47.  
  48.  
  49. async def get_single_word(link):
  50.     req = requests.get(link)
  51.     if req.status_code >= 400:
  52.         return {
  53.             'error': 'connection not established'
  54.         }
  55.     scrap = bs4.BeautifulSoup(req.content, 'html.parser')
  56.     print('Currently parsing {link}'.format(link=link))
  57.  
  58.     words = scrap.select('.learnable')
  59.     vocs = []
  60.     for idx, word in enumerate(words):
  61.         v_word = word.select('.word')[0].text
  62.         print('\t[{id}] Word: {word}'.format(id=idx, word=v_word))  # easy to follow progress in console DEV ONLY
  63.         v_tip = word.select('.definition')[0].text
  64.         v_usage = word.select('.example')[0].text
  65.         v_def = await get_definition('https://www.vocabulary.com/dictionary/{word}'.format(word=v_word))
  66.         vocs.append(VocElement(v_word, v_tip, v_usage, v_def).get_voc_json())
  67.     return vocs
  68.  
  69.  
  70. async def init_dictionary():
  71.     links = ['https://www.vocabulary.com/lists/'+x for x in get_links_to_parse()[:1]]
  72.     out = []
  73.     for f in asyncio.as_completed([get_single_word(link) for link in links]):
  74.         result = await f
  75.         out.extend(result)
  76.     return out
  77.  
  78.  
  79. def create_dictionary():
  80.     return asyncio.get_event_loop().run_until_complete(init_dictionary())
  81.  
  82.  
  83. # that's mine, ignore pls
  84. def main():
  85.     init_time = time.time()
  86.     data = create_dictionary()
  87.     client = MongoClient()
  88.     db = client.vocuiz
  89.     print('Inserting to database')
  90.     result = db.dict.insert_many(data)
  91.     print('Data status: {status}'.format(status=result))
  92.     print('Duration: {time}'.format(time=time.time()-init_time))
  93.     print('D O N E')
  94.  
  95.  
  96. if __name__ == '__main__':
  97.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement