ikov34

Untitled

May 7th, 2021
710
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import requests
  2. import json
  3. from bs4 import BeautifulSoup
  4.  
  # --- besednipiknik.com level scraper --------------------------------------

# Fetches the numbered level pages one by one, collects the words listed on
# each page and groups them by the level's keyword (the last word on the
# page).  The grouped, sorted words are written to OUTPUT_FILE as JSON.

BASE_URL = 'https://besednipiknik.com/stopnja-{}.html'
FIRST_LEVEL = 1
LEVEL_LIMIT = 2000  # exclusive upper bound for the page number
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs forever
OUTPUT_FILE = "leveli.json"

# Per-level data: {level number: {'words': [...], 'keyword': str}}.
level_dict = {}

# Words grouped by keyword: {keyword: set of words}.
general_dict = {}


def extract_words(elements):
    """Return the words encoded by a sequence of parsed HTML nodes.

    The text of consecutive ``span`` nodes is concatenated into one word and
    a ``br`` node ends the current word.  Nodes with any other ``name`` are
    ignored.  Text that is not followed by a ``br`` is discarded, and a
    ``br`` with no preceding ``span`` yields an empty string.

    Args:
        elements: iterable of objects exposing ``name`` and ``text``.

    Returns:
        A list of words in the order they were terminated.
    """
    collected = []
    pieces = []
    for node in elements:
        if node.name == 'span':
            pieces.append(node.text)
        elif node.name == 'br':
            collected.append(''.join(pieces))
            pieces = []
    return collected


def merge_level(groups, level_words):
    """Add one level's words to ``groups`` under that level's keyword.

    The keyword is the last entry of ``level_words``.

    Args:
        groups: dict mapping keyword -> set of words; updated in place.
        level_words: non-empty list of words of a single level.

    Returns:
        The keyword the words were filed under.
    """
    keyword = level_words[-1]
    groups.setdefault(keyword, set()).update(level_words)
    return keyword


def sorted_groups(groups):
    """Return a new dict mapping every keyword to its words as a sorted list."""
    return {keyword: sorted(members) for keyword, members in groups.items()}


def main():
    """Scrape all levels and dump the grouped words to OUTPUT_FILE."""
    for i in range(FIRST_LEVEL, LEVEL_LIMIT):
        print(f'Processing page {i}')
        page = requests.get(BASE_URL.format(i), timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')
        words = soup.find(class_='words')
        if words is None:
            # No word container on the page: stop scraping.
            break

        level_words = extract_words(words.contents)
        if not level_words:
            # A container without any terminated word has no keyword;
            # indexing [-1] here used to raise, so skip the page instead.
            print(f'No words found on page {i}, skipping')
            continue

        keyword = merge_level(general_dict, level_words)
        level_dict[i] = {'words': level_words, 'keyword': keyword}

    # Sets are not JSON serialisable: replace them with sorted lists.
    general_dict.update(sorted_groups(general_dict))

    with open(OUTPUT_FILE, "w", encoding='utf8') as write_file:
        json.dump(general_dict, write_file, indent=4, ensure_ascii=False)

    print('Done!')


if __name__ == '__main__':
    main()
  43.  
RAW Paste Data