Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2019
147
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.87 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import requests
  3. import sys
  4. import time
  5. import json
  6.  
  def get_entries(maxNumber):
      """Scrape uludagsozluk entries numbered 1 .. maxNumber - 1 into entries.json.

      Each entry page is fetched from ``https://www.uludagsozluk.com/e/<n>/``.
      Pages without a ``div.entry-p`` element are skipped. Every scraped entry
      is stored as ``(text, upvote, downvote, title, view)``.

      Entries are buffered and appended to ``entries.json`` in batches of
      10000, one JSON array per line. The buffer left over at the end of the
      run (or when an exception interrupts it) is written as well, so no
      scraped entry is lost.

      Args:
          maxNumber: Exclusive upper bound of the entry numbers to fetch. May
              be an int or a numeric string (e.g. a raw ``sys.argv`` value).

      Raises:
          ValueError: If ``maxNumber`` cannot be converted to an int.
      """
      # Command-line arguments arrive as strings; comparing int < str raises
      # TypeError on Python 3, so normalise the bound once up front.
      limit = int(maxNumber)
      output_path = "entries.json"
      batch_size = 10000

      # Truncate any output left over from a previous run.
      with open(output_path, "w"):
          pass

      pending = []
      entry_num = 1
      try:
          while entry_num < limit:
              address = "https://www.uludagsozluk.com/e/" + str(entry_num) + "/"
              # Without a timeout a single stalled connection blocks forever.
              response = requests.get(address, timeout=30)
              # Setting response.encoding only influences response.text; the
              # bytes in response.content need the encoding passed to the parser.
              source = BeautifulSoup(response.content, "lxml", from_encoding="utf-8")

              # Progress indicator, rewritten in place every 10th entry number.
              if entry_num % 10 == 0:
                  sys.stdout.write("\r")
                  sys.stdout.write("% 2d" % entry_num)
                  sys.stdout.flush()

              entry_num += 1
              entry = _parse_entry(source)
              if entry is None:
                  continue

              pending.append(entry)
              if len(pending) >= batch_size:
                  _flush_entries(pending, output_path)
      finally:
          # Persist the final partial batch, including when the run is aborted.
          _flush_entries(pending, output_path)


  def _parse_entry(source):
      """Return (text, upvote, downvote, title, view) for a parsed page, or None if it has no entry body."""
      body = source.find("div", attrs={"class": "entry-p"})
      if body is None:
          return None
      text = " ".join(body.text.split())

      labels = [
          " ".join(anchor.text.split())
          for anchor in source.find_all("a", attrs={"class": "butoy"})
      ]
      upvote, downvote = _parse_votes(labels)

      # A missing heading or view counter falls back to "" / 0 instead of
      # aborting the whole scrape.
      heading = source.find("h1", attrs={"class": "tekentry-baslik"})
      title = heading.text.replace("\n", "") if heading is not None else ""

      view = 0
      counter = source.find("span", attrs={"class": "hidden-phone"})
      if counter is not None:
          try:
              view = int(counter.text.replace("\n", ""))
          except ValueError:
              view = 0

      return (text, upvote, downvote, title, view)


  def _parse_votes(labels):
      """Return (upvote, downvote) read from the normalised texts of the "butoy" anchors."""
      upvote = 0
      downvote = 0
      # The anchors are read in groups of three: the first of each group is
      # taken as the upvote count, the second as the downvote count, and the
      # third is ignored. A later group overrides an earlier one.
      for index, label in enumerate(labels):
          slot = index % 3
          if slot == 0:
              # isdecimal() matches exactly the digit strings int() accepts.
              if label.isdecimal():
                  upvote = int(label)
          elif slot == 1:
              digits = label.replace("-", "")
              if digits.isdecimal():
                  downvote = -int(digits)
      return upvote, downvote


  def _flush_entries(entries, path):
      """Append entries to path as one JSON array on its own line, then empty the list in place."""
      if not entries:
          return
      with open(path, "a") as f:
          f.write(json.dumps(entries))
          # One array per line keeps the file parseable batch by batch.
          f.write("\n")
      entries.clear()
  52.  
  # Script entry point: scrape every entry number below the bound given as the
  # single command-line argument.
  if __name__ == "__main__":
      if len(sys.argv) != 2:
          sys.exit("usage: python <script> <maxNumber>")
      try:
          # sys.argv values are strings; the scraper's loop bound must be an int.
          maxNumber = int(sys.argv[1])
      except ValueError:
          sys.exit("maxNumber must be an integer, got %r" % sys.argv[1])
      get_entries(maxNumber)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement