Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- import requests
- import sys
- import time
- import json
- def get_entries(maxNumber):
- entryNum = 1
- last = None
- data = set()
- entryList = []
- with open("entries.json","w+") as f:
- pass
- while entryNum < maxNumber:
- address = "https://www.uludagsozluk.com/e/" + str(entryNum) + "/"
- r = requests.get(address)
- r.encoding = "utf-8"
- source = BeautifulSoup(r.content,"lxml")
- if entryNum % 10 == 0:
- sys.stdout.write("\r")
- sys.stdout.write("% 2d" % entryNum)
- sys.stdout.flush()
- if source.find("div", attrs={"class":"entry-p"}) == None:
- entryNum += 1
- continue
- text = " ".join(source.find("div", attrs={"class":"entry-p"}).text.split())
- upvote = 0
- downvote = 0
- title = ""
- cnt = 0
- preNumbers = source.find_all("a", attrs={"class":"butoy"})
- for num in preNumbers:
- current = " ".join(num.text.split())
- if cnt % 3 == 0:
- if(current.isnumeric()):
- upvote = int(current)
- if cnt % 3 == 1:
- current = current.replace('-', '')
- if(current.isnumeric()):
- downvote = -int(current)
- cnt = cnt + 1
- title = source.find("h1", attrs={"class":"tekentry-baslik"}).text.replace('\n', '')
- view = int(source.find("span", attrs={"class":"hidden-phone"}).text.replace('\n', ''))
- entryNum += 1
- cur = (text, upvote, downvote, title, view)
- entryList.append(cur)
- if len(entryList) % 10000 == 0:
- with open("entries"+ ".json", "a+") as f:
- temp = json.dumps(entryList)
- f.write(temp)
- entryList = []
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the exclusive
    # upper bound of the entry numbers to scrape.
    if len(sys.argv) < 2:
        sys.exit("usage: python " + sys.argv[0] + " MAX_ENTRY_NUMBER")
    # argv values are strings; get_entries compares the bound with an int.
    maxNumber = int(sys.argv[1])
    get_entries(maxNumber)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement