Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import urllib.request as urllib
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup

from Post import Post
def getHTML(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as ``bytes`` on success, or the empty string ``""``
        when the request fails with an ``HTTPError`` or ``URLError`` (the
        error is reported on stdout in that case).
    """
    try:
        # The context manager closes the response even though we return from
        # inside the block; a close() placed after ``return`` never runs.
        with urllib.urlopen(url) as f:
            return f.read()
    # HTTPError is a subclass of URLError, so it must be handled first.
    # The exception classes are referenced directly: the module-level name
    # ``urllib`` is an alias for ``urllib.request``, which has no ``error``
    # attribute.
    except HTTPError as e:
        print("Ocurrió un error")
        print(e.code)
        return ""
    except URLError as e:
        print("Ocurrió un error")
        print(e.reason)
        return ""
# Scrape the first page of the forum board at ``link1`` and print one Post
# per thread row found on it.
link1 = "https://foros.derecho.com/foro/20-Derecho-Civil-General"
soup = BeautifulSoup(getHTML(link1), 'html.parser')

# When the download fails (getHTML then returns "") or the page has no
# element with id="threads", treat it as "no threads" instead of crashing
# on None.
ol = soup.find(id="threads")
li_list = ol.find_all("li", {"class": "threadbit"}) if ol is not None else []

# NOTE(review): these lists are never appended to anywhere in this script;
# they are kept only so the module-level names remain available.
titles = []
links = []
authors = []
pubdates = []
responses = []
views = []

# Compiled once here rather than on every loop iteration.
_AUTHOR_DATE_RE = re.compile(r"Iniciado por (.+), el (.+)")
_VIEWS_RE = re.compile(r"Visitas: (.+)")

for li in li_list:
    # Defaults used when a piece of metadata is missing or unparseable.
    fecha = ""
    autor = ""
    visitas = ""
    respuestas = ""

    # Title and link both come from the same anchor in the thread heading.
    a = li.find("h3", {"class": "threadtitle"}).find('a')
    titulo = a.string
    linkd = "https://foros.derecho.com/" + a['href']

    # The anchor's title attribute is expected to read
    # "Iniciado por <author>, el <date>".
    div_threadmeta = li.find("div", {"class": "threadmeta"})
    label = div_threadmeta.find("span", {"class": "label"})
    meta = label.find('a')['title']
    author_and_date = _AUTHOR_DATE_RE.match(meta)
    if author_and_date is not None:
        autor = author_and_date.group(1)
        fecha = author_and_date.group(2)

    # First stats item holds the reply count inside a link; the second holds
    # the text "Visitas: <n>". Any further items are ignored, so they cannot
    # overwrite the values already read.
    ul_stats = li.find("ul", {"class": "threadstats td alt"})
    stats = ul_stats.find_all('li') if ul_stats is not None else []
    if len(stats) > 0:
        replies_link = stats[0].find('a')
        if replies_link is not None:
            respuestas = replies_link.string
    if len(stats) > 1:
        # .string is None when the item has nested markup; fall back to "".
        views_match = _VIEWS_RE.match(stats[1].string or "")
        if views_match is not None:
            visitas = views_match.group(1)

    post = Post(titulo, linkd, fecha, autor, respuestas, visitas)
    print(post)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement