Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Crawl every page of a lachschon.de forum thread and concatenate all
# post lists into a single local HTML file (lachschon.html).
import urllib.request

from bs4 import BeautifulSoup

# Base URL of the thread; the page number is appended per request.
THREAD_URL = 'http://www.lachschon.de/forum/thread/show/50195/?page='


def _fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The HTTP response is closed deterministically via the context manager
    (the original left it to the garbage collector).
    """
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, 'html.parser')


def _page_count(soup):
    """Return the thread's total page count, read from the last pager link."""
    return int(soup.find_all("a", class_="link")[-1].getText())


def main():
    """Fetch all thread pages and write their post lists to lachschon.html."""
    print("Meddl Loide! Servus und herzlich willkommen beim automatischen Lachschon-Crawler\n")

    first_page = _fetch_soup(THREAD_URL + '1')
    pagecountmax = _page_count(first_page)
    print("Etzala " + str(pagecountmax) + " Seiten voll")

    # Explicit UTF-8 to match the <meta charset="UTF-8"> header we emit.
    with open("lachschon.html", "w", encoding="utf-8") as savefile:
        savefile.write('<meta charset="UTF-8" /><style>li{border: solid 2px black;list-style-type: none;}</style>')
        # Upper bound is pagecountmax + 1: the original range(1, pagecountmax)
        # silently skipped the final page (off-by-one).
        for page in range(1, pagecountmax + 1):
            soup = _fetch_soup(THREAD_URL + str(page))
            content = soup.find_all(id="post-list")
            savefile.write('\n'.join(map(str, content)))
            print("Speichere Seite " + str(page) + "...")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement