Advertisement
Guest User

LachschonCrawler v2

a guest
Jan 28th, 2016
432
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.10 KB | None | 0 0
  1. #!/usr/bin/env/python
  2.  
  3. import urllib2
  4. from bs4 import BeautifulSoup
  5.  
  6. print "Meddl Loide! Servus und herzlich willkommen beim automatischen Lachschon-Crawler\n"
  7.  
  8. response = urllib2.urlopen('http://www.lachschon.de/forum/thread/show/50195/?page=1')
  9. soup = BeautifulSoup(response, 'html.parser')
  10. soup.find_all("div", class_="pageselection top")
  11. pagecountmax = soup.find_all("a", class_="link")[-1].getText()
  12. print "Etzala "+pagecountmax+" Seiten voll"
  13.  
  14. for i in range(1, int(pagecountmax)):
  15. response = urllib2.urlopen('http://www.lachschon.de/forum/thread/show/50195/?page='+str(i))
  16. soup = BeautifulSoup(response, 'html.parser')
  17. content = soup.find_all(id="post-list")
  18.  
  19. print("Speichere Seite "+str(i)+"...")
  20. savefile = open(str(i)+".html", "w")
  21. savefile.write('<meta charset="UTF-8" /><style>li{border: solid 2px black;list-style-type: none;}</style>')
  22. savefile.write(str(unicode.join(u'\n',map(unicode,content))))
  23. savefile.write('<br> <h1><a href="'+str(i-1)+'.html">Zurueck</a> <a href="'+str(i+1)+'.html">Vorwaerts</a></h1>')
  24. savefile.close()
  25. i = i+1;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement