Advertisement
Guest User

joke scraper

a guest
Aug 2nd, 2018
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.89 KB | None | 0 0
import csv
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


  7. # Create a file to write our data to, add a headers row
  8. f = csv.writer(open('kid_jokes.csv', 'w'))
  9. #f.writerow(['Name', 'Address', 'Town', 'State'])
  10. f.writerow(['Q', 'A'])
  11.  
  12. # Create empty array for your multiple pages to get stored to
  13. pages = []
  14.  
  15. #Run through every page we want to visit - great if you have a numerical list of ordered pages!
  16. for i in range(7, 91):
  17.  
  18.  
  19. #our base url, which we are adding numbers to the end of
  20. url = 'http://www.ahajokes.com/kani' + str(i).zfill(2) + '.html' # + str(i)
  21. pages.append(url)
  22.  
  23. #for each url we have stored
  24. for item in pages:
  25. page = requests.get(item)
  26. if page:
  27. soup = BeautifulSoup(page.text, 'html.parser')
  28.  
  29. ############HTML############
  30. #prints out the full html of the page.
  31. #print (soup)
  32.  
  33. ######PARSING FOR INTERNAL DATA######## - the specifics of this will change for your project!
  34. text_soup = soup.text
  35. split_text_soup = text_soup.split('\n')
  36. #print (split_text_soup)
  37.  
  38. for pea in split_text_soup:
  39. if pea.startswith("Q:"):
  40. print (pea)
  41. f.writerow([pea, ""])
  42. if pea.startswith("A:"):
  43. print (pea)
  44. f.writerow(["",pea])
  45.  
  46. #########IMAGES###########
  47. #downloads every image on the page
  48. for link in soup.find_all('img'):
  49. image = link.get("src")
  50.  
  51. #prints link it found for the image
  52. print(image)
  53.  
  54. #checks if the image is a complete link or not
  55. if image.startswith("http"):
  56. image_url = image
  57.  
  58. #if it is not, please add the base URL of the domain!
  59. else:
  60. baseURL = 'http://www.ahajokes.com/'
  61. image_url = baseURL + image
  62.  
  63. #gets the image offline
  64. r2 = requests.get(image_url)
  65.  
  66. #gets the image name
  67. image_name = os.path.split(image_url)[1]
  68.  
  69. #saves the image to the local folder
  70. with open(image_name, "wb") as im:
  71. im.write(r2.content)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement