Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import csv
- from bs4 import BeautifulSoup
- import os
# Open the output CSV and write the header row.
# newline='' is required by the csv module: without it every record is
# followed by a blank line on Windows.
# NOTE: the handle is intentionally module-level (the joke loop below writes
# to it); it is closed implicitly when the interpreter exits.
f = csv.writer(open('kid_jokes.csv', 'w', newline=''))
f.writerow(['Q', 'A'])
# Build the full list of page URLs up front. The site numbers its kids-joke
# pages kani07.html .. kani90.html, so zero-pad the counter to two digits.
pages = [f'http://www.ahajokes.com/kani{n:02d}.html' for n in range(7, 91)]
# Visit every collected page: write each "Q:"/"A:" joke line to the CSV,
# then download every image the page references.
for item in pages:
    # timeout so one unresponsive server cannot hang the whole scrape
    page = requests.get(item, timeout=30)
    if page:  # requests.Response is truthy only for status codes < 400
        soup = BeautifulSoup(page.text, 'html.parser')

        # --- jokes: the rendered page text carries "Q: ..." / "A: ..." lines ---
        for line in soup.text.split('\n'):
            if line.startswith("Q:"):
                print(line)
                f.writerow([line, ""])
            if line.startswith("A:"):
                print(line)
                f.writerow(["", line])

        # --- images: save every <img> on the page to the local folder ---
        for link in soup.find_all('img'):
            image = link.get("src")
            print(image)
            if not image:
                # <img> tag without a src attribute -> nothing to download
                # (link.get returns None here; the startswith below would crash)
                continue
            # relative links need the site's base URL prepended
            if image.startswith("http"):
                image_url = image
            else:
                image_url = 'http://www.ahajokes.com/' + image
            r2 = requests.get(image_url, timeout=30)
            # last path component of the URL becomes the local filename
            image_name = os.path.split(image_url)[1]
            with open(image_name, "wb") as im:
                im.write(r2.content)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement