Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""Point the <img> tags of test.html at a local file and download the image.

Steps performed when the script runs:

1. Parse ``test.html`` with BeautifulSoup.
2. Replace the ``src`` of every ``<img>`` tag with ``newImagePath``.
3. Write the modified markup to ``whatever.html``.
4. Download the last original image URL that was found and save it under
   ``newImagePath`` so the rewritten links resolve.

To process several HTML files or several images, wrap the relevant steps in
a loop and derive the file names from a counter; see
https://stackoverflow.com/questions/1120707/using-python-to-execute-a-command-on-every-file-in-a-folder
"""
from bs4 import BeautifulSoup
import urllib3

# Local file name that every rewritten <img> tag will reference. The
# download at the bottom is saved under this same name; previously it was
# written to 'test.jpg', which left the rewritten HTML pointing at a file
# that was never created.
newImagePath = "image.jpg"

# Original (remote) src of the last image rewritten; stays "" when the page
# contains no usable <img> tag.
imageWebPath = ""

# Read the page as UTF-8 so the input decoding matches the encoding used for
# the output file below. The context manager closes the file even if
# parsing raises.
with open('test.html', encoding='utf-8') as htmlToOpen:
    soup = BeautifulSoup(htmlToOpen, 'html.parser')

# src=True limits the result to <img> tags that actually carry a src
# attribute, so image['src'] below cannot raise KeyError.
# https://stackoverflow.com/questions/43982002/extract-src-attribute-from-img-tag-using-beautifulsoup/47166671
imagesInHtmlFile = soup.find_all('img', src=True)
for image in imagesInHtmlFile:
    # Remember the remote location before overwriting it with the local one.
    imageWebPath = image['src']
    image['src'] = newImagePath

# Write the modified document; mode 'w' creates the file if it is missing.
with open('whatever.html', 'w', encoding='utf-8') as htmlFile:
    htmlFile.write(str(soup))

##################################################################################
# Download the image. Skipped when no image URL was found, because
# requesting an empty URL can only fail.
if imageWebPath:
    # https://urllib3.readthedocs.io/en/latest/
    http = urllib3.PoolManager()
    # With the default preload_content=True the whole body is read into
    # response.data and the connection is returned to the pool.
    response = http.request('GET', imageWebPath)
    if response.status != 200:
        # Do not store an HTTP error page as if it were image data.
        raise RuntimeError(
            "Downloading %s failed with HTTP status %d"
            % (imageWebPath, response.status)
        )
    # The file is opened only after a successful download so that a failed
    # request does not leave an empty image file behind. 'wb' writes bytes.
    with open(newImagePath, 'wb') as imageFile:
        imageFile.write(response.data)
Add Comment
Please, Sign In to add comment