Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from selenium import webdriver
- from bs4 import BeautifulSoup
- import pandas as pd
- import re
# Matches either a markup tag or a character reference (named, decimal or hex).
# IGNORECASE lets references such as "&Eacute;" or "&#xA0;" match; DOTALL lets
# the lazy "<.*?>" part match a tag that is broken across several lines.
_TAG_OR_ENTITY_RE = re.compile(
    r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE | re.DOTALL,
)


def cleanhtml(raw_html):
    """Return *raw_html* with markup tags and character references removed.

    Anything between a ``<`` and the next ``>`` is dropped, as is every
    ``&name;``, ``&#123;`` or ``&#x1f;`` style reference. References are
    deleted, not decoded, so ``"a&amp;b"`` becomes ``"ab"``.

    This is a regex heuristic, not an HTML parser: a literal ``<`` followed
    later by a ``>`` in plain text is treated as a tag as well.

    Args:
        raw_html: The string to clean.

    Returns:
        The cleaned string.
    """
    return _TAG_OR_ENTITY_RE.sub('', raw_html)
def main():
    """Load the BBC News front page in Chrome and print five story summaries.

    For the lead story and up to four further stories, prints the headline,
    summary paragraph, timestamp text and image URL, one per line. Each
    further story is preceded by a separator line of slashes.

    The browser is closed as soon as the page source has been captured, and
    is closed even if loading the page raises.

    Raises:
        AttributeError, TypeError, IndexError, KeyError: If the lead story's
            elements, or an image ``src`` attribute, are missing from the
            page (for example because the site's CSS class names changed).
    """
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object - confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=r'E:\Program Files (x86)\Chromedriver\chromedriver.exe')
    try:
        driver.get('https://www.bbc.co.uk/news')
        content = driver.page_source
    finally:
        # page_source is a plain string, so the browser is no longer needed;
        # quitting here guarantees the Chrome process never outlives an error.
        driver.quit()

    soup = BeautifulSoup(content, "html.parser")

    # The lead story uses the larger "paragon" heading class; the remaining
    # stories use the "pica" heading class.
    lead_headline = soup.find('h3', attrs={'class': 'gs-c-promo-heading__title gel-paragon-bold nw-o-link-split__text'})
    headlines = soup.find_all('h3', attrs={'class': 'gs-c-promo-heading__title gel-pica-bold nw-o-link-split__text'})
    summaries = soup.find_all('p', attrs={'class': 'gs-c-promo-summary gel-long-primer gs-u-mt nw-c-promo-summary'})
    timestamps = soup.find_all('span', attrs={'class': 'qa-status-date-output'})
    lead_image_src = soup.find('img', attrs={'class': 'qa-srcset-image'})['src']
    image_srcs = [img['src'] for img in soup.find_all('img', attrs={'class': 'lazyloaded'})]

    separator = "////////////////////////////////////////////////////////"
    extra_stories = 4

    # Index 0 of the summaries and timestamps is deliberately skipped: the
    # lead story reads index 1 and the further stories start at index 2.
    # Presumably element 0 belongs to a different promo - verify on the page.
    print(lead_headline.contents[0])
    print(summaries[1].contents[0])
    print(timestamps[1].contents[0])
    print(lead_image_src)

    # zip() stops at the shortest sequence, so a page with fewer stories than
    # expected prints the ones that are available.
    stories = zip(
        headlines[:extra_stories],
        summaries[2:2 + extra_stories],
        timestamps[2:2 + extra_stories],
        image_srcs[:extra_stories],
    )
    for headline, summary, timestamp, image_src in stories:
        print(separator)
        print(headline.contents[0])
        print(summary.contents[0])
        print(timestamp.contents[0])
        print(image_src)
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement