Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Fri Apr 26 11:47:29 2019
- @author: fredr
- """
- #import re
- from urllib.request import urlopen
- from bs4 import BeautifulSoup, Comment
# Scrape a TIME article and rebuild a stripped-down HTML page from it.
#
# Steps:
#   1. Download the article and parse it.
#   2. Extract the keywords meta tag, the page title, the first HTML comment
#      and the <p> elements.
#   3. Write an empty HTML skeleton to got.html and parse it back.
#   4. Populate the skeleton and write the result to got_clean.html.
#
# NOTE(review): files are opened with the platform default text encoding,
# exactly as before; pass encoding=... explicitly if the output has to be
# portable across machines.

url = 'http://time.com/5561872/game-of-thrones-show/'

# Download the page. The context manager closes the HTTP response as soon as
# the body has been read (previously the connection was never closed).
with urlopen(url) as conn:
    html = conn.read()
soup = BeautifulSoup(html, features="lxml")

# Pieces lifted from the article.
keywords = soup.head.find("meta", {"name": "keywords"})['content']
headline = soup.title.text
# First HTML comment found anywhere in the document.
unicorn = soup.find(string=lambda text: isinstance(text, Comment))
paragraphs = soup.find_all("p")
# Drop the final <p>; presumably it is not article text -- TODO confirm.
del paragraphs[-1]

htmlstring = '''
<!DOCTYPE html>
<html>
<head>
</head>
<body>
</body>
</html>
'''

# Write the empty skeleton to disk, then parse it back as the document that
# gets populated below.
with open("got.html", "w") as f:
    f.write(htmlstring)
with open("got.html") as inf:
    txt = inf.read()
soup2 = BeautifulSoup(txt, features="lxml")


def _css(rule):
    """Return a new <style type="text/css"> tag of soup2 containing *rule*."""
    style_tag = soup2.new_tag('style', type='text/css')
    style_tag.append(rule)
    return style_tag


# Text of the first three remaining paragraphs (IndexError if fewer exist).
paragraph_texts = [paragraphs[i].text for i in range(3)]

# Headline: an underlined <h1> placed as the first child of <head>.
new_h1 = soup2.new_tag('h1')
new_h1.string = headline
soup2.html.head.insert(0, new_h1)

# "Keywords: <span>...</span>" paragraph as the first child of <body>.
new_keywordsBase = soup2.new_tag('p')
new_keywordsBase.string = "Keywords: "
new_span = soup2.new_tag('span')
new_span.string = keywords
soup2.html.body.insert(0, new_keywordsBase)
new_keywordsBase.append(new_span)

# Each element carries its own <style> child, appended after its text.
new_h1.append(_css('h1 {text-decoration: underline;}'))
new_span.append(_css('span {color:red;}'))

# Bulleted list holding the three paragraphs, placed right after the
# keywords paragraph.
ul = soup2.new_tag('ul')
soup2.html.body.insert(1, ul)
ul.append(_css('li {list-style-type:square;}'))
for position, text in enumerate(paragraph_texts):
    item = soup2.new_tag('li')
    # Insert by index so the items land ahead of the <style> child.
    ul.insert(position, item)
    new_p = soup2.new_tag('p')
    new_p.string = text
    item.append(new_p)

# Carry the scraped comment over into the new document's <head>.
soup2.head.insert(-1, unicorn)

# The `with` block closes the file; no explicit close() is needed.
with open('got_clean.html', 'w') as outf:
    outf.write(str(soup2))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement