Advertisement
Guest User

Untitled

a guest
Apr 26th, 2019
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.17 KB | None | 0 0
  # -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:47:29 2019

@author: fredr

Scrape a TIME article and rebuild a trimmed-down copy of it as local HTML.

The script downloads the page at ``url``, extracts the keywords meta tag,
the page title, the first HTML comment and the leading paragraphs, and
writes them into ``got_clean.html``.  The empty page skeleton the result is
built from is also saved as ``got.html``.
"""

from urllib.request import urlopen

from bs4 import BeautifulSoup, Comment

url = 'http://time.com/5561872/game-of-thrones-show/'


def _append_style(doc, tag, css):
    """Append a ``<style type="text/css">`` child containing *css* to *tag*.

    *doc* is the BeautifulSoup document that creates the new tag.
    """
    style = doc.new_tag('style', type='text/css')
    style.append(css)
    tag.append(style)


# Fetch the article; the context manager closes the connection afterwards.
with urlopen(url) as conn:
    html = conn.read()
soup = BeautifulSoup(html, features="lxml")

# Pieces of the source page that are carried over.  A page without a keywords
# meta tag or a <title> yields an empty string instead of an exception.
keywords_meta = soup.head.find("meta", {"name": "keywords"})
keywords = keywords_meta.get("content", "") if keywords_meta is not None else ""
headline = soup.title.text if soup.title is not None else ""
# First HTML comment in the page; None when the page contains no comment.
unicorn = soup.find(string=lambda text: isinstance(text, Comment))
# Every <p> except the last one.  Slicing (unlike ``del paragraphs[-1]``)
# does not fail on a page without paragraphs.
# NOTE(review): the last <p> is presumably page boilerplate -- confirm.
paragraphs = soup.find_all("p")[:-1]

# Skeleton of the output page.  The charset declaration matches the UTF-8
# encoding used when the result is written to disk below.
htmlstring = '''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body>
</body>
</html>
'''

# The skeleton is still saved on its own; the output document is parsed
# straight from the string rather than read back from this file.
with open("got.html", "w", encoding="utf-8") as f:
    f.write(htmlstring)

soup2 = BeautifulSoup(htmlstring, features="lxml")

# NOTE(review): <h1> inside <head> and <style> inside body elements are not
# valid HTML; the structure is kept so the generated page keeps its layout.

# Headline, underlined.
new_h1 = soup2.new_tag('h1')
new_h1.string = headline
soup2.head.insert(0, new_h1)
_append_style(soup2, new_h1, 'h1 {text-decoration: underline;}')

# "Keywords: " paragraph with the keyword list in a red <span>.
keywords_paragraph = soup2.new_tag('p')
keywords_paragraph.string = "Keywords: "
new_span = soup2.new_tag('span')
new_span.string = keywords
keywords_paragraph.append(new_span)
soup2.body.insert(0, keywords_paragraph)
_append_style(soup2, new_span, 'span {color:red;}')

# Square-bulleted list holding the first three paragraphs (fewer when the
# page is shorter); the <style> tag stays the last child of the <ul>.
ul = soup2.new_tag('ul')
soup2.body.insert(1, ul)
for paragraph in paragraphs[:3]:
    li = soup2.new_tag('li')
    new_p = soup2.new_tag('p')
    new_p.string = paragraph.text
    li.append(new_p)
    ul.append(li)
_append_style(soup2, ul, 'li {list-style-type:square;}')

# Carry the first HTML comment of the source page over into <head>, if the
# page has one (inserting None would raise).
if unicorn is not None:
    soup2.head.insert(-1, unicorn)

# Explicit UTF-8 so characters outside the platform's default code page
# cannot make the write fail; the ``with`` block closes the file.
with open('got_clean.html', 'w', encoding="utf-8") as outf:
    outf.write(str(soup2))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement