Advertisement
TringaliLuca

Extract p and img from HTML page

Dec 27th, 2018
201
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.68 KB | None | 0 0
  1. #!/usr/bin/python3
  2. import re
  3. import os
  4. import sys
  5.  
  6. filename = sys.argv[1]
  7. text_file = open(filename, "r")
  8. text = text_file.read()
  9. text_file.close()
  10.  
  11. regex = r'<p.*?>(.*?)</p>'
  12. blocks = []
  13. for m in re.finditer(regex, text, flags=re.IGNORECASE|re.DOTALL):
  14.     blocks.append(m.group(1))
  15. for block in blocks:
  16.     clean = re.sub(r'<.*?>', "", block, flags=re.DOTALL)
  17.     print(clean)
  18.    
  19. regex = r'<img (.*?)>'
  20. blocks = [m.group(1) for m in re.finditer(regex, text, flags=re.IGNORECASE|re.DOTALL)]
  21. for block in blocks:
  22.     clean = re.sub(r""".*src=["']""", "", block, flags=re.IGNORECASE|re.DOTALL)
  23.     clean = re.sub(r"""["'].*""", "", clean, flags=re.DOTALL)
  24.     print(clean)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement