Guest User

Untitled

a guest
May 20th, 2018
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.83 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import os
  3. from sys import argv
  4.  
  5. DECOMPOSE_TAGS = (
  6. 'div.menu-panel',
  7. 'head',
  8. 'li',
  9. 'p.month-name',
  10. 'table.days-of-month',
  11. 'div.comment',
  12. 'p.source',
  13. 'div.notes',
  14. 'span',
  15. 'script'
  16. )
  17.  
  18. def extract_text(input_file, output_file):
  19. soup = BeautifulSoup(open(input_file),"html.parser")
  20.  
  21. for selector in DECOMPOSE_TAGS:
  22. for tag in soup.select(selector):
  23. tag.decompose()
  24.  
  25. with open(output_file, 'a') as f:
  26. print(soup.get_text(), file=f)
  27. f.close()
  28.  
  29. if __name__ == '__main__:
  30. input_directory = argv[1]
  31. output_file = argv[2]
  32. for file in os.listdir(input_directory):
  33. if file.endswith('.html'):
  34. file_path = os.path.join(intput_directory, file)
  35. extract_text(input_file, output_file)
Add Comment
Please, Sign In to add comment