Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
from sys import argv

from bs4 import BeautifulSoup
# CSS selectors for elements that are removed from each parsed document
# (via ``soup.select(...)`` + ``tag.decompose()``) before text extraction.
DECOMPOSE_TAGS = (
    'div.menu-panel',
    'head',
    'li',
    'p.month-name',
    'table.days-of-month',
    'div.comment',
    'p.source',
    'div.notes',
    'span',
    'script',
)
def extract_text(input_file, output_file):
    """Append the text of an HTML file, minus unwanted elements, to a file.

    The document is parsed with BeautifulSoup's ``html.parser``. Every
    element matching a selector in ``DECOMPOSE_TAGS`` is removed from the
    tree, and the remaining text is appended to ``output_file`` followed by
    a newline.

    Args:
        input_file: Path of the HTML file to read.
        output_file: Path of the file to append the extracted text to. It is
            opened in append mode, so repeated calls accumulate output.
    """
    # Parse inside a ``with`` so the input handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(input_file) as html:
        soup = BeautifulSoup(html, "html.parser")

    for selector in DECOMPOSE_TAGS:
        for tag in soup.select(selector):
            tag.decompose()

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(output_file, 'a') as f:
        print(soup.get_text(), file=f)
if __name__ == '__main__':
    # Usage: script.py INPUT_DIRECTORY OUTPUT_FILE
    # Fail with a readable message rather than a bare IndexError when the
    # required arguments are missing.
    if len(argv) < 3:
        raise SystemExit('usage: {} INPUT_DIRECTORY OUTPUT_FILE'.format(argv[0]))

    input_directory = argv[1]
    output_file = argv[2]

    # os.listdir() returns entries in arbitrary order; sort them so the
    # text is appended to the output file in a reproducible sequence.
    for file_name in sorted(os.listdir(input_directory)):
        if file_name.endswith('.html'):
            file_path = os.path.join(input_directory, file_name)
            extract_text(file_path, output_file)
Add Comment
Please, Sign In to add comment