Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import os
- import codecs
- import csv
- import re
- from collections import namedtuple
- from bs4 import BeautifulSoup
# Will walk that directory and grab all .html file
in_dir = 'html/'
# csv files will be written into this directory
out_dir = 'csv/'
#######################
# ~ HERE BE DRAGONS ~ #
#######################
# Bound .match of a compiled pattern that captures a date string of the
# form "Week Ending 21st Jan 2018" out of a page <title>.
reMatchTitle = re.compile(r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*').match
# One parsed chart page: output csv filename, header row, and data rows.
CsvData = namedtuple('CsvData', 'filename headers data')
def parse_html(htmlfile):
    """Parse one chart page into a CsvData record.

    Reads *htmlfile* (iso-8859-1 encoded), extracts the "Week Ending ..."
    date from the <title>, and collects the header and data rows of the
    <table class="chart">.

    Assumes the input filename is 12345.COUNTRY.html and produces the
    output name 12345_COUNTRY_Week_Ending_<date>.csv.

    Raises ValueError when the <title> does not contain a recognisable
    "Week Ending ..." date.
    """
    with codecs.open(htmlfile, 'r', 'iso-8859-1') as hfile:
        soup = BeautifulSoup(hfile.read(), 'html.parser')

    # <title> may be absent; treat that the same as a non-matching title.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    tm = reMatchTitle(title)
    if not tm:
        # Raise instead of print+exit(): callers get a traceback and the
        # offending title, and the function stays usable as a library.
        raise ValueError(f"regex couldn't match string: {title}")

    # Build the csv name from the path stem with splitext/basename rather
    # than str.replace('html', ...): replace() would also clobber "html"
    # occurring inside the stem itself (e.g. "html5.UK.html").
    stem = os.path.splitext(os.path.basename(htmlfile))[0]  # "12345.COUNTRY"
    csv_fn = stem.replace('.', '_') + '_' + tm.group(1).replace(' ', '_') + '.csv'

    table = soup.find('table', class_='chart')
    headers = [header.text for header in table.find_all('th')]
    rows = []
    for row in table.find_all('tr'):
        # Ignore <tr> entries that belong to tables nested inside the chart.
        if len(row.find_parents("table")) == 2:
            continue
        rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])
    return CsvData(csv_fn, headers, rows)
def write_csv(csv_data):
    """Write a CsvData record as ``out_dir/<csv_data.filename>``.

    Writes the header row followed by the data rows; empty rows (left
    over from nested-table <tr> entries) are dropped.
    """
    fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
    # newline='' is required by the csv module; without it Windows output
    # gets doubled line endings (blank rows between records).
    # Explicit utf-8 avoids locale-dependent encode errors, since the
    # parsed data originated as iso-8859-1 text and may be non-ASCII.
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(csv_data.headers)
        writer.writerows(row for row in csv_data.data if row)
if __name__ == '__main__':
    # Walk the whole input tree and convert every .html file we find.
    for root, _dirs, filenames in os.walk(os.path.abspath(in_dir)):
        print(root)
        html_names = (name for name in filenames
                      if os.path.splitext(name)[1] == '.html')
        for name in html_names:
            print(f"Processing {name}...", end='')
            write_csv(parse_html(os.path.join(root, name)))
            print("\t\tDONE")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement