Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import os
- import codecs
- import csv
- import re
- from collections import namedtuple
- from bs4 import BeautifulSoup
# Will walk that directory and grab all .html file
in_dir = 'html/'
# csv files will be written into this directory
out_dir = 'csv/'
#######################
# ~ HERE BE DRAGONS ~ #
#######################
# Bound .match of a compiled pattern that captures a date string of the
# form "Week Ending 21st Jan 2018" out of a page <title>.
reMatchTitle = re.compile(r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*').match
# One parsed chart page: output csv filename, header row, and data rows.
CsvData = namedtuple('CsvData', 'filename headers data')
def parse_html(htmlfile):
    """Parse one chart page into a CsvData record.

    Reads *htmlfile* (iso-8859-1 encoded), extracts the "Week Ending ..."
    date from the <title>, and collects the header and data rows of the
    <table class="chart">.

    Assumes the input filename is 12345.COUNTRY.html and produces the
    output name 12345_COUNTRY_Week_Ending_<date>.csv.

    Raises ValueError when the <title> does not contain a recognisable
    "Week Ending ..." date.
    """
    with codecs.open(htmlfile, 'r', 'iso-8859-1') as hfile:
        soup = BeautifulSoup(hfile.read(), 'html.parser')

    # <title> may be absent; treat that the same as a non-matching title.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    tm = reMatchTitle(title)
    if not tm:
        # Raise instead of print+exit(): callers get a traceback and the
        # offending title, and the function stays usable as a library.
        raise ValueError(f"regex couldn't match string: {title}")

    # Build the csv name from the path stem with splitext/basename rather
    # than str.replace('html', ...): replace() would also clobber "html"
    # occurring inside the stem itself (e.g. "html5.UK.html").
    stem = os.path.splitext(os.path.basename(htmlfile))[0]  # "12345.COUNTRY"
    csv_fn = stem.replace('.', '_') + '_' + tm.group(1).replace(' ', '_') + '.csv'

    table = soup.find('table', class_='chart')
    headers = [header.text for header in table.find_all('th')]
    rows = []
    for row in table.find_all('tr'):
        # Ignore <tr> entries that belong to tables nested inside the chart.
        if len(row.find_parents("table")) == 2:
            continue
        rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])
    return CsvData(csv_fn, headers, rows)
def write_csv(csv_data):
    """Write a CsvData record as ``out_dir/<csv_data.filename>``.

    Writes the header row followed by the data rows; empty rows (left
    over from nested-table <tr> entries) are dropped.
    """
    fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
    # newline='' is required by the csv module; without it Windows output
    # gets doubled line endings (blank rows between records).
    # Explicit utf-8 avoids locale-dependent encode errors, since the
    # parsed data originated as iso-8859-1 text and may be non-ASCII.
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(csv_data.headers)
        writer.writerows(row for row in csv_data.data if row)
if __name__ == '__main__':
    # Walk the whole input tree and convert every .html file we find.
    for root, _dirs, filenames in os.walk(os.path.abspath(in_dir)):
        print(root)
        html_names = (name for name in filenames
                      if os.path.splitext(name)[1] == '.html')
        for name in html_names:
            print(f"Processing {name}...", end='')
            write_csv(parse_html(os.path.join(root, name)))
            print("\t\tDONE")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement