Advertisement
Guest User

Untitled

a guest
Aug 19th, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.07 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import os
  3. import codecs
  4. import csv
  5. import re
  6. from collections import namedtuple
  7. from bs4 import BeautifulSoup
  8.  
  9.  
# Will walk that directory and grab all .html file
in_dir = 'html/'
# csv files will be written into this directory
out_dir = 'csv/'

#######################
# ~ HERE BE DRAGONS ~ #
#######################
# Bound .match of a compiled pattern: matches page titles containing a phrase
# like "Week Ending 3rd Aug 2019" and captures that phrase in group 1
# (parse_html uses it to build the output csv filename).
reMatchTitle = re.compile(r'.*(Week Ending [0-9]{1,2}(?:st|nd|rd|th)\s[a-zA-Z]{3}\s[0-9]{4}).*').match
# One parsed chart page: output csv filename, list of column headers, list of rows.
CsvData = namedtuple('CsvData', 'filename headers data')
  20.  
  21.  
  22. def parse_html(htmlfile):
  23. with codecs.open(htmlfile, 'r', 'iso-8859-1') as hfile:
  24. html = hfile.read()
  25.  
  26. soup = BeautifulSoup(html, 'html.parser')
  27. title = soup.find('title').text
  28. tm = reMatchTitle(title)
  29. if tm:
  30. # assuming the filename is 12345.COUNTRY.html
  31. # we want 12345_COUNTRY_{title}.csv
  32. _, filename = os.path.split(htmlfile)
  33. csv_fn = filename.replace('html', '{}').replace('.','_')
  34. csv_fn = csv_fn.format(tm.group(1).replace(' ', '_')) + '.csv'
  35. else:
  36. print(f"WARNING: regex couldn't match string: {title}")
  37. exit()
  38.  
  39. table = soup.find('table', class_='chart')
  40. headers = [header.text for header in table.find_all('th')]
  41.  
  42. rows = []
  43. for row in table.find_all('tr'):
  44. # Ignore embedded tables tr entries
  45. if len(row.find_parents("table")) == 2:
  46. continue
  47. rows.append([col.text.strip() for col in row.find_all('td', recursive=False)])
  48.  
  49. return CsvData(csv_fn, headers, rows)
  50.  
  51.  
  52. def write_csv(csv_data):
  53. fn = os.path.abspath(os.path.join(out_dir, csv_data.filename))
  54. with open(fn, 'w') as f:
  55. writer = csv.writer(f)
  56. writer.writerow(csv_data.headers)
  57. writer.writerows(row for row in csv_data.data if row)
  58.  
  59.  
  60. if __name__ == '__main__':
  61. for root, dirs, files in os.walk(os.path.abspath(in_dir)):
  62. print(root)
  63. for file in files:
  64. if os.path.splitext(file)[1] == '.html':
  65. print(f"Processing {file}...", end='')
  66. write_csv(parse_html(os.path.join(root, file)))
  67. print("\t\tDONE")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement