from BeautifulSoup import BeautifulSoup as BS
from datetime import datetime
from fnmatch import fnmatch
import numpy as np
import os
def get_files(path):
    '''
    List the entries of a directory.

    path -- directory to list.

    Returns the list of entry names produced by os.listdir(path); the names
    are not joined with path.
    '''
    return os.listdir(path)
def file_subset(files, match_string):
    '''
    Select the filenames that match a shell-style wildcard pattern.

    files -- list of filename strings.
    match_string -- pattern handed to fnmatch (e.g. '20090910*.txt').

    Returns a new list holding the matching names in their original order.
    '''
    return [name for name in files if fnmatch(name, match_string)]
def get_soup(filename):
    '''
    Parse the html stored in a text file with BeautifulSoup.

    filename -- path of the file to read. Its base name must contain at
                least one underscore; the second '_'-separated field is
                used as the table number.

    Returns a dictionary with the keys:
      'soup'       -- result of soup.findAll('table')
      'num_tables' -- number of tables found
      'title'      -- .string of the first <h4> element
      'datestring' -- .string of the first <caption> element
      'table_no'   -- second '_'-separated field of the base filename (str)

    Raises IndexError if the base filename contains no underscore.
    '''
    # Read inside a context manager so the file handle is always closed.
    with open(filename) as html_file:
        html = html_file.read()
    soup = BS(html)
    tables = soup.findAll('table')
    # NOTE(review): presumably find() returns None when the tag is missing,
    # in which case .string raises AttributeError -- confirm against inputs.
    title = soup.find('h4').string
    datestring = soup.find('caption').string
    # Split only the base name so that underscores in directory names cannot
    # shift which field is picked up as the table number.
    table_no = os.path.basename(filename).split('_')[1]
    return {
        'soup': tables,
        'num_tables': len(tables),
        'title': title,
        'datestring': datestring,
        'table_no': table_no,
    }
def table2array(td_soup):
    '''
    Flatten a sequence of <td> elements into cleaned-up cell text.

    td_soup -- iterable of elements that expose a .string attribute.

    Returns a tuple (clean_data_array, raw_data):
      clean_data_array -- 1-D numpy array with one entry per cell; blank
                          cells become 'NA', all others are stripped of
                          surrounding whitespace.
      raw_data         -- list of the untouched .string values of the cells.
    '''
    blanks = [None, u' ', ' ']
    raw_data = [cell.string for cell in td_soup]
    clean_data = []
    for text in raw_data:
        # A cell counts as blank when either the value itself or its own
        # .string is one of the blank markers (short-circuits for None).
        is_blank = text in blanks or text.string in blanks
        clean_data.append('NA' if is_blank else text.strip())
    return (np.array(clean_data), raw_data)
def reformat_array(data, table_no):
    '''
    Reshape the flattened cell data from table2array into a 2-D array that
    mimics (approximately) the shape of the source html table.

    data     -- tuple (clean_data_array, raw_data) as returned by
                table2array: a 1-D numpy array of strings in which blank
                cells are 'NA', and the list of raw cell values in which
                blank cells are None.
    table_no -- table number as a string; it selects where the data starts
                and how many columns the target gets.

    Returns the reshaped numpy array on success. If the cells cannot be
    arranged into whole rows (including a column count of zero), returns
    the tuple ('Table <table_no>', exception) instead of raising.

    Raises ValueError if raw_data contains no None, and IndexError if there
    are too few non-'NA' cells to locate the start of the data.
    '''
    clean_data_array, raw_data = data
    # Positions of the cells that hold data; where() returns a tuple so [0].
    data_index = np.where(clean_data_array != 'NA')[0]
    # Number of cells that follow the last None in raw_data.
    number_columns = raw_data[::-1].index(None)
    total_columns = number_columns + 1
    if table_no in ['1', '9']:
        # Start one cell past the second data cell and drop every 'NA'.
        first_num = data_index[1] + 1
        temp_array = clean_data_array[first_num:]
        NA_index = np.where(temp_array == 'NA')[0]
        target = np.delete(temp_array, NA_index)
        cols = number_columns
    elif table_no in ['6', '7', '8', '14', '15', '16', '17', '18', '19']:
        # Start at the first data cell.
        target = clean_data_array[data_index[0]:]
        cols = total_columns
    else:
        # Start at the second data cell.
        target = clean_data_array[data_index[1]:]
        cols = total_columns
    try:
        # Shape dimensions must be integers: floor-divide for the row count
        # and pass cols as an int. A remainder makes reshape fail, which is
        # reported through the error tuple below, as is cols == 0.
        target = target.reshape(len(target) // cols, int(cols))
    except Exception as e:
        return ('Table %s' % table_no, e)
    return target
def inspect_target(filename, path='/Volumes/Drobo/Data/DTCC/Raw/'):
    '''
    Parse one file, print the reformatted array of every table in it, and
    return the table2array result of the last table.

    filename -- name of the file inside path.
    path     -- directory that holds the file (defaults to the raw data
                directory this script was written for).

    Returns the (clean_data_array, raw_data) tuple of the last table, or
    None when the file contains no tables.
    '''
    page = get_soup(os.path.join(path, filename))
    table_no = page['table_no']
    data = None  # stays None when the page has no tables
    for table in page['soup']:
        td_soup = table.findAll('td')
        data = table2array(td_soup)
        target = reformat_array(data, table_no)
        # Single-argument call form prints the same under Python 2 and 3.
        print(target)
    return data
def main(path='/Volumes/Drobo/Data/DTCC/Raw/', match_string='20090910*.txt'):
    '''
    Convert every table of every matching file into a numpy array.

    path         -- directory that holds the raw files.
    match_string -- wildcard pattern selecting the files to process.

    Returns the list of error tuples collected from reformat_array if any
    table failed. Otherwise returns a dictionary that maps each file's
    table number to the list of arrays built from that file, plus the key
    'Titles' holding the list of page titles in processing order.

    NOTE: files that share a table number overwrite each other's entry.
    '''
    all_files = get_files(path)
    subset_files = file_subset(all_files, match_string)
    errors = []
    titles = []
    target_dict = {}
    for f in subset_files:
        page = get_soup(os.path.join(path, f))  # get_soup() returns a dict
        table_no = page['table_no']
        targets = []
        for table in page['soup']:
            td_soup = table.findAll('td')
            target = reformat_array(table2array(td_soup), table_no)
            # reformat_array signals failure by returning a tuple.
            if isinstance(target, tuple):
                errors.append(target)
            else:
                targets.append(target)
        target_dict[table_no] = targets  # most tables are list of 1!
        titles.append(page['title'])
    target_dict['Titles'] = titles
    if errors:
        return errors
    return target_dict
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    targets = main()
'''
To Do:
- delete TOTAL rows and columns
- insert dates into table arrays
- insert headers for csv once array has been modified
- write to csv files
'''