Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- import matplotlib.style as sty
- import os
- import re
- import pandas as pd
- import numpy as np
- import matplotlib
- from sys import exit
- import csv
# An easy way to get all of the imports installed is to install Anaconda (especially helpful for Windows users).
class BabyNameAnalyzer:
    """Analyze SSA baby name HTML files -- use with a web scraper for maximum effectiveness.

    Attributes:
        namedict: maps a year string (sliced out of the file name) to the list
            of 3-tuples of strings matched in that year's HTML file.
        boyrankings: maps a name to a list of ``(year, rank)`` tuples.
        girlrankings: same layout as ``boyrankings``.
    """

    # One table row: a 1-4 digit number followed by two capitalized names.
    # NOTE(review): the columns are assumed to be rank, boy name, girl name,
    # matching the SSA popularity tables -- confirm against the real files.
    _ROW_PATTERN = re.compile(
        r'<tr align="right">\s*<td>(\d{1,4})<\/td>\s*'
        r'<td>([A-Z][a-z]+?)<\/td>\s*<td>([A-Z][a-z]+?)<\/td>'
    )
    _CSV_FIELDS = ['name', 'year', 'rank']
    _CSV_BY_GENDER = {'boy': "boybabynames.csv", 'girl': "girlbabynames.csv"}

    def __init__(self, directory=None):
        """Select the ggplot style, then load and organize the names.

        Args:
            directory: folder to scan for HTML files. Defaults to the folder
                containing this script.
        """
        sty.use('ggplot')
        self.namedict = {}
        self.boyrankings = {}
        self.girlrankings = {}
        self.extractNames(directory)
        self.organizeNames()

    def extractNames(self, directory=None):
        """Extract the names from the Python Google class html files.

        Walks ``directory`` (by default the folder containing this script, so
        run the program from that directory) and stores every matched row in
        ``self.namedict`` under the year taken from the file name.

        Args:
            directory: folder to walk recursively, or None for the script's
                own folder.
        """
        if directory is None:
            directory = os.path.dirname(os.path.realpath(__file__))
        for root, _dirs, files in os.walk(directory):
            for filename in files:
                # Only HTML files contribute; previously every file in the
                # tree created a (junk) year entry before this check.
                if not filename.endswith(".html"):
                    continue
                # Drops the first four characters and the ".html" suffix;
                # presumably the files are named like "baby1990.html" --
                # verify against the scraper's output.
                year = filename[4:-5]
                with open(os.path.join(root, filename)) as html_file:
                    html_text = html_file.read()
                # A later file with the same year replaces the earlier one.
                self.namedict[year] = self._ROW_PATTERN.findall(html_text)

    def organizeNames(self):
        """Rebuild ``boyrankings`` and ``girlrankings`` from ``namedict``.

        Each name maps to a list of ``(year, rank)`` tuples, which is the
        layout ``nameDumpCSV`` reads. Year and rank are kept as the strings
        that were extracted.

        NOTE(review): this method was called by ``__init__`` but missing from
        the class; it is reconstructed from how ``nameDumpCSV`` consumes the
        ranking dictionaries -- confirm it matches the original intent.
        """
        self.boyrankings = {}
        self.girlrankings = {}
        for year, rows in self.namedict.items():
            for rank, boy, girl in rows:
                self.boyrankings.setdefault(boy, []).append((year, rank))
                self.girlrankings.setdefault(girl, []).append((year, rank))

    def nameDumpCSV(self):
        """Organize the names into CSV for both girls and boys.

        Writes "boybabynames.csv" and "girlbabynames.csv" into the current
        working directory, one ``name,year,rank`` row per ranking entry.
        """
        self._writeRankingsCSV(self._CSV_BY_GENDER['boy'], self.boyrankings)
        self._writeRankingsCSV(self._CSV_BY_GENDER['girl'], self.girlrankings)

    @classmethod
    def _writeRankingsCSV(cls, path, rankings):
        """Write one ranking dictionary (name -> entries) to ``path`` as CSV."""
        # The csv module needs a text-mode file opened with newline="" on
        # Python 3; the old "wb" mode makes the writer raise TypeError.
        with open(path, "w", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=cls._CSV_FIELDS)
            writer.writeheader()
            for name, entries in rankings.items():
                for entry in entries:
                    writer.writerow({'name': name, 'year': entry[0], 'rank': entry[1]})

    def trendName(self, name=None, gender=None, csv_path=None):
        """Given a gender and name create a matplotlib line graph mapping the rankings over the years.

        Called with no arguments this keeps the earlier behavior: it reads
        'babynames.csv' from the working directory (no method of this class
        writes that file, so it must already exist), uses its second column as
        the index and plots the frame.

        Args:
            name: if given, only rows whose 'name' column equals this value
                are plotted, ordered by the index.
            gender: 'boy' or 'girl'; selects the matching file written by
                ``nameDumpCSV`` when ``csv_path`` is not given.
            csv_path: explicit CSV file to read; overrides ``gender``.

        Raises:
            ValueError: if ``gender`` is not 'boy' or 'girl', or if ``name``
                has no rows in the file.
            KeyError: if ``name`` is given but the file has no 'name' column.
        """
        if csv_path is None:
            if gender is None:
                csv_path = 'babynames.csv'
            else:
                try:
                    csv_path = self._CSV_BY_GENDER[gender.lower()]
                except KeyError:
                    raise ValueError("gender must be 'boy' or 'girl', got %r" % (gender,))
        namedf = pd.read_csv(csv_path, index_col=1)
        if name is not None:
            # Sort so the line runs left to right instead of following the
            # row order of the file.
            namedf = namedf[namedf['name'] == name].sort_index()
            if namedf.empty:
                raise ValueError("no rows for name %r in %s" % (name, csv_path))
        namedf.plot()
        plt.show()
def main():
    """Construct a BabyNameAnalyzer and show its trend plot."""
    analyzer = BabyNameAnalyzer()
    # The CSV export (analyzer.nameDumpCSV()) is intentionally not run here.
    analyzer.trendName()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement