Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import matplotlib.pyplot as plt
- import matplotlib.style as sty
- import os
- import re
- import pandas as pd
- import numpy as np
- import matplotlib
- from sys import exit
- #an easy way to get all the imports installed is by just installing Anaconda (especially true for windows users)
class BabyNameAnalyzer:
    """Analyze SSA baby name HTML files.

    Use with a web scraper for maximum effectiveness; get it here:
    http://pastebin.com/9HsU767u

    Attributes:
        namedict: maps a year string (taken from the file name) to the list
            of ``(rank, boy_name, girl_name)`` string tuples found in that
            year's HTML file.
        boyrankings: maps a boy name to a list of ``(year, rank)`` int tuples.
        girlrankings: maps a girl name to a list of ``(year, rank)`` int
            tuples.
    """

    # One table row: rank, boy name, girl name. Compiled once for all files.
    _ROW_PATTERN = re.compile(r'<tr align="right">\s*<td>(\d{1,4})<\/td>\s*<td>([A-Z][a-z]+?)<\/td>\s*<td>([A-Z][a-z]+?)<\/td>')

    def __init__(self, directory=None):
        """Load and index every name file found under *directory*.

        Args:
            directory: root folder to scan for ``.html`` files. Defaults to
                the folder containing this script.
        """
        sty.use('ggplot')
        self.namedict = {}
        self.boyrankings = {}
        self.girlrankings = {}
        self.extractNames(directory)
        self.organizeNames()

    def extractNames(self, directory=None):
        """Extract the names from the Python Google class html files.

        Walks *directory* recursively and parses every ``.html`` file. The
        year is read from the file name by dropping the first four characters
        and the ``.html`` suffix (e.g. ``baby1990.html`` -> ``"1990"``).
        Files that are not HTML, or whose name does not yield a numeric
        year, are skipped so they can neither add junk keys nor clear a year
        that was already parsed.

        Args:
            directory: root folder to scan. Defaults to the folder containing
                this script.
        """
        if directory is None:
            directory = os.path.dirname(os.path.realpath(__file__))

        for root, _dirs, files in os.walk(directory):
            for filename in files:
                if not filename.endswith(".html"):
                    continue
                year = filename[4:-5]
                try:
                    # organizeNames() converts the key with int(); reject
                    # anything that would fail there.
                    int(year)
                except ValueError:
                    continue
                with open(os.path.join(root, filename)) as html_file:
                    html_text = html_file.read()
                self.namedict[year] = self._ROW_PATTERN.findall(html_text)

    def organizeNames(self):
        """Organize the names into dictionaries for both girls and boys.

        Every ``(rank, boy, girl)`` row in ``namedict`` adds one
        ``(year, rank)`` point to ``boyrankings[boy]`` and one to
        ``girlrankings[girl]``.
        """
        for year, rows in self.namedict.items():
            if not rows:
                continue
            year_number = int(year)
            for rank, boy, girl in rows:
                point = (year_number, int(rank))
                self.boyrankings.setdefault(boy, []).append(point)
                self.girlrankings.setdefault(girl, []).append(point)

    def plotName(self, gender, name):
        """Plot a name's ranking over the years and save it as ``<name>.png``.

        Args:
            gender: ``"boy"`` or ``"girl"``. Any other value is ignored and
                nothing is plotted.
            name: the name to look up in the rankings for that gender.

        Raises:
            KeyError: if *name* has no rankings for the given gender.
        """
        if gender == "boy":
            rankings = self.boyrankings
        elif gender == "girl":
            rankings = self.girlrankings
        else:
            return

        # sorted() leaves the stored list untouched.
        points = sorted(rankings[name], key=lambda point: point[0])
        df = pd.DataFrame(points, columns=['Year', 'Rank'])
        axes = df.plot(
            figsize=(20, 6),
            grid=True,
            kind='line',
            x='Year',
            y='Rank',
            title="Rankings for {}".format(name),
            xticks=[x for x in range(1880, 2016) if x % 4 == 0],
        )
        # Rank 1 is the best rank, so draw it at the top of the chart.
        axes.invert_yaxis()
        figure = axes.get_figure()
        figure.savefig("{}.png".format(name))
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(figure)
def main(argv=None):
    """Plot the rankings of one name given on the command line.

    Expects two arguments: the gender (``boy`` or ``girl``) and the name.

    Args:
        argv: argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success, 1 if the name has no rankings, 2 on bad arguments.
    """
    import sys  # local import: the file itself only imports `exit` from sys

    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2 or args[0] not in ("boy", "girl"):
        sys.stderr.write("usage: boy|girl NAME\n")
        return 2

    gender, name = args
    analyzer = BabyNameAnalyzer()
    try:
        analyzer.plotName(gender, name)
    except KeyError:
        sys.stderr.write("no {} rankings found for {!r}\n".format(gender, name))
        return 1
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement