Advertisement
dc5553

Analyzing Baby names with Pandas

Jan 8th, 2017
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.30 KB | None | 0 0
  1. import matplotlib.pyplot as plt
  2. import matplotlib.style as sty
  3. import os
  4. import re
  5. import pandas as pd
  6. import numpy as np
  7. import matplotlib
  8. from sys import exit
# An easy way to install all of the imported packages is to install Anaconda (especially helpful for Windows users).
  10.  
  11.  
  12. class BabyNameAnalyzer:
  13.     """Anaylze SSA baby name HTML files -- Use with a web scraper for maximum effectiveness get it here http://pastebin.com/9HsU767u"""
  14.    
  15.     def __init__(self):
  16.         sty.use('ggplot')
  17.         self.namedict = {}
  18.         self.boyrankings = {}
  19.         self.girlrankings = {}
  20.         self.extractNames()
  21.         self.organizeNames()
  22.  
  23.     def extractNames(self):
  24.         """Extract the names from the Python Google class html files
  25.        
  26.         (in this example run the program from that directory)
  27.         """
  28.         for root, dirs, files in os.walk(os.path.dirname(os.path.realpath(__file__))):
  29.             patt = re.compile(r'<tr align="right">\s*<td>(\d{1,4})<\/td>\s*<td>([A-Z][a-z]+?)<\/td>\s*<td>([A-Z][a-z]+?)<\/td>')
  30.             for file in files:             
  31.                 year = file[4:-5]
  32.                 self.namedict[year] = []
  33.                 if file[-4:] == "html":
  34.                     myfile = open(os.path.join(root, file))
  35.                     htmlText = myfile.read()
  36.                     found = re.findall(patt,htmlText)
  37.                     myfile.close()
  38.                     [self.namedict[year].append(find) for find in found]
  39.                        
  40.     def organizeNames(self):
  41.         """Organize the names into Python dictionary Objects for both girls and boys"""
  42.         for key in self.namedict.keys():
  43.             for nametuple in self.namedict[key]:
  44.                 if nametuple[1] in self.boyrankings and nametuple[2] in self.girlrankings:
  45.                     self.boyrankings[nametuple[1]].append((int(key), int(nametuple[0])))
  46.                     self.girlrankings[nametuple[2]].append((int(key), int(nametuple[0])))
  47.                 elif nametuple[1] in self.boyrankings and nametuple[2] not in self.girlrankings:
  48.                     self.boyrankings[nametuple[1]].append((int(key), int(nametuple[0])))
  49.                     self.girlrankings[nametuple[2]] = [(int(key), int(nametuple[0])),]
  50.                 elif nametuple[1] not in self.boyrankings and nametuple[2] in self.girlrankings:
  51.                     self.boyrankings[nametuple[1]] = [(int(key), int(nametuple[0])),]
  52.                     self.girlrankings[nametuple[2]].append((int(key), int(nametuple[0])))
  53.                 else:
  54.                     self.boyrankings[nametuple[1]] = [(int(key), int(nametuple[0])),]
  55.                     self.girlrankings[nametuple[2]] = [(int(key), int(nametuple[0])),]
  56.  
  57.    
  58.     def plotName(self, gender, name):
  59.         """Given a gender and name create a matplotlib line graph mapping the rankings over the years"""
  60.         if gender == "boy":
  61.             data = self.boyrankings[name]          
  62.             data.sort(key=lambda x: x[0])          
  63.             df = pd.DataFrame(data, columns=['Year', 'Rank'])
  64.             dfplot = df.plot(figsize=(20,6), grid=True, kind='line', x='Year', y='Rank', title="Rankings for {}".format(name), xticks=[x for x in range(1880,2016) if x % 4 ==0])
  65.             plt.gca().invert_yaxis()
  66.             figure = dfplot.get_figure()
  67.             figure.savefig("{}.png".format(name))
  68.         elif gender == "girl":
  69.             data = self.girlrankings[name]
  70.             data.sort(key=lambda x: x[0])
  71.             df = pd.DataFrame(data, columns=['Year', 'Rank'])
  72.             dfplot = df.plot(figsize=(20,6), grid=True, kind='line', x='Year', y='Rank', title="Rankings for {}".format(name), xticks=[x for x in range(1880,2016) if x % 4 ==0])
  73.             plt.gca().invert_yaxis()
  74.             figure = dfplot.get_figure()
  75.             figure.savefig("{}.png".format(name))
  76.         else:
  77.             pass
  78.                
  79. if __name__ == '__main__':
  80.   main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement