Advertisement
dc5553

Starting Trend Analysis of Baby names

Jan 15th, 2017
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.33 KB | None | 0 0
  1. import matplotlib.pyplot as plt
  2. import matplotlib.style as sty
  3. import os
  4. import re
  5. import pandas as pd
  6. import numpy as np
  7. import matplotlib
  8. from sys import exit
  9. import csv
  10. # An easy way to get all of these imports installed is to install Anaconda (especially convenient for Windows users).
  11.  
  12.  
  13. class BabyNameAnalyzer:
  14.     """Anaylze SSA baby name HTML files -- Use with a web scraper for maximum effectiveness"""
  15.    
  16.     def __init__(self):
  17.         sty.use('ggplot')
  18.         self.namedict = {}
  19.         self.boyrankings = {}
  20.         self.girlrankings = {}
  21.         self.extractNames()
  22.         self.organizeNames()
  23.  
  24.     def extractNames(self):
  25.         """Extract the names from the Python Google class html files
  26.        
  27.         (in this example run the program from that directory)
  28.         """
  29.         for root, dirs, files in os.walk(os.path.dirname(os.path.realpath(__file__))):
  30.             patt = re.compile(r'<tr align="right">\s*<td>(\d{1,4})<\/td>\s*<td>([A-Z][a-z]+?)<\/td>\s*<td>([A-Z][a-z]+?)<\/td>')
  31.             for file in files:             
  32.                 year = file[4:-5]
  33.                 self.namedict[year] = []
  34.                 if file[-4:] == "html":
  35.                     myfile = open(os.path.join(root, file))
  36.                     htmlText = myfile.read()
  37.                     found = re.findall(patt,htmlText)
  38.                     myfile.close()
  39.                     [self.namedict[year].append(find) for find in found]
  40.    
  41.     def nameDumpCSV(self):
  42.         """Organize the names into CSV for both girls and boys"""
  43.         with open("boybabynames.csv","wb") as csvfile:
  44.             fieldnames = ['name', 'year', 'rank']
  45.             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  46.             writer.writeheader()
  47.                
  48.             for key,values in self.boyrankings.items():
  49.                 for value in values:
  50.                     writer.writerow({'name': key, 'year': value[0], 'rank': value[1]})
  51.         csvfile.close()
  52.         with open("girlbabynames.csv","wb") as csvfile:
  53.             fieldnames = ['name', 'year', 'rank']
  54.             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  55.             writer.writeheader()
  56.                
  57.             for key,values in self.girlrankings.items():
  58.                 for value in values:
  59.                     writer.writerow({'name': key, 'year': value[0], 'rank': value[1]})
  60.         csvfile.close()
  61.  
  62.    
  63.     def trendName(self):
  64.         """Given a gender and name create a matplotlib line graph mapping the rankings over the years"""
  65.         namedf = pd.read_csv('babynames.csv', index_col=1)
  66.         namedf.plot()
  67.         plt.show()
  68.        
  69.  
  70. def main():
  71.  
  72.     bna = BabyNameAnalyzer()
  73.     #bna.nameDumpCSV()
  74.     bna.trendName()
  75.                
  76. if __name__ == '__main__':
  77.   main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement