scrape_names.babynames

# Usage:
#   python babynames.py                          # national data to stdout
#   python babynames.py -f names.csv             # national data to a CSV file
#   python babynames.py -s All                   # state-by-state data, all states
#   python babynames.py -s MA -f mystates.csv    # data for selected states
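#
# NOTE: written for Python 2 (urllib/urllib2, optparse). A rough
# Python 3 equivalent of the request code is sketched at the bottom
# of this file.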


import csv
import optparse
import re
import sys
import urllib
import urllib2


class BabyNameScraper(object):
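    """Scrape the Social Security Administration's top-1,000 baby name
    tables for a range of years, nationally or state by state, and
    write the results as CSV.
    """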

    def __init__(self, states=None, filename=None):
        """Set states to None to get national data.

        Otherwise states should be a list of two-letter
        state abbreviations.
        """
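        # The state-level series spans fewer years (1960-2008) than
        # the national series (1880-2009).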
        if states:
            self.years = range(1960, 2009)
        else:
            self.years = range(1880, 2010)

        self.states = states or [None, ]
        self.filename = filename

        self.fields = ['year', 'name', 'sex', 'number', 'rank', ]
        if states is not None:
            self.fields.append('state')


    def run(self):
        self.write_headers()
        for year in self.years:
            for state in self.states:
                page = self.get_page(year, state)
                for data in self.parse_page(page, state):
                    data['year'] = year
                    self.save_data(data)


    def write_headers(self):
        if self.filename:
            with open(self.filename, 'a') as fh:
                csv.writer(fh).writerow(self.fields)
        else:
            csv.writer(sys.stdout).writerow(self.fields)


    def get_page(self, year, state=None):
        """Get the content of the page listing the top 1,000 baby names
        for the given year.
        """
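        # The SSA serves national data from popularnames.cgi and
        # per-state data from namesbystate.cgi; both take POSTed
        # form parameters.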
        base_url = 'http://www.ssa.gov/cgi-bin'

        if state:
            query = {'year': year, 'state': state}
            path = 'namesbystate.cgi'
        else:
            query = {'year': year, 'top': 1000, 'number': 'n'}
            path = 'popularnames.cgi'

        # Build the URL with a plain string join; os.path.join would
        # produce backslashes on Windows.
        url = '%s/%s' % (base_url, path)
        req = urllib2.Request(url, urllib.urlencode(query))
        response = urllib2.urlopen(req)

        if response.msg == 'OK':
            return response.read()

        return None


    def parse_page(self, page, state=None):
        """Get the relevant baby-name data from the given HTML page.
        """
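        # Data rows on both page types are tagged <tr align="right">.
        # State pages center some cells (<td align="center">), so their
        # cell-matching regex differs slightly from the national one.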
        rows = re.findall(r'<tr align="right">.*?</tr>', page, re.S)

        fields = ['rank', 'male', 'male_count', 'female', 'female_count', ]

        if state:
            regex = re.compile(r'<td(?: align="center")?>(.*?)</td>')
        else:
            regex = re.compile(r'<td>(?P<cell_content>.*?)</td>')

        for row in rows:
            data = dict(zip(fields, regex.findall(row)))
            data['state'] = state

            # Yield the male name from this row ...
            yield {'name': data['male'],
                   'sex': 'M',
                   'number': data['male_count'],
                   'rank': data['rank'],
                   'state': data['state'], }

            # ... then the female name.
            yield {'name': data['female'],
                   'sex': 'F',
                   'number': data['female_count'],
                   'rank': data['rank'],
                   'state': data['state'], }


    def save_data(self, data):
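        # National runs have no 'state' column in the header, so drop
        # the placeholder key before handing the row to DictWriter.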
        if data.get('state', None) is None:
            del data['state']

        if self.filename:
            with open(self.filename, 'a') as fh:
                csv.DictWriter(fh, self.fields).writerow(data)
        else:
            csv.DictWriter(sys.stdout, self.fields).writerow(data)


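# The 50 states plus the District of Columbia.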
ALLSTATES = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
             'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
             'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
             'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
             'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', ]


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('-s',
                      '--states',
                      dest='states',
                      help='a comma-separated list of states to get data for, or ALL for all states')
    parser.add_option('-f',
                      '--file',
                      dest='filename',
                      help='write data to FILE (written to standard output if no filename is given)')

    options, args = parser.parse_args()

    if options.states:
        if options.states.lower() == 'all':
            states = ALLSTATES
        else:
            # Strip whitespace, upper-case, and drop empty entries so
            # input like "ma, ny," parses cleanly.
            states = [s.strip().upper() for s in options.states.split(',') if s.strip()]
    else:
        states = None

    BabyNameScraper(states, options.filename).run()
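

# ----------------------------------------------------------------------
# Porting sketch (not part of the original script): a rough Python 3
# equivalent of BabyNameScraper.get_page, assuming the SSA endpoints and
# form parameters are unchanged. The name get_page_py3 is illustrative,
# and this has not been tested against the live site.
def get_page_py3(year, state=None):
    # Local imports keep this definition harmless under Python 2.
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    if state:
        query = {'year': year, 'state': state}
        path = 'namesbystate.cgi'
    else:
        query = {'year': year, 'top': 1000, 'number': 'n'}
        path = 'popularnames.cgi'

    url = 'http://www.ssa.gov/cgi-bin/%s' % path
    # Python 3 wants the POST body as bytes and the response decoded.
    req = Request(url, urlencode(query).encode('ascii'))
    response = urlopen(req)
    if response.getcode() == 200:
        return response.read().decode('utf-8', 'replace')
    return None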