Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# NOTE: this is a Python 2 script (urllib2 does not exist on Python 3).
import contextlib
import re
import urllib2

import pandas as pd

# urlopen() returns a file-like response object, not a string, so the body
# has to be read before any string processing; closing() releases the
# connection even if the read fails.
with contextlib.closing(
    urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt')
) as response:
    data = response.read()

## Replace newline characters with '|' so that a record spanning several
## lines becomes one string and the patterns can match a literal pipe where
## a line break used to be.
data = data.replace('\n', '|')

# One row per variable: upper-case name, single-digit width, then the
# description that followed on the next (indented) line.  The pipe is escaped
# because it stands for the former newline, not for regex alternation, and
# '-', '(' and ')' are escaped so the character class does not contain an
# invalid range.
datadict = pd.DataFrame(
    re.findall(
        r"([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})",
        data,
        re.MULTILINE,
    ),
    columns=['variable', 'width', 'description'],
)
datadict.head(5)
- +----+----------+-------+------------------------------------------------+
- | | variable | width | description |
- +----+----------+-------+------------------------------------------------+
- | 0 | RT | 1 | Record Type |
- +----+----------+-------+------------------------------------------------+
- | 1 | SERIALNO | 7 | Housing unit |
- +----+----------+-------+------------------------------------------------+
- | 2 | DIVISION | 1 | Division code |
- +----+----------+-------+------------------------------------------------+
- | 3 | PUMA | 5 | Public use microdata area code (PUMA) based on |
- +----+----------+-------+------------------------------------------------+
- | 4 | REGION | 1 | Region code |
- +----+----------+-------+------------------------------------------------+
- | 5 | ST | 2 | State Code |
- +----+----------+-------+------------------------------------------------+
# Expanded dictionary: the variable header (name, width, description) plus
# the first value/label pair listed under it.  Expects `data` to be the
# dictionary text with newlines already replaced by '|', so each escaped pipe
# below marks a former line break.  '-', '(' and ')' are escaped inside the
# character classes so they are literals rather than an invalid range.
datadict_exp = pd.DataFrame(
    re.findall(
        r"([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}"
        r"\|\s{4}([A-Za-z\-\(\);<> 0-9]{2,85})"
        r"\|\s{11,15}([a-z0-9]{0,2})[ ].([A-Za-z/\-\(\) ]{2,120})",
        data,
        re.MULTILINE,
    ),
    # Name the five capture groups, consistent with the first data frame.
    columns=['variable', 'width', 'description', 'value_1', 'label_1'],
)
datadict_exp.head(5)
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | id | variable | width | description | value_1 | label_1 |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | 0 | DIVISION | 1 | Division code | 0 | Puerto Rico |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | 1 | REGION | 1 | Region code | 1 | Northeast |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | 2 | ST | 2 | State Code | 1 | Alabama/AL |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | 3 | NP | 2 | Number of person records following this housin... | 0 | Vacant unit |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
- | 4 | TYPE | 1 | Type of unit | 1 | Housing unit |
- +----+----------+-------+---------------------------------------------------+---------+--------------+
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement