Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import pandas
- import numpy
# Mapping from the numeric ethnicity codes used in the raw data file's
# 'Eth' column to human-readable labels.
ethnicity_mapping = {
    1: "White",
    2: "Black",
    3: "Asian",
    4: "Native American",
    5: "Hispanic",
    6: "Unknown",
}
# Identifier for breast cancer (all we care about)
cancer_index = 2
# Path to the raw per-cancer-type counts CSV consumed by the script below.
raw_data_filepath = './Counts/Originals/v01_PaCo.csv'
def read_raw_data_file(csv_filepath):
    """
    Read a raw data CSV file and return it as a pandas DataFrame.

    :param csv_filepath: path (or file-like object) of the CSV to read.
    :returns: pandas.DataFrame with the file's contents.
    """
    # read_csv() already returns a DataFrame; wrapping it in
    # pandas.DataFrame(...) (as the original did) is a redundant copy.
    return pandas.read_csv(csv_filepath)
def munge_cancer_dataframe(csv_filepath, cancer_type=2):
    """
    Read the raw counts CSV and return a DataFrame containing only the
    rows for the given cancer type.

    :param csv_filepath: path (or file-like object) of the raw data CSV.
    :param cancer_type: numeric cancer-type identifier to keep; the
        default of 2 matches the module-level ``cancer_index``
        (breast cancer), preserving the original behavior.
    :returns: pandas.DataFrame with the ``CancerType`` column dropped,
        since every remaining row shares the same value.
    """
    # read_csv() already returns a DataFrame; no extra wrapping needed.
    df = pandas.read_csv(csv_filepath, index_col=False)
    # Boolean mask instead of a string-built query(): no string
    # concatenation, and works for any integer cancer_type.
    # .copy() ensures we return an independent frame, not a view.
    df = df[df['CancerType'] == cancer_type].copy()
    del df['CancerType']  # All rows now share this value; drop it.
    return df
# --- Script body: build a per-ethnicity, per-year incidence table. ---

# Load breast-cancer-only rows from the raw counts file.
df = munge_cancer_dataframe(raw_data_filepath)

years = set(df['year'])
new_data = pandas.DataFrame()

for year in years:
    for ethnicity_index, ethnicity in ethnicity_mapping.items():
        # Rows for this ethnicity/year. f-string instead of string
        # concatenation; query() returns a copy, so mutating q below
        # does not touch df.
        q = df.query(f'Eth == {ethnicity_index} and year == {year}')
        # Combine all genders' incidences. Selecting the column before
        # summing avoids the deprecated agg(sum)-with-a-builtin form.
        combined = q.groupby('year')['count(*)'].sum()
        # If there is no incidence for this combination the series is
        # empty; record 0 instead of raising (the original caught
        # IndexError for the same effect).
        combined_incidence_count = combined.iloc[0] if len(combined) else 0
        # Gender has already been folded into the sum, so drop it.
        del q['Gen']
        # Overwrite every row's count with the combined count, and the
        # numeric ethnicity code with its human-readable name. This
        # produces duplicate rows; they are purged below.
        q.loc[:, 'count(*)'] = combined_incidence_count
        q.loc[:, 'Eth'] = ethnicity
        # Append the munged subset onto the accumulating dataframe.
        new_data = pandas.concat([new_data, q])

# Purge the duplicate rows produced by the per-row overwrite above.
new_data = new_data.drop_duplicates()

# Sort so the rows appear in ascending chronological order.
new_data = new_data.sort_values(by=['year'])

# Use 'year' as the index. Done as the last operation before writing,
# because set_index() removes 'year' from the data columns.
new_data = new_data.set_index('year')

# Finally, write out to a local file.
new_data.to_csv('jawn.txt')
Add Comment
Please, Sign In to add comment