# Untitled

#!/usr/bin/env python
import pandas
import numpy


# Numeric ethnicity codes (as matched against the 'Eth' column) mapped to
# the human-readable labels written to the output. Codes start at 1.
ethnicity_mapping = dict(enumerate(
    (
        "White",
        "Black",
        "Asian",
        "Native American",
        "Hispanic",
        "Unknown",
    ),
    start=1,
))

# 'CancerType' code for breast cancer -- the only cancer type kept.
cancer_index = 2


# Location of the raw counts CSV, relative to the working directory.
raw_data_filepath = './Counts/Originals/v01_PaCo.csv'


def read_raw_data_file(csv_filepath):
    """
    Load the raw data CSV at ``csv_filepath`` and hand back its
    contents as a Pandas dataframe, unfiltered.
    """
    return pandas.read_csv(csv_filepath)


def munge_cancer_dataframe(csv_filepath, cancer_type=None):
    """
    Accepts raw data CSV file, returns Pandas dataframe
    with only the incidents of a single cancer type.

    Args:
        csv_filepath: Path (or file-like object) handed to
            ``pandas.read_csv``. Must contain a ``CancerType`` column.
        cancer_type: Value of ``CancerType`` to keep. When omitted, the
            module-level ``cancer_index`` (breast cancer) is used.

    Returns:
        A new dataframe holding only the matching rows, with the
        now-constant ``CancerType`` column removed. Rows keep the index
        labels they were given when the file was read.

    Raises:
        KeyError: If the CSV has no ``CancerType`` column.
    """
    if cancer_type is None:
        cancer_type = cancer_index

    # index_col=False: never promote the first column to the index.
    df = pandas.read_csv(csv_filepath, index_col=False)

    # A boolean mask avoids building a query string by concatenation.
    matches = df['CancerType'] == cancer_type
    return df[matches].drop(columns='CancerType')


def _combine_gender_counts(cancer_df, labels):
    """
    Collapse per-gender rows into one total per year and ethnicity.

    Args:
        cancer_df: Dataframe with at least ``year``, ``Eth``, ``Gen``
            and ``count(*)`` columns.
        labels: Mapping of numeric ``Eth`` code to display label. Rows
            whose code is not a key of this mapping are discarded.

    Returns:
        A new dataframe without the ``Gen`` column, in which
        ``count(*)`` is the sum over every row sharing the same year
        and ethnicity, ``Eth`` holds the label instead of the code, and
        exact duplicate rows have been removed. Rows are ordered by
        year, then by ethnicity code. ``cancer_df`` is not modified.
    """
    known = cancer_df[cancer_df['Eth'].isin(list(labels))]
    known = known.sort_values(by=['year', 'Eth'])

    # One grouped pass gives every row the combined (all genders) count
    # of its own year/ethnicity group.
    totals = known.groupby(['year', 'Eth'])['count(*)'].transform('sum')

    # assign() builds replacement columns on a fresh frame, so nothing
    # is written in place into a filtered view and the 'Eth' column can
    # change from numeric codes to text labels cleanly.
    combined = known.drop(columns='Gen').assign(**{
        'count(*)': totals,
        'Eth': known['Eth'].map(labels),
    })

    # With the gender column gone, the rows of one year/ethnicity group
    # now carry the same total; keep a single copy of each.
    return combined.drop_duplicates()


df = munge_cancer_dataframe(raw_data_filepath)


# Distinct years present in the data (kept as a module-level name).
years = set(df['year'])


# Sum incidences across genders, convert numeric ethnicities to words,
# and order the rows chronologically.
new_data = _combine_gender_counts(df, ethnicity_mapping)

# Now let's format the CSV structure prior to writing to file.
# Setting 'year' as the index removes it from the regular columns, so it
# is done only here, as the last operation prior to writing.
new_data = new_data.set_index('year')

# Finally, write out to local file.
new_data.to_csv('jawn.txt')