Guest User

Untitled

a guest
Jan 16th, 2018
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.68 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import pandas
  3. import numpy
  4.  
  5.  
  6.  
  7. ethnicity_mapping = {
  8. 1: "White",
  9. 2: "Black",
  10. 3: "Asian",
  11. 4: "Native American",
  12. 5: "Hispanic",
  13. 6: "Unknown",
  14. }
  15.  
  16. # Identifier for breast cancer (all we care about)
  17. cancer_index = 2
  18.  
  19.  
  20. raw_data_filepath = './Counts/Originals/v01_PaCo.csv'
  21.  
  22.  
  23. def read_raw_data_file(csv_filepath):
  24. """
  25. Accepts raw data CSV file, returns Pandas dataframe.
  26. """
  27. df = pandas.DataFrame(pandas.read_csv(csv_filepath))
  28. return df
  29.  
  30.  
  31.  
  32. def munge_cancer_dataframe(csv_filepath):
  33. """
  34. Accepts raw data CSV file, returns Pandas dataframe,
  35. with only breast cancer incidents.
  36. """
  37. df = pandas.DataFrame(pandas.read_csv(csv_filepath, index_col=False))
  38. df = df.query('CancerType == '+str(cancer_index)) # Delete other cancer type data
  39. del df['CancerType'] # Delete CancerType column
  40. #del df['Gen'] # Delete CancerType column
  41. return(df) #Return the dataframe
  42.  
  43.  
  44. df = munge_cancer_dataframe(raw_data_filepath)
  45.  
  46.  
  47. years = set(df['year'])
  48.  
  49.  
  50.  
  51. new_data = pandas.DataFrame()
  52. for year in years:
  53. for ethnicity_index, ethnicity in ethnicity_mapping.items():
  54. q = df.query('Eth == '+str(ethnicity_index)+' and year == '+str(year))
  55. # Trying to combine all genders' incidences.
  56. combined_incidence_filter = q.groupby('year').agg(sum)['count(*)']
  57.  
  58. # If no incidence, lookup will throw IndexError.
  59. # Instead set to "0" and continue processing.
  60. try:
  61. combined_incidence_count = list(combined_incidence_filter)[0]
  62. except IndexError:
  63. combined_incidence_count = 0
  64.  
  65. # Remove gender column, since we've already summed above.
  66. del q['Gen']
  67.  
  68. # Overwrite count for given ethnicity with the combined count.
  69. # This will result in duplicate rows (since all incidences
  70. # will be the same). No problem: we'll clean up after.
  71. q.loc[:,('count(*)',)] = combined_incidence_count
  72.  
  73. q.loc[:,('Eth',)] = ethnicity
  74. # Append new munged data set onto the pristine dataframe.
  75. new_data = pandas.concat([new_data, q])
  76.  
  77. # Purge all duplicates, of which we have many, due to the
  78. # inelegant munging above.
  79. new_data = new_data.drop_duplicates()
  80.  
  81. # Step 2: convert numeric ethnicities to words.
  82. # (This was done above in the loop)
  83.  
  84. # Step 3: Sort by year so the rows descend in
  85. # ascending chronological order.
  86. new_data = new_data.sort_values(by=['year'])
  87.  
  88. # Now let's format the CSV structure prior to writing to file.
  89. # Step 1: set year as index
  90. # Don't do this... it removes 'year' from the data!
  91. # So we'll do it as last operation prior to writing.
  92. new_data = new_data.set_index('year')
  93.  
  94. # Finally, write out to local file.
  95. new_data.to_csv('jawn.txt')
Add Comment
Please, Sign In to add comment