osteth

pandas test

May 21st, 2020
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. import pandas as pd
  2. import datetime
  3.  
  4. def read_file(file_p):
  5. '''
  6. Read in the file example_tape1.csv as a Pandas DataFrame, print out the datatypes and returns the output DataFrame. (hint: you might have a little cleaning to do)
  7.  
  8. The file is accessible at this location: ./data/example_tape1.csv
  9. '''
  10. #This try catch pattern allow for the use of both CVS and Excel formatted inputs. It reduces a lot of user error and frustrations.
  11. try:
  12. '''skipping last line because it is malformed (possibly a timestamp. this may need refactored if all data is not
  13. formatted like this example is. Additionally, the example contains a large number of blank columns. I am not addressing
  14. this at this time in case they are used for something downstream. These would normally be removed for data cleanliness.'''
  15. df = pd.read_csv(file_p, engine='python', error_bad_lines=False, skipfooter=1)
  16. except:
  17. try:
  18. df = pd.read_excel(file_p, engine='python', error_bad_lines=False, skipfooter=1)
  19. except:
  20. print("Failed to read file, file is not of supported tile types (CSV,XLSX,XLS)")
  21.  
  22. return df
  23.  
  24.  
  25.  
  26. def max_cash(collections):
  27. '''
  28. Return the row of the DataFrame with the highest gross_cash
  29.  
  30. Args
  31. collections <pd.DataFrame>
  32.  
  33. Returns <pd.DataFrame> containing one row
  34. '''
  35. return collections.loc[collections['gross_cash'].idxmax()]
  36.  
  37. def vintage_level_cash(collections):
  38. '''
  39. Calculate the total gross_cash collected for each vintage across all of time
  40.  
  41. Args
  42. collections <pd.DataFrame>
  43.  
  44. Returns <pd.DataFrame>
  45. '''
  46. vlc_collections = collections.copy()
  47.  
  48. #Convert each vintage to a Datetime value
  49. vlc_collections['vintage'] = pd.to_datetime(vlc_collections['vintage'])
  50.  
  51. # Set the datetime column as the index
  52. vlc_collections.index = vlc_collections['vintage']
  53.  
  54. #resampling the dataframe by vintage month and calculating the sum's
  55. vintage_cash = vlc_collections.resample('M').sum()
  56.  
  57. #reducing the data down to only the vintage and gross cash for easier readability.
  58. #vintage_cash = pd.DataFrame(vintage_cash, columns = ['vintage', 'gross_cash'])
  59. return vintage_cash[['gross_cash']]
  60.  
  61. def vintage_totals(collections):
  62. '''
  63. Calculate the cumulative gross_cash collected for each vintage from inception through the indicated asofmonth.
  64.  
  65. Args
  66. collections <pd.DataFrame>
  67.  
  68. Returns <pd.DataFrame> with columns vintage, asofmonth, cumulative_gross_cash
  69. '''
  70. totals = []
  71. for vintage in collections['vintage'].unique():
  72. total = 0
  73. #print(vintage)
  74. subvin = collections[(collections['vintage'] == vintage)]
  75. for cash in subvin['gross_cash']:
  76. total = total + cash
  77. totals.append(round(total, 2))
  78.  
  79. collections['cumulative_gross_cash'] = totals
  80. vintage_totals_result = pd.DataFrame(collections, columns = ['vintage', 'asofmonth', 'cumulative_gross_cash'])
  81. return(vintage_totals_result)
  82.  
  83.  
  84. def vintage_pivot(vintage_totals_result):
  85. '''
  86. Using the output of vintage_totals, pivot the DataFrame to make it wide and short, such that each column
  87. is the name of a vintage, each row indicates the asofmonth, and cumulative_gross_cash is the value in each cell.
  88. Please comment how you might plot the output data.
  89.  
  90. Args
  91. vintage_totals_dataframe <pd.DataFrame>
  92.  
  93. Returns <pd.DataFrame> with a column for each vintage
  94. '''
  95. vintage_pivot_result = vintage_totals_result.pivot(index='asofmonth', columns='vintage', values='cumulative_gross_cash')
  96. return vintage_pivot_result
  97.  
  98. '''This is a "sentinel" it marks a break in the execution of the code post execution of the functions
  99. allowing these functions to used as a library for additional works.'''
  100. if __name__ == "__main__":
  101.  
  102. #This variable sets the path to the data file to be read when executing this scrip standalone.
  103. file_p = "./data/example_tape1.csv"
  104.  
  105. collections = read_file(file_p)
  106.  
  107. if collections is not None:
  108. max_cash_result = max_cash(collections)
  109. #print(max_cash_result)
  110. vintage_level_result = vintage_level_cash(collections)
  111. #print(vintage_level_result)
  112. vintage_totals_result = vintage_totals(collections)
  113. #print(vintage_totals_result)
  114. vintage_pivot_result = vintage_pivot(vintage_totals_result)
  115. display(vintage_pivot_result)
Advertisement
Add Comment
Please, Sign In to add comment