osteth

pandas test 2

May 21st, 2020
587
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.51 KB | None | 0 0
  1. import pandas as pd
  2. import datetime
  3.  
  4. def read_file(file_p):
  5.     '''
  6.    Read in the file example_tape1.csv as a Pandas DataFrame, print out the datatypes and returns the output DataFrame. (hint: you might have a little cleaning to do)
  7.    
  8.    The file is accessible at this location: ./data/example_tape1.csv
  9.    '''
  10.     #This try catch pattern allow for the use of both CVS and Excel formatted inputs. It reduces a lot of user error and frustrations.
  11.     try:
  12.         '''skipping last line because it is malformed (possibly a timestamp. this may need refactored if all data is not
  13.        formatted like this example is. Additionally, the example contains a large number of blank columns. I am not addressing
  14.        this at this time in case they are used for something downstream. These would normally be removed for data cleanliness.'''
  15.         df = pd.read_csv(file_p, engine='python', error_bad_lines=False, skipfooter=1)
  16.     except:
  17.         try:
  18.             df = pd.read_excel(file_p, engine='python', error_bad_lines=False, skipfooter=1)
  19.         except:
  20.             print("Failed to read file, file is not of supported tile types (CSV,XLSX,XLS)")
  21.    
  22.     return df
  23.  
  24.  
  25.    
  26. def max_cash(collections):
  27.     '''
  28.    Return the row of the DataFrame with the highest gross_cash
  29.    
  30.    Args
  31.    collections <pd.DataFrame>
  32.    
  33.    Returns <pd.DataFrame> containing one row
  34.    '''
  35.     return collections.loc[collections['gross_cash'].idxmax()]
  36.  
  37. def vintage_level_cash(collections):
  38.     '''
  39.    Calculate the total gross_cash collected for each vintage across all of time
  40.    
  41.    Args
  42.    collections <pd.DataFrame>
  43.    
  44.    Returns <pd.DataFrame>
  45.    '''
  46.     vlc_collections = collections.copy()
  47.    
  48.     #Convert each vintage to a Datetime value
  49.     vlc_collections['vintage'] = pd.to_datetime(vlc_collections['vintage'])
  50.    
  51.     # Set the datetime column as the index
  52.     vlc_collections.index = vlc_collections['vintage']
  53.  
  54.     #resampling the dataframe by vintage month and calculating the sum's
  55.     vintage_cash = vlc_collections.resample('M').sum()
  56.    
  57.     #reducing the data down to only the vintage and gross cash for easier readability.
  58.     #vintage_cash = pd.DataFrame(vintage_cash, columns = ['vintage', 'gross_cash'])
  59.     return vintage_cash[['gross_cash']]
  60.  
  61. def vintage_totals(collections):
  62.     '''
  63.    Calculate the cumulative gross_cash collected for each vintage from inception through the indicated asofmonth.
  64.    
  65.    Args
  66.    collections <pd.DataFrame>
  67.    
  68.    Returns <pd.DataFrame> with columns vintage, asofmonth, cumulative_gross_cash
  69.    '''  
  70.     totals = []
  71.     for vintage in collections['vintage'].unique():
  72.         total = 0
  73.         #print(vintage)
  74.         subvin = collections[(collections['vintage'] == vintage)]
  75.         for cash in subvin['gross_cash']:
  76.             total = total + cash
  77.             totals.append(round(total, 2))
  78.  
  79.     collections['cumulative_gross_cash'] = totals
  80.     vintage_totals_result = pd.DataFrame(collections, columns = ['vintage', 'asofmonth', 'cumulative_gross_cash'])
  81.     return(vintage_totals_result)
  82.  
  83.  
  84. def vintage_pivot(vintage_totals_result):
  85.     '''
  86.    Using the output of vintage_totals, pivot the DataFrame to make it wide and short, such that each column
  87.    is the name of a vintage, each row indicates the asofmonth, and cumulative_gross_cash is the value in each cell.
  88.    Please comment how you might plot the output data.
  89.    
  90.    Args
  91.    vintage_totals_dataframe <pd.DataFrame>
  92.    
  93.    Returns <pd.DataFrame> with a column for each vintage
  94.    '''
  95.     vintage_pivot_result = vintage_totals_result.pivot(index='asofmonth', columns='vintage', values='cumulative_gross_cash')
  96.     return vintage_pivot_result
  97.  
  98. '''This is a "sentinel" it marks a break in the execution of the code post execution of the functions
  99. allowing these functions to used as a library for additional works.'''
  100. if __name__ == "__main__":
  101.    
  102.     #This variable sets the path to the data file to be read when executing this scrip standalone.
  103.     file_p = "./data/example_tape1.csv"
  104.  
  105.     collections = read_file(file_p)
  106.  
  107.     if collections is not None:
  108.         max_cash_result = max_cash(collections)
  109.         #print(max_cash_result)
  110.         vintage_level_result = vintage_level_cash(collections)
  111.         #print(vintage_level_result)
  112.         vintage_totals_result = vintage_totals(collections)
  113.         #print(vintage_totals_result)
  114.         vintage_pivot_result = vintage_pivot(vintage_totals_result)
  115.         display(vintage_pivot_result)
Add Comment
Please, Sign In to add comment