Guest User

Untitled

a guest
Oct 21st, 2017
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.31 KB | None | 0 0
  1. #import os
  2. #import pandas as pd
  3. #import datetime as dt
  4.  
  5. ####you want to optimize the conversion and create a summary report####
  6.  
  7. def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
  8. """This function is designed to condense the raw csv files, made by the
  9. 'download_csv.py' function held by Christian Glunk and the Deepfiled
  10. Connect team, into a more maneagable form"""
  11.  
  12. #print everything ('all') to console without print() statement
  13. from IPython.core.interactiveshell import InteractiveShell
  14. InteractiveShell.ast_node_interactivity = 'all'
  15.  
  16. import os
  17. import pandas as pd
  18. import datetime as dt
  19.  
  20. os.chdir(raw_dir)
  21. csv_list = os.listdir(raw_dir)
  22.  
  23. #relevant variables in raw csvs to extract
  24. extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
  25. 'HM3', 'HM4', 'T1', 'T2',
  26. 'T3', 'T4', 'latitude',
  27. 'longitude', 'measurement_time']
  28.  
  29. #for DevID column indexing
  30. b_count = 0
  31. p_count = 0
  32.  
  33. for csv in csv_list[0:2]:
  34.  
  35.  
  36. with open(csv) as f:
  37.  
  38. #import csv, parse datetime and set as index
  39. raw_df = pd.read_csv(f)
  40. #parse_dates=True
  41. #draw relevant columns from rawdf
  42. df = raw_df[extract_vars]
  43.  
  44. #Numerizing instruments
  45. imei = csv[7:-4]
  46. df.insert(0, column='IMEI', value=imei)
  47.  
  48. #renames initial columns columns
  49. new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
  50. 'longitude':'Lng', 'latitude':'Lat'}
  51.  
  52. df = df.rename(columns=new_names)
  53. print("During OPEN_csv_df")
  54. print(df)
  55.  
  56. #DateTime as datetime
  57. df['DateTime'] = pd.to_datetime(df['DateTime'])
  58.  
  59.  
  60. #fills missing lat, long values for the sake of row completeness
  61. cols = ['Lat', 'Lng']
  62. df[cols] = df[cols].ffill()
  63. df[cols] = df[cols].bfill()
  64.  
  65. #call cleaning functions
  66. df = rc_remove(df, imei)
  67. df = var_insert(df, imei)
  68.  
  69.  
  70.  
  71. #write new csvs, and performs some last cleaning steps
  72. if 'HM2' in df:
  73. #insert premium_count for later indexing based on individual number,
  74. #clear out any remaining NaNs, set DateTime as index
  75. """can be done at a later step"""
  76. #p_count += 1
  77. #df.insert(0, column='DevID', value=p_count)
  78.  
  79. #df = df[pd.notnull(df['T2'])]
  80. #df = df[pd.notnull(df['T1'])]
  81.  
  82. #write to wrt_dir
  83. print("writing premium_", imei)
  84. df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)
  85.  
  86.  
  87. else:
  88. #insert basic_count for later indexing on individual number,
  89. #clear out any remaining NaNs
  90. """can be done in a later step"""
  91. #b_count +=1
  92. #df.insert(0, column='DevID', value=b_count)
  93.  
  94. #df = df[pd.notnull(df['T1'])]
  95.  
  96. #wrtite to wrt_dir
  97. print("writing basic_", imei)
  98. df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)
  99.  
  100. #if wrt_massive:
  101. #call massive writing function
  102.  
  103.  
  104.  
  105.  
  106. def rc_remove(df, imei):
  107. """This function removes row columns and values from the the raw_df.
  108. It also assigns non-ambigious temp column names. In the raw files some
  109. temperature columns are in T1, the switches to T2, etc"""
  110.  
  111. import pandas as pd
  112.  
  113. #remove rows where Voltage = Null
  114. df = df[pd.notnull(df['Volts'])]
  115. keep during raw conversion, but
  116.  
  117. #drop rows where voltage values are -255.00
  118. #(UNSURE WHY THIS '-255.0' value EXISTS)
  119.  
  120. """df = df[df['Volts'] != -255.0]
  121. keep for raw conversion, but remove change during
  122. manipultion and analysis"""
  123.  
  124. #keep the the columns that are at least 30% populated
  125. df = df.dropna(thresh=len(df)*.30, axis=1)
  126.  
  127. #This removes the ambiguity in the names of T# columns of certain csvs which temperature
  128. #measurements go into several columns, as well as removing rows with NaN values in column
  129. vars_len = len(df.columns.values)
  130.  
  131. if vars_len <9:
  132. df.columns.values[-4] = 'T1'
  133.  
  134. else:
  135. df.columns.values[-4] = 'T2'
  136. df.columns.values[-5] = 'T1'
  137.  
  138. print('during REMOVE df')
  139. return df
  140.  
  141.  
  142.  
  143. def var_insert(df, imei):
  144. """This function shifts around columns, inserts new variables into
  145. into the dataframe"""
  146.  
  147. import math
  148. from datetime import timedelta
  149.  
  150. #insert calculated volumetric water content measurements #and change in volumetric water
  151. #content and populate for two subsequent values under DateTime if their difference == 30min
  152. #populate the dVWC column with the difference of the adjoining values under VWC.
  153. last_col = len(df.columns.values) -1
  154. df.insert(last_col, column='VWC', value=None)
  155. df['VWC'] = .000494*(df['Volts']*1000)-.544
  156.  
  157. ##Take the following into account during manipulation
  158. """df.insert(last_col+1, column='dVWC', value=None)
  159. df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
  160. 'dVWC'] = df['VWC'].shift(-1) - df['VWC']
  161.  
  162.  
  163. #Then shift those values by one row remove top row holding NaN
  164. #fill rest of NaN with preceeding real value
  165. df['dVWC'] = df['dVWC'].shift(+1)
  166. df = df[1:]
  167. df['dVWC'] = df['dVWC'].ffill()"""
  168.  
  169.  
  170. #insert 'month', 'week', 'day' columns... these can be set as indeces
  171. #for used for simpler visualization, ##better to add during other algos
  172. """ df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
  173. df.insert(2, column="Week", value = df['DateTime'].dt.week)
  174. df.insert(3, column="Day", value = df['DateTime'].dt.day)"""
  175.  
  176. return df
  177.  
  178. raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
  179. wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'
  180.  
  181. raw_converter_main(raw_dir, wrt_dir)
Add Comment
Please, Sign In to add comment