Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #import os
- #import pandas as pd
- #import datetime as dt
- ####you want to optimize the conversion and create a summary report####
- def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
- """This function is designed to condense the raw csv files, made by the
- 'download_csv.py' function held by Christian Glunk and the Deepfiled
- Connect team, into a more maneagable form"""
- #print everything ('all') to console without print() statement
- from IPython.core.interactiveshell import InteractiveShell
- InteractiveShell.ast_node_interactivity = 'all'
- import os
- import pandas as pd
- import datetime as dt
- os.chdir(raw_dir)
- csv_list = os.listdir(raw_dir)
- #relevant variables in raw csvs to extract
- extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
- 'HM3', 'HM4', 'T1', 'T2',
- 'T3', 'T4', 'latitude',
- 'longitude', 'measurement_time']
- #for DevID column indexing
- b_count = 0
- p_count = 0
- for csv in csv_list[0:2]:
- with open(csv) as f:
- #import csv, parse datetime and set as index
- raw_df = pd.read_csv(f)
- #parse_dates=True
- #draw relevant columns from rawdf
- df = raw_df[extract_vars]
- #Numerizing instruments
- imei = csv[7:-4]
- df.insert(0, column='IMEI', value=imei)
- #renames initial columns columns
- new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
- 'longitude':'Lng', 'latitude':'Lat'}
- df = df.rename(columns=new_names)
- print("During OPEN_csv_df")
- print(df)
- #DateTime as datetime
- df['DateTime'] = pd.to_datetime(df['DateTime'])
- #fills missing lat, long values for the sake of row completeness
- cols = ['Lat', 'Lng']
- df[cols] = df[cols].ffill()
- df[cols] = df[cols].bfill()
- #call cleaning functions
- df = rc_remove(df, imei)
- df = var_insert(df, imei)
- #write new csvs, and performs some last cleaning steps
- if 'HM2' in df:
- #insert premium_count for later indexing based on individual number,
- #clear out any remaining NaNs, set DateTime as index
- """can be done at a later step"""
- #p_count += 1
- #df.insert(0, column='DevID', value=p_count)
- #df = df[pd.notnull(df['T2'])]
- #df = df[pd.notnull(df['T1'])]
- #write to wrt_dir
- print("writing premium_", imei)
- df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)
- else:
- #insert basic_count for later indexing on individual number,
- #clear out any remaining NaNs
- """can be done in a later step"""
- #b_count +=1
- #df.insert(0, column='DevID', value=b_count)
- #df = df[pd.notnull(df['T1'])]
- #wrtite to wrt_dir
- print("writing basic_", imei)
- df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)
- #if wrt_massive:
- #call massive writing function
- def rc_remove(df, imei):
- """This function removes row columns and values from the the raw_df.
- It also assigns non-ambigious temp column names. In the raw files some
- temperature columns are in T1, the switches to T2, etc"""
- import pandas as pd
- #remove rows where Voltage = Null
- df = df[pd.notnull(df['Volts'])]
- keep during raw conversion, but
- #drop rows where voltage values are -255.00
- #(UNSURE WHY THIS '-255.0' value EXISTS)
- """df = df[df['Volts'] != -255.0]
- keep for raw conversion, but remove change during
- manipultion and analysis"""
- #keep the the columns that are at least 30% populated
- df = df.dropna(thresh=len(df)*.30, axis=1)
- #This removes the ambiguity in the names of T# columns of certain csvs which temperature
- #measurements go into several columns, as well as removing rows with NaN values in column
- vars_len = len(df.columns.values)
- if vars_len <9:
- df.columns.values[-4] = 'T1'
- else:
- df.columns.values[-4] = 'T2'
- df.columns.values[-5] = 'T1'
- print('during REMOVE df')
- return df
- def var_insert(df, imei):
- """This function shifts around columns, inserts new variables into
- into the dataframe"""
- import math
- from datetime import timedelta
- #insert calculated volumetric water content measurements #and change in volumetric water
- #content and populate for two subsequent values under DateTime if their difference == 30min
- #populate the dVWC column with the difference of the adjoining values under VWC.
- last_col = len(df.columns.values) -1
- df.insert(last_col, column='VWC', value=None)
- df['VWC'] = .000494*(df['Volts']*1000)-.544
- ##Take the following into account during manipulation
- """df.insert(last_col+1, column='dVWC', value=None)
- df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
- 'dVWC'] = df['VWC'].shift(-1) - df['VWC']
- #Then shift those values by one row remove top row holding NaN
- #fill rest of NaN with preceeding real value
- df['dVWC'] = df['dVWC'].shift(+1)
- df = df[1:]
- df['dVWC'] = df['dVWC'].ffill()"""
- #insert 'month', 'week', 'day' columns... these can be set as indeces
- #for used for simpler visualization, ##better to add during other algos
- """ df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
- df.insert(2, column="Week", value = df['DateTime'].dt.week)
- df.insert(3, column="Day", value = df['DateTime'].dt.day)"""
- return df
- raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
- wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'
- raw_converter_main(raw_dir, wrt_dir)
Add Comment
Please, Sign In to add comment