Untitled

#import os
#import pandas as pd
#import datetime as dt

####you want to optimize the conversion and create a summary report####

def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
    """This function is designed to condense the raw csv files, made by the
    'download_csv.py' function held by Christian Glunk and the Deepfiled
    Connect team, into a more maneagable form"""

    #print everything ('all') to console without print() statement
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = 'all'

    import os
    import pandas as pd
    import datetime as dt

    os.chdir(raw_dir)
    csv_list = os.listdir(raw_dir)

    #relevant variables in raw csvs to extract
    extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
                    'HM3', 'HM4', 'T1', 'T2',
                    'T3', 'T4', 'latitude',
                    'longitude', 'measurement_time']

    #for DevID column indexing
    b_count = 0
    p_count = 0

    for csv in csv_list[0:2]:


        with open(csv) as f:

            #import csv, parse datetime and set as index
            raw_df = pd.read_csv(f)
                #parse_dates=True
            #draw relevant columns from rawdf
            df = raw_df[extract_vars]

            #Numerizing instruments
            imei = csv[7:-4]
            df.insert(0, column='IMEI', value=imei)

            #renames initial columns columns
            new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
                        'longitude':'Lng', 'latitude':'Lat'}

            df = df.rename(columns=new_names)
            print("During OPEN_csv_df")
            print(df)

            #DateTime as datetime
            df['DateTime'] = pd.to_datetime(df['DateTime'])


            #fills missing lat, long values for the sake of row completeness
            cols = ['Lat', 'Lng']
            df[cols] = df[cols].ffill()
            df[cols] = df[cols].bfill()

            #call cleaning functions
            df = rc_remove(df, imei)
            df = var_insert(df, imei)


            #write new csvs, and performs some last cleaning steps
            if 'HM2' in df:
                #insert premium_count for later indexing based on individual number,
                #clear out any remaining NaNs, set DateTime as index
                """can be done at a later step"""
                #p_count += 1
                #df.insert(0, column='DevID', value=p_count)

                #df = df[pd.notnull(df['T2'])]
                #df = df[pd.notnull(df['T1'])]

                #write to wrt_dir
                print("writing premium_", imei)
                df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)


            else:
                #insert basic_count for later indexing on individual number,
                #clear out any remaining NaNs
                """can be done in a later step"""
                #b_count +=1
                #df.insert(0, column='DevID', value=b_count)

                #df = df[pd.notnull(df['T1'])]

                #wrtite to wrt_dir
                print("writing basic_", imei)
                df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)

            #if wrt_massive:
            #call massive writing function


def rc_remove(df, imei):
    """This function removes row columns and values from the the raw_df.
    It also assigns non-ambigious temp column names. In the raw files some
    temperature columns are in T1, the switches to T2, etc"""

    import pandas as pd

    #remove rows where Voltage = Null
    df = df[pd.notnull(df['Volts'])]
    keep during raw conversion, but

    #drop rows where voltage values are -255.00
    #(UNSURE WHY THIS '-255.0' value EXISTS)

    """df = df[df['Volts'] != -255.0]
    keep for raw conversion, but remove change during
    manipultion and analysis"""

    #keep the the columns that are at least 30% populated
    df = df.dropna(thresh=len(df)*.30, axis=1)

    #This removes the ambiguity in the names of T# columns of certain csvs which temperature
    #measurements go into several columns, as well as removing rows with NaN values in column
    vars_len = len(df.columns.values)

    if vars_len <9:
        df.columns.values[-4] = 'T1'

    else:
        df.columns.values[-4] = 'T2'
        df.columns.values[-5] = 'T1'

    print('during REMOVE df')
    return df


def var_insert(df, imei):
    """This function shifts around columns, inserts new variables into
    into the dataframe"""

    import math
    from datetime import timedelta

    #insert calculated volumetric water content measurements #and change in volumetric water
    #content and populate for two subsequent values under DateTime if their difference == 30min
    #populate the dVWC column with the difference of the adjoining values under VWC.
    last_col = len(df.columns.values) -1
    df.insert(last_col, column='VWC', value=None)
    df['VWC'] = .000494*(df['Volts']*1000)-.544

    ##Take the following into account during manipulation
    """df.insert(last_col+1, column='dVWC', value=None)
    df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
           'dVWC'] = df['VWC'].shift(-1) - df['VWC']


    #Then shift those values by one row remove top row holding NaN
    #fill rest of NaN with preceeding real value
    df['dVWC'] = df['dVWC'].shift(+1)
    df = df[1:]
    df['dVWC'] = df['dVWC'].ffill()"""


    #insert 'month', 'week', 'day' columns... these can be set as indeces
    #for used for simpler visualization, ##better to add during other algos
    """ df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
    df.insert(2, column="Week", value = df['DateTime'].dt.week)
    df.insert(3, column="Day", value = df['DateTime'].dt.day)"""

    return df

raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'

raw_converter_main(raw_dir, wrt_dir)