# IDS-18 Data Loading

import pandas as pd
import numpy as np


def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is converted to the narrowest dtype of the same kind (integer or
    float) whose representable range contains the column's minimum and
    maximum. A column is never widened, and columns of any other dtype are
    left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true (the default), print the memory usage after the
            conversion and the percentage saved.

    Returns:
        The same ``df`` object that was passed in.

    Note:
        Float downcasting only checks the value *range*, not precision, so
        values may be rounded when a column becomes float16 or float32.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    # Candidate targets, narrowest first.
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo

        for target in targets:
            # Only ever narrow: stop once the candidate is as wide as the
            # column already is (this also keeps inf/NaN-only float columns
            # from being widened to float64).
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds: a value equal to the type's min/max still
            # fits. NaN min/max (empty or all-NaN column) compares False,
            # so such columns are left unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # An empty frame can report zero bytes; avoid dividing by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))
    return df


# Daily capture files, listed in the order they are concatenated.
csv_paths = [
    './data/CSVs/02-14-2018.csv',
    './data/CSVs/02-15-2018.csv',
    './data/CSVs/02-16-2018.csv',
    './data/CSVs/02-20-2018.csv',
    './data/CSVs/02-21-2018.csv',
    './data/CSVs/02-22-2018.csv',
    './data/CSVs/02-23-2018.csv',
    './data/CSVs/02-28-2018.csv',
    './data/CSVs/03-01-2018.csv',
    './data/CSVs/03-02-2018.csv',
]

# The first file is parsed with pandas' inferred dtypes; that schema is then
# forced onto every other file so all frames agree on column dtypes.
# The dtype mapping must be captured BEFORE the frame is downcast, otherwise
# the narrowed dtypes would be imposed on the remaining files.
d0 = pd.read_csv(csv_paths[0])
dtypes_of_0 = d0.dtypes.to_dict()

# Downcast each frame as soon as it is loaded so that at most one
# full-width frame is held in memory at a time.
dataframes = [reduce_mem_usage(d0)]
del d0
for path in csv_paths[1:]:
    dataframes.append(
        reduce_mem_usage(
            pd.read_csv(path, low_memory=False, dtype=dtypes_of_0, index_col=False)
        )
    )

# One concat call copies every row once; concatenating pairwise would re-copy
# the accumulated frame on every step.
df = pd.concat(dataframes)

# Drop the per-file frames: the list holds the only remaining references, so
# emptying it is what actually lets their memory be reclaimed.
dataframes.clear()