# Untitled

import pathlib
import pandas as pd
from IPython.display import display


# Filenames (without the extension) we are expected to find in `ashrae_dir`.
_NAMES = [
    'building_metadata',
    'weather_train',
    'weather_test',
    'train',
    'test',
]


def import_data(ashrae_dir, names=None):
    """
    Import ASHRAE data from a directory containing the .csv files.

    `ashrae_dir` is the directory holding one `<name>.csv` file per table.
    `names` is an optional iterable of table names (file stems) to load; when
    omitted, every name listed in `_NAMES` is loaded.

    Return a {'thing': pd.DataFrame} dictionary keyed by table name.

    Raise AssertionError if both 'train' and 'test' were loaded and they do
    not contain the same set of building ids.
    """
    ashrae_dir = pathlib.Path(ashrae_dir)
    if names is None:
        names = _NAMES

    # Build the file name explicitly so a name containing a dot is not
    # mangled the way `with_suffix` would mangle it.
    data = {name: pd.read_csv(ashrae_dir / f'{name}.csv') for name in names}

    # Sanity check: the set of building ids should be the same in the train and test sets.
    if 'train' in data and 'test' in data:
        assert set(data['train'].building_id) == set(data['test'].building_id), \
            'train and test sets do not contain the same building ids'

    return data


# This now can be reused everytime you need to cache a bunch of dataframes to a h5 file
def _cache_data(data, filename):
    """
    Given a data as a {str: pd.DataFrame} dictionary, save it to a .h5 file.
    """
    filename = pathlib.Path(filename)
    assert filename.suffix == '.h5'
    with pd.HDFStore(filename) as f:
        for name, df in data.items():
            f[name] = df


def get_data(ashrae_dir, cache_file=None):
    """
    Import ASHRAE data with optional caching mechanism.

    `ashrae_dir` is the directory containing the .csv files.
    `cache_file` is an optional path to a .h5 cache. If it exists, the data is
    read from it; otherwise the .csv files are imported and, when a cache
    path was given, written to it. With `cache_file=None` no caching is done.

    Return a {'thing': pd.DataFrame} dictionary.
    """
    # Only wrap real paths: pathlib.Path(None) raises TypeError.
    if cache_file is not None:
        cache_file = pathlib.Path(cache_file)

    if cache_file is not None and cache_file.exists():
        print(f'Importing data from {cache_file}')
        # Open read-only so that loading can never modify the cache.
        with pd.HDFStore(cache_file, mode='r') as f:
            data = {name: f[name] for name in _NAMES}
    else:
        print('Importing data from csv')
        data = import_data(ashrae_dir)
        # Only write a cache when the caller asked for one.
        if cache_file is not None:
            _cache_data(data, cache_file)

    return data


# Example call:
data_raw = get_data('ashrae-energy-prediction', cache_file='store_raw.h5')
# If you want the same variables as in your .ipynb, unpack them:
train_raw, test_raw = data_raw['train'], data_raw['test']
# ...

# But personally I would rather loop over the tables by name:

for name in _NAMES:
    print(f'NaNs for {name}')
    nan_counts = data_raw[name].isna().sum()
    display(nan_counts)


# Count buildings
def get_buildings_with_high_meter(df, threshold):
    """
    Return the set of building ids having a meter reading above a threshold.

    `df` must have `building_id` and `meter_reading` columns. A building is
    included as soon as one of its rows has `meter_reading > threshold`.
    The result is an empty set when no reading exceeds the threshold.
    """
    # Select the ids of the rows above the threshold; building a set from the
    # boolean mask itself would only ever contain True/False.
    above = df.meter_reading > threshold
    return set(df.loc[above, 'building_id'])

for threshold in [1_000_000, 10_000_000]:
    buildings = get_buildings_with_high_meter(data_raw['train'], threshold)
    print(f'There are {len(buildings)} buildings above {threshold // 1_000_000}M meter readings')

# Plot the > 10M.
# `buildings` still holds the result for the last threshold of the loop above (10M).
for b_id in buildings:
    # Combine both conditions into a single mask aligned with `train_raw`,
    # instead of applying a full-length mask to an already filtered frame.
    selected = (train_raw.building_id == b_id) & (train_raw.meter == 2)
    train_raw.loc[selected, 'meter_reading'].plot()