Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pathlib
- import pandas as pd
- from IPython.display import display
# Base names (extension omitted) of the .csv files that must be present in `ashrae_dir`.
_NAMES = [
    'building_metadata',
    'weather_train',
    'weather_test',
    'train',
    'test',
]
def import_data(ashrae_dir):
    """
    Import ASHRAE data from a directory containing the .csv files.

    Every name listed in `_NAMES` is read from `<ashrae_dir>/<name>.csv`.

    Parameters:
        ashrae_dir: directory (str or path-like) holding the .csv files.

    Return a {name: pd.DataFrame} dictionary with one entry per name in `_NAMES`.

    Raises:
        AssertionError: if the train and test sets do not contain the same
            set of building ids.
    """
    ashrae_dir = pathlib.Path(ashrae_dir)
    data = {
        name: pd.read_csv((ashrae_dir / name).with_suffix('.csv'))
        for name in _NAMES
    }
    # Sanity check: the set of building ids should be the same in the train and test sets.
    assert set(data['train'].building_id) == set(data['test'].building_id), (
        'train and test sets do not share the same building ids'
    )
    return data
# This can now be reused every time you need to cache a bunch of dataframes to an .h5 file
def _cache_data(data, filename):
    """
    Save a {str: pd.DataFrame} dictionary to a .h5 file.

    Each dataframe is stored under its dictionary key. `filename` must end
    in `.h5`; anything else trips the assertion below.
    """
    target = pathlib.Path(filename)
    assert target.suffix == '.h5'
    with pd.HDFStore(target) as store:
        for key in data:
            store[key] = data[key]
def get_data(ashrae_dir, cache_file=None):
    """
    Import ASHRAE data with optional caching mechanism.

    Parameters:
        ashrae_dir: directory holding the .csv files; only read when the
            cache is missing or not requested.
        cache_file: optional path to a .h5 store. When it exists the data is
            loaded from it; when it does not, the data is imported from csv
            and then written to it. With `None` (the default) no cache is
            read or written.

    Return a {name: pd.DataFrame} dictionary with one entry per name in `_NAMES`.
    """
    if cache_file is not None:
        # Convert only after the None check: pathlib.Path(None) raises TypeError.
        cache_file = pathlib.Path(cache_file)
        if cache_file.exists():
            print(f'Importing data from {cache_file}')
            # Read-only mode so loading can never modify the cache.
            with pd.HDFStore(cache_file, mode='r') as f:
                return {name: f[name] for name in _NAMES}

    print('Importing data from csv')
    data = import_data(ashrae_dir)
    if cache_file is not None:
        _cache_data(data, cache_file)
    return data
# Typical call:
data_raw = get_data('ashrae-energy-prediction', cache_file='store_raw.h5')

# To keep the same variable names as in your .ipynb:
train_raw, test_raw = data_raw['train'], data_raw['test']
# ...

# Personally I would rather loop over the dictionary, e.g. to report NaN counts:
for name in _NAMES:
    print(f'NaNs for {name}')
    nan_counts = data_raw[name].isna().sum()
    display(nan_counts)
- # Count buildings
def get_buildings_with_high_meter(df, threshold):
    """
    Return the ids of the buildings with at least one meter reading above a threshold.

    Parameters:
        df: dataframe with `building_id` and `meter_reading` columns.
        threshold: readings strictly greater than this value count as high.

    Return a set of building ids (empty when no reading exceeds `threshold`).
    """
    # Select the building ids on the rows that pass the comparison; taking
    # set() of the comparison itself would only yield {True, False}.
    high_rows = df.meter_reading > threshold
    return set(df.loc[high_rows, 'building_id'])
train = data_raw['train']

# Keep the result for each threshold so the plot below does not depend on
# whichever value the loop variable happened to hold last.
buildings_by_threshold = {}
for threshold in [1_000_000, 10_000_000]:
    buildings = get_buildings_with_high_meter(train, threshold)
    buildings_by_threshold[threshold] = buildings
    print(f'There are {len(buildings)} buildings above {threshold // 1_000_000}M meter readings')

# Plot the > 10M.
for b_id in buildings_by_threshold[10_000_000]:
    # Combine both conditions into one mask over the full frame; chaining two
    # boolean selections would apply a full-length mask to a filtered frame.
    selected = (train.building_id == b_id) & (train.meter == 2)
    train.loc[selected, 'meter_reading'].plot()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement