Advertisement
Guest User

Untitled

a guest
Nov 19th, 2019
246
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.60 KB | None | 0 0
  1. import pathlib
  2. import pandas as pd
  3. from IPython.display import display
  4.  
  5.  
  6. # Filenames (without the extension) we are expected to find in `ashrae_dir`.
  7. _NAMES = [
  8.     'building_metadata',
  9.     'weather_train',
  10.     'weather_test',
  11.     'train',
  12.     'test',
  13. ]
  14.  
  15.  
  16. def import_data(ashrae_dir):
  17.     """
  18.    Import ASHRAE data from a directory containing the .csv files.
  19.  
  20.    Return a {'thing': pd.Dataframe} dictionary.
  21.    """
  22.     ashrae_dir = pathlib.Path(ashrae_dir)
  23.     data = {name: pd.read_csv((ashrae_dir / name).with_suffix('.csv'))}
  24.  
  25.     # Sanity check: the set of building ids should be the same in the train and test sets.
  26.     assert set(data['train'].building_id) == set(data['test'].building_id)
  27.  
  28.  
  29. # This now can be reused everytime you need to cache a bunch of dataframes to a h5 file
  30. def _cache_data(data, filename):
  31.     """
  32.    Given a data as a {str: pd.DataFrame} dictionary, save it to a .h5 file.
  33.    """
  34.     filename = pathlib.Path(filename)
  35.     assert filename.suffix == '.h5'
  36.     with pd.HDFStore(filename) as f:
  37.         for name, df in data.items():
  38.             f[name] = df
  39.  
  40.  
  41. def get_data(ashrae_dir, cache_file=None):
  42.     """
  43.    Import ASHRAE data with optional caching mechanism.
  44.  
  45.    Return a {'thing': pd.Dataframe} dictionary.
  46.    """
  47.     cache_file = pathlib.Path(cache_file)
  48.  
  49.     if cache_file is not None and cache_file.exists():
  50.         print(f'Importing data from {cache_file}')
  51.         with pd.HDF5Store(cache_file) as f:
  52.             data = {name: f[name] for name in _NAMES}
  53.     else:
  54.         print('Importing data from csv')
  55.         data = import_data(ashrae_dir)
  56.         _cache_data(data, cache_file)
  57.  
  58.     return data
  59.  
  60.  
  61. # Calling like this:
  62. data_raw = get_data('ashrae-energy-prediction', cache_file='store_raw.h5')
  63. # And if you want the same as in your .ipynb then you do:
  64. train_raw = data_raw['train']
  65. test_raw = data_raw['test']
  66. # ...
  67.  
  68. # But personally I would go with things like:
  69.  
  70. for name in _NAMES:
  71.     print(f'NaNs for {name}')
  72.     display(data_raw[name].isna().sum())
  73.  
  74.  
  75. # Count buildings
  76. def get_buildings_with_high_meter(df, threshold):
  77.     """
  78.    Return the building ids with a count of meter readings above a certain threshold.
  79.    """
  80.     return set(df.meter_reading > threshold)
  81.  
  82. for threshold in [1_000_000, 10_000_000]:
  83.     buildings = get_buildings_with_high_meter(data_raw['train'], threshold)
  84.     print(f'There are {len(buildings)} buildings above {threshold // 1_000_000}M meter readings}')
  85.  
  86. # Plot the > 10M.
  87. for b_id in buildings:
  88.     raw_data['train'][train_raw.building_id == b_id][train_raw.meter == 2].meter_reading.plot()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement