Advertisement
jack06215

[pandas] make custom time-series data

Jun 22nd, 2020
154
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.17 KB | None | 0 0
  1. import pandas as pd
  2. import dask.dataframe as dd
  3. import numpy as np
  4. import os
  5. from sklearn.datasets import make_classification
  6.  
  7. # parameters
  8. n_samples = 2000
  9. start = pd.to_datetime('2017-01-01')
  10. end = pd.to_datetime('2017-01-31')
  11.  
  12. def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
  13.     start_u = start.value//10**9
  14.     end_u = end.value//10**9
  15.  
  16.     return pd.DatetimeIndex((10**9*np.random.randint(start_u, end_u, n, dtype=np.int64)).view('M8[ns]'))
  17.  
  18. # features
  19. df = make_classification(n_samples = n_samples, n_features = 10)
  20. df_raw = pd.DataFrame(df[0], columns = ['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var9', 'var10'])
  21. df_raw['var1'] = np.random.randint(2, size=n_samples) + 1
  22. df_raw['var2'] = np.random.randint(100000, size=n_samples)
  23. df_raw['var3'] = np.random.randint(1000000, size=n_samples)
  24. df_raw['trx_inRow'] = 1
  25.  
  26. # timestamps
  27. df_timestamp = pd.DataFrame({'date': random_datetimes_or_dates(start, end, out_format= 'datetime', n=n_samples).sort_values()})
  28. df_raw['timestamp'] = pd.to_datetime(df_timestamp['date'], format='%Y-%m-%d', errors='ignore')
  29.  
  30. # class
  31. df_raw['class'] = df[1]
  32.  
  33. del df, df_timestamp
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement