Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import dask.dataframe as dd
- import numpy as np
- import os
- from sklearn.datasets import make_classification
# parameters
n_samples = 2000  # number of synthetic rows to generate
# Bounds of the window from which random timestamps are drawn
# (the upper bound is exclusive because np.random.randint excludes `high`).
start = pd.to_datetime('2017-01-01')
end = pd.to_datetime('2017-01-31')
def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
    """Draw ``n`` random timestamps from the half-open range [start, end).

    Args:
        start: ``pd.Timestamp`` lower bound (inclusive after flooring to
            the sampling resolution).
        end: ``pd.Timestamp`` upper bound (exclusive).
        out_format: ``'date'`` samples whole days, so every result falls
            on midnight. Any other value (including the default
            ``'datetime'``) samples at one-second resolution.
        n: Number of values to draw.

    Returns:
        An unsorted ``pd.DatetimeIndex`` of length ``n``.

    Raises:
        ValueError: Propagated from ``np.random.randint`` when ``start``
            is not strictly before ``end`` at the sampling resolution.
    """
    # Timestamp.value is nanoseconds since the epoch; pick how many
    # nanoseconds make up one sampling step.
    if out_format == 'date':
        ns_per_step = 24 * 60 * 60 * 10**9
    else:
        ns_per_step = 10**9
    low = start.value // ns_per_step
    high = end.value // ns_per_step
    steps = np.random.randint(low, high, n, dtype=np.int64)
    # Scale back to nanoseconds and reinterpret the int64 buffer as
    # datetime64[ns] without copying.
    return pd.DatetimeIndex((ns_per_step * steps).view('M8[ns]'))
# features
# make_classification returns a (feature matrix, label vector) pair.
features, labels = make_classification(n_samples=n_samples, n_features=10)
df_raw = pd.DataFrame(
    features,
    columns=['var{}'.format(i) for i in range(1, 11)],
)
# Overwrite the first three generated features with random integer columns.
df_raw['var1'] = np.random.randint(2, size=n_samples) + 1  # values 1 or 2
df_raw['var2'] = np.random.randint(100000, size=n_samples)
df_raw['var3'] = np.random.randint(1000000, size=n_samples)
df_raw['trx_inRow'] = 1

# timestamps
# The generator already yields datetime64 values, so the sorted index is
# assigned directly; a pd.to_datetime(..., errors='ignore') round-trip would
# be a no-op here and relies on an option deprecated in pandas 2.2.
df_raw['timestamp'] = random_datetimes_or_dates(
    start, end, out_format='datetime', n=n_samples
).sort_values()

# class
df_raw['class'] = labels

# Drop the intermediates so only df_raw is left behind.
del features, labels
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement