Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def synthetic_data(size=1000, n_stages=4, n_features=2, noise=False, size_noise=0.1):
    """Generate a synthetic table of stage-to-stage transitions.

    The ids are split into ``n_stages`` equally sized consecutive blocks.
    Every block gets one randomly drawn path (an ordered list of distinct
    stage labels), and the table is exploded so that each row is one step of
    one id's path.  For every stage that has more than one distinct successor
    a feature column ``X_<k>`` is added; rows taking the j-th outgoing
    transition of that stage get the value ``(j + 1) + U[0, 1) * 1.5`` and all
    other rows get ``0``.

    Args:
        size: Requested number of ids.  It is rounded down to a multiple of
            ``n_stages``.
        n_stages: Number of id blocks; also bounds the stage labels, which
            are drawn from ``1 .. n_stages - 1``.  Must be at least 2.
        n_features: Currently unused; kept for interface compatibility.
        noise: Currently unused; kept for interface compatibility.
        size_noise: Currently unused; kept for interface compatibility.

    Returns:
        A tuple ``(feature_cols, data)`` where ``feature_cols`` is the list
        of generated feature column names and ``data`` is a DataFrame with
        the columns ``id``, ``stages`` (current stage, as a string), ``from``
        (previous stage of the same id, NaN on the first step) followed by
        the feature columns.  The frame keeps the index produced by
        ``DataFrame.explode`` (labels repeat once per step of an id).

    Raises:
        ValueError: If ``n_stages`` is smaller than 2.
    """
    if n_stages < 2:
        # np.random.randint(2, n_stages + 1) needs a non-empty range.
        raise ValueError("n_stages must be at least 2, got {}".format(n_stages))

    block = int(size / n_stages)

    # One path per block, repeated for every id of the block.  Building the
    # rows block by block avoids label-inclusive ``.loc`` slices, which spill
    # one row into the following block.
    paths = []
    for _ in range(n_stages):
        drawn = []
        for _ in range(np.random.randint(2, n_stages + 1)):
            upper = np.random.randint(2, n_stages + 1)
            drawn.append(int(np.random.randint(1, upper)))
        # Order-preserving de-duplication.
        ways = list(dict.fromkeys(drawn))
        if len(ways) == 1:
            # A path needs at least two stages to contain a transition.
            ways = [1, 2] if ways[0] == 1 else [1, ways[0]]
        paths.extend([ways] * block)

    data = pd.DataFrame({'id': np.arange(block * n_stages), 'stages': paths})
    data = data.explode('stages')
    data['stages'] = data['stages'].astype(str)
    data['from'] = data.groupby(['id'])['stages'].shift(1)

    # Distinct (from, to) pairs in order of first appearance.
    edge_rows = data.loc[:, ['from', 'stages']].drop_duplicates().dropna().values
    edges = [(str(src), str(dst)) for src, dst in edge_rows]

    feature_cols = []
    for stage_from in data['stages'].unique():
        # Exact comparison: a substring test would make stage '1' also pick
        # up the edges leaving stage '10', '11', ...
        stages_to = [dst for src, dst in edges if src == stage_from]
        if len(stages_to) < 2:
            continue

        column = 'X_{}'.format(len(feature_cols))
        values = np.zeros(len(data))
        leaves_stage = (data['from'] == stage_from).to_numpy(dtype=bool)
        for feat, stage_to in enumerate(stages_to):
            mask = leaves_stage & (data['stages'] == stage_to).to_numpy(dtype=bool)
            values[mask] = (feat + 1) + np.random.random() * 1.5
        data[column] = values
        feature_cols.append(column)

    return feature_cols, data
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement