Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def create_rep_col(data, by_columns, new_rep_name):
    """Add a replicate-identifier column to *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place (also returned for chaining).
    by_columns : str or list of str
        Column name(s) whose values identify a replicate group.
    new_rep_name : str
        Name of the new identifier column.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``new_rep_name`` added.
    """
    if isinstance(by_columns, list) and len(by_columns) > 1:
        # Join the values of several columns with "_" into one string key.
        # .to_numpy() replaces the long-removed .get_values() (pandas >= 0.25/1.0).
        data[new_rep_name] = [
            "_".join(str(v) for v in row) for row in data[by_columns].to_numpy()
        ]
    else:
        data[new_rep_name] = data[by_columns].to_numpy()
    return data
def replace_nans(data, f):
    """Replace NaN and +/-inf values in column(s) *f* with 0.

    The column is cast to float32 first, so integer columns come back
    as floats (same as the original implementation).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place (also returned for chaining).
    f : str or list of str
        Column selector passed straight to ``data[f]``.

    Returns
    -------
    pandas.DataFrame
        The same frame with non-finite values zeroed.
    """
    vals = data[f].to_numpy().astype(np.float32)
    # ~isfinite covers both the NaN and the +/-inf masks of the original.
    vals[~np.isfinite(vals)] = 0
    data[f] = vals
    return data
def generate_pairs(data, features, uid, rep_name, count):
    """Sample positive and negative row pairs for siamese training.

    A *positive* pair is two distinct rows drawn from the same replicate
    group (same ``rep_name`` value); a *negative* pair is one row each
    from two different groups. Groups with a single row are excluded,
    since they cannot yield a positive pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``uid``, ``rep_name`` and ``features`` columns.
    features : iterable of str
        Feature columns carried into the returned pair arrays.
    uid : str
        Unique-row-identifier column (first column of the sampled rows).
    rep_name : str
        Replicate-group column (second column of the sampled rows).
    count : int
        Total number of pairs; split evenly between positive and negative.

    Returns
    -------
    tuple
        ``(pairs1, pairs0, uids1, uids0)`` where pairsX has shape
        ``(count//2, 2, n_features)`` and uidsX has shape ``(count//2, 2)``.
        Suffix 1 = positive (same group), 0 = negative (different groups).
    """
    count_1 = count // 2  # positive pairs
    count_0 = count // 2  # negative pairs
    # Row layout: [uid, rep, feature_0, ..., feature_k]
    data_map = data[[uid, rep_name] + list(features)].copy()
    uniq, cnt = np.unique(data[rep_name].to_numpy(), return_counts=True)
    # Drop singleton groups: they cannot supply two distinct rows.
    uniq_reps = uniq[cnt > 1]
    pairs1 = []
    for _ in range(count_1):
        t = r.randint(0, len(uniq_reps) - 1)
        rows = data_map[data_map[rep_name] == uniq_reps[t]].to_numpy()
        d1 = r.randint(0, len(rows) - 1)
        # Offset by 1..len-1 modulo len guarantees d2 != d1.
        d2 = (d1 + r.randint(1, len(rows) - 1)) % len(rows)
        pairs1.append([copy(rows[d1]), copy(rows[d2])])
    pairs0 = []
    for _ in range(count_0):
        t1 = r.randint(0, len(uniq_reps) - 1)
        # Same modular-offset trick guarantees t2 != t1.
        t2 = (t1 + r.randint(1, len(uniq_reps) - 1)) % len(uniq_reps)
        rows1 = data_map[data_map[rep_name] == uniq_reps[t1]].to_numpy()
        rows2 = data_map[data_map[rep_name] == uniq_reps[t2]].to_numpy()
        pairs0.append([copy(rows1[r.randint(0, len(rows1) - 1)]),
                       copy(rows2[r.randint(0, len(rows2) - 1)])])
    pairs0 = np.array(pairs0)
    pairs1 = np.array(pairs1)
    # Split off the uid column (index 0); index 1 (rep) is discarded.
    uids0 = pairs0[:, :, 0]
    uids1 = pairs1[:, :, 0]
    pairs0 = pairs0[:, :, 2:]
    pairs1 = pairs1[:, :, 2:]
    return pairs1, pairs0, uids1, uids0
#Data specific.
def train_test_uids_split(data, uid, test_size=0.3):
    """Split the unique values of column *uid* into train/test id arrays.

    The split is made over unique ids, but the returned arrays contain
    one entry per matching *row* of ``data`` (duplicates preserved),
    exactly like the original implementation.

    Parameters
    ----------
    data : pandas.DataFrame
    uid : str
        Identifier column.
    test_size : float, optional
        Fraction of unique ids assigned to the test set.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_uids, test_uids)``.
    """
    # NOTE(review): .sample(random_state=23) uses its own RandomState, so this
    # global seed looks redundant; kept to preserve the original side effect.
    np.random.seed(5531)
    sampled = data[uid].drop_duplicates().sample(frac=test_size, random_state=23)
    test_uids = data[data[uid].isin(sampled)][uid].to_numpy()
    train_uids = data[~data[uid].isin(test_uids)][uid].to_numpy()
    return train_uids, test_uids
def get_props(data, uid, uids, prop):
    """Look up property *prop* for each id in an (N, 2) array of id pairs.

    For each id the value from the *first* matching row of ``data`` is
    taken, matching the original ``[0]`` indexing.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain columns ``uid`` and ``prop``.
    uid : str
        Identifier column name.
    uids : numpy.ndarray
        Shape (N, 2); column 0 = first pair member, column 1 = second.
    prop : str
        Property column name.

    Returns
    -------
    numpy.ndarray
        Shape (2, N): ``out[0]`` for first members, ``out[1]`` for second.
    """
    # Build a uid -> first prop value index once, instead of rescanning the
    # whole frame for every id (the original was O(N * len(data))).
    lookup = data.drop_duplicates(subset=uid).set_index(uid)[prop]
    p0 = np.array([lookup[u] for u in uids[:, 0]])
    p1 = np.array([lookup[u] for u in uids[:, 1]])
    return np.array([p0, p1])
def create_pairs_for_siamese(data, features, uid, by_columns, new_rep_name, count, prop = None,
                             stand_path='/home/kkochetov/data/Drugs/test_stand.pkl'):
    """Build train/test pair sets (and labels) for siamese-network training.

    Pipeline: tag each row with a replicate-group id, split ids 80/20 into
    train/test, force a fixed "standard" perturbation set into the test
    split, then sample positive/negative pairs from each split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``uid``, ``by_columns``, ``features`` and (data
        specific) a ``pert_id`` column.
    features : iterable of str
        Feature columns used for the pairs.
    uid, by_columns, new_rep_name : see ``create_rep_col``/``generate_pairs``.
    count : int
        Total pair budget; 70% goes to train, 30% to test.
    prop : str, optional
        If given, per-pair property arrays are also returned and the
        training set is passed through ``bootstrap``.
    stand_path : str, optional
        Path of the pickled "standard" perturbation id list (new,
        defaulted parameter — previously hard-coded).

    Returns
    -------
    tuple
        ``(test_pairs, test_labels, train_pairs, train_labels)``, or with
        ``test_props``/``train_props`` inserted when *prop* is given.
        Label convention: 0 = same replicate group, 1 = different groups.
    """
    # data = replace_nans(data, features)
    data = create_rep_col(data, by_columns, new_rep_name)
    train_uids, test_uids = train_test_uids_split(data, uid, 0.2)
    print(train_uids.shape, test_uids.shape)
    #for PERT_ID. Data specific
    # SECURITY: pickle.load executes arbitrary code — only load trusted files.
    # 'with' closes the handle (the original leaked an open file object).
    with open(stand_path, 'rb') as fh:
        test_stand = pickle.load(fh)
    stand_uids = data[data.pert_id.isin(test_stand)][uid].to_numpy()
    # Set lookup: the original 'in ndarray' test was O(n) per element.
    stand_set = set(stand_uids)
    train_uids = np.array([t for t in train_uids if t not in stand_set])
    test_uids = np.unique(np.concatenate((test_uids, stand_uids)))
    print(train_uids.shape, test_uids.shape)
    test_pairs1, test_pairs0, test_uids1, test_uids0 = generate_pairs(
        data[data[uid].isin(test_uids)], features, uid, new_rep_name, int(count * 0.3))
    train_pairs1, train_pairs0, train_uids1, train_uids0 = generate_pairs(
        data[data[uid].isin(train_uids)], features, uid, new_rep_name, int(count * 0.7))
    train_pairs = np.concatenate((train_pairs1, train_pairs0))
    train_labels = np.concatenate((np.zeros(train_pairs1.shape[0], dtype=int),
                                   np.ones(train_pairs0.shape[0], dtype=int)))
    test_pairs = np.concatenate((test_pairs1, test_pairs0))
    test_labels = np.concatenate((np.zeros(test_pairs1.shape[0], dtype=int),
                                  np.ones(test_pairs0.shape[0], dtype=int)))
    #If property exist
    if prop is not None:  # '!= None' is non-idiomatic and breaks on __eq__ overrides
        test_props = get_props(data, uid, np.concatenate((test_uids1, test_uids0)), prop)
        train_props = get_props(data, uid, np.concatenate((train_uids1, train_uids0)), prop)
        train_pairs, train_props, train_labels = bootstrap(train_pairs, train_props, train_labels)
        return test_pairs, test_labels, test_props, train_pairs, train_labels, train_props
    return test_pairs, test_labels, train_pairs, train_labels
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement