Advertisement
Guest User

Untitled

a guest
Nov 19th, 2017
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.26 KB | None | 0 0
  1. # Create a new column used to separate replicates
  def create_rep_col(data, by_columns, new_rep_name):
      """Add a replicate-key column to ``data`` and return the same frame.

      The frame is modified in place: ``data[new_rep_name]`` is created or
      overwritten.

      Args:
          data: pandas DataFrame to extend.
          by_columns: a single column label, or a list of column labels.
              With several labels the key is the ``"_"``-joined string form
              of the row values; with one label the column values are copied
              unchanged.
          new_rep_name: label of the column to write.

      Returns:
          The same ``data`` object, with the new column added.

      Raises:
          ValueError: if ``by_columns`` is an empty list.
      """
      if isinstance(by_columns, list):
          if not by_columns:
              raise ValueError("by_columns must name at least one column")
          # A one-element list names a single column; unwrap it so the lookup
          # below yields a 1-D Series rather than an (n, 1) frame.
          if len(by_columns) == 1:
              by_columns = by_columns[0]

      if isinstance(by_columns, list):
          # Iterate the raw row values so each cell is stringified exactly as
          # it appears in the combined array.
          rows = data[by_columns].to_numpy()
          data[new_rep_name] = ["_".join(str(cell) for cell in row) for row in rows]
      else:
          data[new_rep_name] = data[by_columns].to_numpy()
      return data
  8.  
  def replace_nans(data, f):
      """Replace NaN and +/-inf with 0 in the selected column(s) of ``data``.

      The selected values are converted to ``float32`` and written back, so
      ``data`` is modified in place.

      Args:
          data: pandas DataFrame to clean.
          f: a column label or a list of column labels.

      Returns:
          The same ``data`` object with the cleaned column(s).
      """
      # astype() returns a fresh array, so the in-place masking below never
      # touches the frame's own buffer before the explicit write-back.
      values = data[f].to_numpy().astype(np.float32)
      # The check runs after the float32 cast, so values that overflow to inf
      # during the cast are zeroed as well.
      values[~np.isfinite(values)] = 0
      data[f] = values
      return data
  15.  
  def generate_pairs(data, features, uid, rep_name, count):
      """Sample row pairs from the same and from different replicate groups.

      Only replicate groups with at least two rows take part in sampling,
      for both kinds of pairs. Randomness comes from the module-level ``r``
      generator (used with inclusive ``randint`` bounds).

      Args:
          data: pandas DataFrame holding ``uid``, ``rep_name`` and ``features``.
          features: iterable of feature column labels.
          uid: label of the row-identifier column.
          rep_name: label of the replicate-key column.
          count: total number of pairs requested; ``count // 2`` pairs of each
              kind are produced.

      Returns:
          Tuple ``(pairs1, pairs0, uids1, uids0)`` where ``pairs1`` are
          same-group pairs and ``pairs0`` are different-group pairs, each of
          shape ``(count // 2, 2, len(features))``, and ``uids1`` / ``uids0``
          hold the matching identifiers with shape ``(count // 2, 2)``.

      Raises:
          ValueError: if pairs are requested but there are too few replicate
              groups with two or more rows to draw them from.
      """
      n_same = count // 2
      n_diff = count // 2

      # Column layout of every row: [uid, rep, feature_0, feature_1, ...].
      table = data[[uid, rep_name] + list(features)].to_numpy()
      rep_values = data[rep_name].to_numpy()

      # Group the rows by replicate key once instead of re-filtering the frame
      # for every draw. The stable sort keeps the original row order inside
      # each group, and np.unique returns keys in sorted order.
      order = np.argsort(rep_values, kind="stable")
      _, starts, sizes = np.unique(
          rep_values[order], return_index=True, return_counts=True
      )
      groups = [
          table[order[start:start + size]]
          for start, size in zip(starts, sizes)
          if size > 1
      ]

      if n_same > 0 and not groups:
          raise ValueError(
              "no replicate group with at least two rows to draw same-group pairs"
          )
      if n_diff > 0 and len(groups) < 2:
          raise ValueError(
              "need at least two replicate groups with two or more rows "
              "to draw different-group pairs"
          )

      same_pairs = []
      for _ in range(n_same):
          members = groups[r.randint(0, len(groups) - 1)]
          first = r.randint(0, len(members) - 1)
          # A non-zero offset modulo the group size guarantees second != first.
          second = (first + r.randint(1, len(members) - 1)) % len(members)
          same_pairs.append([members[first], members[second]])

      diff_pairs = []
      for _ in range(n_diff):
          g1 = r.randint(0, len(groups) - 1)
          # Same offset trick, applied to the group index.
          g2 = (g1 + r.randint(1, len(groups) - 1)) % len(groups)
          left, right = groups[g1], groups[g2]
          diff_pairs.append([
              left[r.randint(0, len(left) - 1)],
              right[r.randint(0, len(right) - 1)],
          ])

      # Keep the 3-D shape even when nothing was sampled so the slicing below
      # still works for count < 2.
      empty = np.empty((0, 2, table.shape[1]), dtype=table.dtype)
      same_arr = np.array(same_pairs) if same_pairs else empty
      diff_arr = np.array(diff_pairs) if diff_pairs else empty

      uids1 = same_arr[:, :, 0]
      uids0 = diff_arr[:, :, 0]
      pairs1 = same_arr[:, :, 2:]
      pairs0 = diff_arr[:, :, 2:]
      return pairs1, pairs0, uids1, uids0
  45.  
  46. #Data specific.
  def train_test_uids_split(data, uid, test_size = 0.3):
      """Split the identifiers in ``data[uid]`` into train and test sets.

      A fraction ``test_size`` of the distinct identifiers is sampled with a
      fixed ``random_state``; every row whose identifier was sampled goes to
      the test side, all other rows to the train side. The returned arrays
      hold one entry per row, so repeated identifiers appear repeatedly.

      Args:
          data: pandas DataFrame containing the ``uid`` column.
          uid: label of the identifier column.
          test_size: fraction of distinct identifiers assigned to the test set.

      Returns:
          Tuple ``(train_uids, test_uids)`` of numpy arrays.
      """
      # NOTE: this reseeds NumPy's global generator as a side effect. The
      # sampling below uses its own random_state, so the seed is kept only
      # because later code may rely on it.
      np.random.seed(5531)
      held_out = data[uid].drop_duplicates().sample(frac=test_size, random_state=23)
      is_test = data[uid].isin(held_out)
      test_uids = data.loc[is_test, uid].to_numpy()
      train_uids = data.loc[~is_test, uid].to_numpy()
      return train_uids, test_uids
  52.  
  def get_props(data, uid, uids, prop):
      """Look up ``prop`` for both members of every identifier pair.

      For each identifier the value comes from the first row of ``data``
      whose ``uid`` column matches it.

      Args:
          data: pandas DataFrame containing the ``uid`` and ``prop`` columns.
          uid: label of the identifier column.
          uids: 2-D array whose columns 0 and 1 hold the pair members.
          prop: label of the property column to read.

      Returns:
          numpy array with two rows: the properties of ``uids[:, 0]`` and the
          properties of ``uids[:, 1]``.

      Raises:
          IndexError: if an identifier in ``uids`` does not occur in ``data``.
      """
      # Build the uid -> property table once rather than scanning the whole
      # frame for every identifier. keep="first" picks the first matching row.
      first_rows = data.drop_duplicates(subset=uid, keep="first")
      lookup = dict(zip(first_rows[uid].to_numpy(), first_rows[prop].to_numpy()))

      props = []
      for members in (uids[:, 0], uids[:, 1]):
          try:
              props.append(np.array([lookup[u] for u in members]))
          except KeyError as err:
              # IndexError is what an empty positional lookup raises, so
              # callers handling that keep working.
              raise IndexError(
                  "uid {!r} not found in data".format(err.args[0])
              ) from err
      return np.array(props)
  64.  
  def create_pairs_for_siamese(data, features, uid, by_columns, new_rep_name, count, prop=None,
                               test_stand_path='/home/kkochetov/data/Drugs/test_stand.pkl'):
      """Build labelled train/test pair sets for a siamese model.

      Steps: add the replicate-key column, split identifiers into train and
      test, move every identifier whose ``pert_id`` is listed in the pickled
      stand-out set to the test side, then sample pairs for each side
      (30% of ``count`` for test, 70% for train).

      Labels: 0 marks a pair from the same replicate group, 1 a pair from
      different groups.

      Args:
          data: pandas DataFrame; must contain ``uid``, ``pert_id``, the
              ``by_columns`` column(s) and ``features``. A new column
              ``new_rep_name`` is added to it in place.
          features: iterable of feature column labels.
          uid: label of the identifier column.
          by_columns: column label or list of labels defining a replicate.
          new_rep_name: label of the replicate-key column to create.
          count: total number of pairs to sample across train and test.
          prop: optional property column; when given, per-pair properties are
              returned as well and the training set is passed through
              ``bootstrap``.
          test_stand_path: path of the pickle file holding the ``pert_id``
              values that must end up in the test set.

      Returns:
          ``(test_pairs, test_labels, train_pairs, train_labels)`` when
          ``prop`` is None, otherwise ``(test_pairs, test_labels, test_props,
          train_pairs, train_labels, train_props)``.
      """
      data = create_rep_col(data, by_columns, new_rep_name)
      train_uids, test_uids = train_test_uids_split(data, uid, 0.2)
      print(train_uids.shape, test_uids.shape)

      # For PERT_ID. Data specific.
      # SECURITY: pickle.load executes arbitrary code from the file; only
      # point test_stand_path at a trusted file.
      with open(test_stand_path, 'rb') as stand_file:
          test_stand = pickle.load(stand_file)
      stand_uids = data[data.pert_id.isin(test_stand)][uid].to_numpy()
      # Stand-out identifiers are removed from train and forced into test.
      train_uids = train_uids[~np.isin(train_uids, stand_uids)]
      test_uids = np.unique(np.concatenate((test_uids, stand_uids)))
      print(train_uids.shape, test_uids.shape)

      test_pairs1, test_pairs0, test_uids1, test_uids0 = generate_pairs(
          data[data[uid].isin(test_uids)], features, uid, new_rep_name, int(count * 0.3))
      train_pairs1, train_pairs0, train_uids1, train_uids0 = generate_pairs(
          data[data[uid].isin(train_uids)], features, uid, new_rep_name, int(count * 0.7))

      # Same-group pairs come first and get label 0; different-group pairs
      # follow with label 1.
      train_pairs = np.concatenate((train_pairs1, train_pairs0))
      train_labels = np.concatenate(
          (np.array([0] * train_pairs1.shape[0]), np.array([1] * train_pairs0.shape[0])))
      test_pairs = np.concatenate((test_pairs1, test_pairs0))
      test_labels = np.concatenate(
          (np.array([0] * test_pairs1.shape[0]), np.array([1] * test_pairs0.shape[0])))

      if prop is not None:
          # Identifiers are concatenated in the same order as the pairs above,
          # so the properties line up with pairs and labels.
          test_props = get_props(data, uid, np.concatenate((test_uids1, test_uids0)), prop)
          train_props = get_props(data, uid, np.concatenate((train_uids1, train_uids0)), prop)
          train_pairs, train_props, train_labels = bootstrap(train_pairs, train_props, train_labels)
          return test_pairs, test_labels, test_props, train_pairs, train_labels, train_props
      return test_pairs, test_labels, train_pairs, train_labels
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement