Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2019
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.08 KB | None | 0 0
  1. import random
  2. import numpy as np
  3. import pandas as pd
  4. from collections import Counter, defaultdict
  5.  
  6.  
  7. def stratified_group_k_fold(X, y, groups, k, seed=None):
  8. labels_num = np.max(y) + 1
  9. y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
  10. y_distr = Counter()
  11. for label, g in zip(y, groups):
  12. y_counts_per_group[g][label] += 1
  13. y_distr[label] += 1
  14.  
  15. y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
  16. groups_per_fold = defaultdict(set)
  17.  
  18. def eval_y_counts_per_fold(y_counts, fold):
  19. y_counts_per_fold[fold] += y_counts
  20. std_per_label = []
  21. for label in range(labels_num):
  22. label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
  23. std_per_label.append(label_std)
  24. y_counts_per_fold[fold] -= y_counts
  25. return np.mean(std_per_label)
  26.  
  27. groups_and_y_counts = list(y_counts_per_group.items())
  28. random.Random(seed).shuffle(groups_and_y_counts)
  29.  
  30. for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
  31. best_fold = None
  32. min_eval = None
  33. for i in range(k):
  34. fold_eval = eval_y_counts_per_fold(y_counts, i)
  35. if min_eval is None or fold_eval < min_eval:
  36. min_eval = fold_eval
  37. best_fold = i
  38. y_counts_per_fold[best_fold] += y_counts
  39. groups_per_fold[best_fold].add(g)
  40.  
  41. all_groups = set(groups)
  42. for i in range(k):
  43. train_groups = all_groups - groups_per_fold[i]
  44. test_groups = groups_per_fold[i]
  45.  
  46. train_indices = [i for i, g in enumerate(groups) if g in train_groups]
  47. test_indices = [i for i, g in enumerate(groups) if g in test_groups]
  48.  
  49. yield train_indices, test_indices
  50.  
  51.  
  52.  
  53. x_train = pd.read_csv('../input/train/train.csv')
  54. y_train = train.Target.values
  55. groups = np.array(x_train.ID.values)
  56.  
  57.  
  58. for fold_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(train_x, train_y, groups, k=5)):
  59. y_train, y_test = y[train_idx], y[test_idx]
  60. x_train, x_test = groups[train_idx], groups[test_idx]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement