Guest User

Untitled

a guest
Dec 19th, 2018
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.65 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import numpy.random as nr
  4.  
  5. # Read In
  6. DF = pd.read_csv('train.csv',sep=',',header=0)
  7. for i in xrange(28):
  8. DF[str(i)] = DF['Target'].map(lambda x: int(str(i) in x.split(' ')))
  9.  
  10. value_counts = DF.ix[:,2:].apply(np.sum, axis=0)
  11. class_order = list(value_counts.sort_values().index)
  12. DF['inpool'] = 0
  13.  
  14. # Config Fold Num
  15.  
  16. Nfold = 5
  17.  
  18. # Arrange Fold From Least Labelled Class
  19.  
  20. for item in class_order:
  21. #print item,
  22. value_counts[item]
  23. origin_pool = [value_counts[item]/Nfold for i in range(Nfold)]
  24. randadd = nr.choice(Nfold, value_counts[item]%Nfold)
  25. while len(randadd)!=len(set(randadd)):
  26. randadd = nr.choice(Nfold, value_counts[item]%Nfold)
  27. for index in randadd:
  28. origin_pool[index] = origin_pool[index]+1
  29. existing_pool = DF[DF[item]==1].groupby('inpool').count().ix[:,0].to_dict()
  30. for i in range(Nfold):
  31. if i+1 not in existing_pool:
  32. existing_pool[i+1] = 0
  33.  
  34. waiting_pool = [each for each in origin_pool]
  35. for i in range(Nfold):
  36. if waiting_pool[i]<existing_pool[i+1]:
  37. print "Error Allocating Label: Could Not Balance"
  38. waiting_pool[i] = waiting_pool[i] - existing_pool[i+1]
  39.  
  40. unperturb_label = [pool_id+1 for pool_id in range(Nfold) for each in range(waiting_pool[pool_id]) ]
  41. perturb_label = nr.permutation(unperturb_label)
  42. #print perturb_label,
  43. #print len(perturb_label),
  44. #print len(DF[(DF['inpool']==0) & (DF[item]==1)]['inpool'])
  45. for index,dfindex in enumerate(DF[(DF['inpool']==0) & (DF[item]==1)]['inpool'].index):
  46. DF.ix[dfindex,'inpool'] = perturb_label[index]
  47.  
  48. # Check Balancing
  49.  
  50. DF.groupby('inpool').sum()
Add Comment
Please, Sign In to add comment