Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
import numpy.random as nr

# Stratified fold assignment for multi-label data.
#
# Each row of train.csv carries a space-separated list of class ids in its
# 'Target' column.  Rows are assigned to one of Nfold folds (column 'inpool',
# values 1..Nfold) class by class, starting with the least frequent class, so
# that every class is spread as evenly as possible over the folds.


def add_label_columns(df, n_classes, target_col='Target'):
    """Add one 0/1 indicator column per class id to ``df`` (in place).

    Column ``str(k)`` is 1 when ``k`` appears as a whole token in the
    space-separated ``target_col`` string of that row, else 0.

    Returns the list of created column names, ``['0', ..., str(n_classes-1)]``.
    """
    label_columns = [str(class_id) for class_id in range(n_classes)]
    for label in label_columns:
        # Bind ``label`` as a default so the lambda does not depend on the
        # loop variable; match whole tokens so '1' does not match '10'.
        df[label] = df[target_col].map(
            lambda target, label=label: int(label in target.split(' ')))
    return label_columns


def _fold_quota(total, n_fold):
    """Split ``total`` rows into ``n_fold`` integer quotas differing by at most one.

    The ``total % n_fold`` leftover rows go to distinct folds picked at random.
    """
    quota = [total // n_fold] * n_fold
    # replace=False draws distinct folds directly (no retry loop needed).
    for fold_idx in nr.choice(n_fold, total % n_fold, replace=False):
        quota[fold_idx] += 1
    return quota


def assign_folds(df, class_order, value_counts, n_fold):
    """Assign every labelled row of ``df`` to a fold, writing column 'inpool'.

    df           -- DataFrame holding one 0/1 indicator column per class.
    class_order  -- class column names in processing order (the script passes
                    them sorted from least to most frequent).
    value_counts -- mapping/Series: class column name -> number of rows with
                    that class.
    n_fold       -- number of folds; fold ids are 1..n_fold, 0 = unassigned.

    'inpool' is reset to 0 first.  For each class a per-fold quota is drawn,
    the rows of that class already placed by earlier classes are subtracted,
    and the still-unassigned rows receive the remaining fold ids in random
    order.  If earlier classes already put more rows of this class into a
    fold than its quota allows, a message is printed and that fold simply
    receives no further rows of this class.

    ``df`` is modified in place and also returned.
    """
    df['inpool'] = 0
    for item in class_order:
        quota = _fold_quota(int(value_counts[item]), n_fold)

        has_label = df[item] == 1
        # Rows of this class per fold id (key 0 = not yet assigned).
        existing = df.loc[has_label].groupby('inpool').size().to_dict()

        waiting = []
        for fold_idx in range(n_fold):
            already = existing.get(fold_idx + 1, 0)
            if quota[fold_idx] < already:
                print("Error Allocating Label: Could Not Balance")
            # A negative remainder means "nothing more for this fold".
            waiting.append(max(quota[fold_idx] - already, 0))

        unassigned = has_label & (df['inpool'] == 0)
        n_unassigned = int(unassigned.sum())
        if n_unassigned == 0:
            continue

        fold_ids = np.repeat(np.arange(1, n_fold + 1), waiting)
        shuffled = nr.permutation(fold_ids)
        # When a fold was over quota there are more ids than rows; only the
        # first n_unassigned of the shuffled ids are used.
        df.loc[unassigned, 'inpool'] = shuffled[:n_unassigned]
    return df


if __name__ == "__main__":
    # Read In
    DF = pd.read_csv('train.csv', sep=',', header=0)
    label_columns = add_label_columns(DF, 28)
    value_counts = DF[label_columns].sum(axis=0)
    class_order = list(value_counts.sort_values().index)

    # Config Fold Num
    Nfold = 5

    # Arrange Fold From Least Labelled Class
    assign_folds(DF, class_order, value_counts, Nfold)

    # Check Balancing
    print(DF.groupby('inpool')[label_columns].sum())
Add Comment
Please, Sign In to add comment