blackbox_classification
ChrisProsser | Jan 16th, 2014 | Python
#!/usr/bin/python
from __future__ import division
import sys, os, numpy as np
from sklearn import svm
from matplotlib import pyplot as plt
from csv import writer as csvwriter, reader as csvreader
from scipy.stats import nanmean

# globals - you will need to alter these for your problem...
default_dir = r'C:\Users\Chris\Dropbox\ProsserSystems\Python\machine_learning\sklearn_data'
default_trn_path = os.path.join(default_dir, 'train.csv')
default_tst_path = os.path.join(default_dir, 'test.csv')
delim = ','
# zero indexed; do not adjust for excluded columns (exclusions are handled
# separately); -1 means there is no id column
trn_id_col = -1
tst_id_col = -1
labels_col = 0
# col nos to be excluded - zero indexed
excl_trn_cols = [] # e.g. [3,8,10]
excl_tst_cols = [] # e.g. -1 from above: [i-1 for i in excl_trn_cols]
heads_in_trn_file, heads_in_tst_file = False, False

# regularisation
auto_find_loops = 30 # WARNING: this trains auto_find_loops * iterations
                     #          classifiers. Set to 1 to turn the search off
                     #          and use the reg default (specified in main).
adj_rate = 2.0 # step size for the auto-find search: C is multiplied or
               # divided by this number each loop
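
# A hedged sketch of the search schedule (illustrative, not executed): with
# the default adj_rate of 2.0 and the initial C of 1.0 set in main, the first
# few 'up' steps try doubling values of C; when CV accuracy stops improving,
# the direction flips and adj_rate is damped by 5% per bad step.
#   >>> reg, adj_rate = 1.0, 2.0
#   >>> [reg * adj_rate ** k for k in range(4)]   # four 'up' steps
#   [1.0, 2.0, 4.0, 8.0]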

# These should have reasonable defaults if you are not sure...
trn_perc = 90
iterations = 1000
verbose = False
show_graphs = False # plots the data one feature against another; only useful
                    # if you have a very small no of features, e.g. <= 8

# initialisations
trn_cols, tst_cols = 0, 0

def scale_features(X, mu, sd, m, n):
    """
    Call with a numpy array of values to be scaled down to approximately the
    -3 to 3 range. This is required for algorithms such as gradient descent
    so that they can converge more efficiently.
    The m and n args are the no of rows and no of cols respectively.
    Returns a numpy array with a feature added for x0 (a 1 for each row),
    plus the mean and standard deviation used, so they can be reused later.
    """
    # function applied to each element (vectorised below)
    def scale(x, mu, sd):
        return (x-mu)/sd if sd != 0 else x

    # vectorise the function above
    scale_vec = np.vectorize(scale, otypes=[float])

    # if no mean/sd were passed in, compute them from this data
    if len(mu) == 0:
        mu = np.mean(X, axis=0)
    if len(sd) == 0:
        sd = np.std(X, axis=0)
    X_norm = np.ones((m, n+1))
    X_norm[:, 1:] = scale_vec(X, mu, sd)
    return X_norm, mu, sd
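
# A minimal doctest-style sketch of what scale_features produces (the toy
# array below is made up for illustration; numpy's exact repr may vary):
#   >>> X = np.array([[1.0, 10.0], [3.0, 30.0]])
#   >>> X_norm, mu, sd = scale_features(X, [], [], 2, 2)
#   >>> X_norm                      # bias column of ones, then scaled features
#   array([[ 1., -1., -1.],
#          [ 1.,  1.,  1.]])
#   >>> mu, sd                      # returned so later data can reuse them
#   (array([  2.,  20.]), array([  1.,  10.]))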

def graphs(y, X, m, n, label):
    if not show_graphs:
        return

    # get indexes of positive and negative examples
    pos = [i for i in range(m) if y[i] == 1]
    neg = [i for i in range(m) if y[i] == 0]

    # plot every pair of features against each other: positives as green
    # circles, negatives as red crosses (graphs is called before the x0 bias
    # column is added, so every column here is a real feature)
    for j in range(n):
        for j2 in range(j+1, n): # for all combos
            x1_p, x1_n = [X[i][j] for i in pos], [X[i][j] for i in neg]
            x2_p, x2_n = [X[i][j2] for i in pos], [X[i][j2] for i in neg]
            plt.plot(x1_p, x2_p, 'go', x1_n, x2_n, 'rx')
            if len(label) > j2: # heads may be empty if the file had none
                plt.xlabel(label[j])
                plt.ylabel(label[j2])
            plt.show()

def write_to_file(data, f_name):
    writer = csvwriter(open(f_name, 'wb'))
    writer.writerows(data)

def feature_prep(data, heads, stage='training', use_mean=[], use_sd=[],
                 write_output_to_file=False, cur_loop=0):
    """
    use_mean and use_sd should not be specified for the initial training
    data, but should then be passed in for cv, test and prediction data
    (so that the same feature scaling as the initial training data is used).
    """

    # only be verbose on the first loop, to avoid repeating the same output
    l_verbose = (cur_loop == 0 and verbose)

    # pick out the feature columns (and labels, for training/cv data)
    if l_verbose: print '\n', '-'*80, '\n', 'Stage:', stage
    ids = None
    if stage not in ['test', 'predict']:
        feature_cols = [i for i in range(trn_cols)
                        if i not in [trn_id_col, labels_col]+excl_trn_cols]
        if l_verbose:
            print 'feature_cols:', feature_cols
            print 'trn_cols:', trn_cols, '| trn_id_col:', trn_id_col, \
                  '| labels_col:', labels_col, '| excl_trn_cols:', excl_trn_cols
        X, y = data[:, feature_cols], data[:, labels_col]

        # filter heads down to the feature columns actually used
        if heads_in_trn_file:
            heads = [heads[i] for i in range(len(heads)) if i in feature_cols]

    else:
        feature_cols = [i for i in range(tst_cols)
                        if i not in [tst_id_col]+excl_tst_cols]
        if l_verbose:
            print 'feature_cols:', feature_cols
            print 'tst_cols:', tst_cols, '| tst_id_col:', tst_id_col, \
                  '| excl_tst_cols:', excl_tst_cols
        X, y = data[:, feature_cols], None

        # filter heads down to the feature columns actually used
        if heads_in_tst_file:
            heads = [heads[i] for i in range(len(heads)) if i in feature_cols]

        # record the tst id column if needed
        if tst_id_col >= 0:
            ids = data[:, tst_id_col]

    m, n = np.size(X, 0), np.size(X, 1) # no of rows and cols

    if l_verbose:
        print 'Heads used:\n', ', '.join(i for i in heads)
        print 'X:', np.shape(X), 'y:', np.shape(y), 'ids:', np.shape(ids), \
              'data:', np.shape(data), 'm:', m, 'n:', n, '\n'

    # fill blanks with column averages
    if len(use_mean) > 0:
        col_default = use_mean
    else:
        # calc means of cols if not passed in as args
        col_default = use_mean = nanmean(X, axis=0)
    inds = np.where(np.isnan(X)) # find indices of empty cells to be replaced
    X[inds] = np.take(col_default, inds[1])

    if show_graphs:
        graphs(y, X, m, n, heads)

    if l_verbose:
        print '\nFirst ten rows before normalisation:'
        np.set_printoptions(precision=4, suppress=True)
        print X[:10, :], '\n'

    # scale the features & write output
    X, mu, sd = scale_features(X, use_mean, use_sd, m, n)
    if write_output_to_file:
        write_to_file(X, os.path.join(default_dir, 'X_'+stage+'.csv'))

    if l_verbose:
        print '\nFirst ten rows after normalisation:'
        print X[:10, 1:], '\n'

    return X, y, mu, sd, ids
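
# A sketch of the intended calling pattern (this mirrors main below): the
# training call computes mu and sd, and every later call passes them back in
# so that all data goes through identical scaling:
#   X_trn, y_trn, mu, sd, ids = feature_prep(trn_data, heads, 'training')
#   X_tst, _, mu, sd, ids = feature_prep(tst_data, heads, 'test', mu, sd)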

def conv(val):
    """Convert a csv field to a float. Non-numeric strings are mapped to the
    sum of their ASCII values; empty fields become None (nan after import)."""
    try:
        return float(val)
    except (TypeError, ValueError):
        if not val:
            return None
        return float(sum([ord(i) for i in str(val)])) # sum of ascii vals
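
# Doctest-style examples of the converter (illustrative values):
#   >>> conv('3.5')
#   3.5
#   >>> conv('')                    # empty field -> None
#   >>> conv('ab')                  # non-numeric -> ord('a') + ord('b')
#   195.0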

def import_data(mode='training'):
    global trn_cols, tst_cols

    # get input file (features and labels)
    if len(sys.argv) > 1:
        if mode == 'training':
            fname = sys.argv[1]
        elif len(sys.argv) > 2:
            fname = sys.argv[2]
        else:
            fname = default_tst_path
    else:
        if mode == 'training':
            fname = default_trn_path
        else:
            fname = default_tst_path

    if not os.path.exists(fname):
        print "usage:", os.path.split(sys.argv[0])[1], "[default_trn_path]", \
              "[default_tst_path]"
        print "Valid file paths must be provided as args or global variables"
        sys.exit("invalid input")

    # get heads from the first row (and count the columns)
    reader = csvreader(open(fname, 'rb'))
    start_row, heads = 0, []
    for row in reader:
        if mode == 'training':
            trn_cols = len(row)
            if heads_in_trn_file:
                heads = row
                start_row += 1
        else:
            tst_cols = len(row)
            if heads_in_tst_file:
                heads = row
                start_row += 1
        break

    # build a dict to map each col to a conv func (if not excluded)
    if mode not in ['test', 'predict']:
        cols = [i for i in range(trn_cols) if i not in excl_trn_cols]
    else:
        cols = [i for i in range(tst_cols) if i not in excl_tst_cols]
    conv_dict = {c: conv for c in cols}

    if verbose:
        print '\nData import:', mode, '| cols:', cols, '\n'

    # import data
    #   unneeded cols are not excluded here; everything is imported, just
    #   without conversions - they are excluded later in feature_prep
    data = np.genfromtxt(fname, delimiter=delim, converters=conv_dict,
                         skip_header=start_row)

    if verbose:
        print 'all heads:\n', ', '.join(i for i in heads), '\n'
        print 'shape of data:', np.shape(data)
        print data

    return data, heads

def split_trn_data(data):
    m = np.size(data, 0)
    rands = np.random.random_sample(m)

    # select cases where the random no from above is <= the threshold
    trn_data = data[rands <= (trn_perc/100), :]
    cv_data = data[rands > (trn_perc/100), :]

    return trn_data, cv_data
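
# A quick sketch of the split (random, so the sizes vary around trn_perc):
#   >>> data = np.arange(20.0).reshape(10, 2)
#   >>> trn_data, cv_data = split_trn_data(data)
#   >>> len(trn_data) + len(cv_data)
#   10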

def build_classifier(X, y, reg):
    # rbf is a Gaussian kernel; C (reg) trades margin width off against
    # misclassified training examples
    clf = svm.SVC(kernel='rbf', C=reg, cache_size=1000)
    return clf.fit(X, y)
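
# A minimal usage sketch on a made-up AND-gate dataset (the prediction depends
# on the RBF fit, so treat the output as illustrative only):
#   >>> X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
#   >>> y = np.array([0, 0, 0, 1], dtype=float)
#   >>> clf = build_classifier(X, y, 1.0)
#   >>> clf.predict(X)              # should roughly recover y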

def main():

    global adj_rate
    reg, reg_loop, reg_dir = 1.0, 0, 'up'
    reg_rec, trn_rec, cv_rec = [], [], []

    # import training data
    data, heads = import_data('training')

    while reg_loop < auto_find_loops:

        trn, cv = [], []
        for i in range(iterations):

            # split data into training and cross validation groups
            trn_data, cv_data = split_trn_data(data)
            if verbose:
                print '\nSize of training data:', np.shape(trn_data)
                print 'Size of cross val data:', np.shape(cv_data)

            # prep training data and build classifier
            X_trn, y_trn, mu, sd, ids = feature_prep(trn_data, heads,
                                                     'training',
                                                     [], [], verbose, i)
            clf = build_classifier(X_trn, y_trn, reg)

            # training accuracy (for 0/1 labels, the mean absolute error is
            # the fraction misclassified, so this is the fraction correct)
            trn_pred = clf.predict(X_trn)
            trn_accuracy = 1 - (sum(abs(y_trn - trn_pred)) / len(X_trn))
            trn.append(trn_accuracy)

            # load and prepare the cv set
            if trn_perc < 100:
                X_cv, y_cv, mu, sd, ids = feature_prep(cv_data, heads, 'cv',
                                                       mu, sd, verbose, i)

                # cv accuracy
                cv_pred = clf.predict(X_cv)
                cv_accuracy = 1 - (sum(abs(y_cv - cv_pred)) / len(X_cv))
                cv.append(cv_accuracy)

        reg_rec.append(reg)
        trn_rec.append(np.mean(trn))
        if trn_perc < 100:
            cv_rec.append(np.mean(cv))
        else:
            cv_rec.append(0)

        if reg_loop == 0:
            print 'Loop  |  C param  |  Trn accuracy  |  CV accuracy   |  Dir'
            print '-----------------------------------------------------------'

        better = (reg_loop == 0 or cv_rec[reg_loop] > cv_rec[reg_loop-1])

        # switch direction & reduce adj_rate if not getting better
        if not better:
            adj_rate *= 0.95
            reg_dir = 'down' if reg_dir == 'up' else 'up'

        print '{:<6}|  {:<9}|  {:<14}|  {:<14}|  {}'.format(
              reg_loop, round(reg, 3), round(trn_rec[reg_loop], 9),
              round(cv_rec[reg_loop], 9), reg_dir)

        if reg_dir == 'up':
            reg *= adj_rate
        else:
            reg /= adj_rate

        reg_loop += 1

    # load in test data and run through the same prep / normalisation
    t_data, t_heads = import_data('test')
    X, tmp_y, mu, sd, ids = feature_prep(t_data, t_heads, 'test',
                                         mu, sd, verbose, 0)

    # get predictions and make each item an int in a sublist (required format)
    y = clf.predict(X)
    print '\nFound', int(sum(y)), 'positive predictions out of', len(y)
    print '(iterations:', iterations, '| trn_perc:', trn_perc, ')'

    if tst_id_col >= 0:
        predictions = [[int(ids[i]), int(round(y[i], 0))] for i in range(len(y))]
    else:
        predictions = [[int(round(y[i], 0))] for i in range(len(y))]
    # prepend a header row (note: this assumes the test file's id head and the
    # training layout's labels head line up with these indexes)
    if heads_in_tst_file and tst_id_col >= 0:
        predictions.insert(0, [t_heads[tst_id_col], t_heads[labels_col]])

    write_to_file(predictions,
                  os.path.join(default_dir, 'test_predictions.csv'))

if __name__ == '__main__':
    main()