Advertisement
kevinsf90

Quora Classifier - CodeSprint 2

Jan 8th, 2012
164
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.69 KB | None | 0 0
  1. # A python implementation of a solution to the Quora Classifier
  2. # problem for Codesprint 2. Uses a form of logistic regression.
  3. # Author: Kevin Tham
  4.  
  5. from math import exp, pow, sqrt
  6. import random
  7.  
  8. class QuoraClassifier(object):
  9.     def __init__(self,numFeatures):
  10.         self.W = []
  11.         self.means = []
  12.         self.deviations = []
  13.         self.featureTotals = []
  14.         self.features = range(numFeatures) # List of feature indices
  15.         for n in self.features:
  16.             self.means.append(0.0)
  17.             self.deviations.append(0.0)
  18.             self.W.append(0.0)
  19.             self.featureTotals.append(0.0)
  20.         self.numExamples = 0.0 # Number of examples trained on  
  21.         self.L = 0.5 # Learning Rate
  22.         self.E = 10 # Epochs
  23.        
  24.     def findMeanAndVariance(self, examples):
  25.         for exs in examples:
  26.             features = exs[1]
  27.             # Get the values
  28.             featureVector = []
  29.             for feature in features:
  30.                 featureVector.append(float(feature.split(':')[1]))
  31.             self.numExamples += 1
  32.             # Iterate through each feature and sum the values
  33.             for n in xrange(len(self.features)):
  34.                 self.means[n] += featureVector[n]
  35.         self.means = map(lambda x: x / self.numExamples, self.means)
  36.         for exs in examples:
  37.             # Get the values
  38.             featureVector = []
  39.             for feature in features:
  40.                 featureVector.append(float(feature.split(':')[1]))
  41.             for n in xrange(len(self.features)):
  42.                 self.deviations[n] += pow(featureVector[n] - self.means[n], 2)
  43.         self.deviations = map(lambda x: sqrt(x / self.numExamples), self.deviations)
  44.  
  45.     def updateSGD(self, actual, featureVector):
  46.         # Calculate Mean
  47.         self.numExamples += 1
  48.         for n in xrange(len(self.features)):
  49.             self.featureTotals[n] += featureVector[n]
  50.         # Normalize with Mean to prevent overflow problem
  51.         for n in xrange(len(self.features)):
  52.             if (self.featureTotals[n] == 0):
  53.                 continue
  54.             featureVector[n] = featureVector[n]/self.featureTotals[n]
  55.         # Use Stochastic Gradient Descent for weight updating
  56.         dotproduct = sum([self.W[n]*featureVector[n] for n in xrange(len(self.features))])
  57.         gradient = 1.0 / (1.0 + exp(-dotproduct))
  58.         error = actual - gradient
  59.         for n in xrange(len(self.features)):
  60.             self.W[n] += self.L * error * featureVector[n]
  61.            
  62.     def train(self, examples):
  63.         self.findMeanAndVariance(examples)
  64.         for e in xrange(self.E):
  65.             random.shuffle(examples)
  66.             for exs in examples:
  67.                 classification = int(exs[0])
  68.                 featureVector = []
  69.                 for feature in exs[1]:
  70.                     featureVector.append(float(feature.split(':')[1]))
  71.                 self.updateSGD(classification, featureVector)
  72.            
  73.     def predict(self, features):
  74.         # Get the values
  75.         featureVector = []
  76.         for feature in features:
  77.             featureVector.append(float(feature.split(':')[1]))
  78.         # Normalize some entries to prevent overflow
  79.         for n in xrange(len(self.features)):
  80.             if self.deviations[n] > 0.0:
  81.                 featureVector[n] = (featureVector[n] - self.means[n]) / self.deviations[n]
  82.             #if self.featureTotals[n] > 0:
  83.                 #featureVector[n] = featureVector[n]/(self.featureTotals[n]/self.numExamples)
  84.         dotproduct = sum([self.W[n]*featureVector[n] for n in xrange(len(self.features))])
  85.         gradient = 1.0 / (1.0 + exp(-dotproduct))
  86.         return "-1" if gradient > 0.5 else "+1"
  87.                  
  88. def main():
  89.     # Get the training examples
  90.     N, M = map(int,raw_input().split()) # number of inputs, number of paramaters
  91.     examples = [] # Stores (id, class, features) tuples
  92.     for n in xrange(N):
  93.         columns = raw_input().split()
  94.         #id = columns[0]
  95.         type = columns[1]
  96.         features = columns[2:]
  97.         examples.append((type,features))
  98.    
  99.     # Get the queries
  100.     Q = int(raw_input()) # number of queries to predict
  101.     queries = [] # Stores (id, class, features) tuples
  102.     for q in xrange(Q):
  103.         columns = raw_input().split()
  104.         id = columns[0]
  105.         features = columns[1:]
  106.         queries.append((id,features))
  107.        
  108.     # Create and train a classifier
  109.     classifier = QuoraClassifier(M)
  110.     classifier.train(examples)
  111.  
  112.     # Predict each query
  113.     for query in queries:
  114.         queryId = query[0]
  115.         queryFeatures = query[1]
  116.         print str(queryId) + ' ' + classifier.predict(queryFeatures)
  117.  
  118. if __name__ == '__main__':
  119.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement