Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# A Python implementation of a solution to the Quora Classifier
# problem for Codesprint 2. Uses a form of logistic regression.
# Author: Kevin Tham
import random
import sys
from math import exp, pow, sqrt
class QuoraClassifier(object):
    """Binary classifier: logistic regression trained with stochastic
    gradient descent (SGD).

    Training examples and queries carry their feature vectors as lists of
    "index:value" strings; only the value part is used, and features are
    assumed to appear in index order.
    """

    def __init__(self, numFeatures):
        """Create a classifier for feature vectors of length numFeatures."""
        self.W = [0.0] * numFeatures              # per-feature weights
        self.means = [0.0] * numFeatures          # per-feature means (set by train)
        self.deviations = [0.0] * numFeatures     # per-feature std devs (set by train)
        self.featureTotals = [0.0] * numFeatures  # running sums of raw feature values
        self.features = range(numFeatures)        # feature indices
        self.numExamples = 0.0                    # number of examples seen so far
        self.L = 0.5                              # learning rate
        self.E = 10                               # training epochs

    def _parse(self, features):
        """Extract the float values from a list of "index:value" strings."""
        return [float(f.split(':')[1]) for f in features]

    def findMeanAndVariance(self, examples):
        """Compute per-feature mean and (population) standard deviation.

        examples: sequence of (label, features) pairs, where features is a
        list of "index:value" strings.  Fills self.means and self.deviations
        and sets self.numExamples to len(examples).
        """
        numFeatures = len(self.features)
        for _, features in examples:
            values = self._parse(features)
            self.numExamples += 1
            for n in range(numFeatures):
                self.means[n] += values[n]
        self.means = [total / self.numExamples for total in self.means]
        # BUG FIX: the original second pass reused the feature list left over
        # from the last iteration of the first pass, so every squared
        # deviation was computed from a single example.  Re-parse each
        # example's own features here.
        for _, features in examples:
            values = self._parse(features)
            for n in range(numFeatures):
                self.deviations[n] += (values[n] - self.means[n]) ** 2
        self.deviations = [sqrt(s / self.numExamples)
                           for s in self.deviations]

    def updateSGD(self, actual, featureVector):
        """Apply one SGD weight update for a single training example.

        actual: the example's label as an int (+1 or -1).
        featureVector: raw feature values; normalized IN PLACE by the running
        per-feature totals to keep the dot product small enough for exp().
        """
        # NOTE(review): this recounts examples already counted by
        # findMeanAndVariance; kept because featureTotals-based code may rely
        # on the running count.
        self.numExamples += 1
        numFeatures = len(self.features)
        for n in range(numFeatures):
            self.featureTotals[n] += featureVector[n]
        for n in range(numFeatures):
            if self.featureTotals[n] == 0:
                continue
            featureVector[n] = featureVector[n] / self.featureTotals[n]
        dotproduct = sum(self.W[n] * featureVector[n]
                         for n in range(numFeatures))
        prediction = 1.0 / (1.0 + exp(-dotproduct))  # sigmoid
        error = actual - prediction
        for n in range(numFeatures):
            self.W[n] += self.L * error * featureVector[n]

    def train(self, examples):
        """Train for self.E epochs over examples, reshuffling each epoch.

        examples: list of (label, features) pairs; shuffled in place.
        """
        self.findMeanAndVariance(examples)
        for _ in range(self.E):
            random.shuffle(examples)
            for label, features in examples:
                self.updateSGD(int(label), self._parse(features))

    def predict(self, features):
        """Classify one query feature list; returns "+1" or "-1".

        NOTE(review): prediction z-score-normalizes by the training
        mean/deviation, while training normalized by running feature totals
        instead -- the mismatch looks unintended; confirm which scheme is
        meant and use it on both sides.
        """
        featureVector = self._parse(features)
        numFeatures = len(self.features)
        for n in range(numFeatures):
            if self.deviations[n] > 0.0:
                featureVector[n] = ((featureVector[n] - self.means[n])
                                    / self.deviations[n])
        dotproduct = sum(self.W[n] * featureVector[n]
                         for n in range(numFeatures))
        probability = 1.0 / (1.0 + exp(-dotproduct))
        # BUG FIX: a sigmoid above 0.5 indicates the positive class (the SGD
        # update pushes it toward 1 for actual=+1); the original returned the
        # labels inverted.
        return "+1" if probability > 0.5 else "-1"
def main():
    """Read the Codesprint input from stdin, train a classifier on the
    labelled rows, and print one "<id> <prediction>" line per query.

    Input format:
        N M                      (training rows, feature count)
        N lines: <id> <label> <idx:val> x M
        Q                        (query count)
        Q lines: <id> <idx:val> x M
    """
    read_line = sys.stdin.readline
    # Gather the training examples as (label, features) pairs.
    # (Renamed locals: the original shadowed the builtins `type` and `id`.)
    N, M = map(int, read_line().split())  # row count, feature count
    examples = []
    for _ in range(N):
        columns = read_line().split()
        label = columns[1]  # column 0 is the example id (unused)
        examples.append((label, columns[2:]))
    # Gather the queries as (id, features) pairs.
    Q = int(read_line())
    queries = []
    for _ in range(Q):
        columns = read_line().split()
        queries.append((columns[0], columns[1:]))
    # Train a classifier and predict each query.
    classifier = QuoraClassifier(M)
    classifier.train(examples)
    for queryId, queryFeatures in queries:
        print(queryId + ' ' + classifier.predict(queryFeatures))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement