Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #import regex
- import re
- import csv
- import nltk
- import svm
- from svmutil import *
#start replaceTwoOrMore
def replaceTwoOrMore(s):
    """Collapse every run of three or more identical characters in *s* to two.

    e.g. 'huuuungry' -> 'huungry'.  Used to normalise elongated tweet words.
    """
    # re.DOTALL lets the captured character be a newline as well.
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
#end
#start process_tweet
def processTweet(tweet):
    """Normalise a raw tweet for feature extraction.

    Lowercases the text, masks links as 'URL' and mentions as 'AT_USER',
    collapses whitespace, strips the '#' off hashtags, and trims
    surrounding quote characters.
    """
    tweet = tweet.lower()
    # Ordered (pattern, replacement) passes; order matters because the
    # whitespace collapse must run after the token masking.
    substitutions = (
        ('((www\.[^\s]+)|(https?://[^\s]+))', 'URL'),  # mask links
        ('@[^\s]+', 'AT_USER'),                        # mask @mentions
        ('[\s]+', ' '),                                # collapse whitespace
        (r'#([^\s]+)', r'\1'),                         # '#word' -> 'word'
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet.strip('\'"')
#end
def getFeatureVector(tweet, stopWords):
    """Split a processed tweet into lowercase word features.

    Each whitespace token is de-elongated and stripped of surrounding
    punctuation; tokens that are stop words or that do not look like
    words (must contain at least two letters, letters/digits only,
    starting with a letter) are discarded.
    """
    featureVector = []
    for token in tweet.split():
        # Squeeze repeated letters, then trim quote/punctuation edges.
        token = replaceTwoOrMore(token).strip('\'"?,.')
        # Word-shaped check: leading letter, >=2 letters overall, only
        # letters and digits.
        looks_like_word = re.search(
            r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", token)
        if looks_like_word is not None and token not in stopWords:
            featureVector.append(token.lower())
    return featureVector
#end
#start getStopWordList
def getStopWordList(stopWordListFileName):
    """Read the stop-words file (one word per line) into a list.

    The placeholder tokens 'AT_USER' and 'URL' produced by processTweet
    are always included so they never become features.
    """
    stopWords = ['AT_USER', 'URL']
    # 'with' guarantees the handle is closed even if reading raises;
    # the original readline loop leaked the file on error.
    with open(stopWordListFileName, 'r') as fp:
        for line in fp:
            stopWords.append(line.strip())
    return stopWords
#end
#start extract_features
def extract_features(tweet, feature_list=None):
    """Build an NLTK-style boolean feature dict for one tweet.

    tweet: iterable of word tokens (a feature vector from getFeatureVector).
    feature_list: vocabulary to test membership against.  Defaults to the
        module-level featureList so existing one-argument callers are
        unaffected, but passing it explicitly removes the hidden global
        dependency and makes the function testable in isolation.
    Returns {'contains(word)': bool} for every word in the vocabulary.
    """
    if feature_list is None:
        feature_list = featureList  # module-level vocabulary (legacy path)
    tweet_words = set(tweet)
    features = {}
    for word in feature_list:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#end
# Load the stop words once.  (The original opened 'stopwords.txt' into an
# unused, never-closed handle, contained a broken duplicate call with a
# missing quote, and then loaded the list a second time.)
stopWords = getStopWordList('stopwords.txt')

#Read the tweets one by one and process it
featureList = []
tweets = []
# Text mode 'r': the old 'rb' was a Python-2 idiom that breaks the csv
# module under Python 3.  'with' closes the file when the loop is done.
with open('result.csv', 'r') as csvFile:
    inpTweets = csv.reader(csvFile, delimiter=',', quotechar='|')
    for row in inpTweets:
        sentiment = row[0]   # column 0: sentiment label
        tweet = row[1]       # column 1: raw tweet text
        processedTweet = processTweet(tweet)
        featureVector = getFeatureVector(processedTweet, stopWords)
        featureList.extend(featureVector)
        tweets.append((featureVector, sentiment))
#end loop
# Remove featureList duplicates
featureList = list(set(featureList))
#print tweets
#print featureList
#start extract_features
# NOTE(review): this re-definition shadows the identical extract_features
# defined earlier in the file; one of the two should eventually be removed.
def extract_features(tweet, feature_list=None):
    """Build an NLTK-style boolean feature dict for one tweet.

    tweet: iterable of word tokens (a feature vector from getFeatureVector).
    feature_list: vocabulary to test membership against.  Defaults to the
        module-level featureList so existing one-argument callers are
        unaffected.
    Returns {'contains(word)': bool} for every word in the vocabulary.
    """
    if feature_list is None:
        feature_list = featureList  # module-level vocabulary (legacy path)
    tweet_words = set(tweet)
    features = {}
    for word in feature_list:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#end
# Pair every (word-list, sentiment) training tuple with its extracted
# feature dict.  NOTE(review): apply_features presumably wraps this
# lazily rather than materialising every dict up front — confirm against
# the nltk.classify.util documentation.
training_set = nltk.classify.util.apply_features(extract_features, tweets)
#print training_set
def getSVMFeatureVectorAndLabels(tweets, featureList):
    """Convert (word-list, sentiment) pairs into numeric libsvm inputs.

    tweets: iterable of (featureVector, sentiment) tuples.
    featureList: the training vocabulary; the output vector has one 0/1
        slot per feature, in sorted(featureList) order so the column
        order is explicit and reproducible (the original relied on dict
        insertion/iteration order).
    Returns {'feature_vector': list of 0/1 lists, 'labels': list of ints}
    with positive -> 0, negative -> 1, neutral -> 2; any other opinion
    silently maps to 0, matching the original behaviour.
    """
    sortedFeatures = sorted(featureList)
    # Renamed from 'map', which shadowed the builtin; a plain membership
    # set is all that is needed per tweet.
    label_by_opinion = {'positive': 0, 'negative': 1, 'neutral': 2}
    feature_vector = []
    labels = []
    for tweet_words, tweet_opinion in tweets:
        # Normalise each token the same way the features were built
        # (remove repetitions and surrounding punctuation).
        present = set()
        for word in tweet_words:
            present.add(replaceTwoOrMore(word).strip('\'"?,.'))
        # Concrete list of ints (the original appended dict.values(),
        # which is a view, not a list, under Python 3).
        feature_vector.append(
            [1 if w in present else 0 for w in sortedFeatures])
        labels.append(label_by_opinion.get(tweet_opinion, 0))
    #return the list of feature_vector and labels
    return {'feature_vector': feature_vector, 'labels': labels}
#end
#Train the classifier
result = getSVMFeatureVectorAndLabels(tweets, featureList)
problem = svm_problem(result['labels'], result['feature_vector'])
#'-q' option suppress console output
param = svm_parameter('-q')
param.kernel_type = LINEAR
classifier = svm_train(problem, param)
svm_save_model('classifierDumpFile', classifier)

#Test the classifier
testing = 'This is a test tweet'
# BUG FIX: the raw string used to be passed straight to
# getSVMFeatureVectorAndLabels, which expects a list of
# (word-list, sentiment) tuples — iterating the bare string crashed.
# Run the tweet through the same pipeline as the training data; the
# dummy 'neutral' label is ignored at prediction time.
testWords = getFeatureVector(processTweet(testing), stopWords)
test_feature_vector = getSVMFeatureVectorAndLabels(
    [(testWords, 'neutral')], featureList)
#p_labels contains the final labeling result
passing_param = list(test_feature_vector['feature_vector'])
print("Test Feature Vector: ")
p_labels, p_accs, p_vals = svm_predict(
    [0] * len(passing_param), passing_param, classifier)
# print() call form, consistent with the print("...") above (the
# original used a Python-2-only print statement here).
print(p_labels)
Add Comment
Please, Sign In to add comment