Guest User

Untitled

a guest
Apr 9th, 2017
237
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.92 KB | None | 0 0
  1. #import regex
  2. import re
  3. import csv
  4. import nltk
  5. import svm
  6. from svmutil import *
  7. #start replaceTwoOrMore
  8. def replaceTwoOrMore(s):
  9. #look for 2 or more repetitions of character
  10. pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
  11. return pattern.sub(r"\1\1", s)
  12. #end
  13.  
  14. #start process_tweet
  15. def processTweet(tweet):
  16. # process the tweets
  17.  
  18. #Convert to lower case
  19. tweet = tweet.lower()
  20. #Convert www.* or https?://* to URL
  21. tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
  22. #Convert @username to AT_USER
  23. tweet = re.sub('@[^\s]+','AT_USER',tweet)
  24. #Remove additional white spaces
  25. tweet = re.sub('[\s]+', ' ', tweet)
  26. #Replace #word with word
  27. tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
  28. #trim
  29. tweet = tweet.strip('\'"')
  30. return tweet
  31. #end
  32. def getFeatureVector(tweet, stopWords):
  33. featureVector = []
  34. words = tweet.split()
  35. for w in words:
  36. #replace two or more with two occurrences
  37. w = replaceTwoOrMore(w)
  38. #strip punctuation
  39. w = w.strip('\'"?,.')
  40. #check if it consists of only words
  41. val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
  42. #ignore if it is a stopWord
  43. if(w in stopWords or val is None):
  44. continue
  45. else:
  46. featureVector.append(w.lower())
  47. return featureVector
  48. #end
  49.  
  50. #start getStopWordList
  51. def getStopWordList(stopWordListFileName):
  52. #read the stopwords file and build a list
  53. stopWords = []
  54. stopWords.append('AT_USER')
  55. stopWords.append('URL')
  56.  
  57. fp = open(stopWordListFileName, 'r')
  58. line = fp.readline()
  59. while line:
  60. word = line.strip()
  61. stopWords.append(word)
  62. line = fp.readline()
  63. fp.close()
  64. return stopWords
  65. #end
  66.  
  67. #start extract_features
  68. def extract_features(tweet):
  69. tweet_words = set(tweet)
  70. features = {}
  71. for word in featureList:
  72. features['contains(%s)' % word] = (word in tweet_words)
  73. return features
  74. #end
  75.  
  76. st = open('stopwords.txt', 'r')
  77. stopWords = getStopWordList(stopwords.txt')
  78. #Read the tweets one by one and process it
  79. inpTweets = csv.reader(open('result.csv', 'rb'), delimiter=',', quotechar='|')
  80. stopWords = getStopWordList('stopwords.txt')
  81. featureList = []
  82.  
  83. # Get tweet words
  84. tweets = []
  85. for row in inpTweets:
  86. sentiment = row[0]
  87. tweet = row[1]
  88. processedTweet = processTweet(tweet)
  89. featureVector = getFeatureVector(processedTweet, stopWords)
  90. featureList.extend(featureVector)
  91. tweets.append((featureVector, sentiment));
  92. #end loop
  93.  
  94. # Remove featureList duplicates
  95. featureList = list(set(featureList))
  96.  
  97. #print tweets
  98. #print featureList
  99. #start extract_features
  100. def extract_features(tweet):
  101. tweet_words = set(tweet)
  102. features = {}
  103. for word in featureList:
  104. features['contains(%s)' % word] = (word in tweet_words)
  105. return features
  106. #end
  107. training_set = nltk.classify.util.apply_features(extract_features, tweets)
  108. #print training_set
  109.  
  110. def getSVMFeatureVectorAndLabels(tweets, featureList):
  111. sortedFeatures = sorted(featureList)
  112. map = {}
  113. feature_vector = []
  114. labels = []
  115. for t in tweets:
  116. label = 0
  117. map = {}
  118. #Initialize empty map
  119. for w in sortedFeatures:
  120. map[w] = 0
  121.  
  122. tweet_words = t[0]
  123. tweet_opinion = t[1]
  124. #Fill the map
  125. for word in tweet_words:
  126. #process the word (remove repetitions and punctuations)
  127. word = replaceTwoOrMore(word)
  128. word = word.strip('\'"?,.')
  129. #set map[word] to 1 if word exists
  130. if word in map:
  131. map[word] = 1
  132. #end for loop
  133. values = map.values()
  134. feature_vector.append(values)
  135. if(tweet_opinion == 'positive'):
  136. label = 0
  137. elif(tweet_opinion == 'negative'):
  138. label = 1
  139. elif(tweet_opinion == 'neutral'):
  140. label = 2
  141. labels.append(label)
  142. #return the list of feature_vector and labels
  143. return {'feature_vector' : feature_vector, 'labels': labels}
  144. #end
  145.  
  146. #Train the classifier
  147. result = getSVMFeatureVectorAndLabels(tweets, featureList)
  148. problem = svm_problem(result['labels'], result['feature_vector'])
  149. #'-q' option suppress console output
  150. param = svm_parameter('-q')
  151. param.kernel_type = LINEAR
  152. classifier = svm_train(problem, param)
  153. svm_save_model('classifierDumpFile', classifier)
  154. testing = 'This is a test tweet'
  155. #Test the classifier
  156. test_feature_vector = getSVMFeatureVectorAndLabels(testing, featureList)
  157. #print getSVMFeatureVectorAndLabels(test_tweets, featureList1)
  158.  
  159.  
  160.  
  161. #p_labels contains the final labeling result
  162.  
  163. passing_param = []
  164. print("Test Feature Vector: ")
  165. for xi in test_feature_vector['feature_vector']:
  166. #print(xi)
  167. passing_param.append(xi)
  168.  
  169.  
  170. p_labels, p_accs, p_vals = svm_predict([0] * len(passing_param), passing_param, classifier)
  171.  
  172.  
  173.  
  174. print p_labels
Add Comment
Please, Sign In to add comment