twitterAnalysis.py
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 08:45:50 2016
@author: Michael

For starters, we will just get sentiment from TextBlob for tweets containing
keywords like "Trump", "Carson", "Cruz", "Bern", "Bernie", "guns",
"immigration", "immigrants", etc., then plot the results with matplotlib.

Stuff to do:
    Get user ids.
    Retrieve all of each user's recent tweets and favorites.
    Separate tweets into groups containing each keyword.
    Get a sentiment graph of each group with TextBlob and matplotlib.
"""


# Import the necessary libraries (tweepy for the Twitter API, textblob for
# sentiment, numpy/pandas for aggregation).
import sys
import tweepy
from tweepy import OAuthHandler
import textblob
import numpy as np
import time
import pandas
from tweepy.streaming import StreamListener
from tweepy import Stream

def readFromFileA(filename, splitter=',', lineStart=0, lineEnd=1000):
    """Read lines [lineStart:lineEnd] of a delimited text file into a list of
    row lists, dropping blank lines."""
    with open(filename, 'r') as f:
        lines_list = f.readlines()
    my_data = [[str(val) for val in line.split(splitter)] for line in lines_list[lineStart:lineEnd]]
    # blank lines split to ['\n']; drop them
    my_data = [row for row in my_data if row != ['\n']]
    return my_data
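
# Usage sketch (mirrors the call in main below; the csv's first line is
# assumed to be a header row, hence lineStart=1):
#   ids = readFromFileA('./data/twitter_ids.csv', lineStart=1, lineEnd=3000)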

class MyStreamListener(tweepy.StreamListener):
    """Stream listener that records the user id of each incoming tweet.
    Note: a module-level list named randIds must exist before the stream is
    started (see the commented-out control-group block in main)."""

    def on_status(self, status):
        randIds.append(status.user.id)


# Input is a tweet as a single-line str.  This function will convert it all to
# lower case, remove useless words, and put it in the format for the neural
# network (see the sketch below).
#def parseTweet(tweet):    # will be written for the nn part of the project;
#                          # not needed when using the textblob tool
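
# A minimal sketch of what parseTweet might look like, kept commented out like
# the stub above.  The stopword list and the token-list output format are
# illustrative assumptions, not the author's design.
#def parseTweet(tweet):
#    stopwords = set(['the', 'a', 'an', 'and', 'or', 'to', 'of', 'in', 'is'])  # hypothetical stopword list
#    tokens = tweet.lower().split()                           # lower-case and tokenize
#    return [tok for tok in tokens if tok not in stopwords]   # drop useless words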


if __name__ == '__main__':
    # Credentials used to access the Twitter API (these are not real).
    consumerKey = 'BSfiAQWf44tc7'
    consumerSecret = 'Nk0x66OaUrHNn4WjC'
    accessToken = '324214621-bkOFZdKv1X9Rd9pTI6TC'
    accessTokenSecret = 'grLo38PqPDQy'

    idsFileName = "./data/twitter_ids.csv"
    dataFileName2 = "./data/Video Transcript.txt"
    randIdsFileName = "./data/randomIds.txt"

    auth = OAuthHandler(consumerKey, consumerSecret)
    auth.set_access_token(accessToken, accessTokenSecret)
    api = tweepy.API(auth)
    ids = readFromFileA(idsFileName, splitter=',', lineStart=1, lineEnd=3000)

    """If we are using a randomized control group, this code creates the ids
    list for them."""
    # if getting randoms:
#    ids = []
#    with open(randIdsFileName, 'r') as f:
#        for line in f:
#            ids.append(int(line))
#
#    ids = ids[:720]

    """This is only for non-tweet files, such as focus group transcripts."""
#    lines = []
#
#    with open(dataFileName2, 'r') as f:
#        for line in f:
#            lines.append(line.lower())

    """This is the tweet id from roughly the time the news broke that Carson
    dropped out.  Only tweets since that time are pulled."""
    carsonDropsOutTweetId = 705885709861715968

    """Hard-coded lists of keywords to search for.  These are used to compile
    sentiment data for tweets containing each keyword."""

    # Each sentiments list will hold tuples: (sentiment, tweetID).
    # Note: could include many more keywords ("feelthebern", for example), but
    # neutral keywords are needed to get true sentiments; feelthebern would be
    # a biased term.
    hillarySentiments = []
    hillaryKeywords = ['hillary', 'clinton', 'hillaryclinton']
    trumpSentiments = []
    trumpKeywords = ['trump', 'realdonaldtrump']
    cruzSentiments = []
    cruzKeywords = ['cruz', 'tedcruz']
    bernieSentiments = []
    bernieKeywords = ['bern', 'bernie', 'sanders', 'sensanders']
    obamaSentiments = []
    obamaKeywords = ['obama', 'barack', 'barackobama']
    republicanSentiments = []
    republicanKeywords = ['republican', 'conservative']
    democratSentiments = []
    democratKeywords = ['democrat', 'dems', 'liberal']
    gunsSentiments = []
    gunsKeywords = ['guns', 'gun', 'nra', 'pistol', 'firearm', 'shooting']
    immigrationSentiments = []
    immigrationKeywords = ['immigration', 'immigrants', 'citizenship', 'naturalization', 'visas']
    employmentSentiments = []
    employmentKeywords = ['jobs', 'employment', 'unemployment', 'job']
    inflationSentiments = []
    inflationKeywords = ['inflate', 'inflation', 'price hike', 'price increase', 'prices rais']
    minimumwageupSentiments = []
    minimumwageupKeywords = ['raise minimum wage', 'wage increase', 'raise wage', 'wage hike']
    abortionSentiments = []
    abortionKeywords = ['abortion', 'pro-choice', 'planned parenthood']
    governmentspendingSentiments = []
    governmentspendingKeywords = ['gov spending', 'government spending', 'gov. spending', 'expenditure']
    taxesupSentiments = []
    taxesupKeywords = ['raise tax', 'tax hike', 'taxes up', 'tax up', 'increase taxes', 'taxes increase', 'tax increase']
    taxesdownSentiments = []
    taxesdownKeywords = ['lower tax', 'tax cut', 'tax slash', 'taxes down', 'tax down', 'decrease taxes', 'taxes decrease', 'tax decrease']

    # Each entry is a tuple: (name, sentimentList, keywordList).
    personSentimentList = [('hillary', hillarySentiments, hillaryKeywords), ('trump', trumpSentiments, trumpKeywords), ('cruz', cruzSentiments, cruzKeywords),
                           ('bernie', bernieSentiments, bernieKeywords), ('obama', obamaSentiments, obamaKeywords)]
    issueSentimentList = [('guns', gunsSentiments, gunsKeywords), ('immigration', immigrationSentiments, immigrationKeywords),
                          ('employment', employmentSentiments, employmentKeywords), ('inflation', inflationSentiments, inflationKeywords),
                          ('minimum wage up', minimumwageupSentiments, minimumwageupKeywords), ('abortion', abortionSentiments, abortionKeywords),
                          ('government spending', governmentspendingSentiments, governmentspendingKeywords), ('taxes up', taxesupSentiments, taxesupKeywords),
                          ('taxes down', taxesdownSentiments, taxesdownKeywords)]
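
    # For reference, entries get added to these lists in the main loop below,
    # roughly like this (TextBlob polarity is a float in [-1.0, 1.0]):
    #   tb = textblob.TextBlob(statusText)
    #   bernieSentiments.append((tb.sentiment.polarity, status.id))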


    """This bit is for collecting random twitter ids for the control group.  It
    simply skims the most recent tweets that mention one of our keywords; it
    turned out that skimming *all* tweets found very few occurrences of the
    keywords, since twitter is such a global/multilingual platform."""

    # randIds = []

#    allKeys = []
#    for person in personSentimentList:
#        for keyWord in person[2]:
#            allKeys.append(keyWord)
#    for issue in issueSentimentList:
#        for keyWord in issue[2]:
#            allKeys.append(keyWord)
#
#    myStreamListener = MyStreamListener()
#    myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)
#    sys.exit()   # note: this exits before myStream.filter ever runs; remove
#                 # it to actually collect ids
#    myStream.filter(track = allKeys)


    """Here is the format of the basic loop for finding text that contains the
    keywords we're searching for.  It finds the sentiment of each match and
    adds it to the respective keyword's data list.  This particular segment
    uses 'lines', i.e. it is for looking through non-tweets (transcripts of
    some sort)."""

#    for line in lines:
#        for person in personSentimentList:
#            for keyword in person[2]:
#                if keyword in line:
#                    try:
#                        tb = textblob.TextBlob(line)
#                        person[1].append((tb.sentiment.polarity, 5))
#                        break
#                    except:
#                        continue
#
#    for line in lines:
#        for issue in issueSentimentList:
#            for keyword in issue[2]:
#                if keyword in line:
#                    try:
#                        tb = textblob.TextBlob(line)
#                        issue[1].append((tb.sentiment.polarity, 5))
#                        break
#                    except:
#                        continue



    """This big block goes through each user's tweets, looks for keywords, and
    if a keyword is there, finds the sentiment for that tweet and adds it to
    the matching sentiment data list."""
    start = time.time()
    # ids may be a 2-d array (id in the second column) or already a flat list
    try:
        ids = np.asarray(ids)[:, 1]
    except IndexError:
        ids = np.asarray(ids)
    try:
        ids = ids.astype(np.int64)
    except ValueError:
        print "whoops, couldn't convert ids to int"
    i = 0
    counter = 0
    totalIdsWithMentions = 0
    mentionFlag = False
    for idno in ids:
        try:
            idno = int(idno)
        except ValueError:
            print 'idno too long to convert to int'
        if mentionFlag:
            totalIdsWithMentions = totalIdsWithMentions + 1
        mentionFlag = False

        """The rate limit is handled here.  Also, if for some reason we can't
        access the tweets (like internet failure) we don't want to crash, so
        we wait 30 seconds and try again."""
        if i % 2 == 0:
            apiInfo = None
            while apiInfo is None:   # retry until the status call succeeds
                try:
                    apiInfo = api.rate_limit_status()['resources']['statuses']['/statuses/user_timeline']
                except Exception:
                    print 'no internet, sleeping for 30 seconds'
                    time.sleep(30)
            if apiInfo['remaining'] < 2:
                timeToSleep = apiInfo['reset'] - time.time()
                if timeToSleep > 0:
                    print 'sleeping for: ', timeToSleep, ' seconds'
                    sys.stdout.flush()
                    time.sleep(timeToSleep + 1)
                else:
                    time.sleep(1)

        if i % 100 == 0:
            print "on id number: ", i
            sys.stdout.flush()
        i = i + 1
        counter = counter + 1
        try:
            for status in tweepy.Cursor(api.user_timeline, user_id=idno, since_id=carsonDropsOutTweetId).items(20):
                statusText = status.text.lower()
                for person in personSentimentList:
                    for keyword in person[2]:
                        if keyword in statusText:
                            tb = textblob.TextBlob(statusText)
                            person[1].append((tb.sentiment.polarity, status.id))
                            mentionFlag = True
                            break
                for issue in issueSentimentList:
                    for keyword in issue[2]:
                        if keyword in statusText:
                            tb = textblob.TextBlob(statusText)
                            issue[1].append((tb.sentiment.polarity, status.id))
                            mentionFlag = True
                            break
        except KeyboardInterrupt:
            raise
        except:
            print sys.exc_info()[0]
            sys.stdout.flush()
            counter = counter - 1
            continue

    # count the final user if their tweets mentioned a keyword (the flag is
    # otherwise only checked at the top of the loop)
    if mentionFlag:
        totalIdsWithMentions = totalIdsWithMentions + 1

    arrayList = []

    """Here we compile the sentiment data for each keyword group into an
    easier-to-work-with format (a dataframe).  df will contain the mean,
    median, and mention-count data.  Note that it is only meaningful when
    compared with a control group, since keyword selection is impossible to
    do neutrally."""

    for person in personSentimentList:
        sentimentData = np.asarray(person[1])
        if len(sentimentData) > 0:
            arrayList.append([person[0], np.mean(sentimentData[:, 0]), np.percentile(sentimentData[:, 0], 50), len(sentimentData)])

    for issue in issueSentimentList:
        sentimentData = np.asarray(issue[1])
        if len(sentimentData) > 0:
            arrayList.append([issue[0], np.mean(sentimentData[:, 0]), np.percentile(sentimentData[:, 0], 50), len(sentimentData)])

    meanMedianCountData = np.asarray(arrayList)
    df = pandas.DataFrame(meanMedianCountData, columns=['name', 'mean', 'median', 'count'])
    df[['name']] = df[['name']].astype(str)
    df[['mean', 'median']] = df[['mean', 'median']].astype(float)
    df[['count']] = df[['count']].astype(int)

    df.sort_values(['count'], ascending=False, inplace=True)
    print df

    print 'time taken: ', time.time() - start
    print 'number of ids read: ', counter
    print 'Number of ids with keyword mentions: ', totalIdsWithMentions

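    # The header docstring says to plot the results with matplotlib, but the
    # script never gets that far.  A minimal sketch, assuming the df built
    # above: a bar chart of mean sentiment per keyword group.  (Illustrative
    # addition, not part of the original analysis.)
    import matplotlib.pyplot as plt
    df.plot(kind='bar', x='name', y='mean', legend=False)
    plt.ylabel('mean sentiment polarity')
    plt.title('Mean TextBlob sentiment by keyword group')
    plt.tight_layout()
    plt.show()
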
    # http://archive.is/f9HLC#selection-543.0-543.18
    # MichaelPhillipsData/GitSampleCode