Advertisement
Guest User

Untitled

a guest
May 21st, 2019
190
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.19 KB | None | 0 0
# --- Imports and NLTK resources ---------------------------------------------
import tweepy
import codecs  # NOTE(review): imported but never used in the visible code
from tweepy import Stream  # NOTE(review): unused here
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener  # NOTE(review): unused here
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer
import json
import string
import re
import botometer
from nltk.corpus import stopwords

# English stopword list consumed by clean_tweets() further down.
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer

# NOTE(review): stemmer is created but never applied to any token below.
stemmer = PorterStemmer()

from nltk.tokenize import TweetTokenizer  # duplicate of the import above
from nltk import classify
from nltk import NaiveBayesClassifier
  20. # Happy Emoticons
  21. emoticons_happy = set([
  22. ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
  23. ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
  24. '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
  25. 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
  26. '<3'
  27. ])
  28.  
  29. # Sad Emoticons
  30. emoticons_sad = set([
  31. ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
  32. ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
  33. ':c', ':{', '>:\\', ';('
  34. ])
  35. # all emoticons (happy + sad)
  36. emoticons = emoticons_happy.union(emoticons_sad)
  37.  
# --- Twitter / Botometer credentials ----------------------------------------
# NOTE(review): SECURITY — live-looking API keys and secrets are hard-coded
# below (plus an older, commented-out set). They should be revoked and loaded
# from environment variables or a config file, never committed in source.
#consumer key, consumer secret, access token, access secret.
#ckey="mzhns6Ra7P1e1aO06yMVAmrAA"
#csecret="6wheiTnv9ACqdpEOG3q2YVijuXLlCD1njWZlyQJy3ky4XKrymj"
#atoken="366417856-tttnd9Eng3qyOx4yvzxO4ZuePy2qs1dus5ByYXbo"
#asecret="pjvo9pkLljPmANPat9ALmXfBzu6rmFR3eFcjPDyatshT"
ckey="ZUKS9ElXbYO4UBHnHBkhSLPdW"
csecret="Ww7YKAvf4LMSyffxmYSdkauu3JUhcKOo9XxLpnCPWAPx3ksCvg"
atoken="92337145-mI3PSDTdjAw38ld3kBqPnDavytDiSsFdve0DHd0jA"
asecret="6ATbGCPfsey6LYsfTKNVX8m8nT9QThvhzTQ50WcJX6nZd"
# Authenticated tweepy API handle used by the (currently disabled) search loops.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)

# RapidAPI (Mashape) key plus the same app credentials, for Botometer below.
mashape_key = "ad092870a1msh558166c739f9457p12bddfjsn39f1129751f5"
twitter_app_auth = {
'consumer_key': 'ZUKS9ElXbYO4UBHnHBkhSLPdW',
'consumer_secret': 'Ww7YKAvf4LMSyffxmYSdkauu3JUhcKOo9XxLpnCPWAPx3ksCvg',
'access_token': '92337145-mI3PSDTdjAw38ld3kBqPnDavytDiSsFdve0DHd0jA',
'access_token_secret': '6ATbGCPfsey6LYsfTKNVX8m8nT9QThvhzTQ50WcJX6nZd',
}
  58.  
  59.  
  60.  
  61.  
  62. query="ExpressVPN" #search word OR for a OR b, AND for tweets with a AND b
  63. max_tweets=500 #number of tweets
  64. n=1
  65. #tweet_mode=extended for entire tweets
  66. #tweets = open("tweetset","w", encoding="utf-8")
  67. retweet = " -filter:retweets"
  68. query = f"{query}{retweet}"
  69. #for tweet in tweepy.Cursor(api.search, q=query, tweet_mode='extended', lang='en').items(max_tweets):
  70. # with open('tweetset.json', 'a') as f:
  71. # f.write(json.dumps(tweet._json))
  72. # f.write("\n")
  73. # f.close()
  74. # print (n,tweet.full_text)
  75. # n=n+1
  76.  
  77. #data tweetset
  78. data = []
  79. with open('tweetset.json') as f:
  80. for line in f:
  81. data.append(json.loads(line))
  82.  
  83. posdata = []
  84. with open('onlypostweet.json') as f:
  85. for line in f:
  86. posdata.append(json.loads(line))
  87.  
  88.  
  89. negdata = []
  90. with open('onlynegtweet.json') as f:
  91. for line in f:
  92. negdata.append(json.loads(line))
  93.  
  94. print(negdata)
  95.  
# print out pdata (the positive tweets) with index and screen name
  97. #for pdata in posdata:
  98. #print(str(n), pdata['full_text'])
  99. #n = n+1
  100. #print(pdata['user']['screen_name'])
  101.  
# write the negative tweets to a new JSON file containing only the text
  103. #for ndata in negdata:
  104. # with open('onlynegtweet.json', 'a') as f:
  105. # f.write(json.dumps(ndata['full_text']))
  106. # f.write("\n")
  107. # f.close()
  108.  
  109.  
  110. #for tweet in tweepy.Cursor(api.search, q=query, tweet_mode='extended', lang='en').items(max_tweets):
  111. # with open('tweetset.json', 'a') as f:
  112. # f.write(json.dumps(tweet._json))
  113. # f.write("\n")
  114. # f.close()
  115. # print (n,tweet.full_text)
  116. # n=n+1
  117. #for tweetset in data:
  118. # print(str(n), tweetset['full_text'])
  119. # n = n+1
  120. # print(tweetset['user']['screen_name'])
  121.  
# Botometer client for bot-likelihood scoring of Twitter accounts; reuses the
# app credentials above and sleeps automatically when rate-limited.
bom = botometer.Botometer(wait_on_ratelimit=True,
                          mashape_key=mashape_key,
                          **twitter_app_auth)
  125.  
  126.  
  127. #data = []
  128. #with open('tweetset.json') as f:
  129. # for line in f:
  130. # data.append(json.loads(line))
  131. #check accounts
  132. #accounts = []
  133. #for row in data:
  134. # name=row['user']['screen_name']
  135. # result=bom.check_account(name)
  136. # print(n,result['user']['screen_name'] + " %.2f%%" % (100*result['scores']['universal']))
  137. # n=n+1
  138.  
  139.  
  140. #clean tweets
  141. def clean_tweets(tweet):
  142. # remove stock market tickers like $GE
  143. tweet = re.sub(r'\$\w*', '', tweet)
  144.  
  145. # remove old style retweet text "RT"
  146. tweet = re.sub(r'^RT[\s]+', '', tweet)
  147.  
  148. # remove hyperlinks
  149. tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  150.  
  151. # remove hashtags
  152. # only removing the hash # sign from the word
  153. tweet = re.sub(r'#', '', tweet)
  154. tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  155. tweet_tokens = tokenizer.tokenize(tweet)
  156.  
  157. tweets_clean = []
  158. for word in tweet_tokens:
  159. if (word not in stopwords_english and # remove stopwords
  160. word not in emoticons and # remove emoticons
  161. word not in string.punctuation): # remove punctuation
  162. tweets_clean.append(word)
  163.  
  164. return tweets_clean
  165.  
  166. #print (clean_tweets(custom_tweet))
  167.  
  168. #for row in data:
  169. # name=row['user']['screen_name']
  170. # result=bom.check_account(name)
  171. # print(n,result['user']['screen_name'] + " %.2f%%" % (100*result['scores']['universal']))
  172. # n=n+1
  173. #gettweets
  174. #def get_tweets(tweetset):
  175.  
  176.  
  177.  
  178.  
  179.  
# --- NLTK twitter_samples corpora -------------------------------------------
# Built-in labelled corpora, loaded here as a size sanity check.
pos_tweets = twitter_samples.strings('positive_tweets.json')
print ("onlypostweets = ",len(pos_tweets)) # Output: 5000

neg_tweets = twitter_samples.strings('negative_tweets.json')
print (len(neg_tweets)) # Output: 5000

#all_tweets = twitter_samples.strings('tweetset.json')

#print (clean_tweets(pos_tweets[5]))
#print (pos_tweets[5])
#print (len(all_tweets)) # Output: 20000

# tokenize tweets

#for tweet in pos_tweets[:5]:
# print (tweet_tokenizer.tokenize(tweet))



# next step: write the tweets to a text document
  202. # feature extractor function
  203. def bag_of_words(tweet):
  204. words = clean_tweets(tweet)
  205. words_dictionary = dict([word, True] for word in words)
  206. return words_dictionary
  207. #print (bag_of_words(custom_tweet))
  208.  
  209. # positive tweets feature set
  210. pos_tweets_set = []
  211. for tweet in posdata:
  212. pos_tweets_set.append((bag_of_words(tweet), 'pos'))
  213.  
  214. # negative tweets feature set
  215. neg_tweets_set = []
  216. for tweet in negdata:
  217. neg_tweets_set.append((bag_of_words(tweet), 'neg'))
  218.  
  219. #print (len(pos_tweets_set), len(neg_tweets_set)) # Output: (5000, 5000)
  220.  
  221.  
  222.  
  223. # radomize pos_reviews_set and neg_reviews_set
  224. # doing so will output different accuracy result everytime we run the program
  225. from random import shuffle
  226. shuffle(pos_tweets_set)
  227. shuffle(neg_tweets_set)
  228.  
  229. test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
  230. train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]
  231.  
  232. print(len(test_set), len(train_set)) # Output: (2000, 8000)
  233.  
# Train a Naive Bayes sentiment classifier on the labelled feature sets.
classifier = NaiveBayesClassifier.train(train_set)

# Fraction of the held-out test set classified correctly.
accuracy = classify.accuracy(classifier, test_set)
#print(accuracy) # Output: 0.765

#print (classifier.show_most_informative_features(10))

# Sanity check: a clearly negative hand-written tweet.
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
#print (classifier.classify(custom_tweet_set)) # Output: neg

# Probability distribution over the two labels for the custom tweet.
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print ("neg probability:", prob_result.prob("neg")) # Output: 0.941844352481
print ("pos probability:", prob_result.prob("pos")) # Output: 0.0581556475194
  252.  
# Sanity check: a clearly positive hand-written tweet.
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)

#print (classifier.classify(custom_tweet_set)) # Output: pos
# Positive tweet correctly classified as positive

# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
#print (prob_result) # Output: <ProbDist with 2 samples>
#print (prob_result.max()) # Output: pos
#print (prob_result.prob("neg")) # Output: 0.00131055449755
#print (prob_result.prob("pos")) # Output: 0.998689445502


# bot training
# NOTE(review): this passes the literal filename string 'tweetset.json' to
# bag_of_words(), so the features are built from the filename text itself,
# not from the tweets stored in that file — almost certainly a bug; the
# file's contents should be loaded (see the `data` list above) first.
my_tweets = 'tweetset.json'
my_tweets_set = bag_of_words(my_tweets)

#classifier = NaiveBayesClassifier.train()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement