Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Imports: stdlib first, then third-party ---------------------------------
# (Fixed: `TweetTokenizer` was imported twice in the original; imports were
#  also interleaved with statements.)
import codecs  # NOTE(review): imported but not used in the visible code
import json
import re
import string

import botometer
import tweepy
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

from nltk import NaiveBayesClassifier, classify
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# English stopword list consumed by clean_tweets() below.
stopwords_english = stopwords.words('english')

# Stemmer instance; defined but not used in the visible portion of the file.
stemmer = PorterStemmer()
# Emoticon lexicons; tokens matching these are dropped during tweet cleaning.

# Happy emoticons.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Sad emoticons.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every emoticon, happy or sad.
emoticons = emoticons_happy | emoticons_sad
# --- Twitter / Botometer credentials -----------------------------------------
# SECURITY: these API keys were hard-coded (and pasted publicly) — treat them
# as compromised and revoke/rotate them.  Environment variables now take
# precedence; the literals remain only as a backward-compatible default so the
# script keeps running unchanged where no env vars are set.
import os

ckey = os.environ.get("TWITTER_CONSUMER_KEY", "ZUKS9ElXbYO4UBHnHBkhSLPdW")
csecret = os.environ.get("TWITTER_CONSUMER_SECRET",
                         "Ww7YKAvf4LMSyffxmYSdkauu3JUhcKOo9XxLpnCPWAPx3ksCvg")
atoken = os.environ.get("TWITTER_ACCESS_TOKEN",
                        "92337145-mI3PSDTdjAw38ld3kBqPnDavytDiSsFdve0DHd0jA")
asecret = os.environ.get("TWITTER_ACCESS_SECRET",
                         "6ATbGCPfsey6LYsfTKNVX8m8nT9QThvhzTQ50WcJX6nZd")

# OAuth 1.0a handshake for the Tweepy REST client.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)

# Botometer (bot-detection) API key plus the same Twitter app credentials.
# (The original repeated the key literals here; reuse the variables instead.)
mashape_key = os.environ.get("BOTOMETER_MASHAPE_KEY",
                             "ad092870a1msh558166c739f9457p12bddfjsn39f1129751f5")
twitter_app_auth = {
    'consumer_key': ckey,
    'consumer_secret': csecret,
    'access_token': atoken,
    'access_token_secret': asecret,
}
# --- Search configuration -----------------------------------------------------
search_term = "ExpressVPN"     # search word; OR for a OR b, AND for a AND b
max_tweets = 500               # number of tweets to fetch
n = 1                          # running counter used by the (disabled) loops
retweet = " -filter:retweets"  # exclude retweets from search results
query = search_term + retweet

# (One-off collection step, kept for reference but disabled: a
#  tweepy.Cursor(api.search, q=query, tweet_mode='extended', lang='en')
#  loop appended each tweet's _json to tweetset.json, one object per line,
#  printing n and tweet.full_text as it went.)

# Load the previously collected tweet dataset (one JSON object per line).
with open('tweetset.json') as f:
    data = [json.loads(line) for line in f]
# Positive training tweets: one JSON-encoded full_text value per line.
with open('onlypostweet.json') as f:
    posdata = [json.loads(line) for line in f]

# Negative training tweets, same line-per-record format.
with open('onlynegtweet.json') as f:
    negdata = [json.loads(line) for line in f]

print(negdata)

# (Reference snippets from development, all disabled:
#  - print each positive record's full_text plus the author's screen_name
#  - write each negative record's full_text out to onlynegtweet.json
#  - re-collect tweets via tweepy.Cursor into tweetset.json
#  - dump every record in `data` with its screen_name)
# Botometer client used to score how bot-like a Twitter account is.
# wait_on_ratelimit=True makes it wait out the API rate limit instead of
# raising; the Twitter app credentials are splatted in as keyword args.
bom = botometer.Botometer(wait_on_ratelimit=True,
mashape_key=mashape_key,
**twitter_app_auth)
# (Disabled during development: re-load tweetset.json into `data`, then for
#  each record query bom.check_account(screen_name) and print the 'universal'
#  bot score as a percentage.)
def clean_tweets(tweet):
    """Tokenize a raw tweet and strip noise tokens.

    Removes stock tickers (e.g. ``$GE``), an old-style leading ``RT`` marker,
    hyperlinks and the ``#`` of hashtags (keeping the tag word), then
    tokenizes with TweetTokenizer (lower-cased, @handles stripped, elongated
    characters reduced) and drops English stopwords, emoticons and
    single-character punctuation.

    Args:
        tweet: raw tweet text (str).

    Returns:
        list[str]: the cleaned tokens, in original order.
    """
    tweet = re.sub(r'\$\w*', '', tweet)                 # stock tickers like $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)              # old-style retweet "RT"
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)                     # drop only the '#' sign
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    # Comprehension replaces the original append loop (identical filter).
    return [word for word in tokenizer.tokenize(tweet)
            if word not in stopwords_english    # remove stopwords
            and word not in emoticons           # remove emoticons
            and word not in string.punctuation] # remove punctuation
#print (clean_tweets(custom_tweet))
# (Disabled: loop over `data` querying Botometer for each author and printing
#  the 'universal' bot score — same snippet as earlier in the file.)
#gettweets
#def get_tweets(tweetset):
# NLTK's bundled twitter_samples corpus: 5000 positive / 5000 negative tweets.
pos_tweets = twitter_samples.strings('positive_tweets.json')
print ("onlypostweets = ",len(pos_tweets)) # Output: 5000
neg_tweets = twitter_samples.strings('negative_tweets.json')
print (len(neg_tweets)) # Output: 5000
#all_tweets = twitter_samples.strings('tweetset.json')
#print (clean_tweets(pos_tweets[5]))
#print (pos_tweets[5])
#print (len(all_tweets)) # Output: 20000
# tokenize tweets (disabled sample below)
#for tweet in pos_tweets[:5]:
# print (tweet_tokenizer.tokenize(tweet))
# Next step: write tweets out to a text document.  (translated from Swedish)
def bag_of_words(tweet):
    """Feature extractor: map each cleaned token of *tweet* to True.

    Returns the ``{token: True, ...}`` dict shape NLTK's NaiveBayesClassifier
    expects.  ``dict.fromkeys`` replaces the original
    ``dict([word, True] for word in words)`` construction (same result).
    """
    return dict.fromkeys(clean_tweets(tweet), True)
#print (bag_of_words(custom_tweet))
# Labelled feature sets built from the scraped positive/negative tweets.
pos_tweets_set = [(bag_of_words(t), 'pos') for t in posdata]
neg_tweets_set = [(bag_of_words(t), 'neg') for t in negdata]
#print (len(pos_tweets_set), len(neg_tweets_set)) # Output: (5000, 5000)

# Shuffle both sets so every run yields a different split (and accuracy).
from random import shuffle
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

# First 1000 of each class go to testing; the remainder trains the model.
test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]
print(len(test_set), len(train_set)) # Output: (2000, 8000)

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
#print(accuracy) # Output: 0.765
#print (classifier.show_most_informative_features(10))
# Sanity check: a clearly negative sentence should classify as 'neg'.
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
#print (classifier.classify(custom_tweet_set)) # Output: neg
# Full probability distribution over both labels.
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print ("neg probability:", prob_result.prob("neg")) # Output: e.g. 0.941844352481
print ("pos probability:", prob_result.prob("pos")) # Output: e.g. 0.0581556475194
# Sanity check: a clearly positive sentence should classify as 'pos'.
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)
#print (classifier.classify(custom_tweet_set)) # Output: pos
# Positive tweet correctly classified as positive
# probability result (disabled)
prob_result = classifier.prob_classify(custom_tweet_set)
#print (prob_result) # Output: <ProbDist with 2 samples>
#print (prob_result.max()) # Output: pos
#print (prob_result.prob("neg")) # Output: 0.00131055449755
#print (prob_result.prob("pos")) # Output: 0.998689445502
# --- Bot training / classifying the collected tweets -------------------------
# BUG FIX: the original called bag_of_words('tweetset.json'), i.e. it built
# features from the literal file NAME, not from the tweets inside the file.
# Read the file and build one feature dict per collected tweet instead.
my_tweets = 'tweetset.json'
with open(my_tweets) as f:
    # Each line is a full tweet _json object; tweets were collected with
    # tweet_mode='extended', so the text lives under 'full_text' —
    # NOTE(review): confirm against the collection loop above.
    my_tweets_set = [bag_of_words(json.loads(line)['full_text']) for line in f]
#classifier = NaiveBayesClassifier.train()   # (left unfinished in original)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement