Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- # Jeremy Robinson
- # Sep 8, 2016
- # Mining the Social Web - get Twitter Search Results
- import re
- import json
- import twitter
- import pandas as pd
- import numpy
- from collections import Counter
- from authTwitter import authTW
- from textblob import TextBlob
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.feature_extraction.text import CountVectorizer
def cleanTweet(t):
    """Strip @mentions, URLs, and non-alphanumeric characters from tweet text.

    Args:
        t: raw tweet text.

    Returns:
        Cleaned text with runs of whitespace collapsed to single spaces.
    """
    # FIX: the pattern must be a raw string — in a plain string, \w, \S and
    # \/ are invalid escape sequences (SyntaxWarning on Python 3.12+).
    # Alternatives, tried left to right: @mention | any char that is not
    # alnum/space/tab | URL (scheme://...).
    cleaned = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", t)
    return ' '.join(cleaned.split())
def getSentiment(t):
    """Classify tweet text by TextBlob polarity.

    Args:
        t: tweet text to analyze.

    Returns:
        'positive' if polarity > 0, 'neutral' if exactly 0, else 'negative'.
    """
    # Compute polarity once, then dispatch with guard-style returns.
    polarity = TextBlob(t).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'
def getSearch(t_obj, q='#tulsa', count=100):
    """Run a Twitter search and return cleaned tweets.

    Generalized: the query and result count, previously hard-coded, are now
    keyword parameters with the original values as defaults, so existing
    callers are unaffected.

    Args:
        t_obj: authenticated twitter API object (e.g. from authTW()).
        q: search query string (default '#tulsa').
        count: maximum number of tweets to request (default 100).

    Returns:
        list of [tweet_id, cleaned_text] pairs.
    """
    # Use the twitter API to fetch raw search results.
    search_results = t_obj.search.tweets(q=q, count=count)
    # Narrow the JSON payload to the status objects.
    statuses = search_results['statuses']
    # Pair each tweet id with its cleaned text.
    tw_list = []
    for tw in statuses:
        tw_list.append([tw['id'], cleanTweet(tw['text'])])
    return tw_list
def bagOfWords(sentence):
    """Build a lowercase word-frequency dict from a pandas Series of text.

    NOTE: Series.to_string() renders the index labels too, so index values
    appear as tokens in the result — that quirk is preserved for
    compatibility with the original behavior.

    Args:
        sentence: pandas Series of tweet text.

    Returns:
        dict mapping lowercased token -> occurrence count.
    """
    text = sentence.to_string()
    # BUG FIX: the original called sentence.strip('RT') and discarded the
    # result (str.strip returns a new string; it never mutates) — a pure
    # no-op, so removing it preserves behavior exactly.
    # Counter replaces the hand-rolled dict-increment loop; convert back to
    # a plain dict to keep the original return type.
    return dict(Counter(word.lower() for word in text.split()))
def main():
    """Fetch tweets, then print bag-of-words, term frequencies, and idf weights."""
    twitter_obj = authTW()
    r = getSearch(twitter_obj)
    # Column 0 holds the tweet id, column 1 the cleaned text.
    df = pd.DataFrame(r)

    # Bag of words over the text column.
    bag = bagOfWords(df[1])
    print(bag)
    print("=============================================")

    # Term frequency in descending order.
    print("Term Frequency\n", df[1].str.split(
        expand=True).stack().value_counts())
    print("=============================================")

    # Generate per-document word counts, then fit idf weights on them.
    cv = CountVectorizer()
    word_count_vector = cv.fit_transform(df[1].tolist())
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    # FIX: CountVectorizer.get_feature_names() was removed in scikit-learn
    # 1.2; get_feature_names_out() (available since 1.0) is the supported API.
    df_idf = pd.DataFrame(tfidf_transformer.idf_,
                          index=cv.get_feature_names_out(),
                          columns=["idf_weights"])
    # Print idf values in descending order.
    print(df_idf.sort_values(by=['idf_weights'], ascending=False))


# FIX: entry-point guard so importing this module does not immediately
# perform network calls against the Twitter API.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement