Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- # Jeremy Robinson
- # Sep 8, 2016
- # Mining the Social Web - get Twitter Search Results
- import re
- import json
- import twitter
- import pandas as pd
- import numpy
- from collections import Counter
- from authTwitter import authTW
- from textblob import TextBlob
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.feature_extraction.text import CountVectorizer
def cleanTweet(t):
    """Strip @mentions, URLs, and non-alphanumeric characters from tweet text.

    Args:
        t: raw tweet text.

    Returns:
        Cleaned text with runs of whitespace collapsed to single spaces.
    """
    # FIX: the pattern must be a raw string — in a plain string, \w, \S and
    # \/ are invalid escape sequences (SyntaxWarning on Python 3.12+).
    # Alternatives, tried left to right: @mention | any char that is not
    # alnum/space/tab | URL (scheme://...).
    cleaned = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", t)
    return ' '.join(cleaned.split())
def getSentiment(t):
    """Classify tweet text by TextBlob polarity.

    Args:
        t: tweet text to analyze.

    Returns:
        'positive' if polarity > 0, 'neutral' if exactly 0, else 'negative'.
    """
    # Compute polarity once, then dispatch with guard-style returns.
    polarity = TextBlob(t).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'
def getSearch(t_obj, q='#tulsa', count=100):
    """Run a Twitter search and return cleaned tweets.

    Generalized: the query and result count, previously hard-coded, are now
    keyword parameters with the original values as defaults, so existing
    callers are unaffected.

    Args:
        t_obj: authenticated twitter API object (e.g. from authTW()).
        q: search query string (default '#tulsa').
        count: maximum number of tweets to request (default 100).

    Returns:
        list of [tweet_id, cleaned_text] pairs.
    """
    # Use the twitter API to fetch raw search results.
    search_results = t_obj.search.tweets(q=q, count=count)
    # Narrow the JSON payload to the status objects.
    statuses = search_results['statuses']
    # Pair each tweet id with its cleaned text.
    tw_list = []
    for tw in statuses:
        tw_list.append([tw['id'], cleanTweet(tw['text'])])
    return tw_list
def bagOfWords(sentence):
    """Build a lowercase word-frequency dict from a pandas Series of text.

    NOTE: Series.to_string() renders the index labels too, so index values
    appear as tokens in the result — that quirk is preserved for
    compatibility with the original behavior.

    Args:
        sentence: pandas Series of tweet text.

    Returns:
        dict mapping lowercased token -> occurrence count.
    """
    text = sentence.to_string()
    # BUG FIX: the original called sentence.strip('RT') and discarded the
    # result (str.strip returns a new string; it never mutates) — a pure
    # no-op, so removing it preserves behavior exactly.
    # Counter replaces the hand-rolled dict-increment loop; convert back to
    # a plain dict to keep the original return type.
    return dict(Counter(word.lower() for word in text.split()))
def main():
    """Fetch tweets, then print bag-of-words, term frequencies, and idf weights."""
    twitter_obj = authTW()
    r = getSearch(twitter_obj)
    # Column 0 holds the tweet id, column 1 the cleaned text.
    df = pd.DataFrame(r)

    # Bag of words over the text column.
    bag = bagOfWords(df[1])
    print(bag)
    print("=============================================")

    # Term frequency in descending order.
    print("Term Frequency\n", df[1].str.split(
        expand=True).stack().value_counts())
    print("=============================================")

    # Generate per-document word counts, then fit idf weights on them.
    cv = CountVectorizer()
    word_count_vector = cv.fit_transform(df[1].tolist())
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    # FIX: CountVectorizer.get_feature_names() was removed in scikit-learn
    # 1.2; get_feature_names_out() (available since 1.0) is the supported API.
    df_idf = pd.DataFrame(tfidf_transformer.idf_,
                          index=cv.get_feature_names_out(),
                          columns=["idf_weights"])
    # Print idf values in descending order.
    print(df_idf.sort_values(by=['idf_weights'], ascending=False))


# FIX: entry-point guard so importing this module does not immediately
# perform network calls against the Twitter API.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement