# Untitled

import csv, pickle
import nltk
import praw
import tweepy
import math
import praw
from collections import Counter, namedtuple

# Lightweight records shared by the functions below.
#   User: `user` is the account name, `comments` the list of texts fetched
#         for that account.
#   Info: `words` is the token list produced by nltk.word_tokenize,
#         `comments` the input the tokens were derived from.
User = namedtuple('User', 'user comments')
Info = namedtuple('Info', 'words comments')

# Module-wide PRAW client; get_reddit_comments() uses it to look up redditors.
# NOTE(review): the account password and API secret below are committed in
# plain text. They should be rotated and loaded from configuration outside
# the source tree; they are left in place here so behaviour is unchanged.
reddit = praw.Reddit(
    client_id='slOtREeDSDjALg',
    client_secret='w0TBMNHGcZ0j9W6rqrCrUYM08nE',
    username='ComparisonBot',
    password='r3dd1t2016',
    user_agent='Extracts text from comments on Reddit for comparison purposes',
)

def sort_information(comments):
    """Tokenize *comments* and pair the tokens with the original input.

    Args:
        comments: The text to analyse. It is converted with ``unicode()``
            before tokenizing.

    Returns:
        Info: ``words`` holds the NLTK tokens, ``comments`` the untouched
        input.
    """
    # NOTE(review): when `comments` is a list (as the callers in this file
    # pass), unicode() yields the list's repr, so the tokens come from that
    # repr (brackets, quotes and u'' prefixes included).
    text = unicode(comments)
    return Info(words=nltk.word_tokenize(text), comments=comments)

# Probe the tokenizer once at import time so the 'punkt' model is fetched
# before any real text is processed.
try:
    nltk.word_tokenize('Force data check')
except LookupError:
    # NLTK signals a missing data resource with LookupError; catching only
    # that keeps Ctrl-C and unrelated failures from triggering a download.
    nltk.download('punkt')

try:
    with open('tweets.pkl', 'rb') as f:
        print 'Loading from pkl'
        trump_info = pickle.load(f)
except:
    with open('tweets.csv') as f:
        print 'Loading from csv'
        trump_info = sort_information([row['Text'] for row in csv.DictReader(f)])
    with open('tweets.pkl', 'wb') as f:
        print 'Writing pkl'
        pickle.dump(trump_info, f, pickle.HIGHEST_PROTOCOL)

def get_twitter_comments(auth):
    """Fetch the authenticated user's tweets with common filler words removed.

    Pages backwards through the account's timeline 200 tweets at a time until
    the API returns an empty page, then strips the words "the", "be", "to",
    "of" and "and" from each tweet's text.

    Args:
        auth: Authentication handler passed straight to ``tweepy.API``.

    Returns:
        User: ``user`` is the account's screen name, ``comments`` the list of
        cleaned tweet texts (empty when the account has no tweets).
    """
    import re  # local import: only this function needs it

    api = tweepy.API(auth)
    account = api.me().screen_name

    tweet_list = []
    max_id = None
    while True:
        if max_id is None:
            page = api.user_timeline(account, count=200)
        else:
            page = api.user_timeline(account, count=200, max_id=max_id)
        # An empty page means the timeline is exhausted. Checking before
        # indexing also avoids an IndexError for accounts with no tweets.
        if not page:
            break
        tweet_list.extend(page)
        # Ask for tweets strictly older than the oldest one seen so far.
        max_id = tweet_list[-1].id - 1

    # Match whole words only; plain substring removal would mangle words
    # that merely contain a filler word (e.g. "other", "before", "today").
    filler_words = re.compile(r'\b(?:the|be|to|of|and)\b', re.UNICODE)
    space_runs = re.compile(r' {2,}')

    master = []
    for tweet in tweet_list:
        text = filler_words.sub('', unicode(tweet.text))
        # Removing a word leaves its surrounding spaces behind; collapse them.
        master.append(space_runs.sub(' ', text))
    return User(account, master)

def get_reddit_comments(account):
    """Collect the newest comment bodies posted by a Reddit account.

    Args:
        account: Reddit username; converted with ``unicode()`` for the lookup.

    Returns:
        User: ``user`` is *account* as given, ``comments`` the list of comment
        bodies from the redditor's "new" listing (requested with limit=5000).
    """
    redditor = reddit.redditor(unicode(account))
    # Build the list directly instead of using a comprehension purely for its
    # append() side effects (which also created a throwaway list of None).
    comment_list = [unicode(comment.body)
                    for comment in redditor.comments.new(limit=5000)]
    return User(account, comment_list)

def calculate_similarity(info):
    """Score how closely *info*'s word usage matches the reference corpus.

    The score is the cosine similarity between the word-frequency vectors of
    ``trump_info.words`` and ``info.words``, scaled by the ratio of the
    shorter total word count to the longer one, expressed as a percentage.

    Args:
        info: Object with a ``words`` sequence (e.g. an ``Info`` record).

    Returns:
        float: Percentage in [0, 100] rounded to two decimals. Returns 0.0
        when either side has no words.
    """
    reference_counts = Counter(trump_info.words)
    candidate_counts = Counter(info.words)

    # An empty word list has zero magnitude and zero length, which would
    # otherwise cause a ZeroDivisionError below; nothing overlaps, so score 0.
    if not reference_counts or not candidate_counts:
        return 0.0

    # Only terms present on both sides contribute to the dot product.
    product = sum(count * candidate_counts[term]
                  for term, count in reference_counts.items()
                  if term in candidate_counts)
    first = math.sqrt(sum(count ** 2 for count in reference_counts.values()))
    second = math.sqrt(sum(count ** 2 for count in candidate_counts.values()))
    solution = product / (first * second)

    # Penalise a size mismatch between the two texts.
    length_1 = sum(reference_counts.values())
    length_2 = sum(candidate_counts.values())
    lengths = min(length_1, length_2) / float(max(length_1, length_2))

    return round(lengths * solution * 100, 2)

def sentence_similarity(info):
    """Combine the word-level score with a character-frequency cosine.

    Half of ``calculate_similarity(info)`` is multiplied by the cosine
    similarity between the character counts of ``unicode(trump_info.comments)``
    and ``unicode(info.comments)``.

    Args:
        info: Object with ``words`` and ``comments`` (e.g. an ``Info`` record).

    Returns:
        float: The combined score rounded to two decimals.
    """
    half_word_score = calculate_similarity(info) * .5

    # Counter over a unicode string counts individual characters.
    reference_chars = Counter(unicode(trump_info.comments))
    candidate_chars = Counter(unicode(info.comments))

    shared_chars = set(reference_chars) & set(candidate_chars)
    dot = sum(reference_chars[ch] * candidate_chars[ch] for ch in shared_chars)

    reference_sq = sum(count ** 2 for count in reference_chars.values())
    candidate_sq = sum(count ** 2 for count in candidate_chars.values())
    magnitude = math.sqrt(reference_sq) * math.sqrt(candidate_sq)

    # A zero magnitude means one side has no characters: treat as no overlap.
    cosine = float(dot) / magnitude if magnitude else 0.0
    return round((cosine * half_word_score), 2)

def get_score(comments):
    """Return the similarity score of *comments* against the reference corpus.

    Tokenizes *comments* with ``sort_information`` and passes the result to
    ``sentence_similarity``.
    """
    return sentence_similarity(sort_information(comments))