Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv, pickle
- import nltk
- import praw
- import tweepy
- import math
- import praw
- from collections import Counter, namedtuple
import os

# Lightweight records shared by the fetchers and the scoring functions:
#   User -- an account name plus the list of texts collected for it.
#   Info -- the token list derived from a set of comments, plus the comments.
User = namedtuple('User', ('user', 'comments'))
Info = namedtuple('Info', ('words', 'comments'))

# SECURITY: the fallback literals below are live credentials committed to
# source. Each value can now be supplied through the environment instead; once
# the deployment sets these variables, rotate the credentials and delete the
# fallbacks. They are kept only so existing setups keep working unchanged.
reddit = praw.Reddit(
    username=os.environ.get('REDDIT_USERNAME', 'ComparisonBot'),
    password=os.environ.get('REDDIT_PASSWORD', 'r3dd1t2016'),
    client_id=os.environ.get('REDDIT_CLIENT_ID', 'slOtREeDSDjALg'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET',
                                 'w0TBMNHGcZ0j9W6rqrCrUYM08nE'),
    user_agent='Extracts text from comments on Reddit for comparison purposes')
def sort_information(comments):
    """Tokenize *comments* and bundle the tokens with the original input.

    Args:
        comments: a list/tuple of comment strings, or any single object whose
            string form should be tokenized.

    Returns:
        Info(words, comments) where ``words`` is the nltk token list and
        ``comments`` is the argument, unchanged.
    """
    # Converting a list straight to unicode yields its repr ("[u'...', ...]"),
    # which turned brackets, quotes, u-prefixes and escaped non-ASCII bytes
    # into tokens. Join the actual texts instead.
    # NOTE: a tweets.pkl cache written before this change still holds the old
    # tokens; delete it so the baseline is rebuilt with the same tokenization.
    pieces = comments if isinstance(comments, (list, tuple)) else [comments]
    text = u'\n'.join(
        piece if isinstance(piece, unicode)
        # Decode byte strings explicitly; the implicit ASCII decode done by
        # join would raise on any non-ASCII byte.
        else str(piece).decode('utf-8', 'replace')
        for piece in pieces)
    return Info(nltk.word_tokenize(text), comments)
# Make sure the punkt tokenizer data is installed: tokenizing a dummy string
# raises LookupError when the resource is missing, in which case fetch it.
try:
    nltk.word_tokenize('Force data check')
except LookupError:
    nltk.download('punkt')

# Load the baseline corpus. Prefer the pickle cache; on any failure to read it
# (missing, truncated, incompatible) rebuild from the CSV export and rewrite
# the cache. ``except Exception`` keeps that best-effort fallback while letting
# KeyboardInterrupt/SystemExit propagate, which the bare ``except`` did not.
# NOTE: pickle.load executes code embedded in the file; tweets.pkl must only
# ever be a file this script wrote itself.
# The parenthesized single-argument print emits the same text under Python 2.
try:
    with open('tweets.pkl', 'rb') as f:
        print('Loading from pkl')
        trump_info = pickle.load(f)
except Exception:
    with open('tweets.csv') as f:
        print('Loading from csv')
        trump_info = sort_information(
            [row['Text'] for row in csv.DictReader(f)])
    with open('tweets.pkl', 'wb') as f:
        print('Writing pkl')
        pickle.dump(trump_info, f, pickle.HIGHEST_PROTOCOL)
def get_twitter_comments(auth):
    """Fetch the authenticated account's tweets with common stop words removed.

    Args:
        auth: authentication handler, passed straight to ``tweepy.API``.

    Returns:
        User(screen_name, texts) where ``texts`` holds one cleaned string per
        tweet, in the order the timeline pages returned them. An account with
        no tweets yields an empty list (previously this raised IndexError).
    """
    import re  # local import: only this function needs it

    api = tweepy.API(auth)
    account = api.me().screen_name

    # Page through the timeline 200 tweets at a time. Each following request
    # asks only for tweets with an id below the last one already collected,
    # and the loop ends on the first empty page.
    tweet_list = []
    page_args = {'count': 200}
    while True:
        page = api.user_timeline(account, **page_args)
        if not page:
            break
        tweet_list.extend(page)
        page_args['max_id'] = page[-1].id - 1

    # Remove the stop words only where they stand as whole words. The earlier
    # chained substring replaces also cut them out of longer words
    # ("there" -> "re", "sand" -> "s").
    stop_words = re.compile(r'\b(?:the|be|to|of|and)\b', re.UNICODE)
    # Collapse the runs of spaces left behind by the removals. The original
    # single-space replace was a no-op as pasted -- presumably a double-space
    # collapse whose whitespace was lost.
    extra_spaces = re.compile(r' {2,}')

    master = []
    for tweet in tweet_list:
        text = stop_words.sub('', unicode(tweet.text))
        master.append(extra_spaces.sub(' ', text))
    return User(account, master)
def get_reddit_comments(account):
    """Collect the bodies of a redditor's newest comments.

    Args:
        account: Reddit user name; converted to unicode before the lookup.

    Returns:
        User(account, bodies) where ``bodies`` is a list of unicode comment
        bodies from the ``new`` listing, requested with ``limit=5000``.
    """
    user = reddit.redditor(unicode(account))
    # Build the list directly rather than using a comprehension purely for its
    # append side effect, which also allocated a throwaway list of Nones.
    comment_list = [unicode(comment.body)
                    for comment in user.comments.new(limit=5000)]
    return User(account, comment_list)
def calculate_similarity(info):
    """Score how closely ``info.words`` matches the baseline token counts.

    The score is the cosine similarity between the token-frequency vectors of
    ``trump_info.words`` and ``info.words``, multiplied by the ratio of the
    smaller token total to the larger one, expressed as a percentage.

    Args:
        info: object with a ``words`` sequence of tokens.

    Returns:
        float rounded to two decimals; 0.0 when either token list is empty
        (previously that case raised ZeroDivisionError).
    """
    baseline = Counter(trump_info.words)
    candidate = Counter(info.words)

    # Each vector's norm depends only on its own counts, so there is no need
    # to walk the union of both vocabularies.
    norm_product = (math.sqrt(sum(n ** 2 for n in baseline.values())) *
                    math.sqrt(sum(n ** 2 for n in candidate.values())))
    if not norm_product:
        # At least one side has no tokens: nothing to compare.
        return 0.0

    # Only terms present on both sides contribute to the dot product.
    dot = sum(count * candidate[term]
              for term, count in baseline.items() if term in candidate)
    cosine = dot / norm_product

    # Scale down the score when the two texts differ greatly in size.
    total_baseline = sum(baseline.values())
    total_candidate = sum(candidate.values())
    length_ratio = (min(total_baseline, total_candidate) /
                    float(max(total_baseline, total_candidate)))
    return round(length_ratio * cosine * 100, 2)
def sentence_similarity(info):
    """Combine the token-level score with a character-frequency cosine.

    Half of ``calculate_similarity(info)`` is multiplied by the cosine
    similarity of the character counts of the unicode form of
    ``trump_info.comments`` and of ``info.comments``.

    Args:
        info: object with ``words`` and ``comments`` attributes.

    Returns:
        The product rounded to two decimals.
    """
    word_score = calculate_similarity(info) * .5

    # Counter over a string counts individual characters.
    # NOTE(review): when ``comments`` is a list this is the list's string
    # form, brackets and quotes included -- confirm that is intended.
    baseline_chars = Counter(unicode(trump_info.comments))
    candidate_chars = Counter(unicode(info.comments))

    shared = set(baseline_chars) & set(candidate_chars)
    dot = sum(baseline_chars[ch] * candidate_chars[ch] for ch in shared)

    baseline_sq = sum(n ** 2 for n in baseline_chars.values())
    candidate_sq = sum(n ** 2 for n in candidate_chars.values())
    norm = math.sqrt(baseline_sq) * math.sqrt(candidate_sq)

    char_score = float(dot) / norm if norm else 0.0
    return round((char_score * word_score), 2)
def get_score(comments):
    """Return the similarity score of *comments* against the baseline.

    The comments are tokenized with ``sort_information`` and the resulting
    record is scored by ``sentence_similarity``.
    """
    return sentence_similarity(sort_information(comments))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement