Advertisement
Guest User

Untitled

a guest
Feb 9th, 2017
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.18 KB | None | 0 0
import csv
import math
import os
import pickle
from collections import Counter, namedtuple

import nltk
import praw
import tweepy
  8.  
  9. User = namedtuple('User', ('user', 'comments'))
  10. Info = namedtuple('Info', ('words', 'comments'))
  11.  
  12. reddit = praw.Reddit(username='ComparisonBot', password='r3dd1t2016',
  13.     client_id='slOtREeDSDjALg', client_secret='w0TBMNHGcZ0j9W6rqrCrUYM08nE',
  14.     user_agent='Extracts text from comments on Reddit for comparison purposes')
  15.  
  16. def sort_information(comments):
  17.     words = nltk.word_tokenize(unicode(comments))
  18.     return Info(words, comments)
  19.  
  20. try:
  21.     nltk.word_tokenize('Force data check')
  22. except:
  23.     nltk.download('punkt')
  24.  
  25. try:
  26.     with open('tweets.pkl', 'rb') as f:
  27.         print 'Loading from pkl'
  28.         trump_info = pickle.load(f)
  29. except:
  30.     with open('tweets.csv') as f:
  31.         print 'Loading from csv'
  32.         trump_info = sort_information([row['Text'] for row in csv.DictReader(f)])
  33.     with open('tweets.pkl', 'wb') as f:
  34.         print 'Writing pkl'
  35.         pickle.dump(trump_info, f, pickle.HIGHEST_PROTOCOL)
  36.  
  37. def get_twitter_comments(auth):
  38.     api = tweepy.API(auth)
  39.     account = api.me().screen_name
  40.  
  41.     tweet_list = []
  42.     master = []
  43.     user_tweets = api.user_timeline(account, count=200)
  44.     for tweet in user_tweets:
  45.         tweet_list.append(tweet)
  46.     last = tweet_list[-1].id - 1
  47.     while len(user_tweets) > 0:
  48.         user_tweets = api.user_timeline(account, count=200, max_id=last)
  49.         for tweet in user_tweets:
  50.             tweet_list.append(tweet)
  51.         last = tweet_list[-1].id - 1
  52.     for tweet in tweet_list:
  53.         tweet = unicode(tweet.text).replace('the','')
  54.         tweet = tweet.replace('be','')
  55.         tweet = tweet.replace('to','')
  56.         tweet = tweet.replace('of','')
  57.         tweet = tweet.replace('and','')
  58.         tweet = tweet.replace('  ',' ')
  59.         master.append(tweet)
  60.     return User(account, master)
  61.  
  62. def get_reddit_comments(account):
  63.     comment_list = []
  64.     user = reddit.redditor(unicode(account))
  65.     [comment_list.append(unicode(comment.body)) for comment in user.comments.new(limit=5000)]
  66.     return User(account, comment_list)
  67.  
  68. def calculate_similarity(info):
  69.     argument_1 = Counter(trump_info.words)
  70.     argument_2 = Counter(info.words)
  71.     terms = set(argument_1).union(argument_2)
  72.     product = sum(argument_1.get(i,0) * argument_2.get(i,0) for i in terms)
  73.     first = math.sqrt(sum(argument_1.get(i,0)**2 for i in terms))
  74.     second = math.sqrt(sum(argument_2.get(i,0)**2 for i in terms))
  75.     solution = product/(first*second)
  76.     length_1 = sum(argument_1.values())
  77.     length_2 = sum(argument_2.values())
  78.     lengths = min(length_1,length_2) / float(max(length_1,length_2))
  79.     similarity = round(lengths*solution * 100,2)
  80.     return similarity
  81.  
  82. def sentence_similarity(info):
  83.     sim = calculate_similarity(info)*.5
  84.     arg = Counter(unicode(trump_info.comments))
  85.     kwarg = Counter(unicode(info.comments))
  86.     intercept = set(arg.keys()) & (set(kwarg.keys()))
  87.     num = sum([arg[i] * kwarg[i] for i in intercept])
  88.     first = sum([arg[i]**2 for i in arg.keys()])
  89.     second = sum([kwarg[i]**2 for i in kwarg.keys()])
  90.     den = math.sqrt(first) * math.sqrt(second)
  91.     if not den:
  92.         similarity = 0.0
  93.     else:
  94.         similarity = float(num)/den
  95.     similarity = round((similarity*sim),2)
  96.     return similarity
  97.  
  98. def get_score(comments):
  99.     info = sort_information(comments)
  100.     return sentence_similarity(info)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement