Advertisement
Guest User

Untitled

a guest
Sep 29th, 2016
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.76 KB | None | 0 0
  1. import os
  2. import json
  3. import re
  4.  
  5. target_words = ['han', 'hon', 'den', 'det', 'denna', 'denne', 'hen']
  6. occurences = {word: 0 for word in target_words}
  7.  
  8. for filename in os.listdir(os.getcwd() + '/tweets':
  9.     file = open(filename, 'w')
  10.     analyze_file(file)
  11.  
  12. def analyze_file(file):
  13.     for line in file:
  14.         tweet = json.loads(line)
  15.         if 'retweeted_status' not in tweet:
  16.             analyze_tweet(tweet)
  17.  
  18. def analyze_tweet(tweet):
  19.     for target_word in target_words:
  20.         occurences[target_word] = occurence_count(target_word, tweet['text'].lower())
  21.  
  22. def occurence_count(target_word, text):
  23.     count = 0
  24.     words = re.findall('\w+', text).read() # TODO: Improve?
  25.     for word in words: # TODO: Use collections.Counter() instead?
  26.         if word == target_word:
  27.             count += 1
  28.     return count
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement