Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import json
- import re
# Words whose occurrences are tallied across all analyzed tweets.
target_words = ['han', 'hon', 'den', 'det', 'denna', 'denne', 'hen']

# Running total per target word; updated in place by analyze_tweet().
occurences = {word: 0 for word in target_words}


def analyze_file(file):
    """Tally target words for every eligible tweet in ``file``.

    ``file`` is any iterable of text lines, each holding one JSON object.
    Blank lines are skipped. Objects that carry a ``'retweeted_status'``
    key (presumably retweets) are ignored so the same text is not counted
    again.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for line in file:
        if not line.strip():
            # A blank separator line is not valid JSON; skip it.
            continue
        tweet = json.loads(line)
        if 'retweeted_status' not in tweet:
            analyze_tweet(tweet)


def analyze_tweet(tweet):
    """Add this tweet's target-word counts to the global ``occurences``.

    ``tweet`` must be a mapping with a ``'text'`` entry. Matching is
    case-insensitive because the text is lower-cased before counting.
    """
    text = tweet['text'].lower()  # lower-case once, not once per word
    for target_word in target_words:
        # Accumulate: plain assignment would discard earlier tweets' counts.
        occurences[target_word] += occurence_count(target_word, text)


def occurence_count(target_word, text):
    """Return how many times ``target_word`` appears as a whole word in ``text``.

    Words are maximal runs of ``\\w`` characters, so ``'han'`` does not
    match inside ``'handen'``. The comparison is case-sensitive; callers
    are expected to normalize case themselves.
    """
    return re.findall(r'\w+', text).count(target_word)


def main():
    """Analyze every file in the ``tweets`` directory under the cwd."""
    tweets_dir = os.path.join(os.getcwd(), 'tweets')
    for filename in os.listdir(tweets_dir):
        # os.listdir() yields bare names, so join them with the directory.
        path = os.path.join(tweets_dir, filename)
        # Read mode: opening with 'w' would truncate the tweet files.
        with open(path, 'r', encoding='utf-8') as file:
            analyze_file(file)


# The driver runs only after all functions above are defined, and only
# when the module is executed as a script.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement