# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import json
import re

import matplotlib.pyplot as plt
import pandas as pd

# Reading and understanding data
# Load the captured tweets. Each line of the file is parsed as one JSON
# document; lines that are not valid JSON are skipped.
tweets_data_path = 'output.txt'
tweets_data = []
# The context manager closes the file even if reading fails part-way.
with open(tweets_data_path, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass: skip blank or
            # malformed lines without hiding unrelated errors.
            continue
        tweets_data.append(tweet)
# Report how many records were parsed successfully.
print(len(tweets_data))
# Display options: show every row and column, and do not truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit"; the legacy -1 spelling is deprecated and is rejected
# by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Keep only JSON objects that actually carry a 'text' field; indexing a
# record without it would raise KeyError and abort the whole script.
text_tweets = [
    record for record in tweets_data
    if isinstance(record, dict) and 'text' in record
]

# One row per kept record: the tweet text and its language code.
tweets = pd.DataFrame()
tweets['text'] = [record['text'] for record in text_tweets]
# 'lang' may be absent on a record; None keeps the row and is simply ignored
# by value_counts().
tweets['lang'] = [record.get('lang') for record in text_tweets]
# Number of tweets per language code, most frequent first.
tweets_by_lang = tweets['lang'].value_counts()

# Bar chart of the five most frequent languages.
fig, ax = plt.subplots()
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
# Decorate AFTER plotting: the pandas bar plot may relabel the x-axis with
# the index name (value_counts names its index in pandas 2.x), which would
# overwrite a label set beforehand.
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
# Dump the tweet texts to tweets.txt. print() writes the Series in its
# display form (index, values and the trailing Name/dtype line).
# Iterating over the DataFrame yields its column names, so looping here
# would only rewrite the identical file once per column; a single write
# produces the same result.
with open('tweets.txt', 'w', encoding='utf8') as f:
    print(tweets['text'], file=f)
# Second figure: the same top-5 language bar chart, this time with a grid.
fig, ax = plt.subplots()
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
# Decorate AFTER plotting: the pandas bar plot may relabel the x-axis with
# the index name, which would overwrite a label set beforehand.
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
# Grid is switched on for the axes that were just drawn.
plt.grid()
def word_in_text(word: str, text: str) -> bool:
    """Return True if *word* is found anywhere in *text*, ignoring case.

    ``word`` is passed to ``re.search`` and is therefore interpreted as a
    regular expression. Case-insensitivity comes from ``re.IGNORECASE``
    rather than from lower-casing both arguments: lower-casing the pattern
    would silently turn escapes such as ``\\D`` or ``\\W`` into ``\\d`` /
    ``\\w`` and change what is matched.

    Args:
        word: Pattern (typically a plain keyword) to look for.
        text: Text to search.

    Returns:
        True when the pattern matches somewhere in the text, else False.
    """
    return re.search(word, text, re.IGNORECASE) is not None
# For each streaming service, add a boolean column telling whether the tweet
# text mentions it, then print how many tweets do.
for keyword in ('hulu', 'netflix', 'amazon prime'):
    # kw=keyword binds the current keyword to the lambda at definition time.
    tweets[keyword] = tweets['text'].apply(
        lambda text, kw=keyword: word_in_text(kw, text)
    )

for keyword in ('hulu', 'netflix', 'amazon prime'):
    # Summing a boolean column counts its True rows. Unlike
    # value_counts()[True], this yields 0 instead of raising KeyError when
    # no tweet mentions the keyword.
    print(tweets[keyword].sum())
# Bar chart comparing how many tweets mention each service.
prg_langs = ['hulu', 'netflix', 'amazon prime']
# Summing a boolean column counts its True rows; unlike value_counts()[True]
# it gives 0 rather than KeyError when a service is never mentioned.
tweets_by_prg_lang = [tweets[name].sum() for name in prg_langs]
x_pos = list(range(len(prg_langs)))
width = 0.5

fig, ax = plt.subplots()
ax.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='g')

# Setting axis labels and ticks
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: Hulu vs. Netflix vs. Amazon Prime (Raw data)', fontsize=10, fontweight='bold')
# Ticks sit at the bar positions themselves (the former "+ 0 * width"
# offset added nothing).
ax.set_xticks(x_pos)
ax.set_xticklabels(prg_langs)
ax.grid()
# Display every figure created so far.
plt.show()
## Keep only the text of tweets whose language code is 'en'
is_english = tweets['lang'].eq('en')
eng_tweets = tweets.loc[is_english, 'text']

# Write the Series in its display form (same text print() would emit),
# followed by a newline, to eng.txt.
with open('eng.txt', 'w', encoding='utf8') as engtweets:
    engtweets.write(str(eng_tweets) + '\n')
# Add Comment
# Please sign in to add a comment.