# Tweet analysis code for English-language tweets

import re

#Reading and understanding data
import json
import pandas as pd
import matplotlib.pyplot as plt

tweets_data_path ='output.txt'

# Parse the input file line by line; each line is expected to hold one
# JSON document. Lines that are not valid JSON (e.g. blank or truncated
# lines) are skipped rather than aborting the whole load.
tweets_data = []
# The context manager guarantees the file is closed even if reading fails.
# UTF-8 is requested explicitly, matching the encoding used for the files
# this script writes, instead of relying on the platform default.
with open(tweets_data_path, "r", encoding='utf8') as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError:
            # Only malformed JSON is ignored; any other error (including
            # KeyboardInterrupt) is allowed to propagate.
            continue
        tweets_data.append(tweet)

# Report how many records were parsed successfully.
print(len(tweets_data))

# Empty frame that later sections fill with one column per extracted field.
tweets = pd.DataFrame()

# Configure pandas so that printing a frame/series shows everything:
# no column or row truncation, and no shortening of long cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None means "no limit"; the older spelling -1 is deprecated since
# pandas 1.0 and rejected by recent releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.expand_frame_repr', True)
# Floats are rendered with thousands separators and two decimals.
pd.set_option('display.float_format', '{:20,.2f}'.format)


# Copy the 'text' and 'lang' fields of every parsed record into
# same-named columns of the frame.
for field in ('text', 'lang'):
    tweets[field] = [record[field] for record in tweets_data]

# Number of tweets per language code, most frequent first.
tweets_by_lang = tweets['lang'].value_counts()

# Bar chart of the five most frequent tweet languages.
fig, ax = plt.subplots()
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')

# Style the axes only after plotting: pandas' bar plot sets the x-axis
# label from the Series index name when it has one (value_counts names
# the index in pandas >= 2.0), which would overwrite a label set earlier.
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')

# Dump the printed form of the 'text' column to tweets.txt (plain text,
# not JSON). One write is enough: the previous loop iterated over the
# DataFrame's column names and rewrote the same file on every pass.
with open('tweets.txt', 'w', encoding='utf8') as f:
    print(tweets['text'], file = f)

# Bar chart of the five most frequent tweet languages, with a grid.
fig, ax = plt.subplots()
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')

# Style the axes only after plotting: pandas' bar plot sets the x-axis
# label from the Series index name when it has one (value_counts names
# the index in pandas >= 2.0), which would overwrite a label set earlier.
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
plt.grid()

def word_in_text(word, text):
    """Report whether ``word`` occurs in ``text``, ignoring case.

    Both arguments are lower-cased, then ``word`` is used as the pattern
    of ``re.search`` against ``text``.

    Returns:
        True if the search finds a match, False otherwise.
    """
    return re.search(word.lower(), text.lower()) is not None

# Add one boolean column per streaming service: True where the tweet text
# mentions the service name (case-insensitive, via word_in_text).
tweets['hulu'] = tweets['text'].apply(lambda tweet: word_in_text('hulu', tweet))
tweets['netflix'] = tweets['text'].apply(lambda tweet: word_in_text('netflix', tweet))
tweets['amazon prime'] = tweets['text'].apply(lambda tweet: word_in_text('amazon prime', tweet))

# Print the number of matching tweets per service. Summing the boolean
# column counts the True values and yields 0 when nothing matches, whereas
# value_counts()[True] raises KeyError in that case.
print(tweets['hulu'].sum())
print(tweets['netflix'].sum())
print(tweets['amazon prime'].sum())

# Services to compare; each name is also a boolean column of `tweets`.
prg_langs = ['hulu', 'netflix', 'amazon prime']
# Matching-tweet count per service. Summing the boolean column gives 0 when
# a service is never mentioned; value_counts()[True] would raise KeyError.
tweets_by_prg_lang = [tweets[service].sum() for service in prg_langs]

x_pos = list(range(len(prg_langs)))
width = 0.5
fig, ax = plt.subplots()
ax.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='g')

# Setting axis labels and ticks
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: Hulu vs. Netflix vs. Amazon Prime (Raw data)', fontsize=10, fontweight='bold')
# One tick per bar, placed at the bar's x position.
ax.set_xticks(x_pos)
ax.set_xticklabels(prg_langs)
plt.grid()
plt.show()

## Keep only the text of tweets whose language code is 'en'
is_english = tweets['lang'] == 'en'
eng_tweets = tweets.loc[is_english, 'text']

# Write the printed form of that selection to eng.txt.
with open('eng.txt', 'w', encoding='utf8') as english_out:
    english_out.write(str(eng_tweets) + '\n')