Guest User

Tweets code for eng

a guest
Sep 12th, 2019
126
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.01 KB | None | 0 0
  1. import re
  2.  
  3. #Reading and understanding data
  4. import json
  5. import pandas as pd
  6. import matplotlib.pyplot as plt
  7.  
  8. tweets_data_path ='output.txt'
  9.  
  10. tweets_data = []
  11. tweets_file = open(tweets_data_path, "r")
  12. for line in tweets_file:
  13.     try:
  14.         tweet = json.loads(line)
  15.         tweets_data.append(tweet)
  16.     except:
  17.         continue
  18.  
  19. print(len(tweets_data))
  20.  
  21. tweets = pd.DataFrame()
  22. pd.set_option('display.max_columns', None)  
  23. pd.set_option('display.max_rows', None)
  24. pd.set_option('display.max_colwidth', -1)
  25. pd.set_option('display.width', 200)
  26. pd.set_option('display.expand_frame_repr', True)
  27. pd.set_option('display.float_format', '{:20,.2f}'.format)
  28.  
  29.  
  30. tweets['text'] = [tweet['text'] for tweet in tweets_data]
  31. tweets['lang'] = [tweet['lang'] for tweet in tweets_data]
  32. tweets_by_lang = tweets['lang'].value_counts()
  33.  
  34. fig, ax = plt.subplots()
  35. ax.tick_params(axis='x', labelsize=15)
  36. ax.tick_params(axis='y', labelsize=10)
  37. ax.set_xlabel('Languages', fontsize=15)
  38. ax.set_ylabel('Number of tweets' , fontsize=15)
  39. ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
  40. tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
  41.  
  42. for tweet in tweets:
  43.     # add to JSON                            
  44.     with open('tweets.txt', 'w', encoding='utf8') as f:
  45.         print(tweets['text'], file = f)
  46.  
  47. fig, ax = plt.subplots()
  48. ax.tick_params(axis='x', labelsize=15)
  49. ax.tick_params(axis='y', labelsize=10)
  50. ax.set_xlabel('Languages', fontsize=15)
  51. ax.set_ylabel('Number of tweets' , fontsize=15)
  52. ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
  53. tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
  54. plt.grid()
  55.  
  56. def word_in_text(word, text):
  57.     word = word.lower()
  58.     text = text.lower()
  59.     match = re.search(word, text)
  60.     if match:
  61.         return True
  62.     return False
  63.  
  64. tweets['hulu'] = tweets['text'].apply(lambda tweet: word_in_text('hulu', tweet))
  65. tweets['netflix'] = tweets['text'].apply(lambda tweet: word_in_text('netflix', tweet))
  66. tweets['amazon prime'] = tweets['text'].apply(lambda tweet: word_in_text('amazon prime', tweet))
  67.  
  68. print(tweets['hulu'].value_counts()[True])
  69. print(tweets['netflix'].value_counts()[True])
  70. print(tweets['amazon prime'].value_counts()[True])
  71.  
  72. prg_langs = ['hulu', 'netflix', 'amazon prime']
  73. tweets_by_prg_lang = [tweets['hulu'].value_counts()[True], tweets['netflix'].value_counts()[True], tweets['amazon prime'].value_counts()[True]]
  74.  
  75. x_pos = list(range(len(prg_langs)))
  76. width = 0.5
  77. fig, ax = plt.subplots()
  78. plt.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='g')
  79.  
  80. # Setting axis labels and ticks
  81. ax.set_ylabel('Number of tweets', fontsize=15)
  82. ax.set_title('Ranking: Hulu vs. Netflix vs. Amazon Prime (Raw data)', fontsize=10, fontweight='bold')
  83. ax.set_xticks([p + 0 * width for p in x_pos])
  84. ax.set_xticklabels(prg_langs)
  85. plt.grid()
  86. plt.show()
  87.  
  88. ## Only extracting english language tweets
  89. eng_tweets = tweets.loc[tweets['lang'] == 'en', 'text']
  90. with open('eng.txt', 'w', encoding='utf8') as engtweets:
  91.         print(eng_tweets, file = engtweets)
Add Comment
Please, Sign In to add comment