#!/usr/bin/python3
# Jeremy Robinson
# Sep 8, 2016
# Mining the Social Web - get Twitter Search Results

import re
import json
import twitter
import pandas as pd

from authTwitter import authTW
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

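# A minimal sketch of the local authTwitter module imported above, assuming
# the `twitter` (Python Twitter Tools) package used throughout Mining the
# Social Web; the four credential strings are placeholders, not real values:
#
#   import twitter
#
#   def authTW():
#       CONSUMER_KEY = '<consumer key>'
#       CONSUMER_SECRET = '<consumer secret>'
#       OAUTH_TOKEN = '<access token>'
#       OAUTH_TOKEN_SECRET = '<access token secret>'
#       auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
#                                  CONSUMER_KEY, CONSUMER_SECRET)
#       return twitter.Twitter(auth=auth)
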
def cleanTweet(t):
    # use a regular expression to strip @mentions, URLs, and all other
    # non-alphanumeric characters from the tweet text
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", t).split())

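# For example, with a hypothetical tweet:
#   cleanTweet("RT @user: Loving #tulsa! https://t.co/abc123") -> 'RT Loving tulsa'
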
def getSentiment(t):
    # using TextBlob, create an object from the input tweet
    tbObject = TextBlob(t)

    # classify the sentiment by the sign of the polarity score
    if tbObject.sentiment.polarity > 0:
        return 'positive'
    elif tbObject.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

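# For example, TextBlob's lexicon scores 'love' positively, so
# getSentiment('I love this city') should return 'positive'.
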
def getSearch(t_obj):
    q = '#tulsa'
    count = 100

    # use the twitter api to get the tweets
    search_results = t_obj.search.tweets(q=q, count=count)

    # filter the json results down to just the statuses
    statuses = search_results['statuses']

    # iterate through the statuses to get each tweet's text and id
    tw_list = []
    for tw in statuses:
        tw_text = tw['text']
        tw_text = cleanTweet(tw_text)

        tw_id = tw['id']
        tw_list.append([tw_id, tw_text])

        # use TextBlob to analyze and compute sentiment for the text
        # print("\n ", getSentiment(tw_text), " : ", tw_text)

    return tw_list

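# Each entry of the returned list pairs a tweet id with its cleaned text,
# e.g. (hypothetical values): [1205000000000000000, 'RT Loving tulsa'].
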
def bagOfWords(sentence):
    # build a word -> count dictionary from a pandas Series of tweet texts
    bow = {}
    text = ' '.join(sentence)
    for word in text.split():
        w = word.lower()
        # skip the retweet marker
        if w == 'rt':
            continue
        if w not in bow:
            bow[w] = 0
        bow[w] += 1
    return bow

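# collections.Counter builds the same tally in one step, e.g. (sketch, where
# `series` is the pandas Series of tweet texts):
#   Counter(w.lower() for w in ' '.join(series).split() if w.lower() != 'rt')
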
def main():
    twitter_obj = authTW()
    r = getSearch(twitter_obj)
    # print(json.dumps(r, indent=2))
    df = pd.DataFrame(r)

    # create the bag of words from the tweet-text column
    bag = bagOfWords(df[1])
    # print the bag of words
    print(bag)

    print("=============================================")

    # print out the term frequency in descending order:
    # str.split(expand=True) spreads each tweet's words across columns,
    # stack() flattens them into one Series, and value_counts() tallies them
    print("Term Frequency\n", df[1].str.split(
        expand=True).stack().value_counts())

    print("=============================================")

    # instantiate CountVectorizer()
    cv = CountVectorizer()

    # this step generates word counts for the words in the tweets
    # (a sparse documents-by-vocabulary matrix)
    word_count_vector = cv.fit_transform(df[1].tolist())

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

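    # with smooth_idf=True, scikit-learn computes
    # idf(t) = ln((1 + n) / (1 + df(t))) + 1, so rarer terms weigh more
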
  102. # idf values
  103. df_idf = pd.DataFrame(tfidf_transformer.idf_,
  104. index=cv.get_feature_names(), columns=["idf_weights"])
  105.  
  106. # print idf values in descending order
  107. print(df_idf.sort_values(by=['idf_weights'], ascending=False))
  108.  
  109.  
  110. main()
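
# Run with python3 after creating an authTwitter.py that defines authTW()
# with valid Twitter API credentials; the script prints the bag of words,
# the term-frequency table, and the idf weights for the '#tulsa' search.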