Advertisement
Guest User

BUAT YOGI :*

a guest
Aug 18th, 2019
158
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.47 KB | None | 0 0
  1. import tweepy
  2. from tweepy import OAuthHandler
  3. from tweepy import Stream
  4. from tweepy.streaming import StreamListener
  5.  
  6. import json
  7.  
  8. ## STREAMING TWITTER DATA ###
  9.  
  10. # ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN'        # redacted: never publish real API credentials;
  11. # ACCESS_SECRET = 'YOUR_ACCESS_SECRET'      # any key pair that was posted publicly should be revoked
  12. # CONSUMER_KEY = 'YOUR_CONSUMER_KEY'        # and regenerated. Load the values from environment
  13. # CONSUMER_SECRET = 'YOUR_CONSUMER_SECRET'  # variables or an untracked config file instead.
  14.  
  15. # auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
  16. # auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
  17.  
  18. # twitter_api = tweepy.API(auth)
  19.  
  20. # class MyListener(tweepy.StreamListener):
  21. #   def on_data(self, data):
  22. #       try:
  23. #           with open('hongkongprotests.json', 'a') as f:
  24. #               f.write(data)
  25. #               print('.')
  26. #               return True
  27. #       except BaseException as e:
  28. #           print("Error on data: %s" % str(e))
  29. #       return True
  30.  
  31. #   def on_error(self, status):
  32. #       print("status: " + str(status))
  33. #       if(status == 420):
  34. #           return False
  35. #       return False
  36.  
  37. # twitter_stream = tweepy.Stream(auth, MyListener())
  38. # twitter_stream.filter(track=['#HongKongProtests'])
  39.  
  40.  
  41.  
  42. ## EXAMPLE ACCESS THE DATA ###
  43. # with open('twitter.json', 'r') as f:
  44. #   line = f.readline()
  45. #   tweet = json.loads(line) # load as py dict
  46. #   print(json.dumps(tweet, indent=2))
  47.  
  48.  
  49.  
  50. from nltk.tokenize import word_tokenize
  51.  
  52. ## EXAMPLE TOKENIZE THE TWEET ###
  53. # with open('lgbtq.json', 'r') as f:
  54. #   tweet = f.readline()
  55. #   print(word_tokenize(tweet))
  56.  
  57. import re
  58.  
  59. emoticons_str = r"""
  60.         (?:
  61.             [:=;] # Eyes
  62.             [oO\-]? # Nose (optional)
  63.             [D\)\]\(\]/\\OpP] # Mouth
  64.         )"""
  65.  
  66. regex_str = [
  67.         emoticons_str,
  68.         r'<[^>]+>', # HTML tags
  69.         r'(?:@[\w_]+)', # @-mentions
  70.         r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
  71.         r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
  72.  
  73.         r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
  74.         r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
  75.         r'(?:[\w_]+)', # other words
  76.         r'(?:\S)' # anything else
  77.     ]
  78.  
  79. tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
  80. emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
  81.  
  82. def tokenize(s):
  83.     return tokens_re.findall(s)
  84.  
  85. def preprocess(s, lowercase=False):
  86.     tokens = tokenize(s)
  87.     if lowercase:
  88.         tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
  89.     return tokens
  90.  
  91. with open('lgbtq.json', 'r') as f:
  92.     for line in f:
  93.         tweet = json.loads(line)
  94.         tokens = preprocess(tweet['text'])
  95.         print(tokens)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement