Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json

# NOTE(review): this script targets tweepy 3.x — `tweepy.streaming.StreamListener`
# was removed in tweepy 4; pin tweepy<4 or port the (commented) streaming code.
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
### STREAMING TWITTER DATA ###
# SECURITY: live API credentials were committed in this paste; they are
# redacted here and MUST be revoked/regenerated. Load secrets from the
# environment (e.g. os.environ) instead of hard-coding them.
# ACCESS_TOKEN = '<REDACTED>'
# ACCESS_SECRET = '<REDACTED>'
# CONSUMER_KEY = '<REDACTED>'
# CONSUMER_SECRET = '<REDACTED>'
# auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# twitter_api = tweepy.API(auth)
# class MyListener(tweepy.StreamListener):
#     def on_data(self, data):
#         try:
#             with open('hongkongprotests.json', 'a') as f:
#                 f.write(data)
#             print('.')
#             return True
#         except BaseException as e:
#             print("Error on data: %s" % str(e))
#             return True
#     def on_error(self, status):
#         print("status: " + str(status))
#         if(status == 420):
#             return False
#         return False
# twitter_stream = tweepy.Stream(auth, MyListener())
# twitter_stream.filter(track=['#HongKongProtests'])
### EXAMPLE: ACCESS THE DATA ###
# with open('twitter.json', 'r') as f:
#     line = f.readline()
#     tweet = json.loads(line)  # load as py dict
#     print(json.dumps(tweet, indent=2))
from nltk.tokenize import word_tokenize  # used only by the commented example below
import re

### EXAMPLE: TOKENIZE THE TWEET ###
# with open('lgbtq.json', 'r') as f:
#     tweet = f.readline()
#     print(word_tokenize(tweet))
# Tweet-tokenization regexes (adapted from M. Bonzanini's "Mining Twitter
# Data with Python" tutorial).  Each alternative captures one token class;
# order matters because the first matching branch wins.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    # NOTE(review): `[$-_@.&+]` is a character RANGE from '$' (0x24) to '_'
    # (0x5F), so it also admits digits, uppercase, '/', ':', etc. — kept as
    # in the original tutorial; tightening it would change tokenization.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

# Compiled once at import time.  VERBOSE lets the emoticon pattern carry
# inline comments/whitespace; IGNORECASE folds case for the letter branches.
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: tests whether one whole token is an emoticon.
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Split raw tweet text ``s`` into a list of token strings.

    Uses the module-level ``tokens_re`` pattern; characters matching no
    branch (whitespace, since every branch requires ``\\S``) are dropped
    by ``findall``.
    """
    return tokens_re.findall(s)
def preprocess(s, lowercase=False):
    """Tokenize tweet text ``s``, optionally lower-casing the tokens.

    Parameters
    ----------
    s : str
        Raw tweet text.
    lowercase : bool, optional
        When True, lowercase every token EXCEPT emoticons (``:D`` and
        ``:d`` are different emoticons, so their case is preserved).

    Returns
    -------
    list of str
        The token list produced by :func:`tokenize`.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower()
                  for token in tokens]
    return tokens
# Tokenize every collected tweet in 'lgbtq.json' (one JSON object per line,
# as written by the streaming listener above).
with open('lgbtq.json', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A trailing/blank line would make json.loads raise ValueError.
            continue
        tweet = json.loads(line)
        if 'text' not in tweet:
            # Deleted-tweet and rate-limit notices carry no 'text' field;
            # the original code raised KeyError on them.
            continue
        tokens = preprocess(tweet['text'])
        print(tokens)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement