# Whoever made these, THANKS
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
# https://github.com/haccer/tweep

from bs4 import BeautifulSoup # web scraping 101
from time import gmtime, strftime # timestamps
import datetime # another time tool
import json # web data container
import requests # standard web fare

# Twitter serves a degraded page without a browser-like User-Agent;
# this also defines the `agent` that get_feed() below relies on
agent = {'User-Agent': 'Mozilla/5.0'}

class tweep:
    def __init__(self, u, s=None, year=None):
        self.min = -1
        self.author, self.search, self.year = u, s, year
        self.feed = [-1] # sentinel so the first fetch loop runs
        self.tweets = 0
        self.tweet_urls = []
        self.result = []

    def get_url(self):
        url_1 = "https://twitter.com/search?f=tweets&vertical=default&lang=en&q="
        url_2 = "https://twitter.com/i/search/timeline?f=tweets&vertical=default" + \
            "&lang=en&include_available_features=1&include_entities=1&reset_error_state=false&src=typd"
        # first page uses the HTML search endpoint; later pages walk the JSON timeline
        url = url_1 if self.min == -1 else "{0}&max_position={1.min}&q=".format(url_2, self)
        if self.author is not None:
            url += "from%3A{0.author}".format(self)
        if self.search is not None:
            search = self.search.replace(' ', '%20').replace('#', '%23')
            url += "%20{}".format(search)
        if self.year is not None:
            url += "%20until%3A{0.year}-1-1".format(self)
        return url
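
# Illustration of what get_url() builds on the first page (hypothetical handle,
# assuming the 2018-era endpoints): tweep('jack', year=2017).get_url() gives
#   https://twitter.com/search?f=tweets&vertical=default&lang=en&q=from%3Ajack%20until%3A2017-1-1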

    def get_feed(self):
        r = requests.get(self.get_url(), headers=agent)
        self.feed = []
        try:
            if self.min == -1:
                html = r.text # first page is plain HTML
            else:
                json_response = json.loads(r.text)
                html = json_response['items_html'] # later pages wrap the HTML in JSON
            soup = BeautifulSoup(html, "lxml")
            self.feed = soup.find_all('li', 'js-stream-item')
            lastid = self.feed[-1]['data-item-id']
            firstid = self.feed[0]['data-item-id']
            if self.min == -1:
                # pagination cursor looks like TWEET-<oldest id>-<newest id>
                self.min = "TWEET-{}-{}".format(lastid, firstid)
            else:
                minsplit = json_response['min_position'].split('-')
                minsplit[1] = lastid
                self.min = "-".join(minsplit)
        except (IndexError, KeyError, ValueError):
            pass # an empty or malformed page ends the crawl
        return self.feed

    def get_tweets(self): # whoever wrote this is killing me
        for tweet in self.get_feed():
            self.tweets += 1
            tweetid = tweet['data-item-id']
            datestamp = tweet.find('a', 'tweet-timestamp')['title'].rpartition(' - ')[-1]
            date = datetime.datetime.strptime(datestamp, '%d %b %Y').strftime('%Y-%m-%d')
            # 'data-time' is a Unix timestamp; the timedelta string ends in H:MM:SS
            timestamp = str(datetime.timedelta(seconds=int(tweet.find('span', '_timestamp')['data-time']))).rpartition(', ')[-1]
            time = datetime.datetime.strptime(timestamp, '%H:%M:%S').strftime('%H:%M:%S')
            username = tweet.find('span', 'username').text.replace('@', '') # .encode() here broke str.replace on Python 3
            timezone = strftime("%Z", gmtime())
            text = tweet.find('p', 'tweet-text').text.replace('\n', ' ')
            # append one record per tweet; += flattened everything into one list,
            # which broke the i[4]/i[5] indexing in dig() and dig_stream() below
            self.result.append([tweetid, date, time, timezone, username, text])

    def main(self):
        # self.feed starts as the sentinel [-1], so this loops until a fetch comes back empty
        while len(self.feed) > 0:
            self.get_tweets()
        return self.result

def dig(username: str, tweet_count: int):
    return [i for i in tweep(username).main() if i[4] == username][:tweet_count]

def dig_stream(username: str, tweet_count: int):
    # join with a space so the last word of one tweet doesn't fuse with the next
    return " ".join([i[5] for i in dig(username, tweet_count)])
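
# Illustration only (hypothetical handle and made-up field values):
#   dig('someuser', 2) -> [['948...', '2018-01-09', '14:03:22', 'UTC', 'someuser', 'tweet one'],
#                          ['947...', '2018-01-08', '09:17:45', 'UTC', 'someuser', 'tweet two']]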

from keras.preprocessing.text import one_hot # text to array
from keras.preprocessing.sequence import pad_sequences # padding
from keras.models import Sequential # Layered Neural Network
from keras.layers import Dense, Flatten # standard fare
from keras.layers.embeddings import Embedding # vectorizing
from typing import List # because safety first
import numpy as np # Keras expects numpy arrays, not plain Python lists

def ShitFilter(positive: List[str], negative: List[str], tweet_count: int):
    docs = [dig_stream(i, tweet_count) for i in (positive + negative)] # data
    labels = np.array([0] * len(positive) + [1] * len(negative)) # expected output
    vocab = 10000000 # a really bad estimate of unique words
    max_len = tweet_count * 60 # 280 characters = 50~60 words
    dimensions = 300 # common word embed dimension number
    iterations = 14 # it's a lucky number, what do you expect?
    post_docs = pad_sequences([one_hot(d, vocab) for d in docs],
                              maxlen=max_len, padding='post') # word array building
    # Sequential.add() returns None, so layers can't be chained off the constructor
    model = Sequential()
    model.add(Embedding(vocab, dimensions, input_length=max_len))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid')) # output
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    print(model.summary()) # summarize the model
    model.fit(post_docs, labels, epochs=iterations, verbose=0) # fit the model
    loss, accuracy = model.evaluate(post_docs, labels, verbose=0) # evaluate the model
    return model # well that is how we are ending this
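
# A minimal usage sketch, assuming the scraping above still works against
# Twitter's 2018-era endpoints; all handles below are hypothetical placeholders.
if __name__ == '__main__':
    good_accounts = ['user_a', 'user_b'] # label 0
    bad_accounts = ['user_c', 'user_d'] # label 1
    model = ShitFilter(good_accounts, bad_accounts, tweet_count=50)
    # score one more account's recent tweets with the trained model;
    # vocab and maxlen must match the values hard-coded inside ShitFilter
    sample = pad_sequences([one_hot(dig_stream('user_e', 50), 10000000)],
                           maxlen=50 * 60, padding='post')
    print(model.predict(sample)) # closer to 1.0 means the "negative" class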