Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import psycopg2
- import time
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import Select
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
- from browsermobproxy import Server
- import urlparse
- import json
- import psutil
- import time
# Scraper for the Twitter ads-transparency pages of a fixed set of accounts.
#
# For every label read from networks.net_nodes the script starts a
# BrowserMob proxy and a headless Chrome routed through it, scrolls the
# transparency page to the bottom so that every timeline request is issued,
# then reads the captured tweets_timeline.json responses from the proxy's
# HAR log and stores the tweets in the "twitter" database.  The whole pass
# is repeated forever.

# NOTE(review): credentials are hard-coded in the DSNs below; they should be
# moved out of the source (environment / secrets store) and rotated.
SOURCE_DSN = "dbname='displayi' user='postgres' password='YXuyw3sdW2pV' host='webpostgres.czdjulbcdn68.us-east-1.rds.amazonaws.com' port='5432'"
TARGET_DSN = "dbname='twitter' user='postgres' password='szaflary' host='analyticpostgres.czdjulbcdn68.us-east-1.rds.amazonaws.com' port='5432'"

# Previously considered alternative (kept from the original as a comment):
# select label from networks.net_nodes where network_id ILIKE '%.T2' order by scale desc
LABELS_SQL = "select distinct label from networks.net_nodes where network_id = ANY(ARRAY['TWA418.T2','TWA435.T2','TWA409.T2','TWA126.T2', 'TWA443.T2'])"

# Values are bound by psycopg2 instead of being pasted into the SQL text, so
# tweet JSON containing quotes can neither break nor alter the statement.
UPSERT_TWEET_SQL = 'select upsertadstweet(%s, %s, %s, %s, 1, %s)'
INSERT_TASK_SQL = ('INSERT INTO public.tasks '
                   '(userid,priority,iters,createdwhen,task,userhandle,tweetid) '
                   "VALUES (%s, 1, 1, now()::timestamp, %s, '', %s)")

# Task codes written to public.tasks.  Their meaning is defined by the
# consumer of that table -- TODO confirm; the original only shows that 6 is
# queued for every tweet and 7 additionally for political ones.
TASK_EVERY_TWEET = 6
TASK_POLITICAL_TWEET = 7

TRANSPARENCY_URL = "https://ads.twitter.com/transparency/%s"
TIMELINE_ENDPOINT = 'https://ads.twitter.com/transparency/tweets_timeline.json'
BROWSERMOB_PATH = './browsermob-proxy-2.1.4/bin/browsermob-proxy'
CHROMEDRIVER_PATH = "/usr/lib/chromium-browser/chromedriver"
SCROLL_PAUSE_TIME = 3

# Process names killed before each run.  NOTE(review): this kills *every*
# process with one of these names, including unrelated "java" processes.
# "chromium-browse" is spelled exactly as in the original.
STALE_PROCESS_NAMES = ("browsermob-proxy", "java", "chromedriver", "chromium-browse")


def kill_stale_processes():
    """Kill leftover proxy/browser processes from earlier iterations."""
    for proc in psutil.process_iter():
        try:
            if proc.name() in STALE_PROCESS_NAMES:
                proc.kill()
        except psutil.Error:
            # The process exited in the meantime or may not be touched by
            # us; neither case should abort the scrape of the current label.
            continue


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the page down until its height stops growing.

    After every scroll the function waits ``pause`` seconds so that lazily
    loaded content can arrive before the height is measured again.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def iter_timeline_tweets(har):
    """Yield every tweet dict found in the timeline responses of ``har``.

    ``har`` is the dict returned by the proxy (``har['log']['entries']``).
    Only entries whose request URL contains the tweets_timeline.json
    endpoint are considered; entries captured without a response body are
    skipped instead of raising ``KeyError``.
    """
    for entry in har['log']['entries']:
        # No str() around the URL: on Python 2 it raises for non-ASCII URLs.
        if TIMELINE_ENDPOINT not in entry['request']['url']:
            continue
        body = entry['response']['content'].get('text')
        if not body:
            continue
        for tweet in json.loads(body)['tweets']:
            yield tweet


def save_ad_tweet(cursor, tweet):
    """Upsert one ad tweet and queue its follow-up task rows.

    Executes ``upsertadstweet`` with the tweet id, advertiser id, political
    flag, the tweet serialised as JSON and its ``lastActiveAtSeconds``; then
    inserts one row into ``public.tasks`` and a second one when the tweet is
    flagged political.  Nothing is committed here.

    Raises ``KeyError`` when the tweet lacks one of the expected fields.
    """
    tweet_id = tweet['id']
    promoted = tweet['promotedMetadata']
    advertiser_id = promoted['advertiserId']
    is_political = promoted['political']
    last_active = tweet['lastActiveAtSeconds']

    cursor.execute(UPSERT_TWEET_SQL,
                   (tweet_id, advertiser_id, is_political,
                    json.dumps(tweet), last_active))
    cursor.execute(INSERT_TASK_SQL, (advertiser_id, TASK_EVERY_TWEET, tweet_id))
    if is_political:
        cursor.execute(INSERT_TASK_SQL,
                       (advertiser_id, TASK_POLITICAL_TWEET, tweet_id))


def build_chrome_options(proxy_address):
    """Return the ChromeOptions for a headless browser behind the proxy."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--proxy-server=%s' % proxy_address)
    for flag in ("--headless",
                 "--no-sandbox",
                 "--disable-dev-shm-usage",
                 "--window-size=1920x1080",
                 "--disable-gpu",
                 "--disable-infobars",
                 "--disable-notifications"):
        chrome_options.add_argument(flag)
    return chrome_options


def scrape_label(label, cursor_insert):
    """Scrape the transparency page of ``label`` and stage its ad tweets.

    Returns the number of tweets passed to ``save_ad_tweet``.  The caller
    owns the transaction: nothing is committed or rolled back here.  The
    browser and the proxy server are always shut down, also on errors.
    """
    page_url = TRANSPARENCY_URL % label
    # Bound up front so the cleanup below never touches an unset name or a
    # driver/server object left over from the previous label.
    server = None
    driver = None
    try:
        kill_stale_processes()

        server = Server(BROWSERMOB_PATH)
        server.start()
        time.sleep(1)
        proxy = server.create_proxy()
        time.sleep(1)
        proxy.new_har(page_url, options={'captureContent': True})
        time.sleep(1)

        proxy_address = urlparse.urlparse(proxy.proxy).path
        driver = webdriver.Chrome(
            executable_path=CHROMEDRIVER_PATH,
            chrome_options=build_chrome_options(proxy_address))
        driver.get(page_url)
        time.sleep(5)
        scroll_to_bottom(driver)

        stored = 0
        for tweet in iter_timeline_tweets(proxy.har):
            save_ad_tweet(cursor_insert, tweet)
            stored += 1
        return stored
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(e)
        if server is not None:
            try:
                server.stop()
            except Exception as e:
                print(e)


def main():
    """Run the scrape loop forever over the configured labels."""
    conn = psycopg2.connect(SOURCE_DSN)
    conn_insert = psycopg2.connect(TARGET_DSN)
    conn_insert.autocommit = False
    cursor = conn.cursor()
    cursor_insert = conn_insert.cursor()

    while True:
        cursor.execute(LABELS_SQL)
        labels = cursor.fetchall()
        num_of_labels = len(labels)
        for count, row in enumerate(labels, 1):
            label = row[0]
            print('Looking for ads of user: %s ...' % label)
            print('%s / %s' % (count, num_of_labels))
            try:
                found = scrape_label(label, cursor_insert)
                print('Found modified ads tweets: %s' % found)
                conn_insert.commit()
                print('Saved to db analyticspostgres data tweets.')
            except Exception as e:
                # Discard the partial work of this label; without a rollback
                # the connection stays in an aborted transaction (or the
                # half-written rows get committed together with the next
                # label).
                try:
                    conn_insert.rollback()
                except Exception as rollback_error:
                    print(rollback_error)
                print(e)
                print('Scrapper didn\'t worked for %s user !!!' % label)
                # NOTE(review): the paste lost its indentation, so whether
                # this pause belonged to the failure path only is an
                # assumption -- confirm against the original script.
                time.sleep(2)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement