Advertisement
Guest User

Untitled

a guest
Jan 16th, 2019
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.62 KB | None | 0 0
  1. import psycopg2
  2. import time
  3. from selenium import webdriver
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.support.ui import Select
  6. from selenium.webdriver.support.ui import WebDriverWait
  7. from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
  8. from browsermobproxy import Server
  9. import urlparse
  10. import json
  11. import psutil
  12. import time
  13.  
  14. conn = psycopg2.connect("dbname='displayi' user='postgres' password='YXuyw3sdW2pV' host='webpostgres.czdjulbcdn68.us-east-1.rds.amazonaws.com' port='5432'")
  15. conn_insert = psycopg2.connect("dbname='twitter' user='postgres' password='szaflary' host='analyticpostgres.czdjulbcdn68.us-east-1.rds.amazonaws.com' port='5432'")
  16.  
  17. conn_insert.autocommit = False
  18. cursor = conn.cursor()
  19. cursor_insert = conn_insert.cursor()
  20.  
  21.  
  22. while True:
  23. cursor.execute("select distinct label from networks.net_nodes where network_id = ANY(ARRAY['TWA418.T2','TWA435.T2','TWA409.T2','TWA126.T2', 'TWA443.T2'])")#select label from networks.net_nodes where network_id ILIKE '%.T2' order by scale desc
  24. labels = cursor.fetchall()
  25. count = 1
  26. num_of_labels = len(labels)
  27. for label in labels:
  28. print 'Looking for ads of user: %s ...' % label[0]
  29. print '%s / %s' % (count, num_of_labels)
  30. count += 1
  31. try:
  32. for proc in psutil.process_iter():
  33. # check whether the process name matches
  34. if proc.name() == "browsermob-proxy" or proc.name() == "java" or proc.name() == "chromedriver" or proc.name() == "chromium-browse":
  35. proc.kill()
  36.  
  37. server = Server('./browsermob-proxy-2.1.4/bin/browsermob-proxy')
  38. server.start()
  39. time.sleep(1)
  40. proxy = server.create_proxy()
  41. time.sleep(1)
  42. proxy.new_har("https://ads.twitter.com/transparency/%s" % label[0], options={'captureContent':True})
  43. time.sleep(1)
  44. url = urlparse.urlparse(proxy.proxy).path
  45.  
  46. chrome_options = webdriver.ChromeOptions()
  47. # set chrome options
  48. chrome_options.add_argument('--proxy-server=%s' % url)
  49. chrome_options.add_argument("--headless");
  50. chrome_options.add_argument("--no-sandbox");
  51. chrome_options.add_argument("--disable-dev-shm-usage");
  52. chrome_options.add_argument("--window-size=1920x1080")
  53. chrome_options.add_argument("--disable-gpu")
  54. chrome_options.add_argument("--disable-infobars")
  55. chrome_options.add_argument("--disable-notifications")
  56.  
  57. driver = webdriver.Chrome(
  58. executable_path="/usr/lib/chromium-browser/chromedriver",
  59. chrome_options=chrome_options)
  60.  
  61. driver.get("https://ads.twitter.com/transparency/%s" % label[0])
  62. time.sleep(5)
  63.  
  64. SCROLL_PAUSE_TIME = 3
  65.  
  66. # Get scroll height
  67. last_height = driver.execute_script("return document.body.scrollHeight")
  68.  
  69. while True:
  70. # Scroll down to bottom
  71. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  72.  
  73. # Wait to load page
  74. time.sleep(SCROLL_PAUSE_TIME)
  75.  
  76. # Calculate new scroll height and compare with last scroll height
  77. new_height = driver.execute_script("return document.body.scrollHeight")
  78. if new_height == last_height:
  79. break
  80. last_height = new_height
  81.  
  82. modified_ads_tweets_count = 0
  83. for request in proxy.har['log']['entries']:
  84. if 'https://ads.twitter.com/transparency/tweets_timeline.json' in str(request['request']['url']):
  85. ads_data = json.loads(request['response']['content']['text'])
  86. modified_ads_tweets_count += len(ads_data['tweets'])
  87. for tweet in ads_data['tweets']:
  88. tweet_id = tweet['id']
  89. p_id = tweet['promotedMetadata']['advertiserId']
  90. is_political = tweet['promotedMetadata']['political']
  91. lastactiveatseconds = tweet['lastActiveAtSeconds']
  92. cursor_insert.execute('select upsertadstweet(%s, %s, %s, \'%s\', 1, %s)' \
  93. % (tweet_id, p_id, is_political, json.dumps(tweet), lastactiveatseconds))
  94.  
  95. cursor_insert.execute('INSERT INTO public.tasks '\
  96. '(userid,priority,iters,createdwhen,task,userhandle,tweetid) '\
  97. 'VALUES (%s, 1, 1, now()::timestamp, 6, \'\', %s)' \
  98. % (p_id, tweet_id))
  99.  
  100. if is_political:
  101. cursor_insert.execute('INSERT INTO public.tasks '\
  102. '(userid,priority,iters,createdwhen,task,userhandle,tweetid) '\
  103. 'VALUES (%s, 1, 1, now()::timestamp, 7, \'\', %s)' \
  104. % (p_id, tweet_id))
  105.  
  106. print 'Found modified ads tweets: %s' % modified_ads_tweets_count
  107.  
  108. conn_insert.commit()
  109.  
  110.  
  111. print 'Saved to db analyticspostgres data tweets.'
  112.  
  113. driver.quit()
  114. server.stop()
  115. except Exception as e:
  116. if driver:
  117. driver.quit()
  118. if server:
  119. server.stop()
  120. print e
  121. print 'Scrapper didn\'t worked for %s user !!!' % label[0]
  122. time.sleep(2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement