Advertisement
Guest User

Untitled

a guest
Mar 8th, 2017
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.73 KB | None | 0 0
import csv
import json
import os
import re
import urllib.parse
from time import sleep
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
  9.  
  10. LIKES_THRESHOLD = 400
  11. INSTAGRAM_EMAIL = "chase2332@gmail.com"
  12. INSTAGRAM_PASSWORD = "MIqUfII7ADzF84Eb"
  13. HASHTAG = "puravidabracelets" # Don't include the actual '#'.
  14.  
  15. def write(words):
  16. handle = open("test.html", "w")
  17. handle.write(words)
  18. handle.close()
  19. print("written")
  20.  
  21. def fetch(url):
  22. try:
  23. q = Request(url)
  24. q.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36")
  25. q.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
  26. response = urlopen(q, timeout=20)
  27. html = response.read()
  28. try:
  29. html = html.decode("utf-8")
  30. except:
  31. html = html.decode("latin1")
  32. return html
  33. except:
  34. sleep(.5)
  35. try:
  36. q = Request(url)
  37. q.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36")
  38. q.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
  39. response = urlopen(q, timeout=20)
  40. html = response.read()
  41. try:
  42. html = html.decode("utf-8")
  43. except:
  44. html = html.decode("latin1")
  45. return html
  46. except:
  47. print("Couldn't Load Page")
  48.  
  49. def strip_html(data):
  50. p = re.compile(r'<.*?>')
  51. return p.sub('', data)
  52.  
  53. def scroll_page(driver, delay):
  54. driver.execute_script("window.scrollTo(0, 0);")
  55. sleep(delay/2)
  56. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  57. sleep(delay)
  58.  
  59. def count_likes(driver, photo):
  60. hover = ActionChains(driver).move_to_element(photo)
  61. hover.perform()
  62. likes = photo.get_attribute('innerHTML')
  63. link = photo.get_attribute("href")
  64. likes = strip_html(likes)
  65. if "video" in likes.lower():
  66. return None
  67. likes = likes.split(">")[-1].split(";")[-1].split("like")[0].replace(",", "")
  68. try:
  69. likes_f = float(likes.replace("k", "").replace("m", ""))
  70. except:
  71. print(likes)
  72. return None
  73. if "k" in likes:
  74. likes = int(likes_f*1000)
  75. elif "m" in likes:
  76. likes = int(likes_f*1000000)
  77. else:
  78. likes = int(likes_f)
  79. if likes > LIKES_THRESHOLD:
  80. print("Likes Num", likes)
  81. return link
  82. return None
  83.  
  84. def remove_images(driver, photos):
  85. for photo in photos:
  86. link = photo.get_attribute("href")
  87. driver.execute_script("document.querySelectorAll(\"a[href='"+link.split(".com")[1]+"']\")[0].remove();")
  88. driver.execute_script('''var a = document.getElementsByClassName('_myci9');
  89. while(a.length > 20){
  90. a[0].remove();
  91. }''')
  92.  
  93. chromeOptions = webdriver.ChromeOptions()
  94. prefs = {"profile.managed_default_content_settings.images":2}
  95. chromeOptions.add_experimental_option("prefs",prefs)
  96. driver = webdriver.Chrome(chrome_options=chromeOptions)
  97.  
  98. # driver = webdriver.Chrome()
  99. driver.get("https://instagram.com/")
  100. elem = driver.find_element_by_xpath("//a[text()='Log in']")
  101. elem.click()
  102.  
  103. elem = driver.find_element_by_name("username")
  104. elem.clear()
  105. elem.send_keys(INSTAGRAM_EMAIL)
  106. elem = driver.find_element_by_name("password")
  107. elem.clear()
  108. elem.send_keys(INSTAGRAM_PASSWORD)
  109. elem.send_keys(Keys.RETURN)
  110. sleep(6)
  111. driver.get("https://www.instagram.com/explore/tags/"+HASHTAG+"/")
  112. sleep(1)
  113. elem = driver.find_element_by_xpath("//a[text()='Load more']")
  114. elem.click()
  115.  
  116. photos = []
  117.  
  118. timeouts = 0
  119. links = []
  120. count = 0
  121. reset = True
  122. while timeouts < 20:
  123. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  124. photos = driver.find_elements_by_xpath("//a[contains(@href, '?tagged=')]")
  125. for photo in photos:
  126. timeouts = 0
  127. link = count_likes(driver, photo)
  128. if link:
  129. links.append(link)
  130. count += len(photos)
  131. remove_images(driver, photos)#remove thumbnails so things don't get too packed in the web page.
  132. print(str(count)+" total posts. "+str(len(links))+" with more than "+str(LIKES_THRESHOLD)+" likes.")
  133. scroll_page(driver, .6)
  134. if timeouts == 18:
  135. driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
  136. scroll_page(driver, 8)
  137. timeouts += 1
  138.  
  139.  
  140. write(json.dumps(links))
  141. results = []
  142. count = 0
  143. for link in links:
  144. print(count, len(links))
  145. count += 1
  146. page = fetch(link)
  147. if not page:
  148. continue
  149. json_string = page.split("window._sharedData = ")[1].split(";</script>")[0]
  150. data = json.loads(json_string)
  151. username = data["entry_data"]["PostPage"][0]["media"]["owner"]["username"]
  152. link = "https://www.instagram.com/"+username+"/"
  153. if link in [x["link"] for x in results]:
  154. continue
  155. profile_pic_url = data["entry_data"]["PostPage"][0]["media"]["owner"]["profile_pic_url"]
  156. image_src = data["entry_data"]["PostPage"][0]["media"]["display_src"]
  157. page = fetch(link)
  158. if not page:
  159. results.append({
  160. "link":link,
  161. "profile_pic_url":"",
  162. "image_src":"",
  163. "ave_comments":"",
  164. "ave_likes":""
  165. })
  166. continue
  167. json_string = page.split("window._sharedData = ")[1].split(";</script>")[0]
  168. data = json.loads(json_string)
  169. posts = data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"]
  170. posts = posts[:10]
  171. comments = 0
  172. likes = 0
  173. for post in posts:
  174. comments += post["comments"]["count"]
  175. likes += post["likes"]["count"]
  176. ave_comments = comments/len(posts)
  177. ave_likes = likes/len(posts)
  178. results.append({
  179. "link":link,
  180. "profile_pic_url":profile_pic_url,
  181. "image_src":image_src,
  182. "ave_comments":ave_comments,
  183. "ave_likes":ave_likes
  184. })
  185.  
  186. results = sorted(results, key=lambda k: k['ave_likes'], reverse=True)
  187. result_string = "Average Likes (last 10 posts), Average # Comments (last 10 posts), Profile Link, Image Link, Profile Pic\n"
  188. for row in results:
  189. result_string += str(row["ave_likes"])+","+str(row["ave_comments"])+","+row["link"]+","+row["image_src"]+","+row["profile_pic_url"]+"\n"
  190.  
  191. handle = open(HASHTAG+".csv", "w")
  192. handle.write(result_string)
  193. handle.close()
  194.  
  195.  
  196. # elem.clear()
  197. # elem.send_keys("madsongha+1@gmail.com")
  198. # elem = driver.find_element_by_name("password")
  199. # elem.clear()
  200. # elem.send_keys("fullcontact")
  201. # elem.send_keys(Keys.RETURN)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement