Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json, re
- import urllib.parse
- from urllib.request import urlopen, Request
- from selenium import webdriver
- from selenium.webdriver.common.keys import Keys
- from bs4 import BeautifulSoup
- from selenium.webdriver.common.action_chains import ActionChains
- from time import sleep
import os

# A post's link is collected only when its like count exceeds this value.
LIKES_THRESHOLD = 400
# Credentials come from the environment so that no secret is stored in the
# script itself; each falls back to "" when its variable is not set.
INSTAGRAM_EMAIL = os.environ.get("INSTAGRAM_EMAIL", "")
INSTAGRAM_PASSWORD = os.environ.get("INSTAGRAM_PASSWORD", "")
HASHTAG = "puravidabracelets"  # Don't include the actual '#'.
def write(words):
    """Write *words* to ``test.html`` in the working directory.

    Any existing file of that name is overwritten.  ``written`` is printed
    once the file has been closed.

    Args:
        words: Text to store in the file.
    """
    # The context manager closes the handle even when the write itself fails.
    with open("test.html", "w") as handle:
        handle.write(words)
    print("written")
def fetch(url):
    """Download *url* and return the response body as text.

    The request carries a desktop Chrome ``User-Agent`` and an HTML
    ``Accept`` header and uses a 20 second timeout.  The body is decoded as
    UTF-8, falling back to Latin-1 when it is not valid UTF-8.  If the first
    attempt fails, one more attempt is made after a half-second pause.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or ``None`` (after printing
        ``Couldn't Load Page``) when both attempts fail.
    """
    headers = (
        ("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    )
    for attempt in range(2):
        if attempt:
            # Brief pause before the single retry.
            sleep(.5)
        try:
            request = Request(url)
            for name, value in headers:
                request.add_header(name, value)
            # Closing the response releases the underlying connection.
            with urlopen(request, timeout=20) as response:
                body = response.read()
        except Exception:
            # Deliberately best-effort: any request failure counts as a failed
            # attempt, but KeyboardInterrupt/SystemExit still propagate.
            continue
        try:
            return body.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps every byte, so this decode cannot fail.
            return body.decode("latin1")
    print("Couldn't Load Page")
    return None
def strip_html(data):
    """Return *data* with every ``<...>`` tag removed.

    Matching is non-greedy, so each tag is removed individually and the text
    between tags is kept.  ``re.DOTALL`` lets a tag that is broken across
    several lines be removed as well.

    Args:
        data: Markup to clean.

    Returns:
        The text with tags deleted.
    """
    return re.sub(r'<.*?>', '', data, flags=re.DOTALL)
def scroll_page(driver, delay):
    """Scroll the page to the top and then to the bottom, pausing after each.

    Args:
        driver: Browser driver; only its ``execute_script`` method is used.
        delay: Seconds to sleep after the scroll to the bottom.  Half of this
            value is slept after the scroll to the top.
    """
    driver.execute_script("window.scrollTo(0, 0);")
    sleep(delay / 2)
    # NOTE(review): the up-then-down movement presumably nudges the page into
    # loading more posts -- confirm against the site's behaviour.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(delay)
def _parse_like_count(label):
    """Turn a like label such as ``12.3k`` into an int, or ``None`` if not numeric."""
    try:
        value = float(label.replace("k", "").replace("m", ""))
    except ValueError:
        return None
    if "k" in label:
        return int(value * 1000)
    if "m" in label:
        return int(value * 1000000)
    return int(value)


def count_likes(driver, photo):
    """Return the post link of *photo* when it has more than LIKES_THRESHOLD likes.

    The pointer is moved over the thumbnail, then the element's inner HTML is
    stripped of tags and the text before the word ``like`` is parsed as the
    like count (``k`` and ``m`` suffixes mean thousands and millions).

    Args:
        driver: Selenium driver used to perform the hover.
        photo: Thumbnail anchor element.

    Returns:
        The element's ``href`` when the like count exceeds
        ``LIKES_THRESHOLD``; ``None`` for videos, unparsable counts (the
        offending text is printed) and posts at or below the threshold.
    """
    # NOTE(review): the hover presumably makes the like-count overlay appear
    # in the element's markup -- confirm against the page.
    ActionChains(driver).move_to_element(photo).perform()
    markup = photo.get_attribute('innerHTML')
    link = photo.get_attribute("href")
    text = strip_html(markup)
    if "video" in text.lower():
        return None
    # Keep what follows the last '>' and ';', cut at "like", drop thousands commas.
    label = text.split(">")[-1].split(";")[-1].split("like")[0].replace(",", "")
    likes = _parse_like_count(label)
    if likes is None:
        print(label)
        return None
    if likes > LIKES_THRESHOLD:
        print("Likes Num", likes)
        return link
    return None
def remove_images(driver, photos):
    """Delete already-processed thumbnails from the live page.

    For each element in *photos*, the first anchor whose ``href`` attribute
    equals the part of the element's link after the first ``.com`` is removed
    from the DOM.  Afterwards, elements with class ``_myci9`` are removed from
    the front until at most 20 remain.

    Args:
        driver: Browser driver; only its ``execute_script`` method is used.
        photos: Elements whose ``href`` identifies the anchors to remove.
    """
    for photo in photos:
        link = photo.get_attribute("href")
        # Split on the first ".com" only, so a later ".com" inside the path or
        # query string does not truncate the selector.
        _, separator, path = (link or "").partition(".com")
        if not separator:
            continue
        # The selector is passed as a script argument rather than spliced into
        # the JavaScript source, and a missing node is skipped, not an error.
        driver.execute_script(
            "var el = document.querySelector(arguments[0]);"
            "if (el) { el.remove(); }",
            "a[href='" + path + "']",
        )
    driver.execute_script('''var a = document.getElementsByClassName('_myci9');
        while(a.length > 20){
            a[0].remove();
        }''')
# Stop before launching a browser if either credential is blank, instead of
# typing empty values into the login form and failing later on.
if not (INSTAGRAM_EMAIL and INSTAGRAM_PASSWORD):
    raise SystemExit("INSTAGRAM_EMAIL and INSTAGRAM_PASSWORD must both be set")

# Start Chrome with the managed image content setting at 2.
# NOTE(review): 2 is understood to mean "block images" -- confirm.
chromeOptions = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chromeOptions)

# Open the home page and follow the "Log in" link to reach the login form.
driver.get("https://instagram.com/")
elem = driver.find_element_by_xpath("//a[text()='Log in']")
elem.click()

# Fill in the credentials; RETURN in the password field submits the form.
elem = driver.find_element_by_name("username")
elem.clear()
elem.send_keys(INSTAGRAM_EMAIL)
elem = driver.find_element_by_name("password")
elem.clear()
elem.send_keys(INSTAGRAM_PASSWORD)
elem.send_keys(Keys.RETURN)
sleep(6)  # fixed wait before navigating on

# Open the hashtag page and click "Load more" once before the scroll loop.
driver.get("https://www.instagram.com/explore/tags/" + HASHTAG + "/")
sleep(1)
elem = driver.find_element_by_xpath("//a[text()='Load more']")
elem.click()
# Harvest loop: repeatedly scroll, inspect the thumbnails currently on the
# page, remember the links count_likes() returns, then delete the thumbnails.
# NOTE(review): the source had lost its indentation; the nesting below is
# reconstructed from the logic -- confirm against the author's copy.
photos = []
timeouts = 0  # passes in a row that found no thumbnails
links = []    # post links returned by count_likes()
count = 0     # thumbnails inspected so far
reset = True  # NOTE(review): never read in the visible code
while timeouts < 20:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    photos = driver.find_elements_by_xpath("//a[contains(@href, '?tagged=')]")
    if photos:
        # Any thumbnail at all restarts the give-up counter.
        timeouts = 0
    for photo in photos:
        link = count_likes(driver, photo)
        if link:
            links.append(link)
    count += len(photos)
    remove_images(driver, photos)  # remove thumbnails so things don't get too packed in the web page.
    print(str(count)+" total posts. "+str(len(links))+" with more than "+str(LIKES_THRESHOLD)+" likes.")
    scroll_page(driver, .6)
    if timeouts == 18:
        # Last try before giving up: jump to mid-page, then scroll with a
        # much longer pause.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        scroll_page(driver, 8)
    timeouts += 1
# Save the collected links as JSON via write(), i.e. into test.html.
write(json.dumps(links))
def _shared_data(page):
    """Return the JSON object assigned to ``window._sharedData`` in *page*, or ``None``."""
    try:
        payload = page.split("window._sharedData = ")[1].split(";</script>")[0]
        return json.loads(payload)
    except (IndexError, ValueError):
        # Marker missing or payload is not valid JSON.
        return None


# For every collected post: find its owner, then average the likes and
# comments over (at most) the first 10 posts listed on the owner's profile.
# Each owner gets one row in `results`.
results = []
seen_profiles = set()  # profile URLs that already have a row
count = 0
for post_url in links:
    print(count, len(links))
    count += 1
    page = fetch(post_url)
    if not page:
        continue
    data = _shared_data(page)
    try:
        media = data["entry_data"]["PostPage"][0]["media"]
        username = media["owner"]["username"]
        profile_pic_url = media["owner"]["profile_pic_url"]
        image_src = media["display_src"]
    except (KeyError, IndexError, TypeError):
        # Page did not have the expected layout; skip this post instead of
        # aborting the whole run.
        print("Unexpected post page:", post_url)
        continue
    link = "https://www.instagram.com/"+username+"/"
    if link in seen_profiles:
        continue
    seen_profiles.add(link)

    page = fetch(link)
    data = _shared_data(page) if page else None
    try:
        posts = data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"][:10]
        comments = sum(post["comments"]["count"] for post in posts)
        likes = sum(post["likes"]["count"] for post in posts)
    except (KeyError, IndexError, TypeError):
        # Profile could not be loaded or parsed: keep a row with blank fields.
        results.append({
            "link": link,
            "profile_pic_url": "",
            "image_src": "",
            "ave_comments": "",
            "ave_likes": "",
        })
        continue
    # A profile that lists no posts gets averages of 0 rather than a
    # division by zero.
    results.append({
        "link": link,
        "profile_pic_url": profile_pic_url,
        "image_src": image_src,
        "ave_comments": comments/len(posts) if posts else 0,
        "ave_likes": likes/len(posts) if posts else 0,
    })
import csv

# Highest average likes first.  Rows whose profile could not be loaded hold
# "" instead of a number; comparing "" with a number raises TypeError, so
# those rows are ranked below every numeric row.
results = sorted(
    results,
    key=lambda row: row["ave_likes"] if row["ave_likes"] != "" else float("-inf"),
    reverse=True,
)
# newline="" leaves line endings to the csv writer; the context manager
# closes the file even if a row fails to serialise.
with open(HASHTAG+".csv", "w", newline="") as handle:
    handle.write("Average Likes (last 10 posts), Average # Comments (last 10 posts), Profile Link, Image Link, Profile Pic\n")
    # csv.writer quotes any field containing a comma or quote, so such URLs
    # cannot shift columns.
    writer = csv.writer(handle, lineterminator="\n")
    for row in results:
        writer.writerow([
            row["ave_likes"],
            row["ave_comments"],
            row["link"],
            row["image_src"],
            row["profile_pic_url"],
        ])
- # elem.clear()
- # elem.send_keys("madsongha+1@gmail.com")
- # elem = driver.find_element_by_name("password")
- # elem.clear()
- # elem.send_keys("fullcontact")
- # elem.send_keys(Keys.RETURN)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement