Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #coding: utf-8
- import requests,time
- from stem import Signal
- from stem.control import Controller
- from twocaptcha import TwoCaptcha
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- import requests,io,sys,time,os
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.chrome.options import Options
- from selenium import webdriver
- import time,os
- import json
- import random
- from bs4 import BeautifulSoup
# SOCKS5 proxy port on localhost: used by the curl IP lookup, the 2captcha
# requests and the Chrome --proxy-server flag.
PORT_1 = 9050
# NOTE(review): not referenced anywhere in the visible code - presumably the
# Tor control port; confirm before removing.
PORT_2 = 9051
def iper():
    """Print the public IP address seen through the local SOCKS5 proxy.

    Shells out to ``curl`` (proxied via 127.0.0.1:PORT_1) against
    ifconfig.co and pipes the page through ``grep`` to pull out the address.
    The result goes straight to stdout; nothing is returned.
    """
    lookup_cmd = (
        "curl -sx socks5://127.0.0.1:%s ifconfig.co"
        " | grep -oP '(?<=Your IP</span>: ).*(?=</span>)'"
    ) % PORT_1
    os.system(lookup_cmd)
def newId():
    """Send SIGHUP to the running ``tor`` processes, then pause.

    Presumably used to obtain a fresh exit IP; the ten second pause gives
    the daemon time to settle before the caller carries on.
    """
    reload_cmd = "killall -HUP tor"
    settle_seconds = 10
    os.system(reload_cmd)
    time.sleep(settle_seconds)
def grb(cdo):
    """Extract the GeeTest ``gt`` and ``challenge`` values from HTML text.

    The markup is first dumped to ``ax.html`` (overwriting any previous
    dump). The last ``<script>`` element is then scanned: among its final
    lines, the text between the first and second ``:`` of a line mentioning
    ``gt`` / ``challenge`` is taken as the value, with quotes, commas,
    spaces and newlines removed.

    Args:
        cdo: HTML source as text.

    Returns:
        ``(gt, challenge)`` as strings, or ``(False, False)`` when the page
        has no ``<script>`` element or either value could not be found.
    """
    with io.open("ax.html", "w+", encoding="utf-8") as dump:
        dump.write(cdo)

    scripts = BeautifulSoup(cdo, "html.parser").find_all("script")
    if not scripts:
        # Nothing to parse: use the regular failure signal instead of
        # letting scripts[-1] raise IndexError.
        return (False, False)

    # Look at lines -20 .. -2 of the last script; the final line is skipped.
    tail = str(scripts[-1]).split("\n")[-20:-1]
    noise = str.maketrans("", "", "'\n, ")

    gt = ""
    chal = ""
    for line in tail:
        _, colon, rest = line.partition(":")
        if not colon:
            # A line without "key: value" shape cannot carry either field.
            continue
        value = rest.split(":", 1)[0].translate(noise)
        if "gt" in line:
            gt = value
        elif "challenge" in line:
            chal = value

    if gt == "" or chal == "":
        return (False, False)
    return (gt, chal)
def cpres(gt, chl, ur, api):
    """Submit a GeeTest task to the 2captcha HTTP API and wait for the answer.

    All traffic goes through the local SOCKS5 proxy on ``PORT_1``.

    Args:
        gt: GeeTest ``gt`` value taken from the page.
        chl: GeeTest ``challenge`` value taken from the page.
        ur: URL of the page the captcha is shown on.
        api: 2captcha API key.

    Returns:
        The response text with the ``OK|`` prefix removed, or ``False`` when
        polling fails (network error, or any reply other than "not ready").

    Raises:
        RuntimeError: if 2captcha does not accept the submission.
        requests.RequestException: if the submission request itself fails.
    """
    proxy = "socks5://localhost:%s" % PORT_1
    prx = {"http": proxy, "https": proxy}

    # Passing ``params`` lets requests URL-encode every value; ``ur`` itself
    # contains '?' and '&' and would otherwise corrupt the query string.
    submitted = requests.get(
        "http://2captcha.com/in.php",
        params={
            "key": api,
            "method": "geetest",
            "gt": gt,
            "challenge": chl,
            "pageurl": ur,
            "api_server": "api-na.geetest.com",
        },
        timeout=20,
        proxies=prx,
    )
    status, _, task_id = submitted.text.strip().partition("|")
    if status != "OK" or not task_id:
        # Raise rather than return False: a rejected submission (bad key,
        # no balance, ...) will not succeed on an immediate retry.
        raise RuntimeError(
            "2captcha rejected the task: %s" % submitted.text.strip()
        )

    while True:
        try:
            polled = requests.get(
                "http://2captcha.com/res.php",
                params={"key": api, "action": "get", "id": task_id},
                timeout=20,
                proxies=prx,
            )
        except requests.RequestException:
            return False
        reply = polled.text
        # Match the exact prefix: a bare substring test for "OK" also hits
        # error codes such as ERROR_TOKEN_EXPIRED.
        if reply.startswith("OK|"):
            return reply[len("OK|"):]
        if reply.strip() != "CAPCHA_NOT_READY":
            # Definitive error from the service; polling would never finish.
            return False
        time.sleep(5)
def _new_driver():
    """Start a Chrome session routed through the local SOCKS5 proxy."""
    chrome_options = Options()
    chrome_options.add_argument('--lang=fr')
    chrome_options.add_argument('--proxy-server=socks5://localhost:%s' % PORT_1)
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('--no-sandbox')
    return webdriver.Chrome(options=chrome_options)


def _solve_captcha(browser, frame_src, api_key):
    """Fetch the challenge frame, obtain an answer and hand it to the page."""
    hd = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36","accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9","accept-encoding": "gzip, deflate, br","accept-language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
    rt = requests.get(frame_src, headers=hd)
    gt, chl = grb(rt.text)
    # cpres() returns False on a failed attempt; keep asking until it answers.
    while True:
        rsp = cpres(gt, chl, browser.current_url, api_key)
        if rsp is not False:
            break
    browser.execute_script('geetestResponse = %s' % rsp)
    time.sleep(1)
    browser.execute_script("captchaCallback();")


driver = _new_driver()
cde = ["67000", "64000", "68000", "78000"]
# NOTE(review): this credential is committed in source; prefer supplying it
# through the TWOCAPTCHA_API_KEY environment variable and rotating the key.
ap = os.environ.get("TWOCAPTCHA_API_KEY", "d2780fa5046f5b658915e0fae91085e1")
cat = input("Category Number: ")
# The per-page dumps below are written into output/, so make sure it exists.
os.makedirs("output", exist_ok=True)
iper()
for zp in cde:
    i = 1      # page currently being fetched
    px = None  # total page count, learned from the first scraped page
    while True:
        # ">" rather than "== px + 1" so a listing reporting zero pages
        # still terminates after its first page.
        if px is not None and i > px:
            print("\n[+] Scraping Completed For %s."%zp)
            break
        driver.get("https://www.leboncoin.fr/recherche?category=%s&locations=%s&owner_type=pro&page=%s"%(cat,zp,i))
        try:
            # find_elements returns [] when there is no iframe, so the plain
            # "no challenge" case needs no exception handling at all.
            frames = driver.find_elements(By.TAG_NAME, "iframe")
            if frames:
                ux = frames[0].get_attribute("src")
                driver.switch_to.frame(frames[0])
                frame_html = driver.page_source
                if u"Vous avez été bloqué" in frame_html:
                    driver.quit()
                    # NOTE(review): message says 60s but newId() waits 10s.
                    print("[-] IP Ban .. Changing IP in 60s")
                    newId()
                    iper()
                    driver = _new_driver()
                    continue  # retry the same page with the new session
                if u"On s'assure qu'on s'adresse bien à vous, et non pas à un robot." in frame_html:
                    print("[+] Captcha Detected ..")
                    _solve_captcha(driver, ux, ap)
                    print("[+] Captcha Bypassed")
                    time.sleep(10)
        except Exception as e:
            # Best effort, as before: a failure while dealing with the frame
            # must not stop the scrape attempt below - but report it.
            print("[-] Frame handling failed: %s" % e)
        driver.switch_to.default_content()
        data = driver.find_element(By.XPATH, "//script[contains(text(),'logo') and contains(@type, 'json')]").get_attribute('text')
        if px is None:
            px = json.loads(data)["props"]["pageProps"]["listingData"]["max_pages"]
        sys.stdout.write("[+] Scraped %s|%s \r"%(i,px))
        sys.stdout.flush()
        with io.open("output/%s-Page%s.json"%(zp,i),"w+",encoding="utf-8") as op:
            op.write(data)
        i = i + 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement