Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- from bs4 import BeautifulSoup
- from selenium import webdriver
- import time
- import urllib.request
def scrape_captcha(idx):
    """Scrape one ReCAPTCHA image challenge and save its image to disk.

    Opens the public ReCAPTCHA demo page in Firefox, clicks the checkbox
    iframe, switches into the second iframe on the page, and parses its HTML
    for the challenge instruction text and the tile image URL(s).

    Args:
        idx: Value used (via ``str(idx)``) to build the output file name
            ``Data/img/captcha<idx>.png``.

    Returns:
        ``(True, instruction)`` on success, where ``instruction`` is the text
        of the ``rc-imageselect-desc-no-canonical`` element with the trailing
        ``<span>`` text removed; ``(False, exc)`` if any step raised, where
        ``exc`` is the caught exception (it is also printed).
    """
    try:
        # Function-local import so this block does not depend on a new
        # module-level import; selenium is already a dependency of this file.
        from selenium.webdriver.common.by import By

        # Load the test website and navigate to the ReCAPTCHA iframe.
        driver = webdriver.Firefox()
        try:
            driver.get('https://patrickhlauke.github.io/recaptcha/')
            driver.find_element(By.TAG_NAME, 'iframe').click()
            # Give the challenge iframe time to render after the click.
            time.sleep(2)
            driver.switch_to.frame(driver.find_elements(By.TAG_NAME, 'iframe')[1])
            page_source = driver.page_source
        finally:
            # Always end the browser session, even when navigation fails;
            # otherwise every failed attempt leaks a Firefox process.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')

        # Pull the instructions from the "Golden div".
        first_div = soup.find_all("div")[0]
        desc = first_div.find_all(
            'div', {'class': 'rc-imageselect-desc-no-canonical'}
        )[0]
        instruction = desc.text
        trailing = desc.span.text if desc.span is not None else ""
        # Guard the empty case: s[:-0] is s[:0], which would discard the
        # entire instruction instead of leaving it untouched.
        if trailing:
            instruction = instruction[:-len(trailing)]

        # Extract all distinct image URLs involved with the challenge,
        # preserving first-seen order.
        tiles = soup.find_all('div', {'class': 'rc-image-tile-wrapper'})
        image_urls = list(dict.fromkeys(tile.img.get("src") for tile in tiles))

        # NOTE(review): every URL is written to the same path, so if the
        # challenge has more than one distinct image only the last download
        # survives. Kept as-is because callers may rely on this file name.
        target = "Data/img/captcha" + str(idx) + ".png"
        for url in image_urls:
            urllib.request.urlretrieve(url, target)

        return True, instruction
    except Exception as e:
        # Best-effort scraper: report the failure to the caller, never raise.
        print(e)
        return False, e
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement