Advertisement
mattrix

Untitled

Apr 20th, 2017
813
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.04 KB | None | 0 0
  1. import requests, json, re, time, random, string
  2.  
  3. try:
  4.     from urllib.parse import urlparse #python3
  5. except:
  6.     from urlparse import urlparse #python2
  7.  
# Google search-results template. {0} is the zero-based result offset
# (advanced by 10 per page below); nfpr=1 disables "did you mean" rewrites.
# The query finds .co.nz sites whose assets are served from Shopify's CDN.
search_url = "https://www.google.co.nz/search?q=cdn.shopify:*.co.nz&start={0}&nfpr=1"
#search_url = "https://www.google.co.nz/search?q=cdn.shopify:*.nz&start={0}&nfpr=1"

# Form fields POSTed to a storefront's /contact endpoint (currently only used
# by the commented-out requests.post below).
# NOTE(review): "Spotify" in the message text looks like a typo for "Shopify" — confirm before enabling sends.
post_data = {
    'form_type': 'contact',
    'contact[name]': "Mr Smith",
    'contact[email]': "my_email@example.com",
    'contact[comments]': "Hello Spotify customer....",
}

# Desktop Chrome User-Agent, sent with every request made below.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
# Captures the href of each result's <h3><a ...> heading link in the raw HTML.
pattern = re.compile('<h3 .*?><a href="(.*?)".*?</h3>')
  20.  
  21. try:
  22.     with open("domains.txt", 'r') as file:
  23.         persist = json.loads(file.read())
  24. except:
  25.     persist = {'start':0, 'domains':[]}
  26.  
# Page through Google results until none remain (or Google blocks us).
while True:
    # Fetch one results page; persist['start'] is the result offset (10 per page).
    page = requests.get(search_url.format(persist['start']), headers={'User-Agent':user_agent}).text

    # Extract every organic-result link from the raw HTML.
    results = re.findall(pattern, page)
    if not results:
        # Distinguish a rate-limit block from a genuine end of results.
        if 'unusual traffic' in page:
            print("Blocked by google")
        else:
            print("No more results")
        break

    for result in results:
        url = urlparse(result)
        # Skip domains already recorded, and Shopify's own hosts.
        if url.netloc in persist['domains'] or 'shopify' in url.netloc:
            continue

        # Build the store's contact-page URL (presumably the standard
        # Shopify theme route — confirm against an actual storefront).
        contact_url = "{0}://{1}/contact#contact_form".format(url.scheme, url.netloc)
        print(contact_url)

        ## Un-comment below line to actually send the contact request
        # requests.post(contact_url, data=post_data, headers={'User-Agent':user_agent})

        persist['domains'].append(url.netloc)

    # Advance to the next results page and checkpoint state to disk so an
    # interrupted run resumes where it left off.
    persist['start'] += 10
    with open("domains.txt", 'w') as file:
        file.write(json.dumps(persist))

    # help reduce google blocking
    time.sleep(2)
    # Interleave a throwaway random-string search between result pages.
    random_search = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
    page = requests.get('https://www.google.co.nz/search?q={0}'.format(random_search), headers={'User-Agent':user_agent})
    time.sleep(2)
  60.  
  61. input("Done")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement