# Untitled

import httplib
import re
import string
import sys
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import base64
import urllib2
import requests
#import argparse
from urlparse import urlsplit
from collections import deque
from bs4 import BeautifulSoup

# Browser-like headers sent with every urllib2 request made by this script.
# Built once, ahead of the input loop, because the value never changes.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

# Every non-blank line of google_urls.txt is one Google results URL to process.
with open("google_urls.txt", "r") as ins:
    for google_url in ins:
        # Lines read from a file keep their trailing newline. Strip it so it
        # never becomes part of the requested URL, and skip blank lines.
        google_url = google_url.strip()
        if not google_url:
            continue

        # Result pages to visit: the seed URL first; scraped pages are appended.
        google_links = [google_url]
        # Accumulator that getLinks() appends external site URLs to.
        links = []

        def getGoogleLinks(xurl):
            req = urllib2.Request(xurl, headers=hdr)
            html_page = urllib2.urlopen(req)
            soup = BeautifulSoup(html_page, "html.parser")

            for link in soup.findAll('a', attrs={'href': re.compile("^/search")}):
                google_links.append(link.get('href'))

            return google_links

        google_links = getGoogleLinks(google_url)
        google_links[:] = [yurl for yurl in google_links if 'rllag' in yurl]
        nn = len(google_links)
        for b in range(nn):
            if b != 0:
                google_links[b] = 'https://www.google.com'+google_links[b]
        print(111,google_links)
        print("Lead Gen Script - V1.0")
        print("----------------------\n")

        from_email = "racinecountyeyenews@gmail.com"
        contact_limit=100 #CHANGE THIS TO THE DESIRED CONTACT LIMIT
        #google_url = "https://www.google.com/search?rlz=1C1CHWL_en&q=construction%20companies%20in%20elkhorn%20wisconsin&npsic=0&rflfq=1&rlha=0&rllag=42679380,-88548678,3032&tbm=lcl&ved=2ahUKEwjTr6e99P_fAhVJmK0KHQF4DZQQjGp6BAgAEEg&tbs=lrf:!2m1!1e3!2m4!1e2!5m2!2m1!2e4!3sIAE,lf:1,lf_ui:2&rldoc=1&fll=0,0&fspn=0,NaN&fz=0&sll=0,0&sspn=0,NaN&sz=0&rlfi=hd:;si:&qop=0&rlvp=clear#qop=0&rlfi=hd:;si:&rlvp=clear"
        #google_url = str(raw_input("Google Maps URL: "))
        print("")
        print("----------------------")
        j=0
        for google_link in google_links:
            def getLinks(url):
                req = urllib2.Request(url, headers=hdr)
                html_page = urllib2.urlopen(req)
                soup = BeautifulSoup(html_page, "html.parser")

                for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
                    if 'google' not in link.get('href') and 'blogger' not in link.get('href') and 'youtube' not in link.get('href'):
                        links.append(link.get('href'))

                return links

            mail_list = []
            url_list = []
            temp = getLinks(google_link)

            for starting_url in temp:
                try:
                    unprocessed_urls = deque([starting_url])
                    processed_urls = set()
                    i=0

                    while len(unprocessed_urls):
                        i=i+1

                        url = unprocessed_urls.popleft()
                        processed_urls.add(url)

                        parts = urlsplit(url)
                        base_url = "{0.scheme}://{0.netloc}".format(parts)
                        path = url[:url.rfind('/')+1] if '/' in parts.path else url

                        if i>20:
                            break
                        #if j>=contact_limit:
                        #    print(str(j)+" URLs - limit reached. Exiting...")
                        #    sys.exit(0)

                        print("Crawling URL: %s" % url)
                        try:
                            response = requests.get(url)
                        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
                            continue
                        print(str(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I)))
                        if len(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I)) !=0:
                            #if j>=contact_limit:
                            #    print(str(j)+" URLs - limit reached. Exiting...")
                            #    sys.exit(0)
                            print(8)
                            #mail_list.append(str(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I)))
                            #print(str(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I)))
                            s=str(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
                            result0 = re.search("'(.*)'", s)
                            q=str(result0.group(1))
                            q=str(q.split("'")[0])
                            mail_list.append(q)
                            url_list.append(url)
                            #mail_list.append(new_email)
                            break
                        soup = BeautifulSoup(response.text, 'lxml')

                        for anchor in soup.find_all("a"):
                            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
                            if link.startswith('/'):
                                link = base_url + link
                            elif not link.startswith('http'):
                                link = path + link
                            if not link in unprocessed_urls and not link in processed_urls:
                                unprocessed_urls.append(link)
                except SystemExit:
                        print("Successfully exited.")
                except:
                    pass


            def SendMail(to_email):
                #Authenticate to Gmail's SMTP Protocol
                #server = smtplib.SMTP('smtp.gmail.com', 587)
                #server.starttls()

                #Encrypt (Encode) Password to avoid plain-text exposure
                #server.login(("racinecountyeyenews@gmail.com"), ("RacineCountyEye123!"))

                # Create a text/plain message
                msg = MIMEMultipart('alternative')
                msg['Subject'] = 'Alex from Racine County Eye News, just reaching out to say hello'
                msg['From'] = from_email
                msg['To'] = to_email

                # Attaching the html
                html = """\
<html><head></head><body><p><span style="font-weight: 400;">Hey there,</span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;">Great website and I hope you guys are staying warm over there.</span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;">Just wanted to notify your location that we are currently offering advertising to local businesses that operate in the Racine County area. We average 1.7 million views annually and report high engagement from our viewers.</span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;"><br /></span><span style="font-weight: 400;">Do you have a marketing budget set aside for this quarter?</span></p></strong><br /><br /><br /><br /><i>Stay Warm!</i><br /><br /></p><p>Alex Ebinal, Customer Relations &amp; Business Development</p><p>Phone Number: (262) 770-5175</p><p>CEO Denise Lockwood: (262) 504-9570<br /><br /></p><p>Email: racinecountyeyenews@gmail.com</p><p>Website: <a href="https://racinecountyeye.com/">https://racinecountyeye.com</a></p><p><a href="https://jobs.racinecountyeye.com/">Southeast Wisconsin Job Board</a> | <a href="https://jobs.racinecountyeye.com/products">Post a Job</a><br /><br /></p><img src="https://lh4.googleusercontent.com/cMe1UvyguVslPKkrgYBraniFqOc-gKu5MHjZG_7VMMPJuVk82drGPjIWtnl2ruJDTwiA1zESoJZZFZu_MINoso7VRZLhMieVxwBbIyH_HOhC5h_nmGbsNcgTt1XwcKNAjhkNp0k" alt="RCE" width="102" height="87"></p></span></div></body></html>"""

                # The main body is just another attachment
                #body = email.mime.Text.MIMEText("""Test body...""")
                part = MIMEText(html, 'html')
                msg.attach(part)

                ###### [COMMENT THE FOLLOW TO NOT INCLUDE ATTACHMENT] ###
                #filename='More_Information.pdf'
                #fp=open(filename,'rb')
                #att = MIMEApplication(fp.read(),_subtype="pdf")
                #fp.close()
                #att.add_header('Content-Disposition','attachment',filename=filename)
                #msg.attach(att)
                ######

                #Login
                s = smtplib.SMTP('smtp.gmail.com')
                s.starttls()
                s.login(from_email,'RacineCounty123!')
                s.sendmail(from_email,[to_email], msg.as_string())
                s.quit()

            mail_list = [str(x) for x in mail_list]
            url_list = [str(y) for y in url_list]
            print(mail_list)
            print("\nEmail Addresses fetched:")
            for address in mail_list:
                print(address)
            print("")
            #j=0
            for address in mail_list:
                if str(address) not in open("Business3.txt").read() and j<contact_limit:
                    print "Sending mail to "+str(address)+"...",
                    SendMail(str(address))
                    with open("Business3.txt", "a") as myfile:
                        myfile.write(address+":"+url_list[(mail_list.index(address))]+'\n')
                    print "Sent!\n"
                    j=j+1
                elif str(address) in open("Business3.txt").read():
                    print("Already sent to that email: "+str(address))
                elif j>=contact_limit:
                    print(str(j)+" URLs - limit reached. Exiting...")
                    sys.exit(0)
            links=[]
            mail_list=[]
            url_list=[]
print("----------------------")
print("Script finished successfully.")