# "Untitled" - pasted by kopyl, Jul 9th, 2022
from bson.json_util import dumps
from bs4 import BeautifulSoup as bs4
from contextlib import contextmanager
from datetime import datetime
import random
import requests
import signal

import pymongo
from tldextract import extract


p = print


# MongoDB connection and the collection the scraped agencies are stored in
client = pymongo.MongoClient(
    "mongodb://"
    "kopyl:oleg66@"
    "localhost"  # REMOTE or LOCAL
)
db_clutch = client["clutchwebsites"]["websites"]


# Pool of HTTPS proxies; one is picked at random for every request
proxies = [
    {"https": "209.127.191.180:9279"},
    {"https": "45.95.96.132:8691"},
    {"https": "45.95.96.187:8746"},
    {"https": "45.95.96.237:8796"},
    {"https": "45.136.228.154:6209"},
    {"https": "45.94.47.66:8110"},
    {"https": "45.94.47.108:8152"},
    {"https": "193.8.56.119:9183"},
    {"https": "45.95.99.226:7786"},
    {"https": "45.95.99.20:7580"}
]


class CustomTimeoutError(Exception):
    pass


def raise_timeout(signum, frame):
    raise CustomTimeoutError


@contextmanager
def timeout(time):
    # Raise CustomTimeoutError if the wrapped block runs longer than `time` seconds.
    # https://www.jujens.eu/posts/en/2018/Jun/02/python-timeout-function/
    signal.signal(signal.SIGALRM, raise_timeout)
    signal.alarm(time)

    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, signal.SIG_IGN)


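# Minimal usage sketch of the timeout guard (illustrative only; SIGALRM is
# Unix-only and must be used from the main thread):
#
#     with timeout(11):
#         requests.get("https://example.com")  # aborted after ~11 seconds

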
def get_agencies_from_page(page_number):
    # Fetch one listing page of US web designers from Clutch and return
    # the name, Clutch profile link and website link of every agency on it.
    proxy = random.choice(proxies)
    p("PROXY:", proxy)
    page = requests.get(
        "https://clutch.co/us/web-designers",
        params={"page": page_number},
        proxies=proxy
    )
    page = bs4(page.text, "lxml")
    agencies = page.select("li.provider")
    new_agencies = []
    for agency in agencies:
        clutch_links = agency.select('a[href^="/profile/"]')
        for link in clutch_links:
            name = link.text.strip()
            # Skip navigation links that are not the agency name
            if not name: continue
            if name == "More": continue
            if name == "View Profile": continue
            clutch_link = link["href"]
            if "#reviews" in link["href"]: continue
            if "#showmore" in link["href"]: continue
            if not agency.select("a.website-link__item"): continue
            website_link = agency.select("a.website-link__item")
            website_link = website_link[0]["href"]
            # Strip tracking parameters from the website URL
            if "?utm_source" in website_link:
                website_link = website_link.split("?utm_source")
                website_link = website_link[0]
            elif "?utm_campaign" in website_link:
                website_link = website_link.split("?utm_campaign")
                website_link = website_link[0]
            agency_main = {
                "name": name,
                "clutch_link": clutch_link,
                "website_link": website_link
            }
            new_agencies.append(agency_main)
    return new_agencies


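# Illustrative call (requires network access and working proxies):
#
#     agencies = get_agencies_from_page(0)
#     p(len(agencies), "agencies parsed from the first page")

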
def save_all_usa_clutch_pages(start_page=0):
    # Walk through all 463 listing pages and store the parsed agencies
    for x in range(463):
        if x < start_page:
            continue
        p(f"Getting page {x}")
        pages = get_agencies_from_page(x)
        assert len(pages) <= 50  # Clutch shows at most 50 agencies per page
        db_clutch.insert_many(pages)
        p("Page saved")


def parse_phone_and_location(url):
    # Open an agency's Clutch profile and read its phone number and location
    proxy = random.choice(proxies)
    p("proxy:", proxy)
    page = requests.get(url, proxies=proxy)
    page = bs4(page.text, "lxml")
    contacts = page.select_one("li.quick-menu-details")
    location = contacts.select_one("span").text
    phone = contacts.select_one("a").text.strip()
    phone_and_location = {
        "phone": phone,
        "location": location
    }
    return phone_and_location


def update_phone_and_location():
    # Pick one agency that has no location yet, scrape its contact details
    # from Clutch and save them back to the database.
    agency = db_clutch.find_one({"location": {"$exists": False}})
    if not agency:
        p("No agency left to update...")
        return
    name = agency["name"]
    p("Getting agency's details:", name)
    clutch_link = f"https://clutch.co{agency['clutch_link']}"
    phone_and_location = parse_phone_and_location(clutch_link)
    db_clutch.update_one(
        {"_id": agency["_id"]},
        {"$set": phone_and_location}
    )
    return agency


def incorrect_domain(email):
    # Return True if the part after "@" does not look like a valid domain
    domain = email.split("@")
    if len(domain) != 2:
        return True
    domain = domain[1]
    name_and_zone = domain.split(".")
    if len(name_and_zone) < 2:
        return True
    name = name_and_zone[-2]
    zone = name_and_zone[-1]
    for char in name:
        if not char.isalnum() and char not in [".", "-", "_", "+"]:
            return True
    if not zone.isalnum():
        return True
    return False


def is_email(email):
    # Heuristic check that a whitespace-separated token looks like an email
    if not all(
        char in email for char in ["@", "."]
    ):
        return False
    if string_has_repetitive_chars(["@", "."], email):
        return False
    if email.startswith("."):
        return False
    if email.endswith("."):
        return False
    if "/" in email:
        return False
    for char in email:
        if not char.isalnum() and char not in ["@", ".", "-", "_", "+"]:
            return False
    if incorrect_domain(email):
        return False
    return True


def string_has_repetitive_chars(chars: list, string: str) -> bool:
    # True if any character from `chars` appears twice in a row (examples below)
    for n, char in enumerate(string):
        if n == 0: continue
        # chained comparison: current char equals previous AND previous is in `chars`
        if string[n] == string[n-1] in chars:
            return True
    return False


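# Self-contained examples (no network needed):
#
#     string_has_repetitive_chars(["@", "."], "john..doe@mail.com")  # True
#     string_has_repetitive_chars(["@", "."], "john.doe@mail.com")   # False

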
def parse_emails(website_link: str) -> list:
    # Download a page (with retries) and collect every token containing "@"
    # from tag attributes and visible text.
    for x in range(5):
        try:
            with timeout(11):
                request = requests.get(website_link, timeout=10)
                break
        except (requests.ConnectTimeout, requests.exceptions.ReadTimeout):
            p("timeout")
            return []
        except requests.exceptions.InvalidURL:
            p("INVALID URL")
            return []
        except requests.exceptions.TooManyRedirects:
            p("Too many redirects")
            return []
        except CustomTimeoutError:
            p("CustomTimeoutError")
            return []
        except requests.exceptions.InvalidSchema:
            p("InvalidSchema:")
            return []
        except requests.exceptions.ConnectionError:
            p("Retry")
            continue
        except requests.exceptions.ChunkedEncodingError:
            p("Retry 2")
            continue
    else:
        # all five attempts failed with a retryable error
        return []
    page = bs4(request.text, "lxml")
    emails = []
    for element in page.find_all():
        for attribute in element.attrs:
            if (
                isinstance(element[attribute], str) and
                "@" in element[attribute]
            ):
                emails.extend(element[attribute].split())
            elif (
                isinstance(element[attribute], list) and
                "@" in str(element[attribute])
            ):
                emails.extend(element[attribute])
        if "@" in element.text:
            emails.extend(element.text.split())
    return emails


def clean_emails(emails: list) -> list:
    # Trim whitespace, keep only email-looking strings and deduplicate
    # while preserving order (dict.fromkeys keeps insertion order).
    emails = [email.strip() for email in emails]
    emails = [
        email for email in emails
        if is_email(email)
    ]
    emails = list(dict.fromkeys(emails))
    return emails


def extract_emails(website_link: str):
    emails = parse_emails(website_link)
    emails = clean_emails(emails)
    return emails


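# Illustrative call (network access required; the result is an example):
#
#     emails = extract_emails("https://example.com/contact")
#     p(emails)  # e.g. ["hello@example.com"]

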
def get_website_root(website_link):
    # Keep only scheme and host, e.g. "https://example.com/about/team" -> "https://example.com"
    website_link = website_link.split("/")
    website_root = website_link[:3]
    website_root = "/".join(website_root)
    return website_root


def get_domain(website_root):
    # Registered domain plus suffix via tldextract, e.g. "https://www.example.co.uk" -> "example.co.uk"
    tsd, td, tsu = extract(website_root)
    if td and tsu:
        url = td + '.' + tsu
    else:
        url = None
    return url


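# Self-contained examples (no network needed):
#
#     get_website_root("https://example.com/contact-us")  # "https://example.com"
#     get_domain("https://www.example.com/contact-us")     # "example.com"

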
def get_links(website_link):
    # Download the page (with retries) and return the first path segment of
    # every internal link, deduplicated.
    for x in range(5):
        try:
            request = requests.get(website_link, timeout=5)
            break
        except (requests.ConnectTimeout, requests.exceptions.ReadTimeout):
            p("timeout")
            return []
        except requests.exceptions.InvalidURL:
            p("INVALID URL")
            return []
        except requests.exceptions.TooManyRedirects:
            p("Too many redirects")
            return []
        except requests.exceptions.ConnectionError:
            p("Retry")
            continue
        except requests.exceptions.ChunkedEncodingError:
            p("Retry 2")
            continue
    else:
        return []

    page = bs4(request.text, "lxml")
    links = page.select("a")
    links = [link.get("href") for link in links]
    links = [link for link in links if link]
    links = [
        link.replace(website_link, "")
        for link in links
    ]
    links = [
        link for link in links if
        not any([x in link for x in ["tel:", "javascript:", "skype:"]])
    ]
    links = [
        link.strip() for link in links if
        not any([link == x for x in [" ", "#"]])
    ]

    new_links = []
    for link in links:
        # drop links that point to a different domain
        if get_domain(link):
            if get_domain(link) != get_domain(website_link):
                continue
        # keep only the first path segment after the domain
        link = link.split(get_domain(link))
        if not link:
            continue
        if len(link) == 2:
            link = link[1]
        else:
            link = link[0]

        link = link.split("/")
        link = [l for l in link if l]
        if not link:
            continue
        link = link[0]
        if link.startswith("#"):
            continue

        if link not in new_links:
            new_links.append(link)

    return new_links


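# Illustrative call (network access required); returns first-level internal
# path segments such as ["contact", "about", "careers"]:
#
#     p(get_links("https://example.com"))

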
# Paths most likely to contain contact emails, tried in this order
pages_to_visit = [
    "",
    "contact",
    "about",
    "contact-us",
    "about-us",
    "careers",
    "contact.html",
    "contacts",
    "contact.php",
    "support",
    "team"
]


def get_emails_amount_may_be_saved():
    # Extrapolate how many emails the whole run may yield from the hit rate
    # so far; the +1 keeps the division safe before anything is counted.
    emails_saved_amount = (
        db_clutch.count_documents({ "emails": {"$gt": {"$size": 0} } })
    ) + 1
    websites_scraped_amount = (
        db_clutch.count_documents({ "emails": {"$exists": True } })
    ) + 1
    emails_per_website = emails_saved_amount / websites_scraped_amount
    # 23146: presumably the total number of agency websites collected
    emails_may_be_saved = int(emails_per_website * 23146)
    return emails_may_be_saved, emails_saved_amount, websites_scraped_amount


# Specific URLs that are skipped during scraping
links_to_avoid = [
    "https://uniqueamb.com/team",
    "https://www.sleeplessmedia.com/about"
]


def scrape_emails():
    # For every agency without an "emails" field, try the likely contact
    # pages in order and store the first set of emails that turns up.
    for n, agency in enumerate(list(db_clutch.find({"emails": {"$exists": False}})), 1):
        emails_amount_may_be_saved, emails_saved, websites_scraped_amount = (
            get_emails_amount_may_be_saved()
        )
        _id = agency["_id"]
        # website_root is expected to have been added in an earlier pass
        # (see get_website_root)
        website_root = agency["website_root"]
        name = agency["name"]
        website_links = [
            (
                f"{website_root}/{page}".strip(),
                page,
                pages_to_visit.index(page) + 1
            )
            for page in pages_to_visit
        ]
        emails = None
        for website_link, page, ln in website_links:
            p(f"{websites_scraped_amount}.{ln}:", "Downloading", website_link)
            if website_link in links_to_avoid:
                continue
            emails = extract_emails(website_link)
            if emails:
                source = f"/{page}" if page else "/"
                p(
                    "Email(s) found.",
                    "Total emails:", emails_saved,
                    "Projected total emails:",
                    emails_amount_may_be_saved
                )
                emails = {
                    "source": source,
                    "emails": emails,
                    "saved_on": datetime.now()
                }
                break
        # Store the found payload, or None so the agency is not revisited
        db_clutch.update_one(
            {"_id": _id},
            {
                "$set": {"emails": emails}
            }
        )


scrape_emails()