# C:\Users\Pichau\AppData\Local\Programs\Python\Python37\python.exe scraper.py

import sys
import csv
from urllib.parse import urlparse
from selenium_wait_reload import selenium_wait_reload
import requests
import re
import traceback
import time
from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing import Manager, Process, Queue, Lock
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from decimal import *
import os
import pymysql
  21. """
  22. THIS SCRIPT RUNS AS A SERVICE ON A WEBSERVER, THERE IS A WEB INTERFACE TO COMUNICATE WITH IT
  23. AND DO THINGS LIKE UPLOAD THE INPUT FILE, POPULATE THE RULES AND SO ON...
  24. BECAUSE OF THIS IT MIGHT NEED SOME TWEAKING TO MAKE IT RUN
  25. Here is how the rules work:
  26.  
  27. *If the rule is just simple text, it will create a normal rule
  28. *A website matches a rule if it contains the exact text that is on the rule
  29. *If the rule has a separator it will have to match all the words, so to match both
  30.      "contact us" AND "about us" in a single rule the rule would be
  31.      "contact us #AND# about us" (the separator is " #AND# ", notice the blank space before and after the #)
  32. *If the rule starts with "url:" then the text will only be matched if it is in a link (inside the href attribute of an 'a' tag). Here is how a rule to check if the site contains any links to godaddy: "url:godaddy" (no spaces)
  33. *If a website matches any of the rules in a category than it will fall into that category, so to match "contact us" OR "about us" they would be two separate simple rules (prioritizing the special category)
  34. *Here is how the rules are matched:
  35.         *The scraper checks if the url matches agaisnt any of the special rules, if it matches at least one rule, then it is categorized as special and the other rules and categories are ignored.
  36.         *The scraper starts to test the negative rules, if it matches any of these rules the URL is categorized as negative.
  37.         *Now the scraper starts testing the positive rules, if it matches agaisnt any of these rules and did not match any of the negative rules, the URL is categorized as positive
  38.         *If the URL matches rules from both negative and positive categories then the URL is categorized as both. if it didn't match any of the positive or negative rules, than it is categorized as nothing
  39.  
  40. """
WEBROOT = "/var/www/html/"
OUTPUT_FILE = "%sprogress/result.csv" % WEBROOT
START_FILE = "%sstart" % WEBROOT
THREAD_POOL_SIZE = 4
INPUT_FILE_NAME = "input_list.txt"

def find_all_links(dom):
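    """Return the href value of every <a> tag found in the given HTML string."""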
    links = []
    soup = BeautifulSoup(dom, 'html.parser')
    for a in soup.find_all('a', href=True):
        links.append(a['href'])
    return links

def strip_css_and_js(page_text, return_text_only = True):
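    """Remove <style> and <script> tags from the page; return only the visible text
    when return_text_only is True, otherwise the cleaned HTML markup."""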
    soup = BeautifulSoup(page_text, 'html.parser')
    for style in soup.find_all("style"):
        style.decompose()

    for script in soup.find_all("script"):
        script.decompose()

    if return_text_only:
        return soup.get_text()
    else:
        return str(soup)

def save_results(results, path):
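    """Write the collected results to a CSV file with Category, URL and Redirected columns."""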
    with open(path, mode='w') as csv_file:
        f = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        header = ["Category", "URL", "Redirected"]
        f.writerow(header)
        for row in results:
            f.writerow([row["Category"], row["URL"], row["Redirected"]])

def fetch_rules():
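    """Load the categorization rules from the 'rule' table in MySQL, grouped by
    category ("Positive", "Negative", "Special")."""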
    rules = {
        "Positive": [],
        "Negative": [],
        "Special": []
    }

    server = ""
    db = ""
    user = ""
    pwd = ""
    connection = pymysql.connect(host=server, user=user, password=pwd, db=db, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    with connection:
        cursor = connection.cursor()
        cursor.execute("select * from rule")
        rows = cursor.fetchall()
        for row in rows:
            rules[row['category']].append(row['rule_text'])

    return rules

def fetch_urls(urls):
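    """Read the input file from the webroot and push one URL per line onto the shared queue."""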
    input_path = "%s%s" % (WEBROOT, INPUT_FILE_NAME)
    with open(input_path, mode='r') as input_file:
        lines = input_file.readlines()
    for line in lines:
        url = line.replace("\n", "").replace("\r", "")
        if url:  # skip blank lines so an empty URL is never queued
            urls.put(url)
    # os.unlink(input_path)

def log(log_text, thread_id = None):
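    """Print a timestamped log line, optionally tagged with the id of the worker that produced it."""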
    timestamp = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    if thread_id is None:
        log_text = "[%s] %s" % (timestamp, log_text)
    else:
        log_text = "[%s] {%s} %s" % (timestamp, thread_id, log_text)
    print(log_text)

def validate_rule(page, rule):
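    """Check a single rule against the page. Parts joined with ' #AND# ' must all match;
    parts prefixed with 'url:' are matched against link hrefs, everything else against
    the page content (case-insensitive)."""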
    parts = rule.split(" #AND# ")
    for part in parts:
        if part.startswith("url:"):
            # link rules: the needle must appear in at least one href on the page
            needle = part.replace("url:", "", 1)
            links = find_all_links(page)
            if not any(needle in link for link in links):
                return False
        else:
            # text rules: the rule text must appear somewhere in the page content
            if re.search(re.escape(part), page, re.IGNORECASE) is None:
                return False
    # every part of the rule matched
    return True


def get_browser(use_firefox = True):
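    """Start a headless Firefox (default) or Chrome WebDriver with a 4-second implicit wait."""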
    if use_firefox:
        options = FirefoxOptions()
        options.headless = True
        browser = webdriver.Firefox(options = options)
        browser.implicitly_wait(4)
        return browser
    else:
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")
        browser = webdriver.Chrome(options=chrome_options)
        browser.implicitly_wait(4)
        return browser

def start_validations(urls, rules, results, thread_id):
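    """Worker loop: pull URLs from the shared queue, load each one in a headless browser,
    categorize the page with check_url and append the result to the shared results list.
    If loading a URL fails, the result is recorded as a connection error and the browser
    is restarted."""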
    try:
        log("thread %s started" % thread_id, thread_id)
        browser = get_browser(thread_id % 2 == 1)
        while not urls.empty():
            url = "http://%s" % urls.get()
            try:
                log("starting %s" % url, thread_id)
                browser.get(url)
                time.sleep(0.5)
                WebDriverWait(browser, 6).until(selenium_wait_reload(4))
                html = browser.page_source
                result = check_url(html, rules)

                original_domain = url.split("://")[1].split("/")[0].replace("www.","")
                tested_domain = browser.current_url.split("://")[1].split("/")[0].replace("www.","")
                redirected_url = "" if tested_domain == original_domain else browser.current_url

                results.append({"Category":result, "URL":url, "Redirected":redirected_url})
                log("finished %s" % url, thread_id)
            except Exception as e:
                log("couldn't test url %s" % url, thread_id)
                log(str(e), thread_id)
                results.append({"Category":"Connection Error", "URL":url, "Redirected":""})
                browser.quit()
                time.sleep(2)
                browser = get_browser(thread_id % 2 == 1)
    except Exception as e:
        log(str(e), thread_id)
    finally:
        log("closing thread", thread_id)
        browser.quit()

def calculate_progress(urls):
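    """Report progress for the web interface: once per second, write the percentage of
    processed URLs as the name of an empty marker file inside the progress folder,
    until the queue is drained."""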
    progress_folder = "%sprogress/" % WEBROOT
    if not os.path.exists(progress_folder):
        os.makedirs(progress_folder)

    initial_size = urls.qsize()
    if initial_size == 0:  # nothing to process, avoid dividing by zero below
        return
    while not urls.empty():
        current_size = urls.qsize()
        processed = initial_size - current_size
        progress = '{0:.0f}'.format(processed / initial_size * 100)
        for progress_file in os.listdir(progress_folder):
            file_path = os.path.join(progress_folder, progress_file)
            if os.path.isfile(file_path) and not file_path.endswith(".csv"):
                os.unlink(file_path)
        os.mknod("%s%s" % (progress_folder, progress))
        time.sleep(1)

def check_url(source, rules):
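    """Categorize a page against the rule groups: Special rules win outright; otherwise the
    result is "Negative", "Positive", "both" or "nothing" depending on which groups matched."""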
    content = strip_css_and_js(source, False)
    for special_rule in rules["Special"]:
        if validate_rule(content, special_rule):
            return "Special"

    is_negative = False
    is_positive = False
    for negative_rule in rules["Negative"]:
        if validate_rule(content, negative_rule):
            is_negative = True
            break

    for positive_rule in rules["Positive"]:
        if validate_rule(content, positive_rule):
            is_positive = True
            break

    if is_positive and is_negative:
        return "both"
    if is_negative:
        return "Negative"
    if is_positive:
        return "Positive"
    return "nothing"

def wait_signal_to_start():
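    """Block until the start file dropped by the web interface exists, polling every 30 seconds."""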
    while True:
        if os.path.isfile(START_FILE):
            # os.unlink(START_FILE)
            break
        else:
            log("start file not found, waiting to check again")
        time.sleep(30)


if __name__ == '__main__':
    while True:
        try:
            wait_signal_to_start()
            log("scraper started")
            if os.path.isfile(OUTPUT_FILE):
                os.unlink(OUTPUT_FILE)

            manager = Manager()

            rules = fetch_rules()
            urls = manager.Queue()
            fetch_urls(urls)
            results = manager.list()

            jobs = []
            p = Process(target=calculate_progress, args=(urls,))
            jobs.append(p)
            p.start()
            for i in range(THREAD_POOL_SIZE):
                log("spawning thread with id %s" % i)
                p = Process(target=start_validations, args=(urls, rules, results, i))
                jobs.append(p)
                p.start()
                time.sleep(2)
            for j in jobs:
                j.join()

            save_results(results, OUTPUT_FILE)
            log("scraper finished")
        except Exception as e:
            log(str(e))