Untitled

# C:\Users\Pichau\AppData\Local\Programs\Python\Python37\python.exe scraper.py

import sys
import csv
from urllib.parse import urlparse
from selenium_wait_reload import selenium_wait_reload
import requests
import re
import traceback
import time
from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing import Manager, Process, Queue, Lock
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from decimal import *
import os
import pymysql
"""
THIS SCRIPT RUNS AS A SERVICE ON A WEBSERVER, THERE IS A WEB INTERFACE TO COMUNICATE WITH IT
AND DO THINGS LIKE UPLOAD THE INPUT FILE, POPULATE THE RULES AND SO ON...
BECAUSE OF THIS IT MIGHT NEED SOME TWEAKING TO MAKE IT RUN
Here is how the rules work:

*If the rule is just simple text, it will create a normal rule
*A website matches a rule if it contains the exact text that is on the rule
*If the rule has a separator it will have to match all the words, so to match both
     "contact us" AND "about us" in a single rule the rule would be
     "contact us #AND# about us" (the separator is " #AND# ", notice the blank space before and after the #)
*If the rule starts with "url:" then the text will only be matched if it is in a link (inside the href attribute of an 'a' tag). Here is how a rule to check if the site contains any links to godaddy: "url:godaddy" (no spaces)
*If a website matches any of the rules in a category than it will fall into that category, so to match "contact us" OR "about us" they would be two separate simple rules (prioritizing the special category)
*Here is how the rules are matched:
        *The scraper checks if the url matches agaisnt any of the special rules, if it matches at least one rule, then it is categorized as special and the other rules and categories are ignored.
        *The scraper starts to test the negative rules, if it matches any of these rules the URL is categorized as negative.
        *Now the scraper starts testing the positive rules, if it matches agaisnt any of these rules and did not match any of the negative rules, the URL is categorized as positive
        *If the URL matches rules from both negative and positive categories then the URL is categorized as both. if it didn't match any of the positive or negative rules, than it is categorized as nothing

"""
WEBROOT = "/var/www/html/"
OUTPUT_FILE = "%sprogress/result.csv" % WEBROOT
START_FILE = "%sstart" % WEBROOT
THREAD_POOL_SIZE = 4
INPUT_FILE_NAME = "input_list.txt"

def find_all_links(dom):
    links = []
    soup = BeautifulSoup(dom, 'html.parser')
    for a in soup.findAll('a', href=True):
        links.append(a['href'])
    return links

def strip_css_and_js(page_text, return_text_only = True):
    soup = BeautifulSoup(page_text, 'html.parser')
    for style in soup.find_all("style"):
        style.decompose()

    for script in soup.find_all("script"):
        script.decompose()

    if return_text_only:
        return soup.get_text()
    else:
        return str(soup)

def save_results(results, path):
    with open(path, mode='w') as csv_file:
        f = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        header = ["Category", "URL", "Redirected"]
        f.writerow(header)
        for row in results:
            f.writerow([row["Category"], row["URL"], row["Redirected"]])

def fetch_rules():
    rules = {
        "Positive": [],
        "Negative": [],
        "Special": []
    }

    server = ""
    db = ""
    user = ""
    pwd = ""
    connection = pymysql.connect(host=server, user=user, password=pwd, db=db, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    with connection:
        cursor = connection.cursor()
        cursor.execute("select * from rule")
        rows = cursor.fetchall()
        for row in rows:
            rules[row['category']].append(row['rule_text'])

    return rules

def fetch_urls():
    input_path = "%s%s" % (WEBROOT, INPUT_FILE_NAME)
    with open(input_path, mode='r') as input_file:
        lines = input_file.readlines()
    for line in lines:
        url = line.replace("\n","").replace("\r","")
        urls.put(url)
    # os.unlink(input_path)

def log(log_text, thread_id = None):
    timestamp = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    if thread_id is None:
        log_text = "[%s] %s" % (timestamp, log_text)
    else:
        log_text = "[%s] {%s} %s" % (timestamp, thread_id, log_text)
    print(log_text)

def validate_rule(page, rule):
    rules = rule.split(" #AND# ")
    text_found = True
    for r in rules:
        if r.startswith("url:"):
            links = find_all_links(page)
            needle = r.replace("url:","")
            for link in links:
                if needle in link:
                    return True
            return False
        else:
            match = re.search(r, page, re.IGNORECASE)
            if match is None:
                text_found = False
    return text_found


def get_browser(use_firefox = True):
    if use_firefox:
        options = FirefoxOptions()
        options.headless = True
        browser = webdriver.Firefox(options = options)
        browser.implicitly_wait(4)
        return browser
    else:
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.implicitly_wait(4)
        return browser

def start_validations(urls, rules, results, thread_id):
    try:
        log("thread %s started" % thread_id, thread_id)
        browser = get_browser(thread_id % 2 == 1)
        while not urls.empty():
            url = "http://%s" % urls.get()
            try:
                log("starting %s" % url, thread_id)
                browser.get(url)
                time.sleep(0.5)
                WebDriverWait(browser, 6).until(selenium_wait_reload(4))
                html = browser.page_source
                result = check_url(html, rules)

                original_domain = url.split("://")[1].split("/")[0].replace("www.","")
                tested_domain = browser.current_url.split("://")[1].split("/")[0].replace("www.","")
                redirected_url = "" if tested_domain == original_domain else browser.current_url

                results.append({"Category":result, "URL":url, "Redirected":redirected_url})
                log("finished %s" % url, thread_id)
            except Exception as e:
                log("couldn't test url %s" % url, thread_id )
                log(str(e), thread_id)
                results.append({"Category":"Connection Error", "URL":url, "Redirected":""})
                browser.quit()
                time.sleep(2)
                browser = get_browser(thread_id % 2 == 1)
    except Exception as e:
        log(str(e), thread_id)
    finally:
        log("closing thread", thread_id)
        browser.quit()

def calculate_progress(urls):
    progress_folder ="%sprogress/" % WEBROOT
    if not os.path.exists(progress_folder):
        os.makedirs(progress_folder)

    initial_size = urls.qsize()
    while not urls.empty():
        current_size = urls.qsize()
        on_queue = initial_size - current_size
        progress = '{0:.0f}'.format((on_queue / initial_size * 100))
        for progress_file in os.listdir(progress_folder):
            file_path = os.path.join(progress_folder, progress_file)
            if os.path.isfile(file_path) and not file_path.endswith(".csv"):
                os.unlink(file_path)
        os.mknod("%s%s" % (progress_folder, progress))
        time.sleep(1)

def check_url(source, rules):
    content = strip_css_and_js(source, False)
    for special_rule in rules["Special"]:
        if validate_rule(content, special_rule):
            return "Special"

    is_negative = False
    is_positive = False
    for negative_rule in rules["Negative"]:
        if validate_rule(content, negative_rule):
            is_negative = True
            break

    for positive_rule in rules["Positive"]:
        if validate_rule(content, positive_rule):
            is_positive = True
            break

    if is_positive and is_negative:
        return "both"
    if is_negative:
        return "Negative"
    if is_positive:
        return "Positive"
    return "nothing"

def wait_signal_to_start():
    while True:
        if os.path.isfile(START_FILE):
            # os.unlink(START_FILE)
            break
        else:
            log("start file not found, waiting to check again")
        time.sleep(30)


if __name__ == '__main__':
    while True:
        try:
            wait_signal_to_start()
            log("scraper started")
            if os.path.isfile(OUTPUT_FILE):
                os.unlink(OUTPUT_FILE)

            manager = Manager()

            rules = fetch_rules()
            urls = manager.Queue()
            fetch_urls()
            results = manager.list()

            jobs = []
            p = Process(target=calculate_progress, args=(urls,))
            jobs.append(p)
            p.start()
            for i in range(THREAD_POOL_SIZE):
                log("spawning thread with id %s" % i)
                p = Process(target=start_validations, args=(urls, rules, results, i))
                jobs.append(p)
                p.start()
                time.sleep(2)
            for j in jobs:
                j.join()

            save_results(results, OUTPUT_FILE)
            log("scraper finished")
        except Exception as e:
            log(str(e))