# C:\Users\Pichau\AppData\Local\Programs\Python\Python37\python.exe scraper.py
import csv
import os
import re
import time
from datetime import datetime
from multiprocessing import Manager, Process

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait

from selenium_wait_reload import selenium_wait_reload
- """
- THIS SCRIPT RUNS AS A SERVICE ON A WEBSERVER, THERE IS A WEB INTERFACE TO COMUNICATE WITH IT
- AND DO THINGS LIKE UPLOAD THE INPUT FILE, POPULATE THE RULES AND SO ON...
- BECAUSE OF THIS IT MIGHT NEED SOME TWEAKING TO MAKE IT RUN
- Here is how the rules work:
- *If the rule is just simple text, it will create a normal rule
- *A website matches a rule if it contains the exact text that is on the rule
- *If the rule has a separator it will have to match all the words, so to match both
- "contact us" AND "about us" in a single rule the rule would be
- "contact us #AND# about us" (the separator is " #AND# ", notice the blank space before and after the #)
- *If the rule starts with "url:" then the text will only be matched if it is in a link (inside the href attribute of an 'a' tag). Here is how a rule to check if the site contains any links to godaddy: "url:godaddy" (no spaces)
- *If a website matches any of the rules in a category than it will fall into that category, so to match "contact us" OR "about us" they would be two separate simple rules (prioritizing the special category)
- *Here is how the rules are matched:
- *The scraper checks if the url matches agaisnt any of the special rules, if it matches at least one rule, then it is categorized as special and the other rules and categories are ignored.
- *The scraper starts to test the negative rules, if it matches any of these rules the URL is categorized as negative.
- *Now the scraper starts testing the positive rules, if it matches agaisnt any of these rules and did not match any of the negative rules, the URL is categorized as positive
- *If the URL matches rules from both negative and positive categories then the URL is categorized as both. if it didn't match any of the positive or negative rules, than it is categorized as nothing
- """
- WEBROOT = "/var/www/html/"
- OUTPUT_FILE = "%sprogress/result.csv" % WEBROOT
- START_FILE = "%sstart" % WEBROOT
- THREAD_POOL_SIZE = 4
- INPUT_FILE_NAME = "input_list.txt"
def find_all_links(dom):
    # collect the href attribute of every <a> tag in the page
    soup = BeautifulSoup(dom, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)]

def strip_css_and_js(page_text, return_text_only=True):
    # remove <style> and <script> blocks so rules don't match CSS or JS source
    soup = BeautifulSoup(page_text, 'html.parser')
    for style in soup.find_all("style"):
        style.decompose()
    for script in soup.find_all("script"):
        script.decompose()
    if return_text_only:
        return soup.get_text()
    return str(soup)

def save_results(results, path):
    with open(path, mode='w') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        writer.writerow(["Category", "URL", "Redirected"])
        for row in results:
            writer.writerow([row["Category"], row["URL"], row["Redirected"]])
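
# The output file then looks like this (illustrative rows, not real results):
#
#   Category,URL,Redirected
#   Positive,http://example.com,
#   both,http://example.org,http://www.example.net/landing
#   Connection Error,http://unreachable.example,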

def fetch_rules():
    rules = {
        "Positive": [],
        "Negative": [],
        "Special": []
    }
    # credentials are intentionally left blank; fill them in for your environment
    server = ""
    db = ""
    user = ""
    pwd = ""
    connection = pymysql.connect(host=server, user=user, password=pwd, db=db, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    with connection:
        cursor = connection.cursor()
        cursor.execute("select * from rule")
        for row in cursor.fetchall():
            rules[row['category']].append(row['rule_text'])
    return rules
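
# A plausible shape for the `rule` table queried above, inferred from the
# column names used in fetch_rules (the real schema may differ):
#
#   CREATE TABLE rule (
#       id        INT AUTO_INCREMENT PRIMARY KEY,
#       category  ENUM('Positive', 'Negative', 'Special') NOT NULL,
#       rule_text VARCHAR(255) NOT NULL
#   );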

def fetch_urls(urls):
    # read one URL per line from the input file and push them onto the shared queue
    input_path = "%s%s" % (WEBROOT, INPUT_FILE_NAME)
    with open(input_path, mode='r') as input_file:
        for line in input_file:
            urls.put(line.strip())
    # os.unlink(input_path)

def log(log_text, thread_id=None):
    timestamp = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
    if thread_id is None:
        print("[%s] %s" % (timestamp, log_text))
    else:
        print("[%s] {%s} %s" % (timestamp, thread_id, log_text))

def validate_rule(page, rule):
    # every part separated by " #AND# " must match for the rule to match
    for r in rule.split(" #AND# "):
        if r.startswith("url:"):
            # "url:" parts match against the href attributes of <a> tags
            needle = r.replace("url:", "", 1)
            if not any(needle in link for link in find_all_links(page)):
                return False
        else:
            # plain-text parts match the literal text, case-insensitively;
            # re.escape keeps regex metacharacters in the rule from misfiring
            if re.search(re.escape(r), page, re.IGNORECASE) is None:
                return False
    return True
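
# A couple of hypothetical calls to illustrate the semantics (not from the
# original script):
#
#   validate_rule('<a href="https://www.godaddy.com/deal">x</a>', "url:godaddy")  # -> True
#   validate_rule("Contact us | About us", "contact us #AND# about us")           # -> True
#   validate_rule("Contact us only", "contact us #AND# about us")                 # -> False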

def get_browser(use_firefox=True):
    if use_firefox:
        options = FirefoxOptions()
        options.headless = True
        browser = webdriver.Firefox(options=options)
    else:
        options = ChromeOptions()
        options.add_argument("--headless")
        # the old chrome_options= keyword is deprecated; options= works on Selenium 3.8+
        browser = webdriver.Chrome(options=options)
    browser.implicitly_wait(4)
    return browser

def start_validations(urls, rules, results, thread_id):
    browser = None
    try:
        log("thread %s started" % thread_id, thread_id)
        # odd-numbered workers use Firefox, even-numbered ones use Chrome
        browser = get_browser(thread_id % 2 == 1)
        while not urls.empty():
            url = "http://%s" % urls.get()
            try:
                log("starting %s" % url, thread_id)
                browser.get(url)
                time.sleep(0.5)
                WebDriverWait(browser, 6).until(selenium_wait_reload(4))
                html = browser.page_source
                result = check_url(html, rules)
                # record the final URL only when the browser landed on a different domain
                original_domain = url.split("://")[1].split("/")[0].replace("www.", "")
                tested_domain = browser.current_url.split("://")[1].split("/")[0].replace("www.", "")
                redirected_url = "" if tested_domain == original_domain else browser.current_url
                results.append({"Category": result, "URL": url, "Redirected": redirected_url})
                log("finished %s" % url, thread_id)
            except Exception as e:
                log("couldn't test url %s" % url, thread_id)
                log(str(e), thread_id)
                results.append({"Category": "Connection Error", "URL": url, "Redirected": ""})
                # restart the browser after a failure so a hung session doesn't poison later URLs
                browser.quit()
                time.sleep(2)
                browser = get_browser(thread_id % 2 == 1)
    except Exception as e:
        log(str(e), thread_id)
    finally:
        log("closing thread", thread_id)
        if browser is not None:
            browser.quit()

def calculate_progress(urls):
    progress_folder = "%sprogress/" % WEBROOT
    if not os.path.exists(progress_folder):
        os.makedirs(progress_folder)
    initial_size = urls.qsize()
    while not urls.empty():
        current_size = urls.qsize()
        processed = initial_size - current_size
        progress = '{0:.0f}'.format(processed / initial_size * 100)
        # publish progress as an empty file named after the percentage,
        # removing the previous marker but keeping the .csv results
        for progress_file in os.listdir(progress_folder):
            file_path = os.path.join(progress_folder, progress_file)
            if os.path.isfile(file_path) and not file_path.endswith(".csv"):
                os.unlink(file_path)
        os.mknod("%s%s" % (progress_folder, progress))
        time.sleep(1)
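
# For example, once 42 of 100 URLs have been consumed, the folder holds a
# single empty file named "42" next to result.csv. Presumably the web
# interface mentioned in the module docstring polls this folder to display
# progress, though the script itself doesn't document that.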

def check_url(source, rules):
    # keep the markup (return_text_only=False) so "url:" rules can still see href attributes
    content = strip_css_and_js(source, False)
    # special rules short-circuit everything else
    for special_rule in rules["Special"]:
        if validate_rule(content, special_rule):
            return "Special"
    is_negative = False
    is_positive = False
    for negative_rule in rules["Negative"]:
        if validate_rule(content, negative_rule):
            is_negative = True
            break
    for positive_rule in rules["Positive"]:
        if validate_rule(content, positive_rule):
            is_positive = True
            break
    if is_positive and is_negative:
        return "both"
    if is_negative:
        return "Negative"
    if is_positive:
        return "Positive"
    return "nothing"

def wait_signal_to_start():
    # block until the web interface creates the start file
    while True:
        if os.path.isfile(START_FILE):
            # os.unlink(START_FILE)
            break
        log("start file not found, waiting to check again")
        time.sleep(30)

if __name__ == '__main__':
    while True:
        try:
            wait_signal_to_start()
            log("scraper started")
            if os.path.isfile(OUTPUT_FILE):
                os.unlink(OUTPUT_FILE)
            manager = Manager()
            rules = fetch_rules()
            urls = manager.Queue()
            fetch_urls(urls)
            results = manager.list()
            jobs = []
            p = Process(target=calculate_progress, args=(urls,))
            jobs.append(p)
            p.start()
            for i in range(THREAD_POOL_SIZE):
                log("spawning thread with id %s" % i)
                p = Process(target=start_validations, args=(urls, rules, results, i))
                jobs.append(p)
                p.start()
                time.sleep(2)
            for j in jobs:
                j.join()
            save_results(results, OUTPUT_FILE)
            log("scraper finished")
        except Exception as e:
            log(str(e))