Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # crawler jobs
- import requests
- import json
- import time
- import os
- import subprocess
- from threading import Thread
- from utils import *
class Robot:
    """Crawl Indeed job listings per country/city/title and persist matches to a JSON file.

    Results are filtered to drop postings that are stale, require a clearance /
    long experience, or whose title does not match the configured job titles.
    """

    def __init__(self):
        # Project root = directory containing this file.
        self.SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
        self.data_extracted = {'jobs': []}
        self.file = os.path.join(self.SITE_ROOT, 'app', 'static', 'data.json')
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }
        self.max_results_per_city = 100
        # Fix: use a context manager so the cities file handle is closed
        # (the original open(...).read() leaked the handle).
        with open(os.path.join(self.SITE_ROOT, 'cities', 'AU'), 'r') as fh:
            au_cities = fh.read().splitlines()
        self.cities = {
            'AU': au_cities,
        }
        self.urls = {
            'AU': 'https://au.indeed.com',
        }
        self.job_title = ['Penetration Tester']#['security protections', 'ida pro', 'gdb', 'windbg', 'immunity debugger', 'boofuzz', 'peach fuzzer', 'winafl', 'python', 'assembly', 'mitre att&ck', 'ttps', 'penetration testing','exploits', 'metasploit', 'metasploit framework', 'ethical hacker', 'pentest', 'computer security', 'hacking', 'ceh', 'oscp', 'osce', 'osee', 'penetration testing', 'offensive security', 'mitre att&ck', 'vulnerability research', 'vulnerability researcher', 'fuzzing', 'clang', 'llvm', 'address sanitizer', 'afl', 'fuzzers','information security','product security','application security']

    def run(self):
        """Crawl every configured (country, city, title) combination and dump matches.

        Side effects: HTTP GETs against Indeed (1s delay between requests) and
        incremental rewrites of ``self.file`` as matches accumulate.
        """
        # Loop-invariant filter lists, hoisted out of the per-page loop
        # (the original rebuilt them for every result row).
        # Tokens in the posting date that mark it as stale (localized "months"/"30+").
        prefix = ['30', 'monaten', 'meses', 'luni', 'mois', 'month', 'months', 'maanden', 'mesi', 'mies.', 'm\u00e5nader', '\u043c\u0435\u0441\u044f\u0446\u0435\u0432']
        # Description terms that disqualify a posting (clearances, long experience).
        forbidden = ['clearance','TS/SCI','dod','bpss','ctc','ebs','sc','dv','4+ years','5+ years','6+ years','7+ years','8+ years','9+ years','10+ years','11+ years','12+ years']
        for country in self.cities:
            for city in self.cities[country]:
                for job_title in self.job_title:
                    for start in range(0, self.max_results_per_city, 10):
                        url = self.urls[country] + \
                            "/jobs?q={}&l={}&sort=date&start={}".format(
                                job_title, city, start)
                        time.sleep(1)  # polite rate limit between requests
                        response = requests.get(url, headers=self.headers)
                        soup = get_soup(response.text)
                        html = soup.find_all(name="div", attrs={"class": "row"})
                        for page in html:
                            job = extract_job_title(page)
                            date_str = extract_date(page)
                            # Fix: the original bare `except: pass` could leave
                            # job_description unbound (NameError below) or silently
                            # reuse the previous page's text. Default to ''.
                            try:
                                job_description = extract_fulltext(page['data-jk'])
                            except Exception:
                                job_description = ''
                            s_date = date_str.replace('+', '')
                            skill_match = [s for s in forbidden if s in job_description]
                            month_match = [m for m in prefix if m in s_date]
                            job_title_match = [t for t in self.job_title if t in job]
                            print(job)
                            # Guard clauses replace the original elif/pass ladder.
                            if month_match:
                                continue  # posting is months old
                            if "NOT_FOUND" in s_date:
                                continue  # no usable date
                            if skill_match:
                                continue  # clearance / experience requirement
                            if not job_title_match:
                                continue  # title doesn't match our targets
                            self.data_extracted['jobs'].append({
                                'job_title': job,
                                'company': extract_company(page),
                                'city': extract_location(page),
                                # Reuse already-extracted values (the original
                                # re-ran extract_date and re-fetched the full
                                # text over the network a second time).
                                'date': date_str,
                                'job_description': job_description.lower(),
                                'url': [self.urls[country] + extract_link(page)]
                            })
                        # Persist incrementally after each results page so a crash
                        # doesn't lose everything collected so far.
                        with open(self.file, 'w') as outfile:
                            json.dump(self.data_extracted, outfile, indent=4)
from app import app

# Fix: guard the crawler start-up so importing this module does not
# immediately launch a network-crawling thread (original ran at import time).
if __name__ == "__main__":
    # Run the crawler on a worker thread; the web app could be started
    # alongside it by uncommenting app.run() below.
    T1 = Thread(target=Robot().run)
    T1.start()
    #app.run()
    T1.join()

"""
output
Penetration Tester
Senior Test Analyst | $95k + Super
Application Security - Penetration Tester
Manager, Red Team
Penetration Tester - Melbourne
Senior Security Consultant - SpiderLabs
"""
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement