Advertisement
Guest User

Untitled

a guest
Oct 22nd, 2019
184
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.71 KB | None | 0 0
  1. # crawler jobs
  2. import requests
  3. import json
  4. import time
  5. import os
  6. import subprocess
  7. from threading import Thread
  8. from utils import *
  9.  
  10.  
  11. class Robot:
  12.  
  13.     def __init__(self):
  14.         self.SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
  15.         self.data_extracted = {}
  16.         self.data_extracted['jobs'] = []
  17.         self.SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
  18.         self.file = os.path.join(self.SITE_ROOT, 'app', 'static','data.json')
  19.         self.headers = {
  20.             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
  21.         }
  22.         self.max_results_per_city = 100
  23.         self.cities = {
  24.             'AU': open(os.path.join(self.SITE_ROOT, 'cities', 'AU'), 'r').read().splitlines(),
  25.         }
  26.         self.urls = {
  27.             'AU': 'https://au.indeed.com',
  28.         }
  29.         self.job_title = ['Penetration Tester']#['security protections', 'ida pro', 'gdb', 'windbg', 'immunity debugger', 'boofuzz', 'peach fuzzer', 'winafl', 'python', 'assembly', 'mitre att&ck', 'ttps', 'penetration testing','exploits', 'metasploit', 'metasploit framework', 'ethical hacker', 'pentest', 'computer security', 'hacking', 'ceh', 'oscp', 'osce', 'osee', 'penetration testing', 'offensive security', 'mitre att&ck', 'vulnerability research', 'vulnerability researcher', 'fuzzing', 'clang', 'llvm', 'address sanitizer', 'afl', 'fuzzers','information security','product security','application security']
  30.  
  31.     def run(self):
  32.         for country in self.cities:
  33.             for city in self.cities[country]:
  34.                 for job_title in self.job_title:
  35.                     for start in range(0, self.max_results_per_city, 10):
  36.                         url = self.urls[country] + \
  37.                             "/jobs?q={}&l={}&sort=date&start={}".format(
  38.                                 job_title, city, start)
  39.                         time.sleep(1)
  40.                         response = requests.get(url, headers=self.headers)
  41.                         data = response.text
  42.                         soup = get_soup(data)
  43.                         html = soup.find_all(name="div", attrs={"class": "row"})
  44.                         for page in html:
  45.                             prefix = ['30', 'monaten', 'meses', 'luni', 'mois', 'month', 'months', 'maanden', 'mesi', 'mies.', 'm\u00e5nader', '\u043c\u0435\u0441\u044f\u0446\u0435\u0432']
  46.                             forbidden = ['clearance','TS/SCI','dod','bpss','ctc','ebs','sc','dv','4+ years','5+ years','6+ years','7+ years','8+ years','9+ years','10+ years','11+ years','12+ years']
  47.                             job = extract_job_title(page)
  48.                             date_str = extract_date(page)
  49.                             try:
  50.                                 job_description = extract_fulltext(page['data-jk'])
  51.                             except:
  52.                                 pass
  53.                             s_date = date_str.replace('+', '')
  54.                             skill_match = [s_prefix for s_prefix in forbidden if s_prefix in job_description]
  55.                             month_match = [match_prefix for match_prefix in prefix if match_prefix in s_date]
  56.                             job_title_match = [job_prefix for job_prefix in self.job_title if job_prefix in job]
  57.                             print(job)
  58.                             if len(month_match) > 0:
  59.                                 pass
  60.                             elif "NOT_FOUND" in s_date:
  61.                                 pass
  62.                             elif len(skill_match) > 0 :
  63.                                 pass
  64.                             elif not len(job_title_match) > 0
  65.                                 pass
  66.                             else:
  67.                                 self.data_extracted['jobs'].append({
  68.                                     'job_title': job,
  69.                                     'company': extract_company(page),
  70.                                     'city': extract_location(page),
  71.                                     'date': extract_date(page),
  72.                                     'job_description': extract_fulltext(page['data-jk']).lower(),
  73.                                     'url':  [self.urls[country] + extract_link(page)]
  74.                                 })
  75.                             with open(self.file, 'w') as outfile:
  76.                                 json.dump(self.data_extracted, outfile, indent=4)
  77. from app import app
  78. T1 = Thread(target=Robot().run)
  79. T1.start()
  80. #app.run()
  81. T1.join()
  82.  
  83. """
  84. output
  85.  
  86. Penetration Tester
  87. Senior Test Analyst | $95k + Super
  88. Application Security - Penetration Tester
  89. Manager, Red Team
  90. Penetration Tester - Melbourne
  91. Senior Security Consultant - SpiderLabs
  92. """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement