import re
import time
import os
import argparse
import logging
from datetime import datetime

import httplib2
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
#import Links
#import Keywords
import MySQLdb
import peewee
from peewee import *
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.DEBUG)
#ch = logging.StreamHandler()
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)
fh.setFormatter(formatter)
logger.addHandler(fh)
parser = argparse.ArgumentParser()
parser.add_argument('-l', '--url', help="The base link to be crawled", required=True)
parser.add_argument('-k', '--keywords', help="Comma-separated keywords to search for", required=True)
args = parser.parse_args()

keywords = args.keywords.split(',')
mapping = dict()
mapping[args.url] = keywords  # map the base url to its keyword list
logger.info(mapping)
db = MySQLDatabase('WebSpider', user='ruut', passwd='ruut')
parsed = set()

class DATA(peewee.Model):
    parent_link = peewee.CharField()
    sub_link = peewee.CharField()
    keyword = peewee.CharField()
    count = peewee.IntegerField()

    class Meta:
        database = db
        db_table = 'DATA'
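# If the DATA table does not already exist (an assumption; the script itself never creates it),
# peewee can create it idempotently before the crawl starts:
db.connect()
db.create_tables([DATA], safe=True)  # safe=True is a no-op when the table is already there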
def make_soup(s):
    # Fetch an http(s) URL and return a parsed BeautifulSoup document, or None on failure
    match = re.compile('https://|http://')
    if re.search(match, s):
        try:
            http = httplib2.Http()
            headers, content = http.request(s)  # httplib2 returns (response headers, body)
            page = BeautifulSoup(content, 'lxml')
            return page
        except Exception as e:
            logger.info('Exception ' + str(e) + ' while fetching ' + s)
            return None
    else:
        return None
def get_list_of_urls(url):
    # Pattern for absolute http(s)/www URLs
    match = re.compile(r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
    soup = make_soup(url)
    l = set()
    try:
        for a in soup.find_all('a'):
            try:
                # Keep links that look like URLs, stay under the base url, carry no query
                # string, and are not the page itself
                if '?' not in a['href'] and re.search(match, a['href']) and re.search(re.escape(url), a['href']) and a['href'] != url:
                    l.add(str(a['href']))
            except Exception as e:
                logger.info('Exception ' + str(a) + ' has no href')
                logger.info(e)
                continue
    except Exception as e:
        logger.info('Exception ' + url + ' has no links')
        logger.info(e)
    return l
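# Example (hypothetical URLs): get_list_of_urls('https://example.com') could return
# {'https://example.com/about', 'https://example.com/blog'} -- only same-site links
# without query strings, different from the page itself, are kept.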
def get_all_the_urls(base, list_of_urls, depth):
    logger.info(depth)
    if depth == 10:  # hard recursion limit
        return
    depth = depth + 1
    for i in list_of_urls:  # scan the list of urls
        s = get_list_of_urls(i)
        get_all_the_urls(base, s, depth)
        for j in s:  # scan the sublinks
            try:
                if j in parsed:
                    continue
                soup = make_soup(j)  # may be None; the except below absorbs that
                logger.info('url is ' + j)
                for k in mapping[base]:  # look for keywords on the webpage
                    key_count = len(soup(text=re.compile(k, re.IGNORECASE)))
                    logger.info('Key count is ' + str(key_count))
                    if key_count > 0:
                        record = DATA(parent_link=base, sub_link=j, keyword=k, count=key_count)
                        record.save()
                parsed.add(j)
                logger.info('saved data successfully ' + str(key_count))
            except Exception as e:
                logger.info('Exception ' + str(e) + ' in keywords searching')
                continue
def populate_db():
    # Seed the crawl with the base URL, then recurse and log the total running time
    k = set()
    k.add(args.url)
    temp = time.time()
    logger.info(str(datetime.now()))
    get_all_the_urls(args.url, k, 0)
    logger.info('time taken ' + str(time.time() - temp))

populate_db()
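# Example invocation (hypothetical file name and argument values):
#   python crawler.py --url https://example.com --keywords python,scraping
# Keywords are comma-separated; matches are written to the DATA table and progress to crawler.log.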