import re
import time
import os
import argparse
import logging
from datetime import datetime

import httplib2
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
#import Links
#import Keywords
import MySQLdb
import peewee
from peewee import *
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.DEBUG)
#ch = logging.StreamHandler()
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)
fh.setFormatter(formatter)
logger.addHandler(fh)
parser = argparse.ArgumentParser()
parser.add_argument('-l', '--url', help="The base link to be crawled", required=True)
parser.add_argument('-k', '--keywords', help="Comma-separated keywords to search for", required=True)
args = parser.parse_args()

keywords = args.keywords.split(',')
mapping = dict()
mapping[args.url] = keywords  # map the base url to its keyword list
logger.info(mapping)
db = MySQLDatabase('WebSpider', user='ruut', passwd='ruut')
parsed = set()

class DATA(peewee.Model):
    parent_link = peewee.CharField()
    sub_link = peewee.CharField()
    keyword = peewee.CharField()
    count = peewee.IntegerField()

    class Meta:
        database = db
        db_table = 'DATA'
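# If the DATA table does not already exist (an assumption; the script itself never creates it),
# peewee can create it idempotently before the crawl starts:
db.connect()
db.create_tables([DATA], safe=True)  # safe=True is a no-op when the table is already there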
def make_soup(s):
    # Fetch an http(s) URL and return a parsed BeautifulSoup document, or None on failure
    match = re.compile('https://|http://')
    if re.search(match, s):
        try:
            http = httplib2.Http()
            headers, content = http.request(s)  # httplib2 returns (response headers, body)
            page = BeautifulSoup(content, 'lxml')
            return page
        except Exception as e:
            logger.info('Exception ' + str(e) + ' while fetching ' + s)
            return None
    else:
        return None
def get_list_of_urls(url):
    # Pattern for absolute http(s)/www URLs
    match = re.compile(r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
    soup = make_soup(url)
    l = set()
    try:
        for a in soup.find_all('a'):
            try:
                # Keep links that look like URLs, stay under the base url, carry no query
                # string, and are not the page itself
                if '?' not in a['href'] and re.search(match, a['href']) and re.search(re.escape(url), a['href']) and a['href'] != url:
                    l.add(str(a['href']))
            except Exception as e:
                logger.info('Exception ' + str(a) + ' has no href')
                logger.info(e)
                continue
    except Exception as e:
        logger.info('Exception ' + url + ' has no links')
        logger.info(e)
    return l
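# Example (hypothetical URLs): get_list_of_urls('https://example.com') could return
# {'https://example.com/about', 'https://example.com/blog'} -- only same-site links
# without query strings, different from the page itself, are kept.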
def get_all_the_urls(base, list_of_urls, depth):
    logger.info(depth)
    if depth == 10:  # hard recursion limit
        return
    depth = depth + 1
    for i in list_of_urls:  # scan the list of urls
        s = get_list_of_urls(i)
        get_all_the_urls(base, s, depth)
        for j in s:  # scan the sublinks
            try:
                if j in parsed:
                    continue
                soup = make_soup(j)  # may be None; the except below absorbs that
                logger.info('url is ' + j)
                for k in mapping[base]:  # look for keywords on the webpage
                    key_count = len(soup(text=re.compile(k, re.IGNORECASE)))
                    logger.info('Key count is ' + str(key_count))
                    if key_count > 0:
                        record = DATA(parent_link=base, sub_link=j, keyword=k, count=key_count)
                        record.save()
                parsed.add(j)
                logger.info('saved data successfully ' + str(key_count))
            except Exception as e:
                logger.info('Exception ' + str(e) + ' in keywords searching')
                continue
def populate_db():
    # Seed the crawl with the base URL, then recurse and log the total running time
    k = set()
    k.add(args.url)
    temp = time.time()
    logger.info(str(datetime.now()))
    get_all_the_urls(args.url, k, 0)
    logger.info('time taken ' + str(time.time() - temp))

populate_db()
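# Example invocation (hypothetical file name and argument values):
#   python crawler.py --url https://example.com --keywords python,scraping
# Keywords are comma-separated; matches are written to the DATA table and progress to crawler.log.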