import re
import time
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import os
import httplib2
#import Links
#import Keywords
import MySQLdb
import peewee
from peewee import *
from datetime import datetime
import argparse
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.DEBUG)
#ch = logging.StreamHandler()
#ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)
fh.setFormatter(formatter)
logger.addHandler(fh)

parser = argparse.ArgumentParser()

parser.add_argument('-l', '--url', help="The base link to be crawled", required=True)
parser.add_argument('-k', '--keywords', help="Keywords to search", required=True)

args = parser.parse_args()

keywords = (args.keywords).split(',')

mapping = dict()

mapping[args.url] = keywords

logger.info(mapping)
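# Example invocation (script name, URL, and keywords are hypothetical):
#   python crawler.py -l https://example.com -k python,scraping
# which would set mapping = {'https://example.com': ['python', 'scraping']}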

db = MySQLDatabase('WebSpider', user='ruut', passwd='ruut')

parsed = set()

class DATA(peewee.Model):
    parent_link = peewee.CharField()
    sub_link = peewee.CharField()
    keyword = peewee.CharField()
    count = peewee.IntegerField()

    class Meta:
        database = db
        db_table = 'DATA'
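
# The script assumes the 'WebSpider' MySQL database and the 'DATA' table already
# exist. A minimal sketch for creating the table once with peewee would be:
#   db.connect()
#   db.create_tables([DATA], safe=True)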

def make_soup(s):
    # Only absolute http(s) URLs are fetched; anything else yields None
    match = re.compile('https://|http://')
    if re.search(match, s):
        try:
            http = httplib2.Http()
            # httplib2 returns (response headers, body); the body is parsed with lxml
            headers, content = http.request(s)
            page = BeautifulSoup(content, 'lxml')
            return page
        except Exception:
            return None
    else:
        return None
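# e.g. make_soup('https://example.com') (hypothetical URL) returns a parsed
# BeautifulSoup page on success, or None when the URL is not http(s) or the request fails.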

def get_list_of_urls(url):
    # URL pattern with escapes restored: '\.' matches a literal dot and '[^\s]' excludes whitespace
    match = re.compile(r'(https?://(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?://(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
    soup = make_soup(url)
    l = set()
    try:
        for a in soup.find_all('a'):
            try:
                # keep absolute links that stay under the base url and carry no query string
                if '?' not in a['href'] and re.search(match, a['href']) and re.search(re.compile(url), a['href']) and a['href'] != url:
                    l.add(str(a['href']))
            except Exception as e:
                logger.info('Exception ' + str(a) + ' has no href')
                logger.info(e)
                continue
    except Exception as e:
        logger.info('Exception ' + url + ' has no links')
        logger.info(e)
        pass
    return l
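# For example, with url='https://example.com' (hypothetical), a link such as
# 'https://example.com/docs' is kept, while external links, links containing a
# query string, and the base URL itself are skipped.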

def get_all_the_urls(base, list_of_urls, depth):
    logger.info(depth)
    if depth == 10:  # hard recursion limit
        return
    else:
        depth = depth + 1
        for i in list_of_urls:  # scan the list of urls
            s = get_list_of_urls(i)
            get_all_the_urls(base, s, depth)
            for j in s:  # scan the sublinks
                try:
                    if j in parsed:
                        continue
                    soup = make_soup(j)
                    logger.info('url is ' + j)
                    for k in mapping[base]:  # look for keys on the webpage
                        key_count = len(soup(text=re.compile(k, re.IGNORECASE)))
                        logger.info('Key count is ' + str(key_count))
                        if key_count > 0:
                            record = DATA(parent_link=base, sub_link=j, keyword=k, count=key_count)
                            record.save()
                    parsed.add(j)
                    logger.info('saved data successfully ' + str(key_count))
                except Exception as e:
                    logger.info('Exception ' + str(e) + ' in keywords searching')
                    continue

def populate_db():
    k = set()
    k.add(args.url)
    temp = time.time()
    logger.info(str(datetime.now()))
    get_all_the_urls(args.url, k, 0)
    logger.info('time taken ' + str(time.time() - temp))

populate_db()
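
# After a run, the stored rows can be inspected with a peewee query
# (a sketch, assuming the 'DATA' table has been populated):
#   for row in DATA.select():
#       print(row.parent_link, row.sub_link, row.keyword, row.count)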