SHARE
TWEET

MakMan Google Scrapper

a guest Oct 19th, 2016 488 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. """MakMan Google Scrapper
  2.  
  3. Usage:
  4.  makman_scrapy.py <search> <pages> <processes>
  5.  makman_scrapy.py (-h | --help)
  6.  
  7. Arguments:
  8.  <search>        String to be Searched
  9.  <pages>         Number of pages
  10.  <processes>     Number of parallel processes
  11.  
  12. Options:
  13.  -h, --help     Show this screen.
  14.  
  15. """
  16.  
  17. import requests, re, sys
  18. from docopt import docopt
  19. from bs4    import BeautifulSoup
  20. from time   import time as timer
  21. from functools import partial
  22. from multiprocessing import Pool
  23.  
  24. def get_urls(search_string, start):
  25.     temp = []
  26.     url = 'http://www.google.com/search'
  27.     payload = { 'q' : search_string, 'start' : start }
  28.     my_headers = { 'User-agent' : 'Mozilla/11.0' }
  29.     r = requests.get( url, params = payload, headers = my_headers )
  30.     soup = BeautifulSoup( r.text, 'html.parser' )
  31.     h3tags = soup.find_all( 'h3', class_='r' )
  32.     for h3 in h3tags:
  33.         try:
  34.             temp.append( re.search('url\?q=(.+?)\&sa', h3.a['href']).group(1) )
  35.         except:
  36.             continue
  37.     return temp
  38.  
  39. def main():
  40.     start = timer()
  41.     result = []
  42.     arguments = docopt( __doc__, version='MakMan Google Scrapper' )
  43.     search = arguments['<search>']
  44.     pages = arguments['<pages>']
  45.     processes = int( arguments['<processes>'] )
  46.     make_request = partial( get_urls, search )
  47.     pagelist = [ str(x*10) for x in range( 0, int(pages) ) ]
  48.     with Pool(processes) as p:
  49.         tmp = p.map(make_request, pagelist)
  50.     for x in tmp:
  51.         result.extend(x)
  52.     result = list( set( result ) )
  53.     print( *result, sep = '\n' )
  54.     print( '\nTotal URLs Scraped : %s ' % str( len( result ) ) )
  55.     print( 'Script Execution Time : %s ' % ( timer() - start, ) )
  56.  
  57. if __name__ == '__main__':
  58.     main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top