Advertisement
Guest User

MakMan Google Scrapper

a guest
Oct 19th, 2016
660
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.72 KB | None | 0 0
  1. """MakMan Google Scrapper
  2.  
  3. Usage:
  4.  makman_scrapy.py <search> <pages> <processes>
  5.  makman_scrapy.py (-h | --help)
  6.  
  7. Arguments:
  8.  <search>        String to be Searched
  9.  <pages>         Number of pages
  10.  <processes>     Number of parallel processes
  11.  
  12. Options:
  13.  -h, --help     Show this screen.
  14.  
  15. """
  16.  
  17. import requests, re, sys
  18. from docopt import docopt
  19. from bs4    import BeautifulSoup
  20. from time   import time as timer
  21. from functools import partial
  22. from multiprocessing import Pool
  23.  
  24. def get_urls(search_string, start):
  25.     temp = []
  26.     url = 'http://www.google.com/search'
  27.     payload = { 'q' : search_string, 'start' : start }
  28.     my_headers = { 'User-agent' : 'Mozilla/11.0' }
  29.     r = requests.get( url, params = payload, headers = my_headers )
  30.     soup = BeautifulSoup( r.text, 'html.parser' )
  31.     h3tags = soup.find_all( 'h3', class_='r' )
  32.     for h3 in h3tags:
  33.         try:
  34.             temp.append( re.search('url\?q=(.+?)\&sa', h3.a['href']).group(1) )
  35.         except:
  36.             continue
  37.     return temp
  38.  
  39. def main():
  40.     start = timer()
  41.     result = []
  42.     arguments = docopt( __doc__, version='MakMan Google Scrapper' )
  43.     search = arguments['<search>']
  44.     pages = arguments['<pages>']
  45.     processes = int( arguments['<processes>'] )
  46.     make_request = partial( get_urls, search )
  47.     pagelist = [ str(x*10) for x in range( 0, int(pages) ) ]
  48.     with Pool(processes) as p:
  49.         tmp = p.map(make_request, pagelist)
  50.     for x in tmp:
  51.         result.extend(x)
  52.     result = list( set( result ) )
  53.     print( *result, sep = '\n' )
  54.     print( '\nTotal URLs Scraped : %s ' % str( len( result ) ) )
  55.     print( 'Script Execution Time : %s ' % ( timer() - start, ) )
  56.  
  57. if __name__ == '__main__':
  58.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement