Scriptomania_

YAGS by Scriptomania (Yet Another Google Scraper)

Aug 25th, 2011
298
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.61 KB | None | 0 0
  1. ###################################################
  2. #YAGS by Scriptomania (Yet Another Google Scraper)#
  3. ###################################################
  4. # A very simple Google scraping script. Gets around
  5. # 1k results/keyword using the scroogle.org proxy.
  6. # Call & string limitations untested, report
  7. # back with some findings!
  8. #
  9. #
  10. # Version 1.0
  11. # Current limitations:
  12. # - limited search calls per hour...
  13.  
  14. from BeautifulSoup import BeautifulSoup, SoupStrainer
  15. import httplib, urllib, string
  16.  
  17. #Define search terms as well as filename
  18. key = raw_input("Search term: ")
  19. if key == "":
  20.     exit()
  21. file = raw_input("Save results to filename (without extension): ")
  22. if file == "":
  23.     exit()
  24.  
  25. #Main scraping routine
  26. p = 0
  27. while p <> 10:
  28.     if p == 0:
  29.         Rs = urllib.urlencode({"Gw":key, "n":1})
  30.     else:
  31.         Rs = urllib.urlencode({"Gw":key, "n":1, "z":p})
  32.     Head = {"Content-Type":"application/x-www-form-urlencoded",
  33.             "Accept":"text/plain"}
  34.     hOpen = httplib.HTTPConnection("scroogle.org")
  35.     hOpen.request("POST", "/cgi-bin/nbbw.cgi", Rs, Head)
  36.     try:
  37.         hResponse = hOpen.getresponse()
  38.         f = open(file + ".txt", "a")
  39.         for link in BeautifulSoup(hResponse.read(), parseOnlyThese=SoupStrainer('a')):
  40.             if link.has_key('href') and string.find(link['href'], "scroogle") == -1:
  41.                 print link['href']
  42.                 f.write(link['href'] + "\n")
  43.         p = p + 1
  44.         f.close()
  45.     except httplib.CannotSendRequest, e:
  46.         pass
  47.     except httplib.BadStatusLine, e:
  48.         pass
  49.     except httplib.HTTPException, e:
  50.         pass
Add Comment
Please, Sign In to add comment