###################################################
#YAGS by Scriptomania (Yet Another Google Scraper)#
###################################################
# A very simple Google scraping script. Gets around
# 1k results/keyword using the scroogle.org proxy.
# Call & string limitations are untested; report
# back with any findings!
#
#
# Version 1.0
# Current limitations:
# - limited search calls per hour...
from BeautifulSoup import BeautifulSoup, SoupStrainer
import httplib
import urllib

# Define the search term as well as the output filename
key = raw_input("Search term: ")
if key == "":
    exit()
fname = raw_input("Save results to filename (without extension): ")
if fname == "":
    exit()

# Main scraping routine: walk the first ten result pages
p = 0
while p != 10:
    # The first page takes no "z" offset parameter; later pages do
    if p == 0:
        Rs = urllib.urlencode({"Gw": key, "n": 1})
    else:
        Rs = urllib.urlencode({"Gw": key, "n": 1, "z": p})
    Head = {"Content-Type": "application/x-www-form-urlencoded",
            "Accept": "text/plain"}
    hOpen = httplib.HTTPConnection("scroogle.org")
    try:
        hOpen.request("POST", "/cgi-bin/nbbw.cgi", Rs, Head)
        hResponse = hOpen.getresponse()
        f = open(fname + ".txt", "a")
        # Parse anchor tags only and keep every outbound link
        soup = BeautifulSoup(hResponse.read(),
                             parseOnlyThese=SoupStrainer('a'))
        for link in soup:
            if link.has_key('href') and link['href'].find("scroogle") == -1:
                print link['href']
                f.write(link['href'] + "\n")
        f.close()
    except httplib.HTTPException:
        # CannotSendRequest and BadStatusLine are both subclasses of
        # HTTPException, so one handler covers them; skip this page
        pass
    finally:
        hOpen.close()
    # Increment unconditionally so a failed page cannot loop forever
    p = p + 1
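
# ---------------------------------------------------
# Optional: a minimal Python 3 sketch of the same POST
# flow, using only the standard library. This is an
# illustrative port, not part of the original script:
# it assumes the historical scroogle.org endpoint and
# its Gw/n/z form parameters behaved as described
# above (the service is no longer online), and the
# LinkCollector helper is our own name, not an API.
import urllib.request
import urllib.parse
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    # Collect href values from every <a> tag, skipping
    # scroogle's own navigation links
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and "scroogle" not in value:
                    self.links.append(value)

def fetch_page(key, page):
    # The first result page takes no "z" offset and later
    # ones do, mirroring the loop in the script above
    params = {"Gw": key, "n": 1}
    if page > 0:
        params["z"] = page
    data = urllib.parse.urlencode(params).encode("ascii")
    req = urllib.request.Request(
        "http://scroogle.org/cgi-bin/nbbw.cgi",
        data=data,
        headers={"Content-Type": "application/x-www-form-urlencoded",
                 "Accept": "text/plain"})
    with urllib.request.urlopen(req) as resp:
        parser = LinkCollector()
        parser.feed(resp.read().decode("utf-8", "replace"))
        return parser.links

# Usage: links = fetch_page("some search term", 0)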