Advertisement
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import os
import sys
import time

import mechanize
from mechanize import Browser
# Login page and the credentials that fetch() submits to its "login" form.
# NOTE(review): credentials are hard-coded in the source; fine for a sample,
# but real ones should come from the environment or a config file.
LOGIN_URL = 'http://www.example.com/login'
USERNAME = 'DavidMertz'
PASSWORD = 'TheSpanishInquisition'

# Search endpoint; it already ends in '?' so query fragments are appended
# directly by string concatenation.
SEARCH_URL = 'http://www.example.com/search?'

# Parameters sent with every search. Adjacent string literals are joined at
# compile time; each fragment ends in '&' so one more parameter can follow.
FIXED_QUERY = 'food=spam&' 'utensil=spork&' 'date=the_future&'

# One 'actor=<name>' fragment per search. The names are inserted verbatim
# (spaces included), not URL-encoded.
VARIABLE_QUERY = ['actor=%s' % actor for actor in
                  ('Graham Chapman',
                   'John Cleese',
                   'Terry Gilliam',
                   'Eric Idle',
                   'Terry Jones',
                   'Michael Palin')]
def fetch():
    """Log in, run one search per actor, and save each matching result page.

    Opens LOGIN_URL, fills in and submits the form named "login" with
    USERNAME and PASSWORD, then requests SEARCH_URL + FIXED_QUERY + <actor
    fragment> for every entry of VARIABLE_QUERY. On each results page, the
    links whose URL contains both 'good_path' and 'credential' are followed
    and the response body is written to 'result_NNNN' (zero-padded counter
    starting at 0001) in the current working directory.

    Progress and errors are reported on stderr. A link that fails with
    mechanize's HTTP error wrapper is reported and skipped. The function
    sleeps one second after every followed link.

    Returns:
        None.
    """
    result_no = 0                     # Number the output files
    br = Browser()                    # Create a browser
    br.open(LOGIN_URL)                # Open the login page
    br.select_form(name="login")      # Find the login form
    br['username'] = USERNAME         # Set the form values
    br['password'] = PASSWORD
    br.submit()                       # Submit the form

    # Automatic redirect sometimes fails, follow manually when needed.
    # title() can be None for a page without a <title>, hence the `or ''`.
    if 'Redirecting' in (br.title() or ''):
        br.follow_link(text_regex='click here')

    # Loop through the searches, keeping fixed query parameters
    for actor in VARIABLE_QUERY:
        # Show progress on the console
        sys.stderr.write('*** %s\n' % actor)
        # Run the actual query
        br.open(SEARCH_URL + FIXED_QUERY + actor)
        # The results page links to the content pages we want, mixed with
        # other links that are ignored
        nice_links = [link for link in br.links()
                      if 'good_path' in link.url
                      and 'credential' in link.url]
        if not nice_links:            # Maybe the relevant results are empty
            # NOTE(review): this abandons ALL remaining actors, not just the
            # current one; `continue` may have been intended -- confirm.
            break
        for link in nice_links:
            try:
                response = br.follow_link(link)
                # More console reporting on title of followed link page
                sys.stderr.write('%s\n' % br.title())
                # Increment the output file number, then fetch the body
                result_no += 1
                page = response.read()
            # Nothing ever goes perfectly, ignore if we do not get the page
            except mechanize._response.httperror_seek_wrapper:
                sys.stderr.write("Response error (probably 404)\n")
            else:
                # Binary mode stores the body as fetched; the `with` block
                # closes the file even if a write fails. The trailing
                # newline matches the original print-based output.
                with open('result_%04d' % result_no, 'wb') as out:
                    out.write(page)
                    out.write(b'\n')
            # Let's not hammer the site too much between fetches
            time.sleep(1)
Advertisement
Add Comment
Please sign in to add a comment
Advertisement