Advertisement
Guest User

Untitled

a guest
Jan 12th, 2018
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.41 KB | None | 0 0
from __future__ import print_function

import os
import sys
import time

import mechanize
from mechanize import Browser

  4. LOGIN_URL = 'http://www.example.com/login'
  5. USERNAME = 'DavidMertz'
  6. PASSWORD = 'TheSpanishInquisition'
  7. SEARCH_URL = 'http://www.example.com/search?'
  8. FIXED_QUERY = 'food=spam&' 'utensil=spork&' 'date=the_future&'
  9. VARIABLE_QUERY = ['actor=%s' % actor for actor in
  10. ('Graham Chapman',
  11. 'John Cleese',
  12. 'Terry Gilliam',
  13. 'Eric Idle',
  14. 'Terry Jones',
  15. 'Michael Palin')]
  16.  
  17. def fetch():
  18. result_no = 0 # Number the output files
  19. br = Browser() # Create a browser
  20. br.open(LOGIN_URL) # Open the login page
  21. br.select_form(name="login") # Find the login form
  22. br['username'] = USERNAME # Set the form values
  23. br['password'] = PASSWORD
  24. resp = br.submit() # Submit the form
  25.  
  26. # Automatic redirect sometimes fails, follow manually when needed
  27. if 'Redirecting' in br.title():
  28. resp = br.follow_link(text_regex='click here')
  29.  
  30. # Loop through the searches, keeping fixed query parameters
  31. for actor in in VARIABLE_QUERY:
  32. # I like to watch what's happening in the console
  33. print >> sys.stderr, '***', actor
  34. # Lets do the actual query now
  35. br.open(SEARCH_URL + FIXED_QUERY + actor)
  36. # The query actually gives us links to the content pages we like,
  37. # but there are some other links on the page that we ignore
  38. nice_links = [l for l in br.links()
  39. if 'good_path' in l.url
  40. and 'credential' in l.url]
  41. if not nice_links: # Maybe the relevant results are empty
  42. break
  43. for link in nice_links:
  44. try:
  45. response = br.follow_link(link)
  46. # More console reporting on title of followed link page
  47. print >> sys.stderr, br.title()
  48. # Increment output filenames, open and write the file
  49. result_no += 1
  50. out = open(result_%04d' % result_no, 'w')
  51. print >> out, response.read()
  52. out.close()
  53. # Nothing ever goes perfectly, ignore if we do not get page
  54. except mechanize._response.httperror_seek_wrapper:
  55. print >> sys.stderr, "Response error (probably 404)"
  56. # Let's not hammer the site too much between fetches
  57. time.sleep(1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement