#!/usr/bin/env python
# -*- coding: utf-8 -*-
import httplib, urllib, json, time

pull_count = 50              ## Number of results requested per pull
offset = 0                   ## Pagination offset passed in the HTTP GET
num_paginations = 10         ## Maximum number of paginated pulls
local_counter = 1            ## Helps write commas to the JSON file for all but the last run
timer_counter = 1            ## Used to make the script wait after every 5 pulls
dump_file = 'BingDump.json'  ## Local file the results are written to
api_domain = 'api.cognitive.microsoft.com'
query = 'Bill Gates'
user_agent = 'Mozilla/5.0 (MAC OSX, Educational Usage Only)'
x_search = '199.99.99.99'

# Request headers; open the HTTPS connection and the output dump file
headers = {
    'Ocp-Apim-Subscription-Key': 'MYSUBSCRIPTIONKEY',
    'User-Agent': user_agent,
    'X-Search-ClientIP': x_search,
}
conn = httplib.HTTPSConnection(api_domain)
fhand = open(dump_file, 'w')

# Build the URL-encoded parameter string for an API pull
def scraper():
    params = urllib.urlencode({
        'q': query,
        'count': str(pull_count),
        'offset': str(offset),
        'mkt': 'en-us',
        'safesearch': 'Moderate',
        'responseFilter': 'webpages',  # Controls whether the pull scrapes web/image/news etc.
    })
    return params

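# For illustration, with the defaults above the first pull (offset 0) produces
# a parameter string like the one below; urlencode() walks a plain dict, so
# the parameter order may vary:
#   q=Bill+Gates&count=50&offset=0&mkt=en-us&safesearch=Moderate&responseFilter=webpages
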
# Pace the pulls: wait 4 seconds after every 5 pulls
def holdup(entry):
    if entry != 5:
        entry += 1
    else:
        entry = 1
        time.sleep(4)
    return entry

# Issue the HTTP GET and write the response data to the JSON file
def getwrite(entry1, entry2):
    # A GET carries no body, so pass the headers by keyword
    conn.request("GET", "/bing/v5.0/search?%s" % entry1, headers=entry2)
    response = conn.getresponse()
    data = response.read()
    json_data = json.loads(data)
    fhand.write(json.dumps(json_data, indent=4))

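# For reference, a successful v5.0 web search response has roughly this shape
# (abridged; the field names are assumed from the Bing Web Search API, not
# taken from the original paste):
# {
#     "_type": "SearchResponse",
#     "webPages": {
#         "totalEstimatedMatches": ...,
#         "value": [ { "name": ..., "url": ..., "snippet": ... }, ... ]
#     }
# }
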
# Main code - pulls data iteratively and writes it to the JSON file
fhand.write('{')

for i in range(num_paginations):

    # Key each pull by its ordinal so the dump stays a single JSON object
    dict_load = '"' + str(local_counter) + '" : '
    fhand.write(dict_load)
    try:
        link_params = scraper()
        print('Retrieving: ' + api_domain + '/bing/v5.0/search?' + link_params)
        getwrite(link_params, headers)
    except Exception as e:
        # Not every exception carries errno/strerror, so print the exception itself
        print('Error: {0}'.format(e))
        fhand.write('"Error. Could not pull data"')
    offset += pull_count
    if local_counter != num_paginations: fhand.write(', ')
    local_counter += 1
    timer_counter = holdup(timer_counter)

fhand.write('}')
fhand.close()
conn.close()
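
# ----------------------------------------------------------------------
# A minimal sketch of reading the dump back, assuming the run above
# completed: BingDump.json then maps the keys "1".."10" either to a Bing
# v5.0 response (results under webPages -> value) or to the error string
# written in the except branch.
import json

with open('BingDump.json') as fh:
    dump = json.load(fh)

for page_num in sorted(dump, key=int):
    page = dump[page_num]
    if isinstance(page, dict) and 'webPages' in page:
        for hit in page['webPages']['value']:
            print(hit['name'] + ' -> ' + hit['url'])
    else:
        print('Pull ' + page_num + ' returned no web pages')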