#!/usr/bin/env python
# -*- coding: utf-8 -*-
import httplib, urllib, json, time

pull_count = 50               ## Number of hits returned per pull
offset = 0                    ## Pagination offset passed to the HTTP GET
num_paginations = 10          ## Maximum number of paginations
local_counter = 1             ## Helps write commas to the json file for all but the last run
timer_counter = 1             ## Makes the script wait after every 5 pulls
dump_file = 'BingDump.json'   ## Name of the local file the results are written to
api_domain = 'api.cognitive.microsoft.com'
query = 'Bill Gates'
user_agent = 'Mozilla/5.0 (MAC OSX, Educational Usage Only)'
x_search = '199.99.99.99'

# Request headers, open connection, open output file
headers = {
    'Ocp-Apim-Subscription-Key': 'MYSUBSCRIPTIONKEY',
    'User-Agent': user_agent,
    'X-Search-ClientIP': x_search,
}
conn = httplib.HTTPSConnection(api_domain)
fhand = open(dump_file, 'w')
# Function to build the query string for the API pull
def scraper():
    params = urllib.urlencode({
        'q': query,
        'count': str(pull_count),
        'offset': str(offset),
        'mkt': 'en-us',
        'safesearch': 'Moderate',
        'responseFilter': 'webpages',  # controls whether the pull scrapes web/image/news etc.
    })
    return params
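# For illustration only (not part of the original pull): with the defaults
# above, scraper() returns a query string along these lines; the parameter
# order may vary because urlencode walks a plain dict:
#   q=Bill+Gates&count=50&offset=0&mkt=en-us&safesearch=Moderate&responseFilter=webpages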
# Function to wait 4 seconds after every 5 pulls
def holdup(entry):
    if entry != 5:
        entry += 1
    else:
        entry = 1
        time.sleep(4)
    return entry
# Function that issues the HTTP GET and writes the response to the JSON file
def getwrite(entry1, entry2):
    conn.request("GET", "/bing/v5.0/search?%s" % entry1, "{body}", entry2)
    response = conn.getresponse()
    data = response.read()
    json_data = json.loads(data)
    fhand.write(json.dumps(json_data, indent=4))
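# A more defensive variant of getwrite (a sketch, not in the original script):
# if the service answers with a non-200 status (for example a bad subscription
# key or throttling), json.loads may fail or the file may silently collect
# error bodies, so the status could be checked before writing:
#
#   def getwrite_checked(entry1, entry2):
#       conn.request("GET", "/bing/v5.0/search?%s" % entry1, "{body}", entry2)
#       response = conn.getresponse()
#       data = response.read()
#       if response.status != 200:
#           raise Exception("HTTP %d %s" % (response.status, response.reason))
#       fhand.write(json.dumps(json.loads(data), indent=4))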
# Main code - pulls data iteratively and writes each pagination to the JSON file
fhand.write('{')
for i in range(num_paginations):
    dict_load = '"' + str(local_counter) + '" : '
    fhand.write(dict_load)
    try:
        link_params = scraper()
        print('Retrieving: ' + api_domain + '/bing/v5.0/search?' + link_params)
        getwrite(link_params, headers)
    except Exception as e:
        print('Error: {0}'.format(e))
        fhand.write('"Error. Could not pull data"')
    offset += pull_count
    if local_counter != num_paginations:
        fhand.write(', ')
    local_counter += 1
    timer_counter = holdup(timer_counter)
fhand.write('}')
fhand.close()
conn.close()
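# Optional sanity check (an addition, not in the original paste): each pull is
# written as a quoted key followed by a json.dumps() blob, so the finished file
# should itself be valid JSON and can be reloaded, e.g.:
#
#   with open(dump_file) as fh:
#       pulls = json.load(fh)
#   print('Paginations written: %d' % len(pulls))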