Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def crawl_request_files_blocking(filelist, csvoutdir):
    """Run a blocking Scrapy crawl of the 'googlescrape' spider for each requests file.

    For each file in *filelist*, scraped results are exported as CSV to
    ``<csvoutdir>/<id>_urls.csv`` and the crawl log is written to
    ``<csvoutdir>/<id>.log``, where ``<id>`` comes from ``extractid``.

    Parameters:
        filelist: iterable of paths to requests files, one crawl per file.
        csvoutdir: directory receiving the per-file CSV feed and log file.

    NOTE(review): ``CrawlerProcess.start()`` starts the Twisted reactor,
    which cannot be restarted in the same Python process; with more than
    one entry in *filelist* the second ``start()`` call is expected to
    raise ``ReactorNotRestartable``. If multiple files must be crawled in
    one run, each crawl should go in its own subprocess (or use
    ``CrawlerRunner``) — confirm against the callers of this function.
    """
    settings = get_project_settings()
    settings['FEED_FORMAT'] = 'csv'
    settings['LOG_LEVEL'] = 'DEBUG'
    for requests_file in filelist:
        # 'file_id' rather than 'id' so the builtin id() is not shadowed.
        file_id = extractid(requests_file)
        csvoutfile = os.path.join(csvoutdir, file_id + '_urls.csv')
        logfile = os.path.join(csvoutdir, file_id + '.log')
        # Copy so the per-file feed/log settings don't leak into later iterations.
        stngs = settings.copy()
        stngs['FEED_URI'] = csvoutfile
        stngs['LOG_FILE'] = logfile
        crawlprocess = CrawlerProcess(stngs)
        print('Started crawl process {} for {}'.format(crawlprocess, requests_file))
        crawlprocess.crawl('googlescrape', requests_file=requests_file)
        crawlprocess.start()  # blocking call
        crawlprocess.join()
    print('CRAWL DONE')  # If you reached this point, all crawl jobs are done
Advertisement
Add Comment
Please sign in to add a comment
Advertisement