Untitled

a guest
Feb 28th, 2017
import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def crawl_request_files_blocking(filelist, csvoutdir):
    # Base project settings shared by every crawl; the per-file output and
    # log paths are filled in inside the loop.
    s = get_project_settings()
    s['FEED_FORMAT'] = 'csv'
    s['LOG_LEVEL'] = 'DEBUG'

    for f in filelist:
        file_id = extractid(f)  # external helper that derives an id from the requests file
        csvoutfile = os.path.join(csvoutdir, file_id + '_urls.csv')
        logfile = os.path.join(csvoutdir, file_id + '.log')

        stngs = s.copy()
        stngs['FEED_URI'] = csvoutfile
        stngs['LOG_FILE'] = logfile

        crawlprocess = CrawlerProcess(stngs)
        print('Started crawl process {} for {}'.format(crawlprocess, f))
        crawlprocess.crawl('googlescrape', requests_file=f)
        crawlprocess.start()  # blocking call; returns once the crawl has finished
        # Caveat: Twisted's reactor cannot be restarted in the same process, so a
        # second loop iteration raises ReactorNotRestartable (see the sketch below).

    print('CRAWL DONE')  # if you reached this point, all crawl jobs are done
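
Because Twisted's reactor cannot be restarted within one process, the loop above only works for the first file. Below is a minimal sketch of one workaround: run each CrawlerProcess in its own child process via multiprocessing, so every crawl gets a fresh reactor. It assumes the same 'googlescrape' spider and extractid() helper as above, and that the Scrapy project settings are loadable from the child process.

import os
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def _run_single_crawl(requests_file, csvoutfile, logfile):
    # Runs in a child process, so this crawl gets its own Twisted reactor.
    settings = get_project_settings()
    settings['FEED_FORMAT'] = 'csv'
    settings['FEED_URI'] = csvoutfile
    settings['LOG_LEVEL'] = 'DEBUG'
    settings['LOG_FILE'] = logfile

    process = CrawlerProcess(settings)
    process.crawl('googlescrape', requests_file=requests_file)
    process.start()  # blocks until this crawl is finished


def crawl_request_files_blocking(filelist, csvoutdir):
    for f in filelist:
        file_id = extractid(f)  # external helper, as in the original snippet
        csvoutfile = os.path.join(csvoutdir, file_id + '_urls.csv')
        logfile = os.path.join(csvoutdir, file_id + '.log')

        worker = Process(target=_run_single_crawl, args=(f, csvoutfile, logfile))
        worker.start()
        worker.join()  # blocking: wait for this crawl before starting the next

    print('CRAWL DONE')  # all crawl jobs are done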