Advertisement
uopspop

Untitled

Dec 16th, 2019
253
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.54 KB | None | 0 0
  1. # my_sls_scraper/crawl.py
  2. import sys
  3. import imp
  4. import os
  5. import logging
  6. from urllib.parse import urlparse
  7.  
  8. from scrapy.spiderloader import SpiderLoader
  9. from scrapy.crawler import CrawlerProcess
  10. from scrapy.utils.project import get_project_settings
  11.  
  12. # Need to "mock" sqlite for the process to not crash in AWS Lambda / Amazon Linux
  13. sys.modules["sqlite"] = imp.new_module("sqlite")
  14. sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
  15.  
  16.  
  17. def is_in_aws():
  18.     return os.getenv('AWS_EXECUTION_ENV') is not None
  19.  
  20.  
  21. def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
  22.     project_settings = get_project_settings()
  23.     spider_loader = SpiderLoader(project_settings)
  24.  
  25.     spider_cls = spider_loader.load(spider_name)
  26.  
  27.     feed_uri = ""
  28.     feed_format = "json"
  29.  
  30.     try:
  31.         spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
  32.             "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
  33.     except Exception:
  34.         logging.exception("Spider or kwargs need start_urls.")
  35.  
  36.     if is_in_aws():
  37.         # Lambda can only write to the /tmp folder.
  38.         settings['HTTPCACHE_DIR'] =  "/tmp"
  39.     else:
  40.         feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
  41.             os.path.join(os.getcwd(), "feed"),
  42.             spider_key,
  43.         )
  44.  
  45.     settings['FEED_URI'] = feed_uri
  46.     settings['FEED_FORMAT'] = feed_format
  47.  
  48.     process = CrawlerProcess({**project_settings, **settings})
  49.  
  50.     process.crawl(spider_cls, **spider_kwargs)
  51.     process.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement