uopspop

Untitled

Dec 16th, 2019
148
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # my_sls_scraper/crawl.py
  2. import sys
  3. import imp
  4. import os
  5. import logging
  6. from urllib.parse import urlparse
  7.  
  8. from scrapy.spiderloader import SpiderLoader
  9. from scrapy.crawler import CrawlerProcess
  10. from scrapy.utils.project import get_project_settings
  11.  
  # Need to "mock" sqlite for the process to not crash in AWS Lambda / Amazon Linux.
  # Empty placeholder modules are registered in sys.modules for that purpose.
  # types.ModuleType(name) builds the same kind of empty module object that
  # imp.new_module(name) did; `imp` is deprecated and was removed in Python 3.12.
  # NOTE(review): the second entry is registered under "sqlite3.dbapi2" but the
  # module object itself is named "sqlite.dbapi2" -- kept as-is; confirm intent.
  import types

  sys.modules["sqlite"] = types.ModuleType("sqlite")
  sys.modules["sqlite3.dbapi2"] = types.ModuleType("sqlite.dbapi2")
  15.  
  16.  
  def is_in_aws():
      """Return True when the AWS_EXECUTION_ENV environment variable is set.

      Only the variable's presence is checked, so any value -- including an
      empty string -- counts as "set".
      """
      return "AWS_EXECUTION_ENV" in os.environ
  19.  
  20.  
  def crawl(settings=None, spider_name="header_spider", spider_kwargs=None):
      """Load a spider by name and run it in a Scrapy ``CrawlerProcess``.

      Args:
          settings: Optional mapping of Scrapy settings layered on top of the
              project settings. It is copied, so the caller's mapping is never
              modified. ``FEED_URI`` and ``FEED_FORMAT`` are always overridden
              here; ``HTTPCACHE_DIR`` is overridden when running in AWS.
          spider_name: Name passed to the project's ``SpiderLoader``.
          spider_kwargs: Optional keyword arguments forwarded to the spider.
              When it contains a non-empty ``start_urls``, the hostname of the
              first URL is used in the local feed file name; otherwise the
              spider class's own ``start_urls`` is used.

      Outside AWS the feed is written as JSON to a ``feed`` directory under
      the current working directory. In AWS the feed URI is left empty.
      """
      # Fresh objects per call: the former ``{}`` defaults were shared between
      # calls, and ``settings`` is written to below.
      settings = dict(settings or {})
      spider_kwargs = spider_kwargs or {}

      project_settings = get_project_settings()
      spider_cls = SpiderLoader(project_settings).load(spider_name)

      feed_uri = ""
      feed_format = "json"

      # Fallback keeps ``spider_key`` bound when no start URL can be resolved;
      # previously the failure was logged and the local branch below then died
      # with UnboundLocalError.
      spider_key = "unknown"
      try:
          start_urls = spider_kwargs.get("start_urls") or spider_cls.start_urls
          # ``hostname`` is None for URLs without a network location.
          spider_key = urlparse(start_urls[0]).hostname or spider_key
      except (AttributeError, LookupError, TypeError, ValueError):
          logging.exception("Spider or kwargs need start_urls.")

      if is_in_aws():
          # Lambda can only write to the /tmp folder.
          settings["HTTPCACHE_DIR"] = "/tmp"
      else:
          # %(name)s and %(time)s are left in the string unformatted here;
          # only the two {} fields are filled in by str.format.
          feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
              os.path.join(os.getcwd(), "feed"),
              spider_key,
          )

      settings["FEED_URI"] = feed_uri
      settings["FEED_FORMAT"] = feed_format

      # Caller-supplied settings take precedence over the project settings.
      process = CrawlerProcess({**project_settings, **settings})

      process.crawl(spider_cls, **spider_kwargs)
      process.start()
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×