emr-etl-runner config.yml

aws:
  # Credentials can be hardcoded or set in environment variables
  access_key_id: #####
  secret_access_key: ####
  s3:
    region: us-east-1
    buckets:
      assets: s3n://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
      jsonpath_assets: s3://snowplow-hosted-assets # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
      log: s3n://aff-snow/snow-log
      raw:
        in:
          - s3n://aff-snow                        # e.g. s3://my-in-bucket
        processing: s3n://aff-snow/snow-processing
        archive: s3n://aff-snow/snow-archive      # e.g. s3://my-archive-bucket/in
      enriched:
        good: s3n://aff-snow/enriched/good        # e.g. s3://my-out-bucket/enriched/good
        bad: s3n://aff-snow/enriched/bad          # e.g. s3://my-out-bucket/enriched/bad
        errors: s3n://aff-snow/enriched/errors    # Leave blank unless continue_on_unexpected_error: set to true below
        archive: s3n://aff-snow/enriched/archive  # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
      shredded:
        good: s3n://aff-snow/shredded/good        # e.g. s3://my-out-bucket/shredded/good
        bad: s3n://aff-snow/shredded/bad          # e.g. s3://my-out-bucket/shredded/bad
        errors: s3n://aff-snow/shredded/errors    # Leave blank unless continue_on_unexpected_error: set to true below
        archive: s3n://aff-snow/shredded/archive  # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
  emr:
    ami_version: 4.5.0                # Don't change this
    region: us-east-1                 # Always set this
    jobflow_role: EMR_EC2_DefaultRole # Created using $ aws emr create-default-roles
    service_role: EMR_DefaultRole     # Created using $ aws emr create-default-roles
    placement:                        # Set this if not running in VPC. Leave blank otherwise
    ec2_subnet_id: subnet-f9b69ed1    # Set this if running in VPC. Leave blank otherwise
    ec2_key_name: snowplow
    bootstrap: []                     # Set this to specify custom bootstrap actions. Leave empty otherwise
    software:
      hbase:                 # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
      lingual:               # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
    # Adjust your Hadoop cluster below
    jobflow:
      master_instance_type: m1.medium
      core_instance_count: 1
      core_instance_type: m1.medium
      core_instance_ebs:     # Optional. Attach an EBS volume to each core instance.
        volume_size: 100     # Gigabytes
        volume_type: "gp2"
        volume_iops: 400     # Optional. Will only be used if volume_type is "io1"
        ebs_optimized: false # Optional. Will default to true
      task_instance_count: 0 # Increase to use spot instances
      task_instance_type: m1.medium
      task_instance_bid: 0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
    bootstrap_failure_tries: 2 # Number of times to attempt the job in the event of bootstrap failures
    additional_info:         # Optional JSON string for selecting additional features
collectors:
  format: thrift # 'thrift' for Thrift records, 'clj-tomcat' for the Clojure Collector, or 'tsv/com.amazon.aws.cloudfront/wd_access_log' for CloudFront access logs
enrich:
  job_name: johnsnow # Give your job a name
  versions:
    hadoop_enrich: 1.8.0 # Version of the Hadoop Enrichment process
    hadoop_shred: 0.10.0 # Version of the Hadoop Shredding process
    hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
  continue_on_unexpected_error: false # Set to 'true' (and set out_errors: above) if you don't want any exceptions thrown from ETL
  output_compression: GZIP # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
  download:
    folder: # Postgres-only config option. Where to store the downloaded files. Leave blank for Redshift
  targets:
    - name: "johnsnow"
      type: redshift
      host: ###### # The endpoint as shown in the Redshift console
      database: ##### # Name of database
      port: #### # Default Redshift port
      table: atomic.events
      username: #####
      password: #####
      maxerror: 3 # Stop loading on first error, or increase to permit more load errors
      comprows: 200000 # Default for a 1 XL node cluster. Not used unless --include compupdate specified
      ssl_mode: disable
monitoring:
  tags: {} # Name-value pairs describing this job
  logging:
    level: DEBUG # You can optionally switch to INFO for production
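
This file is what EmrEtlRunner reads via its --config flag. A minimal sketch of how it might be invoked with the 2017-era (pre-R90) binaries, assuming an iglu_resolver.json sits next to this file and that the separate StorageLoader step loads the Redshift target defined above (file names and paths here are illustrative, not taken from the paste):

$ ./snowplow-emr-etl-runner --config config.yml --resolver iglu_resolver.json
$ ./snowplow-storage-loader --config config.yml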