Untitled

{
  "objects": [
    {
      "myComment": "This object is used to set default configuration for objects in the pipeline.",

      "id": "Default",
      "name": "Default",
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "scheduleType": "cron",
      "pipelineLogUri": "#{myLogUri}"
    },
    {
      "myComment" : "The DynamoDB table from which we need to export data from",

      "id": "DynamoDBInputDataNode",
      "name": "DynamoDB",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "myComment" : "The S3 path to which we export data to",

      "id":   "S3StagingDataNode",
      "name": "S3StagingDataNode",
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}/",
      "dataFormat": {
        "ref": "S3StagingDataFormat"
      },
      "type": "S3DataNode"
    },
    {
      "myComment" : "Format for the S3 Path",

      "id": "S3StagingDataFormat",
      "name": "DefaultDataFormat1",
      "column": "not_used STRING",
      "type": "CSV"
    },
    {
      "myComment" : "Format for the DynamoDB table",

      "id": "DDBExportFormat",
      "name": "DDBExportFormat",
      "column": "not_used STRING",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "myComment" : "Activity used to run the hive script to export data to CSV",

      "id": "TableBackupStagingActivity",
      "name": "TableBackupStagingActivity",
      "input": {
        "ref": "DynamoDBInputDataNode"
      },
      "output": {
        "ref": "S3StagingDataNode"
      },
      "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myS3SourceColMapping})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n\nCREATE EXTERNAL TABLE s3TempTable (#{myS3TargetColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'\nLOCATION '#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}';\n\nINSERT OVERWRITE TABLE s3TempTable SELECT #{myHiveSelectColumns} FROM tempHiveTable;",
      "runsOn": { "ref" : "EmrCluster1" },
      "type": "HiveActivity"
    },
    {
      "myComment": "This object is used to specify the copy activity for moving data from S3 to Redshift.",

      "id": "RedshiftLoadActivity",
      "name": "RedshiftLoadActivity",
      "input": {
        "ref": "S3StagingDataNode"
      },
      "output": {
        "ref": "RedshiftCluster1"
      },
      "dependsOn": {
        "ref": "TableBackupStagingActivity"
      },
      "runsOn": {
        "ref": "EmrCluster1"
      },
      "type": "RedshiftCopyActivity",
      "insertMode": "TRUNCATE"
    },
    {
      "myComment": "This object is used to control the task schedule.",

      "id": "DefaultSchedule",
      "name": "RunOnce",
      "occurrences": "1",
      "period": "1 Day",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "myComment": "This object provides connection information for the Redshift cluster.",

      "id": "RedshiftDatabase1",
      "name": "RedshiftDatabase1",
      "connectionString": "jdbc:postgresql://#{myRedshiftEndpoint}:5439/dev",
      "type": "RedshiftDatabase",
      "username": "#{myRedshiftUsername}",
      "*password": "#{*myRedshiftPassword}"
    },
    {
      "myComment": "This object provides the configuration for the EMR cluster.",

      "id": "EmrCluster1",
      "name": "EmrCluster1",
      "enableDebugging": "true",
      "coreInstanceCount": "3",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-4.4.0",
      "masterInstanceType": "m3.xlarge",
      "type": "EmrCluster",
      "terminateAfter": "1 Week",
      "applications": "hive"
    },
    {
      "myComment": "This object contains information about the Redshift database.",

      "id": "RedshiftCluster1",
      "name": "RedshiftCluster1",
      "createTableSql": "#{myRedshiftCreateTableQuery}",
      "database": {
        "ref": "RedshiftDatabase1"
      },
      "primaryKeys": ["#{myRedshiftPrimaryKeys}"],
      "type": "RedshiftDataNode",
      "tableName": "#{myRedshiftTable}"
    }
  ],
  "parameters": [
    {
      "description": "S3 directory where pipeline logs will be pushed to.",
      "id": "myLogUri",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "The name of the source table in DynamoDB.",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "description": "S3 directory where staging data will be stored.",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "The mapping between the projected columns in the source table in Hive and their data types.",
      "id": "myS3SourceColMapping",
      "type": "String"
    },
    {
      "description": "The mapping between the columns in the table in DynamoDB and the columns referenced in the Hive query for data export.",
      "id": "myDDBTableColMapping",
      "type": "String"
    },
    {
      "description": "The mapping between the projected columns in the target table in Hive and their data types.",
      "id": "myS3TargetColMapping",
      "type": "String"
    },
    {
      "description": "The list of columns in Hive SELECT query.",
      "id": "myHiveSelectColumns",
      "type": "String"
    },
    {
      "description": "The query for inserting data into the target table in Redshift.",
      "id": "myRedshiftCreateTableQuery",
      "type": "String"
    },
    {
      "description": "The target Redshift table.",
      "id": "myRedshiftTable",
      "type": "String"
    },
    {
      "description": "One or more columns that make up the primary key of the target table.",
      "id": "myRedshiftPrimaryKeys",
      "type": "String"
    },
    {
      "description": "The username to use for connecting to the Redshift cluster.",
      "id": "myRedshiftUsername",
      "type": "String"
    },
    {
      "description": "The password for the above user to establish connection to the Redshift cluster.",
      "id": "*myRedshiftPassword",
      "type": "String"
    },
    {
      "description": "The endpoint for the Redshift cluster.",
      "id": "myRedshiftEndpoint",
      "type": "String"
    }
  ]
}