{
  "objects": [
    {
      "myComment": "This object is used to set default configuration for objects in the pipeline.",

      "id": "Default",
      "name": "Default",
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "scheduleType": "cron",
      "pipelineLogUri": "#{myLogUri}"
    },
    {
      "myComment": "The DynamoDB table from which we need to export data.",

      "id": "DynamoDBInputDataNode",
      "name": "DynamoDB",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "myComment": "The S3 path to which we export the data.",

      "id": "S3StagingDataNode",
      "name": "S3StagingDataNode",
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}/",
      "dataFormat": {
        "ref": "S3StagingDataFormat"
      },
      "type": "S3DataNode"
    },
    {
      "myComment": "Format for the S3 Path",

      "id": "S3StagingDataFormat",
      "name": "DefaultDataFormat1",
      "column": "not_used STRING",
      "type": "CSV"
    },
    {
      "myComment": "Format for the DynamoDB table",

      "id": "DDBExportFormat",
      "name": "DDBExportFormat",
      "column": "not_used STRING",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "myComment": "Activity used to run the hive script to export data to CSV",

      "id": "TableBackupStagingActivity",
      "name": "TableBackupStagingActivity",
      "input": {
        "ref": "DynamoDBInputDataNode"
      },
      "output": {
        "ref": "S3StagingDataNode"
      },
      "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myS3SourceColMapping})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n\nCREATE EXTERNAL TABLE s3TempTable (#{myS3TargetColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'\nLOCATION '#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}';\n\nINSERT OVERWRITE TABLE s3TempTable SELECT #{myHiveSelectColumns} FROM tempHiveTable;",
      "runsOn": { "ref": "EmrCluster1" },
      "type": "HiveActivity"
    },
    {
      "myComment": "This object is used to specify the copy activity for moving data from S3 to Redshift.",

      "id": "RedshiftLoadActivity",
      "name": "RedshiftLoadActivity",
      "input": {
        "ref": "S3StagingDataNode"
      },
      "output": {
        "ref": "RedshiftCluster1"
      },
      "dependsOn": {
        "ref": "TableBackupStagingActivity"
      },
      "runsOn": {
        "ref": "EmrCluster1"
      },
      "type": "RedshiftCopyActivity",
      "insertMode": "TRUNCATE"
    },
    {
      "myComment": "This object is used to control the task schedule.",

      "id": "DefaultSchedule",
      "name": "RunOnce",
      "occurrences": "1",
      "period": "1 Day",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "myComment": "This object provides connection information for the Redshift cluster.",

      "id": "RedshiftDatabase1",
      "name": "RedshiftDatabase1",
      "connectionString": "jdbc:postgresql://#{myRedshiftEndpoint}:5439/dev",
      "type": "RedshiftDatabase",
      "username": "#{myRedshiftUsername}",
      "*password": "#{*myRedshiftPassword}"
    },
    {
      "myComment": "This object provides the configuration for the EMR cluster.",

      "id": "EmrCluster1",
      "name": "EmrCluster1",
      "enableDebugging": "true",
      "coreInstanceCount": "3",
      "coreInstanceType": "m3.xlarge",
      "releaseLabel": "emr-4.4.0",
      "masterInstanceType": "m3.xlarge",
      "type": "EmrCluster",
      "terminateAfter": "1 Week",
      "applications": "hive"
    },
    {
      "myComment": "This object describes the target table in Redshift.",

      "id": "RedshiftCluster1",
      "name": "RedshiftCluster1",
      "createTableSql": "#{myRedshiftCreateTableQuery}",
      "database": {
        "ref": "RedshiftDatabase1"
      },
      "primaryKeys": ["#{myRedshiftPrimaryKeys}"],
      "type": "RedshiftDataNode",
      "tableName": "#{myRedshiftTable}"
    }
  ],
  "parameters": [
    {
      "description": "S3 directory to which pipeline logs will be pushed.",
      "id": "myLogUri",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "The name of the source table in DynamoDB.",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "description": "S3 directory where staging data will be stored.",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "The mapping between the projected columns in the source table in Hive and their data types.",
      "id": "myS3SourceColMapping",
      "type": "String"
    },
    {
      "description": "The mapping between the columns in the table in DynamoDB and the columns referenced in the Hive query for data export.",
      "id": "myDDBTableColMapping",
      "type": "String"
    },
    {
      "description": "The mapping between the projected columns in the target table in Hive and their data types.",
      "id": "myS3TargetColMapping",
      "type": "String"
    },
    {
      "description": "The list of columns in the Hive SELECT query.",
      "id": "myHiveSelectColumns",
      "type": "String"
    },
    {
      "description": "The SQL query used to create the target table in Redshift.",
      "id": "myRedshiftCreateTableQuery",
      "type": "String"
    },
    {
      "description": "The target Redshift table.",
      "id": "myRedshiftTable",
      "type": "String"
    },
    {
      "description": "One or more columns that make up the primary key of the target table.",
      "id": "myRedshiftPrimaryKeys",
      "type": "String"
    },
    {
      "description": "The username to use for connecting to the Redshift cluster.",
      "id": "myRedshiftUsername",
      "type": "String"
    },
    {
      "description": "The password for the above user, used to establish a connection to the Redshift cluster.",
      "id": "*myRedshiftPassword",
      "type": "String"
    },
    {
      "description": "The endpoint for the Redshift cluster.",
      "id": "myRedshiftEndpoint",
      "type": "String"
    }
  ]
}
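
To use this definition, register it with AWS Data Pipeline and supply values for the parameters listed above. The following is a minimal AWS CLI sketch, not part of the original paste: the pipeline name, the example pipeline id (df-0123456789ABCDEF), the file name pipeline.json, and every parameter value (bucket, table name, endpoint) are placeholders you would replace with your own.

# Create an empty pipeline and note the pipeline id it returns (e.g. df-0123456789ABCDEF).
aws datapipeline create-pipeline --name ddb-to-redshift --unique-id ddb-to-redshift-token

# Upload this definition (saved as pipeline.json) along with example parameter values;
# the remaining my* parameters (column mappings, Redshift table, credentials) must be
# supplied the same way.
aws datapipeline put-pipeline-definition \
    --pipeline-id df-0123456789ABCDEF \
    --pipeline-definition file://pipeline.json \
    --parameter-values \
        myLogUri=s3://my-bucket/logs/ \
        myDDBTableName=MyTable \
        myOutputS3Loc=s3://my-bucket/staging \
        myRedshiftEndpoint=my-cluster.abc123.us-east-1.redshift.amazonaws.com

# Activate the pipeline once the definition validates without errors.
aws datapipeline activate-pipeline --pipeline-id df-0123456789ABCDEF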