Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ====================================================================
# PullCsvFromS3
# Gobblin copy (distcp) job: pulls CSV files matching a glob pattern
# from an S3 bucket onto the local file system.
# Values written as <...> are placeholders and must be filled in
# before the job is run.
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from a directory S3 to our local system
# Default file system for the job: the local file system.
fs.uri=file:///
# Working directory; the staging, output, MR root, publish, metadata and
# S3A buffer directories below are all defined relative to it via ${work.dir}.
work.dir=/Users/tilak/gobblin/mopar-demo
writer.staging.dir=${work.dir}/taskStaging
writer.output.dir=${work.dir}/taskOutput
mr.job.root.dir=${work.dir}/working
# State store, kept in a local MySQL database (mopar_demo).
# NOTE(review): the database user and password are stored here in plain
# text; confirm this file is never committed or shared as-is.
state.store.enabled=true
state.store.type=mysql
state.store.db.jdbc.driver=com.mysql.jdbc.Driver
state.store.db.url=jdbc:mysql://localhost/mopar_demo
state.store.db.user=gobblin
state.store.db.password=gobblin
# Writer and publisher: files are written to the local file system and
# published under ${work.dir}/output.
writer.fs.uri=file:///
data.publisher.final.dir=${work.dir}/output
writer.builder.class=org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriterBuilder
data.publisher.type=org.apache.gobblin.data.management.copy.publisher.CopyDataPublisher
writer.destination.type=HDFS
data.publisher.fs.uri=${fs.uri}
data.publisher.metadata.output.dir=${work.dir}/metadata_out
# Source configuration: CopySource with a glob-based dataset finder.
source.class=org.apache.gobblin.data.management.copy.CopySource
gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.CopyableGlobDatasetFinder
# Glob selecting the files to copy.
# To copy from a particular directory instead, use e.g.
#   gobblin.dataset.pattern=some_folder/*.csv
gobblin.dataset.pattern=pricing.products_*.csv
gobblin.copy.recursive.update=true
fork.record.queue.capacity=1
# Source S3 configuration. Replace <bucket_name>, <s3-access-key> and
# <s3-secret-key> with real values.
# NOTE(review): the "encrypted" key prefix suggests these credentials may be
# supplied in encrypted form - confirm against Gobblin's password handling.
source.filebased.fs.uri=s3a://<bucket_name>
source.filebased.preserve.file.name=true
source.filebased.encrypted.fs.s3a.access.key=<s3-access-key>
source.filebased.encrypted.fs.s3a.secret.key=<s3-secret-key>
fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.buffer.dir=${work.dir}/buffer-dir
# NOTE(review): SSL is switched off for S3A connections, so traffic to S3
# is not encrypted in transit - confirm this is intentional.
fs.s3a.connection.ssl.enabled=false
# Converters: records are passed through unchanged.
converter.classes=org.apache.gobblin.converter.IdentityConverter
# ====================================================================
# Distcp configurations (do not change)
# ====================================================================
job.class=org.apache.gobblin.azkaban.AzkabanJobLauncher
extract.namespace=org.apache.gobblin.copy
distcp.persist.dir=/tmp/distcp-persist-dir
# Task and work-unit retries are disabled.
task.maxretries=0
workunit.retry.enabled=false
# Job history store: same MySQL database and credentials as the state store.
job.history.store.enabled=true
job.history.store.url=jdbc:mysql://localhost/mopar_demo
job.history.store.jdbc.driver=com.mysql.jdbc.Driver
job.history.store.user=gobblin
job.history.store.password=gobblin
# Other S3A settings.
# Multipart size in bytes (67108864 = 64 MiB). Must be greater than 5 MB,
# otherwise distcp will not work.
fs.s3a.multipart.size=67108864
Add Comment
Please, Sign In to add comment