Advertisement
Guest User

Untitled

a guest
Aug 28th, 2015
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.62 KB | None | 0 0
  // ---------------------------------------------------------------------------
  // Spark-shell script: builds one RDD[String] holding every line of every S3
  // object found under a comma-separated list of s3://bucket/prefix paths.
  //
  // `sc` is not defined in this script; it is assumed to be the SparkContext
  // that spark-shell puts in scope -- TODO confirm for other launchers.
  // ---------------------------------------------------------------------------

  // Comma-separated S3 locations. Each entry is used as a key *prefix*, so an
  // entry may name a single object or a "directory" of objects.
  val s3Paths = "s3://yourbucket/path/to/file1.txt,s3://yourbucket/path/to/directory"
  // Maximum number of keys requested per S3 listing page.
  val pageLength = 100
  // NOTE(review): placeholder credentials. Real keys should not be committed in
  // source; consider a credentials provider instead of string literals.
  val key = "YOURKEY"
  val secret = "YOUR_SECRET"

  import com.amazonaws.services.s3._, model._
  import com.amazonaws.auth.BasicAWSCredentials
  import com.amazonaws.services.s3.model.ObjectListing
  import scala.collection.JavaConverters._
  import scala.io.Source
  import java.io.InputStream
  import org.apache.spark.rdd.RDD

  /** Builds a NEW S3 client on every call.
    *
    * Deliberately a `def`: presumably so that closures shipped to executors
    * construct their own client instead of trying to serialize one --
    * NOTE(review): confirm. Callers below take one local copy per unit of work
    * rather than invoking this per request.
    */
  def s3 = new AmazonS3Client(new BasicAWSCredentials(key, secret))

  /** Splits an `s3://bucket/prefix` path into `(bucket, prefix)`.
    *
    * @throws IllegalArgumentException if the path does not have that shape
    *         (previously this produced a null bucket that only failed later,
    *         inside the SDK call).
    */
  def parseS3Path(s3Path: String): (String, String) = {
    val S3Uri = """(?i)^s3://([^/]+)/(.*)""".r
    s3Path match {
      case S3Uri(bucket, prefix) => (bucket, prefix)
      case _ =>
        throw new IllegalArgumentException(
          s"Not a valid s3://bucket/prefix path: '$s3Path'")
    }
  }

  /** Returns the lines of `content`, closing the stream once the last line has
    * been consumed.
    *
    * `Iterator.++` takes its argument by name, so the `close()` below only runs
    * when the line iterator is exhausted. If a consumer abandons the iterator
    * early the stream is not closed by this helper.
    */
  def readLinesAndClose(content: InputStream): Iterator[String] = {
    val source = Source.fromInputStream(content)
    source.getLines() ++ { source.close(); Iterator.empty }
  }

  /** RDD of all lines of the given objects in `bucket`.
    *
    * One S3 client is created per partition (not per object), on whichever
    * node evaluates that partition.
    */
  def linesOfObjects(bucket: String, objectKeys: Seq[String]): RDD[String] =
    sc.parallelize(objectKeys).mapPartitions { keysInPartition =>
      val client = s3
      keysInPartition.flatMap { objectKey =>
        readLinesAndClose(client.getObject(bucket, objectKey).getObjectContent)
      }
    }

  /** Lists every object under `s3Path`, page by page, and returns one line-RDD
    * per non-empty listing page (in listing order).
    */
  def lineRddsForPath(s3Path: String): List[RDD[String]] = {
    val (bucket, prefix) = parseS3Path(s3Path)
    println(s"Processing s3 resource: bucket '$bucket', prefix '$prefix'")

    // Driver-side client; kept local so it is never referenced by an RDD closure.
    val client = s3
    val request = new ListObjectsRequest()
    request.setBucketName(bucket)
    request.setPrefix(prefix)
    request.setMaxKeys(pageLength)

    // Keep fetching the next batch until a page comes back with no summaries.
    val pages = Iterator
      .iterate(client.listObjects(request))(page => client.listNextBatchOfObjects(page))
      .takeWhile(page => !page.getObjectSummaries.isEmpty)

    pages.map { page =>
      linesOfObjects(bucket, page.getObjectSummaries.asScala.map(_.getKey).toList)
    }.toList
  }

  // Blank entries and surrounding whitespace in the path list are ignored.
  val pageLineRdds: List[RDD[String]] =
    s3Paths.split(",").map(_.trim).filter(_.nonEmpty).toList
      .flatMap(path => lineRddsForPath(path))

  // A single flat union instead of a chain of pairwise unions. When nothing was
  // listed this is an empty RDD rather than null. Kept as a `var` so existing
  // code that reassigns it continues to compile.
  var inputLinesRDD_raw: RDD[String] =
    if (pageLineRdds.isEmpty) sc.parallelize(Seq.empty[String])
    else sc.union(pageLineRdds)

  // TODO do something with inputLinesRDD_raw
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement