Advertisement
Guest User

Untitled

a guest
Oct 15th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.79 KB | None | 0 0
  1. import org.apache.spark.SparkContext
  2.  
  /**
   * Spark job that merges many small text files into a single text dataset.
   *
   * Every file matching `<input>[0-9]*.` is read whole, turned into an
   * `(id, text)` pair and written out with `saveAsTextFile`.
   *
   * Defined with an explicit `main` rather than `extends App`: the Spark
   * documentation warns that subclasses of `scala.App` may not work correctly.
   *
   * Usage: `Converter [inputPath [outputPath]]` -- both arguments are optional
   * and fall back to [[inputPath]] and [[outputPath]].
   */
  object Converter {

    /** Default glob of the directories to read; used when no first argument is given. */
    val inputPath: String = "/Users/xme2734/Downloads/maildir/allen-p/*/"

    /** Default output directory; used when no second argument is given. */
    val outputPath: String = "output/clean-data"

    /**
     * Builds a record id from the last three `/`-separated segments of `path`,
     * each wrapped in square brackets, e.g. `a/b/c/d` becomes `[b][c][d]`.
     *
     * A path with fewer than three segments yields fewer bracketed parts
     * instead of throwing an index-out-of-bounds exception.
     */
    private def recordId(path: String): String =
      path.split("[/]").takeRight(3).map(segment => s"[$segment]").mkString

    /**
     * Trims `text` and collapses every line break, together with the
     * whitespace around it, into a single space so the content fits on one line.
     */
    private def flatten(text: String): String =
      text.trim.replaceAll("""\s*\n\s*""", " ")

    /**
     * Runs the conversion on a local Spark context.
     *
     * @param args optional overrides: `args(0)` is the input path prefix,
     *             `args(1)` is the output directory
     */
    def main(args: Array[String]): Unit = {
      val source = args.lift(0).getOrElse(inputPath)
      val target = args.lift(1).getOrElse(outputPath)

      val sc = new SparkContext("local[*]", "Clean Data ")

      try {
        // Convert the data, which is split across several files, into one dataset.
        val fileContents = sc.wholeTextFiles(source + "[0-9]*.").map {
          case (path, text) => (recordId(path), flatten(text))
        }

        println(s"Saving to $target")
        fileContents.saveAsTextFile(target)

        // Blocks until something arrives on stdin before shutting down.
        // NOTE(review): presumably kept so the Spark UI stays reachable after
        // the job finishes -- confirm this is still wanted.
        Console.in.read()
      } finally {
        // Always release the Spark context, even if the job fails.
        sc.stop()
      }
    }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement