Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import org.apache.spark.SparkContext
/**
 * Merges a directory tree of small text files into a single text dataset.
 *
 * Each matched file becomes one record `(id, text)`: `id` is derived from the last three
 * segments of the file's path and `text` is the file content collapsed onto one line.
 *
 * Usage: `Converter [inputPath [outputPath]]`. Both arguments are optional and default to
 * [[inputPath]] and [[outputPath]].
 *
 * The entry point is an explicit `main` rather than `extends App`: Spark's documentation
 * warns that subclasses of `scala.App` may not work correctly.
 */
object Converter {

  /** Default input directory glob, used when no first argument is supplied. */
  val inputPath: String = "/Users/xme2734/Downloads/maildir/allen-p/*/"

  /** Default output directory, used when no second argument is supplied. */
  val outputPath: String = "output/clean-data"

  // A line break together with any whitespace surrounding it.
  private val LineBreak = """\s*\n\s*"""

  /**
   * Builds a record id from the trailing `/`-separated segments of `path`, each wrapped in
   * square brackets, e.g. `x/a/b/c` => `[a][b][c]`.
   *
   * At most three segments are used; a shorter path yields fewer bracketed parts instead of
   * failing with an index error.
   *
   * @param path file path as reported by Spark for the record
   * @return the concatenated bracketed segments
   */
  def recordId(path: String): String =
    path.split("[/]").takeRight(3).map(segment => s"[$segment]").mkString

  /**
   * Trims `text` and replaces every line break, plus the whitespace around it, with a
   * single space so the whole content fits on one line.
   *
   * @param text raw file content
   * @return the single-line form of `text`
   */
  def flatten(text: String): String =
    text.trim.replaceAll(LineBreak, " ")

  /**
   * Runs the conversion on a local Spark context.
   *
   * @param args optional `inputPath` followed by optional `outputPath`
   */
  def main(args: Array[String]): Unit = {
    val source = args.headOption.getOrElse(inputPath)
    val target = args.drop(1).headOption.getOrElse(outputPath)

    val sc = new SparkContext("local[*]", "Clean Data ")
    try {
      // One (path, content) pair per file whose name matches the glob suffix.
      val fileContents = sc
        .wholeTextFiles(source + "[0-9]*.")
        .map { case (path, text) => (recordId(path), flatten(text)) }

      println(s"Saving to $target")
      fileContents.saveAsTextFile(target)

      // Blocks until something is readable on stdin before shutting down.
      // NOTE(review): presumably here to keep the context (and its UI) alive for
      // inspection after the job finishes - confirm before removing.
      Console.in.read()
    } finally {
      // Always release the context, even if the job above fails.
      sc.stop()
    }
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement