Advertisement
Guest User

Untitled

a guest
Mar 21st, 2019
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.25 KB | None | 0 0
  1. import data.academyjan2019g.tokenizer.services.Tokenizer
  2. import data.academyjan2019g.tokenizer.services.parsers.TokenizerSchemaStringParser
  3. import org.apache.spark.sql.SparkSession
  4.  
  5. /*import scala.util.Random
  6.  
  7. val alpha = "0123456789"
  8. val size = alpha.size
  9.  
  10. def randStr(n:Int) = {
  11. (1 to n).map(x => alpha(Random.nextInt.abs % size)).mkString
  12. }
  13.  
  14. val filepath = "/Users/szymonk/Desktop/Projects/data.academyjan2019g.tokenizer/src/main/resources/"
  15. val file = new File(filepath + "write.txt")
  16. file.createNewFile()
  17. val writer = new PrintWriter(file)
  18.  
  19. for (i <- 1 until 10000000) {
  20. writer.write(randStr(19) + "\n")
  21. }
  22.  
  23. writer.close()*/
  24.  
  25.  
  // Benchmark script: runs every line of a text file through the tokenizer
  // on a local Spark session and prints the row count and the elapsed time.
  val sparkSession = SparkSession.builder
    .master("local")
    .appName("Tokenizer Batch Job")
    .getOrCreate()

  // Keep Spark's own logging out of the benchmark output.
  sparkSession.sparkContext.setLogLevel("ERROR")

  // Brings Spark's implicit encoders into scope for the `map` below.
  import sparkSession.implicits._

  // NOTE(review): absolute, machine-specific path; the file is expected to hold
  // one input value per line (see the commented-out generator above).
  val input = sparkSession
    .read
    .text("/Users/szymonk/Desktop/Projects/data.academyjan2019g.tokenizer/src/main/resources/write.txt")

  val t0 = System.nanoTime()
  val schema = TokenizerSchemaStringParser.parse("6-guid-2")
  // `read.text` yields single-column rows, so `mkString` returns the raw line.
  val output = input.map(x => Tokenizer.applySchema(x.mkString, schema))
  // `map` is lazy; `count()` is the action that actually runs the tokenizer
  // over the whole file, so it has to sit inside the timed region.
  println(output.count())
  val t1 = System.nanoTime()

  // Divide as Double: the previous Long division truncated to whole seconds,
  // so any sub-second run was reported as "0s".
  val elapsedSeconds = (t1 - t0) / 1e9
  println(f"Time: $elapsedSeconds%.3fs")

  // Release the local Spark context once the measurement is done.
  sparkSession.stop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement