Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import data.academyjan2019g.tokenizer.services.Tokenizer
- import data.academyjan2019g.tokenizer.services.parsers.TokenizerSchemaStringParser
- import org.apache.spark.sql.SparkSession
- /*import scala.util.Random
- val alpha = "0123456789"
- val size = alpha.size
- def randStr(n:Int) = {
- (1 to n).map(x => alpha(Random.nextInt.abs % size)).mkString
- }
- val filepath = "/Users/szymonk/Desktop/Projects/data.academyjan2019g.tokenizer/src/main/resources/"
- val file = new File(filepath + "write.txt")
- file.createNewFile()
- val writer = new PrintWriter(file)
- for (i <- 1 until 10000000) {
- writer.write(randStr(19) + "\n")
- }
- writer.close()*/
// Batch benchmark: reads a text file with Spark, applies the tokenizer schema
// to every line, counts the results and reports the elapsed wall-clock time.
val sparkSession = SparkSession.builder
  .master("local")
  .appName("Tokenizer Batch Job")
  .getOrCreate()

try {
  // Keep Spark's own logging quiet so the count and timing lines stay readable.
  sparkSession.sparkContext.setLogLevel("ERROR")

  // Brings the implicit encoders required by `Dataset.map` into scope.
  import sparkSession.implicits._

  val inputPath =
    "/Users/szymonk/Desktop/Projects/data.academyjan2019g.tokenizer/src/main/resources/write.txt"

  // `read.text` yields one row per input line (a single string column per the
  // Spark docs), so `row.mkString` below is just that line's text.
  val input = sparkSession.read.text(inputPath)

  val t0 = System.nanoTime()
  val schema = TokenizerSchemaStringParser.parse("6-guid-2")
  val output = input.map(row => Tokenizer.applySchema(row.mkString, schema))
  // `count()` is the action that actually forces the map above to execute.
  println(output.count())
  val t1 = System.nanoTime()

  // Report with millisecond resolution; plain integer division by 1e9 would
  // truncate every sub-second run to "0s".
  val elapsedMillis = (t1 - t0) / 1000000L
  println("Time: " + (elapsedMillis / 1000.0) + "s")
} finally {
  // Always release the Spark context, even if reading or tokenizing fails.
  sparkSession.stop()
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement