Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
Workaround for the error: `Exception: Java gateway process exited before sending the driver its port number`
# Build a Spark DataFrame from a large CSV by streaming it through pandas
# in 100k-row chunks, so the whole file never has to fit in driver memory
# at once.
from pyspark import SparkContext
from pyspark.sql import SQLContext

import pandas as pd

# Single source of truth for the input file (was hard-coded in two places).
CSV_PATH = "contour-export-2017-12-14.csv"
CHUNK_ROWS = 100000

sc = SparkContext('local', 'example')  # 'local' master: if using locally
# NOTE(review): SQLContext is deprecated in Spark 2.x+ in favor of
# SparkSession — confirm the target Spark version before modernizing.
sql_sc = SQLContext(sc)

Spark_Full = sc.emptyRDD()
headers = None  # filled from the first chunk's columns below

# Stream the CSV chunk by chunk and union each chunk into the RDD.
for chunk in pd.read_csv(CSV_PATH, chunksize=CHUNK_ROWS):
    if headers is None:
        # pandas already parsed the header row for this read; reuse it
        # instead of a second read_csv(..., nrows=0) pass over the file.
        headers = list(chunk.columns)
    # RDD.__add__ is union(): appends this chunk's rows to the RDD.
    Spark_Full += sc.parallelize(chunk.values.tolist())

YourSparkDataFrame = Spark_Full.toDF(headers)
# if you do not have headers in the csv, use instead:
# YourSparkDataFrame = Spark_Full.toDF()
YourSparkDataFrame.show()
Add Comment
Please sign in to add a comment.