import findspark
findspark.init("/usr/local/spark/")
findspark.find()

import pyspark
from pyspark.sql import SQLContext


def create_sc():
    # Local master with dynamic executor allocation; dynamic allocation
    # also requires the external shuffle service to be enabled.
    sc_conf = pyspark.SparkConf().setMaster("local")
    sc_conf.setAppName("Anna")
    sc_conf.set("spark.dynamicAllocation.enabled", "true")
    sc_conf.set("spark.shuffle.service.enabled", "true")
    sc_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc_conf.set("spark.executor.memory", "32g")
    sc_conf.set("spark.sql.parquet.binaryAsString", "true")
    sc_conf.set("spark.sql.execution.arrow.enabled", "true")
    sc_conf.set("spark.dynamicAllocation.minExecutors", "5")
    sc_conf.set("spark.dynamicAllocation.maxExecutors", "30")
    sc_conf.set("spark.dynamicAllocation.initialExecutors", "10")
    sc_conf.set("spark.rpc.message.maxSize", "1024")
    print(sc_conf.getAll())

    # Only one SparkContext may be active per JVM: stop any existing
    # context before creating a new one. NameError means sc was never
    # defined, so there is nothing to stop yet.
    global sc
    try:
        if sc is not None:
            sc.stop()
    except NameError:
        pass
    return pyspark.SparkContext(conf=sc_conf)


def dir_exist(sc, path):
    # Check via the JVM-side Hadoop FileSystem API that `path` exists
    # and is a directory.
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    jpath = sc._jvm.org.apache.hadoop.fs.Path(path)
    return fs.exists(jpath) and fs.isDirectory(jpath)


sc = create_sc()
sqlContext = SQLContext(sc)
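
# For reference, a minimal usage sketch of how these helpers might be
# combined: guard a parquet read with dir_exist(), then convert a slice
# to pandas (the Arrow config above speeds up toPandas()). The path
# below is a hypothetical example, not part of the original paste.
data_path = "/data/events.parquet"  # hypothetical example path

if dir_exist(sc, data_path):
    df = sqlContext.read.parquet(data_path)
    df.printSchema()
    # spark.sql.execution.arrow.enabled=true accelerates this conversion
    pdf = df.limit(1000).toPandas()
else:
    print("No such directory: " + data_path)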