Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def setTypes_function(df):
    """Cast the numeric-looking feature columns ``v1`` .. ``v131`` to double.

    A column is cast to ``DoubleType`` only when the numeric pattern never
    produces an empty-string extraction for it, i.e. no row holds a value
    without a numeric part. Columns with at least one such row are left
    unchanged.

    Args:
        df: Spark DataFrame containing columns named ``v1`` through ``v131``.

    Returns:
        The DataFrame with every qualifying ``v*`` column cast to double.
    """
    # Raw string: keeps ``\d`` and ``\.`` as regex escapes instead of relying
    # on Python passing unknown string escapes through unchanged.
    number_pattern = r'(\d+\.?\d*)'

    for i in range(1, 132):
        name = 'v' + str(i)
        extracted = regexp_extract(name, number_pattern, 1)

        # regexp_extract returns '' when the pattern does not match, so one
        # matching row is enough to rule the column out.  Probing inside Spark
        # avoids collecting the entire column onto the driver.
        # NOTE(review): null extractions are expected to be dropped by the
        # filter, mirroring the original Python-side ``== ''`` check where
        # None never compared equal to '' -- confirm on real data.
        non_numeric_row = df.filter(extracted == '').first()

        if non_numeric_row is None:
            df = df.withColumn(name, df[name].cast(DoubleType()))

    return df
# DataFrame holding the BNP data, registered as the "bnp1" view.
tab1 = spark.read.csv("/tmp/bnptrain.csv", header=True)
tab1.createOrReplaceTempView("bnp1")

# Convert the v* columns and keep the converted frame persisted.
realTab = setTypes_function(tab1).persist()

# "target" and "ID" are cast to integers, one column at a time.
for int_column in ("target", "ID"):
    realTab = realTab.withColumn(
        int_column, realTab[int_column].cast(IntegerType())
    )

# Re-register the view so "bnp1" now refers to the typed frame.
realTab.createOrReplaceTempView("bnp1")

# Build a (name, type) table from the frame's dtypes; the schema is borrowed
# from a throwaway query that selects two string literals.
schema = spark.sql("select 'va' as name, 'av' as type").schema
types = spark.createDataFrame(data=realTab.dtypes, schema=schema)
types.createOrReplaceTempView("types")

# getStats_function is defined outside this excerpt; its result is exposed
# as the "bnp_final" view for the join below.
final = getStats_function("bnp1", types)
final.createOrReplaceTempView("bnp_final")

# Attach each column's type to its statistics, ordered by bnp_final.order.
stats = spark.sql(
    "select bnp_final.name, types.type, empty_count, min, max, avg, stddev, "
    "value, value_count "
    "from bnp_final join types on bnp_final.name=types.name "
    "order by bnp_final.order"
)

# coalesce(1) reduces the result to one partition before writing the CSV.
stats.coalesce(1).write.csv("/tmp/bnp_stats.csv", header=True)
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement