SHARE
TWEET

Untitled

a guest Dec 16th, 2018 109 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  def setTypes_function(df):
      """Cast the numeric-looking columns among ``v1`` .. ``v131`` to double.

      A column is cast to ``DoubleType`` only when the numeric pattern matches
      in every non-null row, i.e. when ``regexp_extract`` never yields an empty
      string for it. All other columns are left untouched.

      Args:
          df: Spark DataFrame that contains the columns ``v1`` .. ``v131``.

      Returns:
          A DataFrame with the same columns, where every qualifying ``v*``
          column has been cast to ``DoubleType``.
      """
      # Raw string: same runtime pattern as before, but the regex escapes are
      # no longer (invalid) Python string escapes.
      numeric_pattern = r'(\d+\.?\d*)'

      for i in range(1, 132):
          name = 'v' + str(i)
          extracted = regexp_extract(name, numeric_pattern, 1)

          # regexp_extract returns '' when the pattern does not match. Let
          # Spark look for such a row and stop at the first hit instead of
          # collecting the whole column to the driver. Null values compare as
          # null and are filtered out, so they do not block the cast (the
          # original `None == ''` check behaved the same way).
          has_unmatched_row = bool(df.filter(extracted == '').take(1))

          if not has_unmatched_row:
              df = df.withColumn(name, df[name].cast(DoubleType()))
      return df


  # DataFrame with the BNP data; the first CSV line is used as the header.
  tab1 = spark.read.csv("/tmp/bnptrain.csv", header=True)
  tab1.createOrReplaceTempView("bnp1")

  # Cast the v* columns where possible and mark the result for caching
  # (persist() returns the DataFrame it was called on).
  realTab = setTypes_function(tab1).persist()

  # "target" and "ID" are cast to integers, in that order.
  for _int_column in ("target", "ID"):
      realTab = realTab.withColumn(_int_column, realTab[_int_column].cast(IntegerType()))

  # Re-register "bnp1" so the view now points at the typed DataFrame.
  realTab.createOrReplaceTempView("bnp1")

  # Borrow a two-column (name, type) schema from a throwaway query and use it
  # to turn the (column name, dtype) pairs of realTab into a DataFrame.
  schema = spark.sql("select 'va' as name, 'av' as type").schema
  types = spark.createDataFrame(realTab.dtypes, schema)
  types.createOrReplaceTempView("types")

  # NOTE(review): getStats_function is not defined in this section; it is
  # presumably provided elsewhere - confirm before running.
  final = getStats_function('bnp1', types)
  final.createOrReplaceTempView("bnp_final")

  # Attach each column's type to its statistics row.
  stats = spark.sql(
      "select bnp_final.name, types.type, empty_count, min, max, avg, stddev, value, value_count "
      "from bnp_final join types on bnp_final.name=types.name "
      "order by bnp_final.order"
  )

  # coalesce(1) reduces the result to a single partition before the CSV write.
  stats.coalesce(1).write.csv("/tmp/bnp_stats.csv", header=True)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top