Advertisement
Guest User

Untitled

a guest
Jun 25th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.94 KB | None | 0 0
  1. import sys
  2.  
  3. from pyspark.shell import sqlContext
  4.  
  5. sys.path.insert(0, '.')
  6. from pyspark import SparkContext, SparkConf
  7. from pyspark.sql.functions import explode
  8.  
  9.  
  10. def strip(line: str):
  11. if line[-1] == ',':
  12. return int(line[3:-1])
  13. else:
  14. return float(line[4:-1])
  15.  
  16.  
  17. def mapper(line):
  18. print(line)
  19. return line['col'][0]
  20.  
  21. if __name__ == "__main__":
  22. # conf = SparkConf().setAppName("airports").setMaster("local[*]")
  23. #
  24. # sc = SparkContext(conf = conf)
  25. #
  26. # json = sc.textFile("dataMay-31-2017.json")
  27. # jsonCol = json.filter(lambda line: '\t\t\t' in line)
  28. # jsonCol = jsonCol.map(strip)
  29. # print(jsonCol.collect())
  30.  
  31. df = sqlContext.read.json('dataMay-31-2017.json', multiLine=True)
  32. ndf = df.select(explode(df.data))
  33. data = []
  34. paralData = SparkContext.parallelize(data)
  35. pairTuples = ndf.rdd.map(lambda r:tuple(r.col))
  36. asd = pairTuples.min()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement