Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Explore the top-level "data" array of a JSON file with PySpark:
# read it, explode the array into rows, and aggregate values per key.
#
# findspark must point at the local Spark installation *before* any
# pyspark import is attempted.
import findspark
findspark.init('/home/patryk/Pobrane/spark-2.1.1-bin-hadoop2.7')

import json

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import col, udf
from pyspark.sql.types import *

# CONVERTING THE JSON FILE INTO A FORMAT SUPPORTED BY PYSPARK
# (the original file was pretty-printed inside a list; Spark's JSON reader
# expects one object per line).  Kept commented for reference:
# with open('dataMay-31-2017.json') as jsonfile:
#     js = json.load(jsonfile)
#
# buff = dict()
# # write a new file with one object per line
# with open("flattened.json", 'w') as outfile:
#     for d in js:
#         buff[d] = js[d]
#     json.dump(buff, outfile)
#     outfile.write('\n')

# BUG FIX: the original called SparkContext() unconditionally and then
# SparkContext.getOrCreate(); constructing a second context raises a
# ValueError, so only getOrCreate() is used here.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# One row per element of the top-level "data" array.
to_sort = sqlContext.read.json("flattened.json")
selected = to_sort.select("*", f.explode("data").alias("exploded_data"))
selected = selected.select("exploded_data")
selected.show()

# createOrReplaceTempView is the Spark 2.x replacement for the
# deprecated registerTempTable (same behaviour).
selected.createOrReplaceTempView("selected")

# BUG FIX: the pasted original was garbled ("reduce / B / yKey") and,
# once reassembled, still broken: it mapped each row to a *set*
# {key, value} and passed a one-argument lambda that printed and
# returned None to reduceByKey.  reduceByKey requires (key, value)
# tuples and a two-argument combiner.
# NOTE(review): the intended aggregation is not recoverable from the
# paste; summing values per key below is an assumption — confirm
# against the original author's intent.
same_wartosci = (
    sqlContext.sql("select exploded_data from selected")
    .rdd
    .map(lambda line: (line[0][0], float(line[0][1])))
    .reduceByKey(lambda a, b: a + b)
)

# Commented-out exploration: repeatedly strip the minimum until empty.
# print(same_wartosci.collect())
# while same_wartosci.collect():
#     minimum = same_wartosci.min()
#     same_wartosci = same_wartosci.filter(lambda line: minimum != line)
#     print(len(same_wartosci.collect()))
#     print(minimum)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement