Not a member of Pastebin yet? Sign up — it unlocks many cool features!
"""Group event values by key with PySpark and collect the sorted result.

Reads ``events.json``, maps each row to ``(row[1], [row[0]])``, concatenates
the value lists per key, sorts, and collects to the driver.
"""

# Import the required libraries.
# (Duplicate imports of Pipeline / CrossValidator / ParamGridBuilder removed;
# several imports are currently unused but kept in case the rest of the
# original script relied on them.)
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.util import MLUtils
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.mllib.fpm import *
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession.
# BUG FIX: the original chain stopped at .config(...), which leaves `spark`
# as a SparkSession.Builder, not a session — .getOrCreate() is required
# before spark.read can be used.
spark = (
    SparkSession.builder
    .appName("Python Spark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the event data.
# BUG FIX: the original used pd.read_json(), which returns a pandas
# DataFrame; the pipeline below calls df.rdd, an attribute that exists only
# on a Spark DataFrame. Read with Spark instead.
df = spark.read.json("events.json")

# Group column 0's values by column 1, then sort and collect to the driver.
# NOTE(review): the sortBy key sorts each value list with key=lambda x: x[1],
# i.e. it indexes into each collected value — this assumes the values are
# themselves indexable records; confirm against the schema of events.json.
df = (
    df.rdd
    .map(lambda row: (row[1], [row[0]]))   # (key, [value]) pairs
    .reduceByKey(lambda a, b: a + b)        # concatenate value lists per key
    .sortBy(lambda kv: (kv[0], sorted(kv[1], key=lambda x: x[1], reverse=True)))
    .collect()                              # materialize on the driver
)
Add Comment
Please sign in to add a comment.