- # withColumn helps you create a new column based on the values of an existing one
- from pyspark.sql.functions import round
- df = df.withColumn('percentagescaleddays', round(((df['DAYSONMARKET'] - min_days) / (max_days - min_days)) * 100))
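- # A sketch of how min_days / max_days used above could be computed (assumes df and the DAYSONMARKET column exist);
- # min/max are aliased on import so they don't shadow Python's builtins:
- from pyspark.sql.functions import min as sql_min, max as sql_max
- min_days = df.agg(sql_min('DAYSONMARKET')).collect()[0][0]
- max_days = df.agg(sql_max('DAYSONMARKET')).collect()[0][0]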
- # returns a list of (column name, data type) tuples
- df.dtypes
- # Drop columns in list
- cols_to_drop = ['LOTSIZEDIMENSION', 'LISTTYPE']
- df = df.drop(*cols_to_drop)
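- # The next snippet references a yes_values list; an illustrative definition (the real values depend on the data):
- yes_values = ['Yes w/ Qualifying', 'Yes w/No Qualifying']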
- # where is used to filter rows that pass the specified condition
- text_filter = ~df['ASSUMABLEMORTGAGE'].isin(yes_values) | df['ASSUMABLEMORTGAGE'].isNull()
- df = df.where(text_filter)
- # import of aggregate functions
- from pyspark.sql.functions import mean, stddev
- # with agg we can apply an aggregate function; collect()[0][0] retrieves the computed scalar
- mean_val = df.agg({'log_SalesClosePrice': 'mean'}).collect()[0][0]
- stddev_val = df.agg({'log_SalesClosePrice': 'stddev'}).collect()[0][0]
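- # Equivalent using the column functions imported above instead of the dict form:
- row = df.agg(mean('log_SalesClosePrice'), stddev('log_SalesClosePrice')).collect()[0]
- mean_val, stddev_val = row[0], row[1]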
- # Joins (the join condition can be a single expression or a list of column equalities)
- condition = [walk_df['latitude'] == df['latitude'], walk_df['longitude'] == df['longitude']]
- df_1.join(df_2, condition, 'type_in_string') # join type as a string: 'inner', 'left', 'outer', ...
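- # Concrete example using the compound condition above (df and walk_df as before):
- joined_walk_df = df.join(walk_df, condition, 'left')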
- # register df as a temporary SQL view
- df.createOrReplaceTempView('df')
- # run sql query
- spark.sql('query in string')
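- # For example, querying the view registered above (DAYSONMARKET is an assumed column):
- result = spark.sql('SELECT * FROM df WHERE DAYSONMARKET > 30')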
- # we can cast a column to a type given as a string; cast returns a new Column, so attach it with withColumn
- walk_df = walk_df.withColumn('latitude', walk_df['latitude'].cast('double'))
- # we can convert a string column to a Spark date using to_date
- from pyspark.sql.functions import to_date
- df = df.withColumn('date', to_date(df['LISTDATE']))
- # pyspark.sql.functions also provides the rest of the date helpers (year, month, dayofmonth, datediff, ...)
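- # For example, extracting the listing year from the date column created above:
- from pyspark.sql.functions import year
- df = df.withColumn('list_year', year(df['date']))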
- ## Using lag and a window you can create a column that holds the value from a previous row
- from pyspark.sql.functions import lag, datediff, to_date
- from pyspark.sql.window import Window
- # Cast data type
- mort_df = mort_df.withColumn('DATE', to_date(mort_df['DATE']))
- # Create an ordered window (no partitioning, ordered by date)
- w = Window.orderBy(mort_df['DATE'])
- # Create lag column: the value of DATE from the previous row (offset given positionally, which works across Spark versions)
- mort_df = mort_df.withColumn('DATE-1', lag('DATE', 1).over(w))
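- # e.g. use datediff (imported above) to measure the days elapsed between consecutive rows:
- mort_df = mort_df.withColumn('days_between', datediff(mort_df['DATE'], mort_df['DATE-1']))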
- # use the like function to find a pattern in a string
- has_attached_garage = df['GARAGEDESCRIPTION'].like('%Attached Garage%')
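- # has_detached_garage is used in the next snippet but never defined in these notes; a plausible companion definition:
- has_detached_garage = df['GARAGEDESCRIPTION'].like('%Detached Garage%')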
- # for conditionals use when and otherwise
- from pyspark.sql.functions import when
- df = df.withColumn('has_attached_garage', (when(has_attached_garage, 1)
-                                            .when(has_detached_garage, 0)
-                                            .otherwise(None)))
- ##############################################
- from pyspark.sql.functions import coalesce, first
- # Pivot: one new column per distinct value of ex_garage_list
- piv_df = ex_df.groupBy('NO').pivot('ex_garage_list').agg(coalesce(first('constant_val')))
- # Join the dataframes together and fill null
- joined_df = df.join(piv_df, on='NO', how='left')
- # Columns to zero fill
- zfill_cols = piv_df.columns
- # Zero fill the pivoted values
- zfilled_df = joined_df.fillna(0, subset=zfill_cols)
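- # Note: piv_df.columns includes the join key 'NO'; fillna(0, ...) on it is harmless since keys are non-null,
- # but it can be excluded explicitly: zfill_cols = [c for c in piv_df.columns if c != 'NO']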
- ##############################################
- # Lasso model: elasticNetParam=1 (α = 1) selects pure L1 regularization, regParam sets its strength (λ)
- from pyspark.ml.regression import LinearRegression
- regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1)
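- # Sketch of fitting and predicting (train_df/test_df with the default 'features' vector column are assumed):
- model = regression.fit(train_df)
- predictions = model.transform(test_df)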
- # Pipelines
- from pyspark.ml import Pipeline
- pipeline = Pipeline(stages=[indexer, onehot, assemble, regression])
- pipeline_model = pipeline.fit(train_df) # fit() runs every stage in order and returns a PipelineModel
- pipeline_model.stages[2] # access an individual fitted stage by index (here the assembler)
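- # The fitted PipelineModel can then be applied to new data in one call (test_df is assumed):
- predictions = pipeline_model.transform(test_df)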