ArcheontPB

Spark commands

# withColumn helps you create a new column based on the values of an old one
from pyspark.sql.functions import round
df = df.withColumn('percentagescaleddays', round((df['DAYSONMARKET'] - min_days) / (max_days - min_days) * 100))

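# min_days and max_days above are plain scalars; a sketch (an assumption, not part of the
# original snippet) of how they could be computed with agg before that call:
min_days = df.agg({'DAYSONMARKET': 'min'}).collect()[0][0]
max_days = df.agg({'DAYSONMARKET': 'max'}).collect()[0][0]
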
# returns a list of (column name, data type) tuples
df.dtypes

# Drop columns given in a list
cols_to_drop = ['LOTSIZEDIMENSION', 'LISTTYPE']
df = df.drop(*cols_to_drop)

# where is used to keep only the rows that pass the specified condition
text_filter = ~df['ASSUMABLEMORTGAGE'].isin(yes_values) | df['ASSUMABLEMORTGAGE'].isNull()
df = df.where(text_filter)

# import of aggregate functions
from pyspark.sql.functions import mean, stddev

# with agg we can apply an aggregate function; collect()[0][0] retrieves the calculated value
mean_val = df.agg({'log_SalesClosePrice': 'mean'}).collect()[0][0]
stddev_val = df.agg({'log_SalesClosePrice': 'stddev'}).collect()[0][0]

# Joins (the condition can be a list of column-equality expressions;
# the third argument is the join type given as a string, e.g. 'inner' or 'left')
condition = [walk_df['latitude'] == df['latitude'], walk_df['longitude'] == df['longitude']]
df_1.join(df_2, condition, 'type_in_string')

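# For example (a hypothetical concrete call using the condition above):
joined = df.join(walk_df, on=condition, how='left')
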
# register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView('df')

# run a SQL query (the query is passed as a string)
spark.sql('query in string')

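# For example (a hypothetical query against the view registered above):
longest_listed = spark.sql('SELECT * FROM df ORDER BY DAYSONMARKET DESC LIMIT 10')
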
# we can cast a column to a type given as a string
walk_df['latitude'].cast('double')

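# cast only builds a Column expression; one way to apply it is to assign it back with withColumn:
walk_df = walk_df.withColumn('latitude', walk_df['latitude'].cast('double'))
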
# we can convert a column holding dates to a Spark date using to_date
df = df.withColumn('date', to_date(df['LISTDATE']))

# pyspark.sql.functions then provides the functions to manipulate dates, for example:
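# (a small sketch with hypothetical new column names, pulling parts out of the converted date)
from pyspark.sql.functions import year, month
df = df.withColumn('list_year', year(df['date']))
df = df.withColumn('list_month', month(df['date']))
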
## Using lag and a window you can create a column that holds the value from a previous row

from pyspark.sql.functions import lag, datediff, to_date
from pyspark.sql.window import Window

# Cast data type
mort_df = mort_df.withColumn('DATE', to_date(mort_df['DATE']))

# Create window
w = Window.orderBy(mort_df['DATE'])
# Create lag column
mort_df = mort_df.withColumn('DATE-1', lag('DATE', 1).over(w))

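# datediff (imported above) can then measure the gap between the current and lagged dates,
# e.g. (hypothetical column name):
mort_df = mort_df.withColumn('days_since_last', datediff(mort_df['DATE'], mort_df['DATE-1']))
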
# use the like function to find a pattern in a string
has_attached_garage = df['GARAGEDESCRIPTION'].like('%Attached Garage%')

# for conditionals use when and otherwise
from pyspark.sql.functions import when
df = df.withColumn('has_attached_garage', (when(has_attached_garage, 1)
                                           .when(has_detached_garage, 0)
                                           .otherwise(None)))

##############################################
from pyspark.sql.functions import coalesce, first

# Pivot: one column per value of ex_garage_list, filled with constant_val where present
piv_df = ex_df.groupBy('NO').pivot('ex_garage_list').agg(coalesce(first('constant_val')))

# Join the dataframes together and fill nulls
joined_df = df.join(piv_df, on='NO', how='left')

# Columns to zero-fill
zfill_cols = piv_df.columns

# Zero-fill the pivoted values
zfilled_df = joined_df.fillna(0, subset=zfill_cols)
##############################################

# Lasso model (α = 1, i.e. elasticNetParam=1 for pure L1 regularisation; regParam sets its strength)
from pyspark.ml.regression import LinearRegression
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1)
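# A sketch of fitting and predicting with it (assumes train_df/test_df already contain
# an assembled 'features' vector column and the 'duration' label):
model = regression.fit(train_df)
predictions = model.transform(test_df)
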
# Pipelines

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[indexer, onehot, assemble, regression])
pipeline = pipeline.fit(train_df)  # fit runs all stages in order
pipeline.stages[2]                 # access a single fitted stage (here the assembler)
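
# For example (a hypothetical follow-up), the coefficients of the fitted regression stage:
fitted_regression = pipeline.stages[3]
print(fitted_regression.coefficients)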