Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
# ```
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ```
# +
# Standard library
import os

# Third-party: Spark / Sedona
import geopandas as gpd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, explode, hex
from sedona.spark import (
    Adapter,
    GridType,
    IndexType,
    JoinQueryRaw,
    SedonaContext,
    SedonaKepler,
    SedonaPyDeck,
    ShapefileReader,
)

# from utilities import getConfig
# -
# ## Setup Sedona environment
# +
# Build a Spark session pre-loaded with the Sedona and GeoTools jars,
# then wrap it in a Sedona context so the spatial SQL functions register.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.5.1,'
        'org.datasyslab:geotools-wrapper:1.5.1-28.2,'
        'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4',
    )
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all')
    .getOrCreate()
)
sedona = SedonaContext.create(config)
sc = sedona.sparkContext
# Decode shapefile attribute values as UTF-8 when reading geometry RDDs.
sc.setSystemProperty("sedona.global.charset", "utf8")
# -
# ## Read countries shapefile into a Sedona DataFrame
# Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
def _load_shapefile(path, view_name):
    """Read an ESRI shapefile into a SpatialRDD and a DataFrame, register the
    DataFrame as a temp view named *view_name*, and print its schema."""
    geom_rdd = ShapefileReader.readToGeometryRDD(sc, path)
    geom_df = Adapter.toDf(geom_rdd, sedona)
    geom_df.createOrReplaceTempView(view_name)
    geom_df.printSchema()
    return geom_rdd, geom_df


countries, countries_df = _load_shapefile(
    "/pyspark-example/data/apachesedona/data/ne_50m_admin_0_countries_lakes", "country"
)
# ## Read airports shapefile into a Sedona DataFrame
# Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
airports, airports_df = _load_shapefile(
    "/pyspark-example/data/apachesedona/data/ne_50m_airports", "airport"
)
#
#
# ## Run Spatial Join using SQL API
# Pair every airport with the country polygon that contains it.
result = sedona.sql("SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)")
# ## Run Spatial Join using RDD API
# +
airports_rdd = Adapter.toSpatialRdd(airports_df, "geometry")
# Drop the duplicate name column in countries_df: both shapefiles carry a
# "name"-like attribute, which would collide in the flattened join output.
countries_df = countries_df.drop("NAME")
countries_rdd = Adapter.toSpatialRdd(countries_df, "geometry")
# analyze() computes each RDD's boundary envelope, required before partitioning.
airports_rdd.analyze()
countries_rdd.analyze()
# 4 is the num partitions used in spatial partitioning. This is an optional parameter
airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)
# Both sides of a spatial join must share the same partitioner.
countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())
buildOnSpatialPartitionedRDD = True
usingIndex = True
considerBoundaryIntersection = True
airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)
result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)
# FIX: take the right-side field names from airports_rdd (the RDD actually
# joined) instead of the raw shapefile RDD `airports`. The two happen to share
# a field list today, but mixing them breaks silently if the DataFrame schema
# ever diverges from the shapefile's attributes.
result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports_rdd.fieldNames, sedona)
result2.createOrReplaceTempView("join_result_with_all_cols")
# Select the columns needed in the join
result2 = sedona.sql("SELECT leftgeometry as country_geom, NAME_EN, rightgeometry as airport_geom, name FROM join_result_with_all_cols")
# -
# ## Print spatial join results
# The result of SQL API
result.show()
# The result of RDD API
result2.show()
# ## Group airports by country
# Aggregate the RDD-API join output into one row per country with its
# airport count. (The SQL-API `result` would work identically here.)
result2.createOrReplaceTempView("result")
groupedresult = sedona.sql("SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom")
groupedresult.createOrReplaceTempView("grouped_result")
groupedresult.show()
# ## Visualize the number of airports in each country
# ### Visualize using SedonaKepler
sedona_kepler_map = SedonaKepler.create_map(df=groupedresult, name="AirportCount")
sedona_kepler_map
# ### Visualize using SedonaPyDeck
# SedonaKepler renders the frame above as a choropleth keyed on the
# `AirportCount` column via a pre-set map config.
#
# The same choropleth can be produced with
# [SedonaPyDeck](https://sedona.apache.org/1.5.0/tutorial/sql/#sedonapydeck)
# through its `create_choropleth_map` API.
sedona_pydeck_map = SedonaPyDeck.create_choropleth_map(df=groupedresult, plot_col='AirportCount')
sedona_pydeck_map
# ## Visualize Uber H3 cells using SedonaKepler
# Generate Uber H3 cells for each country with Sedona, then render them
# with SedonaKepler.
# ### Generate H3 cell IDs
# [ST_H3CellIDs](https://sedona.apache.org/1.5.0/api/flink/Function/#st_h3cellids)
# produces the H3 cell IDs (resolution 3 here) covering each geometry.
h3_df = sedona.sql("SELECT g.NAME_EN, g.country_geom, ST_H3CellIDs(g.country_geom, 3, false) as h3_cellID from grouped_result g")
h3_df.show(2)
# ### Each geometry maps to many H3 cells, so explode the cell-ID array
# into one row per cell.
exploded_h3 = h3_df.select("NAME_EN", "country_geom", explode(col("h3_cellID")).alias("h3"))
exploded_h3.show(2)
# ### Convert generated long H3 cell ID to a hex cell ID
# SedonaKepler expects each H3 cell ID as a hexadecimal string to render it
# automatically; a 30% sample keeps the map sparse enough to read.
exploded_h3 = exploded_h3.sample(0.3)
exploded_h3.createOrReplaceTempView("exploded_h3")
hex_exploded_h3 = exploded_h3.select("NAME_EN", "country_geom", hex(col("h3")).alias("ex_h3"))
hex_exploded_h3.show(2)
hex_exploded_h3.printSchema()
# ### Visualize using SedonaKepler
# Hand the final frame to SedonaKepler.create_map and the H3 cells appear
# on the map automatically.
sedona_kepler_h3 = SedonaKepler.create_map(df=hex_exploded_h3, name="h3")
filename = "/pyspark-example/data/apachesedona/kepler.html"
sedona_kepler_h3.save_to_html(file_name=filename)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement