Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---
# ```
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ```
# +
# Standard library
import os

# Third-party: Spark / Sedona
import geopandas as gpd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, explode, hex
from sedona.spark import (
    Adapter,
    GridType,
    IndexType,
    JoinQueryRaw,
    SedonaContext,
    SedonaKepler,
    SedonaPyDeck,
    ShapefileReader,
)

# from utilities import getConfig
# -
# ## Setup Sedona environment
# +
# Build a Spark session pre-loaded with the Sedona and GeoTools jars,
# then wrap it in a Sedona context so the spatial SQL functions register.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.5.1,'
        'org.datasyslab:geotools-wrapper:1.5.1-28.2,'
        'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4',
    )
    .config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all')
    .getOrCreate()
)
sedona = SedonaContext.create(config)
sc = sedona.sparkContext
# Decode shapefile attribute values as UTF-8 when reading geometry RDDs.
sc.setSystemProperty("sedona.global.charset", "utf8")
# -
# ## Read countries shapefile into a Sedona DataFrame
# Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
def _load_shapefile(path, view_name):
    """Read an ESRI shapefile into a SpatialRDD and a DataFrame, register the
    DataFrame as a temp view named *view_name*, and print its schema."""
    geom_rdd = ShapefileReader.readToGeometryRDD(sc, path)
    geom_df = Adapter.toDf(geom_rdd, sedona)
    geom_df.createOrReplaceTempView(view_name)
    geom_df.printSchema()
    return geom_rdd, geom_df


countries, countries_df = _load_shapefile(
    "/pyspark-example/data/apachesedona/data/ne_50m_admin_0_countries_lakes", "country"
)
# ## Read airports shapefile into a Sedona DataFrame
# Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
airports, airports_df = _load_shapefile(
    "/pyspark-example/data/apachesedona/data/ne_50m_airports", "airport"
)
#
#
# ## Run Spatial Join using SQL API
# Pair every airport with the country polygon that contains it.
result = sedona.sql("SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)")
# ## Run Spatial Join using RDD API
# +
airports_rdd = Adapter.toSpatialRdd(airports_df, "geometry")
# Drop the duplicate name column in countries_df: both shapefiles carry a
# "name"-like attribute, which would collide in the flattened join output.
countries_df = countries_df.drop("NAME")
countries_rdd = Adapter.toSpatialRdd(countries_df, "geometry")
# analyze() computes each RDD's boundary envelope, required before partitioning.
airports_rdd.analyze()
countries_rdd.analyze()
# 4 is the num partitions used in spatial partitioning. This is an optional parameter
airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)
# Both sides of a spatial join must share the same partitioner.
countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())
buildOnSpatialPartitionedRDD = True
usingIndex = True
considerBoundaryIntersection = True
airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)
result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)
# FIX: take the right-side field names from airports_rdd (the RDD actually
# joined) instead of the raw shapefile RDD `airports`. The two happen to share
# a field list today, but mixing them breaks silently if the DataFrame schema
# ever diverges from the shapefile's attributes.
result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports_rdd.fieldNames, sedona)
result2.createOrReplaceTempView("join_result_with_all_cols")
# Select the columns needed in the join
result2 = sedona.sql("SELECT leftgeometry as country_geom, NAME_EN, rightgeometry as airport_geom, name FROM join_result_with_all_cols")
# -
# ## Print spatial join results
# The result of SQL API
result.show()
# The result of RDD API
result2.show()
# ## Group airports by country
# Aggregate the RDD-API join output into one row per country with its
# airport count. (The SQL-API `result` would work identically here.)
result2.createOrReplaceTempView("result")
groupedresult = sedona.sql("SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom")
groupedresult.createOrReplaceTempView("grouped_result")
groupedresult.show()
# ## Visualize the number of airports in each country
# ### Visualize using SedonaKepler
sedona_kepler_map = SedonaKepler.create_map(df=groupedresult, name="AirportCount")
sedona_kepler_map
# ### Visualize using SedonaPyDeck
# SedonaKepler renders the frame above as a choropleth keyed on the
# `AirportCount` column via a pre-set map config.
#
# The same choropleth can be produced with
# [SedonaPyDeck](https://sedona.apache.org/1.5.0/tutorial/sql/#sedonapydeck)
# through its `create_choropleth_map` API.
sedona_pydeck_map = SedonaPyDeck.create_choropleth_map(df=groupedresult, plot_col='AirportCount')
sedona_pydeck_map
# ## Visualize Uber H3 cells using SedonaKepler
# Generate Uber H3 cells for each country with Sedona, then render them
# with SedonaKepler.
# ### Generate H3 cell IDs
# [ST_H3CellIDs](https://sedona.apache.org/1.5.0/api/flink/Function/#st_h3cellids)
# produces the H3 cell IDs (resolution 3 here) covering each geometry.
h3_df = sedona.sql("SELECT g.NAME_EN, g.country_geom, ST_H3CellIDs(g.country_geom, 3, false) as h3_cellID from grouped_result g")
h3_df.show(2)
# ### Each geometry maps to many H3 cells, so explode the cell-ID array
# into one row per cell.
exploded_h3 = h3_df.select("NAME_EN", "country_geom", explode(col("h3_cellID")).alias("h3"))
exploded_h3.show(2)
# ### Convert generated long H3 cell ID to a hex cell ID
# SedonaKepler expects each H3 cell ID as a hexadecimal string to render it
# automatically; a 30% sample keeps the map sparse enough to read.
exploded_h3 = exploded_h3.sample(0.3)
exploded_h3.createOrReplaceTempView("exploded_h3")
hex_exploded_h3 = exploded_h3.select("NAME_EN", "country_geom", hex(col("h3")).alias("ex_h3"))
hex_exploded_h3.show(2)
hex_exploded_h3.printSchema()
# ### Visualize using SedonaKepler
# Hand the final frame to SedonaKepler.create_map and the H3 cells appear
# on the map automatically.
sedona_kepler_h3 = SedonaKepler.create_map(df=hex_exploded_h3, name="h3")
filename = "/pyspark-example/data/apachesedona/kepler.html"
sedona_kepler_h3.save_to_html(file_name=filename)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement