Advertisement
Guest User

Untitled

a guest
Apr 3rd, 2024
42
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.78 KB | None | 0 0
  1. # ---
  2. # jupyter:
  3. # jupytext:
  4. # text_representation:
  5. # extension: .py
  6. # format_name: light
  7. # format_version: '1.5'
  8. # jupytext_version: 1.16.1
  9. # kernelspec:
  10. # display_name: Python 3 (ipykernel)
  11. # language: python
  12. # name: python3
  13. # ---
  14.  
  15. # ```
  16. # Licensed to the Apache Software Foundation (ASF) under one
  17. # or more contributor license agreements. See the NOTICE file
  18. # distributed with this work for additional information
  19. # regarding copyright ownership. The ASF licenses this file
  20. # to you under the Apache License, Version 2.0 (the
  21. # "License"); you may not use this file except in compliance
  22. # with the License. You may obtain a copy of the License at
  23. # http://www.apache.org/licenses/LICENSE-2.0
  24. # Unless required by applicable law or agreed to in writing,
  25. # software distributed under the License is distributed on an
  26. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  27. # KIND, either express or implied. See the License for the
  28. # specific language governing permissions and limitations
  29. # under the License.
  30. # ```
  31.  
  32. # +
  33. import os
  34.  
  35. import geopandas as gpd
  36. from pyspark.sql import SparkSession
  37. from pyspark.sql.functions import col, expr, when, explode, hex
  38. import pyspark
  39.  
  40.  
  41. from sedona.spark import SedonaContext, ShapefileReader, Adapter, GridType, IndexType, JoinQueryRaw, SedonaKepler, SedonaPyDeck
  42.  
  43. #from utilities import getConfig
  44. # -
  45.  
  46. # ## Setup Sedona environment
  47.  
  48. # +
  49. config = SedonaContext.builder() .\
  50. config('spark.jars.packages',
  51. 'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.5.1,'
  52. 'org.datasyslab:geotools-wrapper:1.5.1-28.2,'
  53. 'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'). \
  54. config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all'). \
  55. getOrCreate()
  56.  
  57. sedona = SedonaContext.create(config)
  58. sc = sedona.sparkContext
  59. sc.setSystemProperty("sedona.global.charset", "utf8")
  60. # -
  61.  
  62. # ## Read countries shapefile into a Sedona DataFrame
  63. # Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
  64.  
# Read the Natural Earth admin-0 countries (with lakes) shapefile into a
# SpatialRDD, convert it to a Sedona DataFrame, and register it as the
# SQL temp view "country" for the joins below.
countries = ShapefileReader.readToGeometryRDD(sc, "/pyspark-example/data/apachesedona/data/ne_50m_admin_0_countries_lakes")
countries_df = Adapter.toDf(countries, sedona)
countries_df.createOrReplaceTempView("country")
countries_df.printSchema()
  69.  
  70. # ## Read airports shapefile into a Sedona DataFrame
  71. # Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
  72.  
# Read the Natural Earth airports shapefile into a SpatialRDD, convert it
# to a Sedona DataFrame, and register it as the SQL temp view "airport".
airports = ShapefileReader.readToGeometryRDD(sc, "/pyspark-example/data/apachesedona/data/ne_50m_airports")
airports_df = Adapter.toDf(airports, sedona)
airports_df.createOrReplaceTempView("airport")
airports_df.printSchema()
  77.  
  78. #
  79. #
  80. # ## Run Spatial Join using SQL API
  81.  
  82. result = sedona.sql("SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)")
  83.  
  84. # ## Run Spatial Join using RDD API
  85.  
  86. # +
  87. airports_rdd = Adapter.toSpatialRdd(airports_df, "geometry")
  88. # Drop the duplicate name column in countries_df
  89. countries_df = countries_df.drop("NAME")
  90. countries_rdd = Adapter.toSpatialRdd(countries_df, "geometry")
  91.  
  92. airports_rdd.analyze()
  93. countries_rdd.analyze()
  94.  
  95. # 4 is the num partitions used in spatial partitioning. This is an optional parameter
  96. airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)
  97. countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())
  98.  
  99. buildOnSpatialPartitionedRDD = True
  100. usingIndex = True
  101. considerBoundaryIntersection = True
  102. airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)
  103.  
  104. result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)
  105.  
  106. result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports.fieldNames, sedona)
  107.  
  108. result2.createOrReplaceTempView("join_result_with_all_cols")
  109. # Select the columns needed in the join
  110. result2 = sedona.sql("SELECT leftgeometry as country_geom, NAME_EN, rightgeometry as airport_geom, name FROM join_result_with_all_cols")
  111. # -
  112.  
  113. # ## Print spatial join results
  114.  
# The result of SQL API
result.show()
# The result of RDD API (should contain the same country/airport pairs)
result2.show()
  119.  
  120. # ## Group airports by country
  121.  
# result.createOrReplaceTempView("result")  # alternative: group the SQL-API result instead
result2.createOrReplaceTempView("result")
# Count airports per country; grouping by the geometry keeps it available
# for the map visualizations below.
groupedresult = sedona.sql("SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom")
groupedresult.show()
groupedresult.createOrReplaceTempView("grouped_result")
  127.  
  128. # ## Visualize the number of airports in each country
  129.  
  130. # ### Visualize using SedonaKepler
  131.  
# Render the per-country airport counts as an interactive Kepler.gl map.
sedona_kepler_map = SedonaKepler.create_map(df=groupedresult, name="AirportCount")
sedona_kepler_map  # display the map inline (notebook cell output)
  134.  
  135. # ### Visualize using SedonaPyDeck
  136. # The above visualization is generated by a pre-set config informing SedonaKepler that the map to be rendered has to be a choropleth map with choropleth of the `AirportCount` column value.
  137. #
# This can also be achieved using [SedonaPyDeck](https://sedona.apache.org/1.5.0/tutorial/sql/#sedonapydeck) and its `create_choropleth_map` API.
  139.  
# Equivalent choropleth rendered with pydeck, colored by AirportCount.
sedona_pydeck_map = SedonaPyDeck.create_choropleth_map(df=groupedresult, plot_col='AirportCount')
sedona_pydeck_map  # display the map inline (notebook cell output)
  142.  
  143. # ## Visualize Uber H3 cells using SedonaKepler
  144. # The following tutorial depicts how Uber H3 cells can be generated using Sedona and visualized using SedonaKepler.
  145.  
  146. # ### Generate H3 cell IDs
# [ST_H3CellIDs](https://sedona.apache.org/1.5.0/api/flink/Function/#st_h3cellids) can be used to generate cell IDs for given geometries
  148.  
# Generate H3 cell IDs at resolution 3 for each country polygon; the third
# argument (false) controls whether fully-contained cells only are returned
# — NOTE(review): confirm exact semantics against the ST_H3CellIDs docs.
h3_df = sedona.sql("SELECT g.NAME_EN, g.country_geom, ST_H3CellIDs(g.country_geom, 3, false) as h3_cellID from grouped_result g")
h3_df.show(2)
  151.  
  152. # ### Since each geometry can have multiple H3 cell IDs, let's explode the generated H3 cell ID array to get individual cells
  153.  
# Each country maps to an array of H3 cells; explode the array so every row
# holds exactly one cell ID.
exploded_h3 = h3_df.select(h3_df.NAME_EN, h3_df.country_geom, explode(h3_df.h3_cellID).alias("h3"))
exploded_h3.show(2)
  156.  
  157. # ### Convert generated long H3 cell ID to a hex cell ID
  158. # SedonaKepler accepts each H3 cell ID as a hexadecimal to automatically visualize them. Also, let us sample the data to be able to visualize sparse cells on the map.
  159.  
# Keep ~30% of the rows so the map shows sparse cells. sample() is unseeded
# here, so the subset differs between runs.
exploded_h3 = exploded_h3.sample(0.3)
# Registered as a temp view for ad-hoc SQL; not queried again in this script.
exploded_h3.createOrReplaceTempView("exploded_h3")
# SedonaKepler expects H3 IDs as hex strings, so convert the long cell IDs.
hex_exploded_h3 = exploded_h3.select(exploded_h3.NAME_EN, exploded_h3.country_geom, hex(exploded_h3.h3).alias("ex_h3"))
#hex_exploded_h3.set_geometry("ex_h3", inplace=True)
hex_exploded_h3.show(2)
hex_exploded_h3.printSchema()
  166.  
  167. # ### Visualize using SedonaKepler
  168. # Now, simply provide the final df to SedonaKepler.create_map and you can automagically visualize the H3 cells on the map!
  169.  
# Visualize the hex-encoded H3 cells with Kepler.gl and save the map to HTML.
sedona_kepler_h3 = SedonaKepler.create_map(df=hex_exploded_h3, name="h3")
filename = "/pyspark-example/data/apachesedona/kepler.html"
sedona_kepler_h3.save_to_html(file_name=filename)
  173.  
  174.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement