Advertisement
Guest User

Untitled

a guest
Oct 17th, 2018
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.04 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 13,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import pyspark\n",
  10. "sc = pyspark.SparkContext.getOrCreate()\n",
  11. "\n",
  12. "spark = SparkSession \\\n",
  13. " .builder \\\n",
  14. " .appName(\"Spark SQL Finger 3\") \\\n",
  15. " .getOrCreate()"
  16. ]
  17. },
  18. {
  19. "cell_type": "code",
  20. "execution_count": 14,
  21. "metadata": {},
  22. "outputs": [],
  23. "source": [
  24. "datos_jugadores_regtemp = [\n",
  25. " (1, 'Manu Ginobili', 30, 35, 8, 8, 5, 2),\n",
  26. " (2, 'Kobe Bryant', 40, 30, 3, 5, 20, 2),\n",
  27. " (3, 'LeBron James', 58, 20, 3, 8, 30, 2),\n",
  28. " (4, 'Andre Iguodala', 35, 15, 5, 10, 20, 4),\n",
  29. " (5, 'Carmelo Anthony', 40, 10, 6, 6, 22, 1)\n",
  30. "]\n",
  31. "\n",
  32. "datos_jugadores_po = [\n",
  33. " #1, 2, 3, 4, 5, 6 ,7 ,8 ,9\n",
  34. " (1, 1, 1, 8, 3, 4, 2, 12, 1),\n",
  35. " (1, 1, 3, 4, 1, 5, 2, 15, 1),\n",
  36. " (1, 1, 4, 12, 2, 3, 2, 10, 1),\n",
  37. " (2, 1, 1, 18, 3, 3, 3, 8, 2),\n",
  38. " (2, 1, 4, 9, 3, 4, 1, 5, 1),\n",
  39. " (3, 2, 1, 12, 6, 4, 1, 3, 1),\n",
  40. " (3, 2, 2, 15, 8, 5, 3, 7, 1),\n",
  41. " (3, 2, 3, 22, 6, 2, 1, 4, 2),\n",
  42. " (3, 2, 4, 18, 10, 4, 2, 4, 1),\n",
  43. " (4, 2, 1, 12, 2, 1, 1, 9, 0),\n",
  44. " (4, 2, 2, 25, 8, 4, 2, 4, 3),\n",
  45. " (4, 2, 4, 15, 4, 2, 2, 8, 1),\n",
  46. " (1, 3, 3, 2, 1, 0, 0, 3, 2),\n",
  47. " (5, 3, 1, 8, 1, 2, 0, 6, 0),\n",
  48. " (5, 3, 1, 7, 0, 0, 1, 6, 1),\n",
  49. "]"
  50. ]
  51. },
  52. {
  53. "cell_type": "code",
  54. "execution_count": 15,
  55. "metadata": {},
  56. "outputs": [],
  57. "source": [
  58. "df_tr = spark.createDataFrame(datos_jugadores_regtemp)\n",
  59. "df_po = spark.createDataFrame(datos_jugadores_po)"
  60. ]
  61. },
  62. {
  63. "cell_type": "code",
  64. "execution_count": 16,
  65. "metadata": {},
  66. "outputs": [
  67. {
  68. "data": {
  69. "text/plain": [
  70. "['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8']"
  71. ]
  72. },
  73. "execution_count": 16,
  74. "metadata": {},
  75. "output_type": "execute_result"
  76. }
  77. ],
  78. "source": [
  79. "# La columna del promedio de asistencias es _4.\n",
  80. "df_tr.columns"
  81. ]
  82. },
  83. {
  84. "cell_type": "code",
  85. "execution_count": 17,
  86. "metadata": {},
  87. "outputs": [
  88. {
  89. "data": {
  90. "text/plain": [
  91. "['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9']"
  92. ]
  93. },
  94. "execution_count": 17,
  95. "metadata": {},
  96. "output_type": "execute_result"
  97. }
  98. ],
  99. "source": [
  100. "# La columna con la cantidad de asistencias es _8.\n",
  101. "df_po.columns"
  102. ]
  103. },
  104. {
  105. "cell_type": "code",
  106. "execution_count": 18,
  107. "metadata": {},
  108. "outputs": [],
  109. "source": [
  110. "df_tr.createOrReplaceTempView('temporada_regular')\n",
  111. "df_po.createOrReplaceTempView('playoffs')"
  112. ]
  113. },
  114. {
  115. "cell_type": "code",
  116. "execution_count": 19,
  117. "metadata": {},
  118. "outputs": [],
  119. "source": [
  120. "# Creamos una tabla nueva agrupando por jugador y partido, para calcular el total de asistencias por partido.\n",
  121. "po_suma = spark.sql(\"SELECT _1, _2, sum(_8) as asistencias_totales FROM playoffs GROUP BY _1,_2\")\n",
  122. "po_suma.createOrReplaceTempView('po_asistencias')"
  123. ]
  124. },
  125. {
  126. "cell_type": "code",
  127. "execution_count": 20,
  128. "metadata": {},
  129. "outputs": [],
  130. "source": [
  131. "# La columna _2 en \"temporada_regular\" es el nombre del jugador, _1 es el id de jugador en ambas tablas.\n",
  132. "prom_y_max_asistencias = spark.sql(\"SELECT _2 as nombre, _4 as prom_asistencias, max_asistencias FROM temporada_regular as tr\\\n",
  133. " JOIN (SELECT _1, max(asistencias_totales) as max_asistencias FROM po_asistencias GROUP BY _1) as po\\\n",
  134. " ON tr._1 = po._1\")"
  135. ]
  136. },
  137. {
  138. "cell_type": "code",
  139. "execution_count": 21,
  140. "metadata": {},
  141. "outputs": [
  142. {
  143. "name": "stdout",
  144. "output_type": "stream",
  145. "text": [
  146. "+---------------+----------------+---------------+\n",
  147. "| nombre|prom_asistencias|max_asistencias|\n",
  148. "+---------------+----------------+---------------+\n",
  149. "|Carmelo Anthony| 10| 12|\n",
  150. "| Manu Ginobili| 35| 37|\n",
  151. "| LeBron James| 20| 18|\n",
  152. "| Kobe Bryant| 30| 13|\n",
  153. "| Andre Iguodala| 15| 21|\n",
  154. "+---------------+----------------+---------------+\n",
  155. "\n"
  156. ]
  157. }
  158. ],
  159. "source": [
  160. "prom_y_max_asistencias.show()"
  161. ]
  162. },
  163. {
  164. "cell_type": "code",
  165. "execution_count": 22,
  166. "metadata": {},
  167. "outputs": [],
  168. "source": [
  169. "prom_y_max_asistencias.createOrReplaceTempView('resultado')"
  170. ]
  171. },
  172. {
  173. "cell_type": "code",
  174. "execution_count": 23,
  175. "metadata": {},
  176. "outputs": [],
  177. "source": [
  178. "query_final = spark.sql(\"SELECT nombre FROM resultado as res WHERE res.max_asistencias > prom_asistencias\")"
  179. ]
  180. },
  181. {
  182. "cell_type": "code",
  183. "execution_count": 24,
  184. "metadata": {},
  185. "outputs": [
  186. {
  187. "name": "stdout",
  188. "output_type": "stream",
  189. "text": [
  190. "+---------------+\n",
  191. "| nombre|\n",
  192. "+---------------+\n",
  193. "|Carmelo Anthony|\n",
  194. "| Manu Ginobili|\n",
  195. "| Andre Iguodala|\n",
  196. "+---------------+\n",
  197. "\n"
  198. ]
  199. }
  200. ],
  201. "source": [
  202. "query_final.show()"
  203. ]
  204. },
  205. {
  206. "cell_type": "code",
  207. "execution_count": null,
  208. "metadata": {},
  209. "outputs": [],
  210. "source": []
  211. }
  212. ],
  213. "metadata": {
  214. "kernelspec": {
  215. "display_name": "PySpark",
  216. "language": "python",
  217. "name": "pyspark"
  218. },
  219. "language_info": {
  220. "codemirror_mode": {
  221. "name": "ipython",
  222. "version": 3
  223. },
  224. "file_extension": ".py",
  225. "mimetype": "text/x-python",
  226. "name": "python",
  227. "nbconvert_exporter": "python",
  228. "pygments_lexer": "ipython3",
  229. "version": "3.6.6"
  230. }
  231. },
  232. "nbformat": 4,
  233. "nbformat_minor": 2
  234. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement