{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Manual Mapping of Ordinal Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from itertools import chain\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession \\\n",
    "    .builder \\\n",
    "    .appName(\"Pysparkexample\") \\\n",
    "    .config(\"spark.some.config.option\", \"some-value\") \\\n",
    "    .getOrCreate()\n",
    "sc = spark.sparkContext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Assume you have columns in a DataFrame with ordinal values that you want to map to integers (or doubles):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+------+\n",
      "|feat1| feat2|\n",
      "+-----+------+\n",
      "|   HI| LARGE|\n",
      "|  MID|MEDIUM|\n",
      "|   LO| SMALL|\n",
      "+-----+------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def fresh_df(N=100000, seed=None):\n",
    "    np.random.seed(seed)\n",
    "    feat1 = np.random.choice([\"HI\", \"LO\", \"MID\"], size=N)\n",
    "    feat2 = np.random.choice([\"SMALL\", \"MEDIUM\", \"LARGE\"], size=N)\n",
    "\n",
    "    pdf = pd.DataFrame({\n",
    "        \"feat1\": feat1,\n",
    "        \"feat2\": feat2\n",
    "    })\n",
    "    return spark.createDataFrame(pdf)\n",
    "\n",
    "fresh_df(N=100).show(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So, you manually build the mappings:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat1_dict = {\"HI\": 1, \"MID\": 2, \"LO\": 3}\n",
    "feat2_dict = {\"SMALL\": 0, \"MEDIUM\": 1, \"LARGE\": 2}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And turn each of them into a DataFrame of its own:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat1_map_df = spark.createDataFrame(pd.DataFrame([(k, v) for k, v in feat1_dict.items()], columns=[\"feat\", \"label\"]))\n",
    "feat2_map_df = spark.createDataFrame(pd.DataFrame([(k, v) for k, v in feat2_dict.items()], columns=[\"feat\", \"label\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Collect the mapping DataFrames in a dictionary:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "mappings = {\n",
    "    \"feat1\": feat1_map_df,\n",
    "    \"feat2\": feat2_map_df\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+------+\n",
      "|feat1| feat2|\n",
      "+-----+------+\n",
      "|  MID| LARGE|\n",
      "|   HI| LARGE|\n",
      "|  MID| SMALL|\n",
      "|  MID| LARGE|\n",
      "|   HI|MEDIUM|\n",
      "|   HI| SMALL|\n",
      "|  MID|MEDIUM|\n",
      "|   LO|MEDIUM|\n",
      "|  MID|MEDIUM|\n",
      "|  MID|MEDIUM|\n",
      "+-----+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = fresh_df(N=10, seed=42)\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using `join`s, you can now replace the values with their mapped labels:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-----+\n",
      "|feat|label|\n",
      "+----+-----+\n",
      "|  HI|    1|\n",
      "| MID|    2|\n",
      "|  LO|    3|\n",
      "+----+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mappings[\"feat1\"].show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_column(df, mapping, feat_name):\n",
    "    # Broadcast the small mapping table so the join avoids shuffling the large DataFrame.\n",
    "    return df.join(F.broadcast(mapping), df[feat_name] == mapping.feat)"
   ]
  },
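  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick check of what the join produces, mapping a single column could look like this (reusing the `df` and `mappings` defined above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the feat1 mapping onto the demo DataFrame, then drop the helper\n",
    "# column and rename the label so it is clear which feature it belongs to.\n",
    "labeled = label_column(df, mappings[\"feat1\"], \"feat1\") \\\n",
    "    .drop(\"feat\") \\\n",
    "    .withColumnRenamed(\"label\", \"feat1_mapped\")\n",
    "labeled.show(3)"
   ]
  },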
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's a timing example on a larger DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = fresh_df(N=1000000, seed=42)\n",
    "cols = df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "250 ms ± 88.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Chain one broadcast join per feature so every column gets mapped,\n",
    "# not just the last one visited by the loop.\n",
    "df2 = df\n",
    "for col in cols:\n",
    "    df2 = label_column(df2, mappings[col], col).drop('feat')\n",
    "    df2 = df2.withColumnRenamed(\"label\", col + \"_mapped\")\n",
    "df2.cache().count()"
   ]
  },
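  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `chain` import at the top also supports an alternative that skips the joins entirely: build a literal map expression with `F.create_map` and look each value up in place. A minimal sketch, reusing the plain `feat1_dict` and `feat2_dict` dictionaries from above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn each plain Python dict into a MapType column expression and use it\n",
    "# to look up the ordinal label directly, with no join involved.\n",
    "mapping_exprs = {\n",
    "    col: F.create_map([F.lit(x) for x in chain(*d.items())])\n",
    "    for col, d in [(\"feat1\", feat1_dict), (\"feat2\", feat2_dict)]\n",
    "}\n",
    "\n",
    "df3 = df\n",
    "for col, expr in mapping_exprs.items():\n",
    "    df3 = df3.withColumn(col + \"_mapped\", expr[F.col(col)])\n",
    "df3.show(3)"
   ]
  }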
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}