Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Manual Mapping of Ordinal Features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "\n",
- "from itertools import chain\n",
- "from pyspark.sql import functions as F\n",
- "from pyspark.sql import SparkSession"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "spark = SparkSession \\\n",
- " .builder \\\n",
- " .appName(\"Pysparkexample\") \\\n",
- " .config(\"spark.some.config.option\", \"some-value\") \\\n",
- " .getOrCreate()\n",
- "sc = spark.sparkContext"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Assume you have a dataframe whose columns hold ordinal values that you want to map to integers (or doubles)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----+------+\n",
- "|feat1| feat2|\n",
- "+-----+------+\n",
- "| HI| LARGE|\n",
- "| MID|MEDIUM|\n",
- "| LO| SMALL|\n",
- "+-----+------+\n",
- "only showing top 3 rows\n",
- "\n"
- ]
- }
- ],
- "source": [
- "def fresh_df(N=100000, seed=None):\n",
- "    \"\"\"Build a Spark DataFrame with two randomly drawn ordinal string columns.\n",
- "\n",
- "    Reseeds NumPy's global RNG with `seed`, then draws `N` values for `feat1`\n",
- "    (HI/LO/MID) followed by `N` values for `feat2` (SMALL/MEDIUM/LARGE).\n",
- "    The frame is created through the notebook-level `spark` session.\n",
- "    \"\"\"\n",
- "    np.random.seed(seed)\n",
- "    levels = {\n",
- "        \"feat1\": [\"HI\", \"LO\", \"MID\"],\n",
- "        \"feat2\": [\"SMALL\", \"MEDIUM\", \"LARGE\"],\n",
- "    }\n",
- "    # Dicts keep insertion order, so feat1 is sampled before feat2.\n",
- "    sampled = {name: np.random.choice(values, size=N) for name, values in levels.items()}\n",
- "    return spark.createDataFrame(pd.DataFrame(sampled))\n",
- "\n",
- "fresh_df(N=100).show(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "So you build the mappings manually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "feat1_dict = {\"HI\": 1, \"MID\": 2, \"LO\": 3}\n",
- "feat2_dict = {\"SMALL\": 0, \"MEDIUM\": 1, \"LARGE\": 2}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And turn each of them into a dataframe of its own:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "def _dict_to_map_df(value_to_label):\n",
- "    \"\"\"Turn a {value: label} dict into a two-column (`feat`, `label`) Spark DataFrame.\"\"\"\n",
- "    pairs = pd.DataFrame(list(value_to_label.items()), columns=[\"feat\", \"label\"])\n",
- "    return spark.createDataFrame(pairs)\n",
- "\n",
- "\n",
- "feat1_map_df = _dict_to_map_df(feat1_dict)\n",
- "feat2_map_df = _dict_to_map_df(feat2_dict)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Collect the mapping dataframes in a dictionary:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "mappings = {\n",
- " \"feat1\": feat1_map_df,\n",
- " \"feat2\": feat2_map_df\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+-----+------+\n",
- "|feat1| feat2|\n",
- "+-----+------+\n",
- "| MID| LARGE|\n",
- "| HI| LARGE|\n",
- "| MID| SMALL|\n",
- "| MID| LARGE|\n",
- "| HI|MEDIUM|\n",
- "| HI| SMALL|\n",
- "| MID|MEDIUM|\n",
- "| LO|MEDIUM|\n",
- "| MID|MEDIUM|\n",
- "| MID|MEDIUM|\n",
- "+-----+------+\n",
- "\n"
- ]
- }
- ],
- "source": [
- "df = fresh_df(N=10, seed=42)\n",
- "df.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "With `join`s you can now replace the values according to the mappings:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+----+-----+\n",
- "|feat|label|\n",
- "+----+-----+\n",
- "| HI| 1|\n",
- "| MID| 2|\n",
- "| LO| 3|\n",
- "+----+-----+\n",
- "\n"
- ]
- }
- ],
- "source": [
- "mappings[\"feat1\"].show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "def label_column(df, mapping, feat_name):\n",
- "    \"\"\"Join `df` with the broadcast `mapping` where `df[feat_name]` equals `mapping.feat`.\n",
- "\n",
- "    No join type is passed, so Spark's default (inner) applies; the result\n",
- "    carries the columns of `df` followed by the columns of `mapping`.\n",
- "    \"\"\"\n",
- "    matches_feature = df[feat_name] == mapping.feat\n",
- "    lookup = F.broadcast(mapping)\n",
- "    return df.join(lookup, on=matches_feature)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Here's an example:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = fresh_df(N=1000000, seed=42)\n",
- "cols = df.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "250 ms ± 88.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "# Start from the raw frame and thread the result through every iteration so\n",
- "# that each column's mapping is kept, not only the one added last.\n",
- "df_mapped = df\n",
- "for col in cols:\n",
- "    # Join in the labels for this column, then drop the mapping's key column.\n",
- "    df_mapped = label_column(df_mapped, mappings[col], col).drop('feat')\n",
- "    df_mapped = df_mapped.withColumnRenamed(\"label\", col + \"_mapped\")\n",
- "# cache() + count() forces Spark to actually execute the joins being timed.\n",
- "df_mapped.cache().count()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement