Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Initialize notebook"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import json\n",
- "import os\n",
- "import datetime\n",
- "from sqlalchemy import create_engine\n",
- "import splunklib.results as results\n",
- "import splunklib.client as client\n",
- "import warnings\n",
- "\n",
- "warnings.filterwarnings('ignore')\n",
- "pd.set_option('display.max_columns', None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# yogu's package id = ED04-D268 (canceled return flight on 5/14/18)\n",
- "# steven's package id = 8DB5-7345 (delayed return flight on 5/15/18)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define custom functions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "def snowflake_connect():\n",
- " \"\"\"Connects to the Snowflake EDW using credentials set in your bash profile.\"\"\"\n",
- " user = os.environ['SNOWFLAKE_USERNAME']\n",
- " pwd = os.environ['SNOWFLAKE_PASSWORD']\n",
- " acct = 'vp76644'\n",
- " conn = create_engine(f'snowflake://{user}:{pwd}@{acct}/warehouse')\n",
- " return conn\n",
- "\n",
- "\n",
- "def splunk_connect():\n",
- " \"\"\"Connects to the Splunk SDK using credentials set in your bash profile.\"\"\"\n",
- " user = os.environ['SPLUNK_USERNAME']\n",
- " pwd = os.environ['SPLUNK_PASSWORD']\n",
- " conn = client.connect(host='upside.splunkcloud.com',\n",
- " port=8089,\n",
- " username=user,\n",
- " password=pwd)\n",
- " return conn\n",
- "\n",
- "\n",
- "def query_purchased_flight_packages():\n",
- " # Snowflake connector\n",
- " sf_conn = snowflake_connect()\n",
- " \n",
- " # create SQL query string\n",
- " sql_query = '''\n",
- " SELECT *\n",
- " FROM edw.f_package_transactions\n",
- " LEFT JOIN edw.l_upside_packages_v ON\n",
- " edw.f_package_transactions.session_id=edw.l_upside_packages_v.session_id\n",
- " WHERE transaction_datetime_et >= DATEADD(day,-30,CURRENT_DATE());\n",
- " '''\n",
- " \n",
- " # convert to dataframe\n",
- " df = pd.read_sql_query(sql_query,\n",
- " sf_conn,\n",
- " coerce_float=False,\n",
- " parse_dates=False)\n",
- " \n",
- " # remove duplicate columns\n",
- " df = df.T.groupby(level=0).first().T\n",
- " \n",
- " # filter out non-air and non-valid packages\n",
- " df = df[(df['is_air_present'] == 1) &\n",
- " (df['is_package_valid'] == 1) & \n",
- " (df['is_package_voided'] == 0) &\n",
- " (df['is_package_cancelled'] == 0)]\n",
- " \n",
- " # filter out flights that haven't occured yet\n",
- " df_oneway = df[(df['trip_type'] == 'oneway') &\n",
- " (df['ob_seg1_dept_time'] < datetime.datetime.today())]\n",
- " df_other = df[(df['trip_type'] != 'oneway') &\n",
- " (df['ob_seg1_dept_time'] < datetime.datetime.today()) &\n",
- " (df['rt_seg1_dept_time'] < datetime.datetime.today())]\n",
- " \n",
- " return pd.concat([df_oneway, df_other]).reset_index(drop=True)\n",
- "\n",
- "\n",
- "def get_the_flightstats(package_id, splunk_conn):\n",
- " kwargs_oneshot = {'earliest_time': (datetime.datetime.today() - datetime.timedelta(days=30)).strftime('%Y-%m-%dT12:00:00.000-05:00')}\n",
- " \n",
- " searchquery_oneshot = 'search \"payload.trip.referenceNumber\"=\"{}\" index=prod kubernetes.container_name=\"trips-status-webhooks\" | spath \"payload.trip.referenceNumber\" |'.format(package_id) \n",
- " oneshotsearch_results = splunk_conn.jobs.oneshot(searchquery_oneshot, **kwargs_oneshot)\n",
- " reader = results.ResultsReader(oneshotsearch_results)\n",
- " payload_dict = []\n",
- " for item in reader:\n",
- " payload_dict.append(item)\n",
- " return payload_dict"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Get flight data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "number of packages: 564\n"
- ]
- }
- ],
- "source": [
- "df_flights = query_purchased_flight_packages()\n",
- "\n",
- "print('number of packages: {}'.format(len(df_flights)))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Get flight stats delay data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "processed 0 of 564 packages\n",
- "processed 50 of 564 packages\n",
- "processed 100 of 564 packages\n",
- "processed 150 of 564 packages\n",
- "processed 200 of 564 packages\n",
- "processed 250 of 564 packages\n",
- "processed 300 of 564 packages\n",
- "processed 350 of 564 packages\n",
- "processed 400 of 564 packages\n",
- "processed 450 of 564 packages\n",
- "processed 500 of 564 packages\n",
- "processed 550 of 564 packages\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>airline_code</th>\n",
- " <th>alert_datetime</th>\n",
- " <th>arrival_airport</th>\n",
- " <th>arrival_estimated_datetime</th>\n",
- " <th>arrival_scheduled_datetime</th>\n",
- " <th>current_delay</th>\n",
- " <th>departure_airport</th>\n",
- " <th>departure_estimated_datetime</th>\n",
- " <th>departure_scheduled_datetime</th>\n",
- " <th>flight_number</th>\n",
- " <th>previous_delay</th>\n",
- " <th>upside_package_id</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>WN</td>\n",
- " <td>2018-05-07 02:23:51</td>\n",
- " <td>RIC</td>\n",
- " <td>2018-05-06 23:50:00</td>\n",
- " <td>2018-05-06 23:30:00</td>\n",
- " <td>25</td>\n",
- " <td>ATL</td>\n",
- " <td>2018-05-06 22:25:00</td>\n",
- " <td>2018-05-06 22:00:00</td>\n",
- " <td>2971</td>\n",
- " <td>0</td>\n",
- " <td>3D05-9542</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>AA</td>\n",
- " <td>2018-05-10 18:07:47</td>\n",
- " <td>LGA</td>\n",
- " <td>2018-05-10 16:13:00</td>\n",
- " <td>2018-05-10 16:15:00</td>\n",
- " <td>5</td>\n",
- " <td>ROA</td>\n",
- " <td>2018-05-10 14:35:00</td>\n",
- " <td>2018-05-10 14:30:00</td>\n",
- " <td>3947</td>\n",
- " <td>27</td>\n",
- " <td>2BDB-4E0C</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>AA</td>\n",
- " <td>2018-05-10 17:08:08</td>\n",
- " <td>LGA</td>\n",
- " <td>2018-05-10 16:42:00</td>\n",
- " <td>2018-05-10 16:15:00</td>\n",
- " <td>27</td>\n",
- " <td>ROA</td>\n",
- " <td>2018-05-10 14:57:00</td>\n",
- " <td>2018-05-10 14:30:00</td>\n",
- " <td>3947</td>\n",
- " <td>0</td>\n",
- " <td>2BDB-4E0C</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>WN</td>\n",
- " <td>2018-04-30 20:21:24</td>\n",
- " <td>DEN</td>\n",
- " <td>2018-04-30 20:05:00</td>\n",
- " <td>2018-04-30 19:40:00</td>\n",
- " <td>40</td>\n",
- " <td>STL</td>\n",
- " <td>2018-04-30 19:00:00</td>\n",
- " <td>2018-04-30 18:20:00</td>\n",
- " <td>1655</td>\n",
- " <td>0</td>\n",
- " <td>F216-EA7E</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>UA</td>\n",
- " <td>2018-05-14 13:03:15</td>\n",
- " <td>ORD</td>\n",
- " <td>2018-05-14 10:13:00</td>\n",
- " <td>2018-05-14 08:58:00</td>\n",
- " <td>75</td>\n",
- " <td>PIT</td>\n",
- " <td>2018-05-14 09:30:00</td>\n",
- " <td>2018-05-14 08:15:00</td>\n",
- " <td>3485</td>\n",
- " <td>25</td>\n",
- " <td>AC03-DD26</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " airline_code alert_datetime arrival_airport arrival_estimated_datetime \\\n",
- "0 WN 2018-05-07 02:23:51 RIC 2018-05-06 23:50:00 \n",
- "1 AA 2018-05-10 18:07:47 LGA 2018-05-10 16:13:00 \n",
- "2 AA 2018-05-10 17:08:08 LGA 2018-05-10 16:42:00 \n",
- "3 WN 2018-04-30 20:21:24 DEN 2018-04-30 20:05:00 \n",
- "4 UA 2018-05-14 13:03:15 ORD 2018-05-14 10:13:00 \n",
- "\n",
- " arrival_scheduled_datetime current_delay departure_airport \\\n",
- "0 2018-05-06 23:30:00 25 ATL \n",
- "1 2018-05-10 16:15:00 5 ROA \n",
- "2 2018-05-10 16:15:00 27 ROA \n",
- "3 2018-04-30 19:40:00 40 STL \n",
- "4 2018-05-14 08:58:00 75 PIT \n",
- "\n",
- " departure_estimated_datetime departure_scheduled_datetime flight_number \\\n",
- "0 2018-05-06 22:25:00 2018-05-06 22:00:00 2971 \n",
- "1 2018-05-10 14:35:00 2018-05-10 14:30:00 3947 \n",
- "2 2018-05-10 14:57:00 2018-05-10 14:30:00 3947 \n",
- "3 2018-04-30 19:00:00 2018-04-30 18:20:00 1655 \n",
- "4 2018-05-14 09:30:00 2018-05-14 08:15:00 3485 \n",
- "\n",
- " previous_delay upside_package_id \n",
- "0 0 3D05-9542 \n",
- "1 27 2BDB-4E0C \n",
- "2 0 2BDB-4E0C \n",
- "3 0 F216-EA7E \n",
- "4 25 AC03-DD26 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "package_ids = df_flights['upside_package_id'].tolist()\n",
- "splunk_conn = splunk_connect()\n",
- "datetime_str = '%Y-%m-%dT%H:%M:%S'\n",
- "df_fstats = []\n",
- "\n",
- "for i, package_id in enumerate(package_ids):\n",
- " if i % 50 == 0:\n",
- " print('processed {} of {} packages'.format(i, len(package_ids)))\n",
- " splunk_payload = get_the_flightstats(package_id, splunk_conn)\n",
- " for row in splunk_payload:\n",
- " raw_payload = json.loads(row['_raw'])['payload']\n",
- " alert = raw_payload['alertDetails']\n",
- " if alert.get('delay'):\n",
- " leg_index = alert['legIndex']\n",
- " alert_datetime = datetime.datetime.strptime(alert['dateTime'][:19], datetime_str)\n",
- " package_id = raw_payload['trip']['referenceNumber']\n",
- " \n",
- " # get delay info\n",
- " current_delay = alert['delay']['current']\n",
- " if alert['delay'].get('previous'):\n",
- " previous_delay = alert['delay']['previous']\n",
- " else:\n",
- " previous_delay = 0\n",
- " \n",
- " if current_delay > 0:\n",
- " # get flight leg info\n",
- " flight_leg_info = raw_payload['trip']['legs'][leg_index]['flights'][0]\n",
- " airline_code = flight_leg_info['bookedAirlineCode']\n",
- " flight_number = flight_leg_info['flightNumber']\n",
- " \n",
- " # get arrival info\n",
- " arrival_info = flight_leg_info['flightStatuses'][0]['arrival']\n",
- " arrival_airport = arrival_info['airportCode']\n",
- " arrival_scheduled_dt = datetime.datetime.strptime(arrival_info['scheduledGateDateTime'][:19],\n",
- " datetime_str)\n",
- " arrival_estimated_dt = datetime.datetime.strptime(arrival_info['estimatedGateDateTime'][:19],\n",
- " datetime_str)\n",
- " \n",
- " # get departure info\n",
- " departure_info = flight_leg_info['flightStatuses'][0]['departure']\n",
- " departure_airport = departure_info['airportCode']\n",
- " departure_scheduled_dt = datetime.datetime.strptime(departure_info['scheduledGateDateTime'],\n",
- " datetime_str)\n",
- " departure_estimated_dt = datetime.datetime.strptime(departure_info['estimatedGateDateTime'],\n",
- " datetime_str)\n",
- " \n",
- " # create flat json to convert to dataframe\n",
- " if alert_datetime < (departure_estimated_dt + datetime.timedelta(hours=12)):\n",
- " df_fstats.append({\n",
- " 'upside_package_id': package_id,\n",
- " 'alert_datetime': alert_datetime,\n",
- " 'current_delay': current_delay,\n",
- " 'previous_delay': previous_delay,\n",
- " 'airline_code': airline_code,\n",
- " 'flight_number': flight_number,\n",
- " 'departure_airport': departure_airport,\n",
- " 'departure_scheduled_datetime': departure_scheduled_dt,\n",
- " 'departure_estimated_datetime': departure_estimated_dt,\n",
- " 'arrival_airport': arrival_airport,\n",
- " 'arrival_scheduled_datetime': arrival_scheduled_dt,\n",
- " 'arrival_estimated_datetime': arrival_estimated_dt\n",
- " })\n",
- " \n",
- "df_fstats = pd.DataFrame(df_fstats)\n",
- "df_fstats.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Delay stats"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "percent of flights delayed more than 0 minutes: 24.82%\n",
- "percent of flights delayed more than 30 minutes: 16.84%\n",
- "percent of flights delayed more than 60 minutes: 9.22%\n",
- "percent of flights delayed more than 120 minutes: 4.08%\n",
- "percent of flights delayed more than 180 minutes: 1.77%\n",
- "percent of flights delayed more than 240 minutes: 0.53%\n"
- ]
- }
- ],
- "source": [
- "df_delayed = df_fstats.sort_values('alert_datetime', ascending=False).drop_duplicates(subset='upside_package_id')\n",
- "\n",
- "for delay in [0, 30, 60, 120, 180, 240]:\n",
- " delay_pct = len(df_delayed[df_delayed['current_delay'] > delay]) / len(df_flights) * 100\n",
- " print('percent of flights delayed more than {} minutes: {:.2f}%'.format(delay, delay_pct))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>airline</th>\n",
- " <th>num_delays</th>\n",
- " <th>num_flights</th>\n",
- " <th>percent_delayed</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>AA</td>\n",
- " <td>72.0</td>\n",
- " <td>247</td>\n",
- " <td>29.149798</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>11</th>\n",
- " <td>UA</td>\n",
- " <td>44.0</td>\n",
- " <td>207</td>\n",
- " <td>21.256039</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>AS</td>\n",
- " <td>6.0</td>\n",
- " <td>37</td>\n",
- " <td>16.216216</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>B6</td>\n",
- " <td>10.0</td>\n",
- " <td>35</td>\n",
- " <td>28.571429</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>12</th>\n",
- " <td>WN</td>\n",
- " <td>7.0</td>\n",
- " <td>16</td>\n",
- " <td>43.750000</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " airline num_delays num_flights percent_delayed\n",
- "0 AA 72.0 247 29.149798\n",
- "11 UA 44.0 207 21.256039\n",
- "2 AS 6.0 37 16.216216\n",
- "3 B6 10.0 35 28.571429\n",
- "12 WN 7.0 16 43.750000"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_delayed_airlines = pd.concat([df_delayed['airline_code'].value_counts(),\n",
- " df_flights['air_primary_carrier'].value_counts()], axis=1)\n",
- "df_delayed_airlines = df_delayed_airlines.reset_index()\n",
- "df_delayed_airlines.columns = ['airline', 'num_delays', 'num_flights']\n",
- "df_delayed_airlines['num_delays'] = df_delayed_airlines['num_delays'].fillna(0)\n",
- "df_delayed_airlines['percent_delayed'] = df_delayed_airlines['num_delays'] / df_delayed_airlines['num_flights'] * 100\n",
- "df_delayed_airlines = df_delayed_airlines.sort_values('num_flights', ascending=False)\n",
- "\n",
- "df_delayed_airlines.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# current on trip query\n",
- "q = '''select *\n",
- "from edw.F_PACKAGE_TRANSACTIONS a\n",
- "inner join edw.L_UPSIDE_PACKAGES_V c\n",
- " on a.UPSIDE_PACKAGE_ID = c.UPSIDE_PACKAGE_ID\n",
- "where\n",
- " coalesce(c.\"RT_SEG1_ARR_TIME\",c.\"HOTEL_END_DT\",c.\"OB_SEG1_ARR_TIME\") > current_timestamp\n",
- "and coalesce(c.\"OB_SEG1_ARR_TIME\", c.HOTEL_START_DT ) < current_timestamp\n",
- "'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment