{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize notebook"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import os\n",
"import datetime\n",
"from sqlalchemy import create_engine\n",
"import splunklib.results as results\n",
"import splunklib.client as client\n",
"import warnings\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# yogu's package id = ED04-D268 (canceled return flight on 5/14/18)\n",
"# steven's package id = 8DB5-7345 (delayed return flight on 5/15/18)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define custom functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def snowflake_connect():\n",
"    \"\"\"Connects to the Snowflake EDW using credentials set in your bash profile.\"\"\"\n",
"    user = os.environ['SNOWFLAKE_USERNAME']\n",
"    pwd = os.environ['SNOWFLAKE_PASSWORD']\n",
"    acct = 'vp76644'\n",
"    conn = create_engine(f'snowflake://{user}:{pwd}@{acct}/warehouse')\n",
"    return conn\n",
"\n",
"\n",
"def splunk_connect():\n",
"    \"\"\"Connects to the Splunk SDK using credentials set in your bash profile.\"\"\"\n",
"    user = os.environ['SPLUNK_USERNAME']\n",
"    pwd = os.environ['SPLUNK_PASSWORD']\n",
"    conn = client.connect(host='upside.splunkcloud.com',\n",
"                          port=8089,\n",
"                          username=user,\n",
"                          password=pwd)\n",
"    return conn\n",
"\n",
"\n",
"def query_purchased_flight_packages():\n",
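"    \"\"\"Pulls the last 30 days of purchased packages from Snowflake and keeps valid air packages whose flights have already departed.\"\"\"\n",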
"    # Snowflake connector\n",
"    sf_conn = snowflake_connect()\n",
"\n",
"    # create SQL query string\n",
"    sql_query = '''\n",
"    SELECT *\n",
"    FROM edw.f_package_transactions\n",
"    LEFT JOIN edw.l_upside_packages_v ON\n",
"        edw.f_package_transactions.session_id=edw.l_upside_packages_v.session_id\n",
"    WHERE transaction_datetime_et >= DATEADD(day,-30,CURRENT_DATE());\n",
"    '''\n",
"\n",
"    # convert to dataframe\n",
"    df = pd.read_sql_query(sql_query,\n",
"                           sf_conn,\n",
"                           coerce_float=False,\n",
"                           parse_dates=False)\n",
"\n",
"    # remove duplicate columns\n",
"    df = df.T.groupby(level=0).first().T\n",
"\n",
"    # filter out non-air and non-valid packages\n",
"    df = df[(df['is_air_present'] == 1) &\n",
"            (df['is_package_valid'] == 1) &\n",
"            (df['is_package_voided'] == 0) &\n",
"            (df['is_package_cancelled'] == 0)]\n",
"\n",
"    # filter out flights that haven't occurred yet\n",
"    df_oneway = df[(df['trip_type'] == 'oneway') &\n",
"                   (df['ob_seg1_dept_time'] < datetime.datetime.today())]\n",
"    df_other = df[(df['trip_type'] != 'oneway') &\n",
"                  (df['ob_seg1_dept_time'] < datetime.datetime.today()) &\n",
"                  (df['rt_seg1_dept_time'] < datetime.datetime.today())]\n",
"\n",
"    return pd.concat([df_oneway, df_other]).reset_index(drop=True)\n",
"\n",
"\n",
"def get_the_flightstats(package_id, splunk_conn):\n",
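"    \"\"\"Runs a Splunk oneshot search for trips-status-webhooks events tied to the given package id over the last 30 days.\"\"\"\n",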
"    kwargs_oneshot = {'earliest_time': (datetime.datetime.today() - datetime.timedelta(days=30)).strftime('%Y-%m-%dT12:00:00.000-05:00')}\n",
"\n",
"    searchquery_oneshot = 'search \"payload.trip.referenceNumber\"=\"{}\" index=prod kubernetes.container_name=\"trips-status-webhooks\" | spath \"payload.trip.referenceNumber\"'.format(package_id)\n",
"    oneshotsearch_results = splunk_conn.jobs.oneshot(searchquery_oneshot, **kwargs_oneshot)\n",
"    reader = results.ResultsReader(oneshotsearch_results)\n",
"    payload_dict = []\n",
"    for item in reader:\n",
"        payload_dict.append(item)\n",
"    return payload_dict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get flight data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of packages: 564\n"
]
}
],
"source": [
"df_flights = query_purchased_flight_packages()\n",
"\n",
"print('number of packages: {}'.format(len(df_flights)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get flight stats delay data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"processed 0 of 564 packages\n",
"processed 50 of 564 packages\n",
"processed 100 of 564 packages\n",
"processed 150 of 564 packages\n",
"processed 200 of 564 packages\n",
"processed 250 of 564 packages\n",
"processed 300 of 564 packages\n",
"processed 350 of 564 packages\n",
"processed 400 of 564 packages\n",
"processed 450 of 564 packages\n",
"processed 500 of 564 packages\n",
"processed 550 of 564 packages\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>airline_code</th>\n",
" <th>alert_datetime</th>\n",
" <th>arrival_airport</th>\n",
" <th>arrival_estimated_datetime</th>\n",
" <th>arrival_scheduled_datetime</th>\n",
" <th>current_delay</th>\n",
" <th>departure_airport</th>\n",
" <th>departure_estimated_datetime</th>\n",
" <th>departure_scheduled_datetime</th>\n",
" <th>flight_number</th>\n",
" <th>previous_delay</th>\n",
" <th>upside_package_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>WN</td>\n",
" <td>2018-05-07 02:23:51</td>\n",
" <td>RIC</td>\n",
" <td>2018-05-06 23:50:00</td>\n",
" <td>2018-05-06 23:30:00</td>\n",
" <td>25</td>\n",
" <td>ATL</td>\n",
" <td>2018-05-06 22:25:00</td>\n",
" <td>2018-05-06 22:00:00</td>\n",
" <td>2971</td>\n",
" <td>0</td>\n",
" <td>3D05-9542</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AA</td>\n",
" <td>2018-05-10 18:07:47</td>\n",
" <td>LGA</td>\n",
" <td>2018-05-10 16:13:00</td>\n",
" <td>2018-05-10 16:15:00</td>\n",
" <td>5</td>\n",
" <td>ROA</td>\n",
" <td>2018-05-10 14:35:00</td>\n",
" <td>2018-05-10 14:30:00</td>\n",
" <td>3947</td>\n",
" <td>27</td>\n",
" <td>2BDB-4E0C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AA</td>\n",
" <td>2018-05-10 17:08:08</td>\n",
" <td>LGA</td>\n",
" <td>2018-05-10 16:42:00</td>\n",
" <td>2018-05-10 16:15:00</td>\n",
" <td>27</td>\n",
" <td>ROA</td>\n",
" <td>2018-05-10 14:57:00</td>\n",
" <td>2018-05-10 14:30:00</td>\n",
" <td>3947</td>\n",
" <td>0</td>\n",
" <td>2BDB-4E0C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>WN</td>\n",
" <td>2018-04-30 20:21:24</td>\n",
" <td>DEN</td>\n",
" <td>2018-04-30 20:05:00</td>\n",
" <td>2018-04-30 19:40:00</td>\n",
" <td>40</td>\n",
" <td>STL</td>\n",
" <td>2018-04-30 19:00:00</td>\n",
" <td>2018-04-30 18:20:00</td>\n",
" <td>1655</td>\n",
" <td>0</td>\n",
" <td>F216-EA7E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>UA</td>\n",
" <td>2018-05-14 13:03:15</td>\n",
" <td>ORD</td>\n",
" <td>2018-05-14 10:13:00</td>\n",
" <td>2018-05-14 08:58:00</td>\n",
" <td>75</td>\n",
" <td>PIT</td>\n",
" <td>2018-05-14 09:30:00</td>\n",
" <td>2018-05-14 08:15:00</td>\n",
" <td>3485</td>\n",
" <td>25</td>\n",
" <td>AC03-DD26</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" airline_code alert_datetime arrival_airport arrival_estimated_datetime \\\n",
"0 WN 2018-05-07 02:23:51 RIC 2018-05-06 23:50:00 \n",
"1 AA 2018-05-10 18:07:47 LGA 2018-05-10 16:13:00 \n",
"2 AA 2018-05-10 17:08:08 LGA 2018-05-10 16:42:00 \n",
"3 WN 2018-04-30 20:21:24 DEN 2018-04-30 20:05:00 \n",
"4 UA 2018-05-14 13:03:15 ORD 2018-05-14 10:13:00 \n",
"\n",
" arrival_scheduled_datetime current_delay departure_airport \\\n",
"0 2018-05-06 23:30:00 25 ATL \n",
"1 2018-05-10 16:15:00 5 ROA \n",
"2 2018-05-10 16:15:00 27 ROA \n",
"3 2018-04-30 19:40:00 40 STL \n",
"4 2018-05-14 08:58:00 75 PIT \n",
"\n",
" departure_estimated_datetime departure_scheduled_datetime flight_number \\\n",
"0 2018-05-06 22:25:00 2018-05-06 22:00:00 2971 \n",
"1 2018-05-10 14:35:00 2018-05-10 14:30:00 3947 \n",
"2 2018-05-10 14:57:00 2018-05-10 14:30:00 3947 \n",
"3 2018-04-30 19:00:00 2018-04-30 18:20:00 1655 \n",
"4 2018-05-14 09:30:00 2018-05-14 08:15:00 3485 \n",
"\n",
" previous_delay upside_package_id \n",
"0 0 3D05-9542 \n",
"1 27 2BDB-4E0C \n",
"2 0 2BDB-4E0C \n",
"3 0 F216-EA7E \n",
"4 25 AC03-DD26 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"package_ids = df_flights['upside_package_id'].tolist()\n",
"splunk_conn = splunk_connect()\n",
"datetime_str = '%Y-%m-%dT%H:%M:%S'\n",
"df_fstats = []\n",
"\n",
"for i, package_id in enumerate(package_ids):\n",
"    if i % 50 == 0:\n",
"        print('processed {} of {} packages'.format(i, len(package_ids)))\n",
"    splunk_payload = get_the_flightstats(package_id, splunk_conn)\n",
"    for row in splunk_payload:\n",
"        raw_payload = json.loads(row['_raw'])['payload']\n",
"        alert = raw_payload['alertDetails']\n",
"        if alert.get('delay'):\n",
"            leg_index = alert['legIndex']\n",
"            alert_datetime = datetime.datetime.strptime(alert['dateTime'][:19], datetime_str)\n",
"            package_id = raw_payload['trip']['referenceNumber']\n",
"\n",
"            # get delay info\n",
"            current_delay = alert['delay']['current']\n",
"            if alert['delay'].get('previous'):\n",
"                previous_delay = alert['delay']['previous']\n",
"            else:\n",
"                previous_delay = 0\n",
"\n",
"            if current_delay > 0:\n",
"                # get flight leg info\n",
"                flight_leg_info = raw_payload['trip']['legs'][leg_index]['flights'][0]\n",
"                airline_code = flight_leg_info['bookedAirlineCode']\n",
"                flight_number = flight_leg_info['flightNumber']\n",
"\n",
"                # get arrival info\n",
"                arrival_info = flight_leg_info['flightStatuses'][0]['arrival']\n",
"                arrival_airport = arrival_info['airportCode']\n",
"                arrival_scheduled_dt = datetime.datetime.strptime(arrival_info['scheduledGateDateTime'][:19], datetime_str)\n",
"                arrival_estimated_dt = datetime.datetime.strptime(arrival_info['estimatedGateDateTime'][:19], datetime_str)\n",
"\n",
"                # get departure info\n",
"                departure_info = flight_leg_info['flightStatuses'][0]['departure']\n",
"                departure_airport = departure_info['airportCode']\n",
"                departure_scheduled_dt = datetime.datetime.strptime(departure_info['scheduledGateDateTime'], datetime_str)\n",
"                departure_estimated_dt = datetime.datetime.strptime(departure_info['estimatedGateDateTime'], datetime_str)\n",
"\n",
"                # create flat json to convert to dataframe\n",
"                if alert_datetime < (departure_estimated_dt + datetime.timedelta(hours=12)):\n",
"                    df_fstats.append({\n",
"                        'upside_package_id': package_id,\n",
"                        'alert_datetime': alert_datetime,\n",
"                        'current_delay': current_delay,\n",
"                        'previous_delay': previous_delay,\n",
"                        'airline_code': airline_code,\n",
"                        'flight_number': flight_number,\n",
"                        'departure_airport': departure_airport,\n",
"                        'departure_scheduled_datetime': departure_scheduled_dt,\n",
"                        'departure_estimated_datetime': departure_estimated_dt,\n",
"                        'arrival_airport': arrival_airport,\n",
"                        'arrival_scheduled_datetime': arrival_scheduled_dt,\n",
"                        'arrival_estimated_datetime': arrival_estimated_dt\n",
"                    })\n",
"\n",
"df_fstats = pd.DataFrame(df_fstats)\n",
"df_fstats.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Delay stats"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"percent of flights delayed more than 0 minutes: 24.82%\n",
"percent of flights delayed more than 30 minutes: 16.84%\n",
"percent of flights delayed more than 60 minutes: 9.22%\n",
"percent of flights delayed more than 120 minutes: 4.08%\n",
"percent of flights delayed more than 180 minutes: 1.77%\n",
"percent of flights delayed more than 240 minutes: 0.53%\n"
]
}
],
"source": [
"df_delayed = df_fstats.sort_values('alert_datetime', ascending=False).drop_duplicates(subset='upside_package_id')\n",
"\n",
"for delay in [0, 30, 60, 120, 180, 240]:\n",
"    delay_pct = len(df_delayed[df_delayed['current_delay'] > delay]) / len(df_flights) * 100\n",
"    print('percent of flights delayed more than {} minutes: {:.2f}%'.format(delay, delay_pct))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>airline</th>\n",
" <th>num_delays</th>\n",
" <th>num_flights</th>\n",
" <th>percent_delayed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AA</td>\n",
" <td>72.0</td>\n",
" <td>247</td>\n",
" <td>29.149798</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>UA</td>\n",
" <td>44.0</td>\n",
" <td>207</td>\n",
" <td>21.256039</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AS</td>\n",
" <td>6.0</td>\n",
" <td>37</td>\n",
" <td>16.216216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>B6</td>\n",
" <td>10.0</td>\n",
" <td>35</td>\n",
" <td>28.571429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>WN</td>\n",
" <td>7.0</td>\n",
" <td>16</td>\n",
" <td>43.750000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" airline num_delays num_flights percent_delayed\n",
"0 AA 72.0 247 29.149798\n",
"11 UA 44.0 207 21.256039\n",
"2 AS 6.0 37 16.216216\n",
"3 B6 10.0 35 28.571429\n",
"12 WN 7.0 16 43.750000"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_delayed_airlines = pd.concat([df_delayed['airline_code'].value_counts(),\n",
"                                 df_flights['air_primary_carrier'].value_counts()], axis=1)\n",
"df_delayed_airlines = df_delayed_airlines.reset_index()\n",
"df_delayed_airlines.columns = ['airline', 'num_delays', 'num_flights']\n",
"df_delayed_airlines['num_delays'] = df_delayed_airlines['num_delays'].fillna(0)\n",
"df_delayed_airlines['percent_delayed'] = df_delayed_airlines['num_delays'] / df_delayed_airlines['num_flights'] * 100\n",
"df_delayed_airlines = df_delayed_airlines.sort_values('num_flights', ascending=False)\n",
"\n",
"df_delayed_airlines.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# query for packages currently on trip\n",
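"# on-trip = trip start (outbound arrival or hotel start) is in the past and trip end (return arrival, hotel end, or outbound arrival) is still in the future\n",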
"q = '''select *\n",
"from edw.F_PACKAGE_TRANSACTIONS a\n",
"inner join edw.L_UPSIDE_PACKAGES_V c\n",
" on a.UPSIDE_PACKAGE_ID = c.UPSIDE_PACKAGE_ID\n",
"where\n",
" coalesce(c.\"RT_SEG1_ARR_TIME\",c.\"HOTEL_END_DT\",c.\"OB_SEG1_ARR_TIME\") > current_timestamp\n",
"and coalesce(c.\"OB_SEG1_ARR_TIME\", c.HOTEL_START_DT ) < current_timestamp\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}