Advertisement
Guest User

Untitled

a guest
May 29th, 2016
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 39.53 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "## Creating the data frame"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 645,
  13. "metadata": {
  14. "collapsed": false
  15. },
  16. "outputs": [],
  17. "source": [
  18. "%matplotlib inline\n",
  19. "import pandas as pd\n",
  20. "import numpy as np"
  21. ]
  22. },
  23. {
  24. "cell_type": "markdown",
  25. "metadata": {},
  26. "source": [
  27. "Creating data frame from .csv"
  28. ]
  29. },
  30. {
  31. "cell_type": "code",
  32. "execution_count": 647,
  33. "metadata": {
  34. "collapsed": false
  35. },
  36. "outputs": [],
  37. "source": [
  38. "df = pd.read_csv(\"datafood.csv\", low_memory=False)"
  39. ]
  40. },
  41. {
  42. "cell_type": "markdown",
  43. "metadata": {},
  44. "source": [
  45. "Number of rows:"
  46. ]
  47. },
  48. {
  49. "cell_type": "code",
  50. "execution_count": 648,
  51. "metadata": {
  52. "collapsed": false
  53. },
  54. "outputs": [
  55. {
  56. "data": {
  57. "text/plain": [
  58. "52694"
  59. ]
  60. },
  61. "execution_count": 648,
  62. "metadata": {},
  63. "output_type": "execute_result"
  64. }
  65. ],
  66. "source": [
  67. "len(df) "
  68. ]
  69. },
  70. {
  71. "cell_type": "markdown",
  72. "metadata": {},
  73. "source": [
  74. "Number of columns:"
  75. ]
  76. },
  77. {
  78. "cell_type": "code",
  79. "execution_count": 649,
  80. "metadata": {
  81. "collapsed": false
  82. },
  83. "outputs": [
  84. {
  85. "data": {
  86. "text/plain": [
  87. "17"
  88. ]
  89. },
  90. "execution_count": 649,
  91. "metadata": {},
  92. "output_type": "execute_result"
  93. }
  94. ],
  95. "source": [
  96. "len(df.columns)"
  97. ]
  98. },
  99. {
  100. "cell_type": "markdown",
  101. "metadata": {},
  102. "source": [
  103. "A taste of the dataframe:"
  104. ]
  105. },
  106. {
  107. "cell_type": "code",
  108. "execution_count": 650,
  109. "metadata": {
  110. "collapsed": false
  111. },
  112. "outputs": [
  113. {
  114. "data": {
  115. "text/html": [
  116. "<div>\n",
  117. "<table border=\"1\" class=\"dataframe\">\n",
  118. " <thead>\n",
  119. " <tr style=\"text-align: right;\">\n",
  120. " <th></th>\n",
  121. " <th>business_id</th>\n",
  122. " <th>business_name</th>\n",
  123. " <th>business_address</th>\n",
  124. " <th>business_city</th>\n",
  125. " <th>business_state</th>\n",
  126. " <th>business_postal_code</th>\n",
  127. " <th>business_latitude</th>\n",
  128. " <th>business_longitude</th>\n",
  129. " <th>business_location</th>\n",
  130. " <th>business_phone_number</th>\n",
  131. " <th>inspection_id</th>\n",
  132. " <th>inspection_date</th>\n",
  133. " <th>inspection_score</th>\n",
  134. " <th>inspection_type</th>\n",
  135. " <th>violation_id</th>\n",
  136. " <th>violation_description</th>\n",
  137. " <th>risk_category</th>\n",
  138. " </tr>\n",
  139. " </thead>\n",
  140. " <tbody>\n",
  141. " <tr>\n",
  142. " <th>0</th>\n",
  143. " <td>10</td>\n",
  144. " <td>Tiramisu Kitchen</td>\n",
  145. " <td>033 Belden Pl</td>\n",
  146. " <td>San Francisco</td>\n",
  147. " <td>CA</td>\n",
  148. " <td>94104</td>\n",
  149. " <td>37.791116</td>\n",
  150. " <td>-122.403816</td>\n",
  151. " <td>(37.791116, -122.403816)</td>\n",
  152. " <td>NaN</td>\n",
  153. " <td>10_20140114</td>\n",
  154. " <td>1/14/2014 0:00</td>\n",
  155. " <td>92.0</td>\n",
  156. " <td>Routine - Unscheduled</td>\n",
  157. " <td>10_20140114_103119</td>\n",
  158. " <td>Inadequate and inaccessible handwashing facili...</td>\n",
  159. " <td>Moderate Risk</td>\n",
  160. " </tr>\n",
  161. " <tr>\n",
  162. " <th>1</th>\n",
  163. " <td>10</td>\n",
  164. " <td>Tiramisu Kitchen</td>\n",
  165. " <td>033 Belden Pl</td>\n",
  166. " <td>San Francisco</td>\n",
  167. " <td>CA</td>\n",
  168. " <td>94104</td>\n",
  169. " <td>37.791116</td>\n",
  170. " <td>-122.403816</td>\n",
  171. " <td>(37.791116, -122.403816)</td>\n",
  172. " <td>NaN</td>\n",
  173. " <td>10_20140114</td>\n",
  174. " <td>1/14/2014 0:00</td>\n",
  175. " <td>92.0</td>\n",
  176. " <td>Routine - Unscheduled</td>\n",
  177. " <td>10_20140114_103145</td>\n",
  178. " <td>Improper storage of equipment utensils or linens</td>\n",
  179. " <td>Low Risk</td>\n",
  180. " </tr>\n",
  181. " <tr>\n",
  182. " <th>2</th>\n",
  183. " <td>10</td>\n",
  184. " <td>Tiramisu Kitchen</td>\n",
  185. " <td>033 Belden Pl</td>\n",
  186. " <td>San Francisco</td>\n",
  187. " <td>CA</td>\n",
  188. " <td>94104</td>\n",
  189. " <td>37.791116</td>\n",
  190. " <td>-122.403816</td>\n",
  191. " <td>(37.791116, -122.403816)</td>\n",
  192. " <td>NaN</td>\n",
  193. " <td>10_20140114</td>\n",
  194. " <td>1/14/2014 0:00</td>\n",
  195. " <td>92.0</td>\n",
  196. " <td>Routine - Unscheduled</td>\n",
  197. " <td>10_20140114_103154</td>\n",
  198. " <td>Unclean or degraded floors walls or ceilings</td>\n",
  199. " <td>Low Risk</td>\n",
  200. " </tr>\n",
  201. " </tbody>\n",
  202. "</table>\n",
  203. "</div>"
  204. ],
  205. "text/plain": [
  206. " business_id business_name business_address business_city \\\n",
  207. "0 10 Tiramisu Kitchen 033 Belden Pl San Francisco \n",
  208. "1 10 Tiramisu Kitchen 033 Belden Pl San Francisco \n",
  209. "2 10 Tiramisu Kitchen 033 Belden Pl San Francisco \n",
  210. "\n",
  211. " business_state business_postal_code business_latitude business_longitude \\\n",
  212. "0 CA 94104 37.791116 -122.403816 \n",
  213. "1 CA 94104 37.791116 -122.403816 \n",
  214. "2 CA 94104 37.791116 -122.403816 \n",
  215. "\n",
  216. " business_location business_phone_number inspection_id \\\n",
  217. "0 (37.791116, -122.403816) NaN 10_20140114 \n",
  218. "1 (37.791116, -122.403816) NaN 10_20140114 \n",
  219. "2 (37.791116, -122.403816) NaN 10_20140114 \n",
  220. "\n",
  221. " inspection_date inspection_score inspection_type \\\n",
  222. "0 1/14/2014 0:00 92.0 Routine - Unscheduled \n",
  223. "1 1/14/2014 0:00 92.0 Routine - Unscheduled \n",
  224. "2 1/14/2014 0:00 92.0 Routine - Unscheduled \n",
  225. "\n",
  226. " violation_id violation_description \\\n",
  227. "0 10_20140114_103119 Inadequate and inaccessible handwashing facili... \n",
  228. "1 10_20140114_103145 Improper storage of equipment utensils or linens \n",
  229. "2 10_20140114_103154 Unclean or degraded floors walls or ceilings \n",
  230. "\n",
  231. " risk_category \n",
  232. "0 Moderate Risk \n",
  233. "1 Low Risk \n",
  234. "2 Low Risk "
  235. ]
  236. },
  237. "execution_count": 650,
  238. "metadata": {},
  239. "output_type": "execute_result"
  240. }
  241. ],
  242. "source": [
  243. "df.head(3)"
  244. ]
  245. },
  246. {
  247. "cell_type": "markdown",
  248. "metadata": {},
  249. "source": [
  250. "Since this attempts to assess the relationship between location and safety of sold foods, my first step will be filter out all the businesses that conform with health regulations (NaN observations of the variable \"risk_category\"):"
  251. ]
  252. },
  253. {
  254. "cell_type": "code",
  255. "execution_count": 651,
  256. "metadata": {
  257. "collapsed": false
  258. },
  259. "outputs": [],
  260. "source": [
  261. "df = df[df.risk_category.notnull()]"
  262. ]
  263. },
  264. {
  265. "cell_type": "markdown",
  266. "metadata": {},
  267. "source": [
  268. "But which variable should I use as a proxy for location? "
  269. ]
  270. },
  271. {
  272. "cell_type": "markdown",
  273. "metadata": {},
  274. "source": [
  275. "At first I intended to use \"business_postal_code\", but I have come to realize that \"business_latitude\" and \"business_longitude\" can be plotted into a map, which it would bring usefull information. Let us take a look at the missing values. "
  276. ]
  277. },
  278. {
  279. "cell_type": "code",
  280. "execution_count": 652,
  281. "metadata": {
  282. "collapsed": false
  283. },
  284. "outputs": [
  285. {
  286. "data": {
  287. "text/plain": [
  288. "909"
  289. ]
  290. },
  291. "execution_count": 652,
  292. "metadata": {},
  293. "output_type": "execute_result"
  294. }
  295. ],
  296. "source": [
  297. "len(df[df.business_postal_code.isnull()])"
  298. ]
  299. },
  300. {
  301. "cell_type": "code",
  302. "execution_count": 653,
  303. "metadata": {
  304. "collapsed": false
  305. },
  306. "outputs": [
  307. {
  308. "data": {
  309. "text/plain": [
  310. "9306"
  311. ]
  312. },
  313. "execution_count": 653,
  314. "metadata": {},
  315. "output_type": "execute_result"
  316. }
  317. ],
  318. "source": [
  319. "len(df[df.business_latitude.isnull()])"
  320. ]
  321. },
  322. {
  323. "cell_type": "code",
  324. "execution_count": 654,
  325. "metadata": {
  326. "collapsed": false
  327. },
  328. "outputs": [
  329. {
  330. "data": {
  331. "text/plain": [
  332. "9306"
  333. ]
  334. },
  335. "execution_count": 654,
  336. "metadata": {},
  337. "output_type": "execute_result"
  338. }
  339. ],
  340. "source": [
  341. "len(df[df.business_longitude.isnull()])"
  342. ]
  343. },
  344. {
  345. "cell_type": "code",
  346. "execution_count": 655,
  347. "metadata": {
  348. "collapsed": false
  349. },
  350. "outputs": [
  351. {
  352. "data": {
  353. "text/plain": [
  354. "39864"
  355. ]
  356. },
  357. "execution_count": 655,
  358. "metadata": {},
  359. "output_type": "execute_result"
  360. }
  361. ],
  362. "source": [
  363. "len(df)"
  364. ]
  365. },
  366. {
  367. "cell_type": "markdown",
  368. "metadata": {},
  369. "source": [
  370. "9306 missing values of 39864 rows, quite a number. I stick to \"business_postal_code\""
  371. ]
  372. },
  373. {
  374. "cell_type": "markdown",
  375. "metadata": {},
  376. "source": [
  377. "My focus is on the variables related to risk category and location. The variable \"business_name\" may also be interesting later on. \n",
  378. "\n",
  379. "I drop the following columns:\n",
  380. "\n",
  381. "- business_city (\"San Francisco\" for all observations)\n",
  382. "- business_state (\"CA\" for all observations)\n",
  383. "- business_phone_number\n",
  384. "- inspection_id\n",
  385. "- business_location\n",
  386. "- violation_id- inspection_score (irrelevant)\n",
  387. "- inspection_date\n",
  388. "- inspection_type"
  389. ]
  390. },
  391. {
  392. "cell_type": "code",
  393. "execution_count": 656,
  394. "metadata": {
  395. "collapsed": false
  396. },
  397. "outputs": [
  398. {
  399. "data": {
  400. "text/html": [
  401. "<div>\n",
  402. "<table border=\"1\" class=\"dataframe\">\n",
  403. " <thead>\n",
  404. " <tr style=\"text-align: right;\">\n",
  405. " <th></th>\n",
  406. " <th>business_id</th>\n",
  407. " <th>business_name</th>\n",
  408. " <th>business_address</th>\n",
  409. " <th>business_city</th>\n",
  410. " <th>business_state</th>\n",
  411. " <th>business_postal_code</th>\n",
  412. " <th>business_latitude</th>\n",
  413. " <th>business_longitude</th>\n",
  414. " <th>business_location</th>\n",
  415. " <th>business_phone_number</th>\n",
  416. " <th>inspection_id</th>\n",
  417. " <th>inspection_date</th>\n",
  418. " <th>inspection_score</th>\n",
  419. " <th>inspection_type</th>\n",
  420. " <th>violation_id</th>\n",
  421. " <th>violation_description</th>\n",
  422. " <th>risk_category</th>\n",
  423. " </tr>\n",
  424. " </thead>\n",
  425. " <tbody>\n",
  426. " <tr>\n",
  427. " <th>0</th>\n",
  428. " <td>10</td>\n",
  429. " <td>Tiramisu Kitchen</td>\n",
  430. " <td>033 Belden Pl</td>\n",
  431. " <td>San Francisco</td>\n",
  432. " <td>CA</td>\n",
  433. " <td>94104</td>\n",
  434. " <td>37.791116</td>\n",
  435. " <td>-122.403816</td>\n",
  436. " <td>(37.791116, -122.403816)</td>\n",
  437. " <td>NaN</td>\n",
  438. " <td>10_20140114</td>\n",
  439. " <td>1/14/2014 0:00</td>\n",
  440. " <td>92.0</td>\n",
  441. " <td>Routine - Unscheduled</td>\n",
  442. " <td>10_20140114_103119</td>\n",
  443. " <td>Inadequate and inaccessible handwashing facili...</td>\n",
  444. " <td>Moderate Risk</td>\n",
  445. " </tr>\n",
  446. " <tr>\n",
  447. " <th>1</th>\n",
  448. " <td>10</td>\n",
  449. " <td>Tiramisu Kitchen</td>\n",
  450. " <td>033 Belden Pl</td>\n",
  451. " <td>San Francisco</td>\n",
  452. " <td>CA</td>\n",
  453. " <td>94104</td>\n",
  454. " <td>37.791116</td>\n",
  455. " <td>-122.403816</td>\n",
  456. " <td>(37.791116, -122.403816)</td>\n",
  457. " <td>NaN</td>\n",
  458. " <td>10_20140114</td>\n",
  459. " <td>1/14/2014 0:00</td>\n",
  460. " <td>92.0</td>\n",
  461. " <td>Routine - Unscheduled</td>\n",
  462. " <td>10_20140114_103145</td>\n",
  463. " <td>Improper storage of equipment utensils or linens</td>\n",
  464. " <td>Low Risk</td>\n",
  465. " </tr>\n",
  466. " </tbody>\n",
  467. "</table>\n",
  468. "</div>"
  469. ],
  470. "text/plain": [
  471. " business_id business_name business_address business_city \\\n",
  472. "0 10 Tiramisu Kitchen 033 Belden Pl San Francisco \n",
  473. "1 10 Tiramisu Kitchen 033 Belden Pl San Francisco \n",
  474. "\n",
  475. " business_state business_postal_code business_latitude business_longitude \\\n",
  476. "0 CA 94104 37.791116 -122.403816 \n",
  477. "1 CA 94104 37.791116 -122.403816 \n",
  478. "\n",
  479. " business_location business_phone_number inspection_id \\\n",
  480. "0 (37.791116, -122.403816) NaN 10_20140114 \n",
  481. "1 (37.791116, -122.403816) NaN 10_20140114 \n",
  482. "\n",
  483. " inspection_date inspection_score inspection_type \\\n",
  484. "0 1/14/2014 0:00 92.0 Routine - Unscheduled \n",
  485. "1 1/14/2014 0:00 92.0 Routine - Unscheduled \n",
  486. "\n",
  487. " violation_id violation_description \\\n",
  488. "0 10_20140114_103119 Inadequate and inaccessible handwashing facili... \n",
  489. "1 10_20140114_103145 Improper storage of equipment utensils or linens \n",
  490. "\n",
  491. " risk_category \n",
  492. "0 Moderate Risk \n",
  493. "1 Low Risk "
  494. ]
  495. },
  496. "execution_count": 656,
  497. "metadata": {},
  498. "output_type": "execute_result"
  499. }
  500. ],
  501. "source": [
  502. "df.head(2)"
  503. ]
  504. },
  505. {
  506. "cell_type": "code",
  507. "execution_count": 657,
  508. "metadata": {
  509. "collapsed": false
  510. },
  511. "outputs": [],
  512. "source": [
  513. "df = df.drop([\"business_address\", \"inspection_id\", \"inspection_type\", \"business_phone_number\", \"inspection_score\", \"violation_id\", \"inspection_date\", \"business_city\", \"business_state\", \"business_latitude\", \"business_longitude\", \"business_location\"], axis=1)"
  514. ]
  515. },
  516. {
  517. "cell_type": "code",
  518. "execution_count": 658,
  519. "metadata": {
  520. "collapsed": false
  521. },
  522. "outputs": [
  523. {
  524. "data": {
  525. "text/html": [
  526. "<div>\n",
  527. "<table border=\"1\" class=\"dataframe\">\n",
  528. " <thead>\n",
  529. " <tr style=\"text-align: right;\">\n",
  530. " <th></th>\n",
  531. " <th>business_id</th>\n",
  532. " <th>business_name</th>\n",
  533. " <th>business_postal_code</th>\n",
  534. " <th>violation_description</th>\n",
  535. " <th>risk_category</th>\n",
  536. " </tr>\n",
  537. " </thead>\n",
  538. " <tbody>\n",
  539. " <tr>\n",
  540. " <th>0</th>\n",
  541. " <td>10</td>\n",
  542. " <td>Tiramisu Kitchen</td>\n",
  543. " <td>94104</td>\n",
  544. " <td>Inadequate and inaccessible handwashing facili...</td>\n",
  545. " <td>Moderate Risk</td>\n",
  546. " </tr>\n",
  547. " <tr>\n",
  548. " <th>1</th>\n",
  549. " <td>10</td>\n",
  550. " <td>Tiramisu Kitchen</td>\n",
  551. " <td>94104</td>\n",
  552. " <td>Improper storage of equipment utensils or linens</td>\n",
  553. " <td>Low Risk</td>\n",
  554. " </tr>\n",
  555. " </tbody>\n",
  556. "</table>\n",
  557. "</div>"
  558. ],
  559. "text/plain": [
  560. " business_id business_name business_postal_code \\\n",
  561. "0 10 Tiramisu Kitchen 94104 \n",
  562. "1 10 Tiramisu Kitchen 94104 \n",
  563. "\n",
  564. " violation_description risk_category \n",
  565. "0 Inadequate and inaccessible handwashing facili... Moderate Risk \n",
  566. "1 Improper storage of equipment utensils or linens Low Risk "
  567. ]
  568. },
  569. "execution_count": 658,
  570. "metadata": {},
  571. "output_type": "execute_result"
  572. }
  573. ],
  574. "source": [
  575. "df.head(2)"
  576. ]
  577. },
  578. {
  579. "cell_type": "code",
  580. "execution_count": null,
  581. "metadata": {
  582. "collapsed": true
  583. },
  584. "outputs": [],
  585. "source": []
  586. },
  587. {
  588. "cell_type": "markdown",
  589. "metadata": {},
  590. "source": [
  591. "## Distribution of \"risk_category\""
  592. ]
  593. },
  594. {
  595. "cell_type": "markdown",
  596. "metadata": {},
  597. "source": [
  598. "### Absolute frequency. "
  599. ]
  600. },
  601. {
  602. "cell_type": "code",
  603. "execution_count": 659,
  604. "metadata": {
  605. "collapsed": false
  606. },
  607. "outputs": [
  608. {
  609. "data": {
  610. "text/plain": [
  611. "Low Risk 20797\n",
  612. "Moderate Risk 13492\n",
  613. "High Risk 5575\n",
  614. "Name: risk_category, dtype: int64"
  615. ]
  616. },
  617. "execution_count": 659,
  618. "metadata": {},
  619. "output_type": "execute_result"
  620. }
  621. ],
  622. "source": [
  623. "df.risk_category.value_counts(dropna=False, sort=True)"
  624. ]
  625. },
  626. {
  627. "cell_type": "markdown",
  628. "metadata": {},
  629. "source": [
  630. "### Relative frequency. "
  631. ]
  632. },
  633. {
  634. "cell_type": "code",
  635. "execution_count": 660,
  636. "metadata": {
  637. "collapsed": false
  638. },
  639. "outputs": [
  640. {
  641. "data": {
  642. "text/plain": [
  643. "Low Risk 0.521699\n",
  644. "Moderate Risk 0.338451\n",
  645. "High Risk 0.139850\n",
  646. "Name: risk_category, dtype: float64"
  647. ]
  648. },
  649. "execution_count": 660,
  650. "metadata": {},
  651. "output_type": "execute_result"
  652. }
  653. ],
  654. "source": [
  655. "df.risk_category.value_counts(dropna=False, normalize=True, sort=True)"
  656. ]
  657. },
  658. {
  659. "cell_type": "markdown",
  660. "metadata": {},
  661. "source": [
  662. "## Distribution of \"business_postal_code\""
  663. ]
  664. },
  665. {
  666. "cell_type": "code",
  667. "execution_count": 661,
  668. "metadata": {
  669. "collapsed": false
  670. },
  671. "outputs": [
  672. {
  673. "data": {
  674. "text/plain": [
  675. "NaN 909\n",
  676. "94158 14\n",
  677. "94014 16\n",
  678. "94013 2\n",
  679. "CA 15\n",
  680. "94109 3258\n",
  681. "94108 1608\n",
  682. "94101 8\n",
  683. "94103 3342\n",
  684. "94102 2636\n",
  685. "94105 761\n",
  686. "94104 741\n",
  687. "94107 1959\n",
  688. "0 11\n",
  689. "94118 1520\n",
  690. "94112 1312\n",
  691. "94110 3833\n",
  692. "94111 960\n",
  693. "94116 668\n",
  694. "94117 1096\n",
  695. "94114 1562\n",
  696. "94115 1460\n",
  697. "94609 10\n",
  698. "94513 6\n",
  699. "92672 8\n",
  700. "94127 545\n",
  701. "94124 806\n",
  702. "94123 1395\n",
  703. "94122 2223\n",
  704. "94121 1383\n",
  705. "94120 10\n",
  706. "94129 8\n",
  707. "941033148 2\n",
  708. "94130 11\n",
  709. "94131 388\n",
  710. "94132 669\n",
  711. "94133 4134\n",
  712. "94134 545\n",
  713. "94140 3\n",
  714. "94143 27\n",
  715. "Name: business_postal_code, dtype: int64"
  716. ]
  717. },
  718. "execution_count": 661,
  719. "metadata": {},
  720. "output_type": "execute_result"
  721. }
  722. ],
  723. "source": [
  724. "df.business_postal_code.value_counts(dropna=False, sort=False)"
  725. ]
  726. },
  727. {
  728. "cell_type": "markdown",
  729. "metadata": {},
  730. "source": [
  731. "The variable shows some issues: postal cod 0, 941033148 (probably a typo), and CA. Other issues are the 1267 rows with NaN postal number (aprox. 2,5% of the total). We clean up those rows."
  732. ]
  733. },
  734. {
  735. "cell_type": "code",
  736. "execution_count": 662,
  737. "metadata": {
  738. "collapsed": false
  739. },
  740. "outputs": [],
  741. "source": [
  742. "df = df[df.business_postal_code.notnull()]"
  743. ]
  744. },
  745. {
  746. "cell_type": "code",
  747. "execution_count": 663,
  748. "metadata": {
  749. "collapsed": false
  750. },
  751. "outputs": [],
  752. "source": [
  753. "df = df[df.business_postal_code!=\"941033148\"]"
  754. ]
  755. },
  756. {
  757. "cell_type": "code",
  758. "execution_count": 664,
  759. "metadata": {
  760. "collapsed": true
  761. },
  762. "outputs": [],
  763. "source": [
  764. "df = df[df.business_postal_code!=\"0\"]"
  765. ]
  766. },
  767. {
  768. "cell_type": "code",
  769. "execution_count": 665,
  770. "metadata": {
  771. "collapsed": false
  772. },
  773. "outputs": [],
  774. "source": [
  775. "df = df[df.business_postal_code!=\"CA\"]"
  776. ]
  777. },
  778. {
  779. "cell_type": "code",
  780. "execution_count": 666,
  781. "metadata": {
  782. "collapsed": false
  783. },
  784. "outputs": [],
  785. "source": [
  786. "df.business_postal_code = pd.to_numeric(df.business_postal_code) "
  787. ]
  788. },
  789. {
  790. "cell_type": "markdown",
  791. "metadata": {},
  792. "source": [
  793. "### Absolute frequency"
  794. ]
  795. },
  796. {
  797. "cell_type": "code",
  798. "execution_count": 667,
  799. "metadata": {
  800. "collapsed": false
  801. },
  802. "outputs": [
  803. {
  804. "data": {
  805. "text/plain": [
  806. "business_postal_code\n",
  807. "92672 8\n",
  808. "94013 2\n",
  809. "94014 16\n",
  810. "94101 8\n",
  811. "94102 2636\n",
  812. "94103 3342\n",
  813. "94104 741\n",
  814. "94105 761\n",
  815. "94107 1959\n",
  816. "94108 1608\n",
  817. "94109 3258\n",
  818. "94110 3833\n",
  819. "94111 960\n",
  820. "94112 1312\n",
  821. "94114 1562\n",
  822. "94115 1460\n",
  823. "94116 668\n",
  824. "94117 1096\n",
  825. "94118 1520\n",
  826. "94120 10\n",
  827. "94121 1383\n",
  828. "94122 2223\n",
  829. "94123 1395\n",
  830. "94124 806\n",
  831. "94127 545\n",
  832. "94129 8\n",
  833. "94130 11\n",
  834. "94131 388\n",
  835. "94132 669\n",
  836. "94133 4134\n",
  837. "94134 545\n",
  838. "94140 3\n",
  839. "94143 27\n",
  840. "94158 14\n",
  841. "94513 6\n",
  842. "94609 10\n",
  843. "dtype: int64"
  844. ]
  845. },
  846. "execution_count": 667,
  847. "metadata": {},
  848. "output_type": "execute_result"
  849. }
  850. ],
  851. "source": [
  852. "df.groupby(\"business_postal_code\").size()"
  853. ]
  854. },
  855. {
  856. "cell_type": "markdown",
  857. "metadata": {},
  858. "source": [
  859. "### Relative frequency."
  860. ]
  861. },
  862. {
  863. "cell_type": "code",
  864. "execution_count": 668,
  865. "metadata": {
  866. "collapsed": false
  867. },
  868. "outputs": [
  869. {
  870. "data": {
  871. "text/plain": [
  872. "business_postal_code\n",
  873. "92672 0.020551\n",
  874. "94013 0.005138\n",
  875. "94014 0.041103\n",
  876. "94101 0.020551\n",
  877. "94102 6.771649\n",
  878. "94103 8.585301\n",
  879. "94104 1.903563\n",
  880. "94105 1.954941\n",
  881. "94107 5.032497\n",
  882. "94108 4.130809\n",
  883. "94109 8.369512\n",
  884. "94110 9.846636\n",
  885. "94111 2.466155\n",
  886. "94112 3.370411\n",
  887. "94114 4.012639\n",
  888. "94115 3.750610\n",
  889. "94116 1.716033\n",
  890. "94117 2.815526\n",
  891. "94118 3.904745\n",
  892. "94120 0.025689\n",
  893. "94121 3.552804\n",
  894. "94122 5.710689\n",
  895. "94123 3.583631\n",
  896. "94124 2.070542\n",
  897. "94127 1.400057\n",
  898. "94129 0.020551\n",
  899. "94130 0.028258\n",
  900. "94131 0.996737\n",
  901. "94132 1.718601\n",
  902. "94133 10.619878\n",
  903. "94134 1.400057\n",
  904. "94140 0.007707\n",
  905. "94143 0.069361\n",
  906. "94158 0.035965\n",
  907. "94513 0.015413\n",
  908. "94609 0.025689\n",
  909. "dtype: float64"
  910. ]
  911. },
  912. "execution_count": 668,
  913. "metadata": {},
  914. "output_type": "execute_result"
  915. }
  916. ],
  917. "source": [
  918. "df.groupby(\"business_postal_code\").size()/len(df)*100"
  919. ]
  920. },
  921. {
  922. "cell_type": "code",
  923. "execution_count": null,
  924. "metadata": {
  925. "collapsed": true
  926. },
  927. "outputs": [],
  928. "source": []
  929. },
  930. {
  931. "cell_type": "markdown",
  932. "metadata": {},
  933. "source": [
  934. "## Distribution of \"violation_description\""
  935. ]
  936. },
  937. {
  938. "cell_type": "markdown",
  939. "metadata": {},
  940. "source": [
  941. "### Absolute frequency"
  942. ]
  943. },
  944. {
  945. "cell_type": "code",
  946. "execution_count": 669,
  947. "metadata": {
  948. "collapsed": false
  949. },
  950. "outputs": [
  951. {
  952. "data": {
  953. "text/plain": [
  954. "risk_category violation_description \n",
  955. "High Risk Contaminated or adulterated food 192\n",
  956. " High risk food holding temperature 1527\n",
  957. " High risk vermin infestation 808\n",
  958. " Improper cooking time or temperatures 12\n",
  959. " Improper cooling methods 752\n",
  960. " Improper reheating of food 143\n",
  961. " Mobile food facility not operating with an approved commissary 2\n",
  962. " Mobile food facility with unapproved operating conditions 8\n",
  963. " No hot water or running water 178\n",
  964. " No restroom facility within 200 feet of mobile food facility 1\n",
  965. " Other high risk violation 29\n",
  966. " Prohibited food served to sensitive population 4\n",
  967. " Sewage or wastewater contamination 37\n",
  968. " Unapproved food source 24\n",
  969. " Unauthorized or unsafe use of time as a public health control measure 84\n",
  970. " Unclean hands or improper use of gloves 648\n",
  971. " Unclean or unsanitary food contact surfaces 991\n",
  972. " Unreported or unrestricted ill employee with communicable disease 5\n",
  973. "Low Risk Food safety certificate or food handler card not available 1025\n",
  974. " Improper food labeling or menu misrepresentation 152\n",
  975. " Improper food storage 2108\n",
  976. " Improper or defective plumbing 703\n",
  977. " Improper storage of equipment utensils or linens 1184\n",
  978. " Improper storage use or identification of toxic substances 525\n",
  979. " Improperly washed fruits and vegetables 15\n",
  980. " Inadequate HACCP plan record keeping 19\n",
  981. " Inadequate dressing rooms or improper storage of personal items 248\n",
  982. " Inadequate or unsanitary refuse containers or area or no garbage service 174\n",
  983. " Inadequate ventilation or lighting 552\n",
  984. " Inadequate warewashing facilities or equipment 549\n",
  985. " ... \n",
  986. " Unapproved living quarters in food facility 23\n",
  987. " Unapproved or unmaintained equipment or utensils 2333\n",
  988. " Unclean nonfood contact surfaces 2161\n",
  989. " Unclean or degraded floors walls or ceilings 3357\n",
  990. " Unclean unmaintained or improperly constructed toilet facilities 242\n",
  991. " Unpermitted food facility 44\n",
  992. " Unsanitary employee garments hair or nails 49\n",
  993. " Wiping cloths not clean or properly stored or inadequate sanitizer 2224\n",
  994. " Worker safety hazards 118\n",
  995. "Moderate Risk Consumer advisory not provided for raw or undercooked foods 29\n",
  996. " Discharge from employee nose mouth or eye 4\n",
  997. " Employee eating or smoking 151\n",
  998. " Food in poor condition 92\n",
  999. " Foods not protected from contamination 1789\n",
  1000. " Improper thawing methods 516\n",
  1001. " Improperly displayed mobile food permit or signage 2\n",
  1002. " Inadequate and inaccessible handwashing facilities 2380\n",
  1003. " Inadequate food safety knowledge or lack of certified food safety manager 1071\n",
  1004. " Inadequate procedures or records for time as a public health control 219\n",
  1005. " Inadequate sewage or wastewater disposal 79\n",
  1006. " Inadequately cleaned or sanitized food contact surfaces 2132\n",
  1007. " Insufficient hot water or running water 483\n",
  1008. " Moderate risk food holding temperature 2580\n",
  1009. " Moderate risk vermin infestation 1439\n",
  1010. " Noncompliance with Gulf Coast oyster regulation 1\n",
  1011. " Noncompliance with HAACP plan or variance 49\n",
  1012. " Noncompliance with shell fish tags or display 45\n",
  1013. " Other moderate risk violation 82\n",
  1014. " Reservice of previously served foods 6\n",
  1015. " Unsanitary mobile food facility 1\n",
  1016. "dtype: int64"
  1017. ]
  1018. },
  1019. "execution_count": 669,
  1020. "metadata": {},
  1021. "output_type": "execute_result"
  1022. }
  1023. ],
  1024. "source": [
  1025. "df.groupby([\"risk_category\", \"violation_description\"]).size()"
  1026. ]
  1027. },
  1028. {
  1029. "cell_type": "markdown",
  1030. "metadata": {},
  1031. "source": [
  1032. "### Relative frequency"
  1033. ]
  1034. },
  1035. {
  1036. "cell_type": "code",
  1037. "execution_count": 670,
  1038. "metadata": {
  1039. "collapsed": false
  1040. },
  1041. "outputs": [
  1042. {
  1043. "data": {
  1044. "text/plain": [
  1045. "risk_category violation_description \n",
  1046. "High Risk Contaminated or adulterated food 0.493231\n",
  1047. " High risk food holding temperature 3.922727\n",
  1048. " High risk vermin infestation 2.075680\n",
  1049. " Improper cooking time or temperatures 0.030827\n",
  1050. " Improper cooling methods 1.931821\n",
  1051. " Improper reheating of food 0.367354\n",
  1052. " Mobile food facility not operating with an approved commissary 0.005138\n",
  1053. " Mobile food facility with unapproved operating conditions 0.020551\n",
  1054. " No hot water or running water 0.457266\n",
  1055. " No restroom facility within 200 feet of mobile food facility 0.002569\n",
  1056. " Other high risk violation 0.074498\n",
  1057. " Prohibited food served to sensitive population 0.010276\n",
  1058. " Sewage or wastewater contamination 0.095050\n",
  1059. " Unapproved food source 0.061654\n",
  1060. " Unauthorized or unsafe use of time as a public health control measure 0.215789\n",
  1061. " Unclean hands or improper use of gloves 1.664654\n",
  1062. " Unclean or unsanitary food contact surfaces 2.545791\n",
  1063. " Unreported or unrestricted ill employee with communicable disease 0.012845\n",
  1064. "Low Risk Food safety certificate or food handler card not available 2.633134\n",
  1065. " Improper food labeling or menu misrepresentation 0.390474\n",
  1066. " Improper food storage 5.415264\n",
  1067. " Improper or defective plumbing 1.805944\n",
  1068. " Improper storage of equipment utensils or linens 3.041591\n",
  1069. " Improper storage use or identification of toxic substances 1.348678\n",
  1070. " Improperly washed fruits and vegetables 0.038534\n",
  1071. " Inadequate HACCP plan record keeping 0.048809\n",
  1072. " Inadequate dressing rooms or improper storage of personal items 0.637090\n",
  1073. " Inadequate or unsanitary refuse containers or area or no garbage service 0.446991\n",
  1074. " Inadequate ventilation or lighting 1.418039\n",
  1075. " Inadequate warewashing facilities or equipment 1.410332\n",
  1076. " ... \n",
  1077. " Unapproved living quarters in food facility 0.059085\n",
  1078. " Unapproved or unmaintained equipment or utensils 5.993269\n",
  1079. " Unclean nonfood contact surfaces 5.551417\n",
  1080. " Unclean or degraded floors walls or ceilings 8.623834\n",
  1081. " Unclean unmaintained or improperly constructed toilet facilities 0.621676\n",
  1082. " Unpermitted food facility 0.113032\n",
  1083. " Unsanitary employee garments hair or nails 0.125877\n",
  1084. " Wiping cloths not clean or properly stored or inadequate sanitizer 5.713258\n",
  1085. " Worker safety hazards 0.303132\n",
  1086. "Moderate Risk Consumer advisory not provided for raw or undercooked foods 0.074498\n",
  1087. " Discharge from employee nose mouth or eye 0.010276\n",
  1088. " Employee eating or smoking 0.387906\n",
  1089. " Food in poor condition 0.236340\n",
  1090. " Foods not protected from contamination 4.595782\n",
  1091. " Improper thawing methods 1.325558\n",
  1092. " Improperly displayed mobile food permit or signage 0.005138\n",
  1093. " Inadequate and inaccessible handwashing facilities 6.114008\n",
  1094. " Inadequate food safety knowledge or lack of certified food safety manager 2.751304\n",
  1095. " Inadequate procedures or records for time as a public health control 0.562592\n",
  1096. " Inadequate sewage or wastewater disposal 0.202944\n",
  1097. " Inadequately cleaned or sanitized food contact surfaces 5.476918\n",
  1098. " Insufficient hot water or running water 1.240784\n",
  1099. " Moderate risk food holding temperature 6.627790\n",
  1100. " Moderate risk vermin infestation 3.696663\n",
  1101. " Noncompliance with Gulf Coast oyster regulation 0.002569\n",
  1102. " Noncompliance with HAACP plan or variance 0.125877\n",
  1103. " Noncompliance with shell fish tags or display 0.115601\n",
  1104. " Other moderate risk violation 0.210651\n",
  1105. " Reservice of previously served foods 0.015413\n",
  1106. " Unsanitary mobile food facility 0.002569\n",
  1107. "dtype: float64"
  1108. ]
  1109. },
  1110. "execution_count": 670,
  1111. "metadata": {},
  1112. "output_type": "execute_result"
  1113. }
  1114. ],
  1115. "source": [
  1116. "df.groupby([\"risk_category\", \"violation_description\"]).size()/len(df)*100"
  1117. ]
  1118. },
  1119. {
  1120. "cell_type": "code",
  1121. "execution_count": null,
  1122. "metadata": {
  1123. "collapsed": true
  1124. },
  1125. "outputs": [],
  1126. "source": []
  1127. },
  1128. {
  1129. "cell_type": "code",
  1130. "execution_count": null,
  1131. "metadata": {
  1132. "collapsed": true
  1133. },
  1134. "outputs": [],
  1135. "source": []
  1136. },
  1137. {
  1138. "cell_type": "code",
  1139. "execution_count": null,
  1140. "metadata": {
  1141. "collapsed": true
  1142. },
  1143. "outputs": [],
  1144. "source": []
  1145. }
  1146. ],
  1147. "metadata": {
  1148. "kernelspec": {
  1149. "display_name": "Python 2",
  1150. "language": "python",
  1151. "name": "python2"
  1152. },
  1153. "language_info": {
  1154. "codemirror_mode": {
  1155. "name": "ipython",
  1156. "version": 2
  1157. },
  1158. "file_extension": ".py",
  1159. "mimetype": "text/x-python",
  1160. "name": "python",
  1161. "nbconvert_exporter": "python",
  1162. "pygments_lexer": "ipython2",
  1163. "version": "2.7.11"
  1164. }
  1165. },
  1166. "nbformat": 4,
  1167. "nbformat_minor": 0
  1168. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement