Advertisement
Guest User

Untitled

a guest
Jun 19th, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 25.85 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "### installing packages"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 347,
  13. "metadata": {},
  14. "outputs": [
  15. {
  16. "name": "stdout",
  17. "output_type": "stream",
  18. "text": [
  19. "Requirement already satisfied: beautifulsoup4 in /home/jupyterlab/conda/lib/python3.6/site-packages (4.7.1)\n",
  20. "Requirement already satisfied: soupsieve>=1.2 in /home/jupyterlab/conda/lib/python3.6/site-packages (from beautifulsoup4) (1.8)\n",
  21. "Note: you may need to restart the kernel to use updated packages.\n"
  22. ]
  23. }
  24. ],
  25. "source": [
  26. "pip install beautifulsoup4"
  27. ]
  28. },
  29. {
  30. "cell_type": "code",
  31. "execution_count": 348,
  32. "metadata": {},
  33. "outputs": [
  34. {
  35. "name": "stdout",
  36. "output_type": "stream",
  37. "text": [
  38. "Requirement already satisfied: lxml in /home/jupyterlab/conda/lib/python3.6/site-packages (4.3.0)\n",
  39. "Note: you may need to restart the kernel to use updated packages.\n"
  40. ]
  41. }
  42. ],
  43. "source": [
  44. "pip install lxml"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": 349,
  50. "metadata": {},
  51. "outputs": [
  52. {
  53. "name": "stdout",
  54. "output_type": "stream",
  55. "text": [
  56. "Requirement already satisfied: html5lib in /home/jupyterlab/conda/lib/python3.6/site-packages (0.9999999)\n",
  57. "Note: you may need to restart the kernel to use updated packages.\n"
  58. ]
  59. }
  60. ],
  61. "source": [
  62. "pip install html5lib"
  63. ]
  64. },
  65. {
  66. "cell_type": "code",
  67. "execution_count": 350,
  68. "metadata": {},
  69. "outputs": [
  70. {
  71. "name": "stdout",
  72. "output_type": "stream",
  73. "text": [
  74. "Requirement already satisfied: requests in /home/jupyterlab/conda/lib/python3.6/site-packages (2.21.0)\n",
  75. "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/jupyterlab/conda/lib/python3.6/site-packages (from requests) (3.0.4)\n",
  76. "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /home/jupyterlab/conda/lib/python3.6/site-packages (from requests) (1.24.1)\n",
  77. "Requirement already satisfied: certifi>=2017.4.17 in /home/jupyterlab/conda/lib/python3.6/site-packages (from requests) (2019.3.9)\n",
  78. "Requirement already satisfied: idna<2.9,>=2.5 in /home/jupyterlab/conda/lib/python3.6/site-packages (from requests) (2.8)\n",
  79. "Note: you may need to restart the kernel to use updated packages.\n"
  80. ]
  81. }
  82. ],
  83. "source": [
  84. "pip install requests"
  85. ]
  86. },
  87. {
  88. "cell_type": "code",
  89. "execution_count": 351,
  90. "metadata": {},
  91. "outputs": [],
  92. "source": [
  93. "from bs4 import BeautifulSoup"
  94. ]
  95. },
  96. {
  97. "cell_type": "code",
  98. "execution_count": 352,
  99. "metadata": {},
  100. "outputs": [],
  101. "source": [
  102. "import requests\n",
  103. "import pandas as pd"
  104. ]
  105. },
  106. {
  107. "cell_type": "markdown",
  108. "metadata": {},
  109. "source": [
  110. "### open html file"
  111. ]
  112. },
  113. {
  114. "cell_type": "code",
  115. "execution_count": 353,
  116. "metadata": {},
  117. "outputs": [],
  118. "source": [
  119. "path='Wikipedia.html'\n",
  120. "with open(path) as html_file:\n",
  121. " soup=BeautifulSoup(html_file,'lxml')\n"
  122. ]
  123. },
  124. {
  125. "cell_type": "markdown",
  126. "metadata": {},
  127. "source": [
  128. "### handling data"
  129. ]
  130. },
  131. {
  132. "cell_type": "code",
  133. "execution_count": 407,
  134. "metadata": {},
  135. "outputs": [
  136. {
  137. "data": {
  138. "text/html": [
  139. "<div>\n",
  140. "<style scoped>\n",
  141. " .dataframe tbody tr th:only-of-type {\n",
  142. " vertical-align: middle;\n",
  143. " }\n",
  144. "\n",
  145. " .dataframe tbody tr th {\n",
  146. " vertical-align: top;\n",
  147. " }\n",
  148. "\n",
  149. " .dataframe thead th {\n",
  150. " text-align: right;\n",
  151. " }\n",
  152. "</style>\n",
  153. "<table border=\"1\" class=\"dataframe\">\n",
  154. " <thead>\n",
  155. " <tr style=\"text-align: right;\">\n",
  156. " <th></th>\n",
  157. " <th>Postcode</th>\n",
  158. " <th>Borough</th>\n",
  159. " <th>Neighbourhood</th>\n",
  160. " </tr>\n",
  161. " </thead>\n",
  162. " <tbody>\n",
  163. " <tr>\n",
  164. " <th>0</th>\n",
  165. " <td>M1B</td>\n",
  166. " <td>Scarborough</td>\n",
  167. " <td>Malvern,Rouge</td>\n",
  168. " </tr>\n",
  169. " <tr>\n",
  170. " <th>1</th>\n",
  171. " <td>M1C</td>\n",
  172. " <td>Scarborough</td>\n",
  173. " <td>Highland Creek,Port Union,Rouge Hill</td>\n",
  174. " </tr>\n",
  175. " <tr>\n",
  176. " <th>2</th>\n",
  177. " <td>M1E</td>\n",
  178. " <td>Scarborough</td>\n",
  179. " <td>Guildwood,Morningside,West Hill</td>\n",
  180. " </tr>\n",
  181. " <tr>\n",
  182. " <th>3</th>\n",
  183. " <td>M1G</td>\n",
  184. " <td>Scarborough</td>\n",
  185. " <td>Woburn</td>\n",
  186. " </tr>\n",
  187. " <tr>\n",
  188. " <th>4</th>\n",
  189. " <td>M1H</td>\n",
  190. " <td>Scarborough</td>\n",
  191. " <td>Cedarbrae</td>\n",
  192. " </tr>\n",
  193. " <tr>\n",
  194. " <th>5</th>\n",
  195. " <td>M1J</td>\n",
  196. " <td>Scarborough</td>\n",
  197. " <td>Scarborough Village</td>\n",
  198. " </tr>\n",
  199. " <tr>\n",
  200. " <th>6</th>\n",
  201. " <td>M1K</td>\n",
  202. " <td>Scarborough</td>\n",
  203. " <td>East Birchmount Park,Ionview,Kennedy Park</td>\n",
  204. " </tr>\n",
  205. " <tr>\n",
  206. " <th>7</th>\n",
  207. " <td>M1L</td>\n",
  208. " <td>Scarborough</td>\n",
  209. " <td>Clairlea,Golden Mile,Oakridge</td>\n",
  210. " </tr>\n",
  211. " <tr>\n",
  212. " <th>8</th>\n",
  213. " <td>M1M</td>\n",
  214. " <td>Scarborough</td>\n",
  215. " <td>Cliffcrest,Cliffside,Scarborough Village West</td>\n",
  216. " </tr>\n",
  217. " <tr>\n",
  218. " <th>9</th>\n",
  219. " <td>M1N</td>\n",
  220. " <td>Scarborough</td>\n",
  221. " <td>Birch Cliff,Cliffside West</td>\n",
  222. " </tr>\n",
  223. " <tr>\n",
  224. " <th>10</th>\n",
  225. " <td>M1P</td>\n",
  226. " <td>Scarborough</td>\n",
  227. " <td>Dorset Park,Scarborough Town Centre,Wexford He...</td>\n",
  228. " </tr>\n",
  229. " <tr>\n",
  230. " <th>11</th>\n",
  231. " <td>M1R</td>\n",
  232. " <td>Scarborough</td>\n",
  233. " <td>Maryvale,Wexford</td>\n",
  234. " </tr>\n",
  235. " <tr>\n",
  236. " <th>12</th>\n",
  237. " <td>M1S</td>\n",
  238. " <td>Scarborough</td>\n",
  239. " <td>Agincourt</td>\n",
  240. " </tr>\n",
  241. " <tr>\n",
  242. " <th>13</th>\n",
  243. " <td>M1T</td>\n",
  244. " <td>Scarborough</td>\n",
  245. " <td>Clarks Corners,Sullivan,Tam O'Shanter</td>\n",
  246. " </tr>\n",
  247. " <tr>\n",
  248. " <th>14</th>\n",
  249. " <td>M1V</td>\n",
  250. " <td>Scarborough</td>\n",
  251. " <td>Agincourt North,L'Amoreaux East,Milliken,Steel...</td>\n",
  252. " </tr>\n",
  253. " <tr>\n",
  254. " <th>15</th>\n",
  255. " <td>M1W</td>\n",
  256. " <td>Scarborough</td>\n",
  257. " <td>L'Amoreaux West</td>\n",
  258. " </tr>\n",
  259. " <tr>\n",
  260. " <th>16</th>\n",
  261. " <td>M1X</td>\n",
  262. " <td>Scarborough</td>\n",
  263. " <td>Upper Rouge</td>\n",
  264. " </tr>\n",
  265. " <tr>\n",
  266. " <th>17</th>\n",
  267. " <td>M2H</td>\n",
  268. " <td>North York</td>\n",
  269. " <td>Hillcrest Village</td>\n",
  270. " </tr>\n",
  271. " <tr>\n",
  272. " <th>18</th>\n",
  273. " <td>M2J</td>\n",
  274. " <td>North York</td>\n",
  275. " <td>Fairview,Henry Farm,Oriole</td>\n",
  276. " </tr>\n",
  277. " <tr>\n",
  278. " <th>19</th>\n",
  279. " <td>M2K</td>\n",
  280. " <td>North York</td>\n",
  281. " <td>Bayview Village</td>\n",
  282. " </tr>\n",
  283. " <tr>\n",
  284. " <th>20</th>\n",
  285. " <td>M2L</td>\n",
  286. " <td>North York</td>\n",
  287. " <td>Silver Hills,York Mills</td>\n",
  288. " </tr>\n",
  289. " <tr>\n",
  290. " <th>21</th>\n",
  291. " <td>M2M</td>\n",
  292. " <td>North York</td>\n",
  293. " <td>Newtonbrook,Willowdale</td>\n",
  294. " </tr>\n",
  295. " <tr>\n",
  296. " <th>22</th>\n",
  297. " <td>M2N</td>\n",
  298. " <td>North York</td>\n",
  299. " <td>Willowdale South</td>\n",
  300. " </tr>\n",
  301. " <tr>\n",
  302. " <th>23</th>\n",
  303. " <td>M2P</td>\n",
  304. " <td>North York</td>\n",
  305. " <td>York Mills West</td>\n",
  306. " </tr>\n",
  307. " <tr>\n",
  308. " <th>24</th>\n",
  309. " <td>M2R</td>\n",
  310. " <td>North York</td>\n",
  311. " <td>Willowdale West</td>\n",
  312. " </tr>\n",
  313. " <tr>\n",
  314. " <th>25</th>\n",
  315. " <td>M3A</td>\n",
  316. " <td>North York</td>\n",
  317. " <td>Parkwoods</td>\n",
  318. " </tr>\n",
  319. " <tr>\n",
  320. " <th>26</th>\n",
  321. " <td>M3B</td>\n",
  322. " <td>North York</td>\n",
  323. " <td>Don Mills North</td>\n",
  324. " </tr>\n",
  325. " <tr>\n",
  326. " <th>27</th>\n",
  327. " <td>M3C</td>\n",
  328. " <td>North York</td>\n",
  329. " <td>Don Mills South,Flemingdon Park</td>\n",
  330. " </tr>\n",
  331. " <tr>\n",
  332. " <th>28</th>\n",
  333. " <td>M3H</td>\n",
  334. " <td>North York</td>\n",
  335. " <td>Bathurst Manor,Downsview North,Wilson Heights</td>\n",
  336. " </tr>\n",
  337. " <tr>\n",
  338. " <th>29</th>\n",
  339. " <td>M3J</td>\n",
  340. " <td>North York</td>\n",
  341. " <td>Northwood Park,York University</td>\n",
  342. " </tr>\n",
  343. " <tr>\n",
  344. " <th>...</th>\n",
  345. " <td>...</td>\n",
  346. " <td>...</td>\n",
  347. " <td>...</td>\n",
  348. " </tr>\n",
  349. " <tr>\n",
  350. " <th>73</th>\n",
  351. " <td>M6C</td>\n",
  352. " <td>York</td>\n",
  353. " <td>Humewood-Cedarvale</td>\n",
  354. " </tr>\n",
  355. " <tr>\n",
  356. " <th>74</th>\n",
  357. " <td>M6E</td>\n",
  358. " <td>York</td>\n",
  359. " <td>Caledonia-Fairbanks</td>\n",
  360. " </tr>\n",
  361. " <tr>\n",
  362. " <th>75</th>\n",
  363. " <td>M6G</td>\n",
  364. " <td>Downtown Toronto</td>\n",
  365. " <td>Christie</td>\n",
  366. " </tr>\n",
  367. " <tr>\n",
  368. " <th>76</th>\n",
  369. " <td>M6H</td>\n",
  370. " <td>West Toronto</td>\n",
  371. " <td>Dovercourt Village,Dufferin</td>\n",
  372. " </tr>\n",
  373. " <tr>\n",
  374. " <th>77</th>\n",
  375. " <td>M6J</td>\n",
  376. " <td>West Toronto</td>\n",
  377. " <td>Little Portugal,Trinity</td>\n",
  378. " </tr>\n",
  379. " <tr>\n",
  380. " <th>78</th>\n",
  381. " <td>M6K</td>\n",
  382. " <td>West Toronto</td>\n",
  383. " <td>Brockton,Exhibition Place,Parkdale Village</td>\n",
  384. " </tr>\n",
  385. " <tr>\n",
  386. " <th>79</th>\n",
  387. " <td>M6L</td>\n",
  388. " <td>North York</td>\n",
  389. " <td>Downsview,North Park,Upwood Park</td>\n",
  390. " </tr>\n",
  391. " <tr>\n",
  392. " <th>80</th>\n",
  393. " <td>M6M</td>\n",
  394. " <td>York</td>\n",
  395. " <td>Del Ray,Keelesdale,Mount Dennis,Silverthorn</td>\n",
  396. " </tr>\n",
  397. " <tr>\n",
  398. " <th>81</th>\n",
  399. " <td>M6N</td>\n",
  400. " <td>York</td>\n",
  401. " <td>Runnymede,The Junction North</td>\n",
  402. " </tr>\n",
  403. " <tr>\n",
  404. " <th>82</th>\n",
  405. " <td>M6P</td>\n",
  406. " <td>West Toronto</td>\n",
  407. " <td>High Park,The Junction South</td>\n",
  408. " </tr>\n",
  409. " <tr>\n",
  410. " <th>83</th>\n",
  411. " <td>M6R</td>\n",
  412. " <td>West Toronto</td>\n",
  413. " <td>Parkdale,Roncesvalles</td>\n",
  414. " </tr>\n",
  415. " <tr>\n",
  416. " <th>84</th>\n",
  417. " <td>M6S</td>\n",
  418. " <td>West Toronto</td>\n",
  419. " <td>Runnymede,Swansea</td>\n",
  420. " </tr>\n",
  421. " <tr>\n",
  422. " <th>85</th>\n",
  423. " <td>M7A</td>\n",
  424. " <td>Queen's Park</td>\n",
  425. " <td>Queen's Park</td>\n",
  426. " </tr>\n",
  427. " <tr>\n",
  428. " <th>86</th>\n",
  429. " <td>M7R</td>\n",
  430. " <td>Mississauga</td>\n",
  431. " <td>Canada Post Gateway Processing Centre</td>\n",
  432. " </tr>\n",
  433. " <tr>\n",
  434. " <th>87</th>\n",
  435. " <td>M7Y</td>\n",
  436. " <td>East Toronto</td>\n",
  437. " <td>Business Reply Mail Processing Centre 969 Eastern</td>\n",
  438. " </tr>\n",
  439. " <tr>\n",
  440. " <th>88</th>\n",
  441. " <td>M8V</td>\n",
  442. " <td>Etobicoke</td>\n",
  443. " <td>Humber Bay Shores,Mimico South,New Toronto</td>\n",
  444. " </tr>\n",
  445. " <tr>\n",
  446. " <th>89</th>\n",
  447. " <td>M8W</td>\n",
  448. " <td>Etobicoke</td>\n",
  449. " <td>Alderwood,Long Branch</td>\n",
  450. " </tr>\n",
  451. " <tr>\n",
  452. " <th>90</th>\n",
  453. " <td>M8X</td>\n",
  454. " <td>Etobicoke</td>\n",
  455. " <td>Montgomery Road,Old Mill North,The Kingsway</td>\n",
  456. " </tr>\n",
  457. " <tr>\n",
  458. " <th>91</th>\n",
  459. " <td>M8Y</td>\n",
  460. " <td>Etobicoke</td>\n",
  461. " <td>Humber Bay,King's Mill Park,Kingsway Park Sout...</td>\n",
  462. " </tr>\n",
  463. " <tr>\n",
  464. " <th>92</th>\n",
  465. " <td>M8Z</td>\n",
  466. " <td>Etobicoke</td>\n",
  467. " <td>Kingsway Park South West,Mimico NW,Royal York ...</td>\n",
  468. " </tr>\n",
  469. " <tr>\n",
  470. " <th>93</th>\n",
  471. " <td>M9A</td>\n",
  472. " <td>Etobicoke</td>\n",
  473. " <td>Islington Avenue</td>\n",
  474. " </tr>\n",
  475. " <tr>\n",
  476. " <th>94</th>\n",
  477. " <td>M9B</td>\n",
  478. " <td>Etobicoke</td>\n",
  479. " <td>Cloverdale,Islington,Martin Grove,Princess Gar...</td>\n",
  480. " </tr>\n",
  481. " <tr>\n",
  482. " <th>95</th>\n",
  483. " <td>M9C</td>\n",
  484. " <td>Etobicoke</td>\n",
  485. " <td>Bloordale Gardens,Eringate,Markland Wood,Old B...</td>\n",
  486. " </tr>\n",
  487. " <tr>\n",
  488. " <th>96</th>\n",
  489. " <td>M9L</td>\n",
  490. " <td>North York</td>\n",
  491. " <td>Humber Summit</td>\n",
  492. " </tr>\n",
  493. " <tr>\n",
  494. " <th>97</th>\n",
  495. " <td>M9M</td>\n",
  496. " <td>North York</td>\n",
  497. " <td>Emery,Humberlea</td>\n",
  498. " </tr>\n",
  499. " <tr>\n",
  500. " <th>98</th>\n",
  501. " <td>M9N</td>\n",
  502. " <td>York</td>\n",
  503. " <td>Weston</td>\n",
  504. " </tr>\n",
  505. " <tr>\n",
  506. " <th>99</th>\n",
  507. " <td>M9P</td>\n",
  508. " <td>Etobicoke</td>\n",
  509. " <td>Westmount</td>\n",
  510. " </tr>\n",
  511. " <tr>\n",
  512. " <th>100</th>\n",
  513. " <td>M9R</td>\n",
  514. " <td>Etobicoke</td>\n",
  515. " <td>Kingsview Village,Martin Grove Gardens,Richvie...</td>\n",
  516. " </tr>\n",
  517. " <tr>\n",
  518. " <th>101</th>\n",
  519. " <td>M9V</td>\n",
  520. " <td>Etobicoke</td>\n",
  521. " <td>Albion Gardens,Beaumond Heights,Humbergate,Jam...</td>\n",
  522. " </tr>\n",
  523. " <tr>\n",
  524. " <th>102</th>\n",
  525. " <td>M9W</td>\n",
  526. " <td>Etobicoke</td>\n",
  527. " <td>Northwest</td>\n",
  528. " </tr>\n",
  529. " </tbody>\n",
  530. "</table>\n",
  531. "<p>103 rows × 3 columns</p>\n",
  532. "</div>"
  533. ],
  534. "text/plain": [
  535. " Postcode Borough \\\n",
  536. "0 M1B Scarborough \n",
  537. "1 M1C Scarborough \n",
  538. "2 M1E Scarborough \n",
  539. "3 M1G Scarborough \n",
  540. "4 M1H Scarborough \n",
  541. "5 M1J Scarborough \n",
  542. "6 M1K Scarborough \n",
  543. "7 M1L Scarborough \n",
  544. "8 M1M Scarborough \n",
  545. "9 M1N Scarborough \n",
  546. "10 M1P Scarborough \n",
  547. "11 M1R Scarborough \n",
  548. "12 M1S Scarborough \n",
  549. "13 M1T Scarborough \n",
  550. "14 M1V Scarborough \n",
  551. "15 M1W Scarborough \n",
  552. "16 M1X Scarborough \n",
  553. "17 M2H North York \n",
  554. "18 M2J North York \n",
  555. "19 M2K North York \n",
  556. "20 M2L North York \n",
  557. "21 M2M North York \n",
  558. "22 M2N North York \n",
  559. "23 M2P North York \n",
  560. "24 M2R North York \n",
  561. "25 M3A North York \n",
  562. "26 M3B North York \n",
  563. "27 M3C North York \n",
  564. "28 M3H North York \n",
  565. "29 M3J North York \n",
  566. ".. ... ... \n",
  567. "73 M6C York \n",
  568. "74 M6E York \n",
  569. "75 M6G Downtown Toronto \n",
  570. "76 M6H West Toronto \n",
  571. "77 M6J West Toronto \n",
  572. "78 M6K West Toronto \n",
  573. "79 M6L North York \n",
  574. "80 M6M York \n",
  575. "81 M6N York \n",
  576. "82 M6P West Toronto \n",
  577. "83 M6R West Toronto \n",
  578. "84 M6S West Toronto \n",
  579. "85 M7A Queen's Park \n",
  580. "86 M7R Mississauga \n",
  581. "87 M7Y East Toronto \n",
  582. "88 M8V Etobicoke \n",
  583. "89 M8W Etobicoke \n",
  584. "90 M8X Etobicoke \n",
  585. "91 M8Y Etobicoke \n",
  586. "92 M8Z Etobicoke \n",
  587. "93 M9A Etobicoke \n",
  588. "94 M9B Etobicoke \n",
  589. "95 M9C Etobicoke \n",
  590. "96 M9L North York \n",
  591. "97 M9M North York \n",
  592. "98 M9N York \n",
  593. "99 M9P Etobicoke \n",
  594. "100 M9R Etobicoke \n",
  595. "101 M9V Etobicoke \n",
  596. "102 M9W Etobicoke \n",
  597. "\n",
  598. " Neighbourhood \n",
  599. "0 Malvern,Rouge \n",
  600. "1 Highland Creek,Port Union,Rouge Hill \n",
  601. "2 Guildwood,Morningside,West Hill \n",
  602. "3 Woburn \n",
  603. "4 Cedarbrae \n",
  604. "5 Scarborough Village \n",
  605. "6 East Birchmount Park,Ionview,Kennedy Park \n",
  606. "7 Clairlea,Golden Mile,Oakridge \n",
  607. "8 Cliffcrest,Cliffside,Scarborough Village West \n",
  608. "9 Birch Cliff,Cliffside West \n",
  609. "10 Dorset Park,Scarborough Town Centre,Wexford He... \n",
  610. "11 Maryvale,Wexford \n",
  611. "12 Agincourt \n",
  612. "13 Clarks Corners,Sullivan,Tam O'Shanter \n",
  613. "14 Agincourt North,L'Amoreaux East,Milliken,Steel... \n",
  614. "15 L'Amoreaux West \n",
  615. "16 Upper Rouge \n",
  616. "17 Hillcrest Village \n",
  617. "18 Fairview,Henry Farm,Oriole \n",
  618. "19 Bayview Village \n",
  619. "20 Silver Hills,York Mills \n",
  620. "21 Newtonbrook,Willowdale \n",
  621. "22 Willowdale South \n",
  622. "23 York Mills West \n",
  623. "24 Willowdale West \n",
  624. "25 Parkwoods \n",
  625. "26 Don Mills North \n",
  626. "27 Don Mills South,Flemingdon Park \n",
  627. "28 Bathurst Manor,Downsview North,Wilson Heights \n",
  628. "29 Northwood Park,York University \n",
  629. ".. ... \n",
  630. "73 Humewood-Cedarvale \n",
  631. "74 Caledonia-Fairbanks \n",
  632. "75 Christie \n",
  633. "76 Dovercourt Village,Dufferin \n",
  634. "77 Little Portugal,Trinity \n",
  635. "78 Brockton,Exhibition Place,Parkdale Village \n",
  636. "79 Downsview,North Park,Upwood Park \n",
  637. "80 Del Ray,Keelesdale,Mount Dennis,Silverthorn \n",
  638. "81 Runnymede,The Junction North \n",
  639. "82 High Park,The Junction South \n",
  640. "83 Parkdale,Roncesvalles \n",
  641. "84 Runnymede,Swansea \n",
  642. "85 Queen's Park \n",
  643. "86 Canada Post Gateway Processing Centre \n",
  644. "87 Business Reply Mail Processing Centre 969 Eastern \n",
  645. "88 Humber Bay Shores,Mimico South,New Toronto \n",
  646. "89 Alderwood,Long Branch \n",
  647. "90 Montgomery Road,Old Mill North,The Kingsway \n",
  648. "91 Humber Bay,King's Mill Park,Kingsway Park Sout... \n",
  649. "92 Kingsway Park South West,Mimico NW,Royal York ... \n",
  650. "93 Islington Avenue \n",
  651. "94 Cloverdale,Islington,Martin Grove,Princess Gar... \n",
  652. "95 Bloordale Gardens,Eringate,Markland Wood,Old B... \n",
  653. "96 Humber Summit \n",
  654. "97 Emery,Humberlea \n",
  655. "98 Weston \n",
  656. "99 Westmount \n",
  657. "100 Kingsview Village,Martin Grove Gardens,Richvie... \n",
  658. "101 Albion Gardens,Beaumond Heights,Humbergate,Jam... \n",
  659. "102 Northwest \n",
  660. "\n",
  661. "[103 rows x 3 columns]"
  662. ]
  663. },
  664. "execution_count": 407,
  665. "metadata": {},
  666. "output_type": "execute_result"
  667. }
  668. ],
  669. "source": [
  670. "# locate the table and its content in the file\n",
  671. "table=soup.table\n",
  672. "table_rows=table.find_all('tr')\n",
  673. "\n",
  674. "# locating the headers\n",
  675. "column=[]\n",
  676. "th=table.find_all('th')\n",
  677. "for th2 in th:\n",
  678. " # assigning headers to a list\n",
  679. " column=column+[th2.text]\n",
  680. " \n",
  681. "# locating row's data\n",
  682. "rows=[] \n",
  683. "for tr in table_rows:\n",
  684. " td=tr.find_all('td')\n",
  685. " # assigning row's data to a list\n",
  686. " rows=rows+[i.text for i in td]\n",
  687. " \n",
  688. "#separating the list into 3 lists, one for each column\n",
  689. "head1=[]\n",
  690. "head2=[]\n",
  691. "head3=[]\n",
  692. "\n",
  693. "l=len(rows)/3\n",
  694. "l=int(l)\n",
  695. "\n",
  696. "for r in range(1,l+1):\n",
  697. " head1=head1+[rows[r*3-3]]\n",
  698. " head2=head2+[rows[r*3+1-3]]\n",
  699. " head3=head3+[rows[r*3+2-3]]\n",
  700. " \n",
  701. "#remove \\n from the data\n",
  702. "for i in range(0,l):\n",
  703. " head3[i]=head3[i].replace('\\n','')\n",
  704. " \n",
  705. "for i in range(0,3):\n",
  706. " column[i]=column[i].replace('\\n','') \n",
  707. "\n",
  708. "#create dictionary with the columns and rows\n",
  709. "dict={}\n",
  710. "dict={column[0]:head1,column[1]:head2,column[2]:head3}\n",
  711. " \n",
  712. "#create data frame\n",
  713. "df=pd.DataFrame(dict)\n",
  714. "\n",
  715. "#remove rows where the 'Borough' cell is 'Not assigned'\n",
  716. "df=df[df.Borough != 'Not assigned']\n",
  717. " \n",
  718. "#if a 'Neighbourhood' cell is 'Not assigned' -> copy 'Borough' cell\n",
  719. "l=len(df)\n",
  720. "\n",
  721. "for i in range (0,l):\n",
  722. " if (df.iloc[i,2]=='Not assigned'):\n",
  723. " df.iloc[i,2]=df.iloc[i,1]\n",
  724. " \n",
  725. "#group by 'Postcode'\n",
  726. "df2=df.groupby('Postcode').agg(','.join)\n",
  727. "\n",
  728. "#add index column\n",
  729. "df2.reset_index(inplace=True)\n",
  730. "\n",
  731. "#create new data frame with the 'Borough' column separated by ','\n",
  732. "df3 = df2[\"Borough\"].str.split(\",\", n = 1, expand = True) \n",
  733. "\n",
  734. "#appending new 'Borough' column to the old data frame\n",
  735. "df2['Borough']=df3[0]\n",
  736. "df2\n",
  737. "\n",
  738. "\n"
  739. ]
  740. },
  741. {
  742. "cell_type": "code",
  743. "execution_count": 401,
  744. "metadata": {},
  745. "outputs": [
  746. {
  747. "data": {
  748. "text/plain": [
  749. "(103, 3)"
  750. ]
  751. },
  752. "execution_count": 401,
  753. "metadata": {},
  754. "output_type": "execute_result"
  755. }
  756. ],
  757. "source": [
  758. "df2.shape"
  759. ]
  760. }
  761. ],
  762. "metadata": {
  763. "kernelspec": {
  764. "display_name": "Python 3",
  765. "language": "python",
  766. "name": "python3"
  767. },
  768. "language_info": {
  769. "codemirror_mode": {
  770. "name": "ipython",
  771. "version": 3
  772. },
  773. "file_extension": ".py",
  774. "mimetype": "text/x-python",
  775. "name": "python",
  776. "nbconvert_exporter": "python",
  777. "pygments_lexer": "ipython3",
  778. "version": "3.6.8"
  779. }
  780. },
  781. "nbformat": 4,
  782. "nbformat_minor": 2
  783. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement