Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.76 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "# Ignore useless warnings (see SciPy issue #5998)\n",
  10. "import warnings\n",
  11. "warnings.simplefilter(action='ignore')\n",
  12. "\n",
  13. "# Data Manipulation and Visualisation\n",
  14. "import pandas as pd\n",
  15. "import matplotlib.pyplot as plt\n",
  16. "import seaborn as sns\n",
  17. "\n",
  18. "# Notebook Mod\n",
  19. "from IPython.core.display import display, HTML\n",
  20. "%matplotlib inline\n",
  21. "%config InlineBackend.figure_format = 'retina'"
  22. ]
  23. },
  24. {
  25. "cell_type": "code",
  26. "execution_count": 2,
  27. "metadata": {},
  28. "outputs": [
  29. {
  30. "data": {
  31. "text/html": [
  32. "<div>\n",
  33. "<style scoped>\n",
  34. " .dataframe tbody tr th:only-of-type {\n",
  35. " vertical-align: middle;\n",
  36. " }\n",
  37. "\n",
  38. " .dataframe tbody tr th {\n",
  39. " vertical-align: top;\n",
  40. " }\n",
  41. "\n",
  42. " .dataframe thead th {\n",
  43. " text-align: right;\n",
  44. " }\n",
  45. "</style>\n",
  46. "<table border=\"1\" class=\"dataframe\">\n",
  47. " <thead>\n",
  48. " <tr style=\"text-align: right;\">\n",
  49. " <th></th>\n",
  50. " <th>State</th>\n",
  51. " <th>Education District</th>\n",
  52. " <th>School Level</th>\n",
  53. " <th>Type of School</th>\n",
  54. " <th>School Code</th>\n",
  55. " <th>School Name</th>\n",
  56. " <th>Correspondence Address</th>\n",
  57. " <th>Poscode</th>\n",
  58. " <th>City</th>\n",
  59. " <th>Urban/Rural</th>\n",
  60. " <th>Gov/Gov Aided</th>\n",
  61. " <th>Enrollment</th>\n",
  62. " <th>N. of Teachers</th>\n",
  63. " <th>PreSchool</th>\n",
  64. " <th>Integration</th>\n",
  65. " <th>X Cood</th>\n",
  66. " <th>Y Cood</th>\n",
  67. " </tr>\n",
  68. " <tr>\n",
  69. " <th>BIL</th>\n",
  70. " <th></th>\n",
  71. " <th></th>\n",
  72. " <th></th>\n",
  73. " <th></th>\n",
  74. " <th></th>\n",
  75. " <th></th>\n",
  76. " <th></th>\n",
  77. " <th></th>\n",
  78. " <th></th>\n",
  79. " <th></th>\n",
  80. " <th></th>\n",
  81. " <th></th>\n",
  82. " <th></th>\n",
  83. " <th></th>\n",
  84. " <th></th>\n",
  85. " <th></th>\n",
  86. " <th></th>\n",
  87. " </tr>\n",
  88. " </thead>\n",
  89. " <tbody>\n",
  90. " <tr>\n",
  91. " <th>1</th>\n",
  92. " <td>PERAK</td>\n",
  93. " <td>PPD BATANG PADANG</td>\n",
  94. " <td>Rendah</td>\n",
  95. " <td>SK</td>\n",
  96. " <td>ABA0001</td>\n",
  97. " <td>SEKOLAH KEBANGSAAN TOH TANDEWA SAKTI</td>\n",
  98. " <td>JALAN KELAB</td>\n",
  99. " <td>35000</td>\n",
  100. " <td>TAPAH</td>\n",
  101. " <td>Bandar</td>\n",
  102. " <td>KERAJAAN</td>\n",
  103. " <td>447.0</td>\n",
  104. " <td>41.0</td>\n",
  105. " <td>ADA</td>\n",
  106. " <td>ADA</td>\n",
  107. " <td>101.255932</td>\n",
  108. " <td>4.196633</td>\n",
  109. " </tr>\n",
  110. " <tr>\n",
  111. " <th>2</th>\n",
  112. " <td>PERAK</td>\n",
  113. " <td>PPD BATANG PADANG</td>\n",
  114. " <td>Rendah</td>\n",
  115. " <td>SK</td>\n",
  116. " <td>ABA0002</td>\n",
  117. " <td>SEKOLAH KEBANGSAAN PENDITA ZA'BA</td>\n",
  118. " <td>JALAN TAPAH ROAD</td>\n",
  119. " <td>35400</td>\n",
  120. " <td>TAPAH ROAD</td>\n",
  121. " <td>Bandar</td>\n",
  122. " <td>KERAJAAN</td>\n",
  123. " <td>389.0</td>\n",
  124. " <td>33.0</td>\n",
  125. " <td>ADA</td>\n",
  126. " <td>TIADA</td>\n",
  127. " <td>101.200617</td>\n",
  128. " <td>4.178276</td>\n",
  129. " </tr>\n",
  130. " <tr>\n",
  131. " <th>3</th>\n",
  132. " <td>PERAK</td>\n",
  133. " <td>PPD BATANG PADANG</td>\n",
  134. " <td>Rendah</td>\n",
  135. " <td>SK</td>\n",
  136. " <td>ABA0003</td>\n",
  137. " <td>SEKOLAH KEBANGSAAN BANIR</td>\n",
  138. " <td>BANIR</td>\n",
  139. " <td>35400</td>\n",
  140. " <td>TAPAH ROAD</td>\n",
  141. " <td>Bandar</td>\n",
  142. " <td>KERAJAAN</td>\n",
  143. " <td>43.0</td>\n",
  144. " <td>12.0</td>\n",
  145. " <td>ADA</td>\n",
  146. " <td>TIADA</td>\n",
  147. " <td>101.181893</td>\n",
  148. " <td>4.204361</td>\n",
  149. " </tr>\n",
  150. " <tr>\n",
  151. " <th>4</th>\n",
  152. " <td>PERAK</td>\n",
  153. " <td>PPD BATANG PADANG</td>\n",
  154. " <td>Rendah</td>\n",
  155. " <td>SK</td>\n",
  156. " <td>ABA0004</td>\n",
  157. " <td>SEKOLAH KEBANGSAAN TEMOH</td>\n",
  158. " <td>KAMPUNG TEMOH STESEN</td>\n",
  159. " <td>35350</td>\n",
  160. " <td>TEMOH</td>\n",
  161. " <td>Bandar</td>\n",
  162. " <td>KERAJAAN</td>\n",
  163. " <td>111.0</td>\n",
  164. " <td>18.0</td>\n",
  165. " <td>ADA</td>\n",
  166. " <td>TIADA</td>\n",
  167. " <td>101.172819</td>\n",
  168. " <td>4.235841</td>\n",
  169. " </tr>\n",
  170. " <tr>\n",
  171. " <th>5</th>\n",
  172. " <td>PERAK</td>\n",
  173. " <td>PPD BATANG PADANG</td>\n",
  174. " <td>Rendah</td>\n",
  175. " <td>SK</td>\n",
  176. " <td>ABA0005</td>\n",
  177. " <td>SEKOLAH KEBANGSAAN CHENDERIANG</td>\n",
  178. " <td>JALAN CHENDERIANG</td>\n",
  179. " <td>35300</td>\n",
  180. " <td>CHENDERIANG</td>\n",
  181. " <td>Luar Bandar</td>\n",
  182. " <td>KERAJAAN</td>\n",
  183. " <td>179.0</td>\n",
  184. " <td>19.0</td>\n",
  185. " <td>ADA</td>\n",
  186. " <td>TIADA</td>\n",
  187. " <td>101.240556</td>\n",
  188. " <td>4.267980</td>\n",
  189. " </tr>\n",
  190. " </tbody>\n",
  191. "</table>\n",
  192. "</div>"
  193. ],
  194. "text/plain": [
  195. " State Education District School Level Type of School School Code \\\n",
  196. "BIL \n",
  197. "1 PERAK PPD BATANG PADANG Rendah SK ABA0001 \n",
  198. "2 PERAK PPD BATANG PADANG Rendah SK ABA0002 \n",
  199. "3 PERAK PPD BATANG PADANG Rendah SK ABA0003 \n",
  200. "4 PERAK PPD BATANG PADANG Rendah SK ABA0004 \n",
  201. "5 PERAK PPD BATANG PADANG Rendah SK ABA0005 \n",
  202. "\n",
  203. " School Name Correspondence Address Poscode \\\n",
  204. "BIL \n",
  205. "1 SEKOLAH KEBANGSAAN TOH TANDEWA SAKTI JALAN KELAB 35000 \n",
  206. "2 SEKOLAH KEBANGSAAN PENDITA ZA'BA JALAN TAPAH ROAD 35400 \n",
  207. "3 SEKOLAH KEBANGSAAN BANIR BANIR 35400 \n",
  208. "4 SEKOLAH KEBANGSAAN TEMOH KAMPUNG TEMOH STESEN 35350 \n",
  209. "5 SEKOLAH KEBANGSAAN CHENDERIANG JALAN CHENDERIANG 35300 \n",
  210. "\n",
  211. " City Urban/Rural Gov/Gov Aided Enrollment N. of Teachers \\\n",
  212. "BIL \n",
  213. "1 TAPAH Bandar KERAJAAN 447.0 41.0 \n",
  214. "2 TAPAH ROAD Bandar KERAJAAN 389.0 33.0 \n",
  215. "3 TAPAH ROAD Bandar KERAJAAN 43.0 12.0 \n",
  216. "4 TEMOH Bandar KERAJAAN 111.0 18.0 \n",
  217. "5 CHENDERIANG Luar Bandar KERAJAAN 179.0 19.0 \n",
  218. "\n",
  219. " PreSchool Integration X Cood Y Cood \n",
  220. "BIL \n",
  221. "1 ADA ADA 101.255932 4.196633 \n",
  222. "2 ADA TIADA 101.200617 4.178276 \n",
  223. "3 ADA TIADA 101.181893 4.204361 \n",
  224. "4 ADA TIADA 101.172819 4.235841 \n",
  225. "5 ADA TIADA 101.240556 4.267980 "
  226. ]
  227. },
  228. "execution_count": 2,
  229. "metadata": {},
  230. "output_type": "execute_result"
  231. }
  232. ],
  233. "source": [
  234. "school_df=pd.read_excel('../Data_Project/Msia_Schools2019.xlsx',index_col=0,header=1)\n",
  235. "# Remove telephone and email columns\n",
  236. "school_df.drop(['NOTELEFON','NOFAX','EMAIL'],axis=1,inplace=True)\n",
  237. "# Translate the column name to English\n",
  238. "school_df.columns=['State','Education District','School Level','Type of School','School Code','School Name','Correspondence Address',\n",
  239. " 'Poscode','City','Urban/Rural','Gov/Gov Aided','Enrollment','N. of Teachers','PreSchool','Integration','X Cood','Y Cood']\n",
  240. "school_df.head()"
  241. ]
  242. },
  243. {
  244. "cell_type": "code",
  245. "execution_count": 3,
  246. "metadata": {},
  247. "outputs": [
  248. {
  249. "data": {
  250. "text/plain": [
  251. "State 16\n",
  252. "Education District 144\n",
  253. "School Level 2\n",
  254. "Type of School 18\n",
  255. "School Code 10211\n",
  256. "School Name 9600\n",
  257. "Correspondence Address 7293\n",
  258. "Poscode 1098\n",
  259. "City 603\n",
  260. "Urban/Rural 2\n",
  261. "Gov/Gov Aided 2\n",
  262. "PreSchool 2\n",
  263. "Integration 2\n",
  264. "dtype: int64"
  265. ]
  266. },
  267. "execution_count": 3,
  268. "metadata": {},
  269. "output_type": "execute_result"
  270. }
  271. ],
  272. "source": [
  273. "school_df.select_dtypes('object').apply(pd.Series.nunique)"
  274. ]
  275. },
  276. {
  277. "cell_type": "code",
  278. "execution_count": 4,
  279. "metadata": {},
  280. "outputs": [
  281. {
  282. "data": {
  283. "text/plain": [
  284. "State 0\n",
  285. "Education District 0\n",
  286. "School Level 0\n",
  287. "Type of School 0\n",
  288. "School Code 0\n",
  289. "School Name 0\n",
  290. "Correspondence Address 19\n",
  291. "Poscode 2\n",
  292. "City 2\n",
  293. "Urban/Rural 0\n",
  294. "Gov/Gov Aided 0\n",
  295. "Enrollment 7\n",
  296. "N. of Teachers 7\n",
  297. "PreSchool 0\n",
  298. "Integration 0\n",
  299. "X Cood 5\n",
  300. "Y Cood 5\n",
  301. "dtype: int64"
  302. ]
  303. },
  304. "execution_count": 4,
  305. "metadata": {},
  306. "output_type": "execute_result"
  307. }
  308. ],
  309. "source": [
  310. "# Number of missing values in each features/columns\n",
  311. "school_df.isnull().sum()"
  312. ]
  313. },
  314. {
  315. "cell_type": "code",
  316. "execution_count": 5,
  317. "metadata": {},
  318. "outputs": [],
  319. "source": [
  320. "cols=list(school_df.select_dtypes('object').columns)\n",
  321. "\n",
  322. "# Some quick formating \n",
  323. "for col in cols:\n",
  324. " if (col=='School Code') | (col=='Type of School'):\n",
  325. " pass\n",
  326. " else:\n",
  327. " school_df[col]=school_df[col].str.title()"
  328. ]
  329. },
  330. {
  331. "cell_type": "code",
  332. "execution_count": 6,
  333. "metadata": {},
  334. "outputs": [],
  335. "source": [
  336. "# Derive new feature i.e. student to teacher ratio\n",
  337. "school_df[\"Student:Teacher Ratio\"]=round(school_df['Enrollment']/school_df['N. of Teachers'],3)"
  338. ]
  339. },
  340. {
  341. "cell_type": "code",
  342. "execution_count": 10,
  343. "metadata": {},
  344. "outputs": [
  345. {
  346. "data": {
  347. "text/plain": [
  348. "State Wilayah Persekutuan Putrajaya\n",
  349. "Education District Ppw Sentul\n",
  350. "School Level Rendah\n",
  351. "Type of School SUKAN\n",
  352. "School Code YRA5101\n",
  353. "School Name Victoria Institution\n",
  354. "Urban/Rural Luar Bandar\n",
  355. "Gov/Gov Aided Kerajaan\n",
  356. "Enrollment 3858\n",
  357. "N. of Teachers 261\n",
  358. "PreSchool Tiada\n",
  359. "Integration Tiada\n",
  360. "X Cood 119.256\n",
  361. "Y Cood 7.3591\n",
  362. "Student:Teacher Ratio 27.34\n",
  363. "dtype: object"
  364. ]
  365. },
  366. "execution_count": 10,
  367. "metadata": {},
  368. "output_type": "execute_result"
  369. }
  370. ],
  371. "source": [
  372. "school_df.max()"
  373. ]
  374. },
  375. {
  376. "cell_type": "code",
  377. "execution_count": 8,
  378. "metadata": {},
  379. "outputs": [],
  380. "source": [
  381. "# Export the data\n",
  382. "school_df.to_csv(\"../Data_Project/Dataset/Msia_Schools2019_Clean.xlsx\")"
  383. ]
  384. },
  385. {
  386. "cell_type": "markdown",
  387. "metadata": {},
  388. "source": [
  389. "# One-Hot Encoding/Label Encoding Categorical Features"
  390. ]
  391. },
  392. {
  393. "cell_type": "code",
  394. "execution_count": 9,
  395. "metadata": {},
  396. "outputs": [],
  397. "source": [
  398. "#pd.get_dummies(school_df,columns=['PreSchool','Integration'])"
  399. ]
  400. }
  401. ],
  402. "metadata": {
  403. "kernelspec": {
  404. "display_name": "Python 3",
  405. "language": "python",
  406. "name": "python3"
  407. },
  408. "language_info": {
  409. "codemirror_mode": {
  410. "name": "ipython",
  411. "version": 3
  412. },
  413. "file_extension": ".py",
  414. "mimetype": "text/x-python",
  415. "name": "python",
  416. "nbconvert_exporter": "python",
  417. "pygments_lexer": "ipython3",
  418. "version": "3.7.3"
  419. }
  420. },
  421. "nbformat": 4,
  422. "nbformat_minor": 4
  423. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement