Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Ignore useless warnings (see SciPy issue #5998)\n",
- "import warnings\n",
- "warnings.simplefilter(action='ignore')\n",
- "\n",
- "# Data Manipulation and Visualisation\n",
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "\n",
- "# Notebook Mod\n",
- "from IPython.core.display import display, HTML\n",
- "%matplotlib inline\n",
- "%config InlineBackend.figure_format = 'retina'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>State</th>\n",
- " <th>Education District</th>\n",
- " <th>School Level</th>\n",
- " <th>Type of School</th>\n",
- " <th>School Code</th>\n",
- " <th>School Name</th>\n",
- " <th>Correspondence Address</th>\n",
- " <th>Poscode</th>\n",
- " <th>City</th>\n",
- " <th>Urban/Rural</th>\n",
- " <th>Gov/Gov Aided</th>\n",
- " <th>Enrollment</th>\n",
- " <th>N. of Teachers</th>\n",
- " <th>PreSchool</th>\n",
- " <th>Integration</th>\n",
- " <th>X Cood</th>\n",
- " <th>Y Cood</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>BIL</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>PERAK</td>\n",
- " <td>PPD BATANG PADANG</td>\n",
- " <td>Rendah</td>\n",
- " <td>SK</td>\n",
- " <td>ABA0001</td>\n",
- " <td>SEKOLAH KEBANGSAAN TOH TANDEWA SAKTI</td>\n",
- " <td>JALAN KELAB</td>\n",
- " <td>35000</td>\n",
- " <td>TAPAH</td>\n",
- " <td>Bandar</td>\n",
- " <td>KERAJAAN</td>\n",
- " <td>447.0</td>\n",
- " <td>41.0</td>\n",
- " <td>ADA</td>\n",
- " <td>ADA</td>\n",
- " <td>101.255932</td>\n",
- " <td>4.196633</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>PERAK</td>\n",
- " <td>PPD BATANG PADANG</td>\n",
- " <td>Rendah</td>\n",
- " <td>SK</td>\n",
- " <td>ABA0002</td>\n",
- " <td>SEKOLAH KEBANGSAAN PENDITA ZA'BA</td>\n",
- " <td>JALAN TAPAH ROAD</td>\n",
- " <td>35400</td>\n",
- " <td>TAPAH ROAD</td>\n",
- " <td>Bandar</td>\n",
- " <td>KERAJAAN</td>\n",
- " <td>389.0</td>\n",
- " <td>33.0</td>\n",
- " <td>ADA</td>\n",
- " <td>TIADA</td>\n",
- " <td>101.200617</td>\n",
- " <td>4.178276</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>PERAK</td>\n",
- " <td>PPD BATANG PADANG</td>\n",
- " <td>Rendah</td>\n",
- " <td>SK</td>\n",
- " <td>ABA0003</td>\n",
- " <td>SEKOLAH KEBANGSAAN BANIR</td>\n",
- " <td>BANIR</td>\n",
- " <td>35400</td>\n",
- " <td>TAPAH ROAD</td>\n",
- " <td>Bandar</td>\n",
- " <td>KERAJAAN</td>\n",
- " <td>43.0</td>\n",
- " <td>12.0</td>\n",
- " <td>ADA</td>\n",
- " <td>TIADA</td>\n",
- " <td>101.181893</td>\n",
- " <td>4.204361</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>PERAK</td>\n",
- " <td>PPD BATANG PADANG</td>\n",
- " <td>Rendah</td>\n",
- " <td>SK</td>\n",
- " <td>ABA0004</td>\n",
- " <td>SEKOLAH KEBANGSAAN TEMOH</td>\n",
- " <td>KAMPUNG TEMOH STESEN</td>\n",
- " <td>35350</td>\n",
- " <td>TEMOH</td>\n",
- " <td>Bandar</td>\n",
- " <td>KERAJAAN</td>\n",
- " <td>111.0</td>\n",
- " <td>18.0</td>\n",
- " <td>ADA</td>\n",
- " <td>TIADA</td>\n",
- " <td>101.172819</td>\n",
- " <td>4.235841</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>PERAK</td>\n",
- " <td>PPD BATANG PADANG</td>\n",
- " <td>Rendah</td>\n",
- " <td>SK</td>\n",
- " <td>ABA0005</td>\n",
- " <td>SEKOLAH KEBANGSAAN CHENDERIANG</td>\n",
- " <td>JALAN CHENDERIANG</td>\n",
- " <td>35300</td>\n",
- " <td>CHENDERIANG</td>\n",
- " <td>Luar Bandar</td>\n",
- " <td>KERAJAAN</td>\n",
- " <td>179.0</td>\n",
- " <td>19.0</td>\n",
- " <td>ADA</td>\n",
- " <td>TIADA</td>\n",
- " <td>101.240556</td>\n",
- " <td>4.267980</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " State Education District School Level Type of School School Code \\\n",
- "BIL \n",
- "1 PERAK PPD BATANG PADANG Rendah SK ABA0001 \n",
- "2 PERAK PPD BATANG PADANG Rendah SK ABA0002 \n",
- "3 PERAK PPD BATANG PADANG Rendah SK ABA0003 \n",
- "4 PERAK PPD BATANG PADANG Rendah SK ABA0004 \n",
- "5 PERAK PPD BATANG PADANG Rendah SK ABA0005 \n",
- "\n",
- " School Name Correspondence Address Poscode \\\n",
- "BIL \n",
- "1 SEKOLAH KEBANGSAAN TOH TANDEWA SAKTI JALAN KELAB 35000 \n",
- "2 SEKOLAH KEBANGSAAN PENDITA ZA'BA JALAN TAPAH ROAD 35400 \n",
- "3 SEKOLAH KEBANGSAAN BANIR BANIR 35400 \n",
- "4 SEKOLAH KEBANGSAAN TEMOH KAMPUNG TEMOH STESEN 35350 \n",
- "5 SEKOLAH KEBANGSAAN CHENDERIANG JALAN CHENDERIANG 35300 \n",
- "\n",
- " City Urban/Rural Gov/Gov Aided Enrollment N. of Teachers \\\n",
- "BIL \n",
- "1 TAPAH Bandar KERAJAAN 447.0 41.0 \n",
- "2 TAPAH ROAD Bandar KERAJAAN 389.0 33.0 \n",
- "3 TAPAH ROAD Bandar KERAJAAN 43.0 12.0 \n",
- "4 TEMOH Bandar KERAJAAN 111.0 18.0 \n",
- "5 CHENDERIANG Luar Bandar KERAJAAN 179.0 19.0 \n",
- "\n",
- " PreSchool Integration X Cood Y Cood \n",
- "BIL \n",
- "1 ADA ADA 101.255932 4.196633 \n",
- "2 ADA TIADA 101.200617 4.178276 \n",
- "3 ADA TIADA 101.181893 4.204361 \n",
- "4 ADA TIADA 101.172819 4.235841 \n",
- "5 ADA TIADA 101.240556 4.267980 "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "school_df=pd.read_excel('../Data_Project/Msia_Schools2019.xlsx',index_col=0,header=1)\n",
- "# Remove telephone and email columns\n",
- "school_df.drop(['NOTELEFON','NOFAX','EMAIL'],axis=1,inplace=True)\n",
- "# Translate the column name to English\n",
- "school_df.columns=['State','Education District','School Level','Type of School','School Code','School Name','Correspondence Address',\n",
- " 'Poscode','City','Urban/Rural','Gov/Gov Aided','Enrollment','N. of Teachers','PreSchool','Integration','X Cood','Y Cood']\n",
- "school_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "State 16\n",
- "Education District 144\n",
- "School Level 2\n",
- "Type of School 18\n",
- "School Code 10211\n",
- "School Name 9600\n",
- "Correspondence Address 7293\n",
- "Poscode 1098\n",
- "City 603\n",
- "Urban/Rural 2\n",
- "Gov/Gov Aided 2\n",
- "PreSchool 2\n",
- "Integration 2\n",
- "dtype: int64"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "school_df.select_dtypes('object').apply(pd.Series.nunique)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "State 0\n",
- "Education District 0\n",
- "School Level 0\n",
- "Type of School 0\n",
- "School Code 0\n",
- "School Name 0\n",
- "Correspondence Address 19\n",
- "Poscode 2\n",
- "City 2\n",
- "Urban/Rural 0\n",
- "Gov/Gov Aided 0\n",
- "Enrollment 7\n",
- "N. of Teachers 7\n",
- "PreSchool 0\n",
- "Integration 0\n",
- "X Cood 5\n",
- "Y Cood 5\n",
- "dtype: int64"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Number of missing values in each features/columns\n",
- "school_df.isnull().sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "cols=list(school_df.select_dtypes('object').columns)\n",
- "\n",
- "# Some quick formating \n",
- "for col in cols:\n",
- " if (col=='School Code') | (col=='Type of School'):\n",
- " pass\n",
- " else:\n",
- " school_df[col]=school_df[col].str.title()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Derive new feature i.e. student to teacher ratio\n",
- "school_df[\"Student:Teacher Ratio\"]=round(school_df['Enrollment']/school_df['N. of Teachers'],3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "State Wilayah Persekutuan Putrajaya\n",
- "Education District Ppw Sentul\n",
- "School Level Rendah\n",
- "Type of School SUKAN\n",
- "School Code YRA5101\n",
- "School Name Victoria Institution\n",
- "Urban/Rural Luar Bandar\n",
- "Gov/Gov Aided Kerajaan\n",
- "Enrollment 3858\n",
- "N. of Teachers 261\n",
- "PreSchool Tiada\n",
- "Integration Tiada\n",
- "X Cood 119.256\n",
- "Y Cood 7.3591\n",
- "Student:Teacher Ratio 27.34\n",
- "dtype: object"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "school_df.max()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Export the data\n",
- "school_df.to_csv(\"../Data_Project/Dataset/Msia_Schools2019_Clean.xlsx\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# One-Hot Encoding/Label Encoding Categorical Features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "#pd.get_dummies(school_df,columns=['PreSchool','Integration'])"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement