Untitled

  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n\n"
    }
   ],
   "source": [
    "from sklearn.datasets import load_boston\n",
    "boston_market_data = load_boston()\n",
    "print(boston_market_data['DESCR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "boston_market_train_data, boston_market_test_data, \\\n",
    "boston_market_train_target, boston_market_test_target = \\\n",
    "train_test_split(boston_market_data['data'],boston_market_data['target'], test_size=0.1, random_state=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Training dataset:\nboston_market_train_data: (455, 13)\nboston_market_train_target: (455,)\n"
    }
   ],
   "source": [
    "print(\"Training dataset:\")\n",
    "print(\"boston_market_train_data:\", boston_market_train_data.shape)\n",
    "print(\"boston_market_train_target:\", boston_market_train_target.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Training dataset:\nboston_market_test_data: (51, 13)\nboston_market_test_target: (51,)\n"
    }
   ],
   "source": [
    "print(\"Training dataset:\")\n",
    "print(\"boston_market_test_data:\", boston_market_test_data.shape)\n",
    "print(\"boston_market_test_target:\", boston_market_test_target.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "linear_regression = LinearRegression()\n",
    "linear_regression.fit(boston_market_train_data, boston_market_train_target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Mean squared error of a learned model: 27.96\n"
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "print(\"Mean squared error of a learned model: %.2f\" % \n",
    "      mean_squared_error(boston_market_test_target, linear_regression.predict(boston_market_test_data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Variance score: 0.72\n"
    }
   ],
   "source": [
    "from sklearn.metrics import r2_score\n",
    "print('Variance score: %.2f' % r2_score(boston_market_test_target, linear_regression.predict(boston_market_test_data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "[ 0.60217169  0.60398145  0.35873597 -1.10867706]\n"
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "scores = cross_val_score(LinearRegression(), boston_market_data['data'], boston_market_data['target'], cv=4)\n",
    "print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "Model predicted for data under index 5 value [16.73451257]\nReal value for data under index 5 is 14.3\n"
    }
   ],
   "source": [
    "id = 5\n",
    "linear_regression_prediction = linear_regression.predict(boston_market_test_data[id,:].reshape(1,-1))\n",
    "print(\"Model predicted for data under index {0} value {1}\".format(id, linear_regression_prediction))\n",
    "print(\"Real value for data under index {0} is {1}\".format(id, boston_market_test_target[id]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# References\n",
    "__ALL images (unless otherwise stated) are from book__: Raschka, Sebastian. Python machine learning. Birmingham, UK: Packt Publishing, 2015, ISBN 1783555130\n",
    "\n",
    "If You are using Your own computer\n",
    "install required packages: \n",
    "```conda install numpy pandas scikit-learn matplotlib seaborn```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}