Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2019
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 26.14 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Trabalho (traindata)"
  8. ]
  9. },
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "# Grupo 7 | 1110299 Marta Ferreira | 1180167 Mariana Oliveira | 1180149 Ana Lima"
  15. ]
  16. },
  17. {
  18. "cell_type": "code",
  19. "execution_count": 1,
  20. "metadata": {},
  21. "outputs": [],
  22. "source": [
  23. "#Passos que temos que fazer: (Para retirar daqui depois)\n",
  24. "#\n",
  25. "#Carregar dados - e tratar os dados! temos de usar mais do que o que estamos a dar nas aulas\n",
  26. "#Split (treino/Teste)\n",
  27. "#criar standardscaler (standardizar, média 0 e desvio padrao 1)\n",
  28. "#treinar standardscaler\n",
  29. "#Transformar dados treino\n",
  30. "#treinar KNN\n",
  31. "#Transformar testset\n",
  32. "#Score/predict\n",
  33. "#Trabalho da Iris fizemos uma batota, porque normalizamos dados antes de os dividir (confirmar)"
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 2,
  39. "metadata": {},
  40. "outputs": [],
  41. "source": [
  42. "import numpy as np\n",
  43. "import matplotlib.pyplot as plt\n",
  44. "import pandas as pd\n",
  45. "\n",
  46. "from sklearn import preprocessing as pp\n",
  47. "from sklearn.model_selection import train_test_split\n",
  48. "from sklearn.preprocessing import (\n",
  49. " StandardScaler,\n",
  50. " MinMaxScaler,\n",
  51. " RobustScaler)\n",
  52. "\n",
  53. "from sklearn.neighbors import KNeighborsClassifier\n",
  54. "from sklearn.pipeline import make_pipeline\n",
  55. "from sklearn.pipeline import Pipeline"
  56. ]
  57. },
  58. {
  59. "cell_type": "code",
  60. "execution_count": 3,
  61. "metadata": {},
  62. "outputs": [
  63. {
  64. "data": {
  65. "text/html": [
  66. "<div>\n",
  67. "<style scoped>\n",
  68. " .dataframe tbody tr th:only-of-type {\n",
  69. " vertical-align: middle;\n",
  70. " }\n",
  71. "\n",
  72. " .dataframe tbody tr th {\n",
  73. " vertical-align: top;\n",
  74. " }\n",
  75. "\n",
  76. " .dataframe thead th {\n",
  77. " text-align: right;\n",
  78. " }\n",
  79. "</style>\n",
  80. "<table border=\"1\" class=\"dataframe\">\n",
  81. " <thead>\n",
  82. " <tr style=\"text-align: right;\">\n",
  83. " <th></th>\n",
  84. " <th>0</th>\n",
  85. " <th>1</th>\n",
  86. " <th>2</th>\n",
  87. " <th>3</th>\n",
  88. " <th>4</th>\n",
  89. " <th>5</th>\n",
  90. " <th>6</th>\n",
  91. " <th>7</th>\n",
  92. " <th>8</th>\n",
  93. " <th>9</th>\n",
  94. " <th>10</th>\n",
  95. " <th>11</th>\n",
  96. " <th>12</th>\n",
  97. " <th>13</th>\n",
  98. " </tr>\n",
  99. " <tr>\n",
  100. " <th>Id</th>\n",
  101. " <th></th>\n",
  102. " <th></th>\n",
  103. " <th></th>\n",
  104. " <th></th>\n",
  105. " <th></th>\n",
  106. " <th></th>\n",
  107. " <th></th>\n",
  108. " <th></th>\n",
  109. " <th></th>\n",
  110. " <th></th>\n",
  111. " <th></th>\n",
  112. " <th></th>\n",
  113. " <th></th>\n",
  114. " <th></th>\n",
  115. " </tr>\n",
  116. " </thead>\n",
  117. " <tbody>\n",
  118. " <tr>\n",
  119. " <th>0</th>\n",
  120. " <td>63.0</td>\n",
  121. " <td>1.0</td>\n",
  122. " <td>1.0</td>\n",
  123. " <td>145.0</td>\n",
  124. " <td>233.0</td>\n",
  125. " <td>1.0</td>\n",
  126. " <td>2.0</td>\n",
  127. " <td>150.0</td>\n",
  128. " <td>0.0</td>\n",
  129. " <td>2.3</td>\n",
  130. " <td>3.0</td>\n",
  131. " <td>0.0</td>\n",
  132. " <td>6.0</td>\n",
  133. " <td>0</td>\n",
  134. " </tr>\n",
  135. " <tr>\n",
  136. " <th>1</th>\n",
  137. " <td>67.0</td>\n",
  138. " <td>1.0</td>\n",
  139. " <td>4.0</td>\n",
  140. " <td>160.0</td>\n",
  141. " <td>286.0</td>\n",
  142. " <td>0.0</td>\n",
  143. " <td>2.0</td>\n",
  144. " <td>108.0</td>\n",
  145. " <td>1.0</td>\n",
  146. " <td>1.5</td>\n",
  147. " <td>2.0</td>\n",
  148. " <td>3.0</td>\n",
  149. " <td>3.0</td>\n",
  150. " <td>2</td>\n",
  151. " </tr>\n",
  152. " <tr>\n",
  153. " <th>2</th>\n",
  154. " <td>67.0</td>\n",
  155. " <td>1.0</td>\n",
  156. " <td>4.0</td>\n",
  157. " <td>120.0</td>\n",
  158. " <td>229.0</td>\n",
  159. " <td>0.0</td>\n",
  160. " <td>2.0</td>\n",
  161. " <td>129.0</td>\n",
  162. " <td>1.0</td>\n",
  163. " <td>2.6</td>\n",
  164. " <td>2.0</td>\n",
  165. " <td>2.0</td>\n",
  166. " <td>7.0</td>\n",
  167. " <td>1</td>\n",
  168. " </tr>\n",
  169. " <tr>\n",
  170. " <th>3</th>\n",
  171. " <td>37.0</td>\n",
  172. " <td>1.0</td>\n",
  173. " <td>3.0</td>\n",
  174. " <td>130.0</td>\n",
  175. " <td>250.0</td>\n",
  176. " <td>0.0</td>\n",
  177. " <td>0.0</td>\n",
  178. " <td>187.0</td>\n",
  179. " <td>0.0</td>\n",
  180. " <td>3.5</td>\n",
  181. " <td>3.0</td>\n",
  182. " <td>0.0</td>\n",
  183. " <td>3.0</td>\n",
  184. " <td>0</td>\n",
  185. " </tr>\n",
  186. " <tr>\n",
  187. " <th>4</th>\n",
  188. " <td>41.0</td>\n",
  189. " <td>0.0</td>\n",
  190. " <td>2.0</td>\n",
  191. " <td>130.0</td>\n",
  192. " <td>204.0</td>\n",
  193. " <td>0.0</td>\n",
  194. " <td>2.0</td>\n",
  195. " <td>172.0</td>\n",
  196. " <td>0.0</td>\n",
  197. " <td>1.4</td>\n",
  198. " <td>1.0</td>\n",
  199. " <td>0.0</td>\n",
  200. " <td>3.0</td>\n",
  201. " <td>0</td>\n",
  202. " </tr>\n",
  203. " </tbody>\n",
  204. "</table>\n",
  205. "</div>"
  206. ],
  207. "text/plain": [
  208. " 0 1 2 3 4 5 6 7 8 9 10 11 12 13\n",
  209. "Id \n",
  210. "0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0\n",
  211. "1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2\n",
  212. "2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1\n",
  213. "3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0\n",
  214. "4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0"
  215. ]
  216. },
  217. "execution_count": 3,
  218. "metadata": {},
  219. "output_type": "execute_result"
  220. }
  221. ],
  222. "source": [
  223. "# Importação dos dados do ficheiro CSV\n",
  224. "traindata = pd.read_csv(\"traindata.csv\", header=0, index_col=0) \n",
  225. "traindata.head()"
  226. ]
  227. },
  228. {
  229. "cell_type": "code",
  230. "execution_count": 4,
  231. "metadata": {},
  232. "outputs": [
  233. {
  234. "data": {
  235. "text/plain": [
  236. "(216, 14)"
  237. ]
  238. },
  239. "execution_count": 4,
  240. "metadata": {},
  241. "output_type": "execute_result"
  242. }
  243. ],
  244. "source": [
  245. "traindata.shape"
  246. ]
  247. },
  248. {
  249. "cell_type": "code",
  250. "execution_count": 5,
  251. "metadata": {},
  252. "outputs": [
  253. {
  254. "data": {
  255. "text/html": [
  256. "<div>\n",
  257. "<style scoped>\n",
  258. " .dataframe tbody tr th:only-of-type {\n",
  259. " vertical-align: middle;\n",
  260. " }\n",
  261. "\n",
  262. " .dataframe tbody tr th {\n",
  263. " vertical-align: top;\n",
  264. " }\n",
  265. "\n",
  266. " .dataframe thead th {\n",
  267. " text-align: right;\n",
  268. " }\n",
  269. "</style>\n",
  270. "<table border=\"1\" class=\"dataframe\">\n",
  271. " <thead>\n",
  272. " <tr style=\"text-align: right;\">\n",
  273. " <th></th>\n",
  274. " <th>0</th>\n",
  275. " <th>1</th>\n",
  276. " <th>2</th>\n",
  277. " <th>3</th>\n",
  278. " <th>4</th>\n",
  279. " <th>5</th>\n",
  280. " <th>6</th>\n",
  281. " <th>7</th>\n",
  282. " <th>8</th>\n",
  283. " <th>9</th>\n",
  284. " <th>10</th>\n",
  285. " <th>11</th>\n",
  286. " <th>12</th>\n",
  287. " <th>13</th>\n",
  288. " </tr>\n",
  289. " <tr>\n",
  290. " <th>Id</th>\n",
  291. " <th></th>\n",
  292. " <th></th>\n",
  293. " <th></th>\n",
  294. " <th></th>\n",
  295. " <th></th>\n",
  296. " <th></th>\n",
  297. " <th></th>\n",
  298. " <th></th>\n",
  299. " <th></th>\n",
  300. " <th></th>\n",
  301. " <th></th>\n",
  302. " <th></th>\n",
  303. " <th></th>\n",
  304. " <th></th>\n",
  305. " </tr>\n",
  306. " </thead>\n",
  307. " <tbody>\n",
  308. " <tr>\n",
  309. " <th>0</th>\n",
  310. " <td>63.0</td>\n",
  311. " <td>1.0</td>\n",
  312. " <td>1.0</td>\n",
  313. " <td>145.0</td>\n",
  314. " <td>233.0</td>\n",
  315. " <td>1.0</td>\n",
  316. " <td>2.0</td>\n",
  317. " <td>150.0</td>\n",
  318. " <td>0.0</td>\n",
  319. " <td>2.3</td>\n",
  320. " <td>3.0</td>\n",
  321. " <td>0.0</td>\n",
  322. " <td>6.0</td>\n",
  323. " <td>0</td>\n",
  324. " </tr>\n",
  325. " <tr>\n",
  326. " <th>1</th>\n",
  327. " <td>67.0</td>\n",
  328. " <td>1.0</td>\n",
  329. " <td>4.0</td>\n",
  330. " <td>160.0</td>\n",
  331. " <td>286.0</td>\n",
  332. " <td>0.0</td>\n",
  333. " <td>2.0</td>\n",
  334. " <td>108.0</td>\n",
  335. " <td>1.0</td>\n",
  336. " <td>1.5</td>\n",
  337. " <td>2.0</td>\n",
  338. " <td>3.0</td>\n",
  339. " <td>3.0</td>\n",
  340. " <td>2</td>\n",
  341. " </tr>\n",
  342. " <tr>\n",
  343. " <th>2</th>\n",
  344. " <td>67.0</td>\n",
  345. " <td>1.0</td>\n",
  346. " <td>4.0</td>\n",
  347. " <td>120.0</td>\n",
  348. " <td>229.0</td>\n",
  349. " <td>0.0</td>\n",
  350. " <td>2.0</td>\n",
  351. " <td>129.0</td>\n",
  352. " <td>1.0</td>\n",
  353. " <td>2.6</td>\n",
  354. " <td>2.0</td>\n",
  355. " <td>2.0</td>\n",
  356. " <td>7.0</td>\n",
  357. " <td>1</td>\n",
  358. " </tr>\n",
  359. " <tr>\n",
  360. " <th>3</th>\n",
  361. " <td>37.0</td>\n",
  362. " <td>1.0</td>\n",
  363. " <td>3.0</td>\n",
  364. " <td>130.0</td>\n",
  365. " <td>250.0</td>\n",
  366. " <td>0.0</td>\n",
  367. " <td>0.0</td>\n",
  368. " <td>187.0</td>\n",
  369. " <td>0.0</td>\n",
  370. " <td>3.5</td>\n",
  371. " <td>3.0</td>\n",
  372. " <td>0.0</td>\n",
  373. " <td>3.0</td>\n",
  374. " <td>0</td>\n",
  375. " </tr>\n",
  376. " <tr>\n",
  377. " <th>4</th>\n",
  378. " <td>41.0</td>\n",
  379. " <td>0.0</td>\n",
  380. " <td>2.0</td>\n",
  381. " <td>130.0</td>\n",
  382. " <td>204.0</td>\n",
  383. " <td>0.0</td>\n",
  384. " <td>2.0</td>\n",
  385. " <td>172.0</td>\n",
  386. " <td>0.0</td>\n",
  387. " <td>1.4</td>\n",
  388. " <td>1.0</td>\n",
  389. " <td>0.0</td>\n",
  390. " <td>3.0</td>\n",
  391. " <td>0</td>\n",
  392. " </tr>\n",
  393. " </tbody>\n",
  394. "</table>\n",
  395. "</div>"
  396. ],
  397. "text/plain": [
  398. " 0 1 2 3 4 5 6 7 8 9 10 11 12 13\n",
  399. "Id \n",
  400. "0 63.0 1.0 1.0 145.0 233.0 1.0 2.0 150.0 0.0 2.3 3.0 0.0 6.0 0\n",
  401. "1 67.0 1.0 4.0 160.0 286.0 0.0 2.0 108.0 1.0 1.5 2.0 3.0 3.0 2\n",
  402. "2 67.0 1.0 4.0 120.0 229.0 0.0 2.0 129.0 1.0 2.6 2.0 2.0 7.0 1\n",
  403. "3 37.0 1.0 3.0 130.0 250.0 0.0 0.0 187.0 0.0 3.5 3.0 0.0 3.0 0\n",
  404. "4 41.0 0.0 2.0 130.0 204.0 0.0 2.0 172.0 0.0 1.4 1.0 0.0 3.0 0"
  405. ]
  406. },
  407. "execution_count": 5,
  408. "metadata": {},
  409. "output_type": "execute_result"
  410. }
  411. ],
  412. "source": [
  413. "df=pd.DataFrame(traindata)\n",
  414. "df.head()"
  415. ]
  416. },
  417. {
  418. "cell_type": "code",
  419. "execution_count": 6,
  420. "metadata": {},
  421. "outputs": [],
  422. "source": [
  423. "traindata.replace({'?':np.nan}, inplace=True)"
  424. ]
  425. },
  426. {
  427. "cell_type": "code",
  428. "execution_count": 7,
  429. "metadata": {},
  430. "outputs": [],
  431. "source": [
  432. "traindata.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)"
  433. ]
  434. },
  435. {
  436. "cell_type": "code",
  437. "execution_count": 8,
  438. "metadata": {},
  439. "outputs": [],
  440. "source": [
  441. "trainset, testset=train_test_split(traindata)"
  442. ]
  443. },
  444. {
  445. "cell_type": "code",
  446. "execution_count": 9,
  447. "metadata": {},
  448. "outputs": [
  449. {
  450. "data": {
  451. "text/plain": [
  452. "0 93\n",
  453. "1 26\n",
  454. "3 19\n",
  455. "2 19\n",
  456. "4 2\n",
  457. "Name: 13, dtype: int64"
  458. ]
  459. },
  460. "execution_count": 9,
  461. "metadata": {},
  462. "output_type": "execute_result"
  463. }
  464. ],
  465. "source": [
  466. "#Contar os valores trainset. Fazer para a coluna 13, neste caso.\n",
  467. "trainset[\"13\"].value_counts()"
  468. ]
  469. },
  470. {
  471. "cell_type": "code",
  472. "execution_count": 10,
  473. "metadata": {},
  474. "outputs": [
  475. {
  476. "data": {
  477. "text/plain": [
  478. "0 22\n",
  479. "1 12\n",
  480. "4 7\n",
  481. "3 6\n",
  482. "2 6\n",
  483. "Name: 13, dtype: int64"
  484. ]
  485. },
  486. "execution_count": 10,
  487. "metadata": {},
  488. "output_type": "execute_result"
  489. }
  490. ],
  491. "source": [
  492. "#(No testset vamos ter que ver se os valores estão balanceados)\n",
  493. "testset[\"13\"].value_counts()"
  494. ]
  495. },
  496. {
  497. "cell_type": "markdown",
  498. "metadata": {},
  499. "source": [
  500. "# Função Standardscaler (dados já divididos)"
  501. ]
  502. },
  503. {
  504. "cell_type": "code",
  505. "execution_count": 11,
  506. "metadata": {},
  507. "outputs": [],
  508. "source": [
  509. "# Função Standardscaler - Transformar os dados de forma a que a sua distribuição tenha um valor \n",
  510. "# médio igual a 0 e um desvio padrão de 1."
  511. ]
  512. },
  513. {
  514. "cell_type": "code",
  515. "execution_count": 12,
  516. "metadata": {},
  517. "outputs": [
  518. {
  519. "data": {
  520. "text/plain": [
  521. "array([[-1.51112929, 0.73735291, -0.15951158, -0.150565 , -0.56516362,\n",
  522. " -0.3683942 , 1.04842767, 0.77092891, -0.63802794, 0.80364581,\n",
  523. " 0.61441966, -0.67320658, -0.80632771],\n",
  524. " [-1.7414932 , -1.35620268, -0.15951158, 0.31769583, -0.44273717,\n",
  525. " -0.3683942 , -0.97217839, 0.06973365, -0.63802794, -0.90921859,\n",
  526. " 0.61441966, -0.67320658, -0.80632771],\n",
  527. " [ 1.94432934, -1.35620268, 0.89725263, -1.20415186, -1.89145018,\n",
  528. " -0.3683942 , -0.97217839, -1.11353336, -0.63802794, 0.46107293,\n",
  529. " 0.61441966, -0.67320658, -0.80632771],\n",
  530. " [ 0.33178198, 0.73735291, 0.89725263, 1.02008707, 0.69990971,\n",
  531. " -0.3683942 , 1.04842767, -1.68325451, 1.56732949, -0.39535927,\n",
  532. " 0.61441966, 0.50305546, 0.75716139],\n",
  533. " [-0.93521952, -1.35620268, 0.89725263, 0.31769583, 0.02656423,\n",
  534. " -0.3683942 , 1.04842767, 0.06973365, 1.56732949, -0.90921859,\n",
  535. " 0.61441966, -0.67320658, -0.80632771],\n",
  536. " [-0.4744917 , -1.35620268, -1.21627579, -0.73589103, 0.04696864,\n",
  537. " -0.3683942 , -0.97217839, 0.50798069, -0.63802794, 0.03285683,\n",
  538. " -0.91202919, -0.67320658, -0.80632771],\n",
  539. " [ 0.33178198, 0.73735291, -1.21627579, 1.25421748, -0.19788426,\n",
  540. " -0.3683942 , 1.04842767, 0.5956301 , -0.63802794, -0.90921859,\n",
  541. " -0.91202919, 0.50305546, -0.80632771],\n",
  542. " [-1.9718571 , 0.73735291, -0.15951158, -0.150565 , 0.16939509,\n",
  543. " -0.3683942 , -0.97217839, 1.60359829, -0.63802794, 2.08829412,\n",
  544. " 2.14086851, -0.67320658, -0.80632771],\n",
  545. " [-0.58967365, -1.35620268, -1.21627579, 0.08356541, 0.59788767,\n",
  546. " -0.3683942 , -0.97217839, 0.50798069, -0.63802794, -0.90921859,\n",
  547. " 0.61441966, -0.67320658, -0.80632771],\n",
  548. " [ 0.44696394, 0.73735291, 0.89725263, 1.02008707, 0.57748326,\n",
  549. " -0.3683942 , 1.04842767, -1.72707921, 1.56732949, -0.22407283,\n",
  550. " -0.91202919, -0.67320658, 1.27832442],\n",
  551. " [ 0.67732784, 0.73735291, 0.89725263, -0.150565 , 0.23060832,\n",
  552. " -0.3683942 , -0.97217839, -0.28086398, 1.56732949, 0.28978649,\n",
  553. " -0.91202919, 0.50305546, 1.27832442],\n",
  554. " [-1.05040147, 0.73735291, 0.89725263, -1.67241268, -0.68759007,\n",
  555. " -0.3683942 , 1.04842767, -0.10556517, 1.56732949, 1.66007802,\n",
  556. " 0.61441966, -0.67320658, -0.80632771]])"
  557. ]
  558. },
  559. "execution_count": 12,
  560. "metadata": {},
  561. "output_type": "execute_result"
  562. }
  563. ],
  564. "source": [
  565. "scaler=StandardScaler()\n",
  566. "scaler.fit(trainset.iloc[:,:-1])\n",
  567. "trainX=scaler.transform(trainset.iloc[:,:-1])#Fizemos o fit e o transform juntos, mas podia ser separados\n",
  568. "trainX[:12] #Queremos normalizar tudo menos a última coluna"
  569. ]
  570. },
  571. {
  572. "cell_type": "code",
  573. "execution_count": 13,
  574. "metadata": {},
  575. "outputs": [],
  576. "source": [
  577. "trainY=trainset[\"13\"] #isto é o que eu quero prever"
  578. ]
  579. },
  580. {
  581. "cell_type": "markdown",
  582. "metadata": {},
  583. "source": [
  584. "# KNN (Determinação do vizinho mais próximo)"
  585. ]
  586. },
  587. {
  588. "cell_type": "code",
  589. "execution_count": 27,
  590. "metadata": {},
  591. "outputs": [
  592. {
  593. "data": {
  594. "text/plain": [
  595. "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
  596. " metric_params=None, n_jobs=None, n_neighbors=3, p=2,\n",
  597. " weights='uniform')"
  598. ]
  599. },
  600. "execution_count": 27,
  601. "metadata": {},
  602. "output_type": "execute_result"
  603. }
  604. ],
  605. "source": [
  606. "knnmodel=KNeighborsClassifier(n_neighbors=3)#ver os vizinhos porque são 3??\n",
  607. "knnmodel.fit(trainX, trainY) #temos de fazer o fit"
  608. ]
  609. },
  610. {
  611. "cell_type": "code",
  612. "execution_count": 15,
  613. "metadata": {},
  614. "outputs": [],
  615. "source": [
  616. "testX=scaler.transform(testset.iloc[:,:-1])\n",
  617. "testY=testset[\"13\"]"
  618. ]
  619. },
  620. {
  621. "cell_type": "code",
  622. "execution_count": 16,
  623. "metadata": {},
  624. "outputs": [
  625. {
  626. "data": {
  627. "text/plain": [
  628. "0.4716981132075472"
  629. ]
  630. },
  631. "execution_count": 16,
  632. "metadata": {},
  633. "output_type": "execute_result"
  634. }
  635. ],
  636. "source": [
  637. "knnmodel.score(testX,testY) #normalização feita com base nos dados de treino. \n",
  638. "#O testY é o que eu quero prever. \n",
  639. "#Está a avaliar a qualidade do resultado: proporção de previsões corretas no conjunto de teste (aqui ~0.47)"
  640. ]
  641. },
  642. {
  643. "cell_type": "code",
  644. "execution_count": 17,
  645. "metadata": {},
  646. "outputs": [],
  647. "source": [
  648. "#knnmodel.predict(testset.iloc[:,:-1]) (ver o que colocar aqui)"
  649. ]
  650. },
  651. {
  652. "cell_type": "code",
  653. "execution_count": 18,
  654. "metadata": {},
  655. "outputs": [],
  656. "source": [
  657. "ppl=make_pipeline(StandardScaler(),KNeighborsClassifier(n_neighbors=1)) #ver isto dos vizinhos"
  658. ]
  659. },
  660. {
  661. "cell_type": "code",
  662. "execution_count": 19,
  663. "metadata": {},
  664. "outputs": [],
  665. "source": [
  666. "ppl2=Pipeline(steps=[(\"scaler\",StandardScaler()),(\"3NN\",KNeighborsClassifier(n_neighbors=3))])\n",
  667. "#outra forma de se resolver o ppl2. ver a questão dos vizinhos como se faz"
  668. ]
  669. },
  670. {
  671. "cell_type": "code",
  672. "execution_count": 20,
  673. "metadata": {},
  674. "outputs": [
  675. {
  676. "data": {
  677. "text/plain": [
  678. "Pipeline(memory=None,\n",
  679. " steps=[('standardscaler',\n",
  680. " StandardScaler(copy=True, with_mean=True, with_std=True)),\n",
  681. " ('kneighborsclassifier',\n",
  682. " KNeighborsClassifier(algorithm='auto', leaf_size=30,\n",
  683. " metric='minkowski', metric_params=None,\n",
  684. " n_jobs=None, n_neighbors=1, p=2,\n",
  685. " weights='uniform'))],\n",
  686. " verbose=False)"
  687. ]
  688. },
  689. "execution_count": 20,
  690. "metadata": {},
  691. "output_type": "execute_result"
  692. }
  693. ],
  694. "source": [
  695. "ppl.fit(trainX,trainY) \n",
  696. "#o pipeline é uma linha de transformação \n",
  697. "#- em vez da standardScaler - faz o fit transform para varios modelos?"
  698. ]
  699. },
  700. {
  701. "cell_type": "code",
  702. "execution_count": 21,
  703. "metadata": {},
  704. "outputs": [
  705. {
  706. "data": {
  707. "text/plain": [
  708. "0.5283018867924528"
  709. ]
  710. },
  711. "execution_count": 21,
  712. "metadata": {},
  713. "output_type": "execute_result"
  714. }
  715. ],
  716. "source": [
  717. "ppl.score(testX,testY)"
  718. ]
  719. },
  720. {
  721. "cell_type": "code",
  722. "execution_count": 22,
  723. "metadata": {},
  724. "outputs": [
  725. {
  726. "data": {
  727. "text/plain": [
  728. "Pipeline(memory=None,\n",
  729. " steps=[('scaler',\n",
  730. " StandardScaler(copy=True, with_mean=True, with_std=True)),\n",
  731. " ('3NN',\n",
  732. " KNeighborsClassifier(algorithm='auto', leaf_size=30,\n",
  733. " metric='minkowski', metric_params=None,\n",
  734. " n_jobs=None, n_neighbors=3, p=2,\n",
  735. " weights='uniform'))],\n",
  736. " verbose=False)"
  737. ]
  738. },
  739. "execution_count": 22,
  740. "metadata": {},
  741. "output_type": "execute_result"
  742. }
  743. ],
  744. "source": [
  745. "ppl2.fit(trainX,trainY)"
  746. ]
  747. },
  748. {
  749. "cell_type": "code",
  750. "execution_count": 23,
  751. "metadata": {},
  752. "outputs": [
  753. {
  754. "data": {
  755. "text/plain": [
  756. "0.49056603773584906"
  757. ]
  758. },
  759. "execution_count": 23,
  760. "metadata": {},
  761. "output_type": "execute_result"
  762. }
  763. ],
  764. "source": [
  765. "ppl2.score(testX,testY)"
  766. ]
  767. },
  768. {
  769. "cell_type": "markdown",
  770. "metadata": {},
  771. "source": [
  772. "Quais as previsões erradas? (Vamos ver onde erramos)"
  773. ]
  774. },
  775. {
  776. "cell_type": "code",
  777. "execution_count": 24,
  778. "metadata": {},
  779. "outputs": [
  780. {
  781. "data": {
  782. "text/plain": [
  783. "array([0, 3, 0, 3, 2], dtype=int64)"
  784. ]
  785. },
  786. "execution_count": 24,
  787. "metadata": {},
  788. "output_type": "execute_result"
  789. }
  790. ],
  791. "source": [
  792. "preds=ppl.predict(testX)\n",
  793. "preds[:5]"
  794. ]
  795. },
  796. {
  797. "cell_type": "code",
  798. "execution_count": 25,
  799. "metadata": {},
  800. "outputs": [
  801. {
  802. "data": {
  803. "text/plain": [
  804. "Id\n",
  805. "30 False\n",
  806. "40 True\n",
  807. "0 False\n",
  808. "213 False\n",
  809. "174 True\n",
  810. "Name: 13, dtype: bool"
  811. ]
  812. },
  813. "execution_count": 25,
  814. "metadata": {},
  815. "output_type": "execute_result"
  816. }
  817. ],
  818. "source": [
  819. "errado=(preds !=testY)\n",
  820. "errado.head()\n",
  821. "#Para identificarmos os objetos em que erramos a previsão\n",
  822. "#True é porque está errado e false é onde está certo"
  823. ]
  824. },
  825. {
  826. "cell_type": "code",
  827. "execution_count": 26,
  828. "metadata": {},
  829. "outputs": [
  830. {
  831. "name": "stderr",
  832. "output_type": "stream",
  833. "text": [
  834. "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
  835. "A value is trying to be set on a copy of a slice from a DataFrame\n",
  836. "\n",
  837. "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
  838. " \n",
  839. "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:8682: SettingWithCopyWarning: \n",
  840. "A value is trying to be set on a copy of a slice from a DataFrame\n",
  841. "\n",
  842. "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
  843. " self._update_inplace(new_data)\n",
  844. "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3325: SettingWithCopyWarning: \n",
  845. "A value is trying to be set on a copy of a slice from a DataFrame\n",
  846. "\n",
  847. "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
  848. " exec(code_obj, self.user_global_ns, self.user_ns)\n"
  849. ]
  850. },
  851. {
  852. "data": {
  853. "text/plain": [
  854. "Id\n",
  855. "30 0\n",
  856. "40 FAIL\n",
  857. "0 0\n",
  858. "213 3\n",
  859. "174 FAIL\n",
  860. "298 1\n",
  861. "280 2\n",
  862. "19 0\n",
  863. "177 1\n",
  864. "240 0\n",
  865. "Name: 13, dtype: object"
  866. ]
  867. },
  868. "execution_count": 26,
  869. "metadata": {},
  870. "output_type": "execute_result"
  871. }
  872. ],
  873. "source": [
  874. "colorseries=testY\n",
  875. "colorseries[errado==True]=\"FAIL\"\n",
  876. "colorseries.head(10)"
  877. ]
  878. },
  879. {
  880. "cell_type": "code",
  881. "execution_count": null,
  882. "metadata": {},
  883. "outputs": [],
  884. "source": []
  885. }
  886. ],
  887. "metadata": {
  888. "kernelspec": {
  889. "display_name": "Python 3",
  890. "language": "python",
  891. "name": "python3"
  892. },
  893. "language_info": {
  894. "codemirror_mode": {
  895. "name": "ipython",
  896. "version": 3
  897. },
  898. "file_extension": ".py",
  899. "mimetype": "text/x-python",
  900. "name": "python",
  901. "nbconvert_exporter": "python",
  902. "pygments_lexer": "ipython3",
  903. "version": "3.7.3"
  904. }
  905. },
  906. "nbformat": 4,
  907. "nbformat_minor": 2
  908. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement