Guest User

Untitled

a guest
Jun 20th, 2018
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.01 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import numpy as np\n",
  10. "import pandas as pd\n",
  11. "pd.set_option(\"display.max_rows\", 5000)\n",
  12. "pd.set_option(\"display.max_columns\", 1000)\n",
  13. "pd.set_option(\"display.max_colwidth\", 10000)"
  14. ]
  15. },
  16. {
  17. "cell_type": "code",
  18. "execution_count": 2,
  19. "metadata": {},
  20. "outputs": [],
  21. "source": [
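  22. "# data: survey_results_public.csv (presumably the Stack Overflow Developer Survey public results -- TODO confirm and add the source URL)\n",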
  23. "df = pd.read_csv('survey_results_public.csv', low_memory=False)"
  24. ]
  25. },
  26. {
  27. "cell_type": "code",
  28. "execution_count": 3,
  29. "metadata": {},
  30. "outputs": [],
  31. "source": [
  32. "# Drop rows missing either answer used below, then renumber the remaining rows 0..n-1 (the previous bare .reindex() changed nothing)\n",
  33. "df = df.dropna(axis=0, subset=['JobSatisfaction', 'LanguageWorkedWith']).reset_index(drop=True)"
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 4,
  39. "metadata": {},
  40. "outputs": [
  41. {
  42. "data": {
  43. "text/plain": [
  44. "Moderately satisfied 22973\n",
  45. "Extremely satisfied 11084\n",
  46. "Slightly satisfied 8795\n",
  47. "Slightly dissatisfied 6253\n",
  48. "Moderately dissatisfied 5603\n",
  49. "Neither satisfied nor dissatisfied 4242\n",
  50. "Extremely dissatisfied 2142\n",
  51. "Name: JobSatisfaction, dtype: int64"
  52. ]
  53. },
  54. "execution_count": 4,
  55. "metadata": {},
  56. "output_type": "execute_result"
  57. }
  58. ],
  59. "source": [
  60. "df.JobSatisfaction.value_counts()"
  61. ]
  62. },
  63. {
  64. "cell_type": "code",
  65. "execution_count": 5,
  66. "metadata": {},
  67. "outputs": [],
  68. "source": [
  69. "# Encode the satisfaction labels on a symmetric -3..3 scale; Series.map turns any label missing from the dict into NaN\n",
  70. "satisfaction_levels = {\n",
  71. "    'Extremely satisfied': 3,\n",
  72. "    'Moderately satisfied': 2,\n",
  73. "    'Slightly satisfied': 1,\n",
  74. "    'Neither satisfied nor dissatisfied': 0,\n",
  75. "    'Slightly dissatisfied': -1,\n",
  76. "    'Moderately dissatisfied': -2,\n",
  77. "    'Extremely dissatisfied': -3,\n",
  78. "    np.nan: np.nan,\n",
  79. "}\n",
  80. "df['satisfaction'] = df.JobSatisfaction.map(satisfaction_levels)"
  81. ]
  82. },
  83. {
  84. "cell_type": "code",
  85. "execution_count": 6,
  86. "metadata": {},
  87. "outputs": [
  88. {
  89. "data": {
  90. "text/plain": [
  91. "C#;JavaScript;SQL;HTML;CSS 1198\n",
  92. "JavaScript;PHP;SQL;HTML;CSS 1015\n",
  93. "C#;JavaScript;SQL;TypeScript;HTML;CSS 765\n",
  94. "Java 733\n",
  95. "JavaScript;PHP;SQL;HTML;CSS;Bash/Shell 682\n",
  96. "Name: LanguageWorkedWith, dtype: int64"
  97. ]
  98. },
  99. "execution_count": 6,
  100. "metadata": {},
  101. "output_type": "execute_result"
  102. }
  103. ],
  104. "source": [
  105. "# LanguageWorkedWith is a multiple-choice answer stored as one ';'-separated string, so it has to be split into per-language columns\n",
  106. "df.LanguageWorkedWith.value_counts().head()"
  107. ]
  108. },
  109. {
  110. "cell_type": "code",
  111. "execution_count": 7,
  112. "metadata": {},
  113. "outputs": [],
  114. "source": [
  115. "# Collect every distinct language that appears in any answer, in sorted order\n",
  116. "all_languages = sorted({\n",
  117. "    name\n",
  118. "    for answer in df.LanguageWorkedWith.unique()\n",
  119. "    for name in answer.split(';')\n",
  120. "})\n",
  121. "# Names of the per-language indicator columns, in the same order as all_languages\n",
  122. "language_rows = ['lang_{}'.format(name) for name in all_languages]\n",
  123. "# Start with every indicator at 0.0 (\"not used\")\n",
  124. "for column in language_rows:\n",
  125. "    df[column] = 0.0"
  126. ]
  127. },
  128. {
  129. "cell_type": "code",
  130. "execution_count": 8,
  131. "metadata": {},
  132. "outputs": [],
  133. "source": [
  134. "def set_languages(row):\n",
  135. "    \"\"\"Return a copy of row with lang_<name> set to 1.0 for every language in its ';'-separated LanguageWorkedWith.\"\"\"\n",
  136. "    marked = row.copy()  # pandas does not support apply() functions that mutate the object they are given\n",
  137. "    for lang in row.LanguageWorkedWith.split(';'):\n",
  138. "        marked['lang_{}'.format(lang)] = 1.0\n",
  139. "    return marked"
  140. ]
  141. },
  142. {
  143. "cell_type": "code",
  144. "execution_count": 9,
  145. "metadata": {},
  146. "outputs": [],
  147. "source": [
  148. "# Run set_languages on every row (axis=1) to fill in the lang_* indicator columns\n",
  149. "df = df.apply(set_languages, axis=1)"
  150. ]
  151. },
  152. {
  153. "cell_type": "code",
  154. "execution_count": 10,
  155. "metadata": {},
  156. "outputs": [],
  157. "source": [
  158. "from sklearn import linear_model"
  159. ]
  160. },
  161. {
  162. "cell_type": "code",
  163. "execution_count": 11,
  164. "metadata": {},
  165. "outputs": [
  166. {
  167. "data": {
  168. "text/plain": [
  169. "array([ 3., -2., 0., ..., 0., 1., -2.])"
  170. ]
  171. },
  172. "execution_count": 11,
  173. "metadata": {},
  174. "output_type": "execute_result"
  175. }
  176. ],
  177. "source": [
  178. "y = df.satisfaction.astype(float).values  # regression target; builtin float replaces the np.float alias removed in NumPy 1.24\n",
  179. "y"
  180. ]
  181. },
  182. {
  183. "cell_type": "code",
  184. "execution_count": 12,
  185. "metadata": {},
  186. "outputs": [
  187. {
  188. "data": {
  189. "text/plain": [
  190. "array([[0., 0., 0., ..., 0., 0., 0.],\n",
  191. " [0., 1., 0., ..., 0., 0., 0.],\n",
  192. " [0., 1., 0., ..., 0., 0., 0.],\n",
  193. " ...,\n",
  194. " [0., 0., 0., ..., 0., 0., 0.],\n",
  195. " [0., 0., 0., ..., 0., 0., 0.],\n",
  196. " [0., 0., 0., ..., 0., 0., 0.]])"
  197. ]
  198. },
  199. "execution_count": 12,
  200. "metadata": {},
  201. "output_type": "execute_result"
  202. }
  203. ],
  204. "source": [
  205. "X = df[language_rows].astype(float).values  # one indicator column per language; builtin float replaces the np.float alias removed in NumPy 1.24\n",
  206. "X"
  207. ]
  208. },
  209. {
  210. "cell_type": "code",
  211. "execution_count": 13,
  212. "metadata": {},
  213. "outputs": [],
  214. "source": [
  215. "regr = linear_model.LinearRegression()"
  216. ]
  217. },
  218. {
  219. "cell_type": "code",
  220. "execution_count": 14,
  221. "metadata": {},
  222. "outputs": [
  223. {
  224. "name": "stderr",
  225. "output_type": "stream",
  226. "text": [
  227. "/Users/roman/workspace/keras-playground/env/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n",
  228. " linalg.lstsq(X, y)\n"
  229. ]
  230. },
  231. {
  232. "data": {
  233. "text/plain": [
  234. "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
  235. ]
  236. },
  237. "execution_count": 14,
  238. "metadata": {},
  239. "output_type": "execute_result"
  240. }
  241. ],
  242. "source": [
  243. "regr.fit(X, y)"
  244. ]
  245. },
  246. {
  247. "cell_type": "code",
  248. "execution_count": 15,
  249. "metadata": {},
  250. "outputs": [],
  251. "source": [
  252. "result = pd.DataFrame(dict(coef=regr.coef_, language=all_languages))"
  253. ]
  254. },
  255. {
  256. "cell_type": "code",
  257. "execution_count": 16,
  258. "metadata": {},
  259. "outputs": [
  260. {
  261. "data": {
  262. "text/html": [
  263. "<div>\n",
  264. "<style scoped>\n",
  265. " .dataframe tbody tr th:only-of-type {\n",
  266. " vertical-align: middle;\n",
  267. " }\n",
  268. "\n",
  269. " .dataframe tbody tr th {\n",
  270. " vertical-align: top;\n",
  271. " }\n",
  272. "\n",
  273. " .dataframe thead th {\n",
  274. " text-align: right;\n",
  275. " }\n",
  276. "</style>\n",
  277. "<table border=\"1\" class=\"dataframe\">\n",
  278. " <thead>\n",
  279. " <tr style=\"text-align: right;\">\n",
  280. " <th></th>\n",
  281. " <th>coef</th>\n",
  282. " <th>language</th>\n",
  283. " </tr>\n",
  284. " </thead>\n",
  285. " <tbody>\n",
  286. " <tr>\n",
  287. " <th>1</th>\n",
  288. " <td>0.117826</td>\n",
  289. " <td>Bash/Shell</td>\n",
  290. " </tr>\n",
  291. " <tr>\n",
  292. " <th>29</th>\n",
  293. " <td>0.088925</td>\n",
  294. " <td>Ruby</td>\n",
  295. " </tr>\n",
  296. " <tr>\n",
  297. " <th>34</th>\n",
  298. " <td>0.088187</td>\n",
  299. " <td>TypeScript</td>\n",
  300. " </tr>\n",
  301. " <tr>\n",
  302. " <th>20</th>\n",
  303. " <td>0.084747</td>\n",
  304. " <td>Kotlin</td>\n",
  305. " </tr>\n",
  306. " <tr>\n",
  307. " <th>22</th>\n",
  308. " <td>0.079638</td>\n",
  309. " <td>Matlab</td>\n",
  310. " </tr>\n",
  311. " <tr>\n",
  312. " <th>13</th>\n",
  313. " <td>0.078126</td>\n",
  314. " <td>Groovy</td>\n",
  315. " </tr>\n",
  316. " <tr>\n",
  317. " <th>33</th>\n",
  318. " <td>0.077379</td>\n",
  319. " <td>Swift</td>\n",
  320. " </tr>\n",
  321. " <tr>\n",
  322. " <th>11</th>\n",
  323. " <td>0.069695</td>\n",
  324. " <td>F#</td>\n",
  325. " </tr>\n",
  326. " <tr>\n",
  327. " <th>27</th>\n",
  328. " <td>0.053163</td>\n",
  329. " <td>Python</td>\n",
  330. " </tr>\n",
  331. " <tr>\n",
  332. " <th>12</th>\n",
  333. " <td>0.051383</td>\n",
  334. " <td>Go</td>\n",
  335. " </tr>\n",
  336. " <tr>\n",
  337. " <th>30</th>\n",
  338. " <td>0.045079</td>\n",
  339. " <td>Rust</td>\n",
  340. " </tr>\n",
  341. " <tr>\n",
  342. " <th>0</th>\n",
  343. " <td>0.042354</td>\n",
  344. " <td>Assembly</td>\n",
  345. " </tr>\n",
  346. " <tr>\n",
  347. " <th>3</th>\n",
  348. " <td>0.040939</td>\n",
  349. " <td>C#</td>\n",
  350. " </tr>\n",
  351. " <tr>\n",
  352. " <th>4</th>\n",
  353. " <td>0.039322</td>\n",
  354. " <td>C++</td>\n",
  355. " </tr>\n",
  356. " <tr>\n",
  357. " <th>23</th>\n",
  358. " <td>0.034584</td>\n",
  359. " <td>Objective-C</td>\n",
  360. " </tr>\n",
  361. " <tr>\n",
  362. " <th>5</th>\n",
  363. " <td>0.027556</td>\n",
  364. " <td>CSS</td>\n",
  365. " </tr>\n",
  366. " <tr>\n",
  367. " <th>21</th>\n",
  368. " <td>0.025346</td>\n",
  369. " <td>Lua</td>\n",
  370. " </tr>\n",
  371. " <tr>\n",
  372. " <th>18</th>\n",
  373. " <td>0.024344</td>\n",
  374. " <td>JavaScript</td>\n",
  375. " </tr>\n",
  376. " <tr>\n",
  377. " <th>36</th>\n",
  378. " <td>0.023078</td>\n",
  379. " <td>VBA</td>\n",
  380. " </tr>\n",
  381. " <tr>\n",
  382. " <th>14</th>\n",
  383. " <td>0.022850</td>\n",
  384. " <td>HTML</td>\n",
  385. " </tr>\n",
  386. " <tr>\n",
  387. " <th>28</th>\n",
  388. " <td>0.019698</td>\n",
  389. " <td>R</td>\n",
  390. " </tr>\n",
  391. " <tr>\n",
  392. " <th>8</th>\n",
  393. " <td>0.012216</td>\n",
  394. " <td>CoffeeScript</td>\n",
  395. " </tr>\n",
  396. " <tr>\n",
  397. " <th>32</th>\n",
  398. " <td>0.008682</td>\n",
  399. " <td>Scala</td>\n",
  400. " </tr>\n",
  401. " <tr>\n",
  402. " <th>26</th>\n",
  403. " <td>0.000065</td>\n",
  404. " <td>Perl</td>\n",
  405. " </tr>\n",
  406. " <tr>\n",
  407. " <th>19</th>\n",
  408. " <td>-0.004244</td>\n",
  409. " <td>Julia</td>\n",
  410. " </tr>\n",
  411. " <tr>\n",
  412. " <th>31</th>\n",
  413. " <td>-0.005304</td>\n",
  414. " <td>SQL</td>\n",
  415. " </tr>\n",
  416. " <tr>\n",
  417. " <th>24</th>\n",
  418. " <td>-0.022756</td>\n",
  419. " <td>Ocaml</td>\n",
  420. " </tr>\n",
  421. " <tr>\n",
  422. " <th>6</th>\n",
  423. " <td>-0.039730</td>\n",
  424. " <td>Clojure</td>\n",
  425. " </tr>\n",
  426. " <tr>\n",
  427. " <th>9</th>\n",
  428. " <td>-0.043176</td>\n",
  429. " <td>Delphi/Object Pascal</td>\n",
  430. " </tr>\n",
  431. " <tr>\n",
  432. " <th>10</th>\n",
  433. " <td>-0.052615</td>\n",
  434. " <td>Erlang</td>\n",
  435. " </tr>\n",
  436. " <tr>\n",
  437. " <th>2</th>\n",
  438. " <td>-0.053490</td>\n",
  439. " <td>C</td>\n",
  440. " </tr>\n",
  441. " <tr>\n",
  442. " <th>25</th>\n",
  443. " <td>-0.061261</td>\n",
  444. " <td>PHP</td>\n",
  445. " </tr>\n",
  446. " <tr>\n",
  447. " <th>37</th>\n",
  448. " <td>-0.062269</td>\n",
  449. " <td>Visual Basic 6</td>\n",
  450. " </tr>\n",
  451. " <tr>\n",
  452. " <th>16</th>\n",
  453. " <td>-0.066041</td>\n",
  454. " <td>Haskell</td>\n",
  455. " </tr>\n",
  456. " <tr>\n",
  457. " <th>17</th>\n",
  458. " <td>-0.093999</td>\n",
  459. " <td>Java</td>\n",
  460. " </tr>\n",
  461. " <tr>\n",
  462. " <th>35</th>\n",
  463. " <td>-0.095383</td>\n",
  464. " <td>VB.NET</td>\n",
  465. " </tr>\n",
  466. " <tr>\n",
  467. " <th>15</th>\n",
  468. " <td>-0.103195</td>\n",
  469. " <td>Hack</td>\n",
  470. " </tr>\n",
  471. " <tr>\n",
  472. " <th>7</th>\n",
  473. " <td>-0.176546</td>\n",
  474. " <td>Cobol</td>\n",
  475. " </tr>\n",
  476. " </tbody>\n",
  477. "</table>\n",
  478. "</div>"
  479. ],
  480. "text/plain": [
  481. " coef language\n",
  482. "1 0.117826 Bash/Shell\n",
  483. "29 0.088925 Ruby\n",
  484. "34 0.088187 TypeScript\n",
  485. "20 0.084747 Kotlin\n",
  486. "22 0.079638 Matlab\n",
  487. "13 0.078126 Groovy\n",
  488. "33 0.077379 Swift\n",
  489. "11 0.069695 F#\n",
  490. "27 0.053163 Python\n",
  491. "12 0.051383 Go\n",
  492. "30 0.045079 Rust\n",
  493. "0 0.042354 Assembly\n",
  494. "3 0.040939 C#\n",
  495. "4 0.039322 C++\n",
  496. "23 0.034584 Objective-C\n",
  497. "5 0.027556 CSS\n",
  498. "21 0.025346 Lua\n",
  499. "18 0.024344 JavaScript\n",
  500. "36 0.023078 VBA\n",
  501. "14 0.022850 HTML\n",
  502. "28 0.019698 R\n",
  503. "8 0.012216 CoffeeScript\n",
  504. "32 0.008682 Scala\n",
  505. "26 0.000065 Perl\n",
  506. "19 -0.004244 Julia\n",
  507. "31 -0.005304 SQL\n",
  508. "24 -0.022756 Ocaml\n",
  509. "6 -0.039730 Clojure\n",
  510. "9 -0.043176 Delphi/Object Pascal\n",
  511. "10 -0.052615 Erlang\n",
  512. "2 -0.053490 C\n",
  513. "25 -0.061261 PHP\n",
  514. "37 -0.062269 Visual Basic 6\n",
  515. "16 -0.066041 Haskell\n",
  516. "17 -0.093999 Java\n",
  517. "35 -0.095383 VB.NET\n",
  518. "15 -0.103195 Hack\n",
  519. "7 -0.176546 Cobol"
  520. ]
  521. },
  522. "execution_count": 16,
  523. "metadata": {},
  524. "output_type": "execute_result"
  525. }
  526. ],
  527. "source": [
  528. "result.sort_values(by='coef', ascending=False)"
  529. ]
  530. },
  531. {
  532. "cell_type": "code",
  533. "execution_count": null,
  534. "metadata": {},
  535. "outputs": [],
  536. "source": []
  537. }
  538. ],
  539. "metadata": {
  540. "kernelspec": {
  541. "display_name": "Python 3",
  542. "language": "python",
  543. "name": "python3"
  544. },
  545. "language_info": {
  546. "codemirror_mode": {
  547. "name": "ipython",
  548. "version": 3
  549. },
  550. "file_extension": ".py",
  551. "mimetype": "text/x-python",
  552. "name": "python",
  553. "nbconvert_exporter": "python",
  554. "pygments_lexer": "ipython3",
  555. "version": "3.6.2"
  556. }
  557. },
  558. "nbformat": 4,
  559. "nbformat_minor": 2
  560. }
Add Comment
Please, Sign In to add comment