Advertisement
Guest User

Untitled

a guest
Dec 19th, 2014
217
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 23.91 KB | None | 0 0
  1. {
  2. "metadata": {
  3. "name": ""
  4. },
  5. "nbformat": 3,
  6. "nbformat_minor": 0,
  7. "worksheets": [
  8. {
  9. "cells": [
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. " CSE 591 - Data Science-Fall 2014\n",
  15. " Team7 - GHOUL POOL PROJECT"
  16. ]
  17. },
  18. {
  19. "cell_type": "markdown",
  20. "metadata": {},
  21. "source": [
  22. "This IPython notebook does some minor cleaning on our dataset, such as removing null rows and random strings in numeric\n",
  23. "columns."
  24. ]
  25. },
  26. {
  27. "cell_type": "code",
  28. "collapsed": false,
  29. "input": [
  30. "import pandas as pd\n",
  31. "from pandas import Series, DataFrame\n",
  32. "import numpy as np\n",
  33. "import pylab as pl\n",
  34. "import numpy as np\n",
  35. "from sklearn import datasets, linear_model"
  36. ],
  37. "language": "python",
  38. "metadata": {},
  39. "outputs": [],
  40. "prompt_number": 1
  41. },
  42. {
  43. "cell_type": "code",
  44. "collapsed": false,
  45. "input": [
  46. "#Reading dead people data ('~'-delimited); error_bad_lines=False makes pandas skip malformed rows instead of raising\n",
  47. "df_data2 = pd.read_csv('../data/finally_cleaned_Sravs.csv', error_bad_lines=False, delimiter='~')"
  48. ],
  49. "language": "python",
  50. "metadata": {},
  51. "outputs": [],
  52. "prompt_number": 3
  53. },
  54. {
  55. "cell_type": "code",
  56. "collapsed": false,
  57. "input": [
  58. "df_data2.shape"
  59. ],
  60. "language": "python",
  61. "metadata": {},
  62. "outputs": [
  63. {
  64. "metadata": {},
  65. "output_type": "pyout",
  66. "prompt_number": 4,
  67. "text": [
  68. "(167481, 53)"
  69. ]
  70. }
  71. ],
  72. "prompt_number": 4
  73. },
  74. {
  75. "cell_type": "code",
  76. "collapsed": false,
  77. "input": [
  78. "#reading the male and female life tables and converting each to a plain array (indexed by position in the next cell)\n",
  79. "df_male=pd.read_csv('../data/male_lifeexp.csv', error_bad_lines=False)\n",
  80. "df_female=pd.read_csv('../data/female_lifeexp.csv', error_bad_lines=False)\n",
  81. "malemat=df_male.as_matrix(columns=None)\n",
  82. "femalemat=df_female.as_matrix(columns=None)\n",
  83. "#adding the derived columns, initialised to 0.0 for every row\n",
  84. "df_data2['LifeExpectancy'] = Series(0.0, index=df_data2.index)\n",
  85. "df_data2['Age'] = Series(0.0, index=df_data2.index)\n",
  86. "df_data2['Probability']=Series(0.0, index=df_data2.index)"
  87. ],
  88. "language": "python",
  89. "metadata": {},
  90. "outputs": [],
  91. "prompt_number": 6
  92. },
  93. {
  94. "cell_type": "code",
  95. "collapsed": false,
  96. "input": [
  97. "#calculating age, Life expectancy, Probability \n",
  98. "for index, row in df_data2.iterrows():\n",
  99. " dob=row['DOB']\n",
  100. " dod=row['DOD']\n",
  101. " if \"-\" in dob:\n",
  102. " dob1=(dob.split('-')[0])\n",
  103. " else :\n",
  104. " dob1=(dob)\n",
  105. " if \"-\" in dod:\n",
  106. " dod1=(dod.split('-')[0])\n",
  107. " else :\n",
  108. " dod1=(dod)\n",
  109. " if len(dob1) >0 and len(dod1)>0:\n",
  110. " age=int(dod1)-int(dob1)\n",
  111. " df_data2.loc[index,'Age']=int(dod1)-int(dob1)\n",
  112. " if row['/people/person/gender']=='Male' and age<120 and age>=0:\n",
  113. " df_data2.loc[index,'Probability']=malemat[age][1]\n",
  114. " df_data2.loc[index,'LifeExpectancy']=malemat[age][2]\n",
  115. " elif row['/people/person/gender']=='Female' and age<120 and age>=0:\n",
  116. " df_data2.loc[index,'Probability']=femalemat[age][1]\n",
  117. " df_data2.loc[index,'LifeExpectancy']=femalemat[age][2] \n",
  118. " else :\n",
  119. " df_data2.loc[index,'Probability']=None\n",
  120. " else :\n",
  121. " df_data2.loc[index,'Age']=None\n",
  122. " df_data2.loc[index,'Probability']=None\n",
  123. " df_data2.loc[index,'LifeExpectancy']=None\n",
  124. " \n"
  125. ],
  126. "language": "python",
  127. "metadata": {},
  128. "outputs": [
  129. {
  130. "ename": "TypeError",
  131. "evalue": "argument of type 'float' is not iterable",
  132. "output_type": "pyerr",
  133. "traceback": [
  134. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
  135. "\u001b[0;32m<ipython-input-7-0f869b5b0294>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdob\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'DOB'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'DOD'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"-\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdob\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mdob1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  136. "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
  137. ]
  138. }
  139. ],
  140. "prompt_number": 7
  141. },
  142. {
  143. "cell_type": "code",
  144. "collapsed": false,
  145. "input": [
  146. "#saving the data file with fields Age, Life expectancy, Probability\n",
  147. "df_data2.to_csv(path_or_buf=\"../data/final_fields.csv\",sep=\"~\",encoding='utf-8')"
  148. ],
  149. "language": "python",
  150. "metadata": {},
  151. "outputs": [],
  152. "prompt_number": 12
  153. },
  154. {
  155. "cell_type": "code",
  156. "collapsed": false,
  157. "input": [
  158. "#function to clean profession\n",
  159. "def find_between( s, first, last ):\n",
  160. " try:\n",
  161. " start = s.index( first ) + len( first )\n",
  162. " end = s.index( last, start )\n",
  163. " return s[start:end]\n",
  164. " except ValueError:\n",
  165. " return \"\""
  166. ],
  167. "language": "python",
  168. "metadata": {},
  169. "outputs": [],
  170. "prompt_number": 13
  171. },
  172. {
  173. "cell_type": "code",
  174. "collapsed": false,
  175. "input": [
  176. "#cleaning profession, gender\n",
  177. "df_data2['profession']=Series(None, index=df_data2.index)\n",
  178. "for index, row in df_data2.iterrows():\n",
  179. " if \"'\" in row['/people/person/profession']:\n",
  180. " df_data2.loc[index,'profession']=find_between(row['/people/person/profession'],\"'\",\"'\")\n",
  181. " if row['/people/person/gender']=='Male':\n",
  182. " df_data2.loc[index,'gender']=0\n",
  183. " if row['/people/person/gender']=='Female':\n",
  184. " df_data2.loc[index,'gender']=1\n",
  185. "df_data2 =df_data2[df_data2.profession != 'None']\n",
  186. "print df_data2.shape\n",
  187. " "
  188. ],
  189. "language": "python",
  190. "metadata": {},
  191. "outputs": [
  192. {
  193. "ename": "TypeError",
  194. "evalue": "argument of type 'float' is not iterable",
  195. "output_type": "pyerr",
  196. "traceback": [
  197. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
  198. "\u001b[0;32m<ipython-input-14-dd4578ba0ddf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"'\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_between\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/gender'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m'Male'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  199. "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
  200. ]
  201. }
  202. ],
  203. "prompt_number": 14
  204. },
  205. {
  206. "cell_type": "code",
  207. "collapsed": false,
  208. "input": [
  209. "df_data2.to_csv(path_or_buf=\"../data/final_fields_prof.csv\",sep=\"~\",encoding='utf-8')\n"
  210. ],
  211. "language": "python",
  212. "metadata": {},
  213. "outputs": [],
  214. "prompt_number": 16
  215. },
  216. {
  217. "cell_type": "code",
  218. "collapsed": false,
  219. "input": [
  220. "#building a lookup dict from the rows of the country CSV (presumably country name -> integer code; verify against the file)\n",
  221. "df_country= pd.read_csv('../data/country_freebase.csv', error_bad_lines=False)#index_col=True)\n",
  222. "country_dict = dict(df_country.values)"
  223. ],
  224. "language": "python",
  225. "metadata": {},
  226. "outputs": [
  227. {
  228. "output_type": "stream",
  229. "stream": "stdout",
  230. "text": [
  231. "135\n"
  232. ]
  233. },
  234. {
  235. "output_type": "stream",
  236. "stream": "stderr",
  237. "text": [
  238. "Skipping line 418: expected 2 fields, saw 3\n",
  239. "Skipping line 538: expected 2 fields, saw 3\n",
  240. "Skipping line 557: expected 2 fields, saw 3\n",
  241. "Skipping line 696: expected 2 fields, saw 3\n",
  242. "\n"
  243. ]
  244. }
  245. ],
  246. "prompt_number": 40
  247. },
  248. {
  249. "cell_type": "code",
  250. "collapsed": false,
  251. "input": [
  252. "#Adding nationality and number\n",
  253. "df_data2['nationality']=Series(None, index=df_data2.index)\n",
  254. "df_data2['nationality_no']=Series(None, index=df_data2.index)\n",
  255. "for index, row in df_data2.iterrows():\n",
  256. " if \"'\" in row['/people/person/nationality']:\n",
  257. " temp=find_between(row['/people/person/nationality'],\"'\",\"'\")\n",
  258. " df_data2.loc[index,'nationality']=temp\n",
  259. " if temp in country_dict:\n",
  260. " df_data2.loc[index,'nationality_no']=country_dict[temp]\n",
  261. " else:\n",
  262. " df_data2.loc[index,'nationality_no']=None\n",
  263. " else:\n",
  264. " df_data2.loc[index,'nationality']=None\n",
  265. " df_data2.loc[index,'nationality_no']=None\n",
  266. " \n"
  267. ],
  268. "language": "python",
  269. "metadata": {},
  270. "outputs": [
  271. {
  272. "ename": "TypeError",
  273. "evalue": "argument of type 'float' is not iterable",
  274. "output_type": "pyerr",
  275. "traceback": [
  276. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
  277. "\u001b[0;32m<ipython-input-41-0e51b5c2e8da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'nationality_no'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"'\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_between\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  278. "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
  279. ]
  280. }
  281. ],
  282. "prompt_number": 41
  283. },
  284. {
  285. "cell_type": "code",
  286. "collapsed": false,
  287. "input": [
  288. "#saving final file\n",
  289. "df_data2.to_csv(path_or_buf=\"../data/final_fields_prof_nationality.csv\",sep=\"~\",encoding='utf-8')"
  290. ],
  291. "language": "python",
  292. "metadata": {},
  293. "outputs": [],
  294. "prompt_number": 42
  295. },
  296. {
  297. "cell_type": "code",
  298. "collapsed": false,
  299. "input": [
  300. "#treating 'unknown'/'none' as missing, then filling missing values with each column's median\n",
  301. "df_data2 = df_data2.replace('unknown', np.nan)\n",
  302. "df_data2 = df_data2.replace('none', np.nan)\n",
  303. "df_data2=df_data2.fillna(df_data2.median())"
  304. ],
  305. "language": "python",
  306. "metadata": {},
  307. "outputs": [],
  308. "prompt_number": 46
  309. },
  310. {
  311. "cell_type": "code",
  312. "collapsed": false,
  313. "input": [
  314. "#extracting fields for regression model\n",
  315. "df_data_X=pd.concat([df_data2['Age'], df_data2['Probability'], df_data2['gender'],df_data2['nationality_no']],axis=1)\n",
  316. "df_data_Y=pd.concat([df_data2['LifeExpectancy']],axis=1)"
  317. ],
  318. "language": "python",
  319. "metadata": {},
  320. "outputs": [],
  321. "prompt_number": 48
  322. },
  323. {
  324. "cell_type": "code",
  325. "collapsed": false,
  326. "input": [
  327. "#training the model\n",
  328. "X_train=np.array(df_data_X)\n",
  329. "Y_train=np.array(df_data_Y)\n",
  330. "regr = linear_model.LinearRegression()\n",
  331. "regr.fit(X_train, Y_train)"
  332. ],
  333. "language": "python",
  334. "metadata": {},
  335. "outputs": [
  336. {
  337. "metadata": {},
  338. "output_type": "pyout",
  339. "prompt_number": 49,
  340. "text": [
  341. "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
  342. ]
  343. }
  344. ],
  345. "prompt_number": 49
  346. },
  347. {
  348. "cell_type": "code",
  349. "collapsed": false,
  350. "input": [
  351. "#10 fold cross validation\n",
  352. "from sklearn import cross_validation\n",
  353. "skf = cross_validation.StratifiedKFold(df_data_Y, n_folds=10)\n",
  354. "print df_data_X.shape\n",
  355. "print df_data_Y.shape"
  356. ],
  357. "language": "python",
  358. "metadata": {},
  359. "outputs": [
  360. {
  361. "output_type": "stream",
  362. "stream": "stdout",
  363. "text": [
  364. "(167481, 4)\n",
  365. "(167481, 1)\n"
  366. ]
  367. }
  368. ],
  369. "prompt_number": 55
  370. },
  371. {
  372. "cell_type": "code",
  373. "collapsed": false,
  374. "input": [
  375. "\n",
  376. "from sklearn.cross_validation import KFold\n",
  377. "import sklearn as sk\n",
  378. "kf = KFold(len(df_data_Y), 2, indices=False)\n",
  379. "sk.cross_validation.KFold(n=167481, k=2)\n",
  380. "for train, test in kf:\n",
  381. " print train, test\n"
  382. ],
  383. "language": "python",
  384. "metadata": {},
  385. "outputs": [
  386. {
  387. "output_type": "stream",
  388. "stream": "stdout",
  389. "text": [
  390. "[False False False ..., True True True] [ True True True ..., False False False]\n",
  391. "[ True True True ..., False False False] [False False False ..., True True True]\n"
  392. ]
  393. },
  394. {
  395. "output_type": "stream",
  396. "stream": "stderr",
  397. "text": [
  398. "/usr/lib/python2.7/dist-packages/sklearn/cross_validation.py:240: DeprecationWarning: The parameter k was renamed to n_folds and will be removed in 0.15.\n",
  399. " \" removed in 0.15.\", DeprecationWarning)\n"
  400. ]
  401. }
  402. ],
  403. "prompt_number": 62
  404. },
  405. {
  406. "cell_type": "code",
  407. "collapsed": false,
  408. "input": [
  409. "count=0.0\n",
  410. "for train_index, test_index in kf:\n",
  411. " X_train, X_test, y_train, y_test = df_data_X[train], df_data_X[test], df_data_Y[train], df_data_Y[test]\n",
  412. " regr = linear_model.LinearRegression()\n",
  413. " regr.fit(X_train, y_train)\n",
  414. " count+=np.mean((regr.predict(X_test) - y_test) ** 2)\n"
  415. ],
  416. "language": "python",
  417. "metadata": {},
  418. "outputs": [
  419. {
  420. "output_type": "stream",
  421. "stream": "stderr",
  422. "text": [
  423. "/usr/lib/python2.7/dist-packages/numexpr/necompiler.py:742: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility\n",
  424. " return compiled_ex(*arguments, **kwargs)\n",
  425. "/usr/lib/python2.7/dist-packages/numexpr/necompiler.py:742: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility\n",
  426. " return compiled_ex(*arguments, **kwargs)\n"
  427. ]
  428. }
  429. ],
  430. "prompt_number": 67
  431. },
  432. {
  433. "cell_type": "code",
  434. "collapsed": false,
  435. "input": [
  436. "#mean square error\n",
  437. "print count/10"
  438. ],
  439. "language": "python",
  440. "metadata": {},
  441. "outputs": [
  442. {
  443. "output_type": "stream",
  444. "stream": "stdout",
  445. "text": [
  446. "LifeExpectancy 21.317094\n",
  447. "dtype: float64\n"
  448. ]
  449. }
  450. ],
  451. "prompt_number": 68
  452. },
  453. {
  454. "cell_type": "code",
  455. "collapsed": false,
  456. "input": [
  457. "df_data2 = pd.read_csv('../data/freebase_cleaned_data.csv', error_bad_lines=False, delimiter='~')\n",
  458. "df_prof=pd.read_csv('../data/celebrities_32_final.csv', error_bad_lines=False)"
  459. ],
  460. "language": "python",
  461. "metadata": {},
  462. "outputs": [],
  463. "prompt_number": 5
  464. },
  465. {
  466. "cell_type": "code",
  467. "collapsed": false,
  468. "input": [
  469. "def find_between( s, first, last ):\n",
  470. " try:\n",
  471. " start = s.index( first ) + len( first )\n",
  472. " end = s.index( last, start )\n",
  473. " return s[start:end]\n",
  474. " except ValueError:\n",
  475. " return \"\""
  476. ],
  477. "language": "python",
  478. "metadata": {},
  479. "outputs": [],
  480. "prompt_number": 6
  481. },
  482. {
  483. "cell_type": "code",
  484. "collapsed": false,
  485. "input": [
  486. "#cleaning profession, gender\n",
  487. "df_data2['Politician']=Series(None, index=df_data2.index)\n",
  488. "df_data2['Pope-elect']=Series(None, index=df_data2.index)\n",
  489. "df_data2['Diplomat']=Series(None, index=df_data2.index)\n",
  490. "df_data2['Evangelist']=Series(None, index=df_data2.index)\n",
  491. "df_data2['Business magnate']=Series(None, index=df_data2.index)\n",
  492. "df_data2['Astronaut']=Series(None, index=df_data2.index)\n",
  493. "df_data2['Musician']=Series(None, index=df_data2.index)\n",
  494. "df_data2['Physicist']=Series(None, index=df_data2.index)\n",
  495. "df_data2['Actress']=Series(None, index=df_data2.index)\n",
  496. "df_data2['Actor']=Series(None, index=df_data2.index)\n",
  497. "df_data2['Guitarist']=Series(None, index=df_data2.index)\n",
  498. "df_data2['Singer']=Series(None, index=df_data2.index)\n",
  499. "df_data2['Pianist']=Series(None, index=df_data2.index)\n",
  500. "#x=['Politician','Pope-elect','Diplomat','Evangelist','Business magnate','Astronaut','Musician','Physicist','Actress',\n",
  501. "#'Actor','Guitarist','Singer','Pianist']\n"
  502. ],
  503. "language": "python",
  504. "metadata": {},
  505. "outputs": [],
  506. "prompt_number": 7
  507. },
  508. {
  509. "cell_type": "code",
  510. "collapsed": false,
  511. "input": [
  512. "for index, row in df_data2.iterrows():\n",
  513. " if \"Politician\" in str(row['/people/person/profession']):\n",
  514. " df_data2.loc[index,'Politician']=1\n",
  515. " else:\n",
  516. " df_data2.loc[index,'Politician']=0\n",
  517. " if \"Pope-elect\" in str(row['/people/person/profession']):\n",
  518. " df_data2.loc[index,'Pope-elect']=1\n",
  519. " else:\n",
  520. " df_data2.loc[index,'Pope-elect']=0\n",
  521. " if \"Diplomat\" in str(row['/people/person/profession']):\n",
  522. " df_data2.loc[index,'Diplomat']=1\n",
  523. " else:\n",
  524. " df_data2.loc[index,'Diplomat']=0\n",
  525. " if \"Evangelist\" in str(row['/people/person/profession']):\n",
  526. " df_data2.loc[index,'Evangelist']=1\n",
  527. " else:\n",
  528. " df_data2.loc[index,'Evangelist']=0\n",
  529. " if \"Business magnate\" in str(row['/people/person/profession']):\n",
  530. " df_data2.loc[index,'Business magnate']=1\n",
  531. " else:\n",
  532. " df_data2.loc[index,'Business magnate']=0\n",
  533. " if \"Astronaut\" in str(row['/people/person/profession']):\n",
  534. " df_data2.loc[index,'Astronaut']=1\n",
  535. " else:\n",
  536. " df_data2.loc[index,'Astronaut']=0\n",
  537. " if \"Musician\" in str(row['/people/person/profession']):\n",
  538. " df_data2.loc[index,'Musician']=1\n",
  539. " else:\n",
  540. " df_data2.loc[index,'Musician']=0\n",
  541. " if \"Physicist\" in str(row['/people/person/profession']):\n",
  542. " df_data2.loc[index,'Physicist']=1\n",
  543. " else:\n",
  544. " df_data2.loc[index,'Physicist']=0\n",
  545. " if \"Actress\" in str(row['/people/person/profession']):\n",
  546. " df_data2.loc[index,'Actress']=1\n",
  547. " else:\n",
  548. " df_data2.loc[index,'Actress']=0\n",
  549. " if \"Actor\" in str(row['/people/person/profession']):\n",
  550. " df_data2.loc[index,'Actor']=1\n",
  551. " else:\n",
  552. " df_data2.loc[index,'Actor']=0\n",
  553. " if \"Guitarist\" in str(row['/people/person/profession']):\n",
  554. " df_data2.loc[index,'Guitarist']=1\n",
  555. " else:\n",
  556. " df_data2.loc[index,'Guitarist']=0\n",
  557. " if \"Singer\" in str(row['/people/person/profession']):\n",
  558. " df_data2.loc[index,'Singer']=1\n",
  559. " else:\n",
  560. " df_data2.loc[index,'Singer']=0\n",
  561. " if \"Pianist\" in str(row['/people/person/profession']):\n",
  562. " df_data2.loc[index,'Pianist']=1\n",
  563. " else:\n",
  564. " df_data2.loc[index,'Pianist']=0"
  565. ],
  566. "language": "python",
  567. "metadata": {},
  568. "outputs": [],
  569. "prompt_number": 8
  570. },
  571. {
  572. "cell_type": "code",
  573. "collapsed": false,
  574. "input": [
  575. "df_data2.to_csv(path_or_buf=\"../data/finaldata_profession_corrected.csv\",sep=\"~\",encoding='utf-8')"
  576. ],
  577. "language": "python",
  578. "metadata": {},
  579. "outputs": [],
  580. "prompt_number": 9
  581. },
  582. {
  583. "cell_type": "code",
  584. "collapsed": false,
  585. "input": [],
  586. "language": "python",
  587. "metadata": {},
  588. "outputs": []
  589. }
  590. ],
  591. "metadata": {}
  592. }
  593. ]
  594. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement