Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "metadata": {
- "name": ""
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- " CSE 591 - Data Science-Fall 2014\n",
- " Team7 - GHOUL POOL PROJECT"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This IPython notebook performs some minor cleaning of our dataset, such as removing null rows and random strings in numeric\n",
- "columns."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "import pandas as pd\n",
- "from pandas import Series, DataFrame\n",
- "import numpy as np\n",
- "import pylab as pl\n",
- "import numpy as np\n",
- "from sklearn import datasets, linear_model"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 1
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#Reading Dead people data \n",
- "df_data2 = pd.read_csv('../data/finally_cleaned_Sravs.csv', error_bad_lines=False, delimiter='~')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 3
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df_data2.shape"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 4,
- "text": [
- "(167481, 53)"
- ]
- }
- ],
- "prompt_number": 4
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#reading life table data\n",
- "df_male=pd.read_csv('../data/male_lifeexp.csv', error_bad_lines=False)\n",
- "df_female=pd.read_csv('../data/female_lifeexp.csv', error_bad_lines=False)\n",
- "malemat=df_male.as_matrix(columns=None)\n",
- "femalemat=df_female.as_matrix(columns=None)\n",
- "#adding columns\n",
- "df_data2['LifeExpectancy'] = Series(0.0, index=df_data2.index)\n",
- "df_data2['Age'] = Series(0.0, index=df_data2.index)\n",
- "df_data2['Probability']=Series(0.0, index=df_data2.index)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 6
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#calculating age, Life expectancy, Probability\n",
- "def year_of(value):\n",
- "    \"\"\"Return the leading year of a 'YYYY' or 'YYYY-MM-DD' value as an int.\n",
- "\n",
- "    Returns None when the value is missing (empty CSV cells are read as float\n",
- "    NaN, which does not support the `in` operator) or when the part before the\n",
- "    first '-' is not a number.\n",
- "    \"\"\"\n",
- "    if pd.isnull(value):\n",
- "        return None\n",
- "    try:\n",
- "        return int(float(str(value).split('-')[0]))\n",
- "    except (ValueError, OverflowError):\n",
- "        return None\n",
- "\n",
- "# Life tables are indexed by age; columns 1 and 2 are read as the probability and\n",
- "# life-expectancy values (presumably matching the CSV layout - TODO confirm).\n",
- "life_tables = {'Male': malemat, 'Female': femalemat}\n",
- "ages, probabilities, expectancies = [], [], []\n",
- "for dob, dod, gender in zip(df_data2['DOB'], df_data2['DOD'], df_data2['/people/person/gender']):\n",
- "    born = year_of(dob)\n",
- "    died = year_of(dod)\n",
- "    # Anything that cannot be computed stays missing (None -> NaN) rather than 0.0,\n",
- "    # so placeholder zeros never reach the regression as real values.\n",
- "    age = probability = expectancy = None\n",
- "    if born is not None and died is not None:\n",
- "        age = died - born\n",
- "        table = life_tables.get(gender)\n",
- "        # Only ages covered by the table (and below 120) get a lookup.\n",
- "        if table is not None and 0 <= age < min(120, len(table)):\n",
- "            probability = table[age][1]\n",
- "            expectancy = table[age][2]\n",
- "    ages.append(age)\n",
- "    probabilities.append(probability)\n",
- "    expectancies.append(expectancy)\n",
- "# Assign whole columns at once instead of writing three cells per row with .loc.\n",
- "df_data2['Age'] = Series(ages, index=df_data2.index)\n",
- "df_data2['Probability'] = Series(probabilities, index=df_data2.index)\n",
- "df_data2['LifeExpectancy'] = Series(expectancies, index=df_data2.index)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "ename": "TypeError",
- "evalue": "argument of type 'float' is not iterable",
- "output_type": "pyerr",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-7-0f869b5b0294>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdob\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'DOB'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'DOD'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"-\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdob\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mdob1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
- ]
- }
- ],
- "prompt_number": 7
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#saving the data file with fields Age, Life expectancy, Probability\n",
- "df_data2.to_csv(path_or_buf=\"../data/final_fields.csv\",sep=\"~\",encoding='utf-8')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 12
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#function to clean profession\n",
- "def find_between( s, first, last ):\n",
- "    \"\"\"Return the part of s between the first `first` and the next `last`.\n",
- "\n",
- "    Returns \"\" when either marker is missing, or when s is not a string\n",
- "    (empty CSV cells are read by pandas as float NaN, which has no .index).\n",
- "    \"\"\"\n",
- "    try:\n",
- "        start = s.index( first ) + len( first )\n",
- "        end = s.index( last, start )\n",
- "        return s[start:end]\n",
- "    except (ValueError, AttributeError):\n",
- "        return \"\""
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 13
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#cleaning profession, gender\n",
- "def first_quoted(value):\n",
- "    \"\"\"Return the first single-quoted item in value, or None when there is none.\n",
- "\n",
- "    Empty CSV cells are read as float NaN, which does not support `in`; they are\n",
- "    treated like text without any quoted item instead of raising TypeError.\n",
- "    \"\"\"\n",
- "    try:\n",
- "        has_quote = \"'\" in value\n",
- "    except TypeError:\n",
- "        return None\n",
- "    return find_between(value, \"'\", \"'\") if has_quote else None\n",
- "\n",
- "df_data2['profession'] = Series([first_quoted(value) for value in df_data2['/people/person/profession']],\n",
- "                                index=df_data2.index)\n",
- "# Male -> 0, Female -> 1; any other value is left missing (NaN).\n",
- "df_data2['gender'] = df_data2['/people/person/gender'].map({'Male': 0, 'Female': 1})\n",
- "# NOTE(review): this only drops rows whose extracted profession is the literal text\n",
- "# 'None'; rows with a missing profession are kept - confirm that is intended.\n",
- "df_data2 =df_data2[df_data2.profession != 'None']\n",
- "print(df_data2.shape)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "ename": "TypeError",
- "evalue": "argument of type 'float' is not iterable",
- "output_type": "pyerr",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-14-dd4578ba0ddf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"'\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_between\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/profession'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/gender'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m'Male'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
- ]
- }
- ],
- "prompt_number": 14
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df_data2.to_csv(path_or_buf=\"../data/final_fields_prof.csv\",sep=\"~\",encoding='utf-8')\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 16
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#converting country to integer\n",
- "df_country= pd.read_csv('../data/country_freebase.csv', error_bad_lines=False)#index_col=True)\n",
- "country_dict = dict(df_country.values)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "135\n"
- ]
- },
- {
- "output_type": "stream",
- "stream": "stderr",
- "text": [
- "Skipping line 418: expected 2 fields, saw 3\n",
- "Skipping line 538: expected 2 fields, saw 3\n",
- "Skipping line 557: expected 2 fields, saw 3\n",
- "Skipping line 696: expected 2 fields, saw 3\n",
- "\n"
- ]
- }
- ],
- "prompt_number": 40
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#Adding nationality and number\n",
- "nationalities = []\n",
- "nationality_numbers = []\n",
- "for raw in df_data2['/people/person/nationality']:\n",
- "    # Empty CSV cells are float NaN, which does not support `in`; treat them like\n",
- "    # text without a quoted country instead of raising TypeError.\n",
- "    try:\n",
- "        has_quote = \"'\" in raw\n",
- "    except TypeError:\n",
- "        has_quote = False\n",
- "    country = find_between(raw, \"'\", \"'\") if has_quote else None\n",
- "    nationalities.append(country)\n",
- "    # Countries that are not keys of country_dict get no number (None -> NaN).\n",
- "    nationality_numbers.append(country_dict.get(country))\n",
- "# Assign whole columns at once instead of writing two cells per row with .loc.\n",
- "df_data2['nationality'] = Series(nationalities, index=df_data2.index)\n",
- "df_data2['nationality_no'] = Series(nationality_numbers, index=df_data2.index)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "ename": "TypeError",
- "evalue": "argument of type 'float' is not iterable",
- "output_type": "pyerr",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-41-0e51b5c2e8da>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'nationality_no'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0;34m\"'\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_between\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/people/person/nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"'\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_data2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'nationality'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mTypeError\u001b[0m: argument of type 'float' is not iterable"
- ]
- }
- ],
- "prompt_number": 41
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#saving final file\n",
- "df_data2.to_csv(path_or_buf=\"../data/final_fields_prof_nationality.csv\",sep=\"~\",encoding='utf-8')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 42
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#removing NaN values\n",
- "df_data2 = df_data2.replace('unknown', np.nan)\n",
- "df_data2 = df_data2.replace('none', np.nan)\n",
- "df_data2=df_data2.fillna(df_data2.median())"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 46
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#extracting fields for regression model\n",
- "df_data_X=pd.concat([df_data2['Age'], df_data2['Probability'], df_data2['gender'],df_data2['nationality_no']],axis=1)\n",
- "df_data_Y=pd.concat([df_data2['LifeExpectancy']],axis=1)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 48
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#training the model\n",
- "X_train=np.array(df_data_X)\n",
- "Y_train=np.array(df_data_Y)\n",
- "regr = linear_model.LinearRegression()\n",
- "regr.fit(X_train, Y_train)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 49,
- "text": [
- "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
- ]
- }
- ],
- "prompt_number": 49
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#10 fold cross validation\n",
- "from sklearn import cross_validation\n",
- "skf = cross_validation.StratifiedKFold(df_data_Y, n_folds=10)\n",
- "print df_data_X.shape\n",
- "print df_data_Y.shape"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "(167481, 4)\n",
- "(167481, 1)\n"
- ]
- }
- ],
- "prompt_number": 55
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "\n",
- "from sklearn.cross_validation import KFold\n",
- "import sklearn as sk\n",
- "# Two folds, delivered as boolean row masks (indices=False). `n_folds` is the\n",
- "# current name of the keyword that used to be called `k`.\n",
- "# NOTE(review): an earlier cell is labelled 10-fold cross validation but only 2\n",
- "# folds are built here - confirm the intended number of folds.\n",
- "kf = KFold(len(df_data_Y), n_folds=2, indices=False)\n",
- "for train, test in kf:\n",
- "    print train, test\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "[False False False ..., True True True] [ True True True ..., False False False]\n",
- "[ True True True ..., False False False] [False False False ..., True True True]\n"
- ]
- },
- {
- "output_type": "stream",
- "stream": "stderr",
- "text": [
- "/usr/lib/python2.7/dist-packages/sklearn/cross_validation.py:240: DeprecationWarning: The parameter k was renamed to n_folds and will be removed in 0.15.\n",
- " \" removed in 0.15.\", DeprecationWarning)\n"
- ]
- }
- ],
- "prompt_number": 62
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "count=0.0\n",
- "for train_index, test_index in kf:\n",
- "    # Index with this fold's own masks (not the `train`/`test` names leaked by an\n",
- "    # earlier cell), so each fold is fitted and scored on its own split.\n",
- "    X_train, X_test = df_data_X[train_index], df_data_X[test_index]\n",
- "    y_train, y_test = df_data_Y[train_index], df_data_Y[test_index]\n",
- "    regr = linear_model.LinearRegression()\n",
- "    regr.fit(X_train, y_train)\n",
- "    # Accumulate each fold's mean squared error; the average is taken afterwards.\n",
- "    count+=np.mean((regr.predict(X_test) - y_test) ** 2)\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stderr",
- "text": [
- "/usr/lib/python2.7/dist-packages/numexpr/necompiler.py:742: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility\n",
- " return compiled_ex(*arguments, **kwargs)\n",
- "/usr/lib/python2.7/dist-packages/numexpr/necompiler.py:742: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility\n",
- " return compiled_ex(*arguments, **kwargs)\n"
- ]
- }
- ],
- "prompt_number": 67
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#mean square error, averaged over the number of folds in kf (not a fixed 10)\n",
- "print(count/len(kf))"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "LifeExpectancy 21.317094\n",
- "dtype: float64\n"
- ]
- }
- ],
- "prompt_number": 68
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df_data2 = pd.read_csv('../data/freebase_cleaned_data.csv', error_bad_lines=False, delimiter='~')\n",
- "df_prof=pd.read_csv('../data/celebrities_32_final.csv', error_bad_lines=False)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 5
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "def find_between( s, first, last ):\n",
- "    \"\"\"Return the part of s between the first `first` and the next `last`.\n",
- "\n",
- "    Returns \"\" when either marker is missing, or when s is not a string\n",
- "    (empty CSV cells are read by pandas as float NaN, which has no .index).\n",
- "    \"\"\"\n",
- "    try:\n",
- "        start = s.index( first ) + len( first )\n",
- "        end = s.index( last, start )\n",
- "        return s[start:end]\n",
- "    except (ValueError, AttributeError):\n",
- "        return \"\""
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 6
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#adding one empty indicator column per profession of interest\n",
- "profession_columns = ['Politician', 'Pope-elect', 'Diplomat', 'Evangelist', 'Business magnate',\n",
- "                      'Astronaut', 'Musician', 'Physicist', 'Actress', 'Actor', 'Guitarist',\n",
- "                      'Singer', 'Pianist']\n",
- "for column in profession_columns:\n",
- "    df_data2[column] = Series(None, index=df_data2.index)\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 7
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# One 0/1 indicator column per profession: 1 when the name appears anywhere in the\n",
- "# raw profession text, else 0. Columns are built whole instead of writing 13 cells\n",
- "# per row with .loc.\n",
- "profession_names = ['Politician', 'Pope-elect', 'Diplomat', 'Evangelist', 'Business magnate',\n",
- "                    'Astronaut', 'Musician', 'Physicist', 'Actress', 'Actor', 'Guitarist',\n",
- "                    'Singer', 'Pianist']\n",
- "# str() turns missing cells (float NaN) into 'nan', which contains none of the names.\n",
- "profession_text = [str(value) for value in df_data2['/people/person/profession']]\n",
- "for name in profession_names:\n",
- "    df_data2[name] = [1 if name in text else 0 for text in profession_text]"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 8
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df_data2.to_csv(path_or_buf=\"../data/finaldata_profession_corrected.csv\",sep=\"~\",encoding='utf-8')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 9
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- }
- ],
- "metadata": {}
- }
- ]
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement