Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 用spark进行数据挖掘\n",
- "\n",
- "- 本例使用spark的python接口,对titanic数据做了一个完整的尝试\n",
- "- 首先用算质数的例子显示,即使在单机中,spark利用了多核处理能提高计算效率\n",
- "- 之后读入数据集,并对数据进行预处理\n",
- " - 步骤1:对名字进行了处理,用正则取出四种常见title\n",
- " - 步骤2:基于title,对年龄进行了缺失值处理\n",
- " - 步骤3:将类别变量均转为0-1变量\n",
- "- 数据合并整理成spark.mllib需要的格式\n",
- "- 使用线性模型建模,并得出错误率\n",
- "- 本例代码参考了《machine learning with spark》一书"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from pyspark import SparkContext\n",
- "sc = SparkContext( 'local[4]')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "- 算质数的例子"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "def isprime(n):\n",
- "    \"\"\"Return True when ``abs(int(n))`` is a prime number.\n",
- "\n",
- "    The argument is coerced with ``int`` and its sign is dropped, so\n",
- "    ``isprime(-7)`` and ``isprime(7.9)`` both test the value 7.\n",
- "    \"\"\"\n",
- "    # work on the magnitude of the integer part\n",
- "    n = abs(int(n))\n",
- "    # 0 and 1 are not primes\n",
- "    if n < 2:\n",
- "        return False\n",
- "    # 2 is the only even prime number\n",
- "    if n == 2:\n",
- "        return True\n",
- "    # all other even numbers are not primes\n",
- "    if not n & 1:\n",
- "        return False\n",
- "    # only odd divisors up to the square root of n need to be tried\n",
- "    for x in range(3, int(n**0.5)+1, 2):\n",
- "        if n % x == 0:\n",
- "            return False\n",
- "    return True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "78498\n",
- "78498\n",
- "78498\n",
- "78498\n",
- "1 loops, best of 3: 4.81 s per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "import numpy as np\n",
- "nums = xrange(1000000)\n",
- "print np.sum([1 for x in nums if isprime(x)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false,
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "78498\n",
- "78498\n",
- "78498\n",
- "78498\n",
- "1 loops, best of 3: 2.71 s per loop\n"
- ]
- }
- ],
- "source": [
- "%%timeit\n",
- "nums = sc.parallelize(xrange(1000000))\n",
- "print nums.filter(isprime).count()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "- titanic例子,先读入变量名"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "vname = !head -1 titanic.csv\n",
- "vname = vname[0].split(',')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Write a header-less copy of the CSV so that every line read below is a data row.\n",
- "# Regenerating it here keeps the notebook runnable from a fresh checkout.\n",
- "!sed 1d titanic.csv > titanic_noheader.csv\n",
- "raw = sc.textFile('titanic_noheader.csv')\n",
- "raw.first()  # raw data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "- 数据预处理"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Pull the quoted passenger name out of a raw CSV line (used to derive the title)\n",
- "def extract_name(x):\n",
- "    \"\"\"Return the text between the first and last double quote of line ``x``.\n",
- "\n",
- "    Raises AttributeError when the line contains no double-quoted field.\n",
- "    \"\"\"\n",
- "    import re\n",
- "    return re.search(\"\\\"(.*)\\\"\", x).group(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[u'Braund, Mr. Owen Harris',\n",
- " u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n",
- " u'Heikkinen, Miss. Laina',\n",
- " u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "names = raw.map(extract_name)\n",
- "names.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import re\n",
- "title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(u'Mr', 517),\n",
- " (u'Miss', 182),\n",
- " (u'Mrs', 125),\n",
- " (u'Master', 40),\n",
- " (u'Dr', 7),\n",
- " (u'Rev', 6),\n",
- " (u'Major', 2),\n",
- " (u'Mlle', 2),\n",
- " (u'Col', 2),\n",
- " (u'Sir', 1),\n",
- " (u'the Countess', 1),\n",
- " (u'Don', 1),\n",
- " (u'Capt', 1),\n",
- " (u'Lady', 1),\n",
- " (u'Jonkheer', 1),\n",
- " (u'Ms', 1),\n",
- " (u'Mme', 1)]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[u'Mr', u'Miss', u'Mrs', u'Master']"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n",
- "top_title"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def assign_title(x):\n",
- "    \"\"\"Return title ``x`` if it is listed in ``top_title``, otherwise u'other'.\"\"\"\n",
- "    if x in top_title:\n",
- "        return x\n",
- "    return u'other'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[u'Mr', u'Mrs', u'Miss', u'Mrs']"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "title_less = title.map(assign_title)\n",
- "title_less.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Split the remaining (non-name) columns\n",
- "def split_rest(x):\n",
- "    \"\"\"Remove the double-quoted name field (and the comma after it) from\n",
- "    line ``x`` and return the remaining comma-separated fields as a list.\n",
- "    \"\"\"\n",
- "    import re\n",
- "    rec = re.sub(\"\\\"(.*)\\\",\", '', x)\n",
- "    return rec.split(',')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df = raw.map(split_rest)\n",
- "df.first()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Drop the 'name' label so vname lines up with the fields of df.\n",
- "# Rebuilding the list (instead of vname.remove) keeps this cell safe to re-run.\n",
- "vname = [column for column in vname if column != 'name']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0th variable:survived distinct value: 2\n",
- "1th variable:pclass distinct value: 3\n",
- "2th variable:sex distinct value: 2\n",
- "3th variable:age distinct value: 89\n",
- "4th variable:sibsp distinct value: 7\n",
- "5th variable:parch distinct value: 7\n",
- "6th variable:ticket distinct value: 681\n",
- "7th variable:fare distinct value: 248\n",
- "8th variable:cabin distinct value: 148\n",
- "9th variable:embarked distinct value: 4\n"
- ]
- }
- ],
- "source": [
- "# Number of distinct values in each column\n",
- "m = len(df.first())\n",
- "for i in range(m):\n",
- "    print('%dth variable:%s distinct value: %s' % (i, vname[i], df.map(lambda row: row[i]).distinct().count()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0th variable:survived miss value: 0\n",
- "1th variable:pclass miss value: 0\n",
- "2th variable:sex miss value: 0\n",
- "3th variable:age miss value: 177\n",
- "4th variable:sibsp miss value: 0\n",
- "5th variable:parch miss value: 0\n",
- "6th variable:ticket miss value: 0\n",
- "7th variable:fare miss value: 0\n",
- "8th variable:cabin miss value: 687\n",
- "9th variable:embarked miss value: 2\n"
- ]
- }
- ],
- "source": [
- "# Number of missing (empty-string) values in each column\n",
- "for i in range(m):\n",
- "    print('%dth variable:%s miss value: %s' % (i, vname[i], df.map(lambda row: row[i]=='').sum()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# 处理年龄缺失\n",
- "age = df.map(lambda x: x[3])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "title_age = title.zip(age)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import numpy as np"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def miss_mean(data):\n",
- "    \"\"\"Return the mean of the values in ``data`` that are not equal to -1.\n",
- "\n",
- "    -1 is treated as the placeholder for a missing value and is skipped.\n",
- "    \"\"\"\n",
- "    res = [x for x in data if x!=-1]\n",
- "    return np.mean(res)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Mean known age per title. mapValues iterates the grouped values directly\n",
- "# instead of reaching into the grouped iterable's internal .data attribute.\n",
- "age_dict = title_age.groupByKey().mapValues(miss_mean).collectAsMap()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{u'Capt': 70.0,\n",
- " u'Col': 58.0,\n",
- " u'Don': 40.0,\n",
- " u'Dr': 42.0,\n",
- " u'Jonkheer': 38.0,\n",
- " u'Lady': 48.0,\n",
- " u'Major': 48.5,\n",
- " u'Master': 4.5741666666666667,\n",
- " u'Miss': 21.773972602739725,\n",
- " u'Mlle': 24.0,\n",
- " u'Mme': 24.0,\n",
- " u'Mr': 32.368090452261306,\n",
- " u'Mrs': 35.898148148148145,\n",
- " u'Ms': 28.0,\n",
- " u'Rev': 43.166666666666664,\n",
- " u'Sir': 49.0,\n",
- " u'the Countess': 33.0}"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "age_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def age_func(pair):\n",
- "    \"\"\"Fill in a missing age from the per-title mean.\n",
- "\n",
- "    ``pair`` is a ``(title, age)`` tuple. An age of -1 marks a missing value\n",
- "    and is replaced by ``age_dict[title]``; any other age is kept.\n",
- "    Returns a ``(title, age)`` tuple.\n",
- "    \"\"\"\n",
- "    passenger_title, passenger_age = pair\n",
- "    if passenger_age == -1:\n",
- "        return (passenger_title, age_dict[passenger_title])\n",
- "    return (passenger_title, passenger_age)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[22.0, 38.0, 26.0, 35.0]"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "title_age = title_age.map(age_func)\n",
- "age_imputed = title_age.values()\n",
- "age_imputed.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 处理 embarked缺失\n",
- "df.map(lambda record: record[9]).countByValue()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def embarked_func(record):\n",
- "    \"\"\"Return field 9 (embarked) of ``record``, substituting u'S' when it is empty.\"\"\"\n",
- "    if record[9] == '':\n",
- "        return u'S'\n",
- "    return record[9]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "embarked= df.map(embarked_func)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# 将四个类别变量转为0-1二元变量"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n",
- "title_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def create_vector(term, term_dict):\n",
- "    \"\"\"One-hot encode ``term``.\n",
- "\n",
- "    ``term_dict`` is expected to map each term to its index. The result is a\n",
- "    list of ``len(term_dict)`` zeros with a 1 at position ``term_dict[term]``.\n",
- "    Raises KeyError when ``term`` is not a key of ``term_dict``.\n",
- "    \"\"\"\n",
- "    x = [0]*len(term_dict)\n",
- "    x[term_dict[term]] = 1\n",
- "    return x"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[0, 1, 0, 0, 0]"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "create_vector(u'Master',title_dict)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n",
- "title_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{u'1': 0, u'2': 2, u'3': 1}"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n",
- "pclass_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n",
- "pclass_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{u'C': 2, u'Q': 0, u'S': 1}"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n",
- "embarked_dict"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]"
- ]
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n",
- "embarked_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [0, 1, 0, 7.25]),\n",
- " (1, [1, 1, 0, 71.2833]),\n",
- " (2, [1, 0, 0, 7.925]),\n",
- " (3, [1, 1, 0, 53.1])]"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 合并数据\n",
- "restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n",
- "restdf.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [0, 0, 0, 1, 0]),\n",
- " (1, [0, 0, 0, 0, 1]),\n",
- " (2, [1, 0, 0, 0, 0]),\n",
- " (3, [0, 0, 0, 0, 1])]"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
- "title_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
- "pclass_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
- "embarked_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [1]), (1, [0]), (2, [0]), (3, [0])]"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
- "gender_ind.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
- "age_imputed.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Merge the per-row feature blocks into one feature list per row index.\n",
- "# reduceByKey needs an associative AND commutative function, and plain list\n",
- "# concatenation is not commutative, so each block is tagged with its position\n",
- "# and the blocks are put back into that fixed order after merging.\n",
- "feature_blocks = [restdf, embarked_ind, age_imputed, gender_ind, title_ind, pclass_ind]\n",
- "tagged_blocks = sc.union([\n",
- "    block.mapValues(lambda values, position=position: [(position, values)])\n",
- "    for position, block in enumerate(feature_blocks)\n",
- "])\n",
- "finaldf = (tagged_blocks\n",
- "           .reduceByKey(lambda x, y: x + y)\n",
- "           .mapValues(lambda tagged: [value\n",
- "                                      for _, values in sorted(tagged, key=lambda t: t[0])\n",
- "                                      for value in values]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
- " (384,\n",
- " [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
- " (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n",
- " (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "finaldf.take(4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Convert each row into the format the model expects\n",
- "from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
- "from pyspark.mllib.regression import LabeledPoint\n",
- "def parsePoint(line):\n",
- "    \"\"\"Turn a ``(key, values)`` pair into a LabeledPoint.\n",
- "\n",
- "    ``values[0]`` is used as the label and ``values[1:]`` as the features;\n",
- "    the key is ignored.\n",
- "    \"\"\"\n",
- "    target = line[1][0]\n",
- "    features = line[1][1:]\n",
- "    return LabeledPoint(target, features)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "modeldata = finaldf.map(parsePoint)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "modeldata.first()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Split the data; a fixed seed removes the random seed as a source of run-to-run variation\n",
- "train, test = modeldata.randomSplit([0.75,0.25], seed=42)\n",
- "# Fit the model\n",
- "model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training Error = 0.308056872038\n"
- ]
- }
- ],
- "source": [
- "# Evaluate on the held-out test split (this is a test error, not a training error)\n",
- "labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n",
- "testErr = labelsAndPreds.filter(lambda label_pred: label_pred[0] != label_pred[1]).count() / float(test.count())\n",
- "print(\"Test Error = \" + str(testErr))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement