Advertisement
Guest User

Untitled

a guest
May 30th, 2015
225
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 23.55 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "### 用spark进行数据挖掘\n",
  8. "\n",
  9. "- 本例使用spark的python接口,对titanic数据做了一个完整的尝试\n",
  10. "- 首先用计算质数的例子说明:即使在单机上,spark也能利用多核并行处理来提高计算效率\n",
  11. "- 之后读入数据集,并对数据进行预处理\n",
  12. " - 步骤1:对名字进行了处理,用正则取出四种常见title\n",
  13. " - 步骤2:基于title,对年龄进行了缺失值处理\n",
  14. " - 步骤3:将类别变量均转为0-1变量\n",
  15. "- 数据合并整理成spark.mllib需要的格式\n",
  16. "- 使用逻辑回归(线性模型)建模,并在测试集上计算错误率\n",
  17. "- 本例代码参考了《Machine Learning with Spark》一书"
  18. ]
  19. },
  20. {
  21. "cell_type": "code",
  22. "execution_count": 1,
  23. "metadata": {
  24. "collapsed": false
  25. },
  26. "outputs": [],
  27. "source": [
  28. "from pyspark import SparkContext\n",
  29. "sc = SparkContext( 'local[4]')"
  30. ]
  31. },
  32. {
  33. "cell_type": "markdown",
  34. "metadata": {},
  35. "source": [
  36. "- 算质数的例子"
  37. ]
  38. },
  39. {
  40. "cell_type": "code",
  41. "execution_count": 2,
  42. "metadata": {
  43. "collapsed": false
  44. },
  45. "outputs": [],
  46. "source": [
  47. "def isprime(n):\n",
  48. " \"\"\"\n",
  49. " check if integer n is a prime\n",
  50. " \"\"\"\n",
  51. " # make sure n is a positive integer\n",
  52. " n = abs(int(n))\n",
  53. " # 0 and 1 are not primes\n",
  54. " if n < 2:\n",
  55. " return False\n",
  56. " # 2 is the only even prime number\n",
  57. " if n == 2:\n",
  58. " return True\n",
  59. " # all other even numbers are not primes\n",
  60. " if not n & 1:\n",
  61. " return False\n",
  62. " # range starts with 3 and only needs to go up the square root of n\n",
  63. " # for all odd numbers\n",
  64. " for x in range(3, int(n**0.5)+1, 2):\n",
  65. " if n % x == 0:\n",
  66. " return False\n",
  67. " return True"
  68. ]
  69. },
  70. {
  71. "cell_type": "code",
  72. "execution_count": 3,
  73. "metadata": {
  74. "collapsed": false
  75. },
  76. "outputs": [
  77. {
  78. "name": "stdout",
  79. "output_type": "stream",
  80. "text": [
  81. "78498\n",
  82. "78498\n",
  83. "78498\n",
  84. "78498\n",
  85. "1 loops, best of 3: 4.81 s per loop\n"
  86. ]
  87. }
  88. ],
  89. "source": [
  90. "%%timeit\n",
  91. "import numpy as np\n",
  92. "nums = xrange(1000000)\n",
  93. "print np.sum([1 for x in nums if isprime(x)])"
  94. ]
  95. },
  96. {
  97. "cell_type": "code",
  98. "execution_count": 4,
  99. "metadata": {
  100. "collapsed": false,
  101. "scrolled": true
  102. },
  103. "outputs": [
  104. {
  105. "name": "stdout",
  106. "output_type": "stream",
  107. "text": [
  108. "78498\n",
  109. "78498\n",
  110. "78498\n",
  111. "78498\n",
  112. "1 loops, best of 3: 2.71 s per loop\n"
  113. ]
  114. }
  115. ],
  116. "source": [
  117. "%%timeit\n",
  118. "nums = sc.parallelize(xrange(1000000))\n",
  119. "print nums.filter(isprime).count()"
  120. ]
  121. },
  122. {
  123. "cell_type": "markdown",
  124. "metadata": {},
  125. "source": [
  126. "- titanic例子,先读入变量名"
  127. ]
  128. },
  129. {
  130. "cell_type": "code",
  131. "execution_count": 2,
  132. "metadata": {
  133. "collapsed": false
  134. },
  135. "outputs": [],
  136. "source": [
  137. "vname = !head -1 titanic.csv\n",
  138. "vname = vname[0].split(',')"
  139. ]
  140. },
  141. {
  142. "cell_type": "code",
  143. "execution_count": 3,
  144. "metadata": {
  145. "collapsed": false
  146. },
  147. "outputs": [
  148. {
  149. "data": {
  150. "text/plain": [
  151. "u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'"
  152. ]
  153. },
  154. "execution_count": 3,
  155. "metadata": {},
  156. "output_type": "execute_result"
  157. }
  158. ],
  159. "source": [
  160. "#!sed 1d titanic.csv > titanic_noheader.csv\n",
  161. "raw = sc.textFile('titanic_noheader.csv')\n",
  162. "raw.first() # 原始数据"
  163. ]
  164. },
  165. {
  166. "cell_type": "markdown",
  167. "metadata": {},
  168. "source": [
  169. "- 数据预处理"
  170. ]
  171. },
  172. {
  173. "cell_type": "code",
  174. "execution_count": 4,
  175. "metadata": {
  176. "collapsed": true
  177. },
  178. "outputs": [],
  179. "source": [
  180. "# 处理title\n",
  181. "def extract_name(x):\n",
  182. " import re\n",
  183. " return re.search(\"\\\"(.*)\\\"\", x).group(1)"
  184. ]
  185. },
  186. {
  187. "cell_type": "code",
  188. "execution_count": 5,
  189. "metadata": {
  190. "collapsed": false
  191. },
  192. "outputs": [
  193. {
  194. "data": {
  195. "text/plain": [
  196. "[u'Braund, Mr. Owen Harris',\n",
  197. " u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n",
  198. " u'Heikkinen, Miss. Laina',\n",
  199. " u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']"
  200. ]
  201. },
  202. "execution_count": 5,
  203. "metadata": {},
  204. "output_type": "execute_result"
  205. }
  206. ],
  207. "source": [
  208. "names = raw.map(extract_name)\n",
  209. "names.take(4)"
  210. ]
  211. },
  212. {
  213. "cell_type": "code",
  214. "execution_count": 6,
  215. "metadata": {
  216. "collapsed": false
  217. },
  218. "outputs": [],
  219. "source": [
  220. "import re\n",
  221. "title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))"
  222. ]
  223. },
  224. {
  225. "cell_type": "code",
  226. "execution_count": 7,
  227. "metadata": {
  228. "collapsed": false
  229. },
  230. "outputs": [
  231. {
  232. "data": {
  233. "text/plain": [
  234. "[(u'Mr', 517),\n",
  235. " (u'Miss', 182),\n",
  236. " (u'Mrs', 125),\n",
  237. " (u'Master', 40),\n",
  238. " (u'Dr', 7),\n",
  239. " (u'Rev', 6),\n",
  240. " (u'Major', 2),\n",
  241. " (u'Mlle', 2),\n",
  242. " (u'Col', 2),\n",
  243. " (u'Sir', 1),\n",
  244. " (u'the Countess', 1),\n",
  245. " (u'Don', 1),\n",
  246. " (u'Capt', 1),\n",
  247. " (u'Lady', 1),\n",
  248. " (u'Jonkheer', 1),\n",
  249. " (u'Ms', 1),\n",
  250. " (u'Mme', 1)]"
  251. ]
  252. },
  253. "execution_count": 7,
  254. "metadata": {},
  255. "output_type": "execute_result"
  256. }
  257. ],
  258. "source": [
  259. "sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)"
  260. ]
  261. },
  262. {
  263. "cell_type": "code",
  264. "execution_count": 8,
  265. "metadata": {
  266. "collapsed": false
  267. },
  268. "outputs": [
  269. {
  270. "data": {
  271. "text/plain": [
  272. "[u'Mr', u'Miss', u'Mrs', u'Master']"
  273. ]
  274. },
  275. "execution_count": 8,
  276. "metadata": {},
  277. "output_type": "execute_result"
  278. }
  279. ],
  280. "source": [
  281. "top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n",
  282. "top_title"
  283. ]
  284. },
  285. {
  286. "cell_type": "code",
  287. "execution_count": 9,
  288. "metadata": {
  289. "collapsed": true
  290. },
  291. "outputs": [],
  292. "source": [
  293. "def assign_title(x):\n",
  294. " if x in top_title: return x\n",
  295. " else: return u'other'"
  296. ]
  297. },
  298. {
  299. "cell_type": "code",
  300. "execution_count": 10,
  301. "metadata": {
  302. "collapsed": false
  303. },
  304. "outputs": [
  305. {
  306. "data": {
  307. "text/plain": [
  308. "[u'Mr', u'Mrs', u'Miss', u'Mrs']"
  309. ]
  310. },
  311. "execution_count": 10,
  312. "metadata": {},
  313. "output_type": "execute_result"
  314. }
  315. ],
  316. "source": [
  317. "title_less = title.map(assign_title)\n",
  318. "title_less.take(4)"
  319. ]
  320. },
  321. {
  322. "cell_type": "code",
  323. "execution_count": 11,
  324. "metadata": {
  325. "collapsed": false
  326. },
  327. "outputs": [],
  328. "source": [
  329. "# 处理其它数据\n",
  330. "def split_rest(x):\n",
  331. " import re\n",
  332. " rec = re.sub(\"\\\"(.*)\\\",\", '', x)\n",
  333. " return rec.split(',')"
  334. ]
  335. },
  336. {
  337. "cell_type": "code",
  338. "execution_count": 12,
  339. "metadata": {
  340. "collapsed": false
  341. },
  342. "outputs": [
  343. {
  344. "data": {
  345. "text/plain": [
  346. "[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']"
  347. ]
  348. },
  349. "execution_count": 12,
  350. "metadata": {},
  351. "output_type": "execute_result"
  352. }
  353. ],
  354. "source": [
  355. "df = raw.map(split_rest)\n",
  356. "df.first()"
  357. ]
  358. },
  359. {
  360. "cell_type": "code",
  361. "execution_count": 13,
  362. "metadata": {
  363. "collapsed": true
  364. },
  365. "outputs": [],
  366. "source": [
  367. "# 观察数据\n",
  368. "vname.remove('name')"
  369. ]
  370. },
  371. {
  372. "cell_type": "code",
  373. "execution_count": 14,
  374. "metadata": {
  375. "collapsed": false
  376. },
  377. "outputs": [
  378. {
  379. "name": "stdout",
  380. "output_type": "stream",
  381. "text": [
  382. "0th variable:survived distinct value: 2\n",
  383. "1th variable:pclass distinct value: 3\n",
  384. "2th variable:sex distinct value: 2\n",
  385. "3th variable:age distinct value: 89\n",
  386. "4th variable:sibsp distinct value: 7\n",
  387. "5th variable:parch distinct value: 7\n",
  388. "6th variable:ticket distinct value: 681\n",
  389. "7th variable:fare distinct value: 248\n",
  390. "8th variable:cabin distinct value: 148\n",
  391. "9th variable:embarked distinct value: 4\n"
  392. ]
  393. }
  394. ],
  395. "source": [
  396. "# 取值个数\n",
  397. "m = len(df.first())\n",
  398. "for i in range(m):\n",
  399. " print '%dth variable:%s distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())"
  400. ]
  401. },
  402. {
  403. "cell_type": "code",
  404. "execution_count": 15,
  405. "metadata": {
  406. "collapsed": false
  407. },
  408. "outputs": [
  409. {
  410. "name": "stdout",
  411. "output_type": "stream",
  412. "text": [
  413. "0th variable:survived miss value: 0\n",
  414. "1th variable:pclass miss value: 0\n",
  415. "2th variable:sex miss value: 0\n",
  416. "3th variable:age miss value: 177\n",
  417. "4th variable:sibsp miss value: 0\n",
  418. "5th variable:parch miss value: 0\n",
  419. "6th variable:ticket miss value: 0\n",
  420. "7th variable:fare miss value: 0\n",
  421. "8th variable:cabin miss value: 687\n",
  422. "9th variable:embarked miss value: 2\n"
  423. ]
  424. }
  425. ],
  426. "source": [
  427. "# 缺失个数\n",
  428. "for i in range(m):\n",
  429. " print '%dth variable:%s miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())"
  430. ]
  431. },
  432. {
  433. "cell_type": "code",
  434. "execution_count": 16,
  435. "metadata": {
  436. "collapsed": false
  437. },
  438. "outputs": [],
  439. "source": [
  440. "# 处理年龄缺失\n",
  441. "age = df.map(lambda x: x[3])"
  442. ]
  443. },
  444. {
  445. "cell_type": "code",
  446. "execution_count": 17,
  447. "metadata": {
  448. "collapsed": false
  449. },
  450. "outputs": [],
  451. "source": [
  452. "title_age = title.zip(age)"
  453. ]
  454. },
  455. {
  456. "cell_type": "code",
  457. "execution_count": 18,
  458. "metadata": {
  459. "collapsed": false
  460. },
  461. "outputs": [],
  462. "source": [
  463. "title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)"
  464. ]
  465. },
  466. {
  467. "cell_type": "code",
  468. "execution_count": 19,
  469. "metadata": {
  470. "collapsed": false
  471. },
  472. "outputs": [],
  473. "source": [
  474. "import numpy as np"
  475. ]
  476. },
  477. {
  478. "cell_type": "code",
  479. "execution_count": 20,
  480. "metadata": {
  481. "collapsed": true
  482. },
  483. "outputs": [],
  484. "source": [
  485. "def miss_mean(data):\n",
  486. " res = [x for x in data if x!=-1]\n",
  487. " return np.mean(res)"
  488. ]
  489. },
  490. {
  491. "cell_type": "code",
  492. "execution_count": 21,
  493. "metadata": {
  494. "collapsed": false
  495. },
  496. "outputs": [],
  497. "source": [
  498. "age_dict = dict(title_age.groupByKey().map(lambda (k,v): (k, miss_mean(v.data))).collect())"
  499. ]
  500. },
  501. {
  502. "cell_type": "code",
  503. "execution_count": 22,
  504. "metadata": {
  505. "collapsed": false
  506. },
  507. "outputs": [
  508. {
  509. "data": {
  510. "text/plain": [
  511. "{u'Capt': 70.0,\n",
  512. " u'Col': 58.0,\n",
  513. " u'Don': 40.0,\n",
  514. " u'Dr': 42.0,\n",
  515. " u'Jonkheer': 38.0,\n",
  516. " u'Lady': 48.0,\n",
  517. " u'Major': 48.5,\n",
  518. " u'Master': 4.5741666666666667,\n",
  519. " u'Miss': 21.773972602739725,\n",
  520. " u'Mlle': 24.0,\n",
  521. " u'Mme': 24.0,\n",
  522. " u'Mr': 32.368090452261306,\n",
  523. " u'Mrs': 35.898148148148145,\n",
  524. " u'Ms': 28.0,\n",
  525. " u'Rev': 43.166666666666664,\n",
  526. " u'Sir': 49.0,\n",
  527. " u'the Countess': 33.0}"
  528. ]
  529. },
  530. "execution_count": 22,
  531. "metadata": {},
  532. "output_type": "execute_result"
  533. }
  534. ],
  535. "source": [
  536. "age_dict"
  537. ]
  538. },
  539. {
  540. "cell_type": "code",
  541. "execution_count": 23,
  542. "metadata": {
  543. "collapsed": true
  544. },
  545. "outputs": [],
  546. "source": [
  547. "def age_func((title,age)):\n",
  548. " if age== -1: res = (title, age_dict[title])\n",
  549. " else: res = (title, age)\n",
  550. " return res"
  551. ]
  552. },
  553. {
  554. "cell_type": "code",
  555. "execution_count": 24,
  556. "metadata": {
  557. "collapsed": false
  558. },
  559. "outputs": [
  560. {
  561. "data": {
  562. "text/plain": [
  563. "[22.0, 38.0, 26.0, 35.0]"
  564. ]
  565. },
  566. "execution_count": 24,
  567. "metadata": {},
  568. "output_type": "execute_result"
  569. }
  570. ],
  571. "source": [
  572. "title_age = title_age.map(age_func)\n",
  573. "age_imputed = title_age.values()\n",
  574. "age_imputed.take(4)"
  575. ]
  576. },
  577. {
  578. "cell_type": "code",
  579. "execution_count": 25,
  580. "metadata": {
  581. "collapsed": false
  582. },
  583. "outputs": [
  584. {
  585. "data": {
  586. "text/plain": [
  587. "defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})"
  588. ]
  589. },
  590. "execution_count": 25,
  591. "metadata": {},
  592. "output_type": "execute_result"
  593. }
  594. ],
  595. "source": [
  596. "# 处理 embarked缺失\n",
  597. "df.map(lambda record: record[9]).countByValue()"
  598. ]
  599. },
  600. {
  601. "cell_type": "code",
  602. "execution_count": 26,
  603. "metadata": {
  604. "collapsed": true
  605. },
  606. "outputs": [],
  607. "source": [
  608. "def embarked_func(record):\n",
  609. " if record[9]=='' : return u'S' \n",
  610. " else: return record[9]"
  611. ]
  612. },
  613. {
  614. "cell_type": "code",
  615. "execution_count": 27,
  616. "metadata": {
  617. "collapsed": false
  618. },
  619. "outputs": [],
  620. "source": [
  621. "embarked= df.map(embarked_func)"
  622. ]
  623. },
  624. {
  625. "cell_type": "code",
  626. "execution_count": 28,
  627. "metadata": {
  628. "collapsed": true
  629. },
  630. "outputs": [],
  631. "source": [
  632. "# 将四个类别变量转为0-1二元变量"
  633. ]
  634. },
  635. {
  636. "cell_type": "code",
  637. "execution_count": 29,
  638. "metadata": {
  639. "collapsed": false
  640. },
  641. "outputs": [
  642. {
  643. "data": {
  644. "text/plain": [
  645. "{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}"
  646. ]
  647. },
  648. "execution_count": 29,
  649. "metadata": {},
  650. "output_type": "execute_result"
  651. }
  652. ],
  653. "source": [
  654. "title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n",
  655. "title_dict"
  656. ]
  657. },
  658. {
  659. "cell_type": "code",
  660. "execution_count": 30,
  661. "metadata": {
  662. "collapsed": true
  663. },
  664. "outputs": [],
  665. "source": [
  666. "def create_vector(term, term_dict):\n",
  667. " #from scipy import sparse as sp\n",
  668. " num_terms = len(term_dict)\n",
  669. " #x = sp.csc_matrix((1, num_terms))\n",
  670. " x = [0]*num_terms\n",
  671. " idx = term_dict[term]\n",
  672. " x[idx] = 1\n",
  673. " return x"
  674. ]
  675. },
  676. {
  677. "cell_type": "code",
  678. "execution_count": 31,
  679. "metadata": {
  680. "collapsed": false
  681. },
  682. "outputs": [
  683. {
  684. "data": {
  685. "text/plain": [
  686. "[0, 1, 0, 0, 0]"
  687. ]
  688. },
  689. "execution_count": 31,
  690. "metadata": {},
  691. "output_type": "execute_result"
  692. }
  693. ],
  694. "source": [
  695. "create_vector(u'Master',title_dict)"
  696. ]
  697. },
  698. {
  699. "cell_type": "code",
  700. "execution_count": 32,
  701. "metadata": {
  702. "collapsed": false
  703. },
  704. "outputs": [
  705. {
  706. "data": {
  707. "text/plain": [
  708. "[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]"
  709. ]
  710. },
  711. "execution_count": 32,
  712. "metadata": {},
  713. "output_type": "execute_result"
  714. }
  715. ],
  716. "source": [
  717. "title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n",
  718. "title_ind.take(4)"
  719. ]
  720. },
  721. {
  722. "cell_type": "code",
  723. "execution_count": 33,
  724. "metadata": {
  725. "collapsed": false
  726. },
  727. "outputs": [
  728. {
  729. "data": {
  730. "text/plain": [
  731. "{u'1': 0, u'2': 2, u'3': 1}"
  732. ]
  733. },
  734. "execution_count": 33,
  735. "metadata": {},
  736. "output_type": "execute_result"
  737. }
  738. ],
  739. "source": [
  740. "pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n",
  741. "pclass_dict"
  742. ]
  743. },
  744. {
  745. "cell_type": "code",
  746. "execution_count": 34,
  747. "metadata": {
  748. "collapsed": false
  749. },
  750. "outputs": [
  751. {
  752. "data": {
  753. "text/plain": [
  754. "[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]"
  755. ]
  756. },
  757. "execution_count": 34,
  758. "metadata": {},
  759. "output_type": "execute_result"
  760. }
  761. ],
  762. "source": [
  763. "pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n",
  764. "pclass_ind.take(4)"
  765. ]
  766. },
  767. {
  768. "cell_type": "code",
  769. "execution_count": 35,
  770. "metadata": {
  771. "collapsed": false
  772. },
  773. "outputs": [
  774. {
  775. "data": {
  776. "text/plain": [
  777. "{u'C': 2, u'Q': 0, u'S': 1}"
  778. ]
  779. },
  780. "execution_count": 35,
  781. "metadata": {},
  782. "output_type": "execute_result"
  783. }
  784. ],
  785. "source": [
  786. "embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n",
  787. "embarked_dict"
  788. ]
  789. },
  790. {
  791. "cell_type": "code",
  792. "execution_count": 36,
  793. "metadata": {
  794. "collapsed": false
  795. },
  796. "outputs": [
  797. {
  798. "data": {
  799. "text/plain": [
  800. "[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]"
  801. ]
  802. },
  803. "execution_count": 36,
  804. "metadata": {},
  805. "output_type": "execute_result"
  806. }
  807. ],
  808. "source": [
  809. "embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n",
  810. "embarked_ind.take(4)"
  811. ]
  812. },
  813. {
  814. "cell_type": "code",
  815. "execution_count": 37,
  816. "metadata": {
  817. "collapsed": false
  818. },
  819. "outputs": [],
  820. "source": [
  821. "gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)"
  822. ]
  823. },
  824. {
  825. "cell_type": "code",
  826. "execution_count": 38,
  827. "metadata": {
  828. "collapsed": false
  829. },
  830. "outputs": [
  831. {
  832. "data": {
  833. "text/plain": [
  834. "[(0, [0, 1, 0, 7.25]),\n",
  835. " (1, [1, 1, 0, 71.2833]),\n",
  836. " (2, [1, 0, 0, 7.925]),\n",
  837. " (3, [1, 1, 0, 53.1])]"
  838. ]
  839. },
  840. "execution_count": 38,
  841. "metadata": {},
  842. "output_type": "execute_result"
  843. }
  844. ],
  845. "source": [
  846. "# 合并数据\n",
  847. "restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n",
  848. "restdf.take(4)"
  849. ]
  850. },
  851. {
  852. "cell_type": "code",
  853. "execution_count": 39,
  854. "metadata": {
  855. "collapsed": false
  856. },
  857. "outputs": [
  858. {
  859. "data": {
  860. "text/plain": [
  861. "[(0, [0, 0, 0, 1, 0]),\n",
  862. " (1, [0, 0, 0, 0, 1]),\n",
  863. " (2, [1, 0, 0, 0, 0]),\n",
  864. " (3, [0, 0, 0, 0, 1])]"
  865. ]
  866. },
  867. "execution_count": 39,
  868. "metadata": {},
  869. "output_type": "execute_result"
  870. }
  871. ],
  872. "source": [
  873. "title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
  874. "title_ind.take(4)"
  875. ]
  876. },
  877. {
  878. "cell_type": "code",
  879. "execution_count": 40,
  880. "metadata": {
  881. "collapsed": false
  882. },
  883. "outputs": [
  884. {
  885. "data": {
  886. "text/plain": [
  887. "[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]"
  888. ]
  889. },
  890. "execution_count": 40,
  891. "metadata": {},
  892. "output_type": "execute_result"
  893. }
  894. ],
  895. "source": [
  896. "pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
  897. "pclass_ind.take(4)"
  898. ]
  899. },
  900. {
  901. "cell_type": "code",
  902. "execution_count": 41,
  903. "metadata": {
  904. "collapsed": false
  905. },
  906. "outputs": [
  907. {
  908. "data": {
  909. "text/plain": [
  910. "[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]"
  911. ]
  912. },
  913. "execution_count": 41,
  914. "metadata": {},
  915. "output_type": "execute_result"
  916. }
  917. ],
  918. "source": [
  919. "embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
  920. "embarked_ind.take(4)"
  921. ]
  922. },
  923. {
  924. "cell_type": "code",
  925. "execution_count": 42,
  926. "metadata": {
  927. "collapsed": false
  928. },
  929. "outputs": [
  930. {
  931. "data": {
  932. "text/plain": [
  933. "[(0, [1]), (1, [0]), (2, [0]), (3, [0])]"
  934. ]
  935. },
  936. "execution_count": 42,
  937. "metadata": {},
  938. "output_type": "execute_result"
  939. }
  940. ],
  941. "source": [
  942. "gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
  943. "gender_ind.take(4)"
  944. ]
  945. },
  946. {
  947. "cell_type": "code",
  948. "execution_count": 43,
  949. "metadata": {
  950. "collapsed": false
  951. },
  952. "outputs": [
  953. {
  954. "data": {
  955. "text/plain": [
  956. "[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]"
  957. ]
  958. },
  959. "execution_count": 43,
  960. "metadata": {},
  961. "output_type": "execute_result"
  962. }
  963. ],
  964. "source": [
  965. "age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
  966. "age_imputed.take(4)"
  967. ]
  968. },
  969. {
  970. "cell_type": "code",
  971. "execution_count": 44,
  972. "metadata": {
  973. "collapsed": false
  974. },
  975. "outputs": [],
  976. "source": [
  977. "finaldf = restdf.union(embarked_ind).reduceByKey(lambda x,y: x + y)\n",
  978. "finaldf = finaldf.union(age_imputed).reduceByKey(lambda x,y: x + y)\n",
  979. "finaldf = finaldf.union(gender_ind).reduceByKey(lambda x,y: x + y)\n",
  980. "finaldf = finaldf.union(title_ind).reduceByKey(lambda x,y: x + y)\n",
  981. "finaldf = finaldf.union(pclass_ind).reduceByKey(lambda x,y: x + y)"
  982. ]
  983. },
  984. {
  985. "cell_type": "code",
  986. "execution_count": 45,
  987. "metadata": {
  988. "collapsed": false
  989. },
  990. "outputs": [
  991. {
  992. "data": {
  993. "text/plain": [
  994. "[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
  995. " (384,\n",
  996. " [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
  997. " (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n",
  998. " (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]"
  999. ]
  1000. },
  1001. "execution_count": 45,
  1002. "metadata": {},
  1003. "output_type": "execute_result"
  1004. }
  1005. ],
  1006. "source": [
  1007. "finaldf.take(4)"
  1008. ]
  1009. },
  1010. {
  1011. "cell_type": "code",
  1012. "execution_count": null,
  1013. "metadata": {
  1014. "collapsed": true
  1015. },
  1016. "outputs": [],
  1017. "source": []
  1018. },
  1019. {
  1020. "cell_type": "code",
  1021. "execution_count": 46,
  1022. "metadata": {
  1023. "collapsed": true
  1024. },
  1025. "outputs": [],
  1026. "source": [
  1027. "# 准备建模需要格式\n",
  1028. "from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
  1029. "from pyspark.mllib.regression import LabeledPoint\n",
  1030. "def parsePoint(line):\n",
  1031. " features = line[1][1:]\n",
  1032. " target = line[1][0]\n",
  1033. " return LabeledPoint(target, features)"
  1034. ]
  1035. },
  1036. {
  1037. "cell_type": "code",
  1038. "execution_count": 47,
  1039. "metadata": {
  1040. "collapsed": true
  1041. },
  1042. "outputs": [],
  1043. "source": [
  1044. "modeldata = finaldf.map(parsePoint)"
  1045. ]
  1046. },
  1047. {
  1048. "cell_type": "code",
  1049. "execution_count": 48,
  1050. "metadata": {
  1051. "collapsed": false
  1052. },
  1053. "outputs": [
  1054. {
  1055. "data": {
  1056. "text/plain": [
  1057. "LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])"
  1058. ]
  1059. },
  1060. "execution_count": 48,
  1061. "metadata": {},
  1062. "output_type": "execute_result"
  1063. }
  1064. ],
  1065. "source": [
  1066. "modeldata.first()"
  1067. ]
  1068. },
  1069. {
  1070. "cell_type": "code",
  1071. "execution_count": 49,
  1072. "metadata": {
  1073. "collapsed": false
  1074. },
  1075. "outputs": [],
  1076. "source": [
  1077. "# 数据切分\n",
  1078. "train, test = modeldata.randomSplit([0.75,0.25])\n",
  1079. "# 建模\n",
  1080. "model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')"
  1081. ]
  1082. },
  1083. {
  1084. "cell_type": "code",
  1085. "execution_count": 50,
  1086. "metadata": {
  1087. "collapsed": false
  1088. },
  1089. "outputs": [
  1090. {
  1091. "name": "stdout",
  1092. "output_type": "stream",
  1093. "text": [
  1094. "Test Error = 0.308056872038\n"
  1095. ]
  1096. }
  1097. ],
  1098. "source": [
  1099. "# 评估\n",
  1100. "labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n",
  1101. "testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())\n",
  1102. "print(\"Test Error = \" + str(testErr))"
  1103. ]
  1104. },
  1105. {
  1106. "cell_type": "code",
  1107. "execution_count": null,
  1108. "metadata": {
  1109. "collapsed": false
  1110. },
  1111. "outputs": [],
  1112. "source": []
  1113. }
  1114. ],
  1115. "metadata": {
  1116. "kernelspec": {
  1117. "display_name": "Python 2",
  1118. "language": "python",
  1119. "name": "python2"
  1120. },
  1121. "language_info": {
  1122. "codemirror_mode": {
  1123. "name": "ipython",
  1124. "version": 2
  1125. },
  1126. "file_extension": ".py",
  1127. "mimetype": "text/x-python",
  1128. "name": "python",
  1129. "nbconvert_exporter": "python",
  1130. "pygments_lexer": "ipython2",
  1131. "version": "2.7.9"
  1132. }
  1133. },
  1134. "nbformat": 4,
  1135. "nbformat_minor": 0
  1136. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement