Advertisement
Guest User

Untitled

a guest
Nov 28th, 2015
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 25.59 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {
  7. "collapsed": false
  8. },
  9. "outputs": [
  10. {
  11. "name": "stdout",
  12. "output_type": "stream",
  13. "text": [
  14. "0 b\n",
  15. "1 a\n",
  16. "2 n\n",
  17. "3 a\n",
  18. "4 n\n",
  19. "5 a\n"
  20. ]
  21. }
  22. ],
  23. "source": [
  24. "fruit = 'banana'\n",
  25. "index = 0\n",
  26. "while index < len(fruit):\n",
  27. " letter = fruit[index]\n",
  28. " print index, letter\n",
  29. " index = index + 1"
  30. ]
  31. },
  32. {
  33. "cell_type": "code",
  34. "execution_count": null,
  35. "metadata": {
  36. "collapsed": false
  37. },
  38. "outputs": [],
  39. "source": [
  40. "fruit = 'banana'\n",
  41. "index = 0\n",
  42. "for letter in fruit:\n",
  43. " print index, letter\n",
  44. " index = index + 1\n"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": null,
  50. "metadata": {
  51. "collapsed": false
  52. },
  53. "outputs": [],
  54. "source": [
  55. "word = 'banana'\n",
  56. "count = 0\n",
  57. "for letter in word:\n",
  58. " if letter=='a':\n",
  59. " count = count + 1\n",
  60. "print count"
  61. ]
  62. },
  63. {
  64. "cell_type": "code",
  65. "execution_count": null,
  66. "metadata": {
  67. "collapsed": false
  68. },
  69. "outputs": [],
  70. "source": [
  71. "greet = 'Hello Bob'\n",
  72. "nstr = greet.replace('Bob','Jane')\n",
  73. "print nstr"
  74. ]
  75. },
  76. {
  77. "cell_type": "code",
  78. "execution_count": null,
  79. "metadata": {
  80. "collapsed": false
  81. },
  82. "outputs": [],
  83. "source": [
  84. "greet = 'Hello Bob'\n",
  85. "nstr = greet.replace('o','X')\n",
  86. "print nstr"
  87. ]
  88. },
  89. {
  90. "cell_type": "code",
  91. "execution_count": null,
  92. "metadata": {
  93. "collapsed": false
  94. },
  95. "outputs": [],
  96. "source": [
  97. "greet = ' Hello Bob '\n",
  98. "greet.lstrip()\n",
  99. "greet.rstrip()\n",
  100. "greet.strip()\n",
  101. "\n",
  102. "\n"
  103. ]
  104. },
  105. {
  106. "cell_type": "code",
  107. "execution_count": null,
  108. "metadata": {
  109. "collapsed": false
  110. },
  111. "outputs": [],
  112. "source": [
  113. "line = 'Please have a nice day'\n",
  114. "line.startswith('Please')\n",
  115. "line.startswith('please')"
  116. ]
  117. },
  118. {
  119. "cell_type": "code",
  120. "execution_count": null,
  121. "metadata": {
  122. "collapsed": false
  123. },
  124. "outputs": [],
  125. "source": [
  126. "data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
  127. "atpos = data.find('@')\n",
  128. "print atpos"
  129. ]
  130. },
  131. {
  132. "cell_type": "code",
  133. "execution_count": null,
  134. "metadata": {
  135. "collapsed": false
  136. },
  137. "outputs": [],
  138. "source": [
  139. "sppos = data.find(' ',atpos)\n",
  140. "print sppos"
  141. ]
  142. },
  143. {
  144. "cell_type": "code",
  145. "execution_count": null,
  146. "metadata": {
  147. "collapsed": false
  148. },
  149. "outputs": [],
  150. "source": [
  151. "host = data[atpos+1 :sppos]\n",
  152. "# up to but not including the space\n",
  153. "print host"
  154. ]
  155. },
  156. {
  157. "cell_type": "code",
  158. "execution_count": null,
  159. "metadata": {
  160. "collapsed": false
  161. },
  162. "outputs": [],
  163. "source": [
  164. "fhand = open('mbox-short.txt')\n",
  165. "print fhand\n",
  166. "count = 0\n",
  167. "for line in fhand:\n",
  168. " count = count + 1\n",
  169. "print 'Line Count:', count"
  170. ]
  171. },
  172. {
  173. "cell_type": "code",
  174. "execution_count": null,
  175. "metadata": {
  176. "collapsed": false
  177. },
  178. "outputs": [],
  179. "source": [
  180. "fhand = open('mbox-short.txt')\n",
  181. "for line in fhand:\n",
  182. " if line.startswith('From:'):\n",
  183. " print line"
  184. ]
  185. },
  186. {
  187. "cell_type": "code",
  188. "execution_count": null,
  189. "metadata": {
  190. "collapsed": false
  191. },
  192. "outputs": [],
  193. "source": [
  194. "fhand = open('mbox-short.txt')\n",
  195. "for line in fhand:\n",
  196. " line = line.rstrip()\n",
  197. " if line.startswith('From:'):\n",
  198. " print line"
  199. ]
  200. },
  201. {
  202. "cell_type": "code",
  203. "execution_count": null,
  204. "metadata": {
  205. "collapsed": false
  206. },
  207. "outputs": [],
  208. "source": [
  209. "fhand = open('mbox-short.txt')\n",
  210. "for line in fhand:\n",
  211. " line = line.rstrip()\n",
  212. " #Skip uninteresting lines\n",
  213. " if not line.startswith('From:'):\n",
  214. " continue\n",
  215. " #Process our interesting lines\n",
  216. " print line"
  217. ]
  218. },
  219. {
  220. "cell_type": "code",
  221. "execution_count": null,
  222. "metadata": {
  223. "collapsed": false
  224. },
  225. "outputs": [],
  226. "source": [
  227. "fhand = open('mbox-short.txt')\n",
  228. "for line in fhand:\n",
  229. " line = line.rstrip()\n",
  230. " #Skip uninteresting lines\n",
  231. " if not '@uct.ac.za' in line:\n",
  232. " continue\n",
  233. " #Process our interesting lines\n",
  234. " print line"
  235. ]
  236. },
  237. {
  238. "cell_type": "code",
  239. "execution_count": null,
  240. "metadata": {
  241. "collapsed": false
  242. },
  243. "outputs": [],
  244. "source": [
  245. "fname = raw_input('Enter the file name')\n",
  246. "fhand = open(fname)\n",
  247. "count = 0\n",
  248. "for line in fhand:\n",
  249. " if line.startswith('Subject:'):\n",
  250. " count = count + 1\n",
  251. "print 'There were', count, 'subject line in', fname"
  252. ]
  253. },
  254. {
  255. "cell_type": "code",
  256. "execution_count": null,
  257. "metadata": {
  258. "collapsed": false
  259. },
  260. "outputs": [],
  261. "source": [
  262. "fname = raw_input('Enter the file name: ')\n",
  263. "try:\n",
  264. " fhand = open(fname)\n",
  265. "except:\n",
  266. " print 'File cannot be opened:', fname\n",
  267. " exit()\n",
  268. "\n",
  269. "count = 0\n",
  270. "for line in fhand:\n",
  271. " if line.startswith('From:'):\n",
  272. " line = line.rstrip()\n",
  273. " count = count + 1\n",
  274. " print line\n",
  275. "print 'There were', count, 'From line in', fname\n",
  276. " "
  277. ]
  278. },
  279. {
  280. "cell_type": "code",
  281. "execution_count": null,
  282. "metadata": {
  283. "collapsed": false
  284. },
  285. "outputs": [],
  286. "source": [
  287. "x = 'X-DSPAM-Confidence: 0.8475'\n",
  288. "print x\n",
  289. "pos = x.find(' ')\n",
  290. "num = float(x[pos+1:])\n",
  291. "print num,'Is a',type(num)\n"
  292. ]
  293. },
  294. {
  295. "cell_type": "code",
  296. "execution_count": null,
  297. "metadata": {
  298. "collapsed": false
  299. },
  300. "outputs": [],
  301. "source": [
  302. "lotto = [2, 14, 28, 41, 63]\n",
  303. "lotto[2] = ['Paris', 'Lyon', 'Nice']\n",
  304. "print lotto"
  305. ]
  306. },
  307. {
  308. "cell_type": "code",
  309. "execution_count": null,
  310. "metadata": {
  311. "collapsed": false
  312. },
  313. "outputs": [],
  314. "source": [
  315. "fruit = 'Banana'\n",
  316. "print fruit[0]\n",
  317. "fruit = fruit.lower()\n",
  318. "print fruit"
  319. ]
  320. },
  321. {
  322. "cell_type": "code",
  323. "execution_count": null,
  324. "metadata": {
  325. "collapsed": false
  326. },
  327. "outputs": [],
  328. "source": [
  329. "# Range Function\n",
  330. "print range(10)\n",
  331. "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
  332. "print range(len(cities))"
  333. ]
  334. },
  335. {
  336. "cell_type": "code",
  337. "execution_count": null,
  338. "metadata": {
  339. "collapsed": false
  340. },
  341. "outputs": [],
  342. "source": [
  343. "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
  344. "for city in cities:\n",
  345. " print 'You are wellecome to ', city"
  346. ]
  347. },
  348. {
  349. "cell_type": "code",
  350. "execution_count": null,
  351. "metadata": {
  352. "collapsed": false
  353. },
  354. "outputs": [],
  355. "source": [
  356. "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
  357. "for i in range(len(cities)):\n",
  358. " city = cities[i]\n",
  359. " print 'You are wellecome to ', city"
  360. ]
  361. },
  362. {
  363. "cell_type": "code",
  364. "execution_count": null,
  365. "metadata": {
  366. "collapsed": false
  367. },
  368. "outputs": [],
  369. "source": [
  370. "#Slicing List\n",
  371. "t=[9, 11, 43, 2 ,55,99]\n",
  372. "t[1:3]\n",
  373. "#Remember: The second number is \"Up to but not including\""
  374. ]
  375. },
  376. {
  377. "cell_type": "code",
  378. "execution_count": null,
  379. "metadata": {
  380. "collapsed": false
  381. },
  382. "outputs": [],
  383. "source": [
  384. "stuff = list()\n",
  385. "stuff.append('book')\n",
  386. "stuff.append('cookies')\n",
  387. "print stuff\n",
  388. "stuff.append(99)\n",
  389. "print stuff\n",
  390. "stuff.sort()\n",
  391. "print stuff"
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "execution_count": null,
  397. "metadata": {
  398. "collapsed": false
  399. },
  400. "outputs": [],
  401. "source": [
  402. "total = 0\n",
  403. "count = 0\n",
  404. "while True:\n",
  405. " inp = raw_input('Enter a number:')\n",
  406. " if inp == 'done':break\n",
  407. " value = float(inp)\n",
  408. " total = total + value\n",
  409. " count = count + 1\n",
  410. "\n",
  411. "average = total/count\n",
  412. "print 'Average:', average"
  413. ]
  414. },
  415. {
  416. "cell_type": "code",
  417. "execution_count": null,
  418. "metadata": {
  419. "collapsed": false
  420. },
  421. "outputs": [],
  422. "source": [
  423. "numlist = list()\n",
  424. "while True:\n",
  425. " inp = raw_input('Enter a number:')\n",
  426. " if inp == 'done':break\n",
  427. " value = float(inp)\n",
  428. " numlist.append(value)\n",
  429. "\n",
  430. "average = sum(numlist)/len(numlist)\n",
  431. "print 'Average:', average"
  432. ]
  433. },
  434. {
  435. "cell_type": "code",
  436. "execution_count": null,
  437. "metadata": {
  438. "collapsed": false
  439. },
  440. "outputs": [],
  441. "source": [
  442. "######### Lists & Strings\n",
  443. "abc = ' Welcome to Python tutorial'\n",
  444. "stuff = abc.split()\n",
  445. "print type (abc)\n",
  446. "print type(stuff)\n",
  447. "print stuff[0]\n",
  448. "print stuff[len(stuff)-1]\n",
  449. "for i in stuff:\n",
  450. " print i"
  451. ]
  452. },
  453. {
  454. "cell_type": "code",
  455. "execution_count": null,
  456. "metadata": {
  457. "collapsed": false
  458. },
  459. "outputs": [],
  460. "source": [
  461. "######### Lists & Strings\n",
  462. "line = 'first;second;third'\n",
  463. "thing = line.split()\n",
  464. "print thing, \"This list has \", len(thing), \"Elements\"\n",
  465. "thing2 = line.split(';')\n",
  466. "print thing2, \"This list has \", len(thing2), \"Elements\""
  467. ]
  468. },
  469. {
  470. "cell_type": "code",
  471. "execution_count": null,
  472. "metadata": {
  473. "collapsed": false
  474. },
  475. "outputs": [],
  476. "source": [
  477. "fhand = open('mbox-short.txt')\n",
  478. "for line in fhand:\n",
  479. " line = line.rstrip()\n",
  480. " if not line.startswith('From '):continue\n",
  481. " words = line.split()\n",
  482. " print words[2:7]"
  483. ]
  484. },
  485. {
  486. "cell_type": "code",
  487. "execution_count": null,
  488. "metadata": {
  489. "collapsed": false
  490. },
  491. "outputs": [],
  492. "source": [
  493. "###### Double Split Pattern\n",
  494. "fhand = open('mbox-short.txt')\n",
  495. "for line in fhand:\n",
  496. " line = line.rstrip()\n",
  497. " if not line.startswith('From '):continue\n",
  498. " words = line.split()\n",
  499. " email = words[1]\n",
  500. " pieces = email.split('@')\n",
  501. " print pieces[1]"
  502. ]
  503. },
  504. {
  505. "cell_type": "code",
  506. "execution_count": null,
  507. "metadata": {
  508. "collapsed": false
  509. },
  510. "outputs": [],
  511. "source": [
  512. "# Dictionaries - Key/Value\n",
  513. "purse = dict()\n",
  514. "purse['money'] = 12\n",
  515. "purse['candy'] = 3\n",
  516. "purse['tissues'] = 75\n",
  517. "print purse, '\\n'\n",
  518. "print purse['candy']\n",
  519. "purse['candy'] = purse['candy']+2\n",
  520. "print purse\n",
  521. "# ===================================\n",
  522. "# key=chuck, value=1\n",
  523. "jjj = {'chuck':1,'fred':42, 'jan':100}\n",
  524. "print jjj, type(jjj)\n",
  525. "ooo = {}\n",
  526. "print ooo\n",
  527. "# ===================================\n",
  528. "names = {'zhen':5, 'marquard':3,'cwen':2, 'csev':3}\n",
  529. "print names, type(names)"
  530. ]
  531. },
  532. {
  533. "cell_type": "code",
  534. "execution_count": null,
  535. "metadata": {
  536. "collapsed": false
  537. },
  538. "outputs": [],
  539. "source": [
  540. "counts = dict () \n",
  541. "names = ['csev', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'csev']\n",
  542. "# ------ names is a list\n",
  543. "for name in names:\n",
  544. " if name not in counts:\n",
  545. " counts [name] = 1 \n",
  546. " else: \n",
  547. " counts [name] = counts [name] + 1 \n",
  548. "print counts "
  549. ]
  550. },
  551. {
  552. "cell_type": "code",
  553. "execution_count": null,
  554. "metadata": {
  555. "collapsed": false
  556. },
  557. "outputs": [],
  558. "source": [
  559. "# Get Method\n",
  560. "counts = dict () \n",
  561. "if name in counts:\n",
  562. " print counts[name]\n",
  563. "else:\n",
  564. " print 0"
  565. ]
  566. },
  567. {
  568. "cell_type": "code",
  569. "execution_count": null,
  570. "metadata": {
  571. "collapsed": true
  572. },
  573. "outputs": [],
  574. "source": [
  575. "# counts = dict () \n",
  576. "print counts.get(name,0)\n",
  577. "# get(name=KeyName,0=ValueToGiveBackIfKeyIsntExist)\n"
  578. ]
  579. },
  580. {
  581. "cell_type": "code",
  582. "execution_count": null,
  583. "metadata": {
  584. "collapsed": false
  585. },
  586. "outputs": [],
  587. "source": [
  588. "counts = dict () \n",
  589. "listOfnames = ['zqian', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'owen']\n",
  590. "for name in listOfnames:\n",
  591. " counts[name] = counts.get(name,0)+1 # Either to create or to update\n",
  592. " # get(name=KeyName,0=ValueToGiveBackIfKeyIsntExist) \n",
  593. "print counts"
  594. ]
  595. },
  596. {
  597. "cell_type": "code",
  598. "execution_count": null,
  599. "metadata": {
  600. "collapsed": false
  601. },
  602. "outputs": [],
  603. "source": [
  604. "test ={'owen': 2, 'zqian': 2, 'csev': 3, 'cwen': 1}\n",
  605. "print test.get('owen')"
  606. ]
  607. },
  608. {
  609. "cell_type": "code",
  610. "execution_count": null,
  611. "metadata": {
  612. "collapsed": false
  613. },
  614. "outputs": [],
  615. "source": [
  616. "#Find most common word/Top 10 words\n",
  617. "###### Counting Pattern ###########\n",
  618. "counts = dict()\n",
  619. "print 'Enter a line of text:'\n",
  620. "line = raw_input('')\n",
  621. "\n",
  622. "words = line.split()\n",
  623. "print 'Words: ', words\n",
  624. "\n",
  625. "print 'Counting...'\n",
  626. "for word in words:\n",
  627. " counts[word] = counts.get(word,0) + 1\n",
  628. "print counts\n",
  629. "\n"
  630. ]
  631. },
  632. {
  633. "cell_type": "code",
  634. "execution_count": null,
  635. "metadata": {
  636. "collapsed": false
  637. },
  638. "outputs": [],
  639. "source": [
  640. "# Loop through Dictionaries\n",
  641. "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
  642. "for key in dicOfWords:\n",
  643. " print key, dicOfWords[key]"
  644. ]
  645. },
  646. {
  647. "cell_type": "code",
  648. "execution_count": null,
  649. "metadata": {
  650. "collapsed": false
  651. },
  652. "outputs": [],
  653. "source": [
  654. "# Retrieving list of Keys and Values \n",
  655. "# i.e converting Dictionary to List\n",
  656. "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
  657. "print 'Print dictionary as a list','\\n',list(dicOfWords)\n",
  658. "print 'Print dictionarys Keys','\\n',dicOfWords.keys()\n",
  659. "print 'Print dictionarys Values ','\\n',dicOfWords.values()\n",
  660. "print 'Print dictionary as an item set','\\n',dicOfWords.items()"
  661. ]
  662. },
  663. {
  664. "cell_type": "code",
  665. "execution_count": null,
  666. "metadata": {
  667. "collapsed": false
  668. },
  669. "outputs": [],
  670. "source": [
  671. "# Bonus: Two Iteration Variables!\n",
  672. "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
  673. "for i,j in dicOfWords.items(): # i= Key, j = value\n",
  674. " print i,j"
  675. ]
  676. },
  677. {
  678. "cell_type": "code",
  679. "execution_count": null,
  680. "metadata": {
  681. "collapsed": false
  682. },
  683. "outputs": [],
  684. "source": [
  685. "fileName = raw_input('Entered file name:')\n",
  686. "try:\n",
  687. " handle = open(fileName,'r')\n",
  688. "except:\n",
  689. " print 'File cannot be opened:', fileName \n",
  690. " exit()\n",
  691. "text = handle.read() # as it is a small text we put it in one variable\n",
  692. "listOfWords = text.split() \n",
  693. "counts = dict()\n",
  694. "for word in listOfWords:\n",
  695. " counts[word] = counts.get(word,0) + 1\n",
  696. "\n",
  697. "bigcount = None\n",
  698. "bigword = None\n",
  699. "for word,count in counts.items():\n",
  700. " if bigcount == None or count > bigcount:\n",
  701. " bigword = word\n",
  702. " bigcount = count\n",
  703. "print 'The bigest word is:',bigword,' Which appears ', bigcount, 'times'"
  704. ]
  705. },
  706. {
  707. "cell_type": "code",
  708. "execution_count": null,
  709. "metadata": {
  710. "collapsed": false
  711. },
  712. "outputs": [],
  713. "source": [
  714. "#========Tuple============\n",
  715. "# We use tuple for temporary variables (we use Tuple as temporary list)\n",
  716. "l = list()\n",
  717. "dir(l)\n"
  718. ]
  719. },
  720. {
  721. "cell_type": "code",
  722. "execution_count": null,
  723. "metadata": {
  724. "collapsed": false
  725. },
  726. "outputs": [],
  727. "source": [
  728. "t = tuple()\n",
  729. "dir(t)"
  730. ]
  731. },
  732. {
  733. "cell_type": "code",
  734. "execution_count": null,
  735. "metadata": {
  736. "collapsed": false
  737. },
  738. "outputs": [],
  739. "source": [
  740. "(x, y) = (4, 'Fred')\n",
  741. "print x\n",
  742. "print y\n",
  743. "a, b = (88, 99)\n",
  744. "print a\n",
  745. "print b"
  746. ]
  747. },
  748. {
  749. "cell_type": "code",
  750. "execution_count": null,
  751. "metadata": {
  752. "collapsed": false
  753. },
  754. "outputs": [],
  755. "source": [
  756. "d = dict()\n",
  757. "d['aa'] = 2\n",
  758. "d['bb'] = 4\n",
  759. "for(k,v) in d.items():\n",
  760. " print k, v\n",
  761. "#=========================\n",
  762. "tups = d.items() # We can get list of tuples\n",
  763. "print tups # Each element inside\n",
  764. " # the list is a tuple"
  765. ]
  766. },
  767. {
  768. "cell_type": "code",
  769. "execution_count": null,
  770. "metadata": {
  771. "collapsed": false
  772. },
  773. "outputs": [],
  774. "source": [
  775. "# ========== Tuples are Comparable\n",
  776. "(1, 1, 2)>(0, 2, 0)"
  777. ]
  778. },
  779. {
  780. "cell_type": "code",
  781. "execution_count": null,
  782. "metadata": {
  783. "collapsed": false
  784. },
  785. "outputs": [],
  786. "source": [
  787. "#Sorting Lists of Tuples\n",
  788. "d = {'b':1, 'a':10,'c':22}\n",
  789. "print d.items()\n",
  790. "t = sorted(d.items())\n",
  791. "t"
  792. ]
  793. },
  794. {
  795. "cell_type": "code",
  796. "execution_count": null,
  797. "metadata": {
  798. "collapsed": false
  799. },
  800. "outputs": [],
  801. "source": [
  802. "for k, v in sorted(d.items()):\n",
  803. " print k, v\n",
  804. "for k1, v1 in d.items():\n",
  805. " print '===============\\n',k1, v1"
  806. ]
  807. },
  808. {
  809. "cell_type": "code",
  810. "execution_count": null,
  811. "metadata": {
  812. "collapsed": false
  813. },
  814. "outputs": [],
  815. "source": [
  816. "# Sort by value instead of key\n",
  817. "c = {'b':1, 'a':10,'c':22}\n",
  818. "tmp = list()\n",
  819. "tmp2 = list()\n",
  820. "for k, v in c.items():\n",
  821. " tmp.append((v,k)) # Value first, key second\n",
  822. "print tmp\n",
  823. "tmp.sort() # Ascending sort \n",
  824. "print tmp\n",
  825. "tmp.sort(reverse=True) # descending sort \n",
  826. "print tmp"
  827. ]
  828. },
  829. {
  830. "cell_type": "code",
  831. "execution_count": null,
  832. "metadata": {
  833. "collapsed": false
  834. },
  835. "outputs": [],
  836. "source": [
  837. "fname = raw_input('Enter the file name: ')\n",
  838. "try:\n",
  839. " fhand = open(fname)\n",
  840. "except:\n",
  841. " print 'File cannot be opened:', fname\n",
  842. " exit()\n",
  843. "counts = dict()\n",
  844. "for line in fhand:\n",
  845. " words = line.split()\n",
  846. " for word in words:\n",
  847. " counts[word] = counts.get(word,0)+1\n",
  848. "lst = list()\n",
  849. "for key, val in counts.items():\n",
  850. " lst.append( (val, key) ) # put values in the list\n",
  851. " \n",
  852. "lst.sort(reverse=True)\n",
  853. "\n",
  854. "for val, key in lst[:10]:\n",
  855. " print key, val"
  856. ]
  857. },
  858. {
  859. "cell_type": "code",
  860. "execution_count": null,
  861. "metadata": {
  862. "collapsed": false
  863. },
  864. "outputs": [],
  865. "source": [
  866. "fname = raw_input('Enter the file name: ')\n",
  867. "try:\n",
  868. " fhand = open(fname)\n",
  869. "except:\n",
  870. " print 'File cannot be opened:', fname\n",
  871. " exit()\n",
  872. "#======================================== \n",
  873. "counts = dict()\n",
  874. "for line in fhand:\n",
  875. " words = line.split()\n",
  876. " for word in words:\n",
  877. " counts[word] = counts.get(word,0)+1\n",
  878. "lst = list()\n",
  879. "for key, val in counts.items():\n",
  880. " lst.append( (val, key) ) # put values in the list\n",
  881. "tups = sorted ( [ (v,k) for k,v in counts.items() ])\n",
  882. "print tups[len(tups)-10:len(tups)]"
  883. ]
  884. },
  885. {
  886. "cell_type": "code",
  887. "execution_count": null,
  888. "metadata": {
  889. "collapsed": false
  890. },
  891. "outputs": [],
  892. "source": [
  893. "fname = raw_input('Enter the file name: ')\n",
  894. "try:\n",
  895. " fhand = open(fname)\n",
  896. "except:\n",
  897. " print 'File cannot be opened:', fname\n",
  898. " exit()\n",
  899. "#======================================== \n",
  900. "counts = dict()\n",
  901. "for line in fhand:\n",
  902. " words = line.split()\n",
  903. " for word in words:\n",
  904. " wrd = word.lower()\n",
  905. " counts[wrd] = counts.get(wrd,0)+1\n",
  906. " \n",
  907. "flipped = list()\n",
  908. "for kie, vaal in counts.items():\n",
  909. " newtup = (vaal, kie)\n",
  910. " flipped.append(newtup)\n",
  911. "print 'befor been sorted ============== \\n',flipped[len(flipped)-10:len(flipped)] # befor been sorted\n",
  912. "flipped.sort()\n",
  913. "print 'After been sorted ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n",
  914. "flipped.sort(reverse=True)\n",
  915. "print 'After reverse sorting ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n",
  916. "\n",
  917. "#@@@@@@@@ Print Top 5 Values\n",
  918. "for kay, vall in flipped[:5]:\n",
  919. " print \"Winner\", kay, vall\n"
  920. ]
  921. },
  922. {
  923. "cell_type": "code",
  924. "execution_count": null,
  925. "metadata": {
  926. "collapsed": false
  927. },
  928. "outputs": [],
  929. "source": [
  930. "flipped.sort(reverse=True)\n",
  931. "print 'After reverse sorting ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n"
  932. ]
  933. },
  934. {
  935. "cell_type": "code",
  936. "execution_count": null,
  937. "metadata": {
  938. "collapsed": false
  939. },
  940. "outputs": [],
  941. "source": [
  942. "import re\n",
  943. "hand = open('mbox-short.txt')\n",
  944. "for line in hand:\n",
  945. " line = line.rstrip()\n",
  946. " if re.search('From:',line): # if From in Line\n",
  947. " print line"
  948. ]
  949. },
  950. {
  951. "cell_type": "code",
  952. "execution_count": null,
  953. "metadata": {
  954. "collapsed": false
  955. },
  956. "outputs": [],
  957. "source": [
  958. "import re\n",
  959. "hand = open('mbox-short.txt')\n",
  960. "for line in hand:\n",
  961. " line = line.rstrip()\n",
  962. " if re.search('^X-DSPAM-Result',line): # (^) if line starts with \"X-DSPAM-Result\"\n",
  963. " print line"
  964. ]
  965. },
  966. {
  967. "cell_type": "code",
  968. "execution_count": 2,
  969. "metadata": {
  970. "collapsed": false
  971. },
  972. "outputs": [
  973. {
  974. "name": "stdout",
  975. "output_type": "stream",
  976. "text": [
  977. "['2', '1', '9', '4', '2']\n"
  978. ]
  979. }
  980. ],
  981. "source": [
  982. "\"Matching and Extracting Data\"\n",
  983. "import re\n",
  984. "x = \"My 2 favorite numbers are 19 and 42\"\n",
  985. "y = re.findall(\"[0-9]+ \",x)\n",
  986. "print y"
  987. ]
  988. },
  989. {
  990. "cell_type": "code",
  991. "execution_count": 4,
  992. "metadata": {
  993. "collapsed": false
  994. },
  995. "outputs": [
  996. {
  997. "name": "stdout",
  998. "output_type": "stream",
  999. "text": [
  1000. "['M']\n"
  1001. ]
  1002. }
  1003. ],
  1004. "source": [
  1005. "import re #Regular Expressions\n",
  1006. "x = \"My 2 favorite numbers are 19 and 42\"\n",
  1007. "y = re.findall(\"[ABCDEFGHM]+\",x)\n",
  1008. "print y"
  1009. ]
  1010. },
  1011. {
  1012. "cell_type": "code",
  1013. "execution_count": 5,
  1014. "metadata": {
  1015. "collapsed": false
  1016. },
  1017. "outputs": [
  1018. {
  1019. "name": "stdout",
  1020. "output_type": "stream",
  1021. "text": [
  1022. "['From Using the :']\n"
  1023. ]
  1024. }
  1025. ],
  1026. "source": [
  1027. "# Greedy Matching\n",
  1028. "import re\n",
  1029. "x = 'From: Using the : character'\n",
  1030. "y = re.findall('^F.+:',x) # First character in the match is an F\n",
  1031. "print y # Last character in the match is a :\n",
  1032. " # (+) i.e One or more character"
  1033. ]
  1034. },
  1035. {
  1036. "cell_type": "code",
  1037. "execution_count": 10,
  1038. "metadata": {
  1039. "collapsed": false
  1040. },
  1041. "outputs": [
  1042. {
  1043. "name": "stdout",
  1044. "output_type": "stream",
  1045. "text": [
  1046. "['From sfsdgerbfr :']\n"
  1047. ]
  1048. }
  1049. ],
  1050. "source": [
  1051. "# Non-Greedy Matching\n",
  1052. "import re\n",
  1053. "x = 'From: Using the : character'\n",
  1054. "y = re.findall('^F.+?:',x) # First character in the match is an F\n",
  1055. "print y # Last character in the match is a :\n",
  1056. " # (+) i.e One or more character BUT \n",
  1057. " # NOT GREEDY "
  1058. ]
  1059. },
  1060. {
  1061. "cell_type": "code",
  1062. "execution_count": 11,
  1063. "metadata": {
  1064. "collapsed": false
  1065. },
  1066. "outputs": [
  1067. {
  1068. "name": "stdout",
  1069. "output_type": "stream",
  1070. "text": [
  1071. "['a.shlash@ymail.com']\n"
  1072. ]
  1073. }
  1074. ],
  1075. "source": [
  1076. "# Fine Tuning String Extraction\n",
  1077. "x = 'From: a.shlash@ymail.com Sat bala abla bals lmsqlmfdzeu'\n",
  1078. "y = re.findall('\\S+@\\S+',x) \n",
  1079. "print y "
  1080. ]
  1081. },
  1082. {
  1083. "cell_type": "code",
  1084. "execution_count": 14,
  1085. "metadata": {
  1086. "collapsed": false
  1087. },
  1088. "outputs": [
  1089. {
  1090. "name": "stdout",
  1091. "output_type": "stream",
  1092. "text": [
  1093. "['uct.ac.za']\n"
  1094. ]
  1095. }
  1096. ],
  1097. "source": [
  1098. "#The Double Split Version \n",
  1099. "import re\n",
  1100. "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
  1101. "y = re.findall('@([^ ]*)', lin) #Look through the string until you\n",
  1102. "print y #find an at-sign(@)\n",
  1103. " # ( and ) i.e. extract the non-blank characters"
  1104. ]
  1105. },
  1106. {
  1107. "cell_type": "code",
  1108. "execution_count": 15,
  1109. "metadata": {
  1110. "collapsed": false
  1111. },
  1112. "outputs": [
  1113. {
  1114. "name": "stdout",
  1115. "output_type": "stream",
  1116. "text": [
  1117. "['uct.ac.za']\n"
  1118. ]
  1119. }
  1120. ],
  1121. "source": [
  1122. "#The Double Split Version \n",
  1123. "import re\n",
  1124. "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
  1125. "y = re.findall('^From .*@([^ ]*)', lin) #Look through the string until you\n",
  1126. "print y #find an at-sign(@)\n",
  1127. " # ( and ) i.e. extract the non-blank characters"
  1128. ]
  1129. },
  1130. {
  1131. "cell_type": "code",
  1132. "execution_count": null,
  1133. "metadata": {
  1134. "collapsed": true
  1135. },
  1136. "outputs": [],
  1137. "source": []
  1138. }
  1139. ],
  1140. "metadata": {
  1141. "kernelspec": {
  1142. "display_name": "Python 2",
  1143. "language": "python",
  1144. "name": "python2"
  1145. },
  1146. "language_info": {
  1147. "codemirror_mode": {
  1148. "name": "ipython",
  1149. "version": 2
  1150. },
  1151. "file_extension": ".py",
  1152. "mimetype": "text/x-python",
  1153. "name": "python",
  1154. "nbconvert_exporter": "python",
  1155. "pygments_lexer": "ipython2",
  1156. "version": "2.7.10"
  1157. }
  1158. },
  1159. "nbformat": 4,
  1160. "nbformat_minor": 0
  1161. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement