Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 b\n",
      "1 a\n",
      "2 n\n",
      "3 a\n",
      "4 n\n",
      "5 a\n"
     ]
    }
   ],
   "source": [
    "fruit = 'banana'\n",
    "index = 0\n",
    "while index < len(fruit):\n",
    "    letter = fruit[index]\n",
    "    print index, letter\n",
    "    index = index + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fruit = 'banana'\n",
    "index = 0\n",
    "for letter in fruit:\n",
    "    print index, letter\n",
    "    index = index + 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Tally the occurrences of the letter 'a' in the word.\n",
    "word = 'banana'\n",
    "count = 0\n",
    "for letter in word:\n",
    "    if letter != 'a':\n",
    "        continue  # only 'a' contributes to the tally\n",
    "    count += 1\n",
    "print count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "greet = 'Hello Bob'\n",
    "nstr = greet.replace('Bob','Jane')\n",
    "print nstr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "greet = 'Hello Bob'\n",
    "nstr = greet.replace('o','X')\n",
    "print nstr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "greet = ' Hello Bob '\n",
    "greet.lstrip()\n",
    "greet.rstrip()\n",
    "greet.strip()\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "line = 'Please have a nice day'\n",
    "line.startswith('Please')\n",
    "line.startswith('please')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
    "atpos = data.find('@')\n",
    "print atpos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sppos = data.find(' ',atpos)\n",
    "print sppos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "host = data[atpos+1 :sppos]\n",
    "# up to but not including the space\n",
    "print host"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fhand = open('mbox-short.txt')\n",
    "print fhand\n",
    "count = 0\n",
    "for line in fhand:\n",
    "    count = count + 1\n",
    "print 'Line Count:', count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    if line.startswith('From:'):\n",
    "        print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    line = line.rstrip()\n",
    "    if line.startswith('From:'):\n",
    "        print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    line = line.rstrip()\n",
    "    # Skip uninteresting lines\n",
    "    if not line.startswith('From:'):\n",
    "        continue\n",
    "    # Process our interesting lines\n",
    "    print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Print only the lines of mbox-short.txt that contain '@uct.ac.za'.\n",
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    line = line.rstrip()  # drop the trailing newline so print does not double it\n",
    "    # Skip uninteresting lines\n",
    "    if '@uct.ac.za' not in line:\n",
    "        continue\n",
    "    # Process our interesting lines\n",
    "    print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Count the lines of a user-chosen file that start with 'Subject:'.\n",
    "fname = raw_input('Enter the file name')\n",
    "fhand = open(fname)\n",
    "count = 0\n",
    "for line in fhand:\n",
    "    if line.startswith('Subject:'):\n",
    "        count = count + 1\n",
    "# print takes comma-separated values: the label, the tally, then the file name.\n",
    "print 'There were', count, 'subject line in', fname"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fname = raw_input('Enter the file name: ')\n",
    "try:\n",
    "    fhand = open(fname)\n",
    "except:\n",
    "    print 'File cannot be opened:', fname\n",
    "    exit()\n",
    "\n",
    "count = 0\n",
    "for line in fhand:\n",
    "    if line.startswith('From:'):\n",
    "        line = line.rstrip()\n",
    "        count = count + 1\n",
    "        print line\n",
    "print 'There were', count, 'From line in', fname\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x = 'X-DSPAM-Confidence: 0.8475'\n",
    "print x\n",
    "pos = x.find(' ')\n",
    "num = float(x[pos+1:])\n",
    "print num,'Is a',type(num)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lotto = [2, 14, 28, 41, 63]\n",
    "lotto[2] =  ['Paris', 'Lyon', 'Nice']\n",
    "print lotto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fruit = 'Banana'\n",
    "print fruit[0]\n",
    "fruit = fruit.lower()\n",
    "print fruit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Range Function\n",
    "print range(10)\n",
    "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
    "print range(len(cities))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
    "for city in cities:\n",
    "    print 'You are wellecome to ', city"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
    "for i in range(len(cities)):\n",
    "    city = cities[i]\n",
    "    print 'You are wellecome to ', city"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Slicing List\n",
    "t=[9, 11, 43, 2 ,55,99]\n",
    "t[1:3]\n",
    "#Remember: The second number is \"Up to but not including\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "stuff = list()\n",
    "stuff.append('book')\n",
    "stuff.append('cookies')\n",
    "print stuff\n",
    "stuff.append(99)\n",
    "print stuff\n",
    "stuff.sort()\n",
    "print stuff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Average the numbers typed by the user; typing 'done' ends the input.\n",
    "total = 0\n",
    "count = 0\n",
    "while True:\n",
    "    inp = raw_input('Enter a number:')\n",
    "    if inp == 'done':break\n",
    "    value = float(inp)\n",
    "    total = total + value\n",
    "    count = count + 1\n",
    "\n",
    "# Guard against 'done' being entered before any number (count would be 0).\n",
    "if count > 0:\n",
    "    average = total/count\n",
    "    print 'Average:', average\n",
    "else:\n",
    "    print 'No numbers were entered'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Same average as a running total, but the values are collected in a list first.\n",
    "numlist = list()\n",
    "while True:\n",
    "    inp = raw_input('Enter a number:')\n",
    "    if inp == 'done':break\n",
    "    value = float(inp)\n",
    "    numlist.append(value)\n",
    "\n",
    "# An empty list would make len(numlist) zero, so only divide when there is data.\n",
    "if numlist:\n",
    "    average = sum(numlist)/len(numlist)\n",
    "    print 'Average:', average\n",
    "else:\n",
    "    print 'No numbers were entered'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "######### Lists & Strings\n",
    "# Split a string into a list of words, then inspect both objects.\n",
    "abc = ' Welcome to Python tutorial'\n",
    "stuff = abc.split()\n",
    "print type(abc)\n",
    "print type(stuff)\n",
    "print stuff[0]\n",
    "print stuff[-1]  # a negative index counts from the end of the list\n",
    "for i in stuff:\n",
    "    print i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "######### Lists & Strings\n",
    "line = 'first;second;third'\n",
    "thing = line.split()\n",
    "print thing, \"This list has \", len(thing), \"Elements\"\n",
    "thing2 = line.split(';')\n",
    "print thing2, \"This list has \", len(thing2), \"Elements\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    line = line.rstrip()\n",
    "    if not line.startswith('From '):continue\n",
    "    words = line.split()\n",
    "    print words[2:7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "###### Double Split Pattern\n",
    "fhand = open('mbox-short.txt')\n",
    "for line in fhand:\n",
    "    line = line.rstrip()\n",
    "    if not line.startswith('From '):continue\n",
    "    words = line.split()\n",
    "    email = words[1]\n",
    "    pieces = email.split('@')\n",
    "    print pieces[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Dictionaries - Key/Value\n",
    "purse = dict()\n",
    "purse['money'] = 12\n",
    "purse['candy'] = 3\n",
    "purse['tissues'] = 75\n",
    "print purse, '\\n'\n",
    "print purse['candy']\n",
    "purse['candy'] = purse['candy']+2\n",
    "print purse\n",
    "# ===================================\n",
    "# key=chuck, value=1\n",
    "jjj = {'chuck':1,'fred':42, 'jan':100}\n",
    "print jjj, type(jjj)\n",
    "ooo = {}\n",
    "print ooo\n",
    "# ===================================\n",
    "names = {'zhen':5, 'marquard':3,'cwen':2, 'csev':3}\n",
    "print names, type(names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts = dict ()  \n",
    "names = ['csev', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'csev']\n",
    "# ------ names is a list\n",
    "for name in names:\n",
    "    if name not in counts:\n",
    "        counts [name] = 1  \n",
    "    else:  \n",
    "        counts [name] = counts [name] + 1  \n",
    "print counts "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Get Method\n",
    "# Long-hand form of counts.get(name, 0): print the stored count, or 0 if the key is absent.\n",
    "counts = dict ()  \n",
    "name = 'csev'  # defined here so the cell does not rely on a loop variable left by another cell\n",
    "if name in counts:\n",
    "    print counts[name]\n",
    "else:\n",
    "    print 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# counts = dict ()  \n",
    "print counts.get(name,0)\n",
    "# get(name=KeyName,0=ValueToGiveBackIfKeyIsntExist)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts = dict ()  \n",
    "listOfnames = ['zqian', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'owen']\n",
    "for name in listOfnames:\n",
    "    counts[name] = counts.get(name,0)+1 # Either to creat or to update\n",
    "    # get(name=KeyName,0=ValueToGiveBackIfKeyIsntExist)                                             \n",
    "print counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "test ={'owen': 2, 'zqian': 2, 'csev': 3, 'cwen': 1}\n",
    "print test.get('owen')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Find the most common word / top 10 words\n",
    "###### Counting Pattern ###########\n",
    "counts = dict()\n",
    "print 'Enter a line of text:'\n",
    "line = raw_input('')\n",
    "\n",
    "words = line.split()\n",
    "print 'Words: ', words\n",
    "\n",
    "print 'Counting...'\n",
    "for word in words:\n",
    "    counts[word] = counts.get(word,0) + 1\n",
    "print counts\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Loop through dictionaries\n",
    "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
    "for key in dicOfWords:\n",
    "    print key, dicOfWords[key]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Retrieving lists of keys and values\n",
    "# i.e. converting a dictionary to a list\n",
    "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
    "print 'Print dictionary as a list','\\n',list(dicOfWords)\n",
    "print 'Print dictionarys Keys','\\n',dicOfWords.keys()\n",
    "print 'Print dictionarys Values ','\\n',dicOfWords.values()\n",
    "print 'Print dictionary as an item set','\\n',dicOfWords.items()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Bonus: Two Iteration Variables!\n",
    "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
    "for i,j in dicOfWords.items(): # i= Key, j = value\n",
    "    print i,j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Find the word that occurs most often in a user-chosen file.\n",
    "fileName = raw_input('Entered file name:')\n",
    "try:\n",
    "    handle = open(fileName,'r')\n",
    "except:\n",
    "    # Report the name held in fileName, the variable this cell reads into.\n",
    "    print 'File cannt be opend', fileName \n",
    "    exit()\n",
    "text = handle.read() # as it is a small text we put it in one variable\n",
    "listOfWords = text.split()  \n",
    "# Tally each word: get() yields 0 for a word not seen before.\n",
    "counts = dict()\n",
    "for word in listOfWords:\n",
    "    counts[word] = counts.get(word,0) + 1\n",
    "\n",
    "# Track the largest count seen so far and the word it belongs to.\n",
    "bigcount = None\n",
    "bigword = None\n",
    "for word,count in counts.items():\n",
    "    if bigcount is None or count > bigcount:\n",
    "        bigword = word\n",
    "        bigcount = count\n",
    "print 'The bigest word is:',bigword,' Which appears  ', bigcount, 'times'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#========Tuple============\n",
    "# We use tuple for temporary variables (we use Tuple as temporary list)\n",
    "l = list()\n",
    "dir(l)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "t = tuple()\n",
    "dir(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "(x, y) = (4, 'Fred')\n",
    "print x\n",
    "print y\n",
    "a, b = (88, 99)\n",
    "print a\n",
    "print b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d = dict()\n",
    "d['aa'] = 2\n",
    "d['bb'] = 4\n",
    "for(k,v) in d.items():\n",
    "    print k, v\n",
    "#=========================\n",
    "tups = d.items() # We can get list of tuples\n",
    "print tups       # We can get list of tuple each eliments inside\n",
    "                 # the list is a Tuple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ========== Tuples are Comparable\n",
    "(1, 1, 2)>(0, 2, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Sorting Lists of Tuples\n",
    "d = {'b':1, 'a':10,'c':22}\n",
    "print d.items()\n",
    "t = sorted(d.items())\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for k, v in sorted(d.items()):\n",
    "    print k, v\n",
    "for k1, v1 in d.items():\n",
    "    print '===============\\n',k1, v1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sort by value instead of key\n",
    "c = {'b':1, 'a':10,'c':22}\n",
    "tmp = list()\n",
    "for k, v in c.items():\n",
    "    tmp.append((v,k))  # Value first, key second, so the sort compares values\n",
    "print tmp\n",
    "tmp.sort() # Ascending sort \n",
    "print tmp\n",
    "tmp.sort(reverse=True) # Descending sort \n",
    "print tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fname = raw_input('Enter the file name: ')\n",
    "try:\n",
    "    fhand = open(fname)\n",
    "except:\n",
    "    print 'File cannot be opened:', fname\n",
    "    exit()\n",
    "counts = dict()\n",
    "for line in fhand:\n",
    "    words = line.split()\n",
    "    for word in words:\n",
    "        counts[word] = counts.get(word,0)+1\n",
    "lst = list()\n",
    "for key, val in counts.items():\n",
    "    lst.append( (val, key) ) # put values in the list\n",
    "    \n",
    "lst.sort(reverse=True)\n",
    "\n",
    "for val, key in lst[:10]:\n",
    "    print key, val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fname = raw_input('Enter the file name: ')\n",
    "try:\n",
    "    fhand = open(fname)\n",
    "except:\n",
    "    print 'File cannot be opened:', fname\n",
    "    exit()\n",
    "#======================================== \n",
    "# Count how often each word occurs in the file.\n",
    "counts = dict()\n",
    "for line in fhand:\n",
    "    words = line.split()\n",
    "    for word in words:\n",
    "        counts[word] = counts.get(word,0)+1\n",
    "# Build (count, word) tuples and sort ascending, so the largest counts end up last.\n",
    "tups = sorted ( [ (v,k) for k,v in counts.items() ])\n",
    "# Show the last ten entries, i.e. the ten largest counts.\n",
    "print tups[-10:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fname = raw_input('Enter the file name: ')\n",
    "try:\n",
    "    fhand = open(fname)\n",
    "except:\n",
    "    print 'File cannot be opened:', fname\n",
    "    exit()\n",
    "#======================================== \n",
    "# Count words case-insensitively by lower-casing each one first.\n",
    "counts = dict()\n",
    "for line in fhand:\n",
    "    words = line.split()\n",
    "    for word in words:\n",
    "        wrd = word.lower()\n",
    "        counts[wrd] = counts.get(wrd,0)+1\n",
    "        \n",
    "# Flip each (word, count) pair to (count, word) so sorting orders by count.\n",
    "flipped = list()\n",
    "for kie, vaal in counts.items():\n",
    "    newtup = (vaal, kie)\n",
    "    flipped.append(newtup)\n",
    "print 'befor been sorted ============== \\n',flipped[-10:] # last ten entries, before sorting\n",
    "flipped.sort()\n",
    "print 'After been sorted ============== \\n',flipped[-10:] # last ten entries after the ascending sort\n",
    "flipped.sort(reverse=True)\n",
    "print 'After reverse sorting ============== \\n',flipped[-10:] # last ten entries after the descending sort\n",
    "\n",
    "# Print the first five (count, word) pairs of the descending list\n",
    "for vall, kay in flipped[:5]:\n",
    "    print \"Winner\", vall, kay\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "flipped.sort(reverse=True)\n",
    "print 'After reverse sorting ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "hand = open('mbox-short.txt')\n",
    "for line in hand:\n",
    "    line = line.rstrip()\n",
    "    if re.search('From:',line): # if From in Line\n",
    "        print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "hand = open('mbox-short.txt')\n",
    "for line in hand:\n",
    "    line = line.rstrip()\n",
    "    if re.search('^X-DSPAM-Result',line): # (^) if lien start with \"X-DSPAM-Result\"\n",
    "        print line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['2', '19', '42']\n"
     ]
    }
   ],
   "source": [
    "# Matching and Extracting Data\n",
    "import re\n",
    "x = \"My 2 favorite numbers are 19 and 42\"\n",
    "# [0-9]+ matches each run of one or more digits; no trailing space, so the final 42 is found too.\n",
    "y = re.findall(\"[0-9]+\",x)\n",
    "print y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['M']\n"
     ]
    }
   ],
   "source": [
    "import re     #Regular Expressions\n",
    "x = \"My 2 favorite numbers are 19 and 42\"\n",
    "y = re.findall(\"[ABCDEFGHM]+\",x)\n",
    "print y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['From: Using the :']\n"
     ]
    }
   ],
   "source": [
    "# Greedy Matching\n",
    "import re\n",
    "x = 'From: Using the : character'\n",
    "y = re.findall('^F.+:',x) # First character in the match is an F\n",
    "print y                   # Last character in the match is a :\n",
    "                          # (+) i.e. one or more characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['From:']\n"
     ]
    }
   ],
   "source": [
    "# Non-Greedy Matching\n",
    "import re\n",
    "x = 'From: Using the : character'\n",
    "y = re.findall('^F.+?:',x) # First character in the match is an F\n",
    "print y                    # Last character in the match is a :\n",
    "                           # (+?) i.e. one or more characters, BUT\n",
    "                           # NOT GREEDY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['a.shlash@ymail.com']\n"
     ]
    }
   ],
   "source": [
    "# Fine Tuning String Extraction\n",
    "x = 'From: a.shlash@ymail.com Sat bala abla bals lmsqlmfdzeu'\n",
    "y = re.findall('\\S+@\\S+',x) \n",
    "print y "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['uct.ac.za']\n"
     ]
    }
   ],
   "source": [
    "# Regex version of the double split\n",
    "import re\n",
    "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
    "y = re.findall('@([^ ]*)', lin) # Look through the string until you\n",
    "print y                         # find an at-sign (@)\n",
    "                    # ( and ) i.e. extract the non-blank characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['uct.ac.za']\n"
     ]
    }
   ],
   "source": [
    "# Regex version of the double split, anchored to lines starting with 'From '\n",
    "import re\n",
    "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
    "y = re.findall('^From .*@([^ ]*)', lin) # Look through the string until you\n",
    "print y                         # find an at-sign (@)\n",
    "                    # ( and ) i.e. extract the non-blank characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}