Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0 b\n",
- "1 a\n",
- "2 n\n",
- "3 a\n",
- "4 n\n",
- "5 a\n"
- ]
- }
- ],
- "source": [
- "fruit = 'banana'\n",
- "index = 0\n",
- "while index < len(fruit):\n",
- " letter = fruit[index]\n",
- " print index, letter\n",
- " index = index + 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fruit = 'banana'\n",
- "index = 0\n",
- "for letter in fruit:\n",
- " print index, letter\n",
- " index = index + 1\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "word = 'banana'\n",
- "count = 0\n",
- "for letter in word:\n",
- " if letter=='a':\n",
- " count = count + 1\n",
- "print count"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "greet = 'Hello Bob'\n",
- "nstr = greet.replace('Bob','Jane')\n",
- "print nstr"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "greet = 'Hello Bob'\n",
- "nstr = greet.replace('o','X')\n",
- "print nstr"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "greet = ' Hello Bob '\n",
- "greet.lstrip()\n",
- "greet.rstrip()\n",
- "greet.strip()\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "line = 'Please have a nice day'\n",
- "line.startswith('Please')\n",
- "line.startswith('please')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
- "atpos = data.find('@')\n",
- "print atpos"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "sppos = data.find(' ',atpos)\n",
- "print sppos"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "host = data[atpos+1 :sppos]\n",
- "# up to but not including the space\n",
- "print host"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "print fhand\n",
- "count = 0\n",
- "for line in fhand:\n",
- " count = count + 1\n",
- "print 'Line Count:', count"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " if line.startswith('From:'):\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " line = line.rstrip()\n",
- " if line.startswith('From:'):\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " line = line.rstrip()\n",
- " #Skip uninteresting lines\n",
- " if not line.startswith('From:'):\n",
- " continue\n",
- " #Process our interesting lines\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " line = line.rstrip()\n",
- " #Skip uninteresting lines\n",
- " if not '@uct.ac.za' in line:\n",
- " continue\n",
- " #Process our interesting lines\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fname = raw_input('Enter the file name')\n",
- "fhand = open(fname)\n",
- "count = 0\n",
- "for line in fhand:\n",
- " if line.startswith('Subject:'):\n",
- " count = count + 1\n",
- "print 'There were', count, 'subject line in', fname"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fname = raw_input('Enter the file name: ')\n",
- "try:\n",
- " fhand = open(fname)\n",
- "except:\n",
- " print 'File cannot be opened:', fname\n",
- " exit()\n",
- "\n",
- "count = 0\n",
- "for line in fhand:\n",
- " if line.startswith('From:'):\n",
- " line = line.rstrip()\n",
- " count = count + 1\n",
- " print line\n",
- "print 'There were', count, 'From line in', fname\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "x = 'X-DSPAM-Confidence: 0.8475'\n",
- "print x\n",
- "pos = x.find(' ')\n",
- "num = float(x[pos+1:])\n",
- "print num,'Is a',type(num)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "lotto = [2, 14, 28, 41, 63]\n",
- "lotto[2] = ['Paris', 'Lyon', 'Nice']\n",
- "print lotto"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fruit = 'Banana'\n",
- "print fruit[0]\n",
- "fruit = fruit.lower()\n",
- "print fruit"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Range Function\n",
- "print range(10)\n",
- "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
- "print range(len(cities))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
- "for city in cities:\n",
- " print 'You are wellecome to ', city"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "cities = ['Paris', 'Lyon', 'Nice', 'Nantes']\n",
- "for i in range(len(cities)):\n",
- " city = cities[i]\n",
- " print 'You are wellecome to ', city"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "#Slicing List\n",
- "t=[9, 11, 43, 2 ,55,99]\n",
- "t[1:3]\n",
- "#Remember: The second number is \"Up to but not including\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "stuff = list()\n",
- "stuff.append('book')\n",
- "stuff.append('cookies')\n",
- "print stuff\n",
- "stuff.append(99)\n",
- "print stuff\n",
- "stuff.sort()\n",
- "print stuff"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "total = 0\n",
- "count = 0\n",
- "while True:\n",
- " inp = raw_input('Enter a number:')\n",
- " if inp == 'done':break\n",
- " value = float(inp)\n",
- " total = total + value\n",
- " count = count + 1\n",
- "\n",
- "average = total/count\n",
- "print 'Average:', average"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "numlist = list()\n",
- "while True:\n",
- " inp = raw_input('Enter a number:')\n",
- " if inp == 'done':break\n",
- " value = float(inp)\n",
- " numlist.append(value)\n",
- "\n",
- "average = sum(numlist)/len(numlist)\n",
- "print 'Average:', average"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "######### Lists & Strings\n",
- "abc = ' Welcome to Python tutorial'\n",
- "stuff = abc.split()\n",
- "print type (abc)\n",
- "print type(stuff)\n",
- "print stuff[0]\n",
- "print stuff[len(stuff)-1]\n",
- "for i in stuff:\n",
- " print i"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "######### Lists & Strings\n",
- "line = 'first;second;third'\n",
- "thing = line.split()\n",
- "print thing, \"This list has \", len(thing), \"Elements\"\n",
- "thing2 = line.split(';')\n",
- "print thing2, \"This list has \", len(thing2), \"Elements\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " line = line.rstrip()\n",
- " if not line.startswith('From '):continue\n",
- " words = line.split()\n",
- " print words[2:7]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "###### Double Split Pattern\n",
- "fhand = open('mbox-short.txt')\n",
- "for line in fhand:\n",
- " line = line.rstrip()\n",
- " if not line.startswith('From '):continue\n",
- " words = line.split()\n",
- " email = words[1]\n",
- " pieces = email.split('@')\n",
- " print pieces[1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Dictionaries - Key/Value\n",
- "purse = dict()\n",
- "purse['money'] = 12\n",
- "purse['candy'] = 3\n",
- "purse['tissues'] = 75\n",
- "print purse, '\\n'\n",
- "print purse['candy']\n",
- "purse['candy'] = purse['candy']+2\n",
- "print purse\n",
- "# ===================================\n",
- "# key=chuck, value=1\n",
- "jjj = {'chuck':1,'fred':42, 'jan':100}\n",
- "print jjj, type(jjj)\n",
- "ooo = {}\n",
- "print ooo\n",
- "# ===================================\n",
- "names = {'zhen':5, 'marquard':3,'cwen':2, 'csev':3}\n",
- "print names, type(names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "counts = dict () \n",
- "names = ['csev', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'csev']\n",
- "# ------ names is a list\n",
- "for name in names:\n",
- " if name not in counts:\n",
- " counts [name] = 1 \n",
- " else: \n",
- " counts [name] = counts [name] + 1 \n",
- "print counts "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Get Method\n",
- "counts = dict () \n",
- "if name in counts:\n",
- " print counts[name]\n",
- "else:\n",
- " print 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# counts = dict () \n",
- "print counts.get(name,0)\n",
- "# get(key, default): returns the value stored under key, or default (here 0) if the key does not exist\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "counts = dict () \n",
- "listOfnames = ['zqian', 'owen', 'csev', 'zqian', 'cwen', 'csev', 'csev', 'owen']\n",
- "for name in listOfnames:\n",
- " counts[name] = counts.get(name,0)+1 # Either creates the entry or updates it\n",
- " # get(key, default): returns the value stored under key, or default (here 0) if the key does not exist \n",
- "print counts"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "test ={'owen': 2, 'zqian': 2, 'csev': 3, 'cwen': 1}\n",
- "print test.get('owen')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "#Find most common word/Top 10 words\n",
- "###### Counting Pattern ###########\n",
- "counts = dict()\n",
- "print 'Enter a line of text:'\n",
- "line = raw_input('')\n",
- "\n",
- "words = line.split()\n",
- "print 'Words: ', words\n",
- "\n",
- "print 'Counting...'\n",
- "for word in words:\n",
- " counts[word] = counts.get(word,0) + 1\n",
- "print counts\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Loop through Dictionaries\n",
- "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
- "for key in dicOfWords:\n",
- " print key, dicOfWords[key]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Retrieving list of Keys and Values \n",
- "# i.e converting Dictionary to List\n",
- "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
- "print 'Print dictionary as a list','\\n',list(dicOfWords)\n",
- "print 'Print dictionarys Keys','\\n',dicOfWords.keys()\n",
- "print 'Print dictionarys Values ','\\n',dicOfWords.values()\n",
- "print 'Print dictionary as an item set','\\n',dicOfWords.items()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Bonus: Two Iteration Variables!\n",
- "dicOfWords = {'be': 18, 'question': 9, 'to': 18, 'not': 9, 'the': 9, 'or': 9, 'thats': 9}\n",
- "for i,j in dicOfWords.items(): # i= Key, j = value\n",
- " print i,j"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fileName = raw_input('Entered file name:')\n",
- "try:\n",
- " handle = open(fileName,'r')\n",
- "except:\n",
- " print 'File cannt be opend', fileName \n",
- " exit()\n",
- "text = handle.read() # as it is a small text we put it in one variable\n",
- "listOfWords = text.split() \n",
- "counts = dict()\n",
- "for word in listOfWords:\n",
- " counts[word] = counts.get(word,0) + 1\n",
- "\n",
- "bigcount = None\n",
- "bigword = None\n",
- "for word,count in counts.items():\n",
- " if bigcount == None or count > bigcount:\n",
- " bigword = word\n",
- " bigcount = count\n",
- "print 'The bigest word is:',bigword,' Which appears ', bigcount, 'times'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "#========Tuple============\n",
- "# We use tuple for temporary variables (we use Tuple as temporary list)\n",
- "l = list()\n",
- "dir(l)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "t = tuple()\n",
- "dir(t)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "(x, y) = (4, 'Fred')\n",
- "print x\n",
- "print y\n",
- "a, b = (88, 99)\n",
- "print a\n",
- "print b"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "d = dict()\n",
- "d['aa'] = 2\n",
- "d['bb'] = 4\n",
- "for(k,v) in d.items():\n",
- " print k, v\n",
- "#=========================\n",
- "tups = d.items() # items() gives us a list of tuples\n",
- "print tups # Each element inside\n",
- " # the list is a (key, value) tuple"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# ========== Tuples are Comparable\n",
- "(1, 1, 2)>(0, 2, 0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "#Sorting Lists of Tuples\n",
- "d = {'b':1, 'a':10,'c':22}\n",
- "print d.items()\n",
- "t = sorted(d.items())\n",
- "t"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "for k, v in sorted(d.items()):\n",
- " print k, v\n",
- "for k1, v1 in d.items():\n",
- " print '===============\\n',k1, v1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Sort by value instead of key\n",
- "c = {'b':1, 'a':10,'c':22}\n",
- "tmp = list()\n",
- "tmp2 = list()\n",
- "for k, v in c.items():\n",
- " tmp.append((v,k)) # Value first, Key second\n",
- "print tmp\n",
- "tmp.sort() # Ascending sort \n",
- "print tmp\n",
- "tmp.sort(reverse=True) # descending sort \n",
- "print tmp"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fname = raw_input('Enter the file name: ')\n",
- "try:\n",
- " fhand = open(fname)\n",
- "except:\n",
- " print 'File cannot be opened:', fname\n",
- " exit()\n",
- "counts = dict()\n",
- "for line in fhand:\n",
- " words = line.split()\n",
- " for word in words:\n",
- " counts[word] = counts.get(word,0)+1\n",
- "lst = list()\n",
- "for key, val in counts.items():\n",
- " lst.append( (val, key) ) # put values in the list\n",
- " \n",
- "lst.sort(reverse=True)\n",
- "\n",
- "for val, key in lst[:10]:\n",
- " print key, val"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fname = raw_input('Enter the file name: ')\n",
- "try:\n",
- " fhand = open(fname)\n",
- "except:\n",
- " print 'File cannot be opened:', fname\n",
- " exit()\n",
- "#======================================== \n",
- "counts = dict()\n",
- "for line in fhand:\n",
- " words = line.split()\n",
- " for word in words:\n",
- " counts[word] = counts.get(word,0)+1\n",
- "lst = list()\n",
- "for key, val in counts.items():\n",
- " lst.append( (val, key) ) # put values in the list\n",
- "tups = sorted ( [ (v,k) for k,v in counts.items() ])\n",
- "print tups[len(tups)-10:len(tups)]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "fname = raw_input('Enter the file name: ')\n",
- "try:\n",
- " fhand = open(fname)\n",
- "except:\n",
- " print 'File cannot be opened:', fname\n",
- " exit()\n",
- "#======================================== \n",
- "counts = dict()\n",
- "for line in fhand:\n",
- " words = line.split()\n",
- " for word in words:\n",
- " wrd = word.lower()\n",
- " counts[wrd] = counts.get(wrd,0)+1\n",
- " \n",
- "flipped = list()\n",
- "for kie, vaal in counts.items():\n",
- " newtup = (vaal, kie)\n",
- " flipped.append(newtup)\n",
- "print 'befor been sorted ============== \\n',flipped[len(flipped)-10:len(flipped)] # befor been sorted\n",
- "flipped.sort()\n",
- "print 'After been sorted ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n",
- "flipped.sort(reverse=True)\n",
- "print 'After reverse sorting ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n",
- "\n",
- "#@@@@@@@@ Print Top 5 Values\n",
- "for kay, vall in flipped[:5]:\n",
- " print \"Winner\", kay, vall\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "flipped.sort(reverse=True)\n",
- "print 'After reverse sorting ============== \\n',flipped[len(flipped)-10:len(flipped)] # After been sorted\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import re\n",
- "hand = open('mbox-short.txt')\n",
- "for line in hand:\n",
- " line = line.rstrip()\n",
- " if re.search('From:',line): # if From in Line\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import re\n",
- "hand = open('mbox-short.txt')\n",
- "for line in hand:\n",
- " line = line.rstrip()\n",
- " if re.search('^X-DSPAM-Result',line): # (^) if line starts with \"X-DSPAM-Result\"\n",
- " print line"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['2', '1', '9', '4', '2']\n"
- ]
- }
- ],
- "source": [
- "\"Matching and Extracting Data\"\n",
- "import re\n",
- "x = \"My 2 favorite numbers are 19 and 42\"\n",
- "y = re.findall(\"[0-9]+ \",x)\n",
- "print y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['M']\n"
- ]
- }
- ],
- "source": [
- "import re #Regular Expressions\n",
- "x = \"My 2 favorite numbers are 19 and 42\"\n",
- "y = re.findall(\"[ABCDEFGHM]+\",x)\n",
- "print y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['From Using the :']\n"
- ]
- }
- ],
- "source": [
- "# Greedy Matching\n",
- "import re\n",
- "x = 'From: Using the : character'\n",
- "y = re.findall('^F.+:',x) # First character in the match is an F\n",
- "print y # Last character in the match is a :\n",
- " # (+) i.e One or more character"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['From sfsdgerbfr :']\n"
- ]
- }
- ],
- "source": [
- "# Non-Greedy Matching\n",
- "import re\n",
- "x = 'From: Using the : character'\n",
- "y = re.findall('^F.+?:',x) # First character in the match is an F\n",
- "print y # Last character in the match is a :\n",
- " # (+) i.e One or more character BUT \n",
- " # NOT GREEDY "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['a.shlash@ymail.com']\n"
- ]
- }
- ],
- "source": [
- "# Fine Tuning String Extraction\n",
- "x = 'From: a.shlash@ymail.com Sat bala abla bals lmsqlmfdzeu'\n",
- "y = re.findall('\\S+@\\S+',x) \n",
- "print y "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['uct.ac.za']\n"
- ]
- }
- ],
- "source": [
- "#The Double Split Version \n",
- "import re\n",
- "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
- "y = re.findall('@([^ ]*)', lin) #Look through the string until you\n",
- "print y #find an at-sign(@)\n",
- " # ( and ) i.e. extract the non-blank characters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['uct.ac.za']\n"
- ]
- }
- ],
- "source": [
- "#The Double Split Version \n",
- "import re\n",
- "lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
- "y = re.findall('^From .*@([^ ]*)', lin) #Look through the string until you\n",
- "print y #find an at-sign(@)\n",
- " # ( and ) i.e. extract the non-blank characters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement