Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false,
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "From: stephen.marquard@uct.ac.za\n",
- "From: louis@media.berkeley.edu\n",
- "From: zqian@umich.edu\n",
- "From: rjlowe@iupui.edu\n",
- "From: zqian@umich.edu\n",
- "From: rjlowe@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: gsilver@umich.edu\n",
- "From: gsilver@umich.edu\n",
- "From: zqian@umich.edu\n",
- "From: gsilver@umich.edu\n",
- "From: wagnermr@iupui.edu\n",
- "From: zqian@umich.edu\n",
- "From: antranig@caret.cam.ac.uk\n",
- "From: gopal.ramasammycook@gmail.com\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From: louis@media.berkeley.edu\n",
- "From: louis@media.berkeley.edu\n",
- "From: ray@media.berkeley.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n"
- ]
- }
- ],
- "source": [
- "# Print every line of the mailbox file that contains the text 'From:'.\n",
- "# The with-block closes the file when the loop ends, even if it raises.\n",
- "with open('mbox-short.txt') as hand:\n",
- "    for line in hand:\n",
- "        line = line.rstrip()  # drop the trailing newline so print() does not double it\n",
- "        if 'From:' in line:\n",
- "            print (line)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "From: stephen.marquard@uct.ac.za\n",
- "From: louis@media.berkeley.edu\n",
- "From: zqian@umich.edu\n",
- "From: rjlowe@iupui.edu\n",
- "From: zqian@umich.edu\n",
- "From: rjlowe@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: gsilver@umich.edu\n",
- "From: gsilver@umich.edu\n",
- "From: zqian@umich.edu\n",
- "From: gsilver@umich.edu\n",
- "From: wagnermr@iupui.edu\n",
- "From: zqian@umich.edu\n",
- "From: antranig@caret.cam.ac.uk\n",
- "From: gopal.ramasammycook@gmail.com\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: david.horwitz@uct.ac.za\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From: louis@media.berkeley.edu\n",
- "From: louis@media.berkeley.edu\n",
- "From: ray@media.berkeley.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n",
- "From: cwen@iupui.edu\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "\n",
- "# Print every line of the mailbox file in which the regex 'From:' matches anywhere.\n",
- "# The with-block closes the file when the loop ends, even if it raises.\n",
- "with open('mbox-short.txt') as hand:\n",
- "    for line in hand:\n",
- "        line = line.strip()\n",
- "        if re.search('From:', line):\n",
- "            print (line)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From louis@media.berkeley.edu Fri Jan 4 18:10:48 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From zqian@umich.edu Fri Jan 4 16:10:39 2008\n",
- "From: zqian@umich.edu\n",
- "From rjlowe@iupui.edu Fri Jan 4 15:46:24 2008\n",
- "From: rjlowe@iupui.edu\n",
- "From zqian@umich.edu Fri Jan 4 15:03:18 2008\n",
- "From: zqian@umich.edu\n",
- "From rjlowe@iupui.edu Fri Jan 4 14:50:18 2008\n",
- "From: rjlowe@iupui.edu\n",
- "From cwen@iupui.edu Fri Jan 4 11:37:30 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Fri Jan 4 11:35:08 2008\n",
- "From: cwen@iupui.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:12:37 2008\n",
- "From: gsilver@umich.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:11:52 2008\n",
- "From: gsilver@umich.edu\n",
- "From zqian@umich.edu Fri Jan 4 11:11:03 2008\n",
- "From: zqian@umich.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:10:22 2008\n",
- "From: gsilver@umich.edu\n",
- "From wagnermr@iupui.edu Fri Jan 4 10:38:42 2008\n",
- "From: wagnermr@iupui.edu\n",
- "From zqian@umich.edu Fri Jan 4 10:17:43 2008\n",
- "From: zqian@umich.edu\n",
- "From antranig@caret.cam.ac.uk Fri Jan 4 10:04:14 2008\n",
- "From: antranig@caret.cam.ac.uk\n",
- "From gopal.ramasammycook@gmail.com Fri Jan 4 09:05:31 2008\n",
- "From: gopal.ramasammycook@gmail.com\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 07:02:32 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 06:08:27 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 04:49:08 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 04:33:44 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From stephen.marquard@uct.ac.za Fri Jan 4 04:07:34 2008\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From louis@media.berkeley.edu Thu Jan 3 19:51:21 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From louis@media.berkeley.edu Thu Jan 3 17:18:23 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From ray@media.berkeley.edu Thu Jan 3 17:07:00 2008\n",
- "From: ray@media.berkeley.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:34:40 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:29:07 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:23:48 2008\n",
- "From: cwen@iupui.edu\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "\n",
- "# '^F.*:' matches any line that starts with 'F' and has a colon somewhere after it,\n",
- "# so the 'From <address> <timestamp>' lines match as well as the 'From:' headers.\n",
- "# Use the stricter pattern '^From:' to keep only the header lines.\n",
- "with open('mbox-short.txt') as hand:  # closed automatically when the block exits\n",
- "    for line in hand:\n",
- "        line = line.rstrip()\n",
- "        if re.search('^F.*:', line):\n",
- "            print (line)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From louis@media.berkeley.edu Fri Jan 4 18:10:48 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From zqian@umich.edu Fri Jan 4 16:10:39 2008\n",
- "From: zqian@umich.edu\n",
- "From rjlowe@iupui.edu Fri Jan 4 15:46:24 2008\n",
- "From: rjlowe@iupui.edu\n",
- "From zqian@umich.edu Fri Jan 4 15:03:18 2008\n",
- "From: zqian@umich.edu\n",
- "From rjlowe@iupui.edu Fri Jan 4 14:50:18 2008\n",
- "From: rjlowe@iupui.edu\n",
- "From cwen@iupui.edu Fri Jan 4 11:37:30 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Fri Jan 4 11:35:08 2008\n",
- "From: cwen@iupui.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:12:37 2008\n",
- "From: gsilver@umich.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:11:52 2008\n",
- "From: gsilver@umich.edu\n",
- "From zqian@umich.edu Fri Jan 4 11:11:03 2008\n",
- "From: zqian@umich.edu\n",
- "From gsilver@umich.edu Fri Jan 4 11:10:22 2008\n",
- "From: gsilver@umich.edu\n",
- "From wagnermr@iupui.edu Fri Jan 4 10:38:42 2008\n",
- "From: wagnermr@iupui.edu\n",
- "From zqian@umich.edu Fri Jan 4 10:17:43 2008\n",
- "From: zqian@umich.edu\n",
- "From antranig@caret.cam.ac.uk Fri Jan 4 10:04:14 2008\n",
- "From: antranig@caret.cam.ac.uk\n",
- "From gopal.ramasammycook@gmail.com Fri Jan 4 09:05:31 2008\n",
- "From: gopal.ramasammycook@gmail.com\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 07:02:32 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 06:08:27 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 04:49:08 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From david.horwitz@uct.ac.za Fri Jan 4 04:33:44 2008\n",
- "From: david.horwitz@uct.ac.za\n",
- "From stephen.marquard@uct.ac.za Fri Jan 4 04:07:34 2008\n",
- "From: stephen.marquard@uct.ac.za\n",
- "From louis@media.berkeley.edu Thu Jan 3 19:51:21 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From louis@media.berkeley.edu Thu Jan 3 17:18:23 2008\n",
- "From: louis@media.berkeley.edu\n",
- "From ray@media.berkeley.edu Thu Jan 3 17:07:00 2008\n",
- "From: ray@media.berkeley.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:34:40 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:29:07 2008\n",
- "From: cwen@iupui.edu\n",
- "From cwen@iupui.edu Thu Jan 3 16:23:48 2008\n",
- "From: cwen@iupui.edu\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "\n",
- "# Print lines that start with 'F', followed by at least one character and then a colon.\n",
- "# re.search is enough for a match / no-match test; re.findall would build a list\n",
- "# only for its truthiness to be checked.\n",
- "with open('mbox-short.txt') as hand:  # closed automatically when the block exits\n",
- "    for line in hand:\n",
- "        line = line.rstrip()\n",
- "        if re.search('^F.+?:', line):\n",
- "            print (line)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['2', '19', '42']\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "\n",
- "# Extract every run of consecutive digits from a sample string.\n",
- "# No file is needed here, so none is opened.\n",
- "line = '2 19 42'\n",
- "y = re.findall('[0-9]+', line)\n",
- "print (y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['stephen.marquard@uct.ac.za']\n",
- "['stephen.marquard@uct.ac.za']\n",
- "['From stephen.marquard@uct.ac.za']\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "\n",
- "x = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
- "\n",
- "# Patterns are raw strings so that '\\S' reaches the regex engine unchanged;\n",
- "# in a normal string literal it is an invalid escape sequence.\n",
- "\n",
- "# Any run of non-whitespace characters containing an '@'.\n",
- "y = re.findall(r'\\S+@\\S+', x)\n",
- "print (y)\n",
- "\n",
- "# With a capture group, findall returns only the parenthesised part (the address).\n",
- "y = re.findall(r'^From (\\S+@\\S+)', x)\n",
- "print (y)\n",
- "\n",
- "# Without a group, findall returns the whole match, including the 'From ' prefix.\n",
- "y = re.findall(r'^From \\S+@\\S+', x)\n",
- "print (y)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "uct.ac.za\n",
- "uct.ac.za\n",
- "['uct.ac.za']\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'\n",
- "\n",
- "# 1) Slice out the text between the '@' and the first space that follows it.\n",
- "at_index = data.find('@')\n",
- "host = data[at_index + 1 : data.find(' ', at_index)]\n",
- "print (host)\n",
- "\n",
- "# 2) Take the second whitespace-separated word and keep what follows its '@'.\n",
- "address = data.split()[1]\n",
- "print (address.split('@')[1])\n",
- "\n",
- "# 3) Regex: capture the run of non-space characters right after the '@'.\n",
- "y = re.findall('@([^ ]*)', data)\n",
- "print (y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Maximum: 0.9907\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "# Collect the number on every 'X-DSPAM-Confidence:' line and report the largest.\n",
- "numlist = list()\n",
- "with open('mbox-short.txt') as hand:  # closed automatically when the block exits\n",
- "    for line in hand:\n",
- "        line = line.rstrip()\n",
- "        # The pattern is anchored with '^', so a line yields at most one match.\n",
- "        found = re.search('^X-DSPAM-Confidence: ([0-9.]+)', line)\n",
- "        if found is None:\n",
- "            continue\n",
- "        numlist.append(float(found.group(1)))\n",
- "\n",
- "# max() raises ValueError on an empty list, so handle the no-match case explicitly.\n",
- "if numlist:\n",
- "    print ('Maximum:', max(numlist))\n",
- "else:\n",
- "    print ('No X-DSPAM-Confidence lines found')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['$10.00']\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "x = 'We just received $10.00 for cookies'\n",
- "# Raw string: the backslash must reach the regex engine so that '$' is matched\n",
- "# literally instead of acting as the end-of-string anchor.\n",
- "y = re.findall(r'\\$[0-9.]+', x)\n",
- "print (y)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement