% Untitled

% Document class and package loading for the slide deck.
\documentclass{beamer}
\usepackage{beamerthemesplit}
% Text font, language, input encoding and font encoding.
\usepackage{times}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
% Local definitions file; presumably defines the `python' environment used on
% the "Pattern Methods" slide -- confirm against pythonlisting.tex.
% \input is used rather than \include: \include is meant for parts of the
% document body (it issues \clearpage and writes its own .aux file) and is
% not intended for pulling definitions into the preamble.
\input{pythonlisting}

% Settings used only in presentation mode: the Warsaw theme, with covered
% (not yet revealed) items hidden completely.
\mode<presentation>{
    \usetheme{Warsaw}
    \setbeamercovered{invisible}
}

% Settings used only in handout mode: four slides per landscape A4 page, each
% shrunk by a 5mm border, on a very light grey (5% black) background canvas.
\mode<handout>{
  \usepackage{pgfpages}
  \pgfpagesuselayout{4 on 1}[a4paper,border shrink=5mm,landscape]
  \setbeamercolor{background canvas}{bg=black!5}
}

% Title-page metadata. The bracketed argument of \title is the short title.
\title[Regular Expressions]{UCT Algorithm Circle: Intermediate Class: Regular Expressions}
\author{Yaseen Hamdulay}
\institute{Wynberg Boys High School}
\date{4 March 2010}

% Images are declared once under a symbolic name and placed with \pgfuseimage.
% NOTE(review): the name `regular-expresions-xkcd' is spelled with a single
% `s'; the frame that shows the comic uses the same spelling, so the two must
% be renamed together or not at all.
\pgfdeclareimage[height=20pt]{university-logo}{images/uct}
\pgfdeclareimage[width=200px]{regular-expresions-xkcd}{images/regular-expressions-xkcd}
% Register the university image as the presentation logo.
\logo{\pgfuseimage{university-logo}}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\section{Introduction to Regular Expressions}

\begin{frame}{What are Regular Expressions}
    \begin{itemize}
        \item Regular Expressions are a powerful way to search for patterns in a string
        \item Regular Expressions are described in their own language (Yup, new language coming through)
        \pause
        \item It's best to avoid them if it's possible to have the same result with the standard string functions
        \begin{itemize}
            \item Regular Expressions are hard to debug and understand in retrospect
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Why use Regular Expressions?}
    \begin{itemize}
        \item Since Regular Expressions are so complicated why would we want to use them?
        \pause
        \item Well for one...
        \pause
        \item You could save someone from a serial-killer!
    \end{itemize}
\end{frame}

\begin{frame}
        \begin{center}
            \pgfuseimage{regular-expresions-xkcd}
        \end{center}
\end{frame}

\begin{frame}{Why use Regular Expressions?}
    If somehow saving someone's life isn't a good enough reason for you\dots
    \begin{itemize}
        \item Regular expressions can match very complicated patterns in strings
        \pause
        \item They can do things that just aren't possible with standard string functions
        \pause
        \item Regular expressions can validate data, checking that it is in the expected format
        \pause
        \item Regular expressions can separate a string into groups of useful information
        \begin{itemize}
            \item For example, separating a URI into its protocol, domain, directory, etc.
        \end{itemize}
    \end{itemize}
\end{frame}

\section{Regular Expressions syntax}
\begin{frame}{Simple Pattern Matching}
    \begin{itemize}
        \item Plain text characters in a pattern match themselves.
        \pause
        \item For example using a regular expression of 'name'
        \item Would match \texttt{'My \textbf{name} is Yaseen'}
        \pause
        \item We have a problem if we have a string like this \texttt{'My sur\textbf{name} is Yaseen'}
        \pause
        \item This is where regular expressions become very useful
        \item We can specify in exactly what context we want our match to be in
    \end{itemize}
\end{frame}

\begin{frame}
    \begin{itemize}
        \item In this case we want name to be a word on its own, not just some random part of a word
        \pause
        \item To do this we use a special symbol \texttt{\textbackslash{}b}, this matches a word boundary
        \pause
        \item We can now update our Regular Expression to \texttt{\textbackslash{}bname\textbackslash{}b}
        \item Our pattern will now match name but not surname or anything else
    \end{itemize}
\end{frame}

\begin{frame}{Character Classes}
    \begin{itemize}
        \item Sometimes we have a pattern that we want to match but certain parts of it are unknown, but we know how it's supposed to look
        \pause
        \item To match the unknown characters we use character classes. Character classes match a range of characters that we specify
        \item To match the letters of the alphabet we use the class \texttt{[a-zA-Z]}
        \item The \texttt{[} and \texttt{]} make this a character class
        \pause
        \item Only \textbf{one} character in the character class gets matched
        \pause
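        \item Most special characters don't need to be escaped when inside a character class (but \texttt{\textbackslash{}}, \texttt{]}, a leading \texttt{\textasciicircum{}} and a \texttt{-} between two characters are still special)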
    \end{itemize}
\end{frame}

\begin{frame}{Special Characters}
    \begin{itemize}
        \item Examples of special characters \texttt{. \textasciicircum{} \$ * + ? \{ \} [ ] \textbackslash{} | ( ) }
        \item Most characters simply match themselves, special characters do special things
        \item We have seen an example of this with the \texttt{\textbackslash{}b} symbol
        \pause
        \item If we want to match the character literal itself we will have to escape it first
        \pause
        \item Say we wanted to match \texttt{\textbackslash{}her} in \texttt{his\textbackslash{}her}
        \pause
        \item Our pattern would have to be \texttt{'\textbackslash{}\textbackslash{}her'}
    \end{itemize}
\end{frame}

\begin{frame}{Summary of Special and Meta Characters}
    \begin{tabular}{l l}
        \texttt{.} & Match any character except a newline\\
        \texttt{\textasciicircum{}} & Match the beginning of the string\\
        \texttt{\$} & Match the end of the string\\
        \texttt{\textbackslash{}b} & Match a word boundary\\
        \texttt{\textbackslash{}B} & Match a position that is not a word boundary\\
        \texttt{\textbackslash{}d} & Match any decimal digit\\
        \texttt{\textbackslash{}D} & Match anything besides a decimal digit\\
        \texttt{\textbackslash{}s} & Match any whitespace\\
        \texttt{\textbackslash{}S} & Match anything besides whitespace\\
        \texttt{\textbackslash{}w} & Match anything in the class [a-zA-Z0-9\_]\\
        \texttt{\textbackslash{}W} & Match anything not in the class [a-zA-Z0-9\_]\\
    \end{tabular}
    \begin{itemize}
    \item NOTE: Most special characters match themselves when in a character class
    \end{itemize}
\end{frame}

\begin{frame}{Repetition}
    \begin{itemize}
        \item Sometimes we want to allow certain parts of our pattern to be repeated
        \pause
        \item We want to be able to match \texttt{This is really awesome} and \texttt{This is really really really awesome} with one pattern
        \pause
        \item To do this we use the repetition metacharacters
        \begin{tabular}{l  l}
        \texttt{*} & match the previous pattern zero or more times\\
        \texttt{+} & match the previous pattern one or more times\\
        \texttt{?} & match the previous pattern zero or one times\\
        \texttt{\{a,b\}} & match the previous pattern a to b times\\
        \end{tabular}
        \pause
        \item Now to match the previous example we would use a pattern of \texttt{(really\textbackslash{}s)+}
        \item The brackets group the characters so that the repetition applies to the entire group and not just to the last character
    \end{itemize}
\end{frame}

\section{Using Regular Expressions}
\begin{frame}{Regular Expressions in Python}
    \begin{itemize}
        \item Regular Expression library is in module \texttt{re}
        \item In order to use a pattern to match a string we first have to compile it, we use \texttt{re.compile(pattern)} for this. This gives us a pattern object
        \item This pattern object contains all the methods we need to search and replace things in strings
        \item \texttt{search} returns \texttt{None} when no match was made; \texttt{findall} returns an empty list
    \end{itemize}
\end{frame}

\begin{frame}[fragile]{Pattern Methods}
\begin{python}
import re
p = re.compile(r'\b[a-z0-9.]+@[a-z0-9.]+\.[a-z]{2,4}\b')
# Searches through the string for a match and
# returns a match object (None if nothing matches)
p.search('username@email.com')
# Finds all substrings that match the pattern
# and returns them in a list of strings
p.findall('My email address is: username@email.com'+
          ' or whateverelse@example.com')
# Same as findall except it returns an iterator
# that gives us match objects
p.finditer('My email address is: '+
        'username@email.com or whateverelse@example.com')
\end{python}
\end{frame}

\begin{frame}{Match methods}
    \begin{tabular}{l l}
    \texttt{group()} & Return the string matched by the Regular Expression\\
    \texttt{start()} & Return the beginning index of the match\\
    \texttt{end()} & Return the end index of the match\\
    \texttt{span()} & Return a tuple (start(), end())\\
    \end{tabular}
\end{frame}

\begin{frame}[fragile]{Regular Expressions in Java}
\begin{verbatim}
import java.util.regex.Pattern;
import java.util.regex.Matcher;

// Prints every e-mail address found in the input
public class Regex {
    public static void main(String [] args) {
        // Backslashes are doubled inside Java strings
        Pattern p = Pattern.compile(
            "\\b[a-z0-9.]+@[a-z0-9.]+\\.[a-z]{2,4}\\b");
        Matcher m = p.matcher("some@email.com");
        while(m.find()) {
            System.out.println("Found match: "+m.group());
            System.out.println("Begins at: "+m.start());
            System.out.println("Ends at: "+m.end());
        }
    }
}
\end{verbatim}
\end{frame}
\end{document}