Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- \documentclass{beamer}
- \usepackage{beamerthemesplit}
- \usepackage{times}
- \usepackage[english]{babel}
- \usepackage[latin1]{inputenc}
- \usepackage{times}
- \usepackage[T1]{fontenc}
- \usepackage{graphicx}
- \input{pythonlisting}
- \mode<presentation>{
- \usetheme{Warsaw}
- \setbeamercovered{invisible}
- }
- \mode<handout>{
- \usepackage{pgfpages}
- \pgfpagesuselayout{4 on 1}[a4paper,border shrink=5mm,landscape]
- \setbeamercolor{background canvas}{bg=black!5}
- }
- \title[Regular Expressions]{UCT Algorithm Circle: Intermediate Class: Regular Expressions}
- \author{Yaseen Hamdulay}
- \institute{Wynberg Boys High School}
- \date{4 March 2010}
- \pgfdeclareimage[height=20pt]{university-logo}{images/uct}
- \pgfdeclareimage[width=200px]{regular-expresions-xkcd}{images/regular-expressions-xkcd}
- \logo{\pgfuseimage{university-logo}}
- \begin{document}
- \begin{frame}
- \titlepage
- \end{frame}
- \section{Introduction to Regular Expressions}
- \begin{frame}{What are Regular Expressions}
- \begin{itemize}
- \item Regular Expressions are a powerful way to search for patterns in a string
- \item Regular Expressions are described in their own language (Yup, new language coming through)
- \pause
- \item It's best to avoid them if it's possible to have the same result with the standard string functions
- \begin{itemize}
- \item Regular Expressions are hard to debug and understand in retrospect
- \end{itemize}
- \end{itemize}
- \end{frame}
- \begin{frame}{Why use Regular Expressions?}
- \begin{itemize}
- \item Since Regular Expressions are so complicated why would we want to use them?
- \pause
- \item Well for one...
- \pause
- \item You could save someone from a serial-killer!
- \end{itemize}
- \end{frame}
- \begin{frame}
- \begin{center}
- \pgfuseimage{regular-expresions-xkcd}
- \end{center}
- \end{frame}
- \begin{frame}{Why use Regular Expressions?}
- If somehow saving someone's life isn't a good enough reason for you...
- \begin{itemize}
- \item Regular expressions can match very complicated patterns in strings
- \pause
- \item They can do things that just aren't possible with standard string functions
- \pause
- \item Regular expressions can validate that the data is valid
- \pause
- \item Regular expressions can separate a string into groups of useful information
- \begin{itemize}
- \item For example, separating a URI into its protocol, domain, directory, etc.
- \end{itemize}
- \end{itemize}
- \end{frame}
- \section{Regular Expressions syntax}
- \begin{frame}{Simple Pattern Matching}
- \begin{itemize}
- \item Plain text characters in a pattern match themselves.
- \pause
- \item For example using a regular expression of 'name'
- \item Would match \texttt{'My \textbf{name} is Yaseen'}
- \pause
- \item We have a problem if we have a string like this \texttt{'My sur\textbf{name} is Yaseen'}
- \pause
- \item This is where regular expressions become very useful
- \item We can specify in exactly what context we want our match to be in
- \end{itemize}
- \end{frame}
- \begin{frame}
- \begin{itemize}
- \item In this case we want name to be a word on its own, not just some random part of a word
- \pause
- \item To do this we use a special symbol \texttt{\textbackslash{}b}, this matches a word boundary
- \pause
- \item We can now update our Regular Expression to \texttt{\textbackslash{}bname\textbackslash{}b}
- \item Our pattern will now match name but not surname or anything else
- \end{itemize}
- \end{frame}
- \begin{frame}{Character Classes}
- \begin{itemize}
- \item Sometimes we have a pattern that we want to match but certain parts of it are unknown, but we know how it's supposed to look
- \pause
- \item To match the unknown characters we use character classes. Character classes match a range of characters that we specify
- \item To match the letters of the alphabet we use the class \texttt{[a-zA-Z]}
- \item The \texttt{[} and \texttt{]} make this a character class
- \pause
- \item Only \textbf{one} character in the character class gets matched
- \pause
- \item Special characters don't need to be escaped when inside a character class
- \end{itemize}
- \end{frame}
- \begin{frame}{Special Characters}
- \begin{itemize}
- \item Examples of special characters \texttt{. \textasciicircum{} \$ * + ? \{ \} [ ] \textbackslash{} | ( ) }
- \item Most characters simply match themselves, special characters do special things
- \item We have seen an example of this with the \texttt{\textbackslash{}b} symbol
- \pause
- \item If we want to match the character literal itself we will have to escape it first
- \pause
- \item Say we wanted to match \texttt{\textbackslash{}her} in \texttt{his\textbackslash{}her}
- \pause
- \item Our pattern would have to be \texttt{'\textbackslash{}\textbackslash{}her'}
- \end{itemize}
- \end{frame}
- \begin{frame}{Summary of Special and Meta Characters}
- \begin{tabular}{l l}
- \texttt{.} & Match anything\\
- \texttt{\textasciicircum{}} & Match the beginning of a string\\
- \texttt{\$} & Match the end of a string\\
- \texttt{\textbackslash{}b} & Match a word boundary\\
- \texttt{\textbackslash{}B} & Match anything besides a word boundary\\
- \texttt{\textbackslash{}d} & Match any decimal digit\\
- \texttt{\textbackslash{}D} & Match anything besides a decimal digit\\
- \texttt{\textbackslash{}s} & Match any whitespace\\
- \texttt{\textbackslash{}S} & Match anything besides whitespace\\
- \texttt{\textbackslash{}w} & Match anything in the class [a-zA-Z0-9\_]\\
- \texttt{\textbackslash{}W} & Match anything not in the class [a-zA-Z0-9\_]\\
- \end{tabular}
- \begin{itemize}
- \item NOTE: Special Characters match themselves when in a character class
- \end{itemize}
- \end{frame}
- \begin{frame}{Repetition}
- \begin{itemize}
- \item Sometimes we want to allow certain parts of our pattern to be repeated
- \pause
- \item We want to be able to match \texttt{This is really awesome} and \texttt{This is really really really awesome} with one pattern
- \pause
- \item To do this we use the repetition metacharacters
- \begin{tabular}{l l}
- \texttt{*} & match the previous pattern zero or more times\\
- \texttt{+} & match the previous pattern one or more times\\
- \texttt{?} & match the previous pattern zero or one time\\
- \texttt{\{a,b\}} & match the previous pattern a to b times\\
- \end{tabular}
- \pause
- \item Now to match the previous example we would use a pattern of \texttt{(really\textbackslash{}s)+}
- \item The brackets ensure that the repetition character applies to the entire group of characters and not just the last character
- \end{itemize}
- \end{frame}
- \section{Using Regular Expressions}
- \begin{frame}{Regular Expressions in Python}
- \begin{itemize}
- \item Regular Expression library is in module \texttt{re}
- \item In order to use a pattern to match a string we first have to compile it using \texttt{re.compile(pattern)}. This gives us a pattern object
- \item This pattern object contains all the methods we need to search and replace things in strings
- \item All searching methods return \texttt{None} when no match was made
- \end{itemize}
- \end{frame}
- \begin{frame}[fragile]{Pattern Methods}
- \begin{python}
- import re
- p = re.compile(r'\b[a-z0-9.]+@[a-z0-9.]+\.[a-z]{2,4}\b')
- #Searches through the string for a match
- #and returns a match object
- p.search('username@email.com')
- #Finds all substrings that match the pattern
- #and returns it in a list of strings
- p.findall('My email address is: username@email.com'+
- ' or whateverelse@example.com')
- #Same as findall except it returns an iterator
- #that gives us match objects
- p.finditer('My email address is: '+
- 'username@email.com or whateverelse@example.com')
- \end{python}
- \end{frame}
- \begin{frame}{Match methods}
- \begin{tabular}{l l}
- \texttt{group()} & Return the string matched by the Regular Expression\\
- \texttt{start()} & Return the beginning index of the match\\
- \texttt{end()} & Return the end index of the match\\
- \texttt{span()} & Return a tuple (start(), end())\\
- \end{tabular}
- \end{frame}
- \begin{frame}[fragile]{Regular Expressions in Java}
- \begin{verbatim}
- import java.util.regex.Pattern;
- import java.util.regex.Matcher;
- public class Regex {
- public static void main(String [] args) {
- Pattern p = Pattern.compile("\\b[a-z0-9.]+@[a-z0-9.]+\\.[a-z]{2,4}\\b");
- Matcher m = p.matcher("some@email.com");
- while(m.find()) {
- System.out.println("Found match: "+m.group());
- System.out.println("Begins at: "+m.start());
- System.out.println("Ends at: "+m.end());
- }
- }
- }
- \end{verbatim}
- \end{frame}
- \end{document}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement