Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <td xyz="123"><a href="blah.html">This is a line</a></td>
- <tr><td>New line</td></tr>
- <tr><td xyz="123"><a href="blah.html">CaptureThis</a></td></tr>
- <tr><td x?y?z?=?"?(\d\d\d)?"?>?<?a?.*?>?(.*?)?<?/?a?>?</td></tr>
- from xml.etree import ElementTree
- tree = ElementTree.parse('filename.html')
- for elem in tree.findall('tr'):
- print ElementTree.tostring(elem)
- >>> line1
- '<tr><td>New line</td></tr>'
- >>> line2
- '<tr><td xyz="123"><a href="blah.html">CaptureThis</a></td></tr>'
- >>> pattern2 = re.compile(r'>([\w\s]+)<')
- >>> pattern2.search(line1).group(1)
- 'New line'
- >>> pattern2.search(line2).group(1)
- 'CaptureThis'
- >>> pattern = re.compile(r'<td\s+\w+="([^"]*)">')
- >>> pattern.search(line2).group(1)
- '123'
- >>> text = '''<tr><td>New line</td></tr>
- <tr><td xyz="123"><a href="blah.html">CaptureThis</a></td></tr>
- <tr><td xyz="456">CaptureThisAlso</td></tr>
- '''
- >>> re.findall(r'<tr><td(?: xyz="(\d+)")?>(?:<a href=".*?">)?(.*?)(?:</a>)?</td></tr>', text)
- [('', 'New line'), ('123', 'CaptureThis'), ('456', 'CaptureThisAlso')]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement