Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jun 22nd, 2012  |  syntax: None  |  size: 1.43 KB  |  hits: 14  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. HTML parsing using lxml code
  2. <table class="results">
  3.   <tr>
  4.     <td>
  5.       <a href="..">link</a><span>2nd Mar 2011</span><br>XYZ Consultancy Ltd<br>
  6.        <div>....</div>
  7.     </td>
  8.   </tr>
  9. </table>
  10.        
  11. import lxml.html
  12. for el in root.cssselect("table.results"):    # NOTE(review): `root` is not defined in this snippet - presumably an already-parsed lxml document; confirm
  13.  for el2 in el: # children of the table (tr tags)
  14.   for e13 in el2: # children of each row (td tags)
  15.      for e14 in e13: # children of each cell; only <a> and <span> are handled below
  16.       if ( e14.tag == 'a') :
  17.          print "keyword: ",e14.text_content()  # Python 2 print statement
  18.       if (e14.tag == 'span'):
  19.          print "date: ",e14.text_content()  # text sitting after the <br> is not an element, so this loop never prints it
  20.        
  21. import lxml.html
  22. root = lxml.html.fromstring('''
  23. <table class="results">
  24.   <tr>
  25.     <td>
  26.       <a href="..">link</a><span>2nd Mar 2011</span><br>XYZ Consultancy Ltd<br>
  27.        <div>....</div>
  28.     </td>
  29.   </tr>
  30. </table>
  31. ''')  # parse the sample markup from the string literal above
  32. for br_with_tail in root.cssselect('table.results > tr > td > a + span + br'):  # each <br> immediately preceded by a <span> that is immediately preceded by an <a>
  33.     print br_with_tail.tail  # .tail holds the text that follows the element, up to the next sibling element
  34.     # => XYZ Consultancy Ltd   (output for the sample markup)
  35.        
  36. data = '''<table class="results">
  37.   <tr>
  38.     <td>
  39.       <a href="..">link</a><span>2nd Mar 2011</span><br>XYZ Consultancy Ltd<br>
  40.        <div>....</div>
  41.     </td>
  42.   </tr>
  43. </table>'''
  44.  
  45. root = etree.HTML(data)  # NOTE(review): `etree` is not imported in the visible snippet - presumably `from lxml import etree`; confirm
  46.  
  47. for e in root.xpath('//table[@class="results"]/tr/td/a'):  # every <a> directly inside a cell of a results table
  48.     parsed_tag = e.text
  49.     next = e.getnext()  # following sibling element, or None; NOTE(review): this local shadows the builtin `next`
  50.     if next is None or next.tag != 'span':
  51.         continue  # skip links that are not immediately followed by a <span>
  52.     parsed_date = next.text
  53.     next_next = next.getnext()
  54.     if next_next is None or next_next.tag != 'br':
  55.         continue  # skip entries where the <span> is not immediately followed by a <br>
  56.     print 'tag: ', parsed_tag  # Python 2 print statements
  57.     print 'date: ', parsed_date
  58.     print 'company: ', next_next.tail  # the text after the <br> is stored as that element's tail