Advertisement
Guest User

How to parse an HTML file with a table using Python

a guest
Mar 27th, 2012
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.00 KB | None | 0 0
  1. class MyHTMLParser(HTMLParser):
  2.  
  3. def handle_starttag(self, tag, attrs):
  4. if tag == 'tr':
  5. for class in attrs:
  6. if class == 'Table_row'
  7.  
  8. p = MyHTMLParser()
  9. p.feed(ht)
  10.  
  11. <table class="Table_rows" cellspacing="0" rules="all" border="1" id="MyDataGrid" style="width:700px;border-collapse:collapse;">
  12.  
  13. <tr class="Table_Heading">
  14.  
  15. <td>STATION CODE</td><td>STATION NAME</td><td>SCHEDULED ARRIVAL</td><td>SCHEDULED DEPARTURE</td><td>ACTUAL/ EXPECTED ARRIVAL</td><td>ACTUAL/ EXPECTED DEPARTURE</td>
  16.  
  17. </tr><tr class="Table_row">
  18.  
  19. <td>TVC </td><td style="width:160px;">ORIGON</td><td>Starting Station </td><td>05:00, 07 May 2011</td><td>Starting Station</td><td>05:00, 07 May 2011</td>
  20.  
  21. </tr><tr class="alternat_table_row">
  22.  
  23. <td>TVP </td><td>NEY YORK</td><td>05:04, 07 May 2011</td><td>05:05, 07 May 2011</td><td>05:04, 07 May 2011</td><td>05:05, 07 May 2011</td>
  24.  
  25. </tr>
  26. </table>
  27.  
  28. from HTMLParser import HTMLParser
  29.  
  30. class MyHTMLParser(HTMLParser):
  31. def handle_starttag(self, tag, attrs):
  32. if tag == 'tr':
  33. for name, value in attrs:
  34. if name == 'class':
  35. print 'Found class', value
  36.  
  37. p = MyHTMLParser()
  38. p.feed(ht)
  39.  
  40. Found class Table_Heading
  41. Found class Table_row
  42. Found class alternat_table_row
  43.  
  44. from BeautifulSoup import BeautifulSoup
  45.  
  46. html = '''
  47. <td>STATION CODE</td><td>STATION NAME</td><td>SCHEDULED ARRIVAL</td><td>SCHEDULED DEPARTURE</td><td>ACTUAL/ EXPECTED ARRIVAL</td><td>ACTUAL/ EXPECTED DEPARTURE</td>
  48. </tr><tr class="Table_row">
  49. <td>TVC </td><td style="width:160px;">ORIGON</td><td>Starting Station </td><td>05:00, 07 May 2011</td><td>Starting Station</td><td>05:00, 07 May 2011</td>
  50. '''
  51.  
  52. soup = BeautifulSoup(html)
  53. tag = soup.findAll('td', limit=2)
  54. tag_O = soup.findAll('td')[7]
  55.  
  56. for i in range(len(tag)):
  57. print tag[i].string
  58. print tag_O.string
  59.  
  60. '''Output-->
  61. STATION CODE
  62. STATION NAME
  63. ORIGON
  64. '''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement