Advertisement
akiladila

Python TableParser

May 11th, 2012
4,267
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.96 KB | None | 0 0
  1. #!/usr/local/bin/python
  2. # coding: UTF-8
  3. # modified code downloaded from:
  4. # http://devwiki.beloblotskiy.com/index.php5/Generic_HTML_Table_parser_(python)
  5. # mods by: Aquil H. Abdullah
  6. from HTMLParser import HTMLParser
  7. # import pdb
  8.  
  9. # Print debug info
  10. markup_debug_low = not True
  11. markup_debug_med = not True
  12.  
  13. class NestedTableError(Exception):
  14.     """
  15.    Error raised when TableParser finds a nested table.
  16.    """
  17.     def __init__(self, msg):
  18.         self.msg = msg
  19.  
  20.     def __str__(self):
  21.         return repr(self.msg)
  22.  
  23. # Generic HTML table parser
  24. class TableParser(HTMLParser):
  25.     """
  26.    Class to handle extracting a table from an HTML Page.
  27.    NOTE: Does not handle Tables within
  28.    """
  29.     def __init__(self):
  30.         HTMLParser.__init__(self)
  31.         # Can't use super HTMLParser is an old-style class
  32.         # super(TableParser, self).__init__()
  33.         self._tables = list() # Added to generic class
  34.         self._curr_table = list() # Added to generic class
  35.         self._curr_row = list()# Added to generic class
  36.         self._curr_cell = '' # Added to generic class
  37.         self._in_table = False # Added to generic class
  38.         self._td_cnt = 0
  39.         self._tr_cnt = 0
  40.         self._curr_tag = ''
  41.         self._colspan = 1
  42.     def get_tables(self):
  43.         """
  44.        Return the list of tables scraped from html page
  45.        """
  46.         return self._tables
  47.  
  48.     def handle_starttag(self, tag, attrs):
  49.         self._curr_tag = tag
  50.         if tag.upper() == 'TABLE' and not self._in_table:
  51.             self._in_table = True
  52.         elif tag.upper() == 'TABLE' and self._in_table:
  53.             raise NestedTableError("Parsing Failed Nested Table Found.")
  54.  
  55.         if tag == 'td':
  56.             self._td_cnt += 1
  57.             for attr in attrs:
  58.                 if attr[0].upper() == 'COLSPAN':
  59.                     self._colspan = int(attr[1])
  60.             self.col_start(self._td_cnt)
  61.             if markup_debug_low: print "<TD> --- %s ---" % self._td_cnt
  62.         elif tag == 'tr':
  63.             self._td_cnt = 0
  64.             self._tr_cnt += 1
  65.             self.row_start(self._tr_cnt)
  66.             if markup_debug_low: print "<TR> === %s ===" % self._tr_cnt
  67.         else:
  68.             if markup_debug_low: print "<%s>" % tag
  69.  
  70.     def handle_endtag(self, tag):
  71.         if tag.upper() == 'TABLE':
  72.             self._in_table = False
  73.             self._tables.append(self._curr_table)
  74.             self._curr_table = list()
  75.         if markup_debug_low: print "</%s>" % tag
  76.         # it's possible to check "start tag - end tag" pair here (see, tag and self._curr_tag)
  77.         if tag == 'tr':
  78.             self.row_finish(self._tr_cnt)
  79.         elif tag == 'td':
  80.             self.col_finish(self._td_cnt)
  81.             self._colspan = 1
  82.         else:
  83.             pass
  84.  
  85.     def handle_data(self, data):
  86.         #if markup_debug_low: print u'[%s,%s] %s: "%s"' % (self._tr_cnt, self._td_cnt, self._curr_tag, unicode(data, 'mbcs'))
  87.         self.process_raw_data(self._tr_cnt, self._td_cnt, self._curr_tag, data)
  88.  
  89.     # Overridable
  90.     def process_raw_data(self, row, col, tag, data):
  91.         if row > 0 and col > 0:
  92.             self.process_cell_data(row, col, tag, data)
  93.         else:
  94.             pass    # outside the table
  95.  
  96.     # Overridable
  97.     def process_cell_data(self, row, col, tag, data):
  98.         # pass
  99.         self._curr_cell += data.strip() + ' '
  100.  
  101.     # Overridable
  102.     def row_start(self, row):
  103.         # pass
  104.         self._curr_row = list()
  105.  
  106.     # Overridable
  107.     def row_finish(self, row):
  108.         # pass
  109.         row = self._curr_row[:]
  110.         self._curr_table.append(row)
  111.  
  112.     # Overridable
  113.     def col_start(self, col):
  114.         # pass
  115.         self._curr_cell = ''
  116.  
  117.     # Overridable
  118.     def col_finish(self, col):
  119.         # pass
  120.         self._curr_row.append(self._curr_cell)
  121.         pad = self._colspan - 1
  122.         if pad > 0:
  123.             for i in range(pad):
  124.                 self._curr_row.append('')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement