Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python
- # coding: UTF-8
- # modified code downloaded from:
- # http://devwiki.beloblotskiy.com/index.php5/Generic_HTML_Table_parser_(python)
- # mods by: Aquil H. Abdullah
- from HTMLParser import HTMLParser
- # import pdb
- # Print debug info
- markup_debug_low = not True
- markup_debug_med = not True
- class NestedTableError(Exception):
- """
- Error raised when TableParser finds a nested table.
- """
- def __init__(self, msg):
- self.msg = msg
- def __str__(self):
- return repr(self.msg)
- # Generic HTML table parser
- class TableParser(HTMLParser):
- """
- Class to handle extracting a table from an HTML Page.
- NOTE: Does not handle Tables within
- """
- def __init__(self):
- HTMLParser.__init__(self)
- # Can't use super HTMLParser is an old-style class
- # super(TableParser, self).__init__()
- self._tables = list() # Added to generic class
- self._curr_table = list() # Added to generic class
- self._curr_row = list()# Added to generic class
- self._curr_cell = '' # Added to generic class
- self._in_table = False # Added to generic class
- self._td_cnt = 0
- self._tr_cnt = 0
- self._curr_tag = ''
- self._colspan = 1
- def get_tables(self):
- """
- Return the list of tables scraped from html page
- """
- return self._tables
- def handle_starttag(self, tag, attrs):
- self._curr_tag = tag
- if tag.upper() == 'TABLE' and not self._in_table:
- self._in_table = True
- elif tag.upper() == 'TABLE' and self._in_table:
- raise NestedTableError("Parsing Failed Nested Table Found.")
- if tag == 'td':
- self._td_cnt += 1
- for attr in attrs:
- if attr[0].upper() == 'COLSPAN':
- self._colspan = int(attr[1])
- self.col_start(self._td_cnt)
- if markup_debug_low: print "<TD> --- %s ---" % self._td_cnt
- elif tag == 'tr':
- self._td_cnt = 0
- self._tr_cnt += 1
- self.row_start(self._tr_cnt)
- if markup_debug_low: print "<TR> === %s ===" % self._tr_cnt
- else:
- if markup_debug_low: print "<%s>" % tag
- def handle_endtag(self, tag):
- if tag.upper() == 'TABLE':
- self._in_table = False
- self._tables.append(self._curr_table)
- self._curr_table = list()
- if markup_debug_low: print "</%s>" % tag
- # it's possible to check "start tag - end tag" pair here (see, tag and self._curr_tag)
- if tag == 'tr':
- self.row_finish(self._tr_cnt)
- elif tag == 'td':
- self.col_finish(self._td_cnt)
- self._colspan = 1
- else:
- pass
- def handle_data(self, data):
- #if markup_debug_low: print u'[%s,%s] %s: "%s"' % (self._tr_cnt, self._td_cnt, self._curr_tag, unicode(data, 'mbcs'))
- self.process_raw_data(self._tr_cnt, self._td_cnt, self._curr_tag, data)
- # Overridable
- def process_raw_data(self, row, col, tag, data):
- if row > 0 and col > 0:
- self.process_cell_data(row, col, tag, data)
- else:
- pass # outside the table
- # Overridable
- def process_cell_data(self, row, col, tag, data):
- # pass
- self._curr_cell += data.strip() + ' '
- # Overridable
- def row_start(self, row):
- # pass
- self._curr_row = list()
- # Overridable
- def row_finish(self, row):
- # pass
- row = self._curr_row[:]
- self._curr_table.append(row)
- # Overridable
- def col_start(self, col):
- # pass
- self._curr_cell = ''
- # Overridable
- def col_finish(self, col):
- # pass
- self._curr_row.append(self._curr_cell)
- pad = self._colspan - 1
- if pad > 0:
- for i in range(pad):
- self._curr_row.append('')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement