Advertisement
Atheuz

Parsing an HTML table to a multidimensional list, in python

Apr 5th, 2012
182
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.13 KB | None | 0 0
  1. # Credit for these functions goes to reclosedev, from this Stack Overflow question:
  2. # http://stackoverflow.com/questions/9978445/parsing-a-table-with-rowspan-and-colspan
  3. def table_to_list(table):
  4.     dct = table_to_2d_dict(table)
  5.     return list(iter_2d_dict(dct))
  6.  
  7. def table_to_2d_dict(table):
  8.     result = defaultdict(lambda : defaultdict(unicode))
  9.     for row_i, row in enumerate(table.xpath('./tr')):
  10.         for col_i, col in enumerate(row.xpath('./td|./th')):
  11.             colspan = int(col.get('colspan', 1))
  12.             rowspan = int(col.get('rowspan', 1))
  13.             col_data = col.text_content()
  14.             while row_i in result and col_i in result[row_i]:
  15.                 col_i += 1
  16.             for i in range(row_i, row_i + rowspan):
  17.                 for j in range(col_i, col_i + colspan):
  18.                     result[i][j] = col_data
  19.     return result
  20.  
  21. def iter_2d_dict(dct):
  22.     for i, row in sorted(dct.items()):
  23.         cols = []
  24.         for j, col in sorted(row.items()):
  25.             cols.append(col)
  26.         yield cols
  27.  
  28. # Call with table_rows = table_to_list(table_el), where table_el is an lxml.html table element.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement