Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # table2doc_demo.py
- import os
- import sys
- import argparse
- import subprocess
- from html.parser import HTMLParser
# Built-in sample table used by main() when --demo is given or no --file is
# supplied: one header row, two plain rows, and a final row whose first cell
# spans two columns (colspan="2").
demo_html = '''<table>
<thead>
<tr>
<th>Header 1</th>
<th>Header 2</th>
<th>Header 3</th>
</tr>
</thead>
<tbody>
<tr>
<td>Row 1, Cell 1</td>
<td>Row 1, Cell 2 with some longer content</td>
<td>Row 1, Cell 3</td>
</tr>
<tr>
<td>Row 2, Cell 1</td>
<td>Row 2, Cell 2</td>
<td>Row 2, Cell 3</td>
</tr>
<tr>
<td colspan="2">Merged cells</td>
<td>Last cell</td>
</tr>
</tbody>
</table>'''
def save_as_rtf(text_content, filename):
    """Write RTF text to ``filename``, falling back to ``~/Documents``.

    The text is first written to ``filename``.  If that fails (missing
    directory, permissions, invalid path, ...), a warning is printed to
    stderr and the same basename is written inside the user's ``Documents``
    folder, which is created if it does not exist yet.

    Args:
        text_content: Complete RTF document as a string.
        filename: Preferred output path.

    Returns:
        The path that was actually written, or ``None`` if both the primary
        and the fallback write failed.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text_content)
        return filename
    except (OSError, ValueError) as primary_error:
        # ValueError covers un-encodable text and malformed path strings.
        print(f"Warning: {primary_error}", file=sys.stderr)

    # Fallback to Documents folder
    docs_dir = os.path.join(os.path.expanduser('~'), 'Documents')
    docs_path = os.path.join(docs_dir, os.path.basename(filename))
    try:
        # The folder is not guaranteed to exist (fresh accounts, containers,
        # non-desktop systems); without this the fallback could never work.
        os.makedirs(docs_dir, exist_ok=True)
        with open(docs_path, 'w', encoding='utf-8') as f:
            f.write(text_content)
    except (OSError, ValueError) as fallback_error:
        print(f"Error saving file: {fallback_error}", file=sys.stderr)
        return None

    print(f"Saved to fallback location: {docs_path}")
    return docs_path
def open_file(filepath):
    """Launch the platform's default viewer for ``filepath``.

    Uses ``os.startfile`` on Windows, ``open`` on macOS and ``xdg-open``
    everywhere else.  Any failure is reported on stdout.

    Returns:
        ``True`` if the launcher ran without error, ``False`` otherwise.
    """
    platform = sys.platform
    try:
        if platform == 'win32':
            os.startfile(filepath)
        else:
            # macOS ships `open`; Linux and other systems rely on xdg-open.
            launcher = 'open' if platform == 'darwin' else 'xdg-open'
            subprocess.run([launcher, filepath], check=True)
    except Exception as e:
        print(f"Could not open file: {e}")
        return False
    return True
class TableParser(HTMLParser):
    """Collect the rows and cells of an HTML table.

    After ``feed()``, ``table_data`` is a list of rows; each row is a list of
    cell dicts with the keys ``"content"`` (str), ``"colspan"`` (int, always
    >= 1) and ``"is_header"`` (``True`` for ``<th>``).  Every ``<table>``
    start tag resets ``table_data``, so only the most recently opened table
    is kept.
    """

    # Upper bound for colspan (the HTML standard allows at most 1000); keeps a
    # bogus value from forcing an enormous column list in the converter.
    _MAX_COLSPAN = 1000

    def __init__(self):
        super().__init__()
        self.tags_stack = []
        self.in_table = False
        self.table_data = []
        self.current_row = []
        self.current_cell = {"content": "", "colspan": 1, "is_header": False}
        self.in_cell = False

    @classmethod
    def _parse_colspan(cls, raw):
        """Return ``raw`` as a colspan clamped to ``1.._MAX_COLSPAN``.

        Missing, valueless (``<td colspan>``), non-numeric, zero and negative
        values all fall back to 1 so downstream width maths never divides by
        zero or walks backwards.
        """
        try:
            value = int(raw)
        except (TypeError, ValueError):
            return 1
        return min(max(value, 1), cls._MAX_COLSPAN)

    def handle_starttag(self, tag, attrs):
        """Track table/row/cell openings and start a fresh cell record."""
        self.tags_stack.append(tag)
        attrs_dict = dict(attrs)
        if tag == 'table':
            self.in_table = True
            self.table_data = []
        elif tag == 'tr':
            self.current_row = []
        elif tag in ('td', 'th'):
            self.in_cell = True
            self.current_cell = {
                "content": "",
                "colspan": self._parse_colspan(attrs_dict.get('colspan', 1)),
                "is_header": (tag == 'th'),
            }

    def handle_endtag(self, tag):
        """Commit the finished cell or row when its closing tag arrives."""
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr':
            if self.current_row:  # Only add non-empty rows
                self.table_data.append(self.current_row.copy())
                # Reset so a stray second </tr> cannot add the row twice.
                self.current_row = []
        elif tag in ('td', 'th'):
            # Ignore a closing tag with no open cell; otherwise the previous
            # cell would be appended a second time.
            if self.in_cell:
                self.in_cell = False
                self.current_row.append(self.current_cell.copy())
        # Remove the tag from the stack (only when it is the innermost one;
        # void elements such as <br> are never popped).
        if self.tags_stack and self.tags_stack[-1] == tag:
            self.tags_stack.pop()

    def handle_data(self, data):
        """Append stripped text to the open cell, space-separating fragments."""
        if self.in_cell:
            data = data.strip()
            if data:
                if self.current_cell["content"]:
                    self.current_cell["content"] += " " + data
                else:
                    self.current_cell["content"] = data
_MIN_COL_WIDTH_TWIPS = 1500   # Narrowest column before any page-width scaling
_TWIPS_PER_CHAR = 120         # Rough horizontal space taken by one character
_SIDE_MARGINS_TWIPS = 1440    # \margl720 + \margr720 from the document header
_CELL_BORDER = r'\brdrs\brdrw5\brdrcf1'


def _rtf_escape(text):
    """Return ``text`` as pure-ASCII RTF body text.

    Backslashes and braces are escaped, newlines become ``\\line`` and every
    non-ASCII character is written as ``\\uN?`` (signed 16-bit UTF-16 code
    units, one ``?`` fallback character each, matching ``\\uc1`` in the
    header).
    """
    pieces = []
    for ch in text:
        code = ord(ch)
        if ch in '\\{}':
            pieces.append('\\' + ch)
        elif ch == '\n':
            pieces.append('\\line ')
        elif code < 0x80:
            pieces.append(ch)
        else:
            if code > 0xFFFF:
                # Outside the BMP: split into a UTF-16 surrogate pair.
                code -= 0x10000
                units = (0xD800 | (code >> 10), 0xDC00 | (code & 0x3FF))
            else:
                units = (code,)
            for unit in units:
                # \u takes a signed 16-bit decimal value.
                signed = unit - 0x10000 if unit > 0x7FFF else unit
                pieces.append(f'\\u{signed}?')
    return ''.join(pieces)


def _column_widths(table_data, max_cols, available_width):
    """Estimate per-column widths in twips, scaled to fit ``available_width``."""
    col_widths = [_MIN_COL_WIDTH_TWIPS] * max_cols
    for row in table_data:
        col_idx = 0
        for cell in row:
            span = max(1, cell["colspan"])
            # Spread the estimated text width evenly over the spanned columns.
            avg_width = _TWIPS_PER_CHAR * len(cell["content"]) // span
            for offset in range(span):
                if col_idx + offset < max_cols:
                    col_widths[col_idx + offset] = max(
                        col_widths[col_idx + offset], avg_width)
            col_idx += span

    total_width = sum(col_widths)
    if total_width > available_width:
        # Scale down proportionally, never collapsing a column to zero.
        scale_factor = available_width / total_width
        col_widths = [max(1, int(width * scale_factor)) for width in col_widths]
    return col_widths


def html_table_to_rtf(html_content, page_width=1200):
    """Convert an HTML table to an RTF document with borders and cell spans.

    Args:
        html_content: HTML text containing a ``<table>``.
        page_width: Page width in points; it is multiplied by 20 to obtain
            the twips value written to ``\\paperw``.

    Returns:
        The complete RTF document as a string.  When no table rows are
        found, the document contains a "No table data found" paragraph
        instead of a table.
    """
    # Page width in twips (1/20 of a point, 1/1440 of an inch)
    page_width_twips = int(page_width * 20)

    # Parse the HTML content
    parser = TableParser()
    parser.feed(html_content)
    table_data = parser.table_data

    # RTF header with document setup; colour table indices used below:
    # 1 = black, 2 = white, 3 = light gray, 4 = blue.
    rtf = []
    rtf.append(r'''{\rtf1\ansi\ansicpg1252\deff0\nouicompat\deflang1033
{\fonttbl{\f0\fswiss\fcharset0 Calibri;}{\f1\fswiss\fprq2\fcharset0 Arial;}}
{\colortbl;\red0\green0\blue0;\red255\green255\blue255;\red242\green242\blue242;\red79\green129\blue189;}
{\*\generator table2doc converter}
\viewkind4\uc1
\paperw''')
    rtf.append(str(page_width_twips))
    rtf.append(r'''\paperh15840\margl720\margr720\margt720\margb720
\pard\sa200\sl276\slmult1\qc\b\f0\fs28 Table Conversion\par
\pard\sa200\sl276\slmult1\b0\fs24
''')

    if not any(table_data):
        rtf.append(r'\pard\sa200\sl276\slmult1 No table data found\par')
        rtf.append('}')
        return ''.join(rtf)

    # Widest row, counted in grid columns (a colspan below 1 counts as 1).
    max_cols = max(
        sum(max(1, cell["colspan"]) for cell in row)
        for row in table_data if row
    )
    # Usable width between the margins; kept positive even for tiny pages so
    # cell boundaries always increase.
    available_width = max(page_width_twips - _SIDE_MARGINS_TWIPS, max_cols)
    col_widths = _column_widths(table_data, max_cols, available_width)

    # Begin table
    rtf.append(r'\par\pard')

    for row_idx, row in enumerate(table_data):
        if not row:  # Skip empty rows
            continue

        # Begin row
        rtf.append(r'\trowd\trgaph70\trleft0\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3')

        # The first row and any row containing a <th> get the thicker blue
        # border; all other rows get the thin black one.
        if row_idx == 0 or any(cell["is_header"] for cell in row):
            row_border = r'\brdrs\brdrw10\brdrcf4'
        else:
            row_border = r'\brdrs\brdrw5\brdrcf1'
        for side in 'tlbr':  # top, left, bottom, right
            rtf.append('\\trbrdr' + side + row_border)

        # Cell definitions: background, borders and right boundary position.
        pos = 0
        col_idx = 0
        for cell in row:
            span = max(1, cell["colspan"])
            # A spanning cell is as wide as all the columns it covers.
            pos += sum(col_widths[col_idx:col_idx + span])
            col_idx += span
            # Light gray background for headers, white for regular cells
            rtf.append(r'\clcbpat3' if cell["is_header"] else r'\clcbpat2')
            for side in 'tlbr':
                rtf.append('\\clbrdr' + side + _CELL_BORDER)
            rtf.append(f'\\cellx{pos}')
        rtf.append('\n')

        # Cell content
        for cell in row:
            if cell["is_header"]:
                rtf.append(r'{\intbl\qc\b\fs36 ')  # Centered, bold, 18 pt
            else:
                rtf.append(r'{\intbl\ql ')  # Left-aligned for regular cells
            rtf.append(_rtf_escape(cell["content"]))
            rtf.append(r'}\cell ')
        # Real newline after each row (a raw-string '\n' would emit a bogus
        # "\n" control word into the document).
        rtf.append('\\row\n')

    # End RTF document
    rtf.append(r'\pard\sa200\sl276\slmult1\par}')
    return ''.join(rtf)
def main():
    """Command-line entry point: convert an HTML table to an RTF file.

    Reads the HTML from ``--file`` (or uses the built-in demo table when
    ``--demo`` is given or no file is supplied), converts it, saves the
    result to ``--output`` and, unless ``--no-open`` is passed, tries to open
    it with the default application.  Exits with status 1 if the input
    cannot be read or the output cannot be saved.
    """
    parser = argparse.ArgumentParser(description='Convert HTML tables to RTF documents')
    parser.add_argument('--file', help='Input HTML file')
    parser.add_argument('--output', default='table.rtf', help='Output RTF file')
    parser.add_argument('--no-open', dest='open', action='store_false',
                        help='Do not open the output file after conversion')
    # The converter multiplies this value by 20 to get twips, so the unit
    # here is points, not twips.
    parser.add_argument('--width', type=int, default=1200,
                        help='Page width in points (written to the RTF as twips, '
                             '1/20 of a point)')
    parser.add_argument('--demo', action='store_true',
                        help='Use built-in demo table instead of input file')
    parser.set_defaults(open=True)
    args = parser.parse_args()

    if args.width <= 0:
        parser.error('--width must be a positive integer')

    if args.demo or not args.file:
        html_content = demo_html
        print("Using demo HTML table:")
        print("-" * 40)
        print(demo_html)
        print("-" * 40)
    else:
        try:
            with open(args.file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Successfully read HTML from: {args.file}")

    rtf = html_table_to_rtf(html_content, args.width)
    saved_path = save_as_rtf(rtf, args.output)
    if not saved_path:
        print("Failed to save RTF file", file=sys.stderr)
        sys.exit(1)
    print(f"Successfully converted table to: {saved_path}")

    if args.open and not open_file(saved_path):
        print("Could not automatically open the file. Please open it manually.")
# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement