Advertisement
here2share

# table2doc_demo.py

May 12th, 2025
383
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.47 KB | None | 0 0
  1. # table2doc_demo.py
  2.  
  3. import os
  4. import sys
  5. import argparse
  6. import subprocess
  7. from html.parser import HTMLParser
  8.  
# Built-in sample table that main() converts when no --file is given or when
# --demo is passed: one header row of three <th> cells, two plain body rows,
# and a final row whose first cell spans two columns via colspan="2".
demo_html = '''<table>
   <thead>
       <tr>
           <th>Header 1</th>
           <th>Header 2</th>
           <th>Header 3</th>
       </tr>
   </thead>
   <tbody>
       <tr>
           <td>Row 1, Cell 1</td>
           <td>Row 1, Cell 2 with some longer content</td>
           <td>Row 1, Cell 3</td>
       </tr>
       <tr>
           <td>Row 2, Cell 1</td>
           <td>Row 2, Cell 2</td>
           <td>Row 2, Cell 3</td>
       </tr>
       <tr>
           <td colspan="2">Merged cells</td>
           <td>Last cell</td>
       </tr>
   </tbody>
</table>'''
  34.  
  35. def save_as_rtf(text_content, filename):
  36.     """Save RTF content to file with fallback to Documents folder"""
  37.     try:
  38.         with open(filename, 'w', encoding='utf-8') as f:
  39.             f.write(text_content)
  40.         return filename
  41.     except Exception as primary_error:
  42.         print(f"Warning: {primary_error}", file=sys.stderr)
  43.        
  44.         # Fallback to Documents folder
  45.         try:
  46.             docs_path = os.path.join(os.path.expanduser('~'), 'Documents', os.path.basename(filename))
  47.             with open(docs_path, 'w', encoding='utf-8') as f:
  48.                 f.write(text_content)
  49.             print(f"Saved to fallback location: {docs_path}")
  50.             return docs_path
  51.         except Exception as fallback_error:
  52.             print(f"Error saving file: {fallback_error}", file=sys.stderr)
  53.             return None
  54.  
  55. def open_file(filepath):
  56.     """Open file with default application"""
  57.     try:
  58.         if sys.platform == 'win32':
  59.             os.startfile(filepath)
  60.         elif sys.platform == 'darwin':
  61.             subprocess.run(['open', filepath], check=True)
  62.         else:  # Linux and others
  63.             subprocess.run(['xdg-open', filepath], check=True)
  64.         return True
  65.     except Exception as e:
  66.         print(f"Could not open file: {e}")
  67.         return False
  68.  
  69. class TableParser(HTMLParser):
  70.     def __init__(self):
  71.         super().__init__()
  72.         self.tags_stack = []
  73.         self.in_table = False
  74.         self.table_data = []
  75.         self.current_row = []
  76.         self.current_cell = {"content": "", "colspan": 1, "is_header": False}
  77.         self.in_cell = False
  78.  
  79.     def handle_starttag(self, tag, attrs):
  80.         self.tags_stack.append(tag)
  81.         attrs_dict = dict(attrs)
  82.        
  83.         if tag == 'table':
  84.             self.in_table = True
  85.             self.table_data = []
  86.         elif tag == 'tr':
  87.             self.current_row = []
  88.         elif tag in ['td', 'th']:
  89.             self.in_cell = True
  90.             self.current_cell = {
  91.                 "content": "",
  92.                 "colspan": int(attrs_dict.get('colspan', 1)),
  93.                 "is_header": (tag == 'th')
  94.             }
  95.  
  96.     def handle_endtag(self, tag):
  97.         if tag == 'table':
  98.             self.in_table = False
  99.         elif tag == 'tr':
  100.             if self.current_row:  # Only add non-empty rows
  101.                 self.table_data.append(self.current_row.copy())
  102.         elif tag in ['td', 'th']:
  103.             self.in_cell = False
  104.             self.current_row.append(self.current_cell.copy())
  105.            
  106.         # Remove the tag from the stack
  107.         if self.tags_stack and self.tags_stack[-1] == tag:
  108.             self.tags_stack.pop()
  109.            
  110.     def handle_data(self, data):
  111.         if self.in_cell:
  112.             data = data.strip()
  113.             if data:
  114.                 if self.current_cell["content"]:
  115.                     self.current_cell["content"] += " " + data
  116.                 else:
  117.                     self.current_cell["content"] = data
  118.  
  119. def html_table_to_rtf(html_content, page_width=1200):
  120.     """Convert HTML table to RTF format with proper formatting and cell spanning"""
  121.     # Page width in twips (1/20 of a point, 1/1440 of an inch)
  122.     page_width_twips = page_width * 20  # Convert units to twips
  123.    
  124.     # Parse the HTML content
  125.     parser = TableParser()
  126.     parser.feed(html_content)
  127.     table_data = parser.table_data
  128.    
  129.     # Start building RTF document
  130.     rtf = []
  131.    
  132.     # RTF Header with proper document setup
  133.     rtf.append(r'''{\rtf1\ansi\ansicpg1252\deff0\nouicompat\deflang1033
  134. {\fonttbl{\f0\fswiss\fcharset0 Calibri;}{\f1\fswiss\fprq2\fcharset0 Arial;}}
  135. {\colortbl;\red0\green0\blue0;\red255\green255\blue255;\red242\green242\blue242;\red79\green129\blue189;}
  136. {\*\generator table2doc converter}
  137. \viewkind4\uc1
  138. \paperw''')
  139.     rtf.append(str(page_width_twips))
  140.     rtf.append(r'''\paperh15840\margl720\margr720\margt720\margb720
  141. \pard\sa200\sl276\slmult1\qc\b\f0\fs28 Table Conversion\par
  142. \pard\sa200\sl276\slmult1\b0\fs24
  143. ''')
  144.    
  145.     if not table_data:
  146.         rtf.append(r'\pard\sa200\sl276\slmult1 No table data found\par')
  147.         rtf.append('}')
  148.         return ''.join(rtf)
  149.    
  150.     # Calculate max columns and their widths
  151.     max_cols = max(sum(cell["colspan"] for cell in row) for row in table_data if row)
  152.    
  153.     # Initialize column widths based on content length (minimum 1500 twips)
  154.     col_widths = [1500] * max_cols
  155.    
  156.     # First pass: estimate content width needed for each column
  157.     for row in table_data:
  158.         col_idx = 0
  159.         for cell in row:
  160.             content_len = len(cell["content"])
  161.             avg_width = 120 * content_len // cell["colspan"]  # Approximate width per character
  162.            
  163.             # Distribute width across spanned columns
  164.             for span in range(cell["colspan"]):
  165.                 if col_idx + span < max_cols:
  166.                     col_widths[col_idx + span] = max(col_widths[col_idx + span], avg_width)
  167.            
  168.             col_idx += cell["colspan"]
  169.    
  170.     # Adjust for page width constraint
  171.     total_width = sum(col_widths)
  172.     available_width = page_width_twips - 1440  # Account for margins
  173.    
  174.     if total_width > available_width:
  175.         # Scale down proportionally
  176.         scale_factor = available_width / total_width
  177.         col_widths = [int(width * scale_factor) for width in col_widths]
  178.    
  179.     # Begin table
  180.     rtf.append(r'\par\pard')
  181.    
  182.     # Process each row
  183.     for row_idx, row in enumerate(table_data):
  184.         if not row:  # Skip empty rows
  185.             continue
  186.        
  187.         # Begin row
  188.         rtf.append(r'\trowd\trgaph70\trleft0\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3')
  189.        
  190.         # Set row properties
  191.         if row_idx == 0 or any(cell["is_header"] for cell in row):
  192.             # Header formatting
  193.             rtf.append(r'\trbrdrt\brdrs\brdrw10\brdrcf4')  # Top border
  194.             rtf.append(r'\trbrdrl\brdrs\brdrw10\brdrcf4')  # Left border
  195.             rtf.append(r'\trbrdrb\brdrs\brdrw10\brdrcf4')  # Bottom border
  196.             rtf.append(r'\trbrdrr\brdrs\brdrw10\brdrcf4')  # Right border
  197.         else:
  198.             # Regular row borders
  199.             rtf.append(r'\trbrdrt\brdrs\brdrw5\brdrcf1')
  200.             rtf.append(r'\trbrdrl\brdrs\brdrw5\brdrcf1')
  201.             rtf.append(r'\trbrdrb\brdrs\brdrw5\brdrcf1')
  202.             rtf.append(r'\trbrdrr\brdrs\brdrw5\brdrcf1')
  203.        
  204.         # Calculate cell positions
  205.         pos = 0
  206.         col_idx = 0
  207.        
  208.         for cell in row:
  209.             # Calculate width for this cell (sum of spanned columns)
  210.             cell_width = sum(col_widths[col_idx:col_idx + cell["colspan"]])
  211.             end_pos = pos + cell_width
  212.            
  213.             # Cell border and background formatting
  214.             if cell["is_header"]:
  215.                 rtf.append(r'\clcbpat3')  # Light gray background for headers
  216.             else:
  217.                 rtf.append(r'\clcbpat2')  # White background for regular cells
  218.                
  219.             rtf.append(r'\clbrdrt\brdrs\brdrw5\brdrcf1')  # Top border
  220.             rtf.append(r'\clbrdrl\brdrs\brdrw5\brdrcf1')  # Left border
  221.             rtf.append(r'\clbrdrb\brdrs\brdrw5\brdrcf1')  # Bottom border
  222.             rtf.append(r'\clbrdrr\brdrs\brdrw5\brdrcf1')  # Right border
  223.            
  224.             # Cell position (right boundary)
  225.             rtf.append(f'\\cellx{end_pos}')
  226.            
  227.             pos = end_pos
  228.             col_idx += cell["colspan"]
  229.        
  230.         rtf.append('\n')
  231.        
  232.         # Cell content
  233.         for cell in row:
  234.             if cell["is_header"]:
  235.                 rtf.append(r'{\intbl\qc\b\fs36 ')  # Centered, bold for headers, font size is 18px
  236.             else:
  237.                 rtf.append(r'{\intbl\ql ')  # Left-aligned for regular cells
  238.            
  239.             # Handle newlines in content
  240.             content = cell["content"].replace('\n', '\\line ')
  241.             rtf.append(content)
  242.             rtf.append(r'}\cell ')
  243.        
  244.         rtf.append(r'\row\n')
  245.    
  246.     # End RTF document
  247.     rtf.append(r'\pard\sa200\sl276\slmult1\par}')
  248.    
  249.     return ''.join(rtf)
  250.  
  251. def main():
  252.     parser = argparse.ArgumentParser(description='Convert HTML tables to RTF documents')
  253.     parser.add_argument('--file', help='Input HTML file')
  254.     parser.add_argument('--output', default='table.rtf', help='Output RTF file')
  255.     parser.add_argument('--no-open', dest='open', action='store_false',
  256.                       help='Do not open the output file after conversion')
  257.     parser.add_argument('--width', type=int, default=1200,
  258.                       help='Page width in RTF units (1/20 of a point)')
  259.     parser.add_argument('--demo', action='store_true',
  260.                       help='Use built-in demo table instead of input file')
  261.     parser.set_defaults(open=True)
  262.  
  263.     args = parser.parse_args()
  264.  
  265.     html_content = None
  266.    
  267.     if args.demo or (not args.file):
  268.         html_content = demo_html
  269.         print("Using demo HTML table:")
  270.         print("-" * 40)
  271.         print(demo_html)
  272.         print("-" * 40)
  273.     elif args.file:
  274.         try:
  275.             with open(args.file, 'r', encoding='utf-8') as f:
  276.                 html_content = f.read()
  277.             print(f"Successfully read HTML from: {args.file}")
  278.         except Exception as e:
  279.             print(f"Error reading file: {e}", file=sys.stderr)
  280.             sys.exit(1)
  281.  
  282.     rtf = html_table_to_rtf(html_content, args.width)
  283.  
  284.     saved_path = save_as_rtf(rtf, args.output)
  285.     if not saved_path:
  286.         print("Failed to save RTF file", file=sys.stderr)
  287.         sys.exit(1)
  288.  
  289.     print(f"Successfully converted table to: {saved_path}")
  290.  
  291.     if args.open and not open_file(saved_path):
  292.         print("Could not automatically open the file. Please open it manually.")
  293.  
  294. if __name__ == "__main__":
  295.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement