Advertisement
nicuf

Parsing data docx to html

Sep 29th, 2023 (edited)
749
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.87 KB | None | 0 0
import html
import re
from itertools import groupby

import docx
  3.  
  4. def run_get_style(run) -> str:
  5.     if run.bold:
  6.         return "bold"
  7.     elif run.italic:
  8.         return "italic"
  9.     else:
  10.         return "normal"
  11.  
  12. def detect_fonts(document: docx.Document) -> None:
  13.     with open("bebe.html", "w") as f:
  14.         for paragraph in document.paragraphs:
  15.             runs = paragraph.runs
  16.             if not runs:
  17.                 continue
  18.             current_style = None
  19.             current_text = ""
  20.  
  21.             for run in runs:
  22.                 run_style = run_get_style(run)
  23.                 if run_style == current_style:
  24.                     current_text += run.text
  25.                 else:
  26.                     if current_style:
  27.                         if current_style == "bold":
  28.                             f.write(f"<b>{current_text}</b>")
  29.                         elif current_style == "italic":
  30.                             f.write(f"<em>{current_text}</em>")
  31.                         else:
  32.                             f.write(current_text)
  33.                     current_style = run_style
  34.                     current_text = run.text
  35.  
  36.             if current_style:
  37.                 if current_style == "bold":
  38.                     f.write(f"<b>{current_text}</b>")
  39.                 elif current_style == "italic":
  40.                     f.write(f"<em>{current_text}</em>")
  41.                 else:
  42.                     f.write(current_text)
  43.  
  44.             f.write("</p>\n")  # Add a closing paragraph tag at the end of each paragraph
  45.  
  46.     # Add a paragraph tag at the beginning of each line
  47.     with open("bebe.html", "r") as f:
  48.         content = f.read()
  49.  
  50.     content = re.sub(r"^[ \t]*", "<p>", content, flags=re.MULTILINE)
  51.  
  52.     with open("bebe.html", "w") as f:
  53.         f.write(content)
  54.  
  55. def main():
  56.     document = docx.Document("bebe.docx")
  57.     detect_fonts(document)
  58.  
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
  61.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement