Advertisement
nicuf

Convert docx to html (parsing data)

Sep 29th, 2023
895
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.77 KB | None | 0 0
import html

import docx
  2.  
  3.  
  4. def run_get_style(run) -> str:
  5.   """Returns the font style of the run.
  6.  
  7.  Does not consider a paragraph style that makes the whole Paragraph bold or italic.
  8.  
  9.  Args:
  10.    run: A docx.Run object.
  11.  
  12.  Returns:
  13.    A string indicating the font style of the run: "bold", "italic", or "normal".
  14.  """
  15.  
  16.   if run.bold:
  17.     return "bold"
  18.   elif run.italic:
  19.     return "italic"
  20.   else:
  21.     return "normal"
  22.  
  23.  
  24. def detect_fonts(document: docx.Document) -> None:
  25.   """Detects the font styles in the document and writes them to a HTML file.
  26.  
  27.  Args:
  28.    document: A docx.Document object.
  29.  """
  30.  
  31.   with open("bebe.html", "w") as f:
  32.     current_style = None
  33.     current_text = ""
  34.     for paragraph in document.paragraphs:
  35.       if paragraph.runs:
  36.         run_style = run_get_style(paragraph.runs[0])
  37.         if current_style != run_style:
  38.           if current_text:
  39.             if current_style == "bold":
  40.               f.write(f"<p><b>{current_text}</b></p>\n")
  41.             elif current_style == "italic":
  42.               f.write(f"<p><em>{current_text}</em></p>\n")
  43.             else:
  44.               f.write(f"<p>{current_text}</p>\n")
  45.           current_style = run_style
  46.           current_text = paragraph.text
  47.       elif paragraph.text == "Bebe este așa":
  48.         f.write(f"<p><b>{paragraph.text}</b></p>\n")
  49.       elif paragraph.text:
  50.         current_text += paragraph.text
  51.     if current_text:
  52.       if current_style == "bold":
  53.         f.write(f"<p><b>{current_text}</b></p>\n")
  54.       elif current_style == "italic":
  55.         f.write(f"<p><em>{current_text}</em></p>\n")
  56.       else:
  57.         f.write(f"<p>{current_text}</p>\n")
  58.  
  59.  
  60. def main():
  61.   document = docx.Document("bebe.docx")
  62.   detect_fonts(document)
  63.  
  64.  
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
  main()
  67.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement