Guest User

Untitled

a guest
May 27th, 2018
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.09 KB | None | 0 0
  1. import os
  2. from docx import Document
  3.  
  4. def print_tables(path):
  5. doc = Document(path)
  6. for ntable, table in enumerate(doc.tables):
  7. buf = u""
  8. start_row = 0
  9. if ntable > 0:
  10. start_row = 2
  11. n_empty = 0
  12. data_part = False
  13. for nrow, row in enumerate(table.rows[start_row:]):
  14. last_tc = None
  15. row_empty = True
  16. for cell in row.cells:
  17. row_empty = row_empty and not cell.text
  18. # ignore merged and empty cells
  19. if ((cell._tc != last_tc) and cell.text):
  20. # try to determine if cell.text contains float value
  21. is_float = (cell.text[0].isdigit() and cell.text.count(".") == 1)
  22. if (is_float):
  23. #if (not data_part):
  24. # print cell.text
  25. data_part = True
  26. # to make Excel happy
  27. buf += cell.text.replace(".", ",")
  28. else:
  29. buf += cell.text
  30. buf += u";"
  31. last_tc = cell._tc
  32. if row_empty:
  33. n_empty += 1
  34. if data_part and row_empty:
  35. break
  36. buf += u"\n"
  37. #print (n_empty, nrow)
  38. yield buf
  39.  
  40. for i in os.listdir("."):
  41. if i.endswith(".docx"):
  42. print(i)
  43. tbl = list(print_tables(i))
  44. open("%s.csv" % i, "wb").write(u"".join(tbl).encode("cp1251"))
Add Comment
Please, Sign In to add comment