Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- from docx import Document
def print_tables(path):
    """Yield one semicolon-separated text chunk per table of a .docx file.

    Each emitted row ends with a newline, and every kept cell is followed
    by ``;``.  Empty cells and repeats of the previous cell's underlying
    element (merged cells) are skipped.  Cells that look like a decimal
    number get their ``.`` replaced by ``,``.

    Args:
        path: Path of the .docx document, passed straight to ``Document``.

    Yields:
        A text string holding the converted rows of one table, in document
        order.
    """
    doc = Document(path)
    for ntable, table in enumerate(doc.tables):
        # Every table after the first starts at its third row.
        start_row = 2 if ntable > 0 else 0
        # Collect fragments and join once instead of growing a string.
        parts = []
        data_part = False
        for row in table.rows[start_row:]:
            last_tc = None
            row_empty = True
            for cell in row.cells:
                # Read the property once; it is used several times below.
                text = cell.text
                if text:
                    row_empty = False
                    # A merged cell shows up repeatedly with the same _tc;
                    # keep only its first occurrence.
                    if cell._tc != last_tc:
                        # Heuristic float check: leading digit, one dot.
                        if text[0].isdigit() and text.count(".") == 1:
                            data_part = True
                            # Decimal comma, "to make Excel happy".
                            text = text.replace(".", ",")
                        parts.append(text)
                        parts.append(u";")
                last_tc = cell._tc
            # Once numeric data has been seen, the first fully empty row
            # ends this table.
            if data_part and row_empty:
                break
            parts.append(u"\n")
        yield u"".join(parts)
# Convert every .docx file in the current directory: the tables of
# "<name>.docx" are written to "<name>.docx.csv".
for i in os.listdir("."):
    if i.endswith(".docx"):
        print(i)
        tbl = list(print_tables(i))
        # Use a context manager so the output file is flushed and closed
        # deterministically instead of relying on garbage collection.
        with open("%s.csv" % i, "wb") as out:
            out.write(u"".join(tbl).encode("cp1251"))
Add Comment
Please, Sign In to add comment