Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import docx
- from subprocess import Popen, PIPE
def doc_to_txt(filename):
    """Return the plain text of a Word document.

    ``.doc`` files are converted by running the external ``antiword``
    command and decoding what it writes to standard output. ``.docx`` files
    are opened with ``docx.Document`` and the text of each paragraph is
    joined with newlines. The extension check is case-insensitive, and a
    progress message is printed before each conversion.

    :param filename: The path of the .doc or .docx document
    :type filename: str
    :return: The text of the document, or an empty string (after printing
        a message) when the extension is neither .doc nor .docx
    :rtype: str

    :Example:

    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")  # doctest: +SKIP
    'This is text from a .doc document'
    >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")  # doctest: +SKIP
    'This is text from a .docx document'
    """
    lowered = filename.lower()

    if lowered.endswith(".doc"):
        print("Converting to txt the doc file:" + filename)
        # The context manager closes the pipe even if communicate() raises;
        # communicate() itself reads stdout to EOF and waits for the process.
        with Popen(['antiword', filename], stdout=PIPE) as process:
            stdout, _ = process.communicate()
        # NOTE(review): decode() assumes antiword emits UTF-8, and the exit
        # status is not checked, so a failed conversion yields whatever was
        # written to stdout (possibly nothing) -- confirm this is acceptable.
        return stdout.decode()

    if lowered.endswith(".docx"):
        print("Converting to txt the docx file:" + filename)
        document = docx.Document(filename)
        return '\n'.join(para.text for para in document.paragraphs)

    print("Document extension should be either .doc or .docx")
    # Return an empty string rather than an empty list so the result is
    # always a str, as documented, while remaining falsy for callers.
    return ""
Add Comment
Please, Sign In to add comment