Guest User

Untitled

a guest
Oct 18th, 2017
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.24 KB | None | 0 0
  1. import docx
  2. from subprocess import Popen, PIPE
  3.  
  4. def doc_to_txt(filename):
  5. '''
  6. Get the path of a Word document and returns the text of this document
  7.  
  8. :param filename: The filename of the doc or docx document
  9. :type filename: str
  10. :return: The text of the document
  11. :rtype: str
  12.  
  13. :Example:
  14.  
  15. >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00097342.doc")
  16. 'This is text from a .doc document'
  17. >>> doc_to_txt("/Users/seiteta/Work/quen-dit-la-cour/reports/jf00136930.docx")
  18. 'This is text from a .docx document'
  19.  
  20. '''
  21. full_text = []
  22.  
  23. if filename.lower().endswith(".doc"):
  24. print("Converting to txt the doc file:" + filename)
  25. cmd = ['antiword', filename]
  26. p = Popen(cmd, stdout=PIPE)
  27. stdout, stderr = p.communicate()
  28. full_text = stdout.decode()
  29.  
  30. elif filename.lower().endswith(".docx"):
  31. print("Converting to txt the docx file:" + filename)
  32. doc = docx.Document(filename)
  33. for para in doc.paragraphs:
  34. full_text.append(para.text)
  35. full_text = '\n'.join(full_text)
  36.  
  37. else :
  38. print("Document extension should be either .doc or .docx")
  39.  
  40. return full_text
Add Comment
Please, Sign In to add comment