Advertisement
Guest User

Untitled

a guest
Aug 20th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.80 KB | None | 0 0
  1. from pathlib import Path
  2. import chardet
  3. import io
  4.  
  5. #DetectorFactory.seed = 0
  6. pdf_files=[]
  7. txt_path = r'{text file folder path}'
  8. language_dict = {}
  9.  
  10. for filename in Path(txt_path).glob(r'**/*.txt'):
  11. rfilename = r'%s'%str(filename)
  12. try:
  13. f = io.open(u'%s'%rfilename, "r",encoding='unicode')
  14. except IOError:
  15. f2 = f.decode('UTF-8')
  16. f = open(r'%s'%f2, "r")
  17. language_dict = {}
  18. text_path_short = (str(filename).split('\\')[-1] +'||'+ str(filename).split('\\')[-2])
  19. try:
  20. detect_result = chardet.detect(f.read().encode("utf-8"))
  21. language_dict[text_path_short] = (detect_result['language'] , detect_result['confidence'])
  22. except UnicodeDecodeError:
  23. detect_result = 'unicode prob'
  24. language_dict[text_path_short] = (detect_result, 0)
  25. f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement