Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Scan a folder tree for ``.txt`` files and record chardet's guess for each.

Results are collected in the module-level ``language_dict``, keyed by
``'<file name>||<parent folder name>'`` and holding a
``(language, confidence)`` tuple per file.
"""
import io
from pathlib import Path

import chardet

pdf_files = []
txt_path = r'{text file folder path}'

# Created once, before the loop, so that every file's result is kept
# (re-creating it per iteration would discard all but the last entry).
language_dict = {}

for filename in Path(txt_path).glob('**/*.txt'):
    # Path.name / Path.parent.name work with either path separator, unlike
    # splitting the string form of the path on a backslash.
    text_path_short = filename.name + '||' + filename.parent.name

    try:
        # Read the raw bytes: chardet has to see the file exactly as stored.
        # Decoding to text and re-encoding as UTF-8 first would hide the
        # original encoding from the detector. read_bytes() also closes the
        # file handle even when reading fails.
        raw_bytes = filename.read_bytes()
    except OSError:
        # One unreadable file should not abort the whole scan; record it in
        # the same (label, confidence) shape as a normal result.
        language_dict[text_path_short] = ('read error', 0)
        continue

    detect_result = chardet.detect(raw_bytes)
    # NOTE(review): 'language' is whatever chardet associates with the
    # detected encoding and may be empty/None for some encodings - confirm
    # this is the signal wanted here.
    language_dict[text_path_short] = (
        detect_result['language'],
        detect_result['confidence'],
    )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement