SHOW:
|
|
- or go back to the newest paste.
1 | import os | |
2 | import re | |
3 | import textwrap | |
4 | import html | |
5 | from google_trans_new import google_translator | |
# Translator instance used by traducere_text() for every translation request.
translator = google_translator()
7 | ||
#-------------------------------------------------------------------------------
# Path to the folder that holds the documents to translate.
fisiere_din_folder = r"c:\Folder1"
# Source language code ('ro' is listed as Romanian in the language table).
# NOTE(review): no visible code reads this value - confirm whether it should
# be passed on to the translator.
source_language = 'ro'
# Target language for the translation; the supported language codes are listed on the next line.
13 | # {'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'latin', 'lv': 'latvian', 'lt': 'lithuanian', 'lb': 'luxembourgish', 'mk': 'macedonian', 'mg': 'malagasy', 'ms': 'malay', 'ml': 'malayalam', 'mt': 'maltese', 'mi': 'maori', 'mr': 'marathi', 'mn': 'mongolian', 'my': 'myanmar (burmese)', 'ne': 'nepali', 'no': 'norwegian', 'ps': 'pashto', 'fa': 'persian', 'pl': 'polish', 'pt': 'portuguese', 'pa': 'punjabi', 'ro': 'romanian', 'ru': 'russian', 'sm': 'samoan', 'gd': 'scots gaelic', 'sr': 'serbian', 'st': 'sesotho', 'sn': 'shona', 'sd': 'sindhi', 'si': 'sinhala', 'sk': 'slovak', 'sl': 'slovenian', 'so': 'somali', 'es': 'spanish', 'su': 'sundanese', 'sw': 'swahili', 'sv': 'swedish', 'tg': 'tajik', 'ta': 'tamil', 'te': 'telugu', 'th': 'thai', 'tr': 'turkish', 'uk': 'ukrainian', 'ur': 'urdu', 'uz': 'uzbek', 'vi': 'vietnamese', 'cy': 'welsh', 'xh': 'xhosa', 'yi': 'yiddish', 'yo': 'yoruba', 'zu': 'zulu', 'fil': 'Filipino', 'he': 'Hebrew'} | |
# Target language code handed to the translator.
destination_language = 'be'

# Only files whose name ends with this extension are translated.
extensie_fisier = ".html"
#-------------------------------------------------------------------------------

# Paths of the files to translate; selectare_text() appends to this list.
lista_cale_fisiere = list()
# [start delimiter, stop delimiter] pairs that enclose the text to translate.
delimitatori_text = [
    ['<title', '</title>'],
    ['<h1 class="den_articol" itemprop="name', '</h1>'],
    ['<meta name="description" content="', '"/>'],
    ['<p class="text_obisnuit', '</p>'],
    ['<span class="text', '</span>'],
]
21 | ||
def traducere_text(text):
    """Translate *text* into ``destination_language`` and return the result.

    The text is split on whitespace into chunks of at most 4500 characters
    (presumably to stay below the translation service's request-size limit)
    and every chunk is translated on its own.

    Args:
        text: Text to translate. HTML character references in it are decoded
            before the text is sent to the translator.

    Returns:
        The translated chunks joined by single spaces, or ``""`` when *text*
        is empty or contains only whitespace.
    """
    # NOTE(review): source_language is configured in this file but not passed
    # to the translator here - confirm whether it should be.
    chunks = textwrap.wrap(text, 4500, break_long_words=False)
    # textwrap.wrap drops the whitespace it splits on, so the chunks are
    # re-joined with a space; plain concatenation would glue the last word of
    # one chunk to the first word of the next.
    return " ".join(
        translator.translate(html.unescape(chunk), lang_tgt=destination_language)
        for chunk in chunks
    )
28 | ||
def _translate_delimited(contents, start_delim, stop_delim):
    """Return *contents* with every delimited text replaced by its translation.

    When *stop_delim* is a closing tag (starts with ``<``), *start_delim* is
    treated as the beginning of an opening tag and the text starts after that
    tag's ``>``. Otherwise the text starts right after *start_delim* (for
    example inside an attribute value). Searching stops at the first
    occurrence that is not properly terminated.
    """
    text_is_element_body = stop_delim.startswith('<')
    position = 0
    while True:
        # Search the live string from the current position, so a match at
        # offset 0 and matches near the end of a grown document are all seen.
        delim_at = contents.find(start_delim, position)
        if delim_at == -1:
            break
        text_start = delim_at + len(start_delim)
        if text_is_element_body:
            tag_end = contents.find('>', text_start)
            if tag_end == -1:
                break  # unterminated opening tag
            if contents[tag_end - 1] == '/':
                # Self-closing element: it has no text to translate.
                position = tag_end + 1
                continue
            text_start = tag_end + 1
        text_end = contents.find(stop_delim, text_start)
        if text_end == -1:
            break  # no stop delimiter: leave the rest of the document as-is
        translated_text = traducere_text(contents[text_start:text_end])
        contents = contents[:text_start] + translated_text + contents[text_end:]
        # Resume right after the inserted translation.
        position = text_start + len(translated_text)
    return contents


def selectare_text():
    """Translate the delimited text of every matching file in the folder.

    Every entry of ``fisiere_din_folder`` whose name ends with
    ``extensie_fisier`` is appended to ``lista_cale_fisiere``. For each listed
    file, the text enclosed by each delimiter pair in ``delimitatori_text`` is
    replaced by the result of ``traducere_text`` and the document is written,
    UTF-8 encoded, next to the source file with
    ``_<destination_language>`` inserted before the extension.
    """
    for file in os.listdir(fisiere_din_folder):
        if file.endswith(extensie_fisier):
            lista_cale_fisiere.append(os.path.join(fisiere_din_folder, file))

    for fisier in lista_cale_fisiere:
        # NOTE(review): the input is read with the platform default encoding
        # while the output is written as UTF-8 - confirm the encoding of the
        # source files.
        with open(fisier, 'r') as f:
            contents = f.read()
        # Decode named and numeric character references into Unicode
        # characters before looking for the text to translate.
        contents = html.unescape(contents)
        print ("Acum lucrez la fisierul :", fisier)

        for start_delim, stop_delim in delimitatori_text:
            contents = _translate_delimited(contents, start_delim, stop_delim)

        print("Am citit un fisier si incep traducerea!\n")

        cale_iesire = fisier[:len(fisier)-len(extensie_fisier)]+"_"+destination_language+extensie_fisier
        with open(cale_iesire, 'w', encoding="utf-8") as f:
            f.write(contents)
        print("Am terminat traducerea !")
68 | ||
# Script entry point: translate every matching file in the configured folder.
selectare_text()