Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ROMANIAN: https://neculaifantanaru.com/python-split-break-sparge-textul-in-blocuri-la-traducere.html
# ENGLISH: https://neculaifantanaru.com/en/python-split-breaks-the-text-into-blocks-during-translation.html
# for translating tags: TRUE
# for translating TEXT: (FALSE - LANGUAGE (en) - TXT)
# PUT ALL THE DOCUMENTS TO BE TRANSLATED IN FOLDER1
# Beautiful Soup docs: https://beautiful-soup-4.readthedocs.io/en/latest/ e.g. css_soup.find_all("p", class_="strikeout")
- import os
- from bs4 import BeautifulSoup, NavigableString
- import re
- import textwrap
- from googletrans import Translator
- import pprint
# Folder holding the original files; change this path to select another one.
base_path = "Folder1"

# Answer TRUE (html) or FALSE (txt); anything other than "true" counts as False.
read_tags = input("Want to read tags: ").lower() == "true"

# Destination language handed to the translator, and the file-name suffix
# used to pick which files get processed.
input_lang = input("Enter language in which you want to translate: ")
input_extension = input("Enter file extension .txt, .html, etc: ")
def recursively_translate(translator, node, input_lang):
    """Translate every non-blank text node under ``node`` in place.

    Args:
        translator: Object exposing ``translate(text, dest=...)`` whose result
            has a ``.text`` attribute.
        node: Element whose ``contents`` are walked; nested elements are
            processed recursively.
        input_lang: Destination language passed to the translator as ``dest``.

    A failed translation is reported on stdout and the original text node is
    left untouched.
    """
    # Walk a snapshot: replace_with() edits node.contents while we iterate.
    for entry in list(node.contents):
        if isinstance(entry, NavigableString):
            contents = entry.string
            core = contents.strip()
            if not core:
                continue
            # Keep the surrounding whitespace so the translated text is not
            # glued to neighbouring inline tags.
            lead = contents[:len(contents) - len(contents.lstrip())]
            trail = contents[len(contents.rstrip()):]
            try:
                translation = translator.translate(core, dest=input_lang)
                entry.replace_with(lead + translation.text + trail)
            except Exception as e:
                print("Got error during rec translation {}".format(e))
        elif entry is not None:
            recursively_translate(translator, entry, input_lang)
def remove_tags(data):
    """Blank out the text held inside tags and parse what is left.

    The ``<html>``/``<body>`` wrappers are replaced by newlines. Then, for
    every span running from a ``<`` to the next ``</`` on the same line, the
    text after the span's first ``>`` is replaced by a newline everywhere in
    ``data``.

    Args:
        data: Markup as a string.

    Returns:
        A ``BeautifulSoup`` object (``lxml`` parser) built from the result.
    """
    for wrapper in ("<html>", "</html>", "<body>", "</body>"):
        data = data.replace(wrapper, "\n")

    for match in re.findall(r"<(.*?)</", data):
        # Everything after the first '>' is the element's inner text.
        _, bracket, inner = match.partition(">")
        # A match without '>' used to raise IndexError. An empty inner text
        # (e.g. "<p></p>") used to make str.replace("", "\n") insert a
        # newline between every character of the document.
        if not bracket or not inner:
            continue
        data = data.replace(inner, "\n")

    return BeautifulSoup(data, 'lxml')
# One translator client, reused for every translation request.
translator = Translator()

# Names of the entries in base_path that end with the requested extension.
subfolders = [
    entry for entry in os.listdir(base_path) if entry.endswith(input_extension)
]
## New code that translates the text piece by piece
def traducere_v1_txt(translator, file):
    """Translate a text file line by line, keeping one output line group per input line.

    Reads ``{base_path}/{file}``, treats each line as a paragraph, translates
    it in chunks of at most 4820 characters and writes the result, re-wrapped
    at 120 columns, to ``Translated_Folder/{name}_{input_lang}.txt``.

    Args:
        translator: Object exposing ``translate(text, dest=...)`` whose result
            has a ``.text`` attribute.
        file: Name of the file inside ``base_path``.

    Returns:
        None. An empty input file is reported and skipped. The first failed
        translation is printed and stops processing of this file.
    """
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.readlines()
    if not data:
        print("{} este gol".format(file))
        return

    # splitext drops only the trailing extension; str.replace(".txt", "")
    # would also strip ".txt" from the middle of the name.
    file_name = os.path.splitext(file)[0]
    os.makedirs("Translated_Folder", exist_ok=True)
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w", encoding='utf8') as translation_file:
        for i, paragraph in enumerate(data):
            print("Traducere paragraf {}".format(i))
            output_lines = []
            # Chunking presumably keeps each request under the translation
            # service's size limit -- TODO confirm the exact limit.
            for chunk in textwrap.wrap(paragraph, 4820, break_long_words=False):
                try:
                    translated_line = translator.translate(chunk, dest=input_lang)
                    output_lines.extend(
                        textwrap.wrap(translated_line.text, 120, break_long_words=False)
                    )
                except Exception as e:
                    print(e)
                    return
            # writelines() adds no separators, which glued the wrapped lines
            # together; join them with explicit newlines instead.
            translation_file.write("\n".join(output_lines) + "\n")
def traducere_v2_txt(translator, file):
    """Translate a whole text file in large chunks.

    Reads ``{base_path}/{file}`` in one go, splits it into chunks of at most
    4820 characters with ``textwrap.wrap`` (so the original line breaks are
    not kept), translates each chunk and writes it, re-wrapped at 120 columns,
    to ``Translated_Folder/{name}_{input_lang}.txt``.

    Args:
        translator: Object exposing ``translate(text, dest=...)`` whose result
            has a ``.text`` attribute.
        file: Name of the file inside ``base_path``.

    Returns:
        None. An empty input file is reported and skipped. The first failed
        translation is printed and stops processing of this file.
    """
    with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
        data = open_file.read()
    if data == "":
        print("{} este gol".format(file))
        return

    chunks = textwrap.wrap(data, 4820, break_long_words=False)
    # splitext drops only the trailing extension; str.replace(".txt", "")
    # would also strip ".txt" from the middle of the name.
    file_name = os.path.splitext(file)[0]
    os.makedirs("Translated_Folder", exist_ok=True)
    with open(f"Translated_Folder/{file_name}_{input_lang}.txt", "w", encoding='utf8') as translation_file:
        for i, chunk in enumerate(chunks):
            print("Traducere linia {}".format(i))
            try:
                translated_line = translator.translate(chunk, dest=input_lang)
                translated_lines = textwrap.wrap(translated_line.text, 120, break_long_words=False)
                # writelines() adds no separators, which glued the wrapped
                # lines together; join them with explicit newlines instead.
                translation_file.write("\n".join(translated_lines) + "\n")
            except Exception as e:
                print(e)
                return
def _swap_in_translation(translator, markup, original, dest):
    """Return ``markup`` with each occurrence of ``original`` replaced by its translation.

    Blank ``original`` is skipped: ``str.replace("", x)`` would insert ``x``
    between every character of ``markup``. A failed translation is reported
    on stdout and ``markup`` is returned unchanged.
    """
    if not original or not original.strip():
        return markup
    try:
        translated = translator.translate(original, dest=dest)
        return markup.replace(original, translated.text)
    except Exception as e:
        print("Got error during translation {}".format(e))
        return markup


def _swap_tag_text(translator, markup, tag, missing_message, dest):
    """Translate ``tag.text`` inside ``markup``, or print ``missing_message`` when ``tag`` is None."""
    if tag is None:
        print(missing_message)
        return markup
    return _swap_in_translation(translator, markup, tag.text, dest)


def _translate_known_tags(translator, markup, data, dest):
    """Translate the title, meta content and the known ``<p>``/``<h3>`` tags inside ``markup``."""
    soup = BeautifulSoup(data, 'lxml')
    title_tag = soup.find("title")
    desc_tag = soup.select_one("div.news_desc > h3")
    to_p_tag = soup.find_all('p', class_='text_obisnuit')
    ist_p_tag = soup.find("p", class_="text_obisnuit2")
    second_p_tag = soup.find("p", class_="donoo")
    ist3_p_tag = soup.find("p", class_="JAGAAA")
    # Only a <meta> that actually carries a "content" attribute is usable;
    # indexing the first <meta> raised KeyError on e.g. <meta charset=...>.
    meta_tag = soup.find("meta", attrs={"content": True})

    markup = _swap_tag_text(translator, markup, ist3_p_tag, "<p class='JAGAAA' /> not found", dest)
    markup = _swap_tag_text(translator, markup, title_tag, "Title tag does not found", dest)

    if meta_tag is None:
        print("meta tag does not found")
    else:
        markup = _swap_in_translation(translator, markup, meta_tag["content"], dest)

    markup = _swap_tag_text(translator, markup, ist_p_tag, "<p class='text_obisnuit2' /> not found", dest)

    if not to_p_tag:
        print("<p class='text_obisnuit' /> not found")
    for p in to_p_tag:
        # The output is written from the markup string, not from this soup,
        # so the in-place translation has to be copied over explicitly;
        # otherwise its result is thrown away. If the serialized paragraph
        # is not found verbatim in markup, the replace is a no-op.
        before = str(p)
        recursively_translate(translator, p, dest)
        markup = markup.replace(before, str(p))

    markup = _swap_tag_text(translator, markup, desc_tag, "<h3 /> not found", dest)
    markup = _swap_tag_text(translator, markup, second_p_tag, "<p class='donoo' /> not found", dest)
    return markup


def _translate_loose_text(translator, markup, data, dest, announce_empty):
    """Translate the text left by ``remove_tags(data)`` wherever it occurs verbatim in ``markup``."""
    text = remove_tags(data).text
    text = text.replace("\ufeff", " ")
    text = text.replace("\n", " ")
    text = re.sub(' +', ' ', text)
    chunks = textwrap.wrap(text, 4800, break_long_words=False)
    # textwrap.wrap() returns [] for blank input, so the old
    # "len(text) == 1 and text[0] == ''" test could never be true.
    if not chunks:
        if announce_empty:
            print("No text found")
        return markup
    for chunk in chunks:
        try:
            translated_line = translator.translate(chunk, dest=dest)
            # NOTE(review): a chunk is only replaced when it appears verbatim
            # in the serialized markup.
            markup = markup.replace(chunk, translated_line.text)
        except Exception as e:
            print(e)
            continue
    return markup


for file in subfolders:
    print(f"Translating {file} ..... \n")
    if file.endswith(".txt"):
        # For the paragraph-preserving (slower) variant, call
        # traducere_v1_txt instead of traducere_v2_txt.
        traducere_v2_txt(translator, file)
        print("{} a fost tradus".format(file))
    elif file.endswith(".html"):
        with open(f"{base_path}/{file}", "r", encoding='utf8', errors='ignore') as open_file:
            data = open_file.read()
        if data == "":
            print("{} este gol".format(file))
            continue

        # Work on the parser's serialization of the page, with BOMs blanked.
        lxml1 = str(BeautifulSoup(data, 'lxml')).replace("\ufeff", " ")
        if read_tags:
            lxml1 = _translate_known_tags(translator, lxml1, data, input_lang)
        lxml1 = _translate_loose_text(
            translator, lxml1, data, input_lang, announce_empty=not read_tags
        )

        # splitext drops only the trailing extension; str.replace(".html", "")
        # would also strip ".html" from the middle of the name.
        file_name = os.path.splitext(file)[0]
        os.makedirs("Translated_Folder", exist_ok=True)
        with open(f"Translated_Folder/{file_name}_{input_lang}.html", "w", encoding='utf8') as htmlfile:
            htmlfile.write(lxml1)
        print("{} a fost tradus".format(file))
- #dt1 = translator.detect(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement