Advertisement
nicuf

Break text into blocks when translating HTML pages

Mar 21st, 2023
679
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.18 KB | Source Code | 0 0
  1. # ROMANIAN: https://neculaifantanaru.com/python-split-break-sparge-textul-in-blocuri-la-traducere.html
  2. # ENGLISH:  https://neculaifantanaru.com/en/python-split-breaks-the-text-into-blocks-during-translation.html
  3.  
  4. # pentru traducere taguri TRUE
  5. # pentru traducere TEXT (FALSE - LANGUAGE (en) - TXT)
  6. # PUNE TOATE DOCUMENTELE DE TRADUS IN FOLDER1
  7. # https://beautiful-soup-4.readthedocs.io/en/latest/   css_soup.find_all("p", class_="strikeout")
  8.  
  9.  
  10. import os
  11. from bs4 import BeautifulSoup, NavigableString
  12. import re
  13. import textwrap
  14. from googletrans import Translator
  15. import pprint
  16.  
  17. base_path = "Folder1"  # AICI SELECTEZI PATH LA ORIGINAL FILES
  18. read_tags = input("Want to read tags: ") # TRUE (html) sau FALSE (txt)
  19. if(read_tags.lower() == "true"):
  20.   read_tags = True
  21. else:
  22.   read_tags = False
  23.  
  24. input_lang = input("Enter language in which you want to translate: ")
  25.  
  26. input_extension = input("Enter file extension .txt, .html, etc: ")
  27.  
  28. def recursively_translate(translator, node, input_lang):
  29.     for entry in node.contents:
  30.         if isinstance(entry, NavigableString):
  31.             contents = entry.string
  32.             if contents.strip() != '':
  33.                 try:
  34.                     translation = translator.translate(contents, dest=input_lang)
  35.                     entry.replace_with(translation.text)
  36.                 except Exception as e:
  37.                     print("Got error during rec translation {}".format(e))
  38.                     pass
  39.         elif entry != None:
  40.             recursively_translate(translator, entry, input_lang)
  41.  
  42. def remove_tags(data):
  43.   data = data.replace("<html>" , "\n")
  44.   data = data.replace("</html>" , "\n")
  45.   data = data.replace("<body>" , "\n")
  46.   data = data.replace("</body>" , "\n")
  47.   tags  = re.findall("<(.*?)</", data)
  48.   for tag in tags:
  49.     ch = '>'
  50.     listOfWords = tag.split(ch, 1)
  51.     tag = listOfWords[1]
  52.     data = data.replace(tag,"\n")
  53.   soup = BeautifulSoup(data, 'lxml')
  54.   return soup
  55.  
  56.  
  57. translator = Translator()
  58. subfolders = []
  59. # getting names of all pdfs files
  60. for file in os.listdir(base_path):
  61.     if file.endswith(input_extension):
  62.       subfolders.append(file)
  63.  
  64. ## Cod nou care traduce pe bucatele
  65. def traducere_v1_txt(translator, file):
  66.   data = []
  67.   with open(f"{base_path}/{file}" , "r" ,encoding='utf8', errors='ignore') as open_file:
  68.     data = open_file.readlines()
  69.   if len(data) == 0:
  70.     print("{} este gol".format(file))
  71.     return
  72.   file_name = file.replace(".txt","")
  73.   with open(f"Translated_Folder/{file_name}_{input_lang}.txt","w", encoding='utf8') as translation_file:
  74.     for i, paragraph in enumerate(data):
  75.           print("Traducere paragraf {}".format(i))
  76.           lines = textwrap.wrap(paragraph, 4820, break_long_words=False )
  77.           for line in lines:
  78.             try:
  79.                   translated_line = translator.translate(line, dest=input_lang)
  80.                   translated_lines = textwrap.wrap(translated_line.text, 120, break_long_words=False)
  81.                   translation_file.writelines(translated_lines)
  82.             except Exception as e:
  83.                     print(e)
  84.                     return
  85.           translation_file.write("\n")
  86.  
  87. def traducere_v2_txt(translator, file):
  88.   data = ""
  89.   with open(f"{base_path}/{file}" , "r" ,encoding='utf8', errors='ignore') as open_file:
  90.     data = open_file.read()
  91.   if data == "":
  92.     print("{} este gol".format(file))
  93.     return
  94.   lines = textwrap.wrap(data, 4820, break_long_words=False, )
  95.   file_name = file.replace(".txt","")
  96.   with open(f"Translated_Folder/{file_name}_{input_lang}.txt","w", encoding='utf8') as translation_file:
  97.     for i, line in enumerate(lines):
  98.           print("Traducere linia {}".format(i))
  99.           try:
  100.             translated_line = translator.translate(line, dest=input_lang)
  101.             translated_lines = textwrap.wrap(translated_line.text, 120, break_long_words=False)
  102.             translation_file.writelines(translated_lines)
  103.             translation_file.write("\n")
  104.           except Exception as e:
  105.                   print(e)
  106.                   return
  107.  
  108. for file in subfolders:
  109.   print(f"Translating {file} ..... \n")
  110.   if(file.endswith(".txt")):
  111.     traducere_v2_txt(translator, file)  # daca vreau varianta identic paragraf, varianta care dureaza. atunci pun v1 in loc de v2
  112.     print("{} a fost tradus".format(file))
  113.   elif(file.endswith(".html")):
  114.     data = ""
  115.     with open(f"{base_path}/{file}" , "r" , encoding='utf8', errors='ignore') as open_file:
  116.       data = open_file.read()
  117.     if data == "":
  118.       print("{} este gol".format(file))
  119.       continue
  120.     lxml1 = str(BeautifulSoup(data, 'lxml'))
  121.     #lxml1 = data
  122.     lxml1 = lxml1.replace("\ufeff" , " ")
  123.     #lxml1 = lxml1.replace("\n" , " ")
  124.     #lxml1 = re.sub(' +', ' ', lxml1)
  125.     if(read_tags == True):
  126.       soup = BeautifulSoup(data, 'lxml')
  127.       title_tag = soup.find("title")
  128.       desc_tag = soup.select_one("div.news_desc > h3")
  129.       # to_p_tag = soup.findAll('p', class_='text_obisnuit')
  130.       to_p_tag = soup.find_all('p', class_='text_obisnuit')
  131.       ist_p_tag = soup.find("p" , class_="text_obisnuit2")
  132.       second_p_tag = soup.find("p" , class_="donoo")
  133.  
  134.  
  135.       ist3_p_tag = soup.find("p" , class_="JAGAAA")
  136.       # ist3_p_tag = soup.find("p", {'class': "JAGAAA"})
  137.       # ist3_p_tag = soup.find('p', attr={'class_': 'JAGAAA'})
  138.       # ist3_p_tag = soup.find("p" , attr={'class_': "JAGAAA"})
  139.       # ist3_p_tag = soup.find_all("p", class_="JAGAAA")
  140.       # ist3_p_tag = soup.find("p" , {'class_': "JAGAAA"})
  141.  
  142.       if(ist3_p_tag == None):
  143.         print("<p class='JAGAAA' /> not found")
  144.       else:
  145.         translated_p = translator.translate(ist3_p_tag.text, dest=input_lang)
  146.         lxml1 = lxml1.replace(ist3_p_tag.text,translated_p.text)
  147.  
  148.  
  149.  
  150.       meta_tag = soup.find("meta")
  151.       if(title_tag ==  None):
  152.         print("Title tag does not found")
  153.       else:
  154.         translated_title = translator.translate(title_tag.text, dest=input_lang)
  155.         lxml1 = lxml1.replace(title_tag.text,translated_title.text)
  156.       if(meta_tag ==  None):
  157.         print("meta tag does not found")
  158.       else:
  159.         translated_meta = translator.translate(meta_tag["content"], dest=input_lang)
  160.         lxml1 = lxml1.replace(meta_tag["content"],translated_meta.text)
  161.  
  162.       if(ist_p_tag == None):
  163.         print("<p class='text_obisnuit2' /> not found")
  164.       else:
  165.         translated_p = translator.translate(ist_p_tag.text, dest=input_lang)
  166.         lxml1 = lxml1.replace(ist_p_tag.text,translated_p.text)
  167.  
  168.       if(len(to_p_tag) == 0):
  169.         print("<p class='text_obisnuit' /> not found")
  170.       else:
  171.         for p in to_p_tag:
  172.           recursively_translate(translator, p, input_lang)
  173.         # translated_p = translator.translate(to_p_tag.text, dest=input_lang)
  174.         # lxml1 = lxml1.replace(to_p_tag.text,translated_p.text)
  175.  
  176.  
  177.       if(desc_tag == None):
  178.         print("<h3   /> not found")
  179.       else:
  180.         translated_p = translator.translate(desc_tag.text, dest=input_lang)
  181.         lxml1 = lxml1.replace(desc_tag.text,translated_p.text)
  182.  
  183.       if(second_p_tag == None):
  184.         print("<p class='donoo' /> not found")
  185.       else:
  186.         translated_p_2 = translator.translate(second_p_tag.text, dest=input_lang)
  187.         lxml1= lxml1.replace(second_p_tag.text,translated_p_2.text)
  188.  
  189.       soup = remove_tags(data)
  190.       text = soup.text
  191.       replace_text = text
  192.       text = text.replace("\ufeff" , " ")
  193.       text = text.replace("\n" , " ")
  194.       text = re.sub(' +', ' ', text)
  195.       text = textwrap.wrap(text, 4800, break_long_words=False, )
  196.       if(len(text) == 1 and text[0] == ''):
  197.         pass
  198.       else:
  199.         translation = ""
  200.         linecount = 0
  201.         for line in text:
  202.           try:
  203.             translated_line = translator.translate(line, dest=input_lang)
  204.             #print("Translated line: ",translated_line.text)
  205.             lxml1 = lxml1.replace(line,translated_line.text)
  206.           except Exception as e:
  207.             print(e)
  208.             continue
  209.           linecount = linecount + 1
  210.         file_name = file.replace(".html","")
  211.         with open(f"Translated_Folder/{file_name}_{input_lang}.html","w", encoding='utf8') as htmlfile:
  212.           htmlfile.write(lxml1)
  213.     else:
  214.       soup = remove_tags(data)
  215.       text = soup.text
  216.       replace_text = text
  217.       text = text.replace("\ufeff" , " ")
  218.       text = text.replace("\n" , " ")
  219.       text = re.sub(' +', ' ', text)
  220.       text = textwrap.wrap(text, 4800, break_long_words=False, )
  221.       if(len(text) == 1 and text[0] == ''):
  222.         print("No text found")
  223.       else:
  224.         translation = ""
  225.         linecount = 0
  226.         for line in text:
  227.           try:
  228.             translated_line = translator.translate(line, dest=input_lang)
  229.             #print("Translated line: ",translated_line.text)
  230.             lxml1 = lxml1.replace(line,translated_line.text)
  231.           except Exception as e:
  232.             print(e)
  233.             continue
  234.           linecount = linecount + 1
  235.         file_name = file.replace(".html","")
  236.         with open(f"Translated_Folder/{file_name}_{input_lang}.html","w", encoding='utf8') as htmlfile:
  237.           htmlfile.write(lxml1)
  238.     print("{} a fost tradus".format(file))
  239.     pass
  240.  
  241.  
  242. #dt1 = translator.detect(text)
  243.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement