Advertisement
nicuf

Delete double spaces in HTML tags (second solution)

Dec 1st, 2021
755
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. EXPLANATION:
  2.  
  3. ROMANIAN: https://neculaifantanaru.com/python-sterge-spatiile-goale-duble-din-tagurile-html.html
  4. ENGLISH:  https://neculaifantanaru.com/en/python-delete-double-empty-spaces-in-html-tags.html
  5. ----------------------------------
  6.  
  7. import re
  8. import os
  9.  
  10.  
  def read_text_from_file(file_path):
      """Return the whole content of a text file as one string.

      The file is opened in text mode and decoded as UTF-8.

      Args:
          file_path: Path of the file to read.

      Returns:
          The decoded text of the file.
      """
      with open(file_path, mode='r', encoding='utf8') as source:
          return source.read()
  19.  
  20.  
  def write_to_file(text, file_path):
      """Write a string to a file as UTF-8 bytes, replacing any old content.

      Characters that cannot be encoded as UTF-8 are dropped silently
      (the 'ignore' error handler) instead of raising an error.

      Args:
          text: The string to store.
          file_path: Path of the file to create or overwrite.
      """
      with open(file_path, mode='wb') as target:
          payload = text.encode('utf8', 'ignore')
          target.write(payload)
  29.  
  30.  
  # The article section is everything from the start marker to the first
  # end marker that follows it.
  _ARTICLE_PATTERN = re.compile(
      r'<!-- ARTICOL START -->[\s\S]*?<!-- ARTICOL FINAL -->'
  )
  _EM_PATTERN = re.compile(r'<em>(.*?)</em>')


  def _clean_tag_text(tag_text):
      """Trim tag_text, trim the text of each <em>, collapse whitespace runs."""
      trimmed = _EM_PATTERN.sub(
          lambda match: '<em>{}</em>'.format(match.group(1).strip()),
          tag_text.strip(),
      )
      return ' '.join(trimmed.split())


  def _normalize_article_whitespace(text, tag_name):
      """Return text with whitespace cleaned inside the article's tags.

      Only the first article section of text is touched, and inside it only
      elements of the form <tag_name class="...">...</tag_name> that open
      and close on the same line ('.' does not match a newline).

      Returns None when text has no article section.
      """
      article = _ARTICLE_PATTERN.search(text)
      if article is None:
          return None

      # tag_name is inserted into the pattern as-is (not escaped), exactly
      # as before, so callers that pass a regex fragment keep working.
      tag_pattern = re.compile(
          r'(?P<open><{0} class=".*?">)(?P<body>.*?)(?P<close></{0}>)'.format(
              tag_name
          )
      )

      def rebuild(match):
          # Rewrite each element where it was matched.  A global
          # str.replace() of the old body would also hit every other
          # occurrence of that string in the article; for a body made of a
          # single space that removed every space in the article.
          return (
              match.group('open')
              + _clean_tag_text(match.group('body'))
              + match.group('close')
          )

      cleaned_article = tag_pattern.sub(rebuild, article.group(0))
      return text[:article.start()] + cleaned_article + text[article.end():]


  def replace_white_spaces(tag_name, file_path):
      """Normalize whitespace inside the given tag of one HTML file, in place.

      The file is read, the text between the '<!-- ARTICOL START -->' and
      '<!-- ARTICOL FINAL -->' markers is located, and for every
      <tag_name class="..."> element in it the content is stripped, the text
      inside <em> elements is stripped, and runs of whitespace are collapsed
      to one space.  The result is written back to the same file.

      If the file has no article markers, a message is printed and the file
      is left untouched.

      Args:
          tag_name: Name of the tag to process, e.g. 'p'.
          file_path: Path of the HTML file to rewrite.
      """
      text = read_text_from_file(file_path)
      new_text = _normalize_article_whitespace(text, tag_name)
      if new_text is None:
          print("Fisierul nu are structura corecta: ", file_path)
          return
      write_to_file(new_text, file_path)
  65.  
  66.  
  def replace_white_spaces_only_html_php(tag_name, directory_name):
      """Apply replace_white_spaces to the HTML files of a directory tree.

      The tree below directory_name is walked recursively; every file whose
      name ends with 'html' is processed, all other files are skipped.

      Args:
          tag_name: Name of the tag passed on to replace_white_spaces.
          directory_name: Root directory of the tree to scan.
      """
      for current_dir, _subdirs, file_names in os.walk(directory_name):
          html_names = (name for name in file_names if name.endswith('html'))
          for name in html_names:
              replace_white_spaces(tag_name, os.path.join(current_dir, name))
  75.  
  if __name__ == '__main__':
      # Tag whose content is cleaned up in every processed file.
      tag_name = 'p'
      # Root folder that is scanned recursively for HTML files.
      directory_name = 'c:\\Folder1'

      replace_white_spaces_only_html_php(
          tag_name=tag_name,
          directory_name=directory_name,
      )
  84.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement