Advertisement
nicuf

Replace string in html tags and replace double spaces with one space

Jan 16th, 2022 (edited)
1,616
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.01 KB | None | 0 0
# Explanation:
# ROMANIAN: https://neculaifantanaru.com/python-inlocuieste-un-string-nbsp-cu-spatiu-si-elimina-toate-spatiile-duble-din-tagurile.html
# ENGLISH:  https://neculaifantanaru.com/en/python-replaces-the-string-nbsp-with-a-space-and-then-removes-all-duplicate-spaces-from-html-tags.html
# ----------------------
  5.  
  6.  
  7. import re
  8. import os
  9.  
  10.  
  11. def read_text_from_file(file_path):
  12.     """
  13.    Aceasta functie returneaza continutul unui fisier.
  14.    file_path: calea catre fisierul din care vrei sa citesti
  15.    """
  16.     with open(file_path, encoding='utf8') as f:
  17.         text = f.read()
  18.         return text
  19.  
  20.  
  21. def write_to_file(text, file_path):
  22.     """
  23.    Aceasta functie scrie un text intr-un fisier.
  24.    text: textul pe care vrei sa il scrii
  25.    file_path: calea catre fisierul in care vrei sa scrii
  26.    """
  27.     with open(file_path, 'wb') as f:
  28.         f.write(text.encode('utf8', 'ignore'))
  29.  
  30.  
  31. def replace_white_spaces(tag_name, file_path):
  32.     """
  33.    Aceasta functie modifica textul dintre un tag dat ca argument.
  34.    """
  35.  
  36.     text = read_text_from_file(file_path)
  37.  
  38.     text = str(text)
  39.  
  40.     articol_pattern = re.compile('<!-- ARTICOL START -->[\s\S]*?<!-- ARTICOL FINAL -->[\s\S]*?')
  41.     text_articol = re.findall(articol_pattern, text)
  42.     if len(text_articol) != 0:
  43.         text_articol = str(text_articol[0])
  44.         pattern = re.compile('<{} class=\".*?\">(.*?)</{}>'.format(tag_name, tag_name))  
  45.        
  46.         tag_texts = re.findall(pattern, text_articol)
  47.        
  48.         new_text_articol = text_articol
  49.         for tag_text in tag_texts:
  50.          
  51.            
  52.            
  53.             new_text = tag_text.strip()
  54.             m = re.findall('<em>(.*?)</em>', new_text)
  55.             if len(m) >= 1:
  56.                 text_em = str(m[0])
  57.                 text_em_new = text_em
  58.                 text_em_new = text_em_new.replace(r'&nbsp;', r' ')
  59.                 text_em_new = text_em_new.strip()
  60.                 new_text = new_text.replace(text_em, text_em_new)
  61.            
  62.             new_text = new_text.replace(r'&nbsp;', r' ')
  63.             new_text = " ".join(new_text.split())
  64.            
  65.             new_text_articol = new_text_articol.replace(tag_text, new_text)
  66.        
  67.         text = text.replace(text_articol, new_text_articol)
  68.         write_to_file(text, file_path)
  69.         print("Fisierul modificat cu succes este: ", file_path)
  70.     else:
  71.         print("Fisierul nu are structura corecta: ", file_path)
  72.  
  73.  
  74.  
  75.  
  76. def gaseste_nbsp(file_path):
  77.     text_Reg = f.read()
  78.  
  79.  
  80. def replace_white_spaces_only_html_php(tag_name, directory_name):
  81.     for root, dirs, files in os.walk(directory_name):
  82.         for f in files:
  83.             if f.endswith('html'):
  84.                 file_path = os.path.join(root, f)
  85.                 replace_white_spaces(tag_name, file_path)
  86.             else:
  87.                 continue
  88.  
  89. if __name__ == '__main__':  
  90.    
  91.    
  92.     directory_name = 'c:\\Folder1'
  93.    
  94.     tag_name = 'p'
  95.    
  96.     replace_white_spaces_only_html_php(tag_name, directory_name)
  97.  
  98.    
  99.  
  100.    
  101.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement