Advertisement
nicuf

Find all files that contain duplicate string/number

Feb 28th, 2022
1,315
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.48 KB | None | 0 0
  1. --------------------
  2. EXPLANATION:
  3.  
  4. ROMANIAN: https://neculaifantanaru.com/gaseste-toate-fisierele-care-contin-cuvinte-duble-string-sau-numar.html
  5. ENGLISH: https://neculaifantanaru.com/en/find-all-files-that-contain-a-double-words-string-or-number.html
  6. ---------------------
  7.  
  8. import os
  9. import re
  10.  
  11. def read_text_from_file(file_path):
  12.     """
  13.    Aceasta functie returneaza continutul unui fisier.
  14.    file_path: calea catre fisierul din care vrei sa citesti
  15.    """
  16.     with open(file_path, encoding='utf8') as f:
  17.         text = f.read()
  18.         return text
  19.  
  20.  
  21. def write_to_file(text, file_path):
  22.     """
  23.    Aceasta functie scrie un text intr-un fisier.
  24.    text: textul pe care vrei sa il scrii
  25.    file_path: calea catre fisierul in care vrei sa scrii
  26.    """
  27.     with open(file_path, 'wb') as f:
  28.         f.write(text.encode('utf8', 'ignore'))
  29.  
  30.  
  31. def get_duplicates(directory_path, results_file):
  32.     duplicates = dict()
  33.     fisiere_care_nu_au_id = ''
  34.     fisiere_duplicat = ''
  35.     id_pattern = re.compile('\$item_id = (.*?);')
  36.     for f in os.listdir(directory_path):
  37.             if f.endswith('.html') and f != 'termeni-si-conditii.html' and f != "parteneri.html":
  38.                 filepath = directory_path + '//' + f
  39.                 file_text = read_text_from_file(filepath)
  40.                 number = re.findall(id_pattern, file_text)
  41.                 if len(number) != 0:
  42.                     number = number[0]
  43.                     number = number.strip()
  44.                     if number in duplicates.keys():
  45.                         duplicates[number].append(f)
  46.                         # duplicates[number].append(f)
  47.                     else:
  48.                         duplicates[number] = [f]
  49.                 else:
  50.                     fisiere_care_nu_au_id = fisiere_care_nu_au_id + f + '\n'
  51.  
  52.     for key in duplicates.keys():
  53.         if len(duplicates[key]) >= 2:
  54.             print(key)
  55.             for f in duplicates[key]:
  56.                 fisiere_duplicat = fisiere_duplicat + f + '\n'
  57.             fisiere_duplicat += '\n\n'
  58.  
  59.     result = "FISIERE CARE NU AU ID \n\n" + fisiere_care_nu_au_id + '\n' + "FISIERE DUPLICAT \n\n" + fisiere_duplicat
  60.     write_to_file(result, results_file)
  61.  
  62.     print("Scriere efectuata cu succes.")
  63.  
  64. if __name__ == '__main__':
  65.     directory_path = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\ro"
  66.     results_file = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\ro\\results_duplicates.txt"
  67.     get_duplicates(directory_path, results_file)
  68.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement