Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --------------------
# EXPLANATION:
# ROMANIAN: https://neculaifantanaru.com/gaseste-toate-fisierele-care-contin-cuvinte-duble-string-sau-numar.html
# ENGLISH: https://neculaifantanaru.com/en/find-all-files-that-contain-a-double-words-string-or-number.html
# ---------------------
import os
import re
import sys
def read_text_from_file(file_path: str) -> str:
    """Return the full contents of a text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The whole file as one string, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, encoding='utf8') as f:
        return f.read()
def write_to_file(text: str, file_path: str) -> None:
    """Write text to a file as UTF-8, replacing any existing content.

    The file is opened in binary mode, so newlines are written exactly as
    they appear in ``text`` (no platform-specific translation).

    Args:
        text: The text to write.
        file_path: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(file_path, 'wb') as f:
        # 'ignore' silently drops characters that cannot be encoded as
        # UTF-8 (e.g. lone surrogates) instead of raising.
        f.write(text.encode('utf8', 'ignore'))
def get_duplicates(directory_path, results_file):
    """Report HTML files that share an ``$item_id`` and files that have none.

    Every ``*.html`` entry directly inside ``directory_path`` (except
    ``termeni-si-conditii.html`` and ``parteneri.html``) is read, and the
    first ``$item_id = <value>;`` occurrence is extracted. Files are grouped
    by that value. The report written to ``results_file`` lists first the
    files with no id, then each group of two or more files sharing an id.
    Each duplicated id is also printed to stdout.

    Args:
        directory_path: Directory whose HTML files are scanned (not recursive).
        results_file: Path of the text report to write (overwritten).

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            read or written.
    """
    excluded = ('termeni-si-conditii.html', 'parteneri.html')
    # Raw string: '\$' in a normal string literal is an invalid escape sequence.
    id_pattern = re.compile(r'\$item_id = (.*?);')

    files_by_id = {}
    files_without_id = []

    # Sorted so the report is reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(directory_path)):
        if not name.endswith('.html') or name in excluded:
            continue
        file_text = read_text_from_file(os.path.join(directory_path, name))
        match = id_pattern.search(file_text)
        if match is None:
            files_without_id.append(name)
            continue
        # Only the first id found in a file is considered.
        item_id = match.group(1).strip()
        files_by_id.setdefault(item_id, []).append(name)

    duplicate_groups = []
    for item_id, names in files_by_id.items():
        if len(names) >= 2:
            print(item_id)
            # NOTE(review): indentation was lost in the source this was
            # recovered from; the blank-line separator is assumed to follow
            # each duplicate group - confirm against the original script.
            duplicate_groups.append(''.join(n + '\n' for n in names) + '\n\n')

    result = (
        "FISIERE CARE NU AU ID \n\n"
        + ''.join(n + '\n' for n in files_without_id)
        + '\n'
        + "FISIERE DUPLICAT \n\n"
        + ''.join(duplicate_groups)
    )
    write_to_file(result, results_file)
    print("Scriere efectuata cu succes.")
if __name__ == '__main__':
    # Defaults are the original hard-coded site folder. Optional command-line
    # arguments override them:
    #     script.py [directory_path [results_file]]
    directory_path = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\ro"
    results_file = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\ro\\results_duplicates.txt"
    if len(sys.argv) > 1:
        directory_path = sys.argv[1]
        # By default the report goes into the scanned directory, as before.
        results_file = os.path.join(directory_path, "results_duplicates.txt")
    if len(sys.argv) > 2:
        results_file = sys.argv[2]
    get_duplicates(directory_path, results_file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement