Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -----------------
- EXPLANATION:
- English: https://neculaifantanaru.com/en/find-all-files-that-contain-a-double-words-string-or-number.html
- Romanian: https://neculaifantanaru.com/gaseste-toate-fisierele-care-contin-cuvinte-duble-string-sau-numar.html
- -----------------
- import os
- import re
def read_text_from_file(file_path):
    """Return the full text content of a file.

    The file is decoded as UTF-8; bytes that cannot be decoded are silently
    dropped (``errors='ignore'``) instead of raising.

    file_path: path of the file to read
    """
    with open(file_path, errors='ignore', encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path, encoding='utf8'):
    """Write a text to a file, replacing any existing content.

    text: the text to write
    file_path: path of the file to write to
    encoding: codec used to encode the text (UTF-8 by default); characters
        the codec cannot represent are dropped rather than raising
    """
    # Encode explicitly and write bytes so no newline translation happens.
    # The ``encoding`` argument is honoured here; it used to be ignored in
    # favour of a hard-coded UTF-8.
    with open(file_path, 'wb') as f:
        f.write(text.encode(encoding, 'ignore'))
def get_duplicates(directory_path, results_file, tag):
    """Audit the ``$item_id`` values found in the HTML files of a directory.

    Every ``.html`` file directly inside ``directory_path`` (except
    ``termeni-si-conditii.html`` and ``parteneri.html``) is searched for its
    first ``$item_id = <value>;`` assignment. A report with three sections is
    written to ``results_file``: files with no ID, groups of files that share
    the same ID, and the IDs missing from the expected range. The highest ID
    and the missing IDs are also printed.

    directory_path: directory to scan (not recursive)
    results_file: path of the report file (overwritten)
    tag: 'ro' checks the range 1..max ID, 'en' checks 5000..max ID; with any
        other value no missing IDs are reported

    Raises ValueError if a captured ID is not an integer literal.
    """
    id_pattern = re.compile(r'\$item_id = (.*?);')
    excluded = ('termeni-si-conditii.html', 'parteneri.html')

    files_by_id = {}
    files_without_id = []
    # os.listdir returns names in arbitrary order; sort so the report is
    # reproducible from run to run.
    for name in sorted(os.listdir(directory_path)):
        if not name.endswith('.html') or name in excluded:
            continue
        file_text = read_text_from_file(os.path.join(directory_path, name))
        match = id_pattern.search(file_text)
        if match is None:
            files_without_id.append(name)
            continue
        # Only the first assignment in the file counts.
        files_by_id.setdefault(match.group(1).strip(), []).append(name)

    # One group per ID that is used by two or more files; each group is
    # followed by two blank lines.
    duplicate_section = ''.join(
        ''.join(name + '\n' for name in names) + '\n\n'
        for names in files_by_id.values()
        if len(names) >= 2
    )

    # Convert the IDs to integers; a set keeps the membership test cheap.
    known_ids = {int(item_id) for item_id in files_by_id}
    # max() of an empty collection raises, so an ID-less directory is
    # handled explicitly.
    highest_id = max(known_ids) if known_ids else None
    first_expected = {'ro': 1, 'en': 5000}.get(tag)
    if highest_id is None or first_expected is None:
        missing_ids = []
    else:
        missing_ids = [
            number
            for number in range(first_expected, highest_id + 1)
            if number not in known_ids
        ]

    print("MAX: ", highest_id)
    print("NUMERE CARE LIPSESC: ", missing_ids)

    result = (
        "FISIERE CARE NU AU ID \n\n"
        + ''.join(name + '\n' for name in files_without_id)
        + '\n'
        + "FISIERE DUPLICAT \n\n"
        + duplicate_section
        + '\n'
        + "NUMERE CARE LIPSESC \n\n"
        + ''.join(str(number) + '\n' for number in missing_ids)
    )
    write_to_file(result, results_file)
    print("Scriere efectuata cu succes.")
if __name__ == '__main__':
    # Folder to audit: change the last path component to "ro" or "en".
    directory_path = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\en"
    # The final report is written to this file.
    results_file = "e:\\Carte\\BB\\17 - Site Leadership\\Principal\\ro\\results_duplicates.txt"
    # The tag ("ro" or "en") must be changed together with the folder above.
    get_duplicates(
        directory_path=directory_path,
        results_file=results_file,
        tag="en",
    )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement