Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# EXPLANATION:
# ROMANIAN: https://neculaifantanaru.com/python-gaseste-acele-linkuri-care-se-repeta-in-alte-pagini-html-din-acelasi-folder.html
# ENGLISH: https://neculaifantanaru.com/en/python-find-those-links-that-are-repeated-in-other-html-pages-in-the-same-folder.html
# -----------------
- import sys
- import re
- import os
def read_text_from_file(file_path):
    """
    Return the whole content of a file as text.

    file_path: path of the file to read (decoded as UTF-8)
    """
    with open(file_path, encoding='utf8') as handle:
        return handle.read()
def write_to_file(text, file_path):
    """
    Write a text to a file, replacing any previous content.

    text: the text to write; it is encoded as UTF-8 and characters that
        cannot be encoded are dropped
    file_path: path of the file to write to
    """
    with open(file_path, mode='wb') as output:
        # The file is opened in binary mode, so the encoding is done by hand.
        payload = text.encode('utf8', 'ignore')
        output.write(payload)
def extragere_linkuri(cale_fisier_html):
    """
    Return the distinct links found in the flags section of an HTML file.

    The flags section is the text between the first '<!-- FLAGS_1 -->'
    marker and the next '<!-- FLAGS -->' marker. Every href="..." value
    inside that section is collected.

    cale_fisier_html: path of the HTML file to scan
    returns: list of distinct href values, in order of first appearance;
        an empty list when the file has no flags section
    """
    text_html = read_text_from_file(cale_fisier_html)
    sectiune = re.search(r'<!-- FLAGS_1 -->([\s\S]*?)<!-- FLAGS -->', text_html)
    if sectiune is None:
        # No markers means nothing to scan. Handing the empty match result
        # on to re.findall would raise TypeError instead of yielding no links.
        return []
    links = re.findall(r'href="(.*?)"', sectiune.group(1))
    # dict.fromkeys removes duplicates while keeping a stable order.
    return list(dict.fromkeys(links))
def _raport_pereche(fisier_a, fisier_b, linkuri):
    """
    Print the report entry for one pair of files and return its file text.

    fisier_a, fisier_b: paths of the two files that share links
    linkuri: the shared links to list, in the order they should appear
    returns: the text of this entry as it is stored in the result file
    """
    mesaj = 'Fisier {} ARE LINKURI IN COMUN CU: {}'.format(fisier_a, fisier_b)

    # The console output is spaced more widely than the file output, so the
    # two are produced separately.
    print("Fisiere comune: ")
    for link in linkuri:
        print(link, '\n')
    print(mesaj)
    print('\n\n')

    randuri = ["Fisiere comune: \n"]
    randuri.extend(link + '\n' for link in linkuri)
    randuri.append(mesaj + '\n\n')
    return ''.join(randuri)


def verificare_fisiere(cale_folder_fisiere, cale_fisier_rezultat, limba="en"):
    """
    Report which HTML files of a folder share links with each other.

    Every file in the folder whose name ends with '.html' is scanned with
    extragere_linkuri, then every pair of files is compared. The report has
    two parts: first all pairs that share at least one link, then, after a
    header line, the same pairs restricted to the shared links that contain
    `limba` as one of their '/'-separated parts. The report is printed and
    also written to the result file.

    cale_folder_fisiere: folder that contains the HTML files
    cale_fisier_rezultat: path of the file that receives the report
    limba: language folder name looked for in the links of the second part;
        use "" to search for the Romanian pages
    """
    pagini = []
    for nume in os.listdir(cale_folder_fisiere):
        if not nume.endswith('.html'):
            continue
        # os.path.join uses the separator of the current platform.
        cale_fisier_html = os.path.join(cale_folder_fisiere, nume)
        pagini.append((cale_fisier_html, set(extragere_linkuri(cale_fisier_html))))

    # Compute the shared links of each pair once; both report parts use them.
    perechi = []
    for i, (cale_a, linkuri_a) in enumerate(pagini):
        for cale_b, linkuri_b in pagini[i + 1:]:
            # Sorted so the report does not depend on set iteration order.
            comune = sorted(linkuri_a & linkuri_b)
            if comune:
                perechi.append((cale_a, cale_b, comune))

    bucati = []
    for cale_a, cale_b, comune in perechi:
        bucati.append(_raport_pereche(cale_a, cale_b, comune))

    antet = "==========={}============\n\n".format(limba.upper())
    bucati.append(antet)
    print(antet)

    for cale_a, cale_b, comune in perechi:
        linkuri_limba = [link for link in comune if limba in link.split('/')]
        if linkuri_limba:
            bucati.append(_raport_pereche(cale_a, cale_b, linkuri_limba))

    write_to_file(''.join(bucati), cale_fisier_rezultat)
if __name__ == "__main__":
    # Folder that is scanned for .html pages, and the file that receives
    # the report.
    folder_pagini = r"c:\Folder1"
    fisier_raport = r"c:\Folder1\rezultate.txt"
    verificare_fisiere(folder_pagini, fisier_raport)
    # Alternative input folder kept by the author:
    # verificare_fisiere("e:\\Carte\\BB\\17 - Site Leadership\\Principal\\en", "c:\\Folder1\\rezultate.txt")
Add Comment
Please, Sign In to add comment