Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Explanation:
- ROMANIAN: https://neculaifantanaru.com/python-parsing-muta-linkurile-html-in-alt-cadru.html
- ENGLISH: https://neculaifantanaru.com/en/python-parsing-move-html-links-in-another-frame.html
- -----------------------
import os
import re
import sys
def read_text_from_file(file_path, encoding="utf-8"):
    """Return the entire content of a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The content of the file as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()


def write_to_file(text, file_path, encoding="utf-8"):
    """Write a text to a file, replacing any previous content.

    Args:
        text: The text to write.
        file_path: Path of the file to write to.
        encoding: Text encoding used to encode the text. Defaults to UTF-8,
            matching the default of ``read_text_from_file`` so a read/write
            round trip keeps non-ASCII characters intact.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        f.write(text)
def check_link(file_path):
    """Point the language links of a page's flags section at its canonical file name.

    The file name is the last path segment of the URL found in the page's
    ``<link rel="canonical" href="..." />`` tag. Inside the section that
    starts at ``<!-- FLAGS_1 -->`` and ends at ``<!-- FLAGS -->``, every
    ``<a href="https://neculaifantanaru.com/<lang>/...">`` link for the
    languages en, ar, zh, hi, de and ru is rewritten to end in that file
    name. The file is written back only when the section actually changed.

    Args:
        file_path: Path of the HTML file to fix in place.

    Returns:
        None. When the canonical link or the flags section cannot be found,
        a message is printed and the file is left untouched.
    """
    text = read_text_from_file(file_path)

    canonical = re.search(r'<link rel="canonical" href="(.*?)" />', text)
    flags = re.search(r'<!-- FLAGS_1 -->[\s\S]*?<!-- FLAGS -->', text)
    if canonical is None or flags is None:
        print("Found a problem with the file: ", file_path)
        return

    # Index -1 selects the last element of the split URL, i.e. the file name.
    file_name = canonical.group(1).split('/')[-1]
    old_flags = flags.group(0)
    new_flags = _retarget_language_links(old_flags, file_name)

    # Skip the write when every link already points at the canonical name.
    if new_flags != old_flags:
        write_to_file(text.replace(old_flags, new_flags), file_path)


def _retarget_language_links(flags_html, file_name):
    """Return flags_html with each language link's path replaced by file_name."""
    languages = ('en', 'ar', 'zh', 'hi', 'de', 'ru')
    # Only the href value is matched ([^"]*), so other attributes of the
    # anchor and unrelated occurrences of the old name are never touched.
    href = re.compile(
        r'(<a href="https://neculaifantanaru\.com/(?:{})/)[^"]*(?=")'.format(
            '|'.join(languages)
        )
    )
    # A function is used as replacement so that backslashes in file_name are
    # never interpreted as group references.
    return href.sub(lambda match: match.group(1) + file_name, flags_html)
def check_links_for_all_files(directory_name):
    """Run ``check_link`` on every ``.html`` file directly inside a directory.

    The name of every directory entry is printed, whether or not it is
    processed. Subdirectories are not descended into.

    Args:
        directory_name: Path of the directory whose entries are scanned.
    """
    for file in os.listdir(directory_name):
        filename = str(file)
        print(filename)
        # Only .html entries are processed; .php handling is disabled.
        if not filename.endswith(".html"):
            continue
        file_path = os.path.join(directory_name, filename)
        # A directory can also carry a name ending in ".html"; it cannot be
        # opened as a file, so it is skipped.
        if not os.path.isfile(file_path):
            continue
        check_link(file_path)
if __name__ == '__main__':
    # The directory to process may be passed as the first command-line
    # argument; without one, the hard-coded default directory is used.
    check_links_for_all_files(sys.argv[1] if len(sys.argv) > 1 else "e:\\Carte\\BB")
Add Comment
Please, Sign In to add comment