nicuf

Copy Html Links in another frame

Nov 23rd, 2021 (edited)
509
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.61 KB | None | 0 0
  1. Explanation:
  2. ROMANIAN: https://neculaifantanaru.com/python-parsing-muta-linkurile-html-in-alt-cadru.html
  3. ENGLISH: https://neculaifantanaru.com/en/python-parsing-move-html-links-in-another-frame.html
  4. -----------------------
  5.  
  6. import re
  7. import os
  8.  
  9.  
  10. def read_text_from_file(file_path):
  11.     """
  12.    Aceasta functie returneaza continutul unui fisier.
  13.    file_path: calea catre fisierul din care vrei sa citesti
  14.    """
  15.     with open(file_path, 'r') as f:
  16.         text = f.read()
  17.         return text
  18.  
  19. def write_to_file(text, file_path):
  20.     """
  21.    Aceasta functie scrie un text intr-un fisier.
  22.    text: textul pe care vrei sa il scrii
  23.    file_path: calea catre fisierul in care vrei sa scrii
  24.    """
  25.     with open(file_path, 'w') as f:
  26.         f.write(text)
  27.  
  28. def check_link(file_path):
  29.     text = read_text_from_file(file_path)
  30.     # transformam textul din fisier intr-un string
  31.     text = str(text)
  32.     pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  33.     canonical_link = re.findall(pattern, text)
  34.     if len(canonical_link) != 0:
  35.         file_name = canonical_link[0].split('/')[-1]
  36.         # -1 se foloseste cand vrei sa extragi elementul de pe ultima pozitie dintr-o lista | numerotarea incepe de la 0
  37.         flags_pattern = re.compile('<!-- FLAGS_1 -->[\s\S]*?<!-- FLAGS -->[\s\S]*?')
  38.         text_flags = str(re.findall(flags_pattern, text)[0])
  39.         # print("before: ", text_flags)
  40.         languages = ['en', 'ar', 'zh', 'hi', 'de', 'ru']
  41.         text_flags_new = text_flags
  42.         for language in languages:
  43.             template = re.compile('<a href=\"https://neculaifantanaru.com/{}/(.*?)\">'.format(language))
  44.             links = re.findall(template, text_flags)
  45.             for link in links:
  46.                 if link != file_name:
  47.                     text_flags_new = text_flags_new.replace(link, file_name)
  48.         # print("after: ", text_flags_new)
  49.         text = text.replace(text_flags, text_flags_new)
  50.         write_to_file(text, file_path)
  51.     else:
  52.         print("Found a problem with the file: ", file_path)
  53.  
  54.  
  55. def check_links_for_all_files(directory_name):
  56.     for file in os.listdir(directory_name):
  57.         filename = str(file)
  58.         print(filename)
  59.         # verificam daca fisierul se termina cu extensia html sau php
  60.         if filename.endswith(".html"): #or filename.endswith(".php"):
  61.             file_path = os.path.join(directory_name, filename)
  62.             # pentru fiecare fisier gasit, stergem spatiile in plus
  63.             check_link(file_path)
  64.         else:
  65.             continue
  66.  
  67. if __name__ == '__main__':
  68.     check_links_for_all_files("e:\\Carte\\BB")
  69.  
Add Comment
Please, Sign In to add comment