Advertisement
nicuf

Save title html tag as link

Mar 21st, 2022
893
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.97 KB | None | 0 0
  1. --------------------------
  2. EXPLANATION:
  3.  
  4. ENGLISH: https://neculaifantanaru.com/en/python-save-title-html-tag-to-link.html
  5. ROMANIAN: https://neculaifantanaru.com/python-save-title-html-tag-to-link.html
  6. --------------------------
  7.  
  8. from bs4 import BeautifulSoup
  9. from bs4.formatter import HTMLFormatter
  10. import requests
  11. import re
  12. import execjs
  13. from urllib import parse
  14. import json
  15. import os
  16.  
  17. class UnsortedAttributes(HTMLFormatter):
  18.     def attributes(self, tag):
  19.         for k, v in tag.attrs.items():
  20.             yield k, v
  21.  
  22.  
  23. def read_text_from_file(file_path):
  24.     """
  25.    Aceasta functie returneaza continutul unui fisier.
  26.    file_path: calea catre fisierul din care vrei sa citesti
  27.    """
  28.     with open(file_path, encoding='utf8') as f:
  29.         text = f.read()
  30.         return text
  31.  
  32.  
  33. def write_to_file(text, file_path):
  34.     """
  35.    Aceasta functie scrie un text intr-un fisier.
  36.    text: textul pe care vrei sa il scrii
  37.    file_path: calea catre fisierul in care vrei sa scrii
  38.    """
  39.     with open(file_path, 'wb') as f:
  40.         f.write(text.encode('utf8', 'ignore'))
  41.  
  42. files_from_folder = "e:\\Folder"
  43.  
  44. extension_file = ".html"
  45.  
  46. directory = os.fsencode(files_from_folder)
  47.  
  48. amount = 1
  49. for file in os.listdir(directory):
  50.     filename = os.fsdecode(file)
  51.     if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
  52.         continue
  53.  
  54.     if filename.endswith(extension_file):
  55.         current_file_name = ''
  56.         new_file_name = ''
  57.  
  58.         with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
  59.             file_text = html.read()
  60.             soup = BeautifulSoup('<pre>' + file_text + '</pre>', 'html.parser')
  61.             text_title = soup.findAll('title')[0].get_text()
  62.  
  63.             print(f'{filename} changed filename ({amount})')
  64.             amount += 1
  65.             new_filename = text_title
  66.             # replace 's
  67.             new_filename = re.sub('\'\w', '', new_filename)
  68.             new_filename = new_filename.lower()
  69.             words = re.findall(r'\w+', new_filename)
  70.             new_filename = '-'.join(words)
  71.             new_filename = new_filename + '.html'
  72.             new_filename = os.fsdecode(new_filename)
  73.  
  74.             # inlocuire nume fisier
  75.             current_file_name = os.path.join(files_from_folder, filename)
  76.             new_file_name = os.path.join(files_from_folder, new_filename)
  77.  
  78.             canonical_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
  79.             canonical = re.findall(canonical_pattern, file_text)
  80.             if len(canonical) > 0:
  81.                 canonical = canonical[0]
  82.                 link_nou = "https://trinketbox.ro/en/" + '-'.join(words) + ".html"
  83.                 file_text = file_text.replace(canonical, link_nou)
  84.                 write_to_file(file_text, current_file_name)
  85.             else:
  86.                 print("Nu am gasit tag-ul canonical in fisier")
  87.  
  88.         html.close()
  89.         os.rename(current_file_name, new_file_name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement