Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- --------------------------
- EXPLANATION:
- ENGLISH: https://neculaifantanaru.com/en/python-save-title-html-tag-to-link.html
- ROMANIAN: https://neculaifantanaru.com/python-save-title-html-tag-to-link.html
- --------------------------
- from bs4 import BeautifulSoup
- from bs4.formatter import HTMLFormatter
- import requests
- import re
- import execjs
- from urllib import parse
- import json
- import os
class UnsortedAttributes(HTMLFormatter):
    """Formatter whose attribute iteration does not reorder a tag's attributes."""

    def attributes(self, tag):
        """Yield every ``(name, value)`` pair of ``tag.attrs`` in stored order."""
        yield from tag.attrs.items()
def read_text_from_file(file_path):
    """Return the whole content of a file decoded as UTF-8 text.

    file_path: path of the file to read.
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path):
    """Write a string to a file as UTF-8 bytes, replacing any previous content.

    text: the string to store; characters that cannot be encoded as UTF-8
        are silently dropped (``'ignore'`` error handler).
    file_path: path of the file to create or overwrite.
    """
    with open(file_path, 'wb') as sink:
        # Binary mode: the bytes are written exactly as encoded, with no
        # platform newline translation.
        payload = text.encode('utf8', 'ignore')
        sink.write(payload)
files_from_folder = "e:\\Folder"
extension_file = ".html"

# Files that must keep their original name and content.
SKIPPED_FILENAMES = ('y_key_e479323ce281e459.html', 'directory.html')

# Prefix of the canonical URL written into every renamed page.
BASE_URL = "https://trinketbox.ro/en/"

# Compiled once: captures the href value of the canonical <link> tag.
CANONICAL_PATTERN = re.compile('<link rel="canonical" href="(.*?)" />')


def slugify_title(title):
    """Turn a page title into a lowercase, dash-separated slug.

    An apostrophe together with the word character that follows it is
    removed first (so "John's" becomes "John"); the remaining runs of word
    characters are lowercased and joined with '-'.

    Returns an empty string when the title contains no word characters.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    without_possessives = re.sub(r"'\w", '', title)
    words = re.findall(r'\w+', without_possessives.lower())
    return '-'.join(words)


def replace_canonical_url(file_text, new_url):
    """Point the canonical link of an HTML document at ``new_url``.

    The href of the first canonical <link> tag is looked up and every
    occurrence of that URL in the document is replaced by ``new_url``.

    Returns a ``(text, found)`` tuple; ``found`` is False (and the text is
    returned unchanged) when the document has no canonical tag.
    """
    match = CANONICAL_PATTERN.search(file_text)
    if match is None:
        return file_text, False
    old_url = match.group(1)
    if not old_url:
        # str.replace('', x) would insert x between every character of the
        # document, so for an empty href only the tag itself is filled in.
        start, end = match.span(1)
        return file_text[:start] + new_url + file_text[end:], True
    return file_text.replace(old_url, new_url), True


def rename_html_files_by_title(folder, extension):
    """Rename each HTML file in ``folder`` after the text of its <title> tag.

    For every file whose name ends with ``extension`` (except the names in
    SKIPPED_FILENAMES) the title is converted to a slug, the canonical URL
    inside the file is updated to BASE_URL + slug + '.html', and the file is
    renamed to slug + '.html'. Files without a usable title, or whose new
    name is already taken by another file, are reported and left untouched.

    Returns the number of files that were renamed.
    """
    renamed = 0
    for filename in os.listdir(folder):
        if filename in SKIPPED_FILENAMES or not filename.endswith(extension):
            continue

        current_path = os.path.join(folder, filename)
        # Read everything up front so the file is closed before it is
        # rewritten and renamed (renaming an open file fails on Windows).
        with open(current_path, encoding='utf-8') as html:
            file_text = html.read()

        soup = BeautifulSoup('<pre>' + file_text + '</pre>', 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            print(f'{filename}: no <title> tag found, file skipped')
            continue

        slug = slugify_title(title_tag.get_text())
        if not slug:
            print(f'{filename}: title has no usable words, file skipped')
            continue

        new_filename = slug + '.html'
        new_path = os.path.join(folder, new_filename)
        # Never clobber a different file that already owns the target name;
        # samefile() keeps case-only renames working on case-insensitive
        # file systems.
        if os.path.exists(new_path) and not os.path.samefile(current_path, new_path):
            print(f'{filename}: {new_filename} already exists, file skipped')
            continue

        updated_text, found = replace_canonical_url(file_text, BASE_URL + new_filename)
        if found:
            write_to_file(updated_text, current_path)
        else:
            print("Nu am gasit tag-ul canonical in fisier")

        # Replace the file name.
        os.rename(current_path, new_path)
        renamed += 1
        print(f'{filename} changed filename ({renamed})')
    return renamed


if __name__ == "__main__":
    rename_html_files_by_title(files_from_folder, extension_file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement