Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# ------------------
# EXPLANATION:
# ENGLISH: https://neculaifantanaru.com/en/python-scripts-examples.html
# ROMANIAN: https://neculaifantanaru.com/python-scripts-examples.html
# ------------------
- import requests
- import re
- import os
# Folder whose HTML files are rewritten in place by this script.
# NOTE(review): this is a raw string, so the value contains two literal
# backslashes after "d:" -- presumably tolerated by Windows; confirm before
# changing it.
cale_folder_html = r"d:\\Folder1"

# File suffixes treated as HTML. The expression `".html" or ".htm"` always
# evaluates to ".html" (a non-empty string is truthy), so ".htm" was silently
# dropped. A tuple keeps both suffixes and can be passed to str.endswith().
extension_file = (".html", ".htm")
def read_text_from_file(file_path):
    """
    Return the entire contents of a file, decoded as UTF-8.

    file_path: path of the file to read from
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path):
    """
    Write a text to a file as UTF-8 bytes, replacing any existing content.

    text: the text to write
    file_path: path of the file to write to
    """
    with open(file_path, 'wb') as target:
        # The 'ignore' error handler drops characters that cannot be encoded
        # instead of raising.
        encoded = text.encode('utf8', 'ignore')
        target.write(encoded)
# Files that are never rewritten, even though they carry an HTML suffix.
_SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'directory.html')

# Suffixes of the files to process (str.endswith accepts a tuple).
_HTML_SUFFIXES = ('.html', '.htm')

# Group 2 is the text to clean; groups 1 and 3 are the surrounding markup.
# The content group stops at the closing quote, so the quote and a trailing
# "/" are no longer captured, and other tags on the same line are untouched.
# '.' does not match a newline, so the whole tag must sit on one line.
_DESCRIPTION_RE = re.compile(r'(<meta name="description" content=")(.*?)(")')
_TITLE_RE = re.compile(r'(<title>)(.*?)(</title>)')

# Ordered (symbol, replacement) pairs. Order matters because replacements are
# applied one after another (e.g. 'ã' -> 'a' runs before 'a©' -> 'e').
# Every entry is kept, duplicates included: visually identical entries may
# have been distinct characters before this listing was published.
# NOTE(review): the whitespace-only keys are copied verbatim and are
# presumably non-breaking spaces -- confirm against the author's file.
_SYMBOL_PAIRS = (
    ('ă', 'a'), ('â', 'a'), ('ã', 'a'), ('â', 'a'), ('ă', 'a'), ('â', 'a'),
    ('ӑ', 'a'), ('ȃ', 'a'), ('â', 'a'), ('ă', 'a'), ('ã', 'a'), ('à', 'a'),
    ('á', 'a'), ('å', 'a'), ('ä', 'a'), ('â', 'a'),
    ('…', ''), ('…', ''), ('"', ''), ('–', '- '),
    (' ', ' '), ('Â ', ' '), ('Â ', ' '),
    # NOTE(review): the published key here was an unescaped apostrophe, which
    # is a syntax error. It was presumably a typographic apostrophe; until
    # that is confirmed this entry is a harmless no-op.
    ("'", "'"),
    ('„', "'"), ('”', "'"),
    ('[', ''), (']', ''), ('/', ''), ('}', ''), ('{', ''),
    ('î', 'i'), ('Î', 'i'), ('î', 'i'), ('î', 'i'), ('Î', 'i'), ('Î', 'i'),
    ('î', 'i'), ('Î', 'i'), ('ȋ', 'i'), ('î', 'i'),
    ('Î', 'I'), ('Ĩ', 'I'), ('Ĩ', 'I'), ('Î', 'I'), ('Î', 'Ĩ'),
    ('ī', 'i'), ('ĭ', 'i'), ('í', 'i'),
    ('!', ' '), ('(', '-'), (')', ' '), (' ', ' '), (',,', ' '),
    ('Ĩ', 'I'),
    ('é', 'e'), ('ê', 'e'), ('é', 'e'), ('a©', 'e'), ('è', 'e'), ('ë', 'e'),
    ('Ë', 'e'),
    ('ș', 's'), ('Ș', 's'), ('Ş', 's'), ('ș', 's'), ('ş', 's'), ('ş', 's'),
    ('ș', 's'), ('Ş', 'S'), ('Ș', 'S'), ('Ș', 'S'),
    ('š', 's'), ('ś', 's'), ('ș', 's'), ('ṣ', 's'),
    ('"', ''), ('’', ''), ('”', ''), ('’', ''), ('„', ''), ('“', ''),
    ('„', ''), ('“', ''), ('”', ''),
    ('<', ''), ('<', ''), ('«', ''), ('»', ''), ('“', ''), ('”', ''),
    ('"', ''), (':', ''), ('&', ''),
    ('ț', 't'), ('ţ', 't'), ('Ţ', 't'), ('ț', 't'), ('ţ', 't'), ('ț', 't'),
    ('Ţ', 'T'), ('Ț', 'T'), ('ť', 't'), ('ṭ', 't'),
)

# dict() keeps a repeated key at its first position with its last value, which
# is exactly what the original run of item assignments produced. For example
# 'Î' ends up mapped to 'Ĩ', and the later 'Ĩ' -> 'I' entry turns it into 'I'.
# Built once here instead of once per processed file.
dict_simboluri = dict(_SYMBOL_PAIRS)


def _replace_symbols(text):
    """Apply every replacement in dict_simboluri to text, in table order."""
    for simbol, inlocuitor in dict_simboluri.items():
        text = text.replace(simbol, inlocuitor)
    return text


def _clean_first_match(pattern, html_text):
    """Clean group 2 of the first match of pattern inside html_text.

    Returns (new_html_text, cleaned_text). When the pattern does not match,
    returns (html_text, None) and leaves the text untouched.
    """
    match = pattern.search(html_text)
    if match is None:
        return html_text, None
    cleaned = _replace_symbols(match.group(2))
    start, end = match.span(2)
    # Splice by position rather than re.sub with a replacement string, so
    # backslashes in the cleaned text are never read as regex escapes.
    return html_text[:start] + cleaned + html_text[end:], cleaned


def _clean_description_and_title(html_text):
    """Clean the meta description and the title of one HTML document.

    Returns (new_html_text, cleaned_title). new_html_text is None when the
    document has no meta description; cleaned_title is None when it has no
    title, in which case only the description is rewritten.
    """
    html_text, description = _clean_first_match(_DESCRIPTION_RE, html_text)
    if description is None:
        return None, None
    return _clean_first_match(_TITLE_RE, html_text)


print('Going through folder')
amount = 1
for filename in os.listdir(cale_folder_html):
    if filename in _SKIPPED_FILES or not filename.endswith(_HTML_SUFFIXES):
        continue
    cale_fisier_html = os.path.join(cale_folder_html, filename)
    html_text, title_text = _clean_description_and_title(
        read_text_from_file(cale_fisier_html)
    )
    if html_text is None:
        # A missing description is reported and the file is left unchanged.
        print("Text has no description")
        continue
    if title_text is not None:
        print(title_text)
    print(f'{filename} parsed ({amount})')
    amount += 1
    write_to_file(html_text, cale_fisier_html)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement