Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- See this (explained):
- https://neculaifantanaru.com/en/parsing-python-how-to-copy-a-section-of-data-from-one-html-file-to-other-html-files.html
import os
import re

import requests
# The folder that contains the file you want to parse
english_folder1 = r"d:\Downloads\A"
# The folder with the files you want to change
english_folder2 = r"d:\Downloads\B"
# The file you want to parse
file_to_parse_from = 'example.html'
extension_file = ".html"
use_parse_folder = True

# Files inside english_folder2 that are never rewritten.
SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'directory.html')

# (pattern, regex flags, required) for every section copied from the source
# file, listed in the order the substitutions are applied to a target file.
# Patterns compiled without re.DOTALL cannot match across a line break.
SECTION_PATTERNS = (
    ('<!-- FLAGS_1 -->.+<!-- FLAGS -->', re.DOTALL, False),
    ('<!-- MENU START -->.+<!-- MENU FINAL -->', re.DOTALL, False),
    ('<!-- ARTICLE START -->.+<!-- ARTICLE FINAL -->', re.DOTALL, True),
    ('<meta name="description".+>', 0, True),
    ('<title.+/title>', 0, True),
)


def copy_sections(source_html, target_html):
    """Return target_html with its marked sections replaced from source_html.

    Each pattern in SECTION_PATTERNS is looked up in source_html and the
    matched text replaces every match of the same pattern in target_html.
    A pattern that does not occur in target_html leaves it unchanged.

    Args:
        source_html: Text of the file the sections are copied from.
        target_html: Text of the file that receives the sections.

    Returns:
        The updated target text.

    Raises:
        ValueError: If a required section (article, meta description or
            title) is not present in source_html.
    """
    for pattern, flags, required in SECTION_PATTERNS:
        found = re.search(pattern, source_html, flags=flags)
        if found is None:
            if required:
                raise ValueError(
                    f'required section {pattern!r} not found in source file'
                )
            # Optional section (flags / menu): nothing to copy.
            continue
        section = found[0]
        # A callable replacement inserts the text verbatim. Passing the HTML
        # as a replacement string would make re.sub interpret its backslashes
        # (\1, \n, \g<...>) as template escapes.
        target_html = re.sub(
            pattern, lambda _match: section, target_html, flags=flags
        )
    return target_html


def main():
    """Copy the sections of the source file into every file of english_folder2.

    Each updated document is written as 'parsed_<name>', either into the
    'parsed' subfolder of english_folder2 (when use_parse_folder is true) or
    next to the original file. The original files are not modified.
    """
    print('Going through english folder')
    # Read on first use only, so a folder without matching files never
    # requires the source file to exist.
    source_html = None
    for filename in os.listdir(english_folder2):
        print(filename)
        if filename in SKIPPED_FILES:
            continue
        if not filename.endswith(extension_file):
            continue

        if source_html is None:
            source_path = os.path.join(english_folder1, file_to_parse_from)
            with open(source_path, encoding='utf-8') as source_file:
                source_html = source_file.read()

        try:
            target_path = os.path.join(english_folder2, filename)
            with open(target_path, encoding='utf-8') as target_file:
                target_html = target_file.read()
        except FileNotFoundError:
            # The file vanished between the listing and the read.
            continue

        updated_html = copy_sections(source_html, target_html)
        print(f'{filename} parsed')

        if use_parse_folder:
            output_folder = os.path.join(english_folder2, 'parsed')
            # Create the folder up front instead of retrying after a failure.
            os.makedirs(output_folder, exist_ok=True)
        else:
            output_folder = english_folder2
        output_path = os.path.join(output_folder, 'parsed_' + filename)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(updated_html)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement