Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- English: https://neculaifantanaru.com/en/creating-a-batch-processing-python-with-regex-and-html-tags-parsing.html
- Romanian: https://neculaifantanaru.com/creating-a-batch-processing-python-with-regex-and-html-tags-parsing.html
- import requests
- import re
import os

# Folder whose pages supply the metadata (title, canonical URL, description).
english_folder1 = r"c:\Folder3"
# Folder whose same-named pages receive that metadata.
english_folder2 = r"c:\Folder3"
extension_file = ".html"
# True: write the results into a "parsed" sub-folder of english_folder2.
# False: write them into english_folder2 itself.
# Either way each output file is named "parsed_<original name>".
use_parse_folder = True
# True: also copy the comment-delimited page sections (ARTICOL START/FINAL,
# FLAGS_1/FLAGS, MENIU BARA SUS) from the source page into the target page.
# NOTE(review): the script this was recovered from had lost its indentation;
# it is assumed that its `if False:` switch gated all three section copies,
# not only the first one -- confirm against the original script.
copy_comment_sections = False

# Files that are listed but never processed.
_SKIPPED_FILES = frozenset({'y_key_e479323ce281e459.html', 'TS_4fg4_tr78.html'})

# Remainder of a tag up to (not including) its closing ">": any mix of plain
# characters and complete double-quoted attribute values.  Unlike a greedy
# ".*" this stops at the tag's own ">" even when more tags follow on the same
# line, and still tolerates a ">" inside a quoted value.
_ATTRS = r'(?:[^>"]|"[^"]*")*'

_TITLE_TAG = '<title.+?/title>'
_DESCRIPTION_TAG = f'<meta name="description"{_ATTRS}>'

# Start/end comment markers of the sections copied when copy_sections is on.
# ".+" is greedy and used with re.DOTALL: first start marker to last end marker.
_COMMENT_SECTIONS = (
    '<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->',
    '<!-- FLAGS_1 -->.+<!-- FLAGS -->',
    '<!-- MENIU BARA SUS -->.+<!-- SFARSIT MENIU BARA SUS -->',
)

# Meta tags whose content attribute is set to the source page's title text.
_TITLE_META_SELECTORS = (
    'property="og:title"',
    'name="keywords"',
    'name="abstract"',
    'name="Subject"',
)


def _find(pattern, text, group=0, flags=0):
    """Return `group` of the first match of `pattern` in `text`, or None."""
    match = re.search(pattern, text, flags)
    return match[group] if match else None


def _replace_all(pattern, replacement, text, flags=0):
    """Replace every match of `pattern` in `text` with `replacement`, literally.

    A callable is passed to re.sub so that backslashes and group references
    inside `replacement` (which comes from page content) are not interpreted.
    """
    return re.sub(pattern, lambda _match: replacement, text, flags=flags)


def _set_meta_content(page, selector, value):
    """Set content="..." of the first `<meta {selector} ...>` tag in `page`.

    Returns `page` unchanged when `value` is None or no such tag exists.
    Every occurrence of that exact tag text in the page is updated.
    """
    if value is None:
        return page
    tag = _find(f'<meta {selector}{_ATTRS}>', page)
    if tag is None:
        return page
    new_tag = _replace_all(r'content="[^"]*"', f'content="{value}"', tag)
    return page.replace(tag, new_tag)


def _set_json_line(page, key, value, trailing=','):
    """Rewrite the rest of the line that starts at `"key":` to hold `value`.

    Everything from `"key":` to the end of that line becomes
    `"key": "value"` followed by `trailing`.  Returns `page` unchanged when
    `value` is None or the key is not present.
    """
    if value is None:
        return page
    line = _find(f'"{key}":.+', page)
    if line is None:
        return page
    return page.replace(line, f'"{key}": "{value}"{trailing}')


def sync_metadata(source_html: str, target_html: str, *,
                  copy_sections: bool = False) -> str:
    """Return `target_html` with its metadata taken from `source_html`.

    Values read from the source page and where they are written in the target:

    * text of `<title>`  -> content of the og:title, keywords, abstract and
      Subject meta tags, and the `"headline"` / `"keywords"` lines;
    * canonical link href -> content of the og:url meta tag and the `"@id"`
      line (written without a trailing comma);
    * description meta content -> content of the og:description meta tag and
      the `"description"` line;
    * finally the whole description meta tag and the whole `<title>` element
      of the target are replaced by the source's.

    Each value is looked up afresh on every call; when the source page lacks
    one, the target fields that depend on it are left untouched.  Likewise a
    field missing from the target is simply skipped.  Values are inserted
    verbatim -- no HTML or JSON escaping is applied.

    Args:
        source_html: Text of the page the values are read from.
        target_html: Text of the page to update.
        copy_sections: When True, first replace each comment-delimited
            section of the target (see `_COMMENT_SECTIONS`) with the
            corresponding section of the source.

    Returns:
        The updated page text.
    """
    page = target_html

    if copy_sections:
        for pattern in _COMMENT_SECTIONS:
            section = _find(pattern, source_html, flags=re.DOTALL)
            if section is not None:
                page = _replace_all(pattern, section, page, flags=re.DOTALL)

    # Title text -> title-derived meta tags and JSON lines.
    title_tag = _find(_TITLE_TAG, source_html)
    title_text = None
    if title_tag is not None:
        title_text = _find('>(.+)<', title_tag, 1)
    for selector in _TITLE_META_SELECTORS:
        page = _set_meta_content(page, selector, title_text)
    page = _set_json_line(page, 'headline', title_text)
    page = _set_json_line(page, 'keywords', title_text)

    # Canonical URL -> og:url and "@id".
    canonical_url = _find('<link rel="canonical" href="([^"]+)"', source_html, 1)
    page = _set_meta_content(page, 'property="og:url"', canonical_url)
    page = _set_json_line(page, '@id', canonical_url, trailing='')

    # Meta description -> og:description and "description".
    description_tag = _find(_DESCRIPTION_TAG, source_html)
    description_text = None
    if description_tag is not None:
        description_text = _find('content="([^"]+)"', description_tag, 1)
    page = _set_meta_content(page, 'property="og:description"', description_text)
    page = _set_json_line(page, 'description', description_text)

    # Whole-tag replacements come last, after the derived fields are set.
    if description_tag is not None:
        page = _replace_all(_DESCRIPTION_TAG, description_tag, page)
    if title_tag is not None:
        page = _replace_all(_TITLE_TAG, title_tag, page)
    return page


def main():
    """Process every page in english_folder1 against its twin in english_folder2.

    For each file name ending in `extension_file` (except the skipped ones),
    the page from english_folder1 is the metadata source and the same-named
    page from english_folder2 is the target.  File names without a twin in
    english_folder2 are skipped.  The result is written as
    "parsed_<name>" according to `use_parse_folder`.
    """
    print('Going through english folder')
    for filename in os.listdir(english_folder1):
        print(filename)
        if filename in _SKIPPED_FILES:
            continue
        if not filename.endswith(extension_file):
            continue

        source_path = os.path.join(english_folder1, filename)
        with open(source_path, encoding='utf-8') as source_file:
            source_html = source_file.read()

        target_path = os.path.join(english_folder2, filename)
        try:
            with open(target_path, encoding='utf-8') as target_file:
                target_html = target_file.read()
        except FileNotFoundError:
            continue

        # NOTE(review): reporting and writing are assumed to belong to the
        # per-page branch (indentation was lost in the recovered script).
        result = sync_metadata(
            source_html, target_html, copy_sections=copy_comment_sections
        )
        print(f'{filename} parsed')

        if use_parse_folder:
            output_folder = os.path.join(english_folder2, 'parsed')
            os.makedirs(output_folder, exist_ok=True)
        else:
            output_folder = english_folder2
        output_path = os.path.join(output_folder, 'parsed_' + filename)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement