Parsing-Batch Processor


English: https://neculaifantanaru.com/en/creating-a-batch-processing-python-with-regex-and-html-tags-parsing.html
Romanian:  https://neculaifantanaru.com/creating-a-batch-processing-python-with-regex-and-html-tags-parsing.html


import requests
import re

# Path to english folder 1
english_folder1 = r"c:\Folder3"

# Path to english folder 2
english_folder2 = r"c:\Folder3"

extension_file = ".html"

use_parse_folder = True #Face folder nou daca pui True, iar daca pui False redenumeste fisierele in acelasi folder

import os

en1_directory = os.fsencode(english_folder1)
en2_directory = os.fsencode(english_folder2)

print('Going through english folder')
for file in os.listdir(en1_directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename == 'y_key_e479323ce281e459.html' or filename == 'TS_4fg4_tr78.html':
        continue
    if filename.endswith(extension_file):
        with open(os.path.join(english_folder1, filename), encoding='utf-8') as html:
            html = html.read()

            try:
                with open(os.path.join(english_folder2, filename), encoding='utf-8') as en_html:
                    en_html = en_html.read()

                    if False:  # if True: will Parse also the content that starts from <!-- ARTICOL START --> to <!-- ARTICOL FINAL --> and so on
                        try:
                            comment_body = re.search('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', html, flags=re.DOTALL)[0]
                            en_html = re.sub('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', comment_body, en_html, flags=re.DOTALL)
                        except:
                            pass

                        try:
                            comment_body2 = re.search('<!-- FLAGS_1 -->.+<!-- FLAGS -->', html, flags=re.DOTALL)[0]
                            en_html = re.sub('<!-- FLAGS_1 -->.+<!-- FLAGS -->', comment_body2, en_html, flags=re.DOTALL)
                        except:
                            pass

                        try:
                            comment_body3 = re.search('<!-- MENIU BARA SUS -->.+<!-- SFARSIT MENIU BARA SUS -->', html, flags=re.DOTALL)[0]
                            en_html = re.sub('<!-- MENIU BARA SUS -->.+<!-- SFARSIT MENIU BARA SUS -->', comment_body3, en_html, flags=re.DOTALL)
                        except:
                            pass

                    # title to meta
                    try:
                        title = re.search('<title.+/title>', html)[0]
                        title_content = re.search('>(.+)<', title)[1]
                    except:
                        pass

                    try:
                        meta_og_title = re.search('<meta property="og:title".*>', en_html)[0]
                        new_meta_og_title = re.sub(r'content=".+"', f'content="{title_content}"', meta_og_title)
                        en_html = en_html.replace(meta_og_title, new_meta_og_title)
                    except:
                        pass

                    try:
                        meta_keywords = re.search('<meta name="keywords".*>', en_html)[0]
                        new_meta_keywords = re.sub(r'content=".+"', f'content="{title_content}"', meta_keywords)
                        en_html = en_html.replace(meta_keywords, new_meta_keywords)
                    except:
                        pass

                    try:
                        meta_abstract = re.search('<meta name="abstract".*>', en_html)[0]
                        new_meta_abstract = re.sub(r'content=".+"', f'content="{title_content}"', meta_abstract)
                        en_html = en_html.replace(meta_abstract, new_meta_abstract)
                    except:
                        pass

                    try:
                        meta_Subject = re.search('<meta name="Subject".*>', en_html)[0]
                        new_meta_Subject = re.sub(r'content=".+"', f'content="{title_content}"', meta_Subject)
                        en_html = en_html.replace(meta_Subject, new_meta_Subject)
                    except:
                        pass

                    try:
                        headline = re.search('"headline":.+', en_html)[0]
                        new_headline = re.sub(r':.+', f': "{title_content}",', headline)
                        en_html = en_html.replace(headline, new_headline)
                    except:
                        pass

                    try:
                        keywords = re.search('"keywords":.+', en_html)[0]
                        new_keywords = re.sub(r':.+', f': "{title_content}",', keywords)
                        en_html = en_html.replace(keywords, new_keywords)
                    except:
                        pass

                    # canonical to meta og:url and @id
                    try:
                        canonical_content = re.search('<link rel="canonical" href="(.+)".*>', html)[1]
                    except:
                        pass

                    try:
                        og_url = re.search('<meta property="og:url".*>', en_html)[0]
                        new_og_url = re.sub(r'content=".+"', f'content="{canonical_content}"', og_url)
                        en_html = en_html.replace(og_url, new_og_url)
                    except:
                        pass

                    try:
                        id = re.search('"@id":.+', en_html)[0]
                        new_id = re.sub(r':.+', f': "{canonical_content}"', id)
                        en_html = en_html.replace(id, new_id)
                    except:
                        pass

                    # meta description to og:description and description
                    try:
                        meta = re.search('<meta name="description".+/>', html)[0]
                        meta_description = re.search('<meta name="description" content="(.+)".+>', html)[1]
                    except:
                        pass

                    try:
                        og_description = re.search('<meta property="og:description".+/>', en_html)[0]
                        new_og_description = re.sub(r'content=".+"', f'content="{meta_description}"', og_description)
                        en_html = en_html.replace(og_description, new_og_description)
                    except:
                        pass

                    try:
                        description = re.search('"description":.+', en_html)[0]
                        new_description = re.sub(r':.+', f': "{meta_description}",', description)
                        en_html = en_html.replace(description, new_description)
                    except:
                        pass

                    try:
                        en_html = re.sub('<meta name="description".+/>', meta, en_html)
                    except:
                        pass

                    try:
                        en_html = re.sub('<title.+/title>', title, en_html)
                    except:
                        pass
            except FileNotFoundError:
                continue

        print(f'{filename} parsed')
        if use_parse_folder:
            try:
                with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(en_html)
            except:
                os.mkdir(english_folder2+r'\parsed')
                with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(en_html)
        else:
            with open(os.path.join(english_folder2, 'parsed_'+filename), 'w', encoding='utf-8') as html:
                html.write(en_html)