Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import re
import os

# --- Script configuration -------------------------------------------------
# Folder holding the English HTML pages that the script reads and rewrites.
english_folder2 = r"c:\Folder1"
# Only directory entries whose name ends with this suffix are processed.
extension_file = ".html"
# True: write results into the parse folder; False: write "parsed_<name>"
# copies next to the originals.
use_parse_folder = True

# Bytes form of the folder path, as handed to os.listdir().  Both names are
# derived from the same folder, so they share a single encoded value.
en1_directory = en2_directory = os.fsencode(english_folder2)
# Connector ("stop") words that are ignored when the text of the <title> tag
# is turned into the content of the <meta name="keywords"> tag.
LISTA_CUVINTE_LEGATURA = [
    'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
    'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
    'ale', 'sau', 'dintre', 'intre', 'cu','ce', 'va', 'fi', 'este', 'cand', 'o',
    'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'aceasta', 'pe', 'tu',
    'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
    'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
    'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
    'on', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'make', 'my', 'me', '-',
    'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
    'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he'
]

# Lower-cased view of the stop words, built once.  Keywords are emitted in
# lower case, so the filter has to compare in lower case as well; otherwise a
# capitalised connector such as "The" or "Of" would slip into the result.
_CUVINTE_LEGATURA_LOWER = frozenset(
    cuvant.lower() for cuvant in LISTA_CUVINTE_LEGATURA
)


def creeaza_lista_keywords(titlu):
    """Build the comma-separated keyword string for a page title.

    Only the part of the title before the first vertical bar ``|`` is used.
    Tokens are extracted with the pattern ``(?:\\w|-*\\!)+`` (runs of word
    characters, which may also contain ``!`` optionally preceded by
    hyphens), lower-cased, and dropped when they appear in
    ``LISTA_CUVINTE_LEGATURA``; the stop-word comparison ignores case.

    Args:
        titlu: Text of the page title.

    Returns:
        The remaining keywords, lower-cased, in title order, joined by
        ``", "``.  An empty string when no keyword is left.
    """
    # Keep only the text before the first '|'.
    prima_parte_titlu = titlu.split('|')[0]
    # Extract every word-like token from that first part.
    keywords = re.findall(r'(?:\w|-*\!)+', prima_parte_titlu)
    # Lower-case first, then filter, so "The" is rejected exactly like "the".
    keywords_lower = (keyword.lower() for keyword in keywords)
    keywords_ok = [
        keyword for keyword in keywords_lower
        if keyword not in _CUVINTE_LEGATURA_LOWER
    ]
    return ", ".join(keywords_ok)
# ---------------------------------------------------------------------------
# Main script: for every HTML file in the English folder, copy the text of
# <title>, <link rel="canonical"> and <meta name="description"> into the
# matching Open Graph / meta tags and JSON-LD fields, then write the result.
# ---------------------------------------------------------------------------


def _replace_in_first_match(text, find_pattern, sub_pattern, replacement):
    """Rewrite the first fragment of *text* matching *find_pattern*.

    Inside that fragment every match of *sub_pattern* is replaced by
    *replacement*; all occurrences of the original fragment in *text* are
    then swapped for the rewritten one.  *text* is returned unchanged when
    *find_pattern* does not match.
    """
    found = re.search(find_pattern, text)
    if found is None:
        return text
    old_fragment = found[0]
    # A callable replacement inserts the text literally.  Passing the string
    # itself would make re.sub interpret backslashes and group references
    # that happen to occur in page titles or descriptions.
    new_fragment = re.sub(sub_pattern, lambda _match: replacement, old_fragment)
    return text.replace(old_fragment, new_fragment)


def _copy_head_metadata(html, en_html):
    """Return *en_html* with its metadata synchronised from *html*.

    Values are extracted from *html* (the source page) and written into
    *en_html* (the page being updated).  Every value starts out as ``None``
    for each call, so a page that lacks a tag never inherits data from a
    previously processed page; steps depending on a missing value are
    skipped.
    """
    # --- title -> og:title, keywords, abstract, Subject, JSON-LD fields ---
    title = None
    title_content = None
    title_match = re.search('<title.+/title>', html)
    if title_match is not None:
        title = title_match[0]
        content_match = re.search('>(.+)<', title)
        if content_match is not None:
            title_content = content_match[1]

    if title_content is not None:
        title_attr = f'content="{title_content}"'
        keywords_attr = f'content="{creeaza_lista_keywords(title_content)}"'
        json_title = f': "{title_content}",'
        en_html = _replace_in_first_match(
            en_html, '<meta property="og:title".*>', r'content=".+"', title_attr)
        en_html = _replace_in_first_match(
            en_html, '<meta name="keywords".*>', r'content=".+"', keywords_attr)
        en_html = _replace_in_first_match(
            en_html, '<meta name="abstract".*>', r'content=".+"', title_attr)
        en_html = _replace_in_first_match(
            en_html, '<meta name="Subject".*>', r'content=".+"', title_attr)
        en_html = _replace_in_first_match(
            en_html, '"headline":.+', r':.+', json_title)
        # NOTE(review): the JSON-LD "keywords" field receives the full title
        # text, not the filtered keyword list - confirm this is intended.
        en_html = _replace_in_first_match(
            en_html, '"keywords":.+', r':.+', json_title)

    # --- canonical link -> og:url and JSON-LD "@id" ---
    canonical_match = re.search('<link rel="canonical" href="(.+)".*>', html)
    if canonical_match is not None:
        canonical_content = canonical_match[1]
        en_html = _replace_in_first_match(
            en_html, '<meta property="og:url".*>', r'content=".+"',
            f'content="{canonical_content}"')
        en_html = _replace_in_first_match(
            en_html, '"@id":.+', r':.+', f': "{canonical_content}"')

    # --- meta description -> og:description and JSON-LD "description" ---
    meta_match = re.search('<meta name="description".+>', html)
    meta = meta_match[0] if meta_match is not None else None
    description_match = re.search(
        '<meta name="description" content="(.+)".*>', html)
    if description_match is not None:
        meta_description = description_match[1]
        en_html = _replace_in_first_match(
            en_html, '<meta property="og:description".+/>', r'content=".+"',
            f'content="{meta_description}"')
        en_html = _replace_in_first_match(
            en_html, '"description":.+', r':.+', f': "{meta_description}",')

    # --- finally copy the description tag and the title tag themselves ---
    if meta is not None:
        en_html = re.sub(
            '<meta name="description".+/>', lambda _match: meta, en_html)
    if title is not None:
        en_html = re.sub('<title.+/title>', lambda _match: title, en_html)
    return en_html


print('Going through english folder')
amount = 1
for file in os.listdir(en1_directory):
    filename = os.fsdecode(file)
    print(filename)
    # Pages that must never be rewritten.
    if filename in ('y_key_e479323ce281e459.html', 'directory.html'):
        continue
    if not filename.endswith(extension_file):
        continue

    # Source and target are the same file, so it is read only once.
    try:
        with open(os.path.join(english_folder2, filename), encoding='utf-8') as source_file:
            html = source_file.read()
    except FileNotFoundError:
        # The file disappeared after the directory was listed; skip it.
        continue

    en_html = _copy_head_metadata(html, html)

    print(f'{filename} parsed ({amount})')
    amount += 1

    if use_parse_folder:
        # The parse folder is the input folder itself, so the page is
        # rewritten in place.  Ensure the folder exists up front, so a write
        # failure is reported as such rather than as a failed mkdir.
        output_folder = english_folder2
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, filename)
    else:
        output_path = os.path.join(english_folder2, 'parsed_' + filename)
    with open(output_path, 'w', encoding='utf-8') as new_html:
        new_html.write(en_html)
Add Comment
Please, Sign In to add comment