nicuf

parsing-section-html

Jul 30th, 2021 (edited)
462
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. See this (explained):
  2. https://neculaifantanaru.com/en/parsing-python-how-to-copy-a-section-of-data-from-one-html-file-to-other-html-files.html
  3.  
  4.  
  5. import requests
  6. import re
  7.  
  8. # The folder that contains the file you want to parse
  9. english_folder1 = r"d:\Downloads\A"
  10.  
  11. # The folder with the files you want to change
  12. english_folder2 = r"d:\Downloads\B"
  13.  
  14. # The file you want to parse
  15. file_to_parse_from = 'example.html'
  16.  
  17. extension_file = ".html"
  18.  
  19. use_parse_folder = True
  20.  
  21. import os
  22.  
  23. en1_directory = os.fsencode(english_folder1)
  24. en2_directory = os.fsencode(english_folder2)
  25.  
  26. print('Going through english folder')
  27. for file in os.listdir(en2_directory):
  28.     filename = os.fsdecode(file)
  29.     print(filename)
  30.     if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
  31.         continue
  32.     if filename.endswith(extension_file):
  33.         with open(os.path.join(english_folder1, file_to_parse_from), encoding='utf-8') as html:
  34.             html = html.read()
  35.  
  36.             try:
  37.                 with open(os.path.join(english_folder2, filename), encoding='utf-8') as en_html:
  38.                     en_html = en_html.read()
  39.                    
  40.                     title = re.search('<title.+/title>', html)[0]
  41.                     meta = re.search('<meta name="description".+>', html)[0]
  42.                     comment_body = re.search('<!-- ARTICLE START -->.+<!-- ARTICLE FINAL -->', html, flags=re.DOTALL)[0]
  43.  
  44.                     try:
  45.                         comment_body2 = re.search('<!-- FLAGS_1 -->.+<!-- FLAGS -->', html, flags=re.DOTALL)[0]
  46.                         en_html = re.sub('<!-- FLAGS_1 -->.+<!-- FLAGS -->', comment_body2, en_html, flags=re.DOTALL)
  47.                     except:
  48.                         pass
  49.  
  50.                     try:
  51.                         comment_body3 = re.search('<!-- MENU START -->.+<!-- MENU FINAL -->', html, flags=re.DOTALL)[0]
  52.                         en_html = re.sub('<!-- MENU START -->.+<!-- MENU FINAL -->', comment_body3, en_html, flags=re.DOTALL)
  53.                     except:
  54.                         pass
  55.                    
  56.                     en_html = re.sub('<!-- ARTICLE START -->.+<!-- ARTICLE FINAL -->', comment_body, en_html, flags=re.DOTALL)
  57.                     en_html = re.sub('<meta name="description".+>', meta, en_html)
  58.                     en_html = re.sub('<title.+/title>', title, en_html)
  59.             except FileNotFoundError:
  60.                 continue
  61.  
  62.         print(f'{filename} parsed')
  63.         if use_parse_folder:
  64.             try:
  65.                 with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html:
  66.                     new_html.write(en_html)
  67.             except:
  68.                 os.mkdir(english_folder2+r'\parsed')
  69.                 with open(os.path.join(english_folder2+r'\parsed', 'parsed_'+filename), 'w', encoding='utf-8') as new_html:
  70.                     new_html.write(en_html)
  71.         else:
  72.             with open(os.path.join(english_folder2, 'parsed_'+filename), 'w', encoding='utf-8') as html:
  73.                 html.write(en_html)
  74.  
RAW Paste Data