Advertisement
Guest User

Untitled

a guest
Feb 19th, 2020
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.81 KB | None | 0 0
  1. import os
  2. import sys
  3. import codecs
  4. from glob import glob
  5. import argparse
  6. from tidylib import tidy_document
  7. from distutils.dir_util import copy_tree, remove_tree
  8. import re
  9.  
  10.  
# Single HTML file whose complete Tidy report is printed before and after the
# fix-up pass.
# NOTE(review): with the default folder this path lies under the source tree,
# not the "<folder>_fixed" copy that the script rewrites, so both reports will
# describe the unmodified file -- confirm that is intended, or use the
# alternative path below.
# sample = r"!WSEE_Books_fixed\EN 2015 CHM Help\Fullhelp\English\148409.html"
sample = r"!WSEE_Books\EN 2015 Context Help\Admin Guide as Context HTML\English\180167.html"
  13.  
  14.  
  15. def html_errors_one_file_detailed(path):
  16.     with codecs.open(path, 'r') as f:
  17.         document, errors = tidy_document(f.read(), options={'numeric-entities': 1})
  18.     return errors
  19.  
  20.  
  21. def html_errors():
  22.     global all_html
  23.     print 'Files: %s' % len(all_html)
  24.     all_err = []
  25.     for filepath in all_html:
  26.         with codecs.open(filepath, 'r') as f:
  27.             document, errors = tidy_document(f.read(), options={'numeric-entities': 1})
  28.         if errors:
  29.             for err in errors.splitlines():
  30.                 if 'missing <!DOCTYPE> declaration' in err:
  31.                     continue
  32.                 if '<script> inserting "type" attribute' in err:
  33.                     continue
  34.                 if '<table> lacks "summary" attribute' in err:
  35.                     continue
  36.                 if 'proprietary attribute' in err:
  37.                     continue
  38.                 if '<table> attribute "height" has invalid value' in err:
  39.                     continue
  40.                 # print filepath, err
  41.                 all_err.append(err)
  42.     return all_err
  43.  
  44.  
  45. if __name__ == '__main__':
  46.     parser = argparse.ArgumentParser(description='Fix <span> tag issue in html files (chm format) exported from Author-it.')
  47.     parser.add_argument('folder', metavar='folder', nargs='?', default='!WSEE_Books', help='Folder with html files.')
  48.     args = parser.parse_args()
  49.  
  50.     new_folder = args.folder + '_fixed'
  51.     if os.path.exists(new_folder):
  52.         print 'Deleting old dir %s...' % new_folder
  53.         remove_tree(new_folder)
  54.  
  55.     copy_tree(args.folder, new_folder)
  56.  
  57.     all_html = [y for x in os.walk(new_folder) for y in glob(os.path.join(x[0], '*.html'))]
  58.  
  59.     print 'Checking original files...'
  60.     print 'Errors: %s' % len(html_errors())
  61.  
  62.     print 'Checking sample file: %s...' % sample
  63.     print(html_errors_one_file_detailed(sample))
  64.  
  65.     print 'Fixing tags...'
  66.     for filepath in all_html:
  67.         with codecs.open(filepath, 'r') as f:
  68.             content = f.read()
  69.         new_content = re.sub(r'<a(.*?)><span class="guicharacter"></a>(.*?)<a(.*?)></span></a>',
  70.                              r'<a\1></a><span class="guicharacter">\2</span><a\3></a>',
  71.                              content)
  72.         with codecs.open(filepath, 'w') as f:
  73.             f.write(new_content)
  74.  
  75.     print 'Checking fixed files...'
  76.     print 'Errors: %s' % len(html_errors())
  77.  
  78.     print 'Checking sample file: %s...' % sample
  79.     print(html_errors_one_file_detailed(sample))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement