SHARE
TWEET

Untitled

a guest Feb 19th, 2020 86 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import os
  2. import sys
  3. import codecs
  4. from glob import glob
  5. import argparse
  6. from tidylib import tidy_document
  7. from distutils.dir_util import copy_tree, remove_tree
  8. import re
  9.  
  10.  
  11. # sample = r"!WSEE_Books_fixed\EN 2015 CHM Help\Fullhelp\English\148409.html"
  12. sample = r"!WSEE_Books\EN 2015 Context Help\Admin Guide as Context HTML\English\180167.html"
  13.  
  14.  
  15. def html_errors_one_file_detailed(path):
  16.     with codecs.open(path, 'r') as f:
  17.         document, errors = tidy_document(f.read(), options={'numeric-entities': 1})
  18.     return errors
  19.  
  20.  
  21. def html_errors():
  22.     global all_html
  23.     print 'Files: %s' % len(all_html)
  24.     all_err = []
  25.     for filepath in all_html:
  26.         with codecs.open(filepath, 'r') as f:
  27.             document, errors = tidy_document(f.read(), options={'numeric-entities': 1})
  28.         if errors:
  29.             for err in errors.splitlines():
  30.                 if 'missing <!DOCTYPE> declaration' in err:
  31.                     continue
  32.                 if '<script> inserting "type" attribute' in err:
  33.                     continue
  34.                 if '<table> lacks "summary" attribute' in err:
  35.                     continue
  36.                 if 'proprietary attribute' in err:
  37.                     continue
  38.                 if '<table> attribute "height" has invalid value' in err:
  39.                     continue
  40.                 # print filepath, err
  41.                 all_err.append(err)
  42.     return all_err
  43.  
  44.  
  45. if __name__ == '__main__':
  46.     parser = argparse.ArgumentParser(description='Fix <span> tag issue in html files (chm format) exported from Author-it.')
  47.     parser.add_argument('folder', metavar='folder', nargs='?', default='!WSEE_Books', help='Folder with html files.')
  48.     args = parser.parse_args()
  49.  
  50.     new_folder = args.folder + '_fixed'
  51.     if os.path.exists(new_folder):
  52.         print 'Deleting old dir %s...' % new_folder
  53.         remove_tree(new_folder)
  54.  
  55.     copy_tree(args.folder, new_folder)
  56.  
  57.     all_html = [y for x in os.walk(new_folder) for y in glob(os.path.join(x[0], '*.html'))]
  58.  
  59.     print 'Checking original files...'
  60.     print 'Errors: %s' % len(html_errors())
  61.  
  62.     print 'Checking sample file: %s...' % sample
  63.     print(html_errors_one_file_detailed(sample))
  64.  
  65.     print 'Fixing tags...'
  66.     for filepath in all_html:
  67.         with codecs.open(filepath, 'r') as f:
  68.             content = f.read()
  69.         new_content = re.sub(r'<a(.*?)><span class="guicharacter"></a>(.*?)<a(.*?)></span></a>',
  70.                              r'<a\1></a><span class="guicharacter">\2</span><a\3></a>',
  71.                              content)
  72.         with codecs.open(filepath, 'w') as f:
  73.             f.write(new_content)
  74.  
  75.     print 'Checking fixed files...'
  76.     print 'Errors: %s' % len(html_errors())
  77.  
  78.     print 'Checking sample file: %s...' % sample
  79.     print(html_errors_one_file_detailed(sample))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top