# Untitled

import os
import sys
import codecs
from glob import glob
import argparse
from tidylib import tidy_document
from distutils.dir_util import copy_tree, remove_tree
import re


# Hard-coded file whose full, unfiltered Tidy report is printed once before
# and once after the fix pass.
# NOTE(review): the active path lives under the source folder "!WSEE_Books",
# while the fix pass only rewrites files in the "<folder>_fixed" copy, so both
# printed reports describe the same unmodified file. The commented-out
# alternative below points into the fixed copy instead - confirm which one
# is intended.
# sample = r"!WSEE_Books_fixed\EN 2015 CHM Help\Fullhelp\English\148409.html"
sample = r"!WSEE_Books\EN 2015 Context Help\Admin Guide as Context HTML\English\180167.html"


def html_errors_one_file_detailed(path):
    """Return the unfiltered HTML Tidy report for a single file.

    The file at *path* is read in full and passed to ``tidy_document`` with
    the ``numeric-entities`` option set to 1. Of the pair that call returns,
    only the second element (the error text) is handed back; the first
    element is discarded.
    """
    tidy_options = {'numeric-entities': 1}
    with codecs.open(path, 'r') as handle:
        markup = handle.read()
    _tidied, report = tidy_document(markup, options=tidy_options)
    return report


def html_errors(files=None):
    """Collect HTML Tidy messages for a set of files, minus known noise.

    Each file is run through ``tidy_document`` (``numeric-entities`` set to
    1) and its report is split into lines. Lines containing any of the
    ignored fragments listed below are dropped; everything else is kept.
    The number of files checked is printed before processing starts.

    Args:
        files: Optional iterable of file paths to check. When omitted, the
            module-level ``all_html`` list built by the script entry point
            is used, which is what a zero-argument call has always done.

    Returns:
        A flat list of the remaining message lines, in file order and then
        in the order Tidy reported them.
    """
    if files is None:
        # Backward-compatible default: read the list prepared by the script.
        files = all_html
    else:
        # Materialise so that len() works for any iterable.
        files = list(files)

    # Message fragments that are expected for these exports and therefore
    # not counted as errors.
    ignored_fragments = (
        'missing <!DOCTYPE> declaration',
        '<script> inserting "type" attribute',
        '<table> lacks "summary" attribute',
        'proprietary attribute',
        '<table> attribute "height" has invalid value',
    )

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('Files: %s' % len(files))

    all_err = []
    for filepath in files:
        with codecs.open(filepath, 'r') as f:
            _document, errors = tidy_document(f.read(), options={'numeric-entities': 1})
        if not errors:
            continue
        all_err.extend(
            err for err in errors.splitlines()
            if not any(fragment in err for fragment in ignored_fragments)
        )
    return all_err


if __name__ == '__main__':
    # Script entry point. Copies the export folder to "<folder>_fixed",
    # reports Tidy messages for the copy, repairs the misnested
    # "guicharacter" spans in every *.html file of the copy, then reports
    # again so the two error counts can be compared.
    parser = argparse.ArgumentParser(description='Fix <span> tag issue in html files (chm format) exported from Author-it.')
    parser.add_argument('folder', metavar='folder', nargs='?', default='!WSEE_Books', help='Folder with html files.')
    args = parser.parse_args()

    # Work on a fresh copy; any previous "<folder>_fixed" is discarded first.
    new_folder = args.folder + '_fixed'
    if os.path.exists(new_folder):
        print('Deleting old dir %s...' % new_folder)
        remove_tree(new_folder)

    copy_tree(args.folder, new_folder)

    # Kept as a module-level name on purpose: html_errors() reads it.
    all_html = [y for x in os.walk(new_folder) for y in glob(os.path.join(x[0], '*.html'))]

    def _report(label):
        """Print the filtered error count for all files and the sample's raw report."""
        print('Checking %s files...' % label)
        print('Errors: %s' % len(html_errors()))
        if os.path.isfile(sample):
            print('Checking sample file: %s...' % sample)
            print(html_errors_one_file_detailed(sample))
        else:
            # The sample path is hard-coded; when it is absent (for example
            # with a non-default folder) skip it instead of aborting the run
            # before any file has been fixed.
            print('Sample file not found, skipping: %s' % sample)

    _report('original')

    print('Fixing tags...')
    # An <a> that opens the span and a later <a> that closes it are rewritten
    # so the span sits between two self-contained anchors. Compiled once,
    # outside the per-file loop.
    misnested_span = re.compile(r'<a(.*?)><span class="guicharacter"></a>(.*?)<a(.*?)></span></a>')
    repaired_markup = r'<a\1></a><span class="guicharacter">\2</span><a\3></a>'
    for filepath in all_html:
        with codecs.open(filepath, 'r') as f:
            content = f.read()
        new_content = misnested_span.sub(repaired_markup, content)
        # Only rewrite files that actually changed, so untouched files keep
        # their original bytes and timestamps.
        if new_content != content:
            with codecs.open(filepath, 'w') as f:
                f.write(new_content)

    _report('fixed')