# catch-cheaters.py: copy the <textarea> contents of saved CodingBat HTML pages into <dir>/<name>.txt files.

import glob, string, os, re
from htmlentitydefs import name2codepoint as n2cp

def substitute_entity(match):
    """Return the character for one HTML entity match, or the text unchanged.

    ``match`` must expose two groups, as the pattern in
    ``decode_htmlentities`` does: group 1 is ``"#"`` for a numeric
    character reference and ``""`` for a named entity; group 2 is the
    reference body (digits, ``x``-prefixed hex digits, or an entity name).

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) references are decoded.
    Named entities are looked up in ``n2cp``.  Anything that cannot be
    decoded (unknown name, malformed number, code point out of range) is
    returned exactly as it appeared in the input instead of raising.
    """
    try:
        to_char = unichr  # Python 2: unichr() yields a unicode character
    except NameError:
        to_char = chr  # Python 3: chr() already covers all of Unicode

    body = match.group(2)

    if match.group(1) == "#":
        # The caller's pattern lets any \w run follow "#", so the body is
        # not guaranteed to be a plain decimal number.
        if "_" in body:
            # "_" is matched by \w and newer int() accepts it as a digit
            # separator; it is never valid in a character reference.
            return match.group()
        try:
            if body[:1] in ("x", "X"):
                codepoint = int(body[1:], 16)
            else:
                codepoint = int(body)
            return to_char(codepoint)
        except (ValueError, OverflowError):
            # Malformed number or code point outside the supported range.
            return match.group()

    codepoint = n2cp.get(body)
    if codepoint is None:
        return match.group()
    return to_char(codepoint)

# Matches "&name;", "&#123;" and similar: group 1 is "#" or "", group 2 is
# the body.  Written as a raw string so "\d" and "\w" reach the regex engine
# without relying on Python passing unknown string escapes through.
_ENTITY_RE = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")


def decode_htmlentities(string):
    """Return ``string`` with its HTML entities replaced by characters.

    Every match of the entity pattern is handed to ``substitute_entity``,
    which decides the replacement text for that match.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept as-is so keyword callers keep working.
    """
    return _ENTITY_RE.sub(substitute_entity, string)


def _parse_title(line):
    """Return ``(folder, problem)`` from a CodingBat title line, or ``None``.

    The line is expected to contain ``CodingBat Java <folder> <problem>``,
    optionally followed by ``</title>``.  ``None`` is returned when the
    marker is missing or is not followed by exactly two words.
    """
    marker = 'CodingBat Java '
    begin = line.find(marker)
    if begin == -1:
        return None
    end = line.find('</title>', begin)
    if end == -1:
        # Closing tag is not on this line: take the rest of the line.
        end = len(line)
    words = line[begin + len(marker):end].split()
    if len(words) != 2:
        return None
    return words[0], words[1]


# For every saved page in the current directory, copy the lines that lie
# strictly between the "<textarea" line and the "</textarea>" line into
# <folder>/<problem>.txt, where both names come from the page title.
for filename in glob.glob('*.html'):
    out = None  # output file for this page; opened once the title is seen
    in_textarea = False
    try:
        with open(filename) as page:
            for line in page:
                if 'title' in line:
                    target = _parse_title(line)
                    if target is not None:
                        folder, problem = target
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        if out is not None:
                            # A later title line replaces the earlier output.
                            out.close()
                        out = open(os.path.join(folder, problem + ".txt"), 'w')
                # Order matters: the closing-tag line is checked before
                # writing and the opening-tag line after, so neither tag
                # line is itself copied.
                if '</textarea>' in line:
                    in_textarea = False
                if in_textarea and out is not None:
                    # NOTE(review): decoded entities may be non-ASCII; the
                    # output is written with the platform default encoding.
                    out.write(decode_htmlentities(line))
                if '<textarea' in line:
                    in_textarea = True
    finally:
        if out is not None:
            out.close()
    if out is None:
        print("skipping %s: no usable 'CodingBat Java' title line" % filename)