Advertisement
Guest User

catch-cheaters.py

a guest
Feb 17th, 2011
414
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.20 KB | None | 0 0
  1. import glob, string, os, re
  2. from htmlentitydefs import name2codepoint as n2cp
  3.  
  4. def substitute_entity(match):
  5.     ent = match.group(2)
  6.     if match.group(1) == "#":
  7.         return unichr(int(ent))
  8.     else:
  9.         cp = n2cp.get(ent)
  10.  
  11.         if cp:
  12.             return unichr(cp)
  13.         else:
  14.             return match.group()
  15.  
  16. def decode_htmlentities(string):
  17.     entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
  18.     return entity_re.subn(substitute_entity, string)[0]
  19.  
  20.  
  21. for filename in glob.glob('*.html'):
  22.     f = open(filename)
  23.     writeLines = False
  24.     count = 0
  25.     for line in f:
  26.         count = count+1
  27.         if string.find(line, 'title') != -1:
  28.             start = 14 + string.find(line, 'CodingBat Java ')
  29.             end = string.find(line, '</title>')
  30.             dir, fn = string.split(line[start:end])
  31.             if not os.path.exists(dir):
  32.                 os.makedirs(dir)
  33.             o = open(dir+"/"+fn+".txt", 'w')
  34.         if string.find(line, '</textarea>') != -1:
  35.             writeLines = False
  36.         if writeLines == True:
  37.             o.write(decode_htmlentities(line))
  38.         if string.find(line, '<textarea') != -1:
  39.             writeLines = True
  40.     o.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement