Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import glob
import os
import re
import string

try:
    from htmlentitydefs import name2codepoint as n2cp  # Python 2
except ImportError:
    from html.entities import name2codepoint as n2cp  # Python 3
# `unichr` only exists on Python 2; on Python 3 `chr` already returns text.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def substitute_entity(match):
    """Return the replacement text for one matched HTML entity reference.

    `match` is expected to expose two groups: group 1 is "#" for a numeric
    reference (empty otherwise) and group 2 is the entity body.

    * Numeric references are decoded from decimal ("&#65;") or, when the
      body starts with "x"/"X", from hexadecimal ("&#x41;").
    * Named references are looked up in `n2cp`, the name -> code point
      table imported at the top of the file.
    * Anything that cannot be decoded (unknown name, non-numeric body after
      "#", code point rejected by the interpreter) is returned unchanged
      instead of raising, so one bad reference cannot abort a whole run.
    """
    ent = match.group(2)
    original_text = match.group()

    if match.group(1) == "#":
        digits, base = ent, 10
        if ent[:1] in ("x", "X"):
            digits, base = ent[1:], 16
        # int() on newer Pythons tolerates digit-group underscores ("1_0");
        # that is not a valid character reference, so leave it alone.
        if "_" in digits:
            return original_text
        try:
            return _unichr(int(digits, base))
        except (ValueError, OverflowError):
            # Non-numeric body, empty hex digits, or a code point the
            # interpreter cannot represent.
            return original_text

    codepoint = n2cp.get(ent)
    if codepoint is None:
        return original_text
    return _unichr(codepoint)
- def decode_htmlentities(string):
- entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")
- return entity_re.subn(substitute_entity, string)[0]
# Text that precedes "<section> <problem>" inside the page's <title> line.
_TITLE_MARKER = 'CodingBat Java '


def _extract_textarea(html_path):
    """Copy the <textarea> contents of one saved page into a text file.

    The output path comes from the page's title line, which is expected to
    read "...CodingBat Java <section> <problem></title>": the text is
    written to "<section>/<problem>.txt", creating the directory if needed.

    Only lines strictly between the line containing "<textarea" and the
    line containing "</textarea>" are written (neither tag line itself),
    each passed through `decode_htmlentities` first.

    Raises ValueError if the title text does not split into exactly two
    whitespace-separated words.
    """
    out = None
    inside_textarea = False
    try:
        with open(html_path) as page:
            for line in page:
                marker_at = line.find(_TITLE_MARKER)
                title_end = line.find('</title>')
                # Require both pieces, in order, so an unrelated line that
                # merely contains the word "title" is not parsed as a title.
                if -1 < marker_at < title_end:
                    title_text = line[marker_at + len(_TITLE_MARKER):title_end]
                    section, problem = title_text.split()
                    if not os.path.exists(section):
                        os.makedirs(section)
                    if out is not None:
                        # A second title line: do not leak the first handle.
                        out.close()
                    out = open(os.path.join(section, problem + '.txt'), 'w')

                # Order matters: the closing tag is checked before writing
                # and the opening tag after, so neither tag line is copied.
                if '</textarea>' in line:
                    inside_textarea = False
                if inside_textarea and out is not None:
                    # Textarea content seen before any title line has no
                    # destination yet and is skipped.
                    out.write(decode_htmlentities(line))
                if '<textarea' in line:
                    inside_textarea = True
    finally:
        if out is not None:
            out.close()


# Process every saved page in the current working directory.
for filename in glob.glob('*.html'):
    _extract_textarea(filename)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement