Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import with_statement
- import re
- import sys
- import os
- from contextlib import nested
- reload(sys)
- sys.setdefaultencoding('utf-8')
# Exactly two adjacent 3-digit numeric character references that are neither
# preceded nor followed by a third one.  The digits are captured so the
# replacement callback does not have to slice the match by position.
_ENTITY_PAIR_RE = re.compile(
    br'(?<!&#\d{3};)&#(\d{3});&#(\d{3});(?!&#\d{3};)')


def _repair_entity_pair(match):
    """Return the two raw bytes for *match*, or the match text if invalid.

    The two captured numbers are interpreted as byte values.  They are only
    substituted when together they form one valid UTF-8 sequence; otherwise
    (a value above 255, or bytes that do not decode) the original entities
    are kept untouched.
    """
    try:
        raw = bytes(bytearray(int(code) for code in match.groups()))
        raw.decode('utf-8')
    except ValueError:
        # Covers both bytearray's range error and UnicodeDecodeError
        # (a ValueError subclass).
        return match.group(0)
    return raw


def clean(filename):
    """Rewrite *filename* in place, repairing double-encoded characters.

    The file is renamed to ``filename + ".old"``, read line by line in binary
    mode, and written back under its original name.  Every isolated pair of
    ``&#NNN;&#NNN;`` entities whose two numbers form a valid two-byte UTF-8
    sequence is replaced by those two bytes; all other content is copied
    unchanged.  Each pair on a line is decoded individually.  The 1-based
    line number is printed for every line processed, and the ``.old`` file is
    removed once the rewrite has completed.

    Args:
        filename: Path of the file to fix.

    Raises:
        OSError: If the file cannot be renamed and no ``.old`` copy exists to
            read from.  Nothing is created or truncated in that case.
    """
    backup = "%s.old" % filename
    try:
        os.rename(filename, backup)
    except OSError:
        print("Did not rename")
        if not os.path.exists(backup):
            # Without a backup there is nothing to read; going on would only
            # create or truncate `filename`.
            raise

    # Open the source first so a failure here cannot leave an empty output.
    with open(backup, "rb") as source:
        with open(filename, "wb") as destination:
            for counter, line in enumerate(source, 1):
                print(counter)
                destination.write(
                    _ENTITY_PAIR_RE.sub(_repair_entity_pair, line))
    os.remove(backup)
def usage():
    """Write the command-line usage message to stderr and exit.

    Raises:
        SystemExit: Always, with exit status 2 so that callers of the script
            can tell a usage error from a successful run.
    """
    sys.stderr.write(
        "Usage: python fix-xml.py release, "
        "where release is for example 20091101\n")
    sys.exit(2)
def main(argv):
    """Validate the release argument and clean the three dump files.

    Args:
        argv: Command-line arguments without the program name.  The first
            item must be a release identifier made of exactly eight ASCII
            digits, for example ``20091101``.

    For a valid release, ``clean`` is called in turn on
    ``discogs_<release>_labels.xml``, ``discogs_<release>_releases.xml`` and
    ``discogs_<release>_artists.xml``.  For a missing or malformed release,
    ``usage`` is called, which terminates the program.
    """
    # An explicit digit pattern is used instead of int(): int() also accepts
    # a leading sign or surrounding whitespace, which would pass an
    # eight-character length check but produce a bogus file name.
    if not argv or re.match(r'[0-9]{8}\Z', argv[0]) is None:
        usage()
        return

    release = argv[0]
    for kind in ('labels', 'releases', 'artists'):
        clean('discogs_%s_%s.xml' % (release, kind))
# Run only when the file is executed as a script: pass the command-line
# arguments, without the program name, to main().
if __name__ == '__main__':
    main(sys.argv[1:])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement