This week only. Pastebin PRO Accounts Christmas Special! Don't miss out! Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Jun 3rd, 2010  |  syntax: Python  |  size: 1.55 KB  |  views: 99  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. # -*- coding: utf-8 -*-
  2. from __future__ import with_statement
  3.  
  4. import re
  5.  
  6. import sys
  7.  
  8. import os
  9.  
  10. from contextlib import nested
  11.  
  # Python 2 only. The order of these two statements matters:
  # sys.setdefaultencoding is not available on the already-imported sys module
  # (the interpreter's site setup removes it), so sys is reloaded first to get
  # it back. NOTE(review): presumably needed so that the implicit str/unicode
  # conversions in clean() use UTF-8 instead of ASCII -- confirm before removing.
  12. reload(sys)
  13. sys.setdefaultencoding('utf-8')
  14.  
  15.  
  16. def clean(filename):
  17.  
  18.   regex = re.compile(r'(?<!(&#\d{3};))(&#\d{3};){2}(?!(&#\d{3};))')
  19.  
  20.  
  21.  
  22.   try:
  23.  
  24.     os.rename(filename, "%s.old" %filename)
  25.  
  26.   except:
  27.  
  28.     print "Did not rename"
  29.  
  30.  
  31.  
  32.   with nested(open(filename, "wb" ), open(filename+".old", "rb" )) as (destination, source):
  33.  
  34.     counter = 0
  35.  
  36.     for line in source:
  37.  
  38.       rObj = re.search(regex, line)
  39.  
  40.       counter += 1
  41.  
  42.       print counter
  43.  
  44.       if rObj is not None:
  45.  
  46.         hexValues = [hex(int(rObj.group()[2:5])), hex(int(rObj.group()[8:11]))]
  47.         newChar = ''.join([chr(int(c, 16)) for c in hexValues]).decode('utf8')
  48.         newLine = re.sub(regex, newChar, line)
  49.         destination.write(newLine)     
  50.  
  51.       else:
  52.  
  53.         destination.write(line)
  54.  
  55.  
  56.  
  57.   os.remove("%s.old" %filename)
  58.  
  59.  
  60.  
  61. def usage():
  62.  
  63.   print "Usage: python fix-xml.py relase, where release is for example 20091101"
  64.  
  65.   sys.exit()
  66.  
  67.  
  68.  
  69. def main(argv):
  70.  
  71.   if len(argv) == 0 or len(argv[0]) != 8:
  72.  
  73.     usage()
  74.  
  75.   try:
  76.  
  77.     int(argv[0])
  78.  
  79.   except ValueError:
  80.  
  81.     usage()
  82.  
  83.     sys.exit()
  84.  
  85.  
  86.  
  87.   release = argv[0]
  88.  
  89.  
  90.   filename = 'discogs_%s_labels.xml' % release
  91.  
  92.   clean(filename)
  93.  
  94.  
  95.  
  96.   filename = 'discogs_%s_releases.xml' % release
  97.  
  98.   clean(filename)
  99.  
  100.  
  101.   filename = 'discogs_%s_artists.xml' % release
  102.  
  103.   clean(filename)
  104.  
  105.  
  106.  
  107. if __name__ == '__main__':
  108.  
  109.         main(sys.argv[1:])
clone this paste RAW Paste Data