Want more features on Pastebin? Sign Up, it's FREE!
Guest

Encoding fix metadata.xml from gifiles-201211041513

By: a guest on Nov 7th, 2012  |  syntax: Python  |  size: 2.50 KB  |  views: 44  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. # Control characters
  5. BAD_CONTROL_CHARSET = [
  6.     chr(0),
  7.     chr(1),
  8.     chr(2),
  9.     chr(3),
  10.     chr(4),
  11.     chr(5),
  12.     chr(6),
  13.     chr(7),
  14.     chr(8),
  15.     #chr(9), # TAB (horizontal tab)
  16.     #chr(10), # LF (NL line feed, new line)
  17.     chr(11),
  18.     chr(12),
  19.     chr(13), # CR (carriage return)
  20.     chr(14),
  21.     chr(15),
  22.     chr(16),
  23.     chr(17),
  24.     chr(18),
  25.     chr(19),
  26.     chr(20),
  27.     chr(21),
  28.     chr(22),
  29.     chr(23),
  30.     chr(24),
  31.     chr(25),
  32.     chr(26),
  33.     chr(27), # ESC (escape)
  34.     chr(28),
  35.     chr(29),
  36.     chr(30),
  37.     chr(31),
  38. ]
  39.  
  40. # UTF-8
  41. BAD_CHARSET_1 = [
  42.     "\xe2\x80\x98",
  43.     "\xe2\x80\x99",
  44.     "\xe2\x80\x9c",
  45.     "\xe2\x80\x9d",
  46.     "\xe2\x80\x93",
  47.     "\xe2\x80\x94",
  48.     "\xe2\x80\xa6",
  49. ]
  50.  
  51. # Windows-1252
  52. BAD_CHARSET_2 = [
  53.     chr(145),
  54.     chr(146),
  55.     chr(147),
  56.     chr(148),
  57.     chr(150),
  58.     chr(151),
  59.     chr(133),
  60. ]
  61.  
  62. # Corrupted
  63. BAD_CHARSET_3 = [
  64.     '$B!G(B',
  65.     '$B!G(B', # duplicated to map to GOOD_CHARSET properly
  66.     '$B!H(B',
  67.     '$B!I(B',
  68.     '$B"-(B',
  69.     #'',
  70.     #'',
  71. ]
  72.  
  73. # We want to map BAD_CHARSET_* to GOOD_CHARSET
  74. GOOD_CHARSET = [
  75.     "'",
  76.     "'",
  77.     '"',
  78.     '"',
  79.     '-',
  80.     '--',
  81.     '...',
  82. ]
  83.  
  84. def ascii(s):
  85.     return "".join(i for i in s if ord(i) < 128)
  86.  
  87. lines = open("metadata.xml", "r").readlines()
  88. out = open("metadata.fixed.xml", "w")
  89.  
  90. index = 1
  91. for line in lines:
  92.     i = 0
  93.     for char in BAD_CONTROL_CHARSET:
  94.         if char in line:
  95.             line = line.replace(char, '')
  96.             print "[%d] Removed BAD_CONTROL_CHARSET[%d]" % (index, i)
  97.         i += 1
  98.  
  99.     i = 0
  100.     for char in BAD_CHARSET_1:
  101.         if char in line:
  102.             line = line.replace(char, GOOD_CHARSET[i])
  103.             print "[%d] Replaced BAD_CHARSET_1[%d] with %s" % (index, i, GOOD_CHARSET[i])
  104.         i += 1
  105.  
  106.     i = 0
  107.     for char in BAD_CHARSET_2:
  108.         if char in line:
  109.             line = line.replace(char, GOOD_CHARSET[i])
  110.             print "[%d] Replaced BAD_CHARSET_2[%d] with %s" % (index, i, GOOD_CHARSET[i])
  111.         i += 1
  112.  
  113.     i = 0
  114.     for char in BAD_CHARSET_3:
  115.         if char in line:
  116.             line = line.replace(char, GOOD_CHARSET[i])
  117.             print "[%d] Replaced BAD_CHARSET_3[%d] with %s" % (index, i, GOOD_CHARSET[i])
  118.         i += 1
  119.  
  120.     # Any remaining non-ASCII characters are removed!
  121.     line = ascii(line)
  122.  
  123.     out.write(line)
  124.  
  125.     index += 1
  126.     if index % 100000 == 0:
  127.         print index
  128.  
  129. out.close()
clone this paste RAW Paste Data