#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Lookup tables.  Every BAD_CHARSET_* table is positional: entry i is replaced
# by GOOD_CHARSET[i], so the order of the entries below matters.
# ---------------------------------------------------------------------------

# Control characters that get stripped: codes 0-31 except TAB (9, horizontal
# tab) and LF (10, line feed / new line), which are kept.  CR (13, carriage
# return) and ESC (27, escape) are part of the stripped set.
BAD_CONTROL_CHARSET = list(map(chr, list(range(0, 9)) + list(range(11, 32))))

# UTF-8 byte sequences of typographic punctuation.
BAD_CHARSET_1 = [
    "\xe2\x80\x98",  # U+2018 left single quotation mark
    "\xe2\x80\x99",  # U+2019 right single quotation mark
    "\xe2\x80\x9c",  # U+201C left double quotation mark
    "\xe2\x80\x9d",  # U+201D right double quotation mark
    "\xe2\x80\x93",  # U+2013 en dash
    "\xe2\x80\x94",  # U+2014 em dash
    "\xe2\x80\xa6",  # U+2026 horizontal ellipsis
]

# The same punctuation as single Windows-1252 bytes.
BAD_CHARSET_2 = list(map(chr, (
    145,  # left single quotation mark
    146,  # right single quotation mark
    147,  # left double quotation mark
    148,  # right double quotation mark
    150,  # en dash
    151,  # em dash
    133,  # horizontal ellipsis
)))

# Corrupted forms.  NOTE(review): these look like ISO-2022-JP sequences whose
# ESC bytes are missing -- presumably they only match once the control
# characters have been stripped from the line; confirm against real data.
BAD_CHARSET_3 = [
    '$B!G(B',
    '$B!G(B',  # duplicated so that the following entries line up with GOOD_CHARSET
    '$B!H(B',
    '$B!I(B',
    '$B"-(B',
    # No entries for GOOD_CHARSET[5] and GOOD_CHARSET[6]; the empty
    # placeholders below stay disabled.
    #'',
    #'',
]

# Plain-ASCII replacements, indexed in parallel with the BAD_CHARSET_* tables.
GOOD_CHARSET = [
    "'",
    "'",
    '"',
    '"',
    '-',
    '--',
    '...',
]
def ascii(s):
    """Return *s* with every character whose code point is 128 or above removed.

    Characters are kept in their original order; nothing is substituted for
    the dropped ones.
    """
    kept = []
    for ch in s:
        if ord(ch) < 128:
            kept.append(ch)
    return "".join(kept)
def _apply_table(line, line_no, table, table_name, replacements=None):
    """Remove or replace every entry of *table* that occurs in *line*.

    line         -- the text of the current input line
    line_no      -- 1-based line number, used only in the log output
    table        -- sequence of bad strings to look for
    table_name   -- name of the table, printed in the log output
    replacements -- sequence indexed in parallel with *table*; when None the
                    bad strings are deleted instead of replaced

    Prints one log line per table entry found and returns the cleaned line.
    """
    for pos, bad in enumerate(table):
        if bad not in line:
            continue
        if replacements is None:
            line = line.replace(bad, '')
            print("[%d] Removed %s[%d]" % (line_no, table_name, pos))
        else:
            good = replacements[pos]
            line = line.replace(bad, good)
            print("[%d] Replaced %s[%d] with %s" % (line_no, table_name, pos, good))
    return line


# Clean metadata.xml line by line into metadata.fixed.xml.
#
# The input is streamed instead of being loaded with readlines(), and both
# files are closed by the `with` block even if processing fails part-way.
#
# print() is always called with one pre-formatted argument so the output is
# the same whether it is parsed as a Python 2 print statement or as the
# print function.
#
# NOTE(review): the charset tables are byte sequences, so this relies on
# lines being read as byte strings (Python 2 `str`) -- confirm before running
# under Python 3.
# NOTE(review): the single bytes in BAD_CHARSET_2 (0x85-0x97) can also occur
# as continuation bytes of other UTF-8 sequences, in which case they are
# replaced rather than dropped -- confirm this is acceptable.
with open("metadata.xml", "r") as src, open("metadata.fixed.xml", "w") as out:
    for index, line in enumerate(src, 1):
        # Control characters go first; the later tables are matched against
        # the already-stripped line.
        line = _apply_table(line, index, BAD_CONTROL_CHARSET, "BAD_CONTROL_CHARSET")
        line = _apply_table(line, index, BAD_CHARSET_1, "BAD_CHARSET_1", GOOD_CHARSET)
        line = _apply_table(line, index, BAD_CHARSET_2, "BAD_CHARSET_2", GOOD_CHARSET)
        line = _apply_table(line, index, BAD_CHARSET_3, "BAD_CHARSET_3", GOOD_CHARSET)
        # Any remaining non-ASCII characters are removed!
        line = ascii(line)
        out.write(line)
        # Progress report: the counter shown is the number of the next line,
        # which keeps the printed values the same as before this rewrite.
        next_index = index + 1
        if next_index % 100000 == 0:
            print(next_index)