Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Control characters: every code point below 32, in ascending order, except
# the two that are left out of the list:
#   chr(9)   TAB (horizontal tab)
#   chr(10)  LF  (NL line feed, new line)
# CR (carriage return, 13) and ESC (escape, 27) are therefore included.
BAD_CONTROL_CHARSET = [
    chr(code) for code in range(32) if code not in (9, 10)
]
# UTF-8
# Three-byte encodings of typographic punctuation. Every entry starts with
# the bytes E2 80; only the final byte differs.
BAD_CHARSET_1 = [
    "\xe2\x80" + last_byte
    for last_byte in (
        "\x98",  # U+2018 left single quotation mark
        "\x99",  # U+2019 right single quotation mark
        "\x9c",  # U+201C left double quotation mark
        "\x9d",  # U+201D right double quotation mark
        "\x93",  # U+2013 en dash
        "\x94",  # U+2014 em dash
        "\xa6",  # U+2026 horizontal ellipsis
    )
]
# Windows-1252
# Single-byte code values of typographic punctuation in that code page.
BAD_CHARSET_2 = [
    chr(code)
    for code in (
        145,  # 0x91 left single quotation mark
        146,  # 0x92 right single quotation mark
        147,  # 0x93 left double quotation mark
        148,  # 0x94 right double quotation mark
        150,  # 0x96 en dash
        151,  # 0x97 em dash
        133,  # 0x85 horizontal ellipsis
    )
]
# Corrupted
# Entries pair up with GOOD_CHARSET by index, so the first sequence appears
# twice to fill both single-quote slots. Only five entries exist: the last
# two GOOD_CHARSET slots ('--' and '...') have no corrupted form here.
# NOTE(review): these look like ISO-2022-JP sequences whose ESC bytes are
# missing -- confirm against real input data.
BAD_CHARSET_3 = ['$B!G(B'] * 2 + ['$B!H(B', '$B!I(B', '$B"-(B']
# We want to map BAD_CHARSET_* to GOOD_CHARSET
# Entry i of this list is the plain-ASCII substitute for entry i of each
# BAD_CHARSET_* list, so the order here must not change:
#   0-1 single quotes, 2-3 double quotes, 4 en dash, 5 em dash, 6 ellipsis.
GOOD_CHARSET = ["'"] * 2 + ['"'] * 2 + ['-', '--', '...']
def ascii(s):
    """Return *s* with every character whose ordinal is 128 or higher dropped.

    Characters below 128 are kept in their original order.
    """
    kept = []
    for ch in s:
        if ord(ch) < 128:
            kept.append(ch)
    return "".join(kept)
def _clean_line(line, line_no):
    """Return *line* reduced to plain ASCII, logging every substitution made.

    line    -- one line of input text, including its line terminator.
    line_no -- 1-based line number, used only in the log messages.

    The steps run in a fixed order:
      1. every character listed in BAD_CONTROL_CHARSET is deleted;
      2. each entry of BAD_CHARSET_1, BAD_CHARSET_2 and BAD_CHARSET_3 (in
         that order) is replaced by the GOOD_CHARSET entry at the same index;
      3. whatever non-ASCII characters remain are dropped by ascii().
    """
    # Control characters are removed before the replacement tables are
    # consulted, so the tables are matched against the already-stripped text.
    for pos, char in enumerate(BAD_CONTROL_CHARSET):
        if char in line:
            line = line.replace(char, '')
            print("[%d] Removed BAD_CONTROL_CHARSET[%d]" % (line_no, pos))

    for table_name, table in (
        ("BAD_CHARSET_1", BAD_CHARSET_1),
        ("BAD_CHARSET_2", BAD_CHARSET_2),
        ("BAD_CHARSET_3", BAD_CHARSET_3),
    ):
        for pos, char in enumerate(table):
            if char in line:
                good = GOOD_CHARSET[pos]
                line = line.replace(char, good)
                print("[%d] Replaced %s[%d] with %s"
                      % (line_no, table_name, pos, good))

    # Any remaining non-ASCII characters are removed!
    return ascii(line)


# Stream the input line by line instead of loading it whole, and let the
# context managers close both files even if processing fails part-way.
# The input is opened first, so a missing metadata.xml still fails before
# metadata.fixed.xml is created.
with open("metadata.xml", "r") as src:
    with open("metadata.fixed.xml", "w") as out:
        for index, line in enumerate(src, 1):
            out.write(_clean_line(line, index))
            # Progress report: fires when the number of the upcoming line is
            # a multiple of 100000.
            if (index + 1) % 100000 == 0:
                print(index + 1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement