Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import re
- import struct
# Format character for a 16-bit unsigned integer, per the "struct" module docs.
UNSIGNED_SHORT = "H"


def shorts_as_utf16(short_sequence):
    """Decode a sequence of UTF-16 code units into a string.

    Input:
        short_sequence -- any iterable (list, generator, ...) of integers in
            the range 0..0xFFFF, each one a single UTF-16 code unit. A
            surrogate pair is given as two consecutive items.

    Output:
        The decoded string. An empty iterable yields "".

    Raises:
        struct.error -- if an item does not fit in an unsigned 16-bit integer.
        UnicodeDecodeError -- if the units are not valid UTF-16, e.g. they
            contain an unpaired surrogate.
    """
    # Materialise first: the input may be a one-shot generator and we need
    # both its length (for the format string) and its items.
    units = list(short_sequence)
    # Pin the byte order on both the packing and the decoding side instead of
    # packing in native order and relying on a byte-order mark to tell the
    # decoder which order that was.
    packed = struct.pack("<" + UNSIGNED_SHORT * len(units), *units)
    return packed.decode("utf-16-le")
# A run of one or more adjacent numeric XML entities, e.g. "&#55357;&#56832;".
rgx1 = re.compile(r"(?:&#\d+;)+")
# Captures the decimal number inside a single entity.
rgx2 = re.compile(r"&#(\d+);")

# Largest valid Unicode code point.
_MAX_CODEPOINT = 0x10FFFF


def _entity_values_to_text(values):
    """Turn the numbers from one run of entities into the text they encode.

    Each number is either a UTF-16 code unit (the malformed style, where a
    character outside the Basic Multilingual Plane is written as two
    surrogates) or a real code point (the well-formed style). Adjacent
    high/low surrogates are merged into one character; every other number is
    taken as the code point it names.
    """
    for value in values:
        if value > _MAX_CODEPOINT:
            raise ValueError(
                "numeric entity &#{};  is outside the Unicode range".format(value)
            )
    text = "".join(chr(value) for value in values)
    # Round-tripping through UTF-16 is what pairs the surrogates up:
    # "surrogatepass" lets lone surrogate code points be encoded as the code
    # units they are, and the strict decode then combines valid pairs and
    # rejects unpaired ones.
    return text.encode("utf-16-le", "surrogatepass").decode("utf-16-le")


def fix_codepoints(s, raw=False):
    """Fix malformed XML entities generated by "SMS Backup & Restore".

    Input:
        s -- a string
        raw -- whether to do "raw" conversion (see "Output" below)

    Output:
        The string s, but with bad entities fixed. If "raw" is True, every
        numeric entity is replaced with its actual unicode character. If
        "raw" is False, every numeric entity is replaced with a correct XML
        entity. A string without numeric entities is returned unchanged.

    Raises:
        UnicodeDecodeError -- if a run of entities contains an unpaired
            surrogate.
        ValueError -- if an entity's number is larger than 0x10FFFF.

    Details:
        SMS Backup app encodes complicated unicode characters as
            &#XXXXX;&#XXXXX;
        where the two XXX numbers are two unsigned shorts that form a UTF-16
        character. Proper XML uses unicode codepoints (Python's ord()) for
        the XXX digits, so we need to do conversion.

        Surrogate values are never valid code points, so well-formed entities
        can be told apart from malformed ones: well-formed entities
        (including those above 0xFFFF) pass through with their meaning
        intact, and calling this function on its own non-raw output is a
        no-op.
    """
    def replace_run(match):
        # Convert one run of adjacent entities as a unit so that the two
        # halves of a surrogate pair are seen together.
        values = [int(digits) for digits in rgx2.findall(match.group(0))]
        text = _entity_values_to_text(values)
        if raw:
            return text
        return "".join("&#{};".format(ord(char)) for char in text)

    return rgx1.sub(replace_run, s)
Add Comment
Please, Sign In to add comment