Guest User

Untitled

a guest
Oct 22nd, 2018
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.23 KB | None | 0 0
  1. import io
  2. import re
  3. import struct
  4.  
  5. # Per "struct" module docs
  6. UNSIGNED_SHORT = "H"
  7.  
  8. def shorts_as_utf16(short_sequence):
  9. if not isinstance(short_sequence, list):
  10. short_sequence = list(short_sequence)
  11. format = UNSIGNED_SHORT * (len(short_sequence) + 1)
  12. # 0xFEFF is a byte-order marker---however it gets encoded by pack(), the
  13. # UTF-16 decoder is supposed to understand it and use that interpretation
  14. # for the endianness of the remaining bytes. We probably don't need it
  15. # here, but it can't hurt!
  16. bits = struct.pack(format, 0xFEFF, *short_sequence)
  17. return bits.decode("UTF-16")
  18.  
  19. # Numeric XML entities, e.g. "��".
  20. rgx1 = re.compile(r"(?:&#\d+;)+")
  21.  
  22. # Capture one of the numbers inside an entity
  23. rgx2 = re.compile(r"&#(\d+);")
  24.  
  25. def fix_codepoints(s, raw=False):
  26. """Fix malformed XML entities generated by "SMS Backup & Restore".
  27.  
  28. Note: this function may break well-formed numeric entities, so be sure that
  29. the input string does not mix the two.
  30.  
  31. Input:
  32. s -- a string
  33. raw -- whether to do "raw" conversion (see "Output" below)
  34.  
  35. Output:
  36. The string s, but with bad entities fixed. If "raw" is True, the bad
  37. entities are replaced with their actual unicode characters. If "raw"
  38. is False, the bad entities are replaced with correct XML entities.
  39.  
  40. Details:
  41. SMS Backup app encodes complicated unicode characters as
  42. &#XXXXX;&#XXXXX;
  43. where the two XXX numbers are two unsigned shorts that form a UTF-16
  44. character. (Makes sense---it's probably implemented in Java, which
  45. uses UTF-16 encoding for in-memory strings.) Proper XML uses unicode
  46. codepoints (Python's ord()) for the XXX digits, so we need to do
  47. conversion.
  48. """
  49.  
  50. matches = list(rgx1.finditer(s))
  51. if not matches:
  52. return s
  53. with io.StringIO() as out:
  54. i = 0
  55. for m in matches:
  56. out.write(s[i:m.start()])
  57. i = m.end()
  58. repl = shorts_as_utf16(int(i) for i in rgx2.findall(m.group(0)))
  59. if raw:
  60. out.write(repl)
  61. else:
  62. for c in repl:
  63. out.write("&#{};".format(ord(c)))
  64. out.write(s[i:])
  65. return out.getvalue()
Add Comment
Please, Sign In to add comment