Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Very slightly adapted from http://stackoverflow.com/a/30141700/106244
- // 99.99% Credit to Martin R!
// Mapping from XML/HTML character entity reference to character.
// Keys are the complete reference as it appears in markup, including the
// leading "&" and the trailing ";" (e.g. "&amp;"), because lookups are made
// with the full "&...;" substring.
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [String: Character] = [
    // XML predefined entities:
    "&quot;" : "\"",
    "&amp;" : "&",
    "&apos;" : "'",
    "&lt;" : "<",
    "&gt;" : ">",

    // HTML character entity references:
    "&nbsp;" : "\u{00A0}",
    "&iexcl;" : "\u{00A1}",
    "&cent;" : "\u{00A2}",
    "&pound;" : "\u{00A3}",
    "&curren;" : "\u{00A4}",
    "&yen;" : "\u{00A5}",
    "&brvbar;" : "\u{00A6}",
    "&sect;" : "\u{00A7}",
    "&uml;" : "\u{00A8}",
    "&copy;" : "\u{00A9}",
    "&ordf;" : "\u{00AA}",
    "&laquo;" : "\u{00AB}",
    "&not;" : "\u{00AC}",
    "&shy;" : "\u{00AD}",
    "&reg;" : "\u{00AE}",
    "&macr;" : "\u{00AF}",
    "&deg;" : "\u{00B0}",
    "&plusmn;" : "\u{00B1}",
    "&sup2;" : "\u{00B2}",
    "&sup3;" : "\u{00B3}",
    "&acute;" : "\u{00B4}",
    "&micro;" : "\u{00B5}",
    "&para;" : "\u{00B6}",
    "&middot;" : "\u{00B7}",
    "&cedil;" : "\u{00B8}",
    "&sup1;" : "\u{00B9}",
    "&ordm;" : "\u{00BA}",
    "&raquo;" : "\u{00BB}",
    "&frac14;" : "\u{00BC}",
    "&frac12;" : "\u{00BD}",
    "&frac34;" : "\u{00BE}",
    "&iquest;" : "\u{00BF}",
    "&Agrave;" : "\u{00C0}",
    "&Aacute;" : "\u{00C1}",
    "&Acirc;" : "\u{00C2}",
    "&Atilde;" : "\u{00C3}",
    "&Auml;" : "\u{00C4}",
    "&Aring;" : "\u{00C5}",
    "&AElig;" : "\u{00C6}",
    "&Ccedil;" : "\u{00C7}",
    "&Egrave;" : "\u{00C8}",
    "&Eacute;" : "\u{00C9}",
    "&Ecirc;" : "\u{00CA}",
    "&Euml;" : "\u{00CB}",
    "&Igrave;" : "\u{00CC}",
    "&Iacute;" : "\u{00CD}",
    "&Icirc;" : "\u{00CE}",
    "&Iuml;" : "\u{00CF}",
    "&ETH;" : "\u{00D0}",
    "&Ntilde;" : "\u{00D1}",
    "&Ograve;" : "\u{00D2}",
    "&Oacute;" : "\u{00D3}",
    "&Ocirc;" : "\u{00D4}",
    "&Otilde;" : "\u{00D5}",
    "&Ouml;" : "\u{00D6}",
    "&times;" : "\u{00D7}",
    "&Oslash;" : "\u{00D8}",
    "&Ugrave;" : "\u{00D9}",
    "&Uacute;" : "\u{00DA}",
    "&Ucirc;" : "\u{00DB}",
    "&Uuml;" : "\u{00DC}",
    "&Yacute;" : "\u{00DD}",
    "&THORN;" : "\u{00DE}",
    "&szlig;" : "\u{00DF}",
    "&agrave;" : "\u{00E0}",
    "&aacute;" : "\u{00E1}",
    "&acirc;" : "\u{00E2}",
    "&atilde;" : "\u{00E3}",
    "&auml;" : "\u{00E4}",
    "&aring;" : "\u{00E5}",
    "&aelig;" : "\u{00E6}",
    "&ccedil;" : "\u{00E7}",
    "&egrave;" : "\u{00E8}",
    "&eacute;" : "\u{00E9}",
    "&ecirc;" : "\u{00EA}",
    "&euml;" : "\u{00EB}",
    "&igrave;" : "\u{00EC}",
    "&iacute;" : "\u{00ED}",
    "&icirc;" : "\u{00EE}",
    "&iuml;" : "\u{00EF}",
    "&eth;" : "\u{00F0}",
    "&ntilde;" : "\u{00F1}",
    "&ograve;" : "\u{00F2}",
    "&oacute;" : "\u{00F3}",
    "&ocirc;" : "\u{00F4}",
    "&otilde;" : "\u{00F5}",
    "&ouml;" : "\u{00F6}",
    "&divide;" : "\u{00F7}",
    "&oslash;" : "\u{00F8}",
    "&ugrave;" : "\u{00F9}",
    "&uacute;" : "\u{00FA}",
    "&ucirc;" : "\u{00FB}",
    "&uuml;" : "\u{00FC}",
    "&yacute;" : "\u{00FD}",
    "&thorn;" : "\u{00FE}",
    "&yuml;" : "\u{00FF}",
    "&OElig;" : "\u{0152}",
    "&oelig;" : "\u{0153}",
    "&Scaron;" : "\u{0160}",
    "&scaron;" : "\u{0161}",
    "&Yuml;" : "\u{0178}",
    "&fnof;" : "\u{0192}",
    "&circ;" : "\u{02C6}",
    "&tilde;" : "\u{02DC}",
    "&Alpha;" : "\u{0391}",
    "&Beta;" : "\u{0392}",
    "&Gamma;" : "\u{0393}",
    "&Delta;" : "\u{0394}",
    "&Epsilon;" : "\u{0395}",
    "&Zeta;" : "\u{0396}",
    "&Eta;" : "\u{0397}",
    "&Theta;" : "\u{0398}",
    "&Iota;" : "\u{0399}",
    "&Kappa;" : "\u{039A}",
    "&Lambda;" : "\u{039B}",
    "&Mu;" : "\u{039C}",
    "&Nu;" : "\u{039D}",
    "&Xi;" : "\u{039E}",
    "&Omicron;" : "\u{039F}",
    "&Pi;" : "\u{03A0}",
    "&Rho;" : "\u{03A1}",
    "&Sigma;" : "\u{03A3}",
    "&Tau;" : "\u{03A4}",
    "&Upsilon;" : "\u{03A5}",
    "&Phi;" : "\u{03A6}",
    "&Chi;" : "\u{03A7}",
    "&Psi;" : "\u{03A8}",
    "&Omega;" : "\u{03A9}",
    "&alpha;" : "\u{03B1}",
    "&beta;" : "\u{03B2}",
    "&gamma;" : "\u{03B3}",
    "&delta;" : "\u{03B4}",
    "&epsilon;" : "\u{03B5}",
    "&zeta;" : "\u{03B6}",
    "&eta;" : "\u{03B7}",
    "&theta;" : "\u{03B8}",
    "&iota;" : "\u{03B9}",
    "&kappa;" : "\u{03BA}",
    "&lambda;" : "\u{03BB}",
    "&mu;" : "\u{03BC}",
    "&nu;" : "\u{03BD}",
    "&xi;" : "\u{03BE}",
    "&omicron;" : "\u{03BF}",
    "&pi;" : "\u{03C0}",
    "&rho;" : "\u{03C1}",
    "&sigmaf;" : "\u{03C2}",
    "&sigma;" : "\u{03C3}",
    "&tau;" : "\u{03C4}",
    "&upsilon;" : "\u{03C5}",
    "&phi;" : "\u{03C6}",
    "&chi;" : "\u{03C7}",
    "&psi;" : "\u{03C8}",
    "&omega;" : "\u{03C9}",
    "&thetasym;" : "\u{03D1}",
    "&upsih;" : "\u{03D2}",
    "&piv;" : "\u{03D6}",
    "&ensp;" : "\u{2002}",
    "&emsp;" : "\u{2003}",
    "&thinsp;" : "\u{2009}",
    "&zwnj;" : "\u{200C}",
    "&zwj;" : "\u{200D}",
    "&lrm;" : "\u{200E}",
    "&rlm;" : "\u{200F}",
    "&ndash;" : "\u{2013}",
    "&mdash;" : "\u{2014}",
    "&lsquo;" : "\u{2018}",
    "&rsquo;" : "\u{2019}",
    "&sbquo;" : "\u{201A}",
    "&ldquo;" : "\u{201C}",
    "&rdquo;" : "\u{201D}",
    "&bdquo;" : "\u{201E}",
    "&dagger;" : "\u{2020}",
    "&Dagger;" : "\u{2021}",
    "&bull;" : "\u{2022}",
    "&hellip;" : "\u{2026}",
    "&permil;" : "\u{2030}",
    "&prime;" : "\u{2032}",
    "&Prime;" : "\u{2033}",
    "&lsaquo;" : "\u{2039}",
    "&rsaquo;" : "\u{203A}",
    "&oline;" : "\u{203E}",
    "&frasl;" : "\u{2044}",
    "&euro;" : "\u{20AC}",
    "&image;" : "\u{2111}",
    "&weierp;" : "\u{2118}",
    "&real;" : "\u{211C}",
    "&trade;" : "\u{2122}",
    "&alefsym;" : "\u{2135}",
    "&larr;" : "\u{2190}",
    "&uarr;" : "\u{2191}",
    "&rarr;" : "\u{2192}",
    "&darr;" : "\u{2193}",
    "&harr;" : "\u{2194}",
    "&crarr;" : "\u{21B5}",
    "&lArr;" : "\u{21D0}",
    "&uArr;" : "\u{21D1}",
    "&rArr;" : "\u{21D2}",
    "&dArr;" : "\u{21D3}",
    "&hArr;" : "\u{21D4}",
    "&forall;" : "\u{2200}",
    "&part;" : "\u{2202}",
    "&exist;" : "\u{2203}",
    "&empty;" : "\u{2205}",
    "&nabla;" : "\u{2207}",
    "&isin;" : "\u{2208}",
    "&notin;" : "\u{2209}",
    "&ni;" : "\u{220B}",
    "&prod;" : "\u{220F}",
    "&sum;" : "\u{2211}",
    "&minus;" : "\u{2212}",
    "&lowast;" : "\u{2217}",
    "&radic;" : "\u{221A}",
    "&prop;" : "\u{221D}",
    "&infin;" : "\u{221E}",
    "&ang;" : "\u{2220}",
    "&and;" : "\u{2227}",
    "&or;" : "\u{2228}",
    "&cap;" : "\u{2229}",
    "&cup;" : "\u{222A}",
    "&int;" : "\u{222B}",
    "&there4;" : "\u{2234}",
    "&sim;" : "\u{223C}",
    "&cong;" : "\u{2245}",
    "&asymp;" : "\u{2248}",
    "&ne;" : "\u{2260}",
    "&equiv;" : "\u{2261}",
    "&le;" : "\u{2264}",
    "&ge;" : "\u{2265}",
    "&sub;" : "\u{2282}",
    "&sup;" : "\u{2283}",
    "&nsub;" : "\u{2284}",
    "&sube;" : "\u{2286}",
    "&supe;" : "\u{2287}",
    "&oplus;" : "\u{2295}",
    "&otimes;" : "\u{2297}",
    "&perp;" : "\u{22A5}",
    "&sdot;" : "\u{22C5}",
    "&lceil;" : "\u{2308}",
    "&rceil;" : "\u{2309}",
    "&lfloor;" : "\u{230A}",
    "&rfloor;" : "\u{230B}",
    "&lang;" : "\u{2329}",
    "&rang;" : "\u{232A}",
    "&loz;" : "\u{25CA}",
    "&spades;" : "\u{2660}",
    "&clubs;" : "\u{2663}",
    "&hearts;" : "\u{2665}",
    "&diams;" : "\u{2666}",
]
extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    var stringByDecodingHTMLEntities: String {
        return decodeHTMLEntities().decodedString
    }

    /// Decodes all HTML character entity references in the `String`.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted to
    /// the Unicode scalar they denote. Anything that does not decode is
    /// copied to the output unchanged.
    ///
    /// - Returns: A tuple of
    ///   - `decodedString`: the string with every decodable reference
    ///     replaced by its character;
    ///   - `replacementOffsets`: one element per replacement, in order of
    ///     occurrence. `index` is the position in the *original* string
    ///     just past the reference's `;`, and `offset` is the change in
    ///     length caused by the replacement (1 minus the length of the
    ///     reference, so it is negative). This allows callers to adjust
    ///     any attributes associated with substrings of the `String`.
    func decodeHTMLEntities() -> (decodedString: String, replacementOffsets: [(index: String.Index, offset: String.Index.Distance)]) {

        // ===== Utility functions =====

        // Record the index offsets of each replacement.
        // This allows anyone to correctly adjust any attributes that may be
        // associated with substrings within the string.
        var replacementOffsets: [(index: String.Index, offset: String.Index.Distance)] = []

        // Convert the number in the string to the corresponding
        // Unicode character, return `nil` for invalid input, e.g.
        //    decodeNumeric("64;", 10)   --> "@"
        //    decodeNumeric("20ac;", 16) --> "€"
        //    decodeNumeric("zz;", 16)   --> nil
        func decodeNumeric(string : String, base : Int32) -> Character? {
            // `decode` passes everything after the "&#" / "&#x" prefix, which
            // still ends with the terminating ';' - strip it before parsing.
            let digits = string.hasSuffix(";") ? String(string.characters.dropLast()) : string

            // The integer parser accepts a leading sign, which is not valid
            // inside a numeric character reference.
            if digits.hasPrefix("+") || digits.hasPrefix("-") {
                return nil
            }
            guard let code = UInt32(digits, radix: Int(base)) else {
                return nil
            }
            // Surrogate code points and values above U+10FFFF are not Unicode
            // scalar values; constructing a UnicodeScalar from them would trap.
            guard code < 0xD800 || (code > 0xDFFF && code <= 0x10FFFF) else {
                return nil
            }
            return Character(UnicodeScalar(code))
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(entity : String) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(3)), base: 16)
            } else if entity.hasPrefix("&#") {
                return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(2)), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
            result.appendContentsOf(self[position ..< ampRange.startIndex])
            position = ampRange.startIndex

            // Find the next ';'. Without one, no further entity can exist.
            guard let semiRange = self.rangeOfString(";", range: position ..< endIndex) else {
                break
            }

            // Everything from '&' up to and including ';' is the candidate entity.
            let entity = self[position ..< semiRange.endIndex]
            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
                // Record offset
                let offset = (index: semiRange.endIndex, offset: 1 - position.distanceTo(semiRange.endIndex))
                replacementOffsets.append(offset)
                position = semiRange.endIndex
            } else {
                // Not a valid entity. Copy only this '&' verbatim and resume
                // scanning right after it, so that a stray ampersand does not
                // hide a real entity further along (e.g. "AT&T &lt;3").
                result.appendContentsOf(self[ampRange])
                position = ampRange.endIndex
            }
        }

        // Copy remaining characters to `result`:
        result.appendContentsOf(self[position ..< endIndex])

        // Return results
        return (decodedString: result, replacementOffsets: replacementOffsets)
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement