Odpowiedź @akashivskyy jest świetna i pokazuje, jak wykorzystać NSAttributedString
do dekodowania encji HTML. Jedną z możliwych wad (jak sam stwierdził) jest to, że usuwane są również wszystkie znaczniki HTML, więc
<strong> 4 &lt; 5 &amp; 3 &gt; 2</strong>
staje się
4 < 5 & 3 > 2
W systemie OS X dostępna jest funkcja CFXMLCreateStringByUnescapingEntities(),
która wykonuje to zadanie:
// Sample input: markup mixed with named (&lt; &amp; &gt;) and numeric
// (&#x20ac; &#64;) character references.
let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Price: 12 &#x20ac;. &#64; "
// Unescape the entity references; the expected output below shows that
// the <strong> tags are kept.
let decoded = CFXMLCreateStringByUnescapingEntities(nil, encoded as CFString, nil) as String
print(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @
ale nie jest ona dostępna w systemie iOS.
Oto implementacja w czystym Swifcie. Dekoduje ona nazwane odwołania znakowe, takie jak &lt;,
przy użyciu słownika, oraz wszystkie numeryczne odwołania znakowe, takie jak &#64;
lub &#x20ac;. (Zauważ, że nie wymieniłem jawnie wszystkich 252 encji HTML).
Swift 4:
// Mapping from XML/HTML character entity reference to character.
// Each key is the complete reference as it appears in the text,
// including the leading "&" and the trailing ";".
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ Substring : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ... (abbreviated: the remaining HTML entities are not listed here)
    "&diams;"   : "♦",
]
extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted
    /// numerically. Text that cannot be decoded is copied unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` if the string is not an unsigned number in the
        // given base or is not a valid Unicode scalar value.
        func decodeNumeric(_ string : Substring, base : Int) -> Character? {
            // `UInt32(_:radix:)` tolerates a leading sign, which must not
            // appear in a numeric character reference.
            guard let first = string.first, first != "+", first != "-",
                let code = UInt32(string, radix: base),
                let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : Substring) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip "&#x" and the trailing ";".
                return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip "&#" and the trailing ";".
                return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self[position...].range(of: "&") {
            result.append(contentsOf: self[position ..< ampRange.lowerBound])
            position = ampRange.lowerBound

            // Scan forward to whatever ends this candidate: the terminating
            // ';', or another '&' (a reference cannot contain one).
            var end = ampRange.upperBound
            while end < endIndex, self[end] != ";", self[end] != "&" {
                formIndex(after: &end)
            }
            guard end < endIndex else {
                // Neither ';' nor '&' follows, nothing left to decode.
                break
            }

            if self[end] == ";", let decoded = decode(self[position ... end]) {
                // Replace by decoded character:
                result.append(decoded)
                position = index(after: end)
            } else {
                // Stray '&' or unknown entity: copy verbatim up to the
                // delimiter and resume there, so that a valid entity
                // following a literal '&' is still decoded.
                result.append(contentsOf: self[position ..< end])
                position = end
            }
        }
        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])
        return result
    }
}
Przykład:
// Sample input: markup mixed with named (&lt; &amp; &gt;) and numeric
// (&#x20ac; &#64;) character references.
let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Price: 12 &#x20ac;. &#64; "
let decoded = encoded.stringByDecodingHTMLEntities
print(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @
Swift 3:
// Mapping from XML/HTML character entity reference to character.
// Each key is the complete reference as it appears in the text,
// including the leading "&" and the trailing ";".
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ String : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ... (abbreviated: the remaining HTML entities are not listed here)
    "&diams;"   : "♦",
]
extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted
    /// numerically. Text that cannot be decoded is copied unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` if the string is not an unsigned number in the
        // given base or is not a valid Unicode scalar value.
        func decodeNumeric(_ string : String, base : Int) -> Character? {
            // `UInt32(_:radix:)` tolerates a leading sign, which must not
            // appear in a numeric character reference.
            guard !string.hasPrefix("+"), !string.hasPrefix("-"),
                let code = UInt32(string, radix: base),
                let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : String) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip "&#x" and the trailing ";".
                return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip "&#" and the trailing ";".
                return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self.range(of: "&", range: position ..< endIndex) {
            result.append(self[position ..< ampRange.lowerBound])
            position = ampRange.lowerBound

            // Scan forward to whatever ends this candidate: the terminating
            // ';', or another '&' (a reference cannot contain one).
            var end = ampRange.upperBound
            while end < endIndex, self[end] != ";", self[end] != "&" {
                end = index(after: end)
            }
            guard end < endIndex else {
                // Neither ';' nor '&' follows, nothing left to decode.
                break
            }

            let afterEnd = index(after: end)
            if self[end] == ";", let decoded = decode(self[position ..< afterEnd]) {
                // Replace by decoded character:
                result.append(decoded)
                position = afterEnd
            } else {
                // Stray '&' or unknown entity: copy verbatim up to the
                // delimiter and resume there, so that a valid entity
                // following a literal '&' is still decoded.
                result.append(self[position ..< end])
                position = end
            }
        }
        // Copy remaining characters to `result`:
        result.append(self[position ..< endIndex])
        return result
    }
}
Swift 2:
// Mapping from XML/HTML character entity reference to character.
// Each key is the complete reference as it appears in the text,
// including the leading "&" and the trailing ";".
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ String : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ... (abbreviated: the remaining HTML entities are not listed here)
    "&diams;"   : "♦",
]
extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) references are converted
    /// numerically. Text that cannot be decoded is copied unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` if the string is not an unsigned number in the
        // given base or is not a valid Unicode scalar value.
        func decodeNumeric(string : String, base : Int) -> Character? {
            // `UInt32(_:radix:)` tolerates a leading sign, which must not
            // appear in a numeric character reference.
            if string.hasPrefix("+") || string.hasPrefix("-") {
                return nil
            }
            // Strict parse: unlike `strtoul`, this yields nil (not 0) for
            // malformed input and for values that do not fit in 32 bits.
            guard let code = UInt32(string, radix: base) else {
                return nil
            }
            // `UnicodeScalar(_: UInt32)` traps on surrogates and on values
            // above U+10FFFF, so reject them here.
            if (code >= 0xD800 && code <= 0xDFFF) || code > 0x10FFFF {
                return nil
            }
            return Character(UnicodeScalar(code))
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(entity : String) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip "&#x" and the trailing ";".
                return decodeNumeric(entity[entity.startIndex.advancedBy(3) ..< entity.endIndex.predecessor()], base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip "&#" and the trailing ";".
                return decodeNumeric(entity[entity.startIndex.advancedBy(2) ..< entity.endIndex.predecessor()], base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
            result.appendContentsOf(self[position ..< ampRange.startIndex])
            position = ampRange.startIndex

            // Scan forward to whatever ends this candidate: the terminating
            // ';', or another '&' (a reference cannot contain one).
            var end = ampRange.endIndex
            while end != endIndex && self[end] != ";" && self[end] != "&" {
                end = end.successor()
            }
            guard end != endIndex else {
                // Neither ';' nor '&' follows, nothing left to decode.
                break
            }

            // Only a candidate terminated by ';' can be an entity.
            var decodedCharacter : Character? = nil
            if self[end] == ";" {
                decodedCharacter = decode(self[position ..< end.successor()])
            }

            if let decoded = decodedCharacter {
                // Replace by decoded character:
                result.append(decoded)
                position = end.successor()
            } else {
                // Stray '&' or unknown entity: copy verbatim up to the
                // delimiter and resume there, so that a valid entity
                // following a literal '&' is still decoded.
                result.appendContentsOf(self[position ..< end])
                position = end
            }
        }
        // Copy remaining characters to `result`:
        result.appendContentsOf(self[position ..< endIndex])
        return result
    }
}