Fix incorrect parsing of invalid HTML entities (entities missing the trailing `;`)
This commit is contained in:
committed by
alex wennerberg
parent
fd7383cdba
commit
10c2a0cbff
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "nanohtml2text"
|
name = "nanohtml2text"
|
||||||
version = "0.1.2"
|
version = "0.1.3"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
readme = "README.txt"
|
readme = "README.txt"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|||||||
16
src/lib.rs
16
src/lib.rs
@@ -50,13 +50,14 @@ fn html_entitities_to_text(s: &str) -> String {
|
|||||||
if let Some(entity) = parse_html_entity(&part[..end]) {
|
if let Some(entity) = parse_html_entity(&part[..end]) {
|
||||||
out.push(entity);
|
out.push(entity);
|
||||||
// get byte length of the char we did `find` above
|
// get byte length of the char we did `find` above
|
||||||
let skip = &part[end..]
|
let real_end = if let Some(next) = &part[end..].chars().next() {
|
||||||
.chars()
|
end + next.len_utf8()
|
||||||
.next()
|
} else {
|
||||||
// we know there is another character so its safe to unwrap
|
// invalid html entity that doesn't end with `;`
|
||||||
.unwrap()
|
end
|
||||||
.len_utf8();
|
};
|
||||||
out.push_str(&part[end + skip..]);
|
|
||||||
|
out.push_str(&part[real_end..]);
|
||||||
} else {
|
} else {
|
||||||
out.push('&');
|
out.push('&');
|
||||||
out.push_str(part);
|
out.push_str(part);
|
||||||
@@ -339,5 +340,6 @@ mod tests {
|
|||||||
"<aa >hello</aa>" to "hello",
|
"<aa >hello</aa>" to "hello",
|
||||||
ignore_unknown_tag_attributes:
|
ignore_unknown_tag_attributes:
|
||||||
"<aa x=\"1\">hello</aa>" to "hello",
|
"<aa x=\"1\">hello</aa>" to "hello",
|
||||||
|
invalid_html_entity_without_semicolon: "&hellip" to "…",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user