Fix incorrect parsing of invalid HTML entities

This commit is contained in:
Ayrat Badykov
2022-04-23 11:41:02 +03:00
committed by alex wennerberg
parent fd7383cdba
commit 10c2a0cbff
2 changed files with 10 additions and 8 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "nanohtml2text"
version = "0.1.2"
version = "0.1.3"
edition = "2018"
readme = "README.txt"
license = "MIT"

View File

@@ -50,13 +50,14 @@ fn html_entitities_to_text(s: &str) -> String {
if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity);
// get the byte length of the char matched by `find` above
let skip = &part[end..]
.chars()
.next()
// we know there is another character so it's safe to unwrap
.unwrap()
.len_utf8();
out.push_str(&part[end + skip..]);
let real_end = if let Some(next) = &part[end..].chars().next() {
end + next.len_utf8()
} else {
// invalid html entity that doesn't end with `;`
end
};
out.push_str(&part[real_end..]);
} else {
out.push('&');
out.push_str(part);
@@ -339,5 +340,6 @@ mod tests {
"<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes:
"<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
}
}