Fix whitespace bug

This commit is contained in:
alex wennerberg
2026-02-14 13:29:52 -08:00
parent 8d57ec3524
commit 97a1194cb1
2 changed files with 6 additions and 5 deletions

4
Cargo.lock generated
View File

@@ -1,7 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
- version = 3
+ version = 4
[[package]]
name = "nanohtml2text"
- version = "0.2.0"
+ version = "0.2.1"

View File

@@ -48,9 +48,9 @@ fn html_entities_to_text(s: &str) -> String {
if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity);
- // Advance past the entity and any following semicolon or whitespace
- let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
- let remaining = &part[end + next_char_len..];
+ // Advance past the entity and any following semicolon
+ let skip = if part[end..].starts_with(';') { 1 } else { 0 };
+ let remaining = &part[end + skip..];
out.push_str(remaining);
} else {
@@ -276,6 +276,7 @@ mod tests {
ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
+ entity_whitespace_preserved: "&amp test" to "& test",
}
}