A ton of changes

Someone emailed me a patch with a lot of improvements here
This commit is contained in:
alex wennerberg
2024-10-03 21:27:01 -04:00
parent 9c7a627e3f
commit cc8b9b0210
4 changed files with 205 additions and 290 deletions

2
Cargo.lock generated
View File

@@ -4,4 +4,4 @@ version = 3
[[package]] [[package]]
name = "nanohtml2text" name = "nanohtml2text"
version = "0.1.3" version = "0.2.0"

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "nanohtml2text" name = "nanohtml2text"
version = "0.1.5" version = "0.2.0"
edition = "2018" edition = "2018"
readme = "README.md" readme = "README.md"
license = "MIT" license = "MIT"

View File

@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
("natur", '\u{00266E}'), ("natur", '\u{00266E}'),
("natural", '\u{00266E}'), ("natural", '\u{00266E}'),
("naturals", '\u{002115}'), ("naturals", '\u{002115}'),
("nbsp", '\u{0000A0}'), ("nbsp", ' '),
("ncap", '\u{002A43}'), ("ncap", '\u{002A43}'),
("ncaron", '\u{000148}'), ("ncaron", '\u{000148}'),
("ncedil", '\u{000146}'), ("ncedil", '\u{000146}'),

View File

@@ -2,64 +2,60 @@ mod entity;
fn decode_named_entity(entity: &str) -> Option<char> { fn decode_named_entity(entity: &str) -> Option<char> {
entity::ENTITIES entity::ENTITIES
.binary_search_by_key(&entity, |t| t.0) .binary_search_by_key(&entity, |&(name, _)| name)
.map(|idx| entity::ENTITIES[idx].1)
.ok() .ok()
.map(|idx| entity::ENTITIES[idx].1)
} }
fn parse_html_entity(ent_name: &str) -> Option<char> { // Parse an HTML entity (named or numeric) and return the corresponding
let d = decode_named_entity(ent_name); // character.
if d.is_some() {
return d; fn parse_html_entity(entity: &str) -> Option<char> {
if let Some(c) = decode_named_entity(entity) {
return Some(c);
} }
let num = ent_name.strip_prefix("#")?; let num = entity.strip_prefix('#')?;
if num.chars().next()? == 'x' {
u32::from_str_radix(&num[1..].to_lowercase(), 16) let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
u32::from_str_radix(hex, 16).ok()?
} else { } else {
// remaining string may be empty, but that will generate an Err(Empty) u32::from_str_radix(num, 10).ok()?
u32::from_str_radix(num, 10) };
// Exclude control characters and ensure valid Unicode code point
if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
char::from_u32(code_point)
} else {
None
} }
.ok()
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
.and_then(|n| char::from_u32(n))
} }
fn html_entitities_to_text(s: &str) -> String { /// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text(s: &str) -> String {
let mut out = String::new(); let mut out = String::new();
// except for the first part, every part will have started with an ampersand
// thus the start of the remaining parts is a HTML entity
let mut parts = s.split('&'); let mut parts = s.split('&');
/*
skip first part. if the string started with an ampersand, the first part
will be an empty string
if the string was empty, the first part will also be an empty string so its // Add the first part (before any '&')
safe to unwrap out.push_str(parts.next().unwrap_or_default());
*/
out.push_str(parts.next().unwrap());
for part in parts { for part in parts {
let end = part let end = part
// entity can be terminated by semicolon or whitespace
.find(|c: char| c.is_whitespace() || c == ';') .find(|c: char| c.is_whitespace() || c == ';')
// entity can also terminated by end of string or start of
// another entity
.unwrap_or_else(|| part.len()); .unwrap_or_else(|| part.len());
if let Some(entity) = parse_html_entity(&part[..end]) { if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity); out.push(entity);
// get byte length of the char we did `find` above
let real_end = if let Some(next) = &part[end..].chars().next() {
end + next.len_utf8()
} else {
// invalid html entity that doesn't end with `;`
end
};
out.push_str(&part[real_end..]); // Advance past the entity and any following semicolon or whitespace
let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
let remaining = &part[end + next_char_len..];
out.push_str(remaining);
} else { } else {
out.push('&'); out.push('&');
out.push_str(part); out.push_str(part);
} }
} }
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
out out
} }
/// Function to parse and handle the individual tags. // Handle individual HTML tags and convert them to text.
/// Assumes that there was a '<' before the given string // Returns the generated text and the number of bytes to skip.
///
/// Returns the generated text and the byte length to skip.
fn handle_tag(s: &str) -> (String, usize) { fn handle_tag(s: &str) -> (String, usize) {
let (tag, more) = match s.split_once('>') { let (tag_content, rest) = match s.split_once('>') {
Some((tag, more)) if !tag.is_empty() => (tag, more), Some((tag, rest)) if !tag.is_empty() => (tag, rest),
_ => { _ => {
// was not actually a tag, so reinsert the '<' // Not a valid tag, treat '<' as a regular character
return (String::from("<"), 0); return ("<".to_string(), 0);
} }
}; };
let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) { // Split the tag into name and attributes
(name, Some(attribs)) let (tag_name, attribs) = tag_content
} else { .split_once(char::is_whitespace)
(tag, None) .map_or((tag_content, ""), |(name, attrs)| (name, attrs));
};
match name.to_lowercase().as_str() { match tag_name.to_lowercase().as_str() {
// Handle anchor tags
"a" => { "a" => {
// Extract href attribute
let href = attribs let href = attribs
.and_then(|attribs| { .split_ascii_whitespace()
Some( .find_map(|attr| {
attribs let mut parts = attr.splitn(2, '=');
// check for the href and then discard everything before it
.split_once("href")? if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
.1 if key.eq_ignore_ascii_case("href") {
// there might be whitespace between 'href' and '=' Some(value.trim_matches(['"', '\''].as_ref()))
.trim_start() } else {
// check for and then discard the equal sign None
.strip_prefix('=')?
// remove whitespace after the equal sign
.trim_start(),
)
})
.and_then(|href_value|
// find quoted string
match href_value.chars().next()? {
start @ '\'' | start @ '"' => {
let (end, _) = href_value
.char_indices()
.skip(1)
.find(|(_, c)| *c == start)?;
Some(href_value[1..end].to_string())
} }
_ => None,
})
.filter(|href| !href.starts_with("javascript:"))
.map(|href| html_entitities_to_text(&href));
// only use to_ascii_lowercase here so the byte offsets dont get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more.to_ascii_lowercase();
let end_without_closing = more.find("</a");
let content = end_without_closing.map(|i| more[0..i].trim());
let end = end_without_closing
.map(|i| i + 3)
.and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
.unwrap_or_else(|| more.len());
let link = match (content, href) {
(Some(content_value), Some(href_value)) => {
if content_value == href_value {
href_value
} else { } else {
let cleaned_content_value = html2text(content_value); None
format!("{} ({})", cleaned_content_value, href_value)
} }
})
.filter(|href| !href.starts_with("javascript:"))
.map(html_entities_to_text);
// Search for closing </a> tag
let lower_rest = rest.to_ascii_lowercase();
let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
let content = &rest[..end_tag_start];
// Calculate the total length to skip
let closing_tag_len = if end_tag_start < lower_rest.len() {
4
} else {
0
};
// Length of "</a>"
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
let content_text = html2text(content.trim());
let link = match (href, content_text.is_empty()) {
(Some(href_value), false) if content_text != href_value => {
format!("{} ({})", content_text, href_value)
} }
(None, Some(href_value)) => href_value,
(Some(content_value), None) => content_value.to_string(), (Some(href_value), _) => href_value,
(None, None) => "".to_string(),
(_, false) => content_text,
_ => String::new(),
}; };
(link, tag.len() + 1 + end) (link, total_skip)
} }
"br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1), // Line breaks and list items
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
| "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
name @ "head" | name @ "script" | name @ "style" => {
// silence tags
// only use to_ascii_lowercase here so the byte offsets dont get // Paragraphs and headings
// messed up from one uppercase symbol becoming two lowercase "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
// symbols or something like that
let more = more.to_ascii_lowercase(); // Tags to ignore along with their content
let end = more name if ["head", "script", "style"].contains(&name) => {
.find(&format!("</{}", name)) // Search for the closing tag
.map(|i| i + 2 + name.len())
.and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1)) let closing_tag = format!("</{}>", name);
.unwrap_or_else(|| more.len()); let lower_rest = rest.to_ascii_lowercase();
(String::new(), tag.len() + 1 + end) let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
let closing_tag_len = if end_tag_start < lower_rest.len() {
closing_tag.len()
} else {
0
};
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
(String::new(), total_skip)
} }
// HTML comments
"!--" => { "!--" => {
// HTML comment let end = s.find("-->").map_or(s.len(), |n| n + 3);
(String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
(String::new(), end)
} }
// other/unknown tags are just discarded
_ => (String::new(), tag.len() + 1), // Discard other tags but keep their content
_ => (String::new(), tag_content.len() + 1),
} }
} }
/// Convert some HTML to plain text. Only some simple HTML tags are handled: /// Convert an HTML string to plain text.
/// - `a` tags are transformed to their href attribute value /// Handles basic HTML tags and entities, and collapses whitespace.
/// - paragraph, linebreak, heading, list, and list item tags insert different
/// amounts of line breaks.
/// - HTML comments as well as `head`, `script` and `style` are completely
/// discarded, including their content
/// - unknown tags are skipped, but their content is printed
///
/// HTML named entities will be replaced with the respecive Unicode code point,
/// and whitespace will be collapsed as is usual in HTML.
///
/// The resulting string will have CRLF line endings.
pub fn html2text(html: &str) -> String { pub fn html2text(html: &str) -> String {
// collapse spaces // Collapse multiple whitespace characters into a single space
let html = html.split_whitespace().collect::<Vec<_>>().join(" "); let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
let mut out = String::new(); let mut out = String::new();
let mut index = 0;
let mut i = 0; while index < html.len() {
while i < html.len() { if let Some(pos) = html[index..].find('<') {
match html[i..].find('<') { if pos > 0 {
None => { out.push_str(&html_entities_to_text(&html[index..index + pos]));
// no more tags in the input, done index += pos;
out += &html_entitities_to_text(&html[i..]);
break;
} }
Some(text_segment) => { index += 1; // Skip the '<'
if text_segment > 0 { let (parsed_text, advance) = handle_tag(&html[index..]);
out += &html_entitities_to_text(&html[i..i + text_segment]); if !parsed_text.is_empty() {
i += text_segment; if out.ends_with("\r\n\r\n") || out.is_empty() {
out.push_str(&parsed_text.trim_start());
} else {
out.push_str(&parsed_text);
} }
i += 1; // skip the '<'
let (s, advance) = handle_tag(&html[i..]);
if !s.is_empty() {
if out.ends_with("\r\n\r\n") || out.is_empty() {
out += &s.trim_start();
} else {
out += &s;
}
}
i += advance;
} }
index += advance;
} else {
// No more tags, process the remaining text
out.push_str(&html_entities_to_text(&html[index..]));
break;
} }
} }
@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
macro_rules! test { macro_rules! test {
($name:ident, $from:literal, $to:literal $(,)?) => { ($name:ident, $from:literal, $to:literal $(,)?) => {
#[test] #[test]
fn $name() { fn $name() {
assert_eq!(&html2text($from), $to); assert_eq!(html2text($from), $to);
} }
}; };
($($name:ident: $from:literal to $to:literal,)* $(,)?) => { ($($name:ident: $from:literal to $to:literal,)*) => {
$(test!{$name, $from, $to})* $(test!{$name, $from, $to})*
}; };
} }
test! { test! {
plaintext: "blah" to "blah", plaintext: "blah" to "blah",
tag: "<div></div>" to "", tag: "<div></div>" to "",
tag_contents: "<div>simple text</div>" to "simple text", tag_contents: "<div>simple text</div>" to "simple text",
// links // Links
link: link: "click <a href=\"test\">here</a>" to "click here (test)",
"click <a href=\"test\">here</a>" link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
to "click here (test)", links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
link_href_equal_to_content: link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
"click <a href=\"test\">test</a>" link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
to "click test", link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
links_ignore_attributes: link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
"click <a class=\"x\" href=\"test\">here</a>" link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
to "click here (test)", // Inline elements
link_entities_in_url: ignore_inline: "strong <strong>text</strong>" to "strong text",
"click <a href=\"ents/&apos;x&apos;\">here</a>" ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
to "click here (ents/'x')", // Line breaks and spaces
link_javascript: collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
"click <a href=\"javascript:void(0)\">here</a>" collapse_linebreaks: "a\nb\nc" to "a b c",
to "click here", collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
link_ignore_content_tags: br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
"click <a href=\"test\"><span>here</span> or here</a>" paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
to "click here or here (test)", // Headers
link_absolute_url: h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
"click <a href=\"http://bit.ly/2n4wXRs\">news</a>" h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
to "click news (http://bit.ly/2n4wXRs)", h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
link_ignore_attributes_2: h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
"<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)", h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
// inlines h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
ignore_inline: h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
"strong <strong>text</strong>" h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
to "strong text", h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
ignore_inline_attributes: h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
"some <div id=\"a\" class=\"b\">div</div>" no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
to "some div", // HTML entities
// lines breaks and spaces entity_nbsp: "two&nbsp;&nbsp;spaces" to "two spaces",
collapse_spaces: entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
"should ignore more spaces" to "should ignore more spaces", entity_tag: "&lt;printtag&gt;" to "<printtag>",
collapse_linebreaks: entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
"a\nb\nc" to "a b c", ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
collapse_mixed: entity_unknown: "this &neither; as you see" to "this &neither; as you see",
"should \nignore \r\nnew lines" to "should ignore new lines", entity_amp: "fish &amp; chips" to "fish & chips",
br_tag: // Unordered list
"two<br>line<br/>breaks" to "two\r\nline\r\nbreaks", unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
paragraph: entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
"<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs", entity_reg: "Google &reg;" to "Google ®",
// Headers // Large entity
h1: entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
"<h1>First</h1>main text" to "First\r\n\r\nmain text", // Numeric HTML entities
h2_inline: entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
"First<h2>Second</h2>next section" entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
to "First\r\n\r\nSecond\r\n\r\nnext section", // Full HTML structure
h2: empty: "" to "",
"<h2>Second</h2>next section" to "Second\r\n\r\nnext section", full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
h3_inline: ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
"Second<h3>Third</h3>next section" // Custom HTML tags
to "Second\r\n\r\nThird\r\n\r\nnext section", ignore_unknown_tag: "<aa>hello</aa>" to "hello",
h3: ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
"<h3>Third</h3>next section" to "Third\r\n\r\nnext section", ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
h4_inline: invalid_html_entity_without_semicolon: "&hellip" to "",
"Third<h4>Fourth</h4>next section"
to "Third\r\n\r\nFourth\r\n\r\nnext section",
h4:
"<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
h5_inline:
"Fourth<h5>Fifth</h5>next section"
to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
h5:
"<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
h6_inline:
"Fifth<h6>Sixth</h6>next section"
to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
h6:
"<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
no_h7:
"<h7>Not Header</h7>next section" to "Not Headernext section",
// html entitites
entity_nbsp:
"two&nbsp;&nbsp;spaces" to "two  spaces",
entity_copy:
"&copy; 2017 K3A" to "© 2017 K3A",
entity_tag:
"&lt;printtag&gt;" to "<printtag>",
entity_currencies:
"would you pay in &cent;, &pound;, &yen; or &euro;?"
to "would you pay in ¢, £, ¥ or €?",
ampersand_not_entity:
"Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
entity_unknown:
"this &neither; as you see" to "this &neither; as you see",
entity_amp:
"fish &amp; chips" to "fish & chips",
unordered_list:
"list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
entity_quot:
"&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey"
to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
entity_reg:
"Google &reg;" to "Google ®",
// Large entity
entity_large_unknown:
"&abcdefghij;" to "&abcdefghij;",
// Numeric HTML entities
entity_numeric:
"&#8268; decimal and hex entities supported &#x204D;"
to "⁌ decimal and hex entities supported ⁍",
entity_numeric_2:
"&#39;single quotes&#39; and &#52765;"
to "'single quotes' and 츝",
// full thml structure
empty: "" to "",
full_html:
"<html><head><title>Good</title></head><body>x</body>" to "x",
ignore_script:
"we are not <script type=\"javascript\"></script>interested in scripts"
to "we are not interested in scripts",
// custom html tags
ignore_unknown_tag:
"<aa>hello</aa>" to "hello",
ignore_unknown_tag_whitespace:
"<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes:
"<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
} }
} }