A ton of changes

Someone emailed me a patch with a lot of improvements here
This commit is contained in:
alex wennerberg
2024-10-03 21:27:01 -04:00
parent 9c7a627e3f
commit cc8b9b0210
4 changed files with 205 additions and 290 deletions

2
Cargo.lock generated
View File

@@ -4,4 +4,4 @@ version = 3
[[package]]
name = "nanohtml2text"
version = "0.1.3"
version = "0.2.0"

View File

@@ -1,6 +1,6 @@
[package]
name = "nanohtml2text"
version = "0.1.5"
version = "0.2.0"
edition = "2018"
readme = "README.md"
license = "MIT"

View File

@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
("natur", '\u{00266E}'),
("natural", '\u{00266E}'),
("naturals", '\u{002115}'),
("nbsp", '\u{0000A0}'),
("nbsp", ' '),
("ncap", '\u{002A43}'),
("ncaron", '\u{000148}'),
("ncedil", '\u{000146}'),

View File

@@ -2,64 +2,60 @@ mod entity;
fn decode_named_entity(entity: &str) -> Option<char> {
entity::ENTITIES
.binary_search_by_key(&entity, |t| t.0)
.map(|idx| entity::ENTITIES[idx].1)
.binary_search_by_key(&entity, |&(name, _)| name)
.ok()
.map(|idx| entity::ENTITIES[idx].1)
}
fn parse_html_entity(ent_name: &str) -> Option<char> {
let d = decode_named_entity(ent_name);
if d.is_some() {
return d;
// Parse an HTML entity (named or numeric) and return the corresponding
// character.
fn parse_html_entity(entity: &str) -> Option<char> {
if let Some(c) = decode_named_entity(entity) {
return Some(c);
}
let num = ent_name.strip_prefix("#")?;
if num.chars().next()? == 'x' {
u32::from_str_radix(&num[1..].to_lowercase(), 16)
let num = entity.strip_prefix('#')?;
let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
u32::from_str_radix(hex, 16).ok()?
} else {
// remaining string may be empty, but that will generate an Err(Empty)
u32::from_str_radix(num, 10)
u32::from_str_radix(num, 10).ok()?
};
// Exclude control characters and ensure valid Unicode code point
if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
char::from_u32(code_point)
} else {
None
}
.ok()
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
.and_then(|n| char::from_u32(n))
}
fn html_entitities_to_text(s: &str) -> String {
/// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text(s: &str) -> String {
let mut out = String::new();
// except for the first part, every part will have started with an ampersand
// thus the start of the remaining parts is a HTML entity
let mut parts = s.split('&');
/*
skip first part. if the string started with an ampersand, the first part
will be an empty string
if the string was empty, the first part will also be an empty string so its
safe to unwrap
*/
out.push_str(parts.next().unwrap());
// Add the first part (before any '&')
out.push_str(parts.next().unwrap_or_default());
for part in parts {
let end = part
// entity can be terminated by semicolon or whitespace
.find(|c: char| c.is_whitespace() || c == ';')
// entity can also be terminated by end of string or start of
// another entity
.unwrap_or_else(|| part.len());
if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity);
// get byte length of the char we did `find` above
let real_end = if let Some(next) = &part[end..].chars().next() {
end + next.len_utf8()
} else {
// invalid html entity that doesn't end with `;`
end
};
out.push_str(&part[real_end..]);
// Advance past the entity and any following semicolon or whitespace
let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
let remaining = &part[end + next_char_len..];
out.push_str(remaining);
} else {
out.push('&');
out.push_str(part);
}
}
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
out
}
/// Function to parse and handle the individual tags.
/// Assumes that there was a '<' before the given string
///
/// Returns the generated text and the byte length to skip.
// Handle individual HTML tags and convert them to text.
// Returns the generated text and the number of bytes to skip.
fn handle_tag(s: &str) -> (String, usize) {
let (tag, more) = match s.split_once('>') {
Some((tag, more)) if !tag.is_empty() => (tag, more),
let (tag_content, rest) = match s.split_once('>') {
Some((tag, rest)) if !tag.is_empty() => (tag, rest),
_ => {
// was not actually a tag, so reinsert the '<'
return (String::from("<"), 0);
// Not a valid tag, treat '<' as a regular character
return ("<".to_string(), 0);
}
};
let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) {
(name, Some(attribs))
} else {
(tag, None)
};
// Split the tag into name and attributes
let (tag_name, attribs) = tag_content
.split_once(char::is_whitespace)
.map_or((tag_content, ""), |(name, attrs)| (name, attrs));
match name.to_lowercase().as_str() {
match tag_name.to_lowercase().as_str() {
// Handle anchor tags
"a" => {
// Extract href attribute
let href = attribs
.and_then(|attribs| {
Some(
attribs
// check for the href and then discard everything before it
.split_once("href")?
.1
// there might be whitespace between 'href' and '='
.trim_start()
// check for and then discard the equal sign
.strip_prefix('=')?
// remove whitespace after the equal sign
.trim_start(),
)
})
.and_then(|href_value|
// find quoted string
match href_value.chars().next()? {
start @ '\'' | start @ '"' => {
let (end, _) = href_value
.char_indices()
.skip(1)
.find(|(_, c)| *c == start)?;
Some(href_value[1..end].to_string())
.split_ascii_whitespace()
.find_map(|attr| {
let mut parts = attr.splitn(2, '=');
if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
if key.eq_ignore_ascii_case("href") {
Some(value.trim_matches(['"', '\''].as_ref()))
} else {
None
}
_ => None,
})
.filter(|href| !href.starts_with("javascript:"))
.map(|href| html_entitities_to_text(&href));
// only use to_ascii_lowercase here so the byte offsets dont get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more.to_ascii_lowercase();
let end_without_closing = more.find("</a");
let content = end_without_closing.map(|i| more[0..i].trim());
let end = end_without_closing
.map(|i| i + 3)
.and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
.unwrap_or_else(|| more.len());
let link = match (content, href) {
(Some(content_value), Some(href_value)) => {
if content_value == href_value {
href_value
} else {
let cleaned_content_value = html2text(content_value);
format!("{} ({})", cleaned_content_value, href_value)
None
}
})
.filter(|href| !href.starts_with("javascript:"))
.map(html_entities_to_text);
// Search for closing </a> tag
let lower_rest = rest.to_ascii_lowercase();
let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
let content = &rest[..end_tag_start];
// Calculate the total length to skip
let closing_tag_len = if end_tag_start < lower_rest.len() {
4
} else {
0
};
// Length of "</a>"
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
let content_text = html2text(content.trim());
let link = match (href, content_text.is_empty()) {
(Some(href_value), false) if content_text != href_value => {
format!("{} ({})", content_text, href_value)
}
(None, Some(href_value)) => href_value,
(Some(content_value), None) => content_value.to_string(),
(None, None) => "".to_string(),
(Some(href_value), _) => href_value,
(_, false) => content_text,
_ => String::new(),
};
(link, tag.len() + 1 + end)
(link, total_skip)
}
"br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
| "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
name @ "head" | name @ "script" | name @ "style" => {
// silence tags
// Line breaks and list items
"br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
// only use to_ascii_lowercase here so the byte offsets dont get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more.to_ascii_lowercase();
let end = more
.find(&format!("</{}", name))
.map(|i| i + 2 + name.len())
.and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1))
.unwrap_or_else(|| more.len());
(String::new(), tag.len() + 1 + end)
// Paragraphs and headings
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
// Tags to ignore along with their content
name if ["head", "script", "style"].contains(&name) => {
// Search for the closing tag
let closing_tag = format!("</{}>", name);
let lower_rest = rest.to_ascii_lowercase();
let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
let closing_tag_len = if end_tag_start < lower_rest.len() {
closing_tag.len()
} else {
0
};
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
(String::new(), total_skip)
}
// HTML comments
"!--" => {
// HTML comment
(String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
let end = s.find("-->").map_or(s.len(), |n| n + 3);
(String::new(), end)
}
// other/unknown tags are just discarded
_ => (String::new(), tag.len() + 1),
// Discard other tags but keep their content
_ => (String::new(), tag_content.len() + 1),
}
}
/// Convert some HTML to plain text. Only some simple HTML tags are handled:
/// - `a` tags are transformed to their href attribute value
/// - paragraph, linebreak, heading, list, and list item tags insert different
/// amounts of line breaks.
/// - HTML comments as well as `head`, `script` and `style` are completely
/// discarded, including their content
/// - unknown tags are skipped, but their content is printed
///
/// HTML named entities will be replaced with the respective Unicode code point,
/// and whitespace will be collapsed as is usual in HTML.
///
/// The resulting string will have CRLF line endings.
/// Convert an HTML string to plain text.
/// Handles basic HTML tags and entities, and collapses whitespace.
pub fn html2text(html: &str) -> String {
// collapse spaces
// Collapse multiple whitespace characters into a single space
let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
let mut out = String::new();
let mut i = 0;
while i < html.len() {
match html[i..].find('<') {
None => {
// no more tags in the input, done
out += &html_entitities_to_text(&html[i..]);
break;
let mut index = 0;
while index < html.len() {
if let Some(pos) = html[index..].find('<') {
if pos > 0 {
out.push_str(&html_entities_to_text(&html[index..index + pos]));
index += pos;
}
Some(text_segment) => {
if text_segment > 0 {
out += &html_entitities_to_text(&html[i..i + text_segment]);
i += text_segment;
index += 1; // Skip the '<'
let (parsed_text, advance) = handle_tag(&html[index..]);
if !parsed_text.is_empty() {
if out.ends_with("\r\n\r\n") || out.is_empty() {
out.push_str(&parsed_text.trim_start());
} else {
out.push_str(&parsed_text);
}
i += 1; // skip the '<'
let (s, advance) = handle_tag(&html[i..]);
if !s.is_empty() {
if out.ends_with("\r\n\r\n") || out.is_empty() {
out += &s.trim_start();
} else {
out += &s;
}
}
i += advance;
}
index += advance;
} else {
// No more tags, process the remaining text
out.push_str(&html_entities_to_text(&html[index..]));
break;
}
}
@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
#[cfg(test)]
mod tests {
use super::*;
macro_rules! test {
($name:ident, $from:literal, $to:literal $(,)?) => {
#[test]
#[test]
fn $name() {
assert_eq!(&html2text($from), $to);
}
};
($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
$(test!{$name, $from, $to})*
assert_eq!(html2text($from), $to);
}
};
($($name:ident: $from:literal to $to:literal,)*) => {
$(test!{$name, $from, $to})*
};
}
test! {
plaintext: "blah" to "blah",
tag: "<div></div>" to "",
tag_contents: "<div>simple text</div>" to "simple text",
// links
link:
"click <a href=\"test\">here</a>"
to "click here (test)",
link_href_equal_to_content:
"click <a href=\"test\">test</a>"
to "click test",
links_ignore_attributes:
"click <a class=\"x\" href=\"test\">here</a>"
to "click here (test)",
link_entities_in_url:
"click <a href=\"ents/&apos;x&apos;\">here</a>"
to "click here (ents/'x')",
link_javascript:
"click <a href=\"javascript:void(0)\">here</a>"
to "click here",
link_ignore_content_tags:
"click <a href=\"test\"><span>here</span> or here</a>"
to "click here or here (test)",
link_absolute_url:
"click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
to "click news (http://bit.ly/2n4wXRs)",
link_ignore_attributes_2:
"<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
// inlines
ignore_inline:
"strong <strong>text</strong>"
to "strong text",
ignore_inline_attributes:
"some <div id=\"a\" class=\"b\">div</div>"
to "some div",
// lines breaks and spaces
collapse_spaces:
"should ignore more spaces" to "should ignore more spaces",
collapse_linebreaks:
"a\nb\nc" to "a b c",
collapse_mixed:
"should \nignore \r\nnew lines" to "should ignore new lines",
br_tag:
"two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
paragraph:
"<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
// Headers
h1:
"<h1>First</h1>main text" to "First\r\n\r\nmain text",
h2_inline:
"First<h2>Second</h2>next section"
to "First\r\n\r\nSecond\r\n\r\nnext section",
h2:
"<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
h3_inline:
"Second<h3>Third</h3>next section"
to "Second\r\n\r\nThird\r\n\r\nnext section",
h3:
"<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
h4_inline:
"Third<h4>Fourth</h4>next section"
to "Third\r\n\r\nFourth\r\n\r\nnext section",
h4:
"<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
h5_inline:
"Fourth<h5>Fifth</h5>next section"
to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
h5:
"<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
h6_inline:
"Fifth<h6>Sixth</h6>next section"
to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
h6:
"<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
no_h7:
"<h7>Not Header</h7>next section" to "Not Headernext section",
// html entitites
entity_nbsp:
"two&nbsp;&nbsp;spaces" to "two  spaces",
entity_copy:
"&copy; 2017 K3A" to "© 2017 K3A",
entity_tag:
"&lt;printtag&gt;" to "<printtag>",
entity_currencies:
"would you pay in &cent;, &pound;, &yen; or &euro;?"
to "would you pay in ¢, £, ¥ or €?",
ampersand_not_entity:
"Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
entity_unknown:
"this &neither; as you see" to "this &neither; as you see",
entity_amp:
"fish &amp; chips" to "fish & chips",
unordered_list:
"list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
entity_quot:
"&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey"
to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
entity_reg:
"Google &reg;" to "Google ®",
// Large entity
entity_large_unknown:
"&abcdefghij;" to "&abcdefghij;",
// Numeric HTML entities
entity_numeric:
"&#8268; decimal and hex entities supported &#x204D;"
to "⁌ decimal and hex entities supported ⁍",
entity_numeric_2:
"&#39;single quotes&#39; and &#52765;"
to "'single quotes' and 츝",
// full thml structure
empty: "" to "",
full_html:
"<html><head><title>Good</title></head><body>x</body>" to "x",
ignore_script:
"we are not <script type=\"javascript\"></script>interested in scripts"
to "we are not interested in scripts",
// custom html tags
ignore_unknown_tag:
"<aa>hello</aa>" to "hello",
ignore_unknown_tag_whitespace:
"<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes:
"<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
plaintext: "blah" to "blah",
tag: "<div></div>" to "",
tag_contents: "<div>simple text</div>" to "simple text",
// Links
link: "click <a href=\"test\">here</a>" to "click here (test)",
link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
// Inline elements
ignore_inline: "strong <strong>text</strong>" to "strong text",
ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
// Line breaks and spaces
collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
collapse_linebreaks: "a\nb\nc" to "a b c",
collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
// Headers
h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
// HTML entities
entity_nbsp: "two&nbsp;&nbsp;spaces" to "two spaces",
entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
entity_tag: "&lt;printtag&gt;" to "<printtag>",
entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
entity_unknown: "this &neither; as you see" to "this &neither; as you see",
entity_amp: "fish &amp; chips" to "fish & chips",
// Unordered list
unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
entity_reg: "Google &reg;" to "Google ®",
// Large entity
entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
// Numeric HTML entities
entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
// Full HTML structure
empty: "" to "",
full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
// Custom HTML tags
ignore_unknown_tag: "<aa>hello</aa>" to "hello",
ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
}
}