Compare commits

...

10 Commits

Author SHA1 Message Date
alex wennerberg
97a1194cb1 Fix whitespace bug 2026-02-14 13:29:52 -08:00
alex wennerberg
8d57ec3524 Fix entity bug 2024-10-08 10:27:18 -04:00
alex wennerberg
cc8b9b0210 A ton of changes
Someone emailed me a patch with a lot of improvements here
2024-10-03 21:27:45 -04:00
alex wennerberg
9c7a627e3f update crate 2024-09-28 00:15:58 -04:00
alex wennerberg
7f97665afb Rename README 2024-01-19 21:13:01 -05:00
alex wennerberg
01cb16a240 update readme 2024-01-13 22:34:51 -05:00
alex wennerberg
e7ecde866e bump version 2022-09-24 10:01:07 -07:00
alex wennerberg
334a179f4f update readme in package 2022-09-24 09:59:27 -07:00
alex wennerberg
e97a40575b Update mailing list link 2022-09-24 09:56:55 -07:00
Ayrat Badykov
05fc9a55c8 do not add content if href == content 2022-09-24 15:43:54 +03:00
4 changed files with 207 additions and 284 deletions

4
Cargo.lock generated
View File

@@ -1,7 +1,7 @@
# This file is automatically @generated by Cargo. # This file is automatically @generated by Cargo.
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 4
[[package]] [[package]]
name = "nanohtml2text" name = "nanohtml2text"
version = "0.1.3" version = "0.2.1"

View File

@@ -1,10 +1,10 @@
[package] [package]
name = "nanohtml2text" name = "nanohtml2text"
version = "0.1.3" version = "0.2.1"
edition = "2018" edition = "2018"
readme = "README.txt" readme = "README.md"
license = "MIT" license = "MIT"
repository = "https://git.alexwennerberg.com/nanohtml2text" repository = "https://git.sr.ht/~aw/nanohtml2text"
keywords = ["html", "text"] keywords = ["html", "text"]
description = "A zero-dependency library to convert HTML to plain text" description = "A zero-dependency library to convert HTML to plain text"

View File

@@ -20,4 +20,4 @@ to test/experiment with it
Contributing Contributing
------------ ------------
git-send-email or git-request-pull to [my mailing list](https://lists.flounder.online/patches/) git-send-email or git-request-pull to [me](mailto:alex@alexwennerberg.com)

View File

@@ -2,64 +2,60 @@ mod entity;
fn decode_named_entity(entity: &str) -> Option<char> { fn decode_named_entity(entity: &str) -> Option<char> {
entity::ENTITIES entity::ENTITIES
.binary_search_by_key(&entity, |t| t.0) .binary_search_by_key(&entity, |&(name, _)| name)
.map(|idx| entity::ENTITIES[idx].1)
.ok() .ok()
.map(|idx| entity::ENTITIES[idx].1)
} }
fn parse_html_entity(ent_name: &str) -> Option<char> { // Parse an HTML entity (named or numeric) and return the corresponding
let d = decode_named_entity(ent_name); // character.
if d.is_some() {
return d; fn parse_html_entity(entity: &str) -> Option<char> {
if let Some(c) = decode_named_entity(entity) {
return Some(c);
} }
let num = ent_name.strip_prefix("#")?; let num = entity.strip_prefix('#')?;
if num.chars().next()? == 'x' {
u32::from_str_radix(&num[1..].to_lowercase(), 16) let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
u32::from_str_radix(hex, 16).ok()?
} else { } else {
// remaining string may be empty, but that will generate an Err(Empty) u32::from_str_radix(num, 10).ok()?
u32::from_str_radix(num, 10) };
// Exclude control characters and ensure valid Unicode code point
if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
char::from_u32(code_point)
} else {
None
} }
.ok()
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
.and_then(|n| char::from_u32(n))
} }
fn html_entitities_to_text(s: &str) -> String { /// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text(s: &str) -> String {
let mut out = String::new(); let mut out = String::new();
// except for the first part, every part will have started with an ampersand
// thus the start of the remaining parts is a HTML entity
let mut parts = s.split('&'); let mut parts = s.split('&');
/*
skip first part. if the string started with an ampersand, the first part
will be an empty string
if the string was empty, the first part will also be an empty string so its // Add the first part (before any '&')
safe to unwrap out.push_str(parts.next().unwrap_or_default());
*/
out.push_str(parts.next().unwrap());
for part in parts { for part in parts {
let end = part let end = part
// entity can be terminated by semicolon or whitespace
.find(|c: char| c.is_whitespace() || c == ';') .find(|c: char| c.is_whitespace() || c == ';')
// entity can also be terminated by end of string or start of
// another entity
.unwrap_or_else(|| part.len()); .unwrap_or_else(|| part.len());
if let Some(entity) = parse_html_entity(&part[..end]) { if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity); out.push(entity);
// get byte length of the char we did `find` above
let real_end = if let Some(next) = &part[end..].chars().next() {
end + next.len_utf8()
} else {
// invalid html entity that doesn't end with `;`
end
};
out.push_str(&part[real_end..]); // Advance past the entity and any following semicolon
let skip = if part[end..].starts_with(';') { 1 } else { 0 };
let remaining = &part[end + skip..];
out.push_str(remaining);
} else { } else {
out.push('&'); out.push('&');
out.push_str(part); out.push_str(part);
} }
} }
@@ -67,149 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
out out
} }
/// Function to parse and handle the individual tags. // Handle individual HTML tags and convert them to text.
/// Assumes that there was a '<' before the given string // Returns the generated text and the number of bytes to skip.
///
/// Returns the generated text and the byte length to skip.
fn handle_tag(s: &str) -> (String, usize) { fn handle_tag(s: &str) -> (String, usize) {
let (tag, more) = match s.split_once('>') { let (tag_content, rest) = match s.split_once('>') {
Some((tag, more)) if !tag.is_empty() => (tag, more), Some((tag, rest)) if !tag.is_empty() => (tag, rest),
_ => { _ => {
// was not actually a tag, so reinsert the '<' // Not a valid tag, treat '<' as a regular character
return (String::from("<"), 0); return ("<".to_string(), 0);
} }
}; };
let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) { // Split the tag into name and attributes
(name, Some(attribs)) let (tag_name, attribs) = tag_content
} else { .split_once(char::is_whitespace)
(tag, None) .map_or((tag_content, ""), |(name, attrs)| (name, attrs));
};
match name.to_lowercase().as_str() { match tag_name.to_lowercase().as_str() {
// Handle anchor tags
"a" => { "a" => {
// Extract href attribute
let href = attribs let href = attribs
.and_then(|attribs| { .split_ascii_whitespace()
Some( .find_map(|attr| {
attribs let mut parts = attr.splitn(2, '=');
// check for the href and then discard everything before it
.split_once("href")? if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
.1 if key.eq_ignore_ascii_case("href") {
// there might be whitespace between 'href' and '=' Some(value.trim_matches(['"', '\''].as_ref()))
.trim_start() } else {
// check for and then discard the equal sign None
.strip_prefix('=')?
// remove whitespace after the equal sign
.trim_start(),
)
})
.and_then(|href_value|
// find quoted string
match href_value.chars().next()? {
start @ '\'' | start @ '"' => {
let (end, _) = href_value
.char_indices()
.skip(1)
.find(|(_, c)| *c == start)?;
Some(href_value[1..end].to_string())
} }
_ => None, } else {
}) None
}
})
.filter(|href| !href.starts_with("javascript:")) .filter(|href| !href.starts_with("javascript:"))
.map(|href| html_entitities_to_text(&href)); .map(html_entities_to_text);
// only use to_ascii_lowercase here so the byte offsets don't get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more.to_ascii_lowercase();
let end_without_closing = more.find("</a"); // Search for closing </a> tag
let content = end_without_closing.map(|i| more[0..i].trim()); let lower_rest = rest.to_ascii_lowercase();
let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
let content = &rest[..end_tag_start];
let end = end_without_closing // Calculate the total length to skip
.map(|i| i + 3) let closing_tag_len = if end_tag_start < lower_rest.len() {
.and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1)) 4
.unwrap_or_else(|| more.len()); } else {
0
};
// Length of "</a>"
let link = match (content, href) { let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
(Some(content_value), Some(href_value)) => { let content_text = html2text(content.trim());
let cleaned_content_value = html2text(content_value); let link = match (href, content_text.is_empty()) {
format!("{} ({})", cleaned_content_value, href_value) (Some(href_value), false) if content_text != href_value => {
format!("{} ({})", content_text, href_value)
} }
(None, Some(href_value)) => href_value,
(Some(content_value), None) => content_value.to_string(), (Some(href_value), _) => href_value,
(None, None) => "".to_string(),
(_, false) => content_text,
_ => String::new(),
}; };
(link, tag.len() + 1 + end) (link, total_skip)
} }
"br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1), // Line breaks and list items
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
| "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
name @ "head" | name @ "script" | name @ "style" => {
// silence tags
// only use to_ascii_lowercase here so the byte offsets dont get // Paragraphs and headings
// messed up from one uppercase symbol becoming two lowercase "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
// symbols or something like that
let more = more.to_ascii_lowercase(); // Tags to ignore along with their content
let end = more name if ["head", "script", "style"].contains(&name) => {
.find(&format!("</{}", name)) // Search for the closing tag
.map(|i| i + 2 + name.len())
.and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1)) let closing_tag = format!("</{}>", name);
.unwrap_or_else(|| more.len()); let lower_rest = rest.to_ascii_lowercase();
(String::new(), tag.len() + 1 + end) let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
let closing_tag_len = if end_tag_start < lower_rest.len() {
closing_tag.len()
} else {
0
};
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
(String::new(), total_skip)
} }
// HTML comments
"!--" => { "!--" => {
// HTML comment let end = s.find("-->").map_or(s.len(), |n| n + 3);
(String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
(String::new(), end)
} }
// other/unknown tags are just discarded
_ => (String::new(), tag.len() + 1), // Discard other tags but keep their content
_ => (String::new(), tag_content.len() + 1),
} }
} }
/// Convert some HTML to plain text. Only some simple HTML tags are handled: /// Convert an HTML string to plain text.
/// - `a` tags are transformed to their href attribute value /// Handles basic HTML tags and entities, and collapses whitespace.
/// - paragraph, linebreak, heading, list, and list item tags insert different
/// amounts of line breaks.
/// - HTML comments as well as `head`, `script` and `style` are completely
/// discarded, including their content
/// - unknown tags are skipped, but their content is printed
///
/// HTML named entities will be replaced with the respective Unicode code point,
/// and whitespace will be collapsed as is usual in HTML.
///
/// The resulting string will have CRLF line endings.
pub fn html2text(html: &str) -> String { pub fn html2text(html: &str) -> String {
// collapse spaces // Collapse multiple whitespace characters into a single space
let html = html.split_whitespace().collect::<Vec<_>>().join(" "); let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
let mut out = String::new(); let mut out = String::new();
let mut index = 0;
let mut i = 0; while index < html.len() {
while i < html.len() { if let Some(pos) = html[index..].find('<') {
match html[i..].find('<') { if pos > 0 {
None => { out.push_str(&html_entities_to_text(&html[index..index + pos]));
// no more tags in the input, done index += pos;
out += &html_entitities_to_text(&html[i..]);
break;
} }
Some(text_segment) => { index += 1; // Skip the '<'
if text_segment > 0 { let (parsed_text, advance) = handle_tag(&html[index..]);
out += &html_entitities_to_text(&html[i..i + text_segment]); if !parsed_text.is_empty() {
i += text_segment; if out.ends_with("\r\n\r\n") || out.is_empty() {
out.push_str(&parsed_text.trim_start());
} else {
out.push_str(&parsed_text);
} }
i += 1; // skip the '<'
let (s, advance) = handle_tag(&html[i..]);
if !s.is_empty() {
if out.ends_with("\r\n\r\n") || out.is_empty() {
out += &s.trim_start();
} else {
out += &s;
}
}
i += advance;
} }
index += advance;
} else {
// No more tags, process the remaining text
out.push_str(&html_entities_to_text(&html[index..]));
break;
} }
} }
@@ -219,141 +203,80 @@ pub fn html2text(html: &str) -> String {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
macro_rules! test { macro_rules! test {
($name:ident, $from:literal, $to:literal $(,)?) => { ($name:ident, $from:literal, $to:literal $(,)?) => {
#[test] #[test]
fn $name() { fn $name() {
assert_eq!(&html2text($from), $to); assert_eq!(html2text($from), $to);
} }
}; };
($($name:ident: $from:literal to $to:literal,)* $(,)?) => { ($($name:ident: $from:literal to $to:literal,)*) => {
$(test!{$name, $from, $to})* $(test!{$name, $from, $to})*
}; };
} }
test! { test! {
plaintext: "blah" to "blah", plaintext: "blah" to "blah",
tag: "<div></div>" to "", tag: "<div></div>" to "",
tag_contents: "<div>simple text</div>" to "simple text", tag_contents: "<div>simple text</div>" to "simple text",
// links // Links
link: link: "click <a href=\"test\">here</a>" to "click here (test)",
"click <a href=\"test\">here</a>" link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
to "click here (test)", links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
links_ignore_attributes: link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
"click <a class=\"x\" href=\"test\">here</a>" link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
to "click here (test)", link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
link_entities_in_url: link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
"click <a href=\"ents/&apos;x&apos;\">here</a>" link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
to "click here (ents/'x')", // Inline elements
link_javascript: ignore_inline: "strong <strong>text</strong>" to "strong text",
"click <a href=\"javascript:void(0)\">here</a>" ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
to "click here", // Line breaks and spaces
link_ignore_content_tags: collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
"click <a href=\"test\"><span>here</span> or here</a>" collapse_linebreaks: "a\nb\nc" to "a b c",
to "click here or here (test)", collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
link_absolute_url: br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
"click <a href=\"http://bit.ly/2n4wXRs\">news</a>" paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
to "click news (http://bit.ly/2n4wXRs)", // Headers
link_ignore_attributes_2: h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
"<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)", h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
// inlines h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
ignore_inline: h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
"strong <strong>text</strong>" h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
to "strong text", h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
ignore_inline_attributes: h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
"some <div id=\"a\" class=\"b\">div</div>" h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
to "some div", h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
// lines breaks and spaces h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
collapse_spaces: no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
"should ignore more spaces" to "should ignore more spaces", // HTML entities
collapse_linebreaks: entity_nbsp: "two&nbsp;&nbsp;spaces" to "two\u{a0}\u{a0}spaces",
"a\nb\nc" to "a b c", entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
collapse_mixed: entity_tag: "&lt;printtag&gt;" to "<printtag>",
"should \nignore \r\nnew lines" to "should ignore new lines", entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
br_tag: ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
"two<br>line<br/>breaks" to "two\r\nline\r\nbreaks", entity_unknown: "this &neither; as you see" to "this &neither; as you see",
paragraph: entity_amp: "fish &amp; chips" to "fish & chips",
"<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs", // Unordered list
// Headers unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
h1: entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
"<h1>First</h1>main text" to "First\r\n\r\nmain text", entity_reg: "Google &reg;" to "Google ®",
h2_inline: // Large entity
"First<h2>Second</h2>next section" entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
to "First\r\n\r\nSecond\r\n\r\nnext section", // Numeric HTML entities
h2: entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
"<h2>Second</h2>next section" to "Second\r\n\r\nnext section", entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
h3_inline: // Full HTML structure
"Second<h3>Third</h3>next section" empty: "" to "",
to "Second\r\n\r\nThird\r\n\r\nnext section", full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
h3: ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
"<h3>Third</h3>next section" to "Third\r\n\r\nnext section", // Custom HTML tags
h4_inline: ignore_unknown_tag: "<aa>hello</aa>" to "hello",
"Third<h4>Fourth</h4>next section" ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
to "Third\r\n\r\nFourth\r\n\r\nnext section", ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
h4: invalid_html_entity_without_semicolon: "&hellip" to "",
"<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section", entity_whitespace_preserved: "&amp test" to "& test",
h5_inline:
"Fourth<h5>Fifth</h5>next section"
to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
h5:
"<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
h6_inline:
"Fifth<h6>Sixth</h6>next section"
to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
h6:
"<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
no_h7:
"<h7>Not Header</h7>next section" to "Not Headernext section",
// html entities
entity_nbsp:
"two&nbsp;&nbsp;spaces" to "two  spaces",
entity_copy:
"&copy; 2017 K3A" to "© 2017 K3A",
entity_tag:
"&lt;printtag&gt;" to "<printtag>",
entity_currencies:
"would you pay in &cent;, &pound;, &yen; or &euro;?"
to "would you pay in ¢, £, ¥ or €?",
ampersand_not_entity:
"Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
entity_unknown:
"this &neither; as you see" to "this &neither; as you see",
entity_amp:
"fish &amp; chips" to "fish & chips",
unordered_list:
"list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
entity_quot:
"&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; HAL, 2001: A Space Odyssey"
to "\"I'm sorry, Dave. I'm afraid I can't do that.\" HAL, 2001: A Space Odyssey",
entity_reg:
"Google &reg;" to "Google ®",
// Large entity
entity_large_unknown:
"&abcdefghij;" to "&abcdefghij;",
// Numeric HTML entities
entity_numeric:
"&#8268; decimal and hex entities supported &#x204D;"
to "⁌ decimal and hex entities supported ⁍",
entity_numeric_2:
"&#39;single quotes&#39; and &#52765;"
to "'single quotes' and 츝",
// full html structure
empty: "" to "",
full_html:
"<html><head><title>Good</title></head><body>x</body>" to "x",
ignore_script:
"we are not <script type=\"javascript\"></script>interested in scripts"
to "we are not interested in scripts",
// custom html tags
ignore_unknown_tag:
"<aa>hello</aa>" to "hello",
ignore_unknown_tag_whitespace:
"<aa >hello</aa>" to "hello",
ignore_unknown_tag_attributes:
"<aa x=\"1\">hello</aa>" to "hello",
invalid_html_entity_without_semicolon: "&hellip" to "",
} }
} }