diff --git a/Cargo.lock b/Cargo.lock index 96d16ba..d4adc35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,4 +4,4 @@ version = 3 [[package]] name = "nanohtml2text" -version = "0.1.3" +version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 76af1e0..5d3dac6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "nanohtml2text" -version = "0.1.5" +version = "0.2.0" edition = "2018" readme = "README.md" license = "MIT" diff --git a/src/entity.rs b/src/entity.rs index d5efe07..b80c53c 100644 --- a/src/entity.rs +++ b/src/entity.rs @@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[ ("natur", '\u{00266E}'), ("natural", '\u{00266E}'), ("naturals", '\u{002115}'), - ("nbsp", '\u{0000A0}'), + ("nbsp", ' '), ("ncap", '\u{002A43}'), ("ncaron", '\u{000148}'), ("ncedil", '\u{000146}'), diff --git a/src/lib.rs b/src/lib.rs index 2479df4..81cc031 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,64 +2,60 @@ mod entity; fn decode_named_entity(entity: &str) -> Option { entity::ENTITIES - .binary_search_by_key(&entity, |t| t.0) - .map(|idx| entity::ENTITIES[idx].1) + .binary_search_by_key(&entity, |&(name, _)| name) .ok() + .map(|idx| entity::ENTITIES[idx].1) } -fn parse_html_entity(ent_name: &str) -> Option { - let d = decode_named_entity(ent_name); - if d.is_some() { - return d; +// Parse an HTML entity (named or numeric) and return the corresponding +// character. + +fn parse_html_entity(entity: &str) -> Option { + if let Some(c) = decode_named_entity(entity) { + return Some(c); } - let num = ent_name.strip_prefix("#")?; - if num.chars().next()? == 'x' { - u32::from_str_radix(&num[1..].to_lowercase(), 16) + let num = entity.strip_prefix('#')?; + + let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') { + u32::from_str_radix(hex, 16).ok()? } else { - // remaining string may be empty, but that will generate an Err(Empty) - u32::from_str_radix(num, 10) + u32::from_str_radix(num, 10).ok()? + }; + + // Exclude control characters and ensure valid Unicode code point + if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) { + char::from_u32(code_point) + } else { + None } - .ok() - .filter(|n| !matches!(n, 9 | 10 | 13 | 32)) - .and_then(|n| char::from_u32(n)) } -fn html_entitities_to_text(s: &str) -> String { +/// Convert HTML entities in a string to their corresponding characters. + +fn html_entities_to_text(s: &str) -> String { let mut out = String::new(); - - // except for the first part, every part will have started with an ampersand - // thus the start of the remaining parts is a HTML entity let mut parts = s.split('&'); - /* - skip first part. if the string started with an ampersand, the first part - will be an empty string - if the string was empty, the first part will also be an empty string so its - safe to unwrap - */ - out.push_str(parts.next().unwrap()); + // Add the first part (before any '&') + out.push_str(parts.next().unwrap_or_default()); for part in parts { let end = part - // entity can be terminated by semicolon or whitespace .find(|c: char| c.is_whitespace() || c == ';') - // entity can also terminated by end of string or start of - // another entity .unwrap_or_else(|| part.len()); + if let Some(entity) = parse_html_entity(&part[..end]) { out.push(entity); - // get byte length of the char we did `find` above - let real_end = if let Some(next) = &part[end..].chars().next() { - end + next.len_utf8() - } else { - // invalid html entity that doesn't end with `;` - end - }; - out.push_str(&part[real_end..]); + // Advance past the entity and any following semicolon or whitespace + let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8()); + let remaining = &part[end + next_char_len..]; + + out.push_str(remaining); } else { out.push('&'); + out.push_str(part); } } @@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String { out } -/// Function to parse and handle the individual tags. -/// Assumes that there was a '<' before the given string -/// -/// Returns the generated text and the byte length to skip. +// Handle individual HTML tags and convert them to text. +// Returns the generated text and the number of bytes to skip. fn handle_tag(s: &str) -> (String, usize) { - let (tag, more) = match s.split_once('>') { - Some((tag, more)) if !tag.is_empty() => (tag, more), + let (tag_content, rest) = match s.split_once('>') { + Some((tag, rest)) if !tag.is_empty() => (tag, rest), + _ => { - // was not actually a tag, so reinsert the '<' - return (String::from("<"), 0); + // Not a valid tag, treat '<' as a regular character + return ("<".to_string(), 0); } }; - let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) { - (name, Some(attribs)) - } else { - (tag, None) - }; + // Split the tag into name and attributes + let (tag_name, attribs) = tag_content + .split_once(char::is_whitespace) + .map_or((tag_content, ""), |(name, attrs)| (name, attrs)); - match name.to_lowercase().as_str() { + match tag_name.to_lowercase().as_str() { + // Handle anchor tags "a" => { + // Extract href attribute let href = attribs - .and_then(|attribs| { - Some( - attribs - // check for the href and then discard everything before it - .split_once("href")? - .1 - // there might be whitespace between 'href' and '=' - .trim_start() - // check for and then discard the equal sign - .strip_prefix('=')? - // remove whitespace after the equal sign - .trim_start(), - ) - }) - .and_then(|href_value| - // find quoted string - match href_value.chars().next()? { - start @ '\'' | start @ '"' => { - let (end, _) = href_value - .char_indices() - .skip(1) - .find(|(_, c)| *c == start)?; - Some(href_value[1..end].to_string()) + .split_ascii_whitespace() + .find_map(|attr| { + let mut parts = attr.splitn(2, '='); + + if let (Some(key), Some(value)) = (parts.next(), parts.next()) { + if key.eq_ignore_ascii_case("href") { + Some(value.trim_matches(['"', '\''].as_ref())) + } else { + None } - _ => None, - }) - .filter(|href| !href.starts_with("javascript:")) - .map(|href| html_entitities_to_text(&href)); - // only use to_ascii_lowercase here so the byte offsets dont get - // messed up from one uppercase symbol becoming two lowercase - // symbols or something like that - let more = more.to_ascii_lowercase(); - - let end_without_closing = more.find("').map(|i| end_tag + i + 1)) - .unwrap_or_else(|| more.len()); - - let link = match (content, href) { - (Some(content_value), Some(href_value)) => { - if content_value == href_value { - href_value } else { - let cleaned_content_value = html2text(content_value); - format!("{} ({})", cleaned_content_value, href_value) + None } + }) + .filter(|href| !href.starts_with("javascript:")) + .map(html_entities_to_text); + + // Search for closing tag + let lower_rest = rest.to_ascii_lowercase(); + let end_tag_start = lower_rest.find("").unwrap_or(lower_rest.len()); + let content = &rest[..end_tag_start]; + + // Calculate the total length to skip + let closing_tag_len = if end_tag_start < lower_rest.len() { + 4 + } else { + 0 + }; + // Length of "" + + let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len; + let content_text = html2text(content.trim()); + let link = match (href, content_text.is_empty()) { + (Some(href_value), false) if content_text != href_value => { + format!("{} ({})", content_text, href_value) } - (None, Some(href_value)) => href_value, - (Some(content_value), None) => content_value.to_string(), - (None, None) => "".to_string(), + + (Some(href_value), _) => href_value, + + (_, false) => content_text, + + _ => String::new(), }; - (link, tag.len() + 1 + end) + (link, total_skip) } - "br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1), - "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" - | "/h6" => (String::from("\r\n\r\n"), tag.len() + 1), - name @ "head" | name @ "script" | name @ "style" => { - // silence tags + // Line breaks and list items + "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1), - // only use to_ascii_lowercase here so the byte offsets dont get - // messed up from one uppercase symbol becoming two lowercase - // symbols or something like that - let more = more.to_ascii_lowercase(); - let end = more - .find(&format!("').map(|i| i + end_tag + 1)) - .unwrap_or_else(|| more.len()); - (String::new(), tag.len() + 1 + end) + // Paragraphs and headings + "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1), + + // Tags to ignore along with their content + name if ["head", "script", "style"].contains(&name) => { + // Search for the closing tag + + let closing_tag = format!("", name); + let lower_rest = rest.to_ascii_lowercase(); + let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len()); + let closing_tag_len = if end_tag_start < lower_rest.len() { + closing_tag.len() + } else { + 0 + }; + + let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len; + + (String::new(), total_skip) } + + // HTML comments "!--" => { - // HTML comment - (String::new(), s.find("-->").map_or(s.len(), |n| n + 3)) + let end = s.find("-->").map_or(s.len(), |n| n + 3); + + (String::new(), end) } - // other/unknown tags are just discarded - _ => (String::new(), tag.len() + 1), + + // Discard other tags but keep their content + _ => (String::new(), tag_content.len() + 1), } } -/// Convert some HTML to plain text. Only some simple HTML tags are handled: -/// - `a` tags are transformed to their href attribute value -/// - paragraph, linebreak, heading, list, and list item tags insert different -/// amounts of line breaks. -/// - HTML comments as well as `head`, `script` and `style` are completely -/// discarded, including their content -/// - unknown tags are skipped, but their content is printed -/// -/// HTML named entities will be replaced with the respecive Unicode code point, -/// and whitespace will be collapsed as is usual in HTML. -/// -/// The resulting string will have CRLF line endings. +/// Convert an HTML string to plain text. +/// Handles basic HTML tags and entities, and collapses whitespace. pub fn html2text(html: &str) -> String { - // collapse spaces + // Collapse multiple whitespace characters into a single space let html = html.split_whitespace().collect::>().join(" "); - let mut out = String::new(); - - let mut i = 0; - while i < html.len() { - match html[i..].find('<') { - None => { - // no more tags in the input, done - out += &html_entitities_to_text(&html[i..]); - break; + let mut index = 0; + while index < html.len() { + if let Some(pos) = html[index..].find('<') { + if pos > 0 { + out.push_str(&html_entities_to_text(&html[index..index + pos])); + index += pos; } - Some(text_segment) => { - if text_segment > 0 { - out += &html_entitities_to_text(&html[i..i + text_segment]); - i += text_segment; + index += 1; // Skip the '<' + let (parsed_text, advance) = handle_tag(&html[index..]); + if !parsed_text.is_empty() { + if out.ends_with("\r\n\r\n") || out.is_empty() { + out.push_str(&parsed_text.trim_start()); + } else { + out.push_str(&parsed_text); } - i += 1; // skip the '<' - let (s, advance) = handle_tag(&html[i..]); - if !s.is_empty() { - if out.ends_with("\r\n\r\n") || out.is_empty() { - out += &s.trim_start(); - } else { - out += &s; - } - } - i += advance; } + index += advance; + } else { + // No more tags, process the remaining text + out.push_str(&html_entities_to_text(&html[index..])); + break; } } @@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String { #[cfg(test)] mod tests { use super::*; - macro_rules! test { ($name:ident, $from:literal, $to:literal $(,)?) => { - #[test] + #[test] fn $name() { - assert_eq!(&html2text($from), $to); - } - }; - ($($name:ident: $from:literal to $to:literal,)* $(,)?) => { - $(test!{$name, $from, $to})* + assert_eq!(html2text($from), $to); + } + }; + ($($name:ident: $from:literal to $to:literal,)*) => { + $(test!{$name, $from, $to})* }; } test! { - plaintext: "blah" to "blah", - tag: "
" to "", - tag_contents: "
simple text
" to "simple text", - // links - link: - "click here" - to "click here (test)", - link_href_equal_to_content: - "click test" - to "click test", - links_ignore_attributes: - "click here" - to "click here (test)", - link_entities_in_url: - "click here" - to "click here (ents/'x')", - link_javascript: - "click here" - to "click here", - link_ignore_content_tags: - "click here or here" - to "click here or here (test)", - link_absolute_url: - "click news" - to "click news (http://bit.ly/2n4wXRs)", - link_ignore_attributes_2: - "yet, not yet" - to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)", - // inlines - ignore_inline: - "strong text" - to "strong text", - ignore_inline_attributes: - "some
div
" - to "some div", - // lines breaks and spaces - collapse_spaces: - "should ignore more spaces" to "should ignore more spaces", - collapse_linebreaks: - "a\nb\nc" to "a b c", - collapse_mixed: - "should \nignore \r\nnew lines" to "should ignore new lines", - br_tag: - "two
line
breaks" to "two\r\nline\r\nbreaks", - paragraph: - "

two

paragraphs

" to "two\r\n\r\nparagraphs", - // Headers - h1: - "

First

main text" to "First\r\n\r\nmain text", - h2_inline: - "First

Second

next section" - to "First\r\n\r\nSecond\r\n\r\nnext section", - h2: - "

Second

next section" to "Second\r\n\r\nnext section", - h3_inline: - "Second

Third

next section" - to "Second\r\n\r\nThird\r\n\r\nnext section", - h3: - "

Third

next section" to "Third\r\n\r\nnext section", - h4_inline: - "Third

Fourth

next section" - to "Third\r\n\r\nFourth\r\n\r\nnext section", - h4: - "

Fourth

next section" to "Fourth\r\n\r\nnext section", - h5_inline: - "Fourth
Fifth
next section" - to "Fourth\r\n\r\nFifth\r\n\r\nnext section", - h5: - "
Fifth
next section" to "Fifth\r\n\r\nnext section", - h6_inline: - "Fifth
Sixth
next section" - to "Fifth\r\n\r\nSixth\r\n\r\nnext section", - h6: - "
Sixth
next section" to "Sixth\r\n\r\nnext section", - no_h7: - "Not Headernext section" to "Not Headernext section", - // html entitites - entity_nbsp: - "two  spaces" to "two  spaces", - entity_copy: - "© 2017 K3A" to "© 2017 K3A", - entity_tag: - "<printtag>" to "", - entity_currencies: - "would you pay in ¢, £, ¥ or €?" - to "would you pay in ¢, £, ¥ or €?", - ampersand_not_entity: - "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity", - entity_unknown: - "this &neither; as you see" to "this &neither; as you see", - entity_amp: - "fish & chips" to "fish & chips", - unordered_list: - "list of items
  • One
  • Two
  • Three
" - to "list of items\r\nOne\r\nTwo\r\nThree\r\n", - entity_quot: - ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey" - to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey", - entity_reg: - "Google ®" to "Google ®", - // Large entity - entity_large_unknown: - "&abcdefghij;" to "&abcdefghij;", - // Numeric HTML entities - entity_numeric: - "⁌ decimal and hex entities supported ⁍" - to "⁌ decimal and hex entities supported ⁍", - entity_numeric_2: - "'single quotes' and 츝" - to "'single quotes' and 츝", - // full thml structure - empty: "" to "", - full_html: - "Goodx" to "x", - ignore_script: - "we are not interested in scripts" - to "we are not interested in scripts", - // custom html tags - ignore_unknown_tag: - "hello" to "hello", - ignore_unknown_tag_whitespace: - "hello" to "hello", - ignore_unknown_tag_attributes: - "hello" to "hello", - invalid_html_entity_without_semicolon: "&hellip" to "…", + plaintext: "blah" to "blah", + tag: "
" to "", + tag_contents: "
simple text
" to "simple text", + // Links + link: "click here" to "click here (test)", + link_href_equal_to_content: "click test" to "click test", + links_ignore_attributes: "click here" to "click here (test)", + link_entities_in_url: "click here" to "click here (ents/'x')", + link_javascript: "click here" to "click here", + link_ignore_content_tags: "click here or here" to "click here or here (test)", + link_absolute_url: "click news" to "click news (http://bit.ly/2n4wXRs)", + link_ignore_attributes_2: "yet, not yet" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)", + // Inline elements + ignore_inline: "strong text" to "strong text", + ignore_inline_attributes: "some
div
" to "some div", + // Line breaks and spaces + collapse_spaces: "should ignore more spaces" to "should ignore more spaces", + collapse_linebreaks: "a\nb\nc" to "a b c", + collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines", + br_tag: "two
line
breaks" to "two\r\nline\r\nbreaks", + paragraph: "

two

paragraphs

" to "two\r\n\r\nparagraphs", + // Headers + h1: "

First

main text" to "First\r\n\r\nmain text", + h2_inline: "First

Second

next section" to "First\r\n\r\nSecond\r\n\r\nnext section", + h2: "

Second

next section" to "Second\r\n\r\nnext section", + h3_inline: "Second

Third

next section" to "Second\r\n\r\nThird\r\n\r\nnext section", + h3: "

Third

next section" to "Third\r\n\r\nnext section", + h4_inline: "Third

Fourth

next section" to "Third\r\n\r\nFourth\r\n\r\nnext section", + h4: "

Fourth

next section" to "Fourth\r\n\r\nnext section", + h5_inline: "Fourth
Fifth
next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section", + h5: "
Fifth
next section" to "Fifth\r\n\r\nnext section", + h6_inline: "Fifth
Sixth
next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section", + h6: "
Sixth
next section" to "Sixth\r\n\r\nnext section", + no_h7: "Not Headernext section" to "Not Headernext section", + // HTML entities + entity_nbsp: "two  spaces" to "two spaces", + entity_copy: "© 2017 K3A" to "© 2017 K3A", + entity_tag: "<printtag>" to "", + entity_currencies: "would you pay in ¢, £, ¥ or €?" to "would you pay in ¢, £, ¥ or €?", + ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity", + entity_unknown: "this &neither; as you see" to "this &neither; as you see", + entity_amp: "fish & chips" to "fish & chips", + // Unordered list + unordered_list: "list of items
  • One
  • Two
  • Three
" to "list of items\r\nOne\r\nTwo\r\nThree\r\n", + entity_quot: ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey", + entity_reg: "Google ®" to "Google ®", + // Large entity + entity_large_unknown: "&abcdefghij;" to "&abcdefghij;", + // Numeric HTML entities + entity_numeric: "⁌ decimal and hex entities supported ⁍" to "⁌ decimal and hex entities supported ⁍", + entity_numeric_2: "'single quotes' and 츝" to "'single quotes' and 츝", + // Full HTML structure + empty: "" to "", + full_html: "Goodx" to "x", + ignore_script: "we are not interested in scripts" to "we are not interested in scripts", + // Custom HTML tags + ignore_unknown_tag: "hello" to "hello", + ignore_unknown_tag_whitespace: "hello" to "hello", + ignore_unknown_tag_attributes: "hello" to "hello", + invalid_html_entity_without_semicolon: "&hellip" to "…", + } }