A ton of changes

Someone emailed me a patch with a lot of improvements here
2024-10-03 21:27:01 -04:00
parent 9c7a627e3f
commit cc8b9b0210
4 changed files with 205 additions and 290 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,4 +4,4 @@ version = 3

 [[package]]
 name = "nanohtml2text"
-version = "0.1.3"
+version = "0.2.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "nanohtml2text"
-version = "0.1.5"
+version = "0.2.0"
 edition = "2018"
 readme = "README.md"
 license = "MIT"
--- a/src/entity.rs
+++ b/src/entity.rs
@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
    ("natur", '\u{00266E}'),
    ("natural", '\u{00266E}'),
    ("naturals", '\u{002115}'),
-    ("nbsp", '\u{0000A0}'),
+    ("nbsp", ' '),
    ("ncap", '\u{002A43}'),
    ("ncaron", '\u{000148}'),
    ("ncedil", '\u{000146}'),
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,64 +2,60 @@ mod entity;

 fn decode_named_entity(entity: &str) -> Option<char> {
    entity::ENTITIES
-        .binary_search_by_key(&entity, |t| t.0)
-        .map(|idx| entity::ENTITIES[idx].1)
+        .binary_search_by_key(&entity, |&(name, _)| name)
        .ok()
+        .map(|idx| entity::ENTITIES[idx].1)
 }

-fn parse_html_entity(ent_name: &str) -> Option<char> {
-    let d = decode_named_entity(ent_name);
-    if d.is_some() {
-        return d;
+// Parse an HTML entity (named or numeric) and return the corresponding
+// character.
+
+fn parse_html_entity(entity: &str) -> Option<char> {
+    if let Some(c) = decode_named_entity(entity) {
+        return Some(c);
    }

-    let num = ent_name.strip_prefix("#")?;
-    if num.chars().next()? == 'x' {
-        u32::from_str_radix(&num[1..].to_lowercase(), 16)
+    let num = entity.strip_prefix('#')?;
+
+    let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
+        u32::from_str_radix(hex, 16).ok()?
    } else {
-        // remaining string may be empty, but that will generate an Err(Empty)
-        u32::from_str_radix(num, 10)
+        u32::from_str_radix(num, 10).ok()?
+    };
+
+    // Reject code points below 0x20 except tab, LF and CR; char::from_u32 rejects invalid code points
+    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
+        char::from_u32(code_point)
+    } else {
+        None
    }
-    .ok()
-    .filter(|n| !matches!(n, 9 | 10 | 13 | 32))
-    .and_then(|n| char::from_u32(n))
 }

-fn html_entitities_to_text(s: &str) -> String {
+/// Convert HTML entities in a string to their corresponding characters.
+
+fn html_entities_to_text(s: &str) -> String {
    let mut out = String::new();
-
-    // except for the first part, every part will have started with an ampersand
-    // thus the start of the remaining parts is a HTML entity
    let mut parts = s.split('&');
-    /*
-    skip first part. if the string started with an ampersand, the first part
-    will be an empty string

-    if the string was empty, the first part will also be an empty string so its
-    safe to unwrap
-    */
-    out.push_str(parts.next().unwrap());
+    // Add the first part (before any '&')
+    out.push_str(parts.next().unwrap_or_default());

    for part in parts {
        let end = part
-            // entity can be terminated by semicolon or whitespace
            .find(|c: char| c.is_whitespace() || c == ';')
-            // entity can also terminated by end of string or start of
-            // another entity
            .unwrap_or_else(|| part.len());
+
        if let Some(entity) = parse_html_entity(&part[..end]) {
            out.push(entity);
-            // get byte length of the char we did `find` above
-            let real_end = if let Some(next) = &part[end..].chars().next() {
-                end + next.len_utf8()
-            } else {
-                // invalid html entity that doesn't end with `;`
-                end
-            };

-            out.push_str(&part[real_end..]);
+            // Skip the single terminator character (';' or whitespace) that ended the entity, if any
+            let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
+            let remaining = &part[end + next_char_len..];
+
+            out.push_str(remaining);
        } else {
            out.push('&');
+
            out.push_str(part);
        }
    }
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
    out
 }

-/// Function to parse and handle the individual tags.
-/// Assumes that there was a '<' before the given string
-///
-/// Returns the generated text and the byte length to skip.
+// Handle one HTML tag and convert it to text; `s` is the input just after the '<'.
+// Returns the generated text and the number of bytes of `s` to skip.
 fn handle_tag(s: &str) -> (String, usize) {
-    let (tag, more) = match s.split_once('>') {
-        Some((tag, more)) if !tag.is_empty() => (tag, more),
+    let (tag_content, rest) = match s.split_once('>') {
+        Some((tag, rest)) if !tag.is_empty() => (tag, rest),
+
        _ => {
-            // was not actually a tag, so reinsert the '<'
-            return (String::from("<"), 0);
+            // Not a valid tag, treat '<' as a regular character
+            return ("<".to_string(), 0);
        }
    };

-    let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) {
-        (name, Some(attribs))
-    } else {
-        (tag, None)
-    };
+    // Split the tag into name and attributes
+    let (tag_name, attribs) = tag_content
+        .split_once(char::is_whitespace)
+        .map_or((tag_content, ""), |(name, attrs)| (name, attrs));

-    match name.to_lowercase().as_str() {
+    match tag_name.to_lowercase().as_str() {
+        // Handle anchor tags
        "a" => {
+            // Extract href attribute
            let href = attribs
-                .and_then(|attribs| {
-                    Some(
-                        attribs
-                            // check for the href and then discard everything before it
-                            .split_once("href")?
-                            .1
-                            // there might be whitespace between 'href' and '='
-                            .trim_start()
-                            // check for and then discard the equal sign
-                            .strip_prefix('=')?
-                            // remove whitespace after the equal sign
-                            .trim_start(),
-                    )
-                })
-                .and_then(|href_value|
-                    // find quoted string
-                    match href_value.chars().next()? {
-                        start @ '\'' | start @ '"' => {
-                            let (end, _) = href_value
-                                .char_indices()
-                                .skip(1)
-                                .find(|(_, c)| *c == start)?;
-                            Some(href_value[1..end].to_string())
+                .split_ascii_whitespace()
+                .find_map(|attr| {
+                    let mut parts = attr.splitn(2, '=');
+
+                    if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
+                        if key.eq_ignore_ascii_case("href") {
+                            Some(value.trim_matches(['"', '\''].as_ref()))
+                        } else {
+                            None
                        }
-                        _ => None,
-                    })
-                .filter(|href| !href.starts_with("javascript:"))
-                .map(|href| html_entitities_to_text(&href));
-            // only use to_ascii_lowercase here so the byte offsets dont get
-            // messed up from one uppercase symbol becoming two lowercase
-            // symbols or something like that
-            let more = more.to_ascii_lowercase();
-
-            let end_without_closing = more.find("</a");
-            let content = end_without_closing.map(|i| more[0..i].trim());
-
-            let end = end_without_closing
-                .map(|i| i + 3)
-                .and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
-                .unwrap_or_else(|| more.len());
-
-            let link = match (content, href) {
-                (Some(content_value), Some(href_value)) => {
-                    if content_value == href_value {
-                        href_value
                    } else {
-                        let cleaned_content_value = html2text(content_value);
-                        format!("{} ({})", cleaned_content_value, href_value)
+                        None
                    }
+                })
+                .filter(|href| !href.starts_with("javascript:"))
+                .map(html_entities_to_text);
+
+            // Search for closing </a> tag
+            let lower_rest = rest.to_ascii_lowercase();
+            let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
+            let content = &rest[..end_tag_start];
+
+            // Calculate the total length to skip
+            let closing_tag_len = if end_tag_start < lower_rest.len() {
+                4
+            } else {
+                0
+            };
+            // (the 4 above is the byte length of "</a>")
+
+            let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
+            let content_text = html2text(content.trim());
+            let link = match (href, content_text.is_empty()) {
+                (Some(href_value), false) if content_text != href_value => {
+                    format!("{} ({})", content_text, href_value)
                }
-                (None, Some(href_value)) => href_value,
-                (Some(content_value), None) => content_value.to_string(),
-                (None, None) => "".to_string(),
+
+                (Some(href_value), _) => href_value,
+
+                (_, false) => content_text,
+
+                _ => String::new(),
            };

-            (link, tag.len() + 1 + end)
+            (link, total_skip)
        }
-        "br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
-        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
-        | "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
-        name @ "head" | name @ "script" | name @ "style" => {
-            // silence tags
+        // Line breaks and list items
+        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),

-            // only use to_ascii_lowercase here so the byte offsets dont get
-            // messed up from one uppercase symbol becoming two lowercase
-            // symbols or something like that
-            let more = more.to_ascii_lowercase();
-            let end = more
-                .find(&format!("</{}", name))
-                .map(|i| i + 2 + name.len())
-                .and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1))
-                .unwrap_or_else(|| more.len());
-            (String::new(), tag.len() + 1 + end)
+        // Paragraphs and headings
+        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
+
+        // Tags to ignore along with their content
+        name if ["head", "script", "style"].contains(&name) => {
+            // Search for the closing tag
+
+            let closing_tag = format!("</{}>", name);
+            let lower_rest = rest.to_ascii_lowercase();
+            let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
+            let closing_tag_len = if end_tag_start < lower_rest.len() {
+                closing_tag.len()
+            } else {
+                0
+            };
+
+            let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
+
+            (String::new(), total_skip)
        }
+
+        // HTML comments
        "!--" => {
-            // HTML comment
-            (String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
+            let end = s.find("-->").map_or(s.len(), |n| n + 3);
+
+            (String::new(), end)
        }
-        // other/unknown tags are just discarded
-        _ => (String::new(), tag.len() + 1),
+
+        // Discard other tags but keep their content
+        _ => (String::new(), tag_content.len() + 1),
    }
 }

-/// Convert some HTML to plain text. Only some simple HTML tags are handled:
-/// - `a` tags are transformed to their href attribute value
-/// - paragraph, linebreak, heading, list, and list item tags insert different
-///   amounts of line breaks.
-/// - HTML comments as well as `head`, `script` and `style` are completely
-///   discarded, including their content
-/// - unknown tags are skipped, but their content is printed
-///
-/// HTML named entities will be replaced with the respecive Unicode code point,
-/// and whitespace will be collapsed as is usual in HTML.
-///
-/// The resulting string will have CRLF line endings.
+/// Convert an HTML string to plain text.
+/// Handles basic HTML tags and entities, and collapses whitespace.
 pub fn html2text(html: &str) -> String {
-    // collapse spaces
+    // Collapse multiple whitespace characters into a single space
    let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
-
    let mut out = String::new();
-
-    let mut i = 0;
-    while i < html.len() {
-        match html[i..].find('<') {
-            None => {
-                // no more tags in the input, done
-                out += &html_entitities_to_text(&html[i..]);
-                break;
+    let mut index = 0;
+    while index < html.len() {
+        if let Some(pos) = html[index..].find('<') {
+            if pos > 0 {
+                out.push_str(&html_entities_to_text(&html[index..index + pos]));
+                index += pos;
            }
-            Some(text_segment) => {
-                if text_segment > 0 {
-                    out += &html_entitities_to_text(&html[i..i + text_segment]);
-                    i += text_segment;
+            index += 1; // Skip the '<'
+            let (parsed_text, advance) = handle_tag(&html[index..]);
+            if !parsed_text.is_empty() {
+                if out.ends_with("\r\n\r\n") || out.is_empty() {
+                    out.push_str(&parsed_text.trim_start());
+                } else {
+                    out.push_str(&parsed_text);
                }
-                i += 1; // skip the '<'
-                let (s, advance) = handle_tag(&html[i..]);
-                if !s.is_empty() {
-                    if out.ends_with("\r\n\r\n") || out.is_empty() {
-                        out += &s.trim_start();
-                    } else {
-                        out += &s;
-                    }
-                }
-                i += advance;
            }
+            index += advance;
+        } else {
+            // No more tags, process the remaining text
+            out.push_str(&html_entities_to_text(&html[index..]));
+            break;
        }
    }

@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
 #[cfg(test)]
 mod tests {
    use super::*;
-
    macro_rules! test {
        ($name:ident, $from:literal, $to:literal $(,)?) => {
-            #[test]
+        #[test]
            fn $name() {
-                assert_eq!(&html2text($from), $to);
-            }
-        };
-        ($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
-            $(test!{$name, $from, $to})*
+                assert_eq!(html2text($from), $to);
+                }
+            };
+        ($($name:ident: $from:literal to $to:literal,)*) => {
+        $(test!{$name, $from, $to})*
        };
    }

    test! {
-        plaintext: "blah" to "blah",
-        tag: "<div></div>" to "",
-        tag_contents: "<div>simple text</div>" to "simple text",
-        // links
-        link:
-            "click <a href=\"test\">here</a>"
-            to "click here (test)",
-        link_href_equal_to_content:
-            "click <a href=\"test\">test</a>"
-            to "click test",
-        links_ignore_attributes:
-            "click <a class=\"x\" href=\"test\">here</a>"
-            to "click here (test)",
-        link_entities_in_url:
-            "click <a href=\"ents/&apos;x&apos;\">here</a>"
-            to "click here (ents/'x')",
-        link_javascript:
-            "click <a href=\"javascript:void(0)\">here</a>"
-            to "click here",
-        link_ignore_content_tags:
-            "click <a href=\"test\"><span>here</span> or here</a>"
-            to "click here or here (test)",
-        link_absolute_url:
-            "click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
-            to "click news (http://bit.ly/2n4wXRs)",
-        link_ignore_attributes_2:
-            "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
-            to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
-        // inlines
-        ignore_inline:
-            "strong <strong>text</strong>"
-            to "strong text",
-        ignore_inline_attributes:
-            "some <div id=\"a\" class=\"b\">div</div>"
-            to "some div",
-        // lines breaks and spaces
-        collapse_spaces:
-            "should    ignore more spaces" to "should ignore more spaces",
-        collapse_linebreaks:
-            "a\nb\nc" to "a b c",
-        collapse_mixed:
-            "should \nignore \r\nnew lines" to "should ignore new lines",
-        br_tag:
-            "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
-        paragraph:
-            "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
-        // Headers
-        h1:
-            "<h1>First</h1>main text" to "First\r\n\r\nmain text",
-        h2_inline:
-            "First<h2>Second</h2>next section"
-            to "First\r\n\r\nSecond\r\n\r\nnext section",
-        h2:
-            "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
-        h3_inline:
-            "Second<h3>Third</h3>next section"
-            to "Second\r\n\r\nThird\r\n\r\nnext section",
-        h3:
-            "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
-        h4_inline:
-            "Third<h4>Fourth</h4>next section"
-            to "Third\r\n\r\nFourth\r\n\r\nnext section",
-        h4:
-            "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
-        h5_inline:
-            "Fourth<h5>Fifth</h5>next section"
-            to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
-        h5:
-            "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
-        h6_inline:
-            "Fifth<h6>Sixth</h6>next section"
-            to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
-        h6:
-            "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
-        no_h7:
-            "<h7>Not Header</h7>next section" to "Not Headernext section",
-        // html entitites
-        entity_nbsp:
-            "two&nbsp;&nbsp;spaces" to "two  spaces",
-        entity_copy:
-            "&copy; 2017 K3A" to "© 2017 K3A",
-        entity_tag:
-            "&lt;printtag&gt;" to "<printtag>",
-        entity_currencies:
-            "would you pay in &cent;, &pound;, &yen; or &euro;?"
-            to "would you pay in ¢, £, ¥ or €?",
-        ampersand_not_entity:
-            "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
-        entity_unknown:
-            "this &neither; as you see" to "this &neither; as you see",
-        entity_amp:
-            "fish &amp; chips" to "fish & chips",
-        unordered_list:
-            "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
-            to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
-        entity_quot:
-            "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey"
-            to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
-        entity_reg:
-            "Google &reg;" to "Google ®",
-        // Large entity
-        entity_large_unknown:
-            "&abcdefghij;" to "&abcdefghij;",
-        // Numeric HTML entities
-        entity_numeric:
-            "&#8268; decimal and hex entities supported &#x204D;"
-            to "⁌ decimal and hex entities supported ⁍",
-        entity_numeric_2:
-            "&#39;single quotes&#39; and &#52765;"
-            to "'single quotes' and 츝",
-        // full thml structure
-        empty: "" to "",
-        full_html:
-            "<html><head><title>Good</title></head><body>x</body>" to "x",
-        ignore_script:
-            "we are not <script type=\"javascript\"></script>interested in scripts"
-            to "we are not interested in scripts",
-        // custom html tags
-        ignore_unknown_tag:
-            "<aa>hello</aa>" to "hello",
-        ignore_unknown_tag_whitespace:
-            "<aa >hello</aa>" to "hello",
-        ignore_unknown_tag_attributes:
-            "<aa x=\"1\">hello</aa>" to "hello",
-        invalid_html_entity_without_semicolon: "&hellip" to "…",
+    plaintext: "blah" to "blah",
+    tag: "<div></div>" to "",
+    tag_contents: "<div>simple text</div>" to "simple text",
+    // Links
+    link: "click <a href=\"test\">here</a>" to "click here (test)",
+    link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
+    links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
+    link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
+    link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
+    link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
+    link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
+    link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
+    // Inline elements
+    ignore_inline: "strong <strong>text</strong>" to "strong text",
+    ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
+    // Line breaks and spaces
+    collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
+    collapse_linebreaks: "a\nb\nc" to "a b c",
+    collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
+    br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
+    paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
+    // Headers
+    h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
+    h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
+    h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
+    h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
+    h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
+    h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
+    h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
+    h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
+    h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
+    h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
+    h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
+    no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
+    // HTML entities
+    entity_nbsp: "two&nbsp;&nbsp;spaces" to "two  spaces",
+    entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
+    entity_tag: "&lt;printtag&gt;" to "<printtag>",
+    entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
+    ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
+    entity_unknown: "this &neither; as you see" to "this &neither; as you see",
+    entity_amp: "fish &amp; chips" to "fish & chips",
+    // Unordered list
+    unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
+    entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
+    entity_reg: "Google &reg;" to "Google ®",
+    // Large entity
+    entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
+    // Numeric HTML entities
+    entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
+    entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
+    // Full HTML structure
+    empty: "" to "",
+    full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
+    ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
+    // Custom HTML tags
+    ignore_unknown_tag: "<aa>hello</aa>" to "hello",
+    ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
+    ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
+    invalid_html_entity_without_semicolon: "&hellip" to "…",
+
    }
 }