A ton of changes

Someone emailed me a patch with a lot of improvements here
2024-10-03 21:27:01 -04:00
parent 9c7a627e3f
commit cc8b9b0210
4 changed files with 205 additions and 290 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,4 +4,4 @@ version = 3
 [[package]]
 name = "nanohtml2text"
-version = "0.1.3"
+version = "0.2.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "nanohtml2text"
-version = "0.1.5"
+version = "0.2.0"
 edition = "2018"
 readme = "README.md"
 license = "MIT"
--- a/src/entity.rs
+++ b/src/entity.rs
@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
    ("natur", '\u{00266E}'),
    ("natural", '\u{00266E}'),
    ("naturals", '\u{002115}'),
-    ("nbsp", '\u{0000A0}'),
+    ("nbsp", ' '),
    ("ncap", '\u{002A43}'),
    ("ncaron", '\u{000148}'),
    ("ncedil", '\u{000146}'),
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,64 +2,60 @@ mod entity;
 fn decode_named_entity(entity: &str) -> Option<char> {
    entity::ENTITIES
-        .binary_search_by_key(&entity, |t| t.0)
+        .binary_search_by_key(&entity, |&(name, _)| name)
        .map(|idx| entity::ENTITIES[idx].1)
        .ok()
        .map(|idx| entity::ENTITIES[idx].1)
 }
-fn parse_html_entity(ent_name: &str) -> Option<char> {
+// Parse an HTML entity (named or numeric) and return the corresponding
-    let d = decode_named_entity(ent_name);
+// character.
-    if d.is_some() {
+
-        return d;
+fn parse_html_entity(entity: &str) -> Option<char> {
    if let Some(c) = decode_named_entity(entity) {
        return Some(c);
    }
-    let num = ent_name.strip_prefix("#")?;
+    let num = entity.strip_prefix('#')?;
-    if num.chars().next()? == 'x' {
+
-        u32::from_str_radix(&num[1..].to_lowercase(), 16)
+    let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
        u32::from_str_radix(hex, 16).ok()?
    } else {
-        // remaining string may be empty, but that will generate an Err(Empty)
+        u32::from_str_radix(num, 10).ok()?
-        u32::from_str_radix(num, 10)
+    };
    // Exclude control characters and ensure valid Unicode code point
    if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
        char::from_u32(code_point)
    } else {
        None
    }
    .ok()
    .filter(|n| !matches!(n, 9 | 10 | 13 | 32))
    .and_then(|n| char::from_u32(n))
 }
-fn html_entitities_to_text(s: &str) -> String {
+/// Convert HTML entities in a string to their corresponding characters.
 fn html_entities_to_text(s: &str) -> String {
    let mut out = String::new();
    // except for the first part, every part will have started with an ampersand
    // thus the start of the remaining parts is an HTML entity
    let mut parts = s.split('&');
    /*
    skip first part. if the string started with an ampersand, the first part
    will be an empty string
-    if the string was empty, the first part will also be an empty string so its
+    // Add the first part (before any '&')
-    safe to unwrap
+    out.push_str(parts.next().unwrap_or_default());
    */
    out.push_str(parts.next().unwrap());
    for part in parts {
        let end = part
            // entity can be terminated by semicolon or whitespace
            .find(|c: char| c.is_whitespace() || c == ';')
            // entity can also be terminated by end of string or start of
            // another entity
            .unwrap_or_else(|| part.len());
        if let Some(entity) = parse_html_entity(&part[..end]) {
            out.push(entity);
            // get byte length of the char we did `find` above
            let real_end = if let Some(next) = &part[end..].chars().next() {
                end + next.len_utf8()
            } else {
                // invalid html entity that doesn't end with `;`
                end
            };
-            out.push_str(&part[real_end..]);
+            // Advance past the entity and any following semicolon or whitespace
            let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
            let remaining = &part[end + next_char_len..];
            out.push_str(remaining);
        } else {
            out.push('&');
            out.push_str(part);
        }
    }
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
    out
 }
-/// Function to parse and handle the individual tags.
+// Handle individual HTML tags and convert them to text.
-/// Assumes that there was a '<' before the given string
+// Returns the generated text and the number of bytes to skip.
 ///
 /// Returns the generated text and the byte length to skip.
 fn handle_tag(s: &str) -> (String, usize) {
-    let (tag, more) = match s.split_once('>') {
+    let (tag_content, rest) = match s.split_once('>') {
-        Some((tag, more)) if !tag.is_empty() => (tag, more),
+        Some((tag, rest)) if !tag.is_empty() => (tag, rest),
        _ => {
-            // was not actually a tag, so reinsert the '<'
+            // Not a valid tag, treat '<' as a regular character
-            return (String::from("<"), 0);
+            return ("<".to_string(), 0);
        }
    };
-    let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) {
+    // Split the tag into name and attributes
-        (name, Some(attribs))
+    let (tag_name, attribs) = tag_content
-    } else {
+        .split_once(char::is_whitespace)
-        (tag, None)
+        .map_or((tag_content, ""), |(name, attrs)| (name, attrs));
    };
-    match name.to_lowercase().as_str() {
+    match tag_name.to_lowercase().as_str() {
        // Handle anchor tags
        "a" => {
            // Extract href attribute
            let href = attribs
-                .and_then(|attribs| {
+                .split_ascii_whitespace()
-                    Some(
+                .find_map(|attr| {
-                        attribs
+                    let mut parts = attr.splitn(2, '=');
-                            // check for the href and then discard everything before it
+
-                            .split_once("href")?
+                    if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
-                            .1
+                        if key.eq_ignore_ascii_case("href") {
-                            // there might be whitespace between 'href' and '='
+                            Some(value.trim_matches(['"', '\''].as_ref()))
-                            .trim_start()
+                        } else {
-                            // check for and then discard the equal sign
+                            None
                            .strip_prefix('=')?
                            // remove whitespace after the equal sign
                            .trim_start(),
                    )
                })
                .and_then(|href_value|
                    // find quoted string
                    match href_value.chars().next()? {
                        start @ '\'' | start @ '"' => {
                            let (end, _) = href_value
                                .char_indices()
                                .skip(1)
                                .find(|(_, c)| *c == start)?;
                            Some(href_value[1..end].to_string())
                        }
                        _ => None,
                    })
                .filter(|href| !href.starts_with("javascript:"))
                .map(|href| html_entitities_to_text(&href));
            // only use to_ascii_lowercase here so the byte offsets dont get
            // messed up from one uppercase symbol becoming two lowercase
            // symbols or something like that
            let more = more.to_ascii_lowercase();
            let end_without_closing = more.find("</a");
            let content = end_without_closing.map(|i| more[0..i].trim());
            let end = end_without_closing
                .map(|i| i + 3)
                .and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
                .unwrap_or_else(|| more.len());
            let link = match (content, href) {
                (Some(content_value), Some(href_value)) => {
                    if content_value == href_value {
                        href_value
                    } else {
-                        let cleaned_content_value = html2text(content_value);
+                        None
                        format!("{} ({})", cleaned_content_value, href_value)
                    }
                })
                .filter(|href| !href.starts_with("javascript:"))
                .map(html_entities_to_text);
            // Search for closing </a> tag
            let lower_rest = rest.to_ascii_lowercase();
            let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
            let content = &rest[..end_tag_start];
            // Calculate the total length to skip
            let closing_tag_len = if end_tag_start < lower_rest.len() {
                4
            } else {
                0
            };
            // Length of "</a>"
            let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
            let content_text = html2text(content.trim());
            let link = match (href, content_text.is_empty()) {
                (Some(href_value), false) if content_text != href_value => {
                    format!("{} ({})", content_text, href_value)
                }
-                (None, Some(href_value)) => href_value,
+
-                (Some(content_value), None) => content_value.to_string(),
+                (Some(href_value), _) => href_value,
-                (None, None) => "".to_string(),
+
                (_, false) => content_text,
                _ => String::new(),
            };
-            (link, tag.len() + 1 + end)
+            (link, total_skip)
        }
-        "br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
+        // Line breaks and list items
-        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
+        "br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
        | "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
        name @ "head" | name @ "script" | name @ "style" => {
            // silence tags
-            // only use to_ascii_lowercase here so the byte offsets dont get
+        // Paragraphs and headings
-            // messed up from one uppercase symbol becoming two lowercase
+        "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
-            // symbols or something like that
+
-            let more = more.to_ascii_lowercase();
+        // Tags to ignore along with their content
-            let end = more
+        name if ["head", "script", "style"].contains(&name) => {
-                .find(&format!("</{}", name))
+            // Search for the closing tag
-                .map(|i| i + 2 + name.len())
+
-                .and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1))
+            let closing_tag = format!("</{}>", name);
-                .unwrap_or_else(|| more.len());
+            let lower_rest = rest.to_ascii_lowercase();
-            (String::new(), tag.len() + 1 + end)
+            let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
            let closing_tag_len = if end_tag_start < lower_rest.len() {
                closing_tag.len()
            } else {
                0
            };
            let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
            (String::new(), total_skip)
        }
        // HTML comments
        "!--" => {
-            // HTML comment
+            let end = s.find("-->").map_or(s.len(), |n| n + 3);
-            (String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
+
            (String::new(), end)
        }
-        // other/unknown tags are just discarded
+
-        _ => (String::new(), tag.len() + 1),
+        // Discard other tags but keep their content
        _ => (String::new(), tag_content.len() + 1),
    }
 }
-/// Convert some HTML to plain text. Only some simple HTML tags are handled:
+/// Convert an HTML string to plain text.
-/// - `a` tags are transformed to their href attribute value
+/// Handles basic HTML tags and entities, and collapses whitespace.
 /// - paragraph, linebreak, heading, list, and list item tags insert different
 ///   amounts of line breaks.
 /// - HTML comments as well as `head`, `script` and `style` are completely
 ///   discarded, including their content
 /// - unknown tags are skipped, but their content is printed
 ///
 /// HTML named entities will be replaced with the respective Unicode code point,
 /// and whitespace will be collapsed as is usual in HTML.
 ///
 /// The resulting string will have CRLF line endings.
 pub fn html2text(html: &str) -> String {
-    // collapse spaces
+    // Collapse multiple whitespace characters into a single space
    let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
    let mut out = String::new();
-
+    let mut index = 0;
-    let mut i = 0;
+    while index < html.len() {
-    while i < html.len() {
+        if let Some(pos) = html[index..].find('<') {
-        match html[i..].find('<') {
+            if pos > 0 {
-            None => {
+                out.push_str(&html_entities_to_text(&html[index..index + pos]));
-                // no more tags in the input, done
+                index += pos;
                out += &html_entitities_to_text(&html[i..]);
                break;
            }
-            Some(text_segment) => {
+            index += 1; // Skip the '<'
-                if text_segment > 0 {
+            let (parsed_text, advance) = handle_tag(&html[index..]);
-                    out += &html_entitities_to_text(&html[i..i + text_segment]);
+            if !parsed_text.is_empty() {
-                    i += text_segment;
+                if out.ends_with("\r\n\r\n") || out.is_empty() {
                    out.push_str(&parsed_text.trim_start());
                } else {
                    out.push_str(&parsed_text);
                }
                i += 1; // skip the '<'
                let (s, advance) = handle_tag(&html[i..]);
                if !s.is_empty() {
                    if out.ends_with("\r\n\r\n") || out.is_empty() {
                        out += &s.trim_start();
                    } else {
                        out += &s;
                    }
                }
                i += advance;
            }
            index += advance;
        } else {
            // No more tags, process the remaining text
            out.push_str(&html_entities_to_text(&html[index..]));
            break;
        }
    }
@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
 #[cfg(test)]
 mod tests {
    use super::*;
    macro_rules! test {
        ($name:ident, $from:literal, $to:literal $(,)?) => {
-            #[test]
+        #[test]
            fn $name() {
-                assert_eq!(&html2text($from), $to);
+                assert_eq!(html2text($from), $to);
-            }
+                }
-        };
+            };
-        ($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
+        ($($name:ident: $from:literal to $to:literal,)*) => {
-            $(test!{$name, $from, $to})*
+        $(test!{$name, $from, $to})*
        };
    }
    test! {
-        plaintext: "blah" to "blah",
+    plaintext: "blah" to "blah",
-        tag: "<div></div>" to "",
+    tag: "<div></div>" to "",
-        tag_contents: "<div>simple text</div>" to "simple text",
+    tag_contents: "<div>simple text</div>" to "simple text",
-        // links
+    // Links
-        link:
+    link: "click <a href=\"test\">here</a>" to "click here (test)",
-            "click <a href=\"test\">here</a>"
+    link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
-            to "click here (test)",
+    links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
-        link_href_equal_to_content:
+    link_entities_in_url: "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
-            "click <a href=\"test\">test</a>"
+    link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
-            to "click test",
+    link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
-        links_ignore_attributes:
+    link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
-            "click <a class=\"x\" href=\"test\">here</a>"
+    link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
-            to "click here (test)",
+    // Inline elements
-        link_entities_in_url:
+    ignore_inline: "strong <strong>text</strong>" to "strong text",
-            "click <a href=\"ents/&apos;x&apos;\">here</a>"
+    ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
-            to "click here (ents/'x')",
+    // Line breaks and spaces
-        link_javascript:
+    collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
-            "click <a href=\"javascript:void(0)\">here</a>"
+    collapse_linebreaks: "a\nb\nc" to "a b c",
-            to "click here",
+    collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
-        link_ignore_content_tags:
+    br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
-            "click <a href=\"test\"><span>here</span> or here</a>"
+    paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
-            to "click here or here (test)",
+    // Headers
-        link_absolute_url:
+    h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
-            "click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
+    h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
-            to "click news (http://bit.ly/2n4wXRs)",
+    h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
-        link_ignore_attributes_2:
+    h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
-            "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
+    h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
-            to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
+    h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
-        // inlines
+    h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
-        ignore_inline:
+    h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
-            "strong <strong>text</strong>"
+    h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
-            to "strong text",
+    h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
-        ignore_inline_attributes:
+    h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
-            "some <div id=\"a\" class=\"b\">div</div>"
+    no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
-            to "some div",
+    // HTML entities
-        // lines breaks and spaces
+    entity_nbsp: "two&nbsp;&nbsp;spaces" to "two  spaces",
-        collapse_spaces:
+    entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
-            "should    ignore more spaces" to "should ignore more spaces",
+    entity_tag: "&lt;printtag&gt;" to "<printtag>",
-        collapse_linebreaks:
+    entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
-            "a\nb\nc" to "a b c",
+    ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
-        collapse_mixed:
+    entity_unknown: "this &neither; as you see" to "this &neither; as you see",
-            "should \nignore \r\nnew lines" to "should ignore new lines",
+    entity_amp: "fish &amp; chips" to "fish & chips",
-        br_tag:
+    // Unordered list
-            "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
+    unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
-        paragraph:
+    entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
-            "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
+    entity_reg: "Google &reg;" to "Google ®",
-        // Headers
+    // Large entity
-        h1:
+    entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
-            "<h1>First</h1>main text" to "First\r\n\r\nmain text",
+    // Numeric HTML entities
-        h2_inline:
+    entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
-            "First<h2>Second</h2>next section"
+    entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
-            to "First\r\n\r\nSecond\r\n\r\nnext section",
+    // Full HTML structure
-        h2:
+    empty: "" to "",
-            "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
+    full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
-        h3_inline:
+    ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
-            "Second<h3>Third</h3>next section"
+    // Custom HTML tags
-            to "Second\r\n\r\nThird\r\n\r\nnext section",
+    ignore_unknown_tag: "<aa>hello</aa>" to "hello",
-        h3:
+    ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
-            "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
+    ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
-        h4_inline:
+    invalid_html_entity_without_semicolon: "&hellip" to "…",
-            "Third<h4>Fourth</h4>next section"
+
            to "Third\r\n\r\nFourth\r\n\r\nnext section",
        h4:
            "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
        h5_inline:
            "Fourth<h5>Fifth</h5>next section"
            to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
        h5:
            "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
        h6_inline:
            "Fifth<h6>Sixth</h6>next section"
            to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
        h6:
            "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
        no_h7:
            "<h7>Not Header</h7>next section" to "Not Headernext section",
        // html entities
        entity_nbsp:
            "two&nbsp;&nbsp;spaces" to "two  spaces",
        entity_copy:
            "&copy; 2017 K3A" to "© 2017 K3A",
        entity_tag:
            "&lt;printtag&gt;" to "<printtag>",
        entity_currencies:
            "would you pay in &cent;, &pound;, &yen; or &euro;?"
            to "would you pay in ¢, £, ¥ or €?",
        ampersand_not_entity:
            "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
        entity_unknown:
            "this &neither; as you see" to "this &neither; as you see",
        entity_amp:
            "fish &amp; chips" to "fish & chips",
        unordered_list:
            "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
            to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
        entity_quot:
            "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey"
            to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
        entity_reg:
            "Google &reg;" to "Google ®",
        // Large entity
        entity_large_unknown:
            "&abcdefghij;" to "&abcdefghij;",
        // Numeric HTML entities
        entity_numeric:
            "&#8268; decimal and hex entities supported &#x204D;"
            to "⁌ decimal and hex entities supported ⁍",
        entity_numeric_2:
            "&#39;single quotes&#39; and &#52765;"
            to "'single quotes' and 츝",
        // full html structure
        empty: "" to "",
        full_html:
            "<html><head><title>Good</title></head><body>x</body>" to "x",
        ignore_script:
            "we are not <script type=\"javascript\"></script>interested in scripts"
            to "we are not interested in scripts",
        // custom html tags
        ignore_unknown_tag:
            "<aa>hello</aa>" to "hello",
        ignore_unknown_tag_whitespace:
            "<aa >hello</aa>" to "hello",
        ignore_unknown_tag_attributes:
            "<aa x=\"1\">hello</aa>" to "hello",
        invalid_html_entity_without_semicolon: "&hellip" to "…",
    }
 }