// almost a line for line rewrite of https://github.com/k3a/html2text/blob/master/html2text.go // mod entity; fn main() { println!("Hello, world!"); } const LBR: &str = "\r\n"; // stolen from https://github.com/veddan/rust-htmlescape/blob/master/src/decode.rs fn decode_named_entity(entity: &str) -> Option { match entity::ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) { Err(..) => None, Ok(idx) => { let (_, c) = entity::ENTITIES[idx]; Some(c) } } } const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"]; // awkward fn parse_link(l: &str) -> Option<&str> { if l.starts_with("a") { let s: Vec<&str> = l.split("href=").collect(); if s.len() > 1 { if s[1] != "" { if s[1].as_bytes()[0] == b'\'' { let end = s[1][1..].chars().position(|c| c == '\''); if let Some(p) = end { return Some(&s[1][1..=p]); } } else if s[1].as_bytes()[0] == b'"' { let end = s[1][1..].chars().position(|c| c == '"'); if let Some(p) = end { return Some(&s[1][1..=p]); } } } } } None } fn is_bad_tag(t: &str) -> bool { let t = t.split_whitespace().next().unwrap(); if BAD_TAGS.contains(&t) { return true; } false } // replacing regex fn is_header(h: &str) -> bool { let mut b = h.as_bytes(); if b.len() == 3 && b[0] == b'/' { b = &b[1..] } if b.len() == 2 && b[0] == b'h' { if b'1' <= b[1] && b[1] <= b'6' { return true; } } false } fn parse_html_entity(ent_name: &str) -> Option { let d = decode_named_entity(ent_name); if d.is_some() { return d; } // rewriting without regex let lower = ent_name.to_lowercase(); if lower.starts_with("#") && lower.len() > 1 { let parsed; if lower.as_bytes()[1] == b'x' && lower.len() > 2 { parsed = lower[2..].parse().ok(); } else { parsed = lower[1..].parse().ok(); } return parsed.and_then(|n| { if n == 9 || n == 10 || n == 13 || n > 32 { return char::from_u32(n); } return None; }); } None } fn html_entitities_to_text(s: &str) -> String { let mut out = String::new(); let mut in_ent = false; for (i, r) in s.chars().enumerate() { if r == ';' && in_ent { in_ent = false; continue; } else if r == '&' { let mut ent_name = String::new(); let mut is_ent = false; let mut chars = 0; for er in s[i + 1..].chars() { if er == ';' { is_ent = true; break; } else { ent_name.push(er); } chars += 1; if chars == 10 { break; } } if is_ent { if let Some(ent) = parse_html_entity(&ent_name) { out.push(ent); in_ent = true; continue; } } } if !in_ent { out.push(r); } } out } fn write_space(s: &mut String) { let b = s.as_bytes(); if b.len() > 0 && b[b.len() - 1] != b' ' { s.push(' '); } } fn html2text(html: &str) -> String { let in_len = html.len(); let mut tag_start = 0; let mut in_ent = false; let mut bad_tag_stack_depth = 0; let mut should_output = true; let mut can_print_new_line = false; let mut out_buf = String::new(); for (i, r) in html.chars().enumerate() { if in_len > 0 && i == in_len - 1 { can_print_new_line = false } if r.is_whitespace() { if should_output && bad_tag_stack_depth == 0 && !in_ent { write_space(&mut out_buf); } continue; } else if r == ';' && in_ent { in_ent = false; continue; } else if r == '&' && should_output { let mut ent_name = String::new(); let mut is_ent = false; let mut chars = 10; for er in html[i + 1..].chars() { if er == ';' { is_ent = true; break; } else { ent_name.push(er); } chars += 1; if chars == 10 { break; } } if is_ent { if let Some(ent) = parse_html_entity(&ent_name) { out_buf.push(ent); in_ent = true; } } } else if r == '<' { // start of tag tag_start = i + 1; should_output = false; continue; } else if r == '>' { // end of tag should_output = true; let tag = &html[tag_start..i]; let tag_name_lower = tag.to_lowercase(); if tag_name_lower == "/ul" { out_buf.push_str(LBR); } else if tag_name_lower == "li" || tag_name_lower == "li/" { out_buf.push_str(LBR); } else if is_header(&tag_name_lower) { if can_print_new_line { out_buf.push_str(LBR); out_buf.push_str(LBR); } can_print_new_line = false; } else if tag_name_lower == "br" || tag_name_lower == "br/" { out_buf.push_str(LBR); } else if tag_name_lower == "p" || tag_name_lower == "/p" { if can_print_new_line { out_buf.push_str(LBR); out_buf.push_str(LBR); } can_print_new_line = false; } else if is_bad_tag(&tag_name_lower) { bad_tag_stack_depth += 1; // parse link if let Some(link) = parse_link(tag) { if !link.contains("javascript:") { out_buf.push_str(&html_entitities_to_text(link)); } } } else if tag_name_lower.len() > 0 && tag_name_lower.starts_with("/") && is_bad_tag(&tag_name_lower) { bad_tag_stack_depth -= 1; } continue; } if should_output && bad_tag_stack_depth == 0 && !in_ent { can_print_new_line = true; out_buf.push(r); } } out_buf } #[cfg(test)] mod tests { use super::*; const cases: &[(&str, &str)] = &[ ("blah", "blah"), // links ("
", ""), ("
simple text
", "simple text"), ("click here", "click test"), ("click here", "click test"), ( "click here", "click ents/'x'", ), ("click here", "click "), ( "click here or here", "click test", ), ( "click news", "click http://bit.ly/2n4wXRs", ), // ("yet, not yet", "/wiki/yet#English, /wiki/not_yet#English"), // inlines ("strong text", "strong text"), ("some
div
", "some div"), // lines breaks and spaces ("should ignore more spaces", "should ignore more spaces"), ("should \nignore \r\nnew lines", "should ignore new lines"), ("a\nb\nc", "a b c"), ("two
line
breaks", "two\r\nline\r\nbreaks"), ("

two

paragraphs

", "two\r\n\r\nparagraphs"), // Headers ("

First

main text", "First\r\n\r\nmain text"), ( "First

Second

next section", "First\r\n\r\nSecond\r\n\r\nnext section", ), ("

Second

next section", "Second\r\n\r\nnext section"), ( "Second

Third

next section", "Second\r\n\r\nThird\r\n\r\nnext section", ), ("

Third

next section", "Third\r\n\r\nnext section"), ( "Third

Fourth

next section", "Third\r\n\r\nFourth\r\n\r\nnext section", ), ("

Fourth

next section", "Fourth\r\n\r\nnext section"), ( "Fourth
Fifth
next section", "Fourth\r\n\r\nFifth\r\n\r\nnext section", ), ("
Fifth
next section", "Fifth\r\n\r\nnext section"), ( "Fifth
Sixth
next section", "Fifth\r\n\r\nSixth\r\n\r\nnext section", ), ("
Sixth
next section", "Sixth\r\n\r\nnext section"), ("Not Headernext section", "Not Headernext section"), // html entitites ("two  spaces", "two  spaces"), ("© 2017 K3A", "© 2017 K3A"), ("<printtag>", ""), ( "would you pay in ¢, £, ¥ or €?", "would you pay in ¢, £, ¥ or €?", ), ( "Tom & Jerry is not an entity", "Tom & Jerry is not an entity", ), ("this &neither; as you see", "this &neither; as you see"), ( "list of items
  • One
  • Two
  • Three
", "list of items\r\nOne\r\nTwo\r\nThree\r\n", ), ("fish & chips", "fish & chips"), ( ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey", "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey", ), ("Google ®", "Google ®"), ( "⁌ decimal and hex entities supported ⁍", "⁌ decimal and hex entities supported ⁍", ), // Large entity ("&abcdefghij;", "&abcdefghij;"), // Numeric HTML entities ( "'single quotes' and 츝", "'single quotes' and 츝", ), // full thml structure ("", ""), ("Goodx", "x"), ( "we are not interested in scripts", "we are not interested in scripts", ), // custom html tags ("hello", "hello"), ("hello", "hello"), ("hello", "hello"), ]; #[test] fn test_all() { for case in cases { assert_eq!(&html2text(case.0), case.1); } } }