diff --git a/src/lib.rs b/src/lib.rs index bda254f..2ae0ae5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1 +1,346 @@ -fn html2text() {} +// almost a line for line rewrite of https://github.com/k3a/html2text/blob/master/html2text.go +// +mod entity; +fn main() { + println!("Hello, world!"); +} + +const LBR: &str = "\r\n"; +// stolen from https://github.com/veddan/rust-htmlescape/blob/master/src/decode.rs +fn decode_named_entity(entity: &str) -> Option { + match entity::ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) { + Err(..) => None, + Ok(idx) => { + let (_, c) = entity::ENTITIES[idx]; + Some(c) + } + } +} + +const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"]; + +// awkward +fn parse_link(l: &str) -> Option<&str> { + if l.starts_with("a") { + let s: Vec<&str> = l.split("href=").collect(); + if s.len() > 1 { + if s[1] != "" { + if s[1].as_bytes()[0] == b'\'' { + let end = s[1][1..].chars().position(|c| c == '\''); + if let Some(p) = end { + return Some(&s[1][1..=p]); + } + } else if s[1].as_bytes()[0] == b'"' { + let end = s[1][1..].chars().position(|c| c == '"'); + if let Some(p) = end { + return Some(&s[1][1..=p]); + } + } + } + } + } + None +} + +fn is_bad_tag(t: &str) -> bool { + let t = t.split_whitespace().next().unwrap(); + if BAD_TAGS.contains(&t) { + return true; + } + false +} + +// replacing regex +fn is_header(h: &str) -> bool { + let mut b = h.as_bytes(); + if b.len() == 3 && b[0] == b'/' { + b = &b[1..] + } + if b.len() == 2 && b[0] == b'h' { + if b'1' <= b[1] && b[1] <= b'6' { + return true; + } + } + false +} + +fn parse_html_entity(ent_name: &str) -> Option { + let d = decode_named_entity(ent_name); + if d.is_some() { + return d; + } + // rewriting without regex + let lower = ent_name.to_lowercase(); + if lower.starts_with("#") && lower.len() > 1 { + let parsed; + if lower.as_bytes()[1] == b'x' && lower.len() > 2 { + parsed = u32::from_str_radix(&lower[2..], 16).ok(); + } else { + parsed = u32::from_str_radix(&lower[1..], 10).ok(); + } + return parsed.and_then(|n| { + if n == 9 || n == 10 || n == 13 || n > 32 { + return char::from_u32(n); + } + return None; + }); + } + + None +} + +fn html_entitities_to_text(s: &str) -> String { + let mut out = String::new(); + let mut in_ent = false; + for (i, r) in s.chars().enumerate() { + if r == ';' && in_ent { + in_ent = false; + continue; + } else if r == '&' { + let mut ent_name = String::new(); + let mut is_ent = false; + let mut chars = 0; + for er in s[i + 1..].chars() { + if er == ';' { + is_ent = true; + break; + } else { + ent_name.push(er); + } + chars += 1; + if chars == 10 { + break; + } + } + if is_ent { + if let Some(ent) = parse_html_entity(&ent_name) { + out.push(ent); + in_ent = true; + continue; + } + } + } + if !in_ent { + out.push(r); + } + } + out +} + +fn write_space(s: &mut String) { + let b = s.as_bytes(); + if b.len() > 0 && b[b.len() - 1] != b' ' { + s.push(' '); + } +} + +pub fn html2text(html: &str) -> String { + let in_len = html.len(); + let mut tag_start = 0; + let mut in_ent = false; + let mut bad_tag_stack_depth = 0; + let mut should_output = true; + let mut can_print_new_line = false; + let mut out_buf = String::new(); + for (i, r) in html.char_indices() { + if in_len > 0 && i == in_len - 1 { + can_print_new_line = false + } + if r.is_whitespace() { + if should_output && bad_tag_stack_depth == 0 && !in_ent { + write_space(&mut out_buf); + } + continue; + } else if r == ';' && in_ent { + in_ent = false; + continue; + } else if r == '&' && should_output { + let mut ent_name = String::new(); + let mut is_ent = false; + let mut chars = 10; + for er in html[i + 1..].chars() { + if er == ';' { + is_ent = true; + break; + } else { + ent_name.push(er); + } + chars += 1; + if chars == 10 { + break; + } + } + if is_ent { + if let Some(ent) = parse_html_entity(&ent_name) { + out_buf.push(ent); + in_ent = true; + } + } + } else if r == '<' { + // start of tag + tag_start = i + 1; + should_output = false; + continue; + } else if r == '>' { + should_output = true; + let tag = &html[tag_start..i]; + let tag_name_lower = tag.to_lowercase(); + if tag_name_lower == "/ul" { + out_buf.push_str(LBR); + } else if tag_name_lower == "li" || tag_name_lower == "li/" { + out_buf.push_str(LBR); + } else if is_header(&tag_name_lower) { + if can_print_new_line { + out_buf.push_str(LBR); + out_buf.push_str(LBR); + } + can_print_new_line = false; + } else if tag_name_lower == "br" || tag_name_lower == "br/" { + out_buf.push_str(LBR); + } else if tag_name_lower == "p" || tag_name_lower == "/p" { + if can_print_new_line { + out_buf.push_str(LBR); + out_buf.push_str(LBR); + } + can_print_new_line = false; + } else if is_bad_tag(&tag_name_lower) { + bad_tag_stack_depth += 1; + // parse link + if let Some(link) = parse_link(tag) { + if !link.contains("javascript:") { + out_buf.push_str(&html_entitities_to_text(link)); + } + } + } else if tag_name_lower.len() > 0 + && tag_name_lower.starts_with("/") + && is_bad_tag(&tag_name_lower[1..]) + { + bad_tag_stack_depth -= 1; + } + continue; + } + + if should_output && bad_tag_stack_depth == 0 && !in_ent { + can_print_new_line = true; + out_buf.push(r); + } + } + out_buf +} + +#[cfg(test)] +mod tests { + use super::*; + const cases: &[(&str, &str)] = &[ + ("blah", "blah"), + // links + ("
", ""), + ("
simple text
", "simple text"), + ("click here", "click test"), + ("click here", "click test"), + ( + "click here", + "click ents/'x'", + ), + ("click here", "click "), + ( + "click here or here", + "click test", + ), + ( + "click news", + "click http://bit.ly/2n4wXRs", + ), + ("yet, not yet", "/wiki/yet#English, /wiki/not_yet#English"), + + // inlines + ("strong text", "strong text"), + ("some
div
", "some div"), + // lines breaks and spaces + ("should ignore more spaces", "should ignore more spaces"), + ("should \nignore \r\nnew lines", "should ignore new lines"), + ("a\nb\nc", "a b c"), + ("two
line
breaks", "two\r\nline\r\nbreaks"), + ("

two

paragraphs

", "two\r\n\r\nparagraphs"), + // Headers + ("

First

main text", "First\r\n\r\nmain text"), + ( + "First

Second

next section", + "First\r\n\r\nSecond\r\n\r\nnext section", + ), + ("

Second

next section", "Second\r\n\r\nnext section"), + ( + "Second

Third

next section", + "Second\r\n\r\nThird\r\n\r\nnext section", + ), + ("

Third

next section", "Third\r\n\r\nnext section"), + ( + "Third

Fourth

next section", + "Third\r\n\r\nFourth\r\n\r\nnext section", + ), + ("

Fourth

next section", "Fourth\r\n\r\nnext section"), + ( + "Fourth
Fifth
next section", + "Fourth\r\n\r\nFifth\r\n\r\nnext section", + ), + ("
Fifth
next section", "Fifth\r\n\r\nnext section"), + ( + "Fifth
Sixth
next section", + "Fifth\r\n\r\nSixth\r\n\r\nnext section", + ), + ("
Sixth
next section", "Sixth\r\n\r\nnext section"), + ("Not Headernext section", "Not Headernext section"), + // html entitites + ("two  spaces", "two  spaces"), + ("© 2017 K3A", "© 2017 K3A"), + ("<printtag>", ""), + ( + "would you pay in ¢, £, ¥ or €?", + "would you pay in ¢, £, ¥ or €?", + ), + ( + "Tom & Jerry is not an entity", + "Tom & Jerry is not an entity", + ), + ("this &neither; as you see", "this &neither; as you see"), + ( + "list of items
  • One
  • Two
  • Three
", + "list of items\r\nOne\r\nTwo\r\nThree\r\n", + ), + ("fish & chips", "fish & chips"), + ( + ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey", + "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey", + ), + ("Google ®", "Google ®"), + ( + "⁌ decimal and hex entities supported ⁍", + "⁌ decimal and hex entities supported ⁍", + ), + // Large entity + ("&abcdefghij;", "&abcdefghij;"), + // Numeric HTML entities + ( + "'single quotes' and 츝", + "'single quotes' and 츝", + ), + // full thml structure + ("", ""), + ("Goodx", "x"), + ( + "we are not interested in scripts", + "we are not interested in scripts", + ), + // custom html tags + ("hello", "hello"), + ("hello", "hello"), + ("hello", "hello"), + ]; + + #[test] + fn test_all() { + for case in cases { + assert_eq!(&html2text(case.0), case.1); + } + } +} diff --git a/src/main.rs b/src/main.rs index a0f9fda..e3db6af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,347 +1,8 @@ -// almost a line for line rewrite of https://github.com/k3a/html2text/blob/master/html2text.go -// -mod entity; +use nanohtml2text::html2text; +use std::io::{self, Read}; + fn main() { - println!("Hello, world!"); -} - -const LBR: &str = "\r\n"; -// stolen from https://github.com/veddan/rust-htmlescape/blob/master/src/decode.rs -fn decode_named_entity(entity: &str) -> Option { - match entity::ENTITIES.binary_search_by(|&(ent, _)| ent.cmp(entity)) { - Err(..) => None, - Ok(idx) => { - let (_, c) = entity::ENTITIES[idx]; - Some(c) - } - } -} - -const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"]; - -// awkward -fn parse_link(l: &str) -> Option<&str> { - if l.starts_with("a") { - let s: Vec<&str> = l.split("href=").collect(); - if s.len() > 1 { - if s[1] != "" { - if s[1].as_bytes()[0] == b'\'' { - let end = s[1][1..].chars().position(|c| c == '\''); - if let Some(p) = end { - return Some(&s[1][1..=p]); - } - } else if s[1].as_bytes()[0] == b'"' { - let end = s[1][1..].chars().position(|c| c == '"'); - if let Some(p) = end { - return Some(&s[1][1..=p]); - } - } - } - } - } - None -} - -fn is_bad_tag(t: &str) -> bool { - let t = t.split_whitespace().next().unwrap(); - if BAD_TAGS.contains(&t) { - return true; - } - false -} - -// replacing regex -fn is_header(h: &str) -> bool { - let mut b = h.as_bytes(); - if b.len() == 3 && b[0] == b'/' { - b = &b[1..] - } - if b.len() == 2 && b[0] == b'h' { - if b'1' <= b[1] && b[1] <= b'6' { - return true; - } - } - false -} - -fn parse_html_entity(ent_name: &str) -> Option { - let d = decode_named_entity(ent_name); - if d.is_some() { - return d; - } - // rewriting without regex - let lower = ent_name.to_lowercase(); - if lower.starts_with("#") && lower.len() > 1 { - let parsed; - if lower.as_bytes()[1] == b'x' && lower.len() > 2 { - parsed = u32::from_str_radix(&lower[2..], 16).ok(); - } else { - parsed = u32::from_str_radix(&lower[1..], 10).ok(); - } - return parsed.and_then(|n| { - if n == 9 || n == 10 || n == 13 || n > 32 { - return char::from_u32(n); - } - return None; - }); - } - - None -} - -fn html_entitities_to_text(s: &str) -> String { - let mut out = String::new(); - let mut in_ent = false; - for (i, r) in s.chars().enumerate() { - if r == ';' && in_ent { - in_ent = false; - continue; - } else if r == '&' { - let mut ent_name = String::new(); - let mut is_ent = false; - let mut chars = 0; - for er in s[i + 1..].chars() { - if er == ';' { - is_ent = true; - break; - } else { - ent_name.push(er); - } - chars += 1; - if chars == 10 { - break; - } - } - if is_ent { - if let Some(ent) = parse_html_entity(&ent_name) { - out.push(ent); - in_ent = true; - continue; - } - } - } - if !in_ent { - out.push(r); - } - } - out -} - -fn write_space(s: &mut String) { - let b = s.as_bytes(); - if b.len() > 0 && b[b.len() - 1] != b' ' { - s.push(' '); - } -} - -fn html2text(html: &str) -> String { - let in_len = html.len(); - let mut tag_start = 0; - let mut in_ent = false; - let mut bad_tag_stack_depth = 0; - let mut should_output = true; - let mut can_print_new_line = false; - let mut out_buf = String::new(); - for (i, r) in html.chars().enumerate() { - if in_len > 0 && i == in_len - 1 { - can_print_new_line = false - } - if r.is_whitespace() { - if should_output && bad_tag_stack_depth == 0 && !in_ent { - write_space(&mut out_buf); - } - continue; - } else if r == ';' && in_ent { - in_ent = false; - continue; - } else if r == '&' && should_output { - let mut ent_name = String::new(); - let mut is_ent = false; - let mut chars = 10; - for er in html[i + 1..].chars() { - if er == ';' { - is_ent = true; - break; - } else { - ent_name.push(er); - } - chars += 1; - if chars == 10 { - break; - } - } - if is_ent { - if let Some(ent) = parse_html_entity(&ent_name) { - out_buf.push(ent); - in_ent = true; - } - } - } else if r == '<' { - // start of tag - tag_start = i + 1; - should_output = false; - continue; - } else if r == '>' { - // end of tag - should_output = true; - let tag = &html[tag_start..i]; - let tag_name_lower = tag.to_lowercase(); - if tag_name_lower == "/ul" { - out_buf.push_str(LBR); - } else if tag_name_lower == "li" || tag_name_lower == "li/" { - out_buf.push_str(LBR); - } else if is_header(&tag_name_lower) { - if can_print_new_line { - out_buf.push_str(LBR); - out_buf.push_str(LBR); - } - can_print_new_line = false; - } else if tag_name_lower == "br" || tag_name_lower == "br/" { - out_buf.push_str(LBR); - } else if tag_name_lower == "p" || tag_name_lower == "/p" { - if can_print_new_line { - out_buf.push_str(LBR); - out_buf.push_str(LBR); - } - can_print_new_line = false; - } else if is_bad_tag(&tag_name_lower) { - bad_tag_stack_depth += 1; - // parse link - if let Some(link) = parse_link(tag) { - if !link.contains("javascript:") { - out_buf.push_str(&html_entitities_to_text(link)); - } - } - } else if tag_name_lower.len() > 0 - && tag_name_lower.starts_with("/") - && is_bad_tag(&tag_name_lower[1..]) - { - bad_tag_stack_depth -= 1; - } - continue; - } - - if should_output && bad_tag_stack_depth == 0 && !in_ent { - can_print_new_line = true; - out_buf.push(r); - } - } - out_buf -} - -#[cfg(test)] -mod tests { - use super::*; - const cases: &[(&str, &str)] = &[ - ("blah", "blah"), - // links - ("
", ""), - ("
simple text
", "simple text"), - ("click here", "click test"), - ("click here", "click test"), - ( - "click here", - "click ents/'x'", - ), - ("click here", "click "), - ( - "click here or here", - "click test", - ), - ( - "click news", - "click http://bit.ly/2n4wXRs", - ), - ("yet, not yet", "/wiki/yet#English, /wiki/not_yet#English"), - - // inlines - ("strong text", "strong text"), - ("some
div
", "some div"), - // lines breaks and spaces - ("should ignore more spaces", "should ignore more spaces"), - ("should \nignore \r\nnew lines", "should ignore new lines"), - ("a\nb\nc", "a b c"), - ("two
line
breaks", "two\r\nline\r\nbreaks"), - ("

two

paragraphs

", "two\r\n\r\nparagraphs"), - // Headers - ("

First

main text", "First\r\n\r\nmain text"), - ( - "First

Second

next section", - "First\r\n\r\nSecond\r\n\r\nnext section", - ), - ("

Second

next section", "Second\r\n\r\nnext section"), - ( - "Second

Third

next section", - "Second\r\n\r\nThird\r\n\r\nnext section", - ), - ("

Third

next section", "Third\r\n\r\nnext section"), - ( - "Third

Fourth

next section", - "Third\r\n\r\nFourth\r\n\r\nnext section", - ), - ("

Fourth

next section", "Fourth\r\n\r\nnext section"), - ( - "Fourth
Fifth
next section", - "Fourth\r\n\r\nFifth\r\n\r\nnext section", - ), - ("
Fifth
next section", "Fifth\r\n\r\nnext section"), - ( - "Fifth
Sixth
next section", - "Fifth\r\n\r\nSixth\r\n\r\nnext section", - ), - ("
Sixth
next section", "Sixth\r\n\r\nnext section"), - ("Not Headernext section", "Not Headernext section"), - // html entitites - ("two  spaces", "two  spaces"), - ("© 2017 K3A", "© 2017 K3A"), - ("<printtag>", ""), - ( - "would you pay in ¢, £, ¥ or €?", - "would you pay in ¢, £, ¥ or €?", - ), - ( - "Tom & Jerry is not an entity", - "Tom & Jerry is not an entity", - ), - ("this &neither; as you see", "this &neither; as you see"), - ( - "list of items
  • One
  • Two
  • Three
", - "list of items\r\nOne\r\nTwo\r\nThree\r\n", - ), - ("fish & chips", "fish & chips"), - ( - ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey", - "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey", - ), - ("Google ®", "Google ®"), - ( - "⁌ decimal and hex entities supported ⁍", - "⁌ decimal and hex entities supported ⁍", - ), - // Large entity - ("&abcdefghij;", "&abcdefghij;"), - // Numeric HTML entities - ( - "'single quotes' and 츝", - "'single quotes' and 츝", - ), - // full thml structure - ("", ""), - ("Goodx", "x"), - ( - "we are not interested in scripts", - "we are not interested in scripts", - ), - // custom html tags - ("hello", "hello"), - ("hello", "hello"), - ("hello", "hello"), - ]; - - #[test] - fn test_all() { - for case in cases { - assert_eq!(&html2text(case.0), case.1); - } - } + let mut buffer = String::new(); + std::io::stdin().read_to_string(&mut buffer).unwrap(); + println!("{}", html2text(&buffer)); }