Add numeric entity parsing

This commit is contained in:
alex wennerberg
2022-01-04 07:54:58 -08:00
parent c6368a4906
commit 5e2fb2cc82

View File

@@ -16,8 +16,29 @@ fn decode_named_entity(entity: &str) -> Option<char> {
} }
} }
fn parse_html_entity(ent_name: &str) { fn parse_html_entity(ent_name: &str) -> Option<char> {
// entities.binary_search_by(k let d = decode_named_entity(ent_name);
if d.is_some() {
return d;
}
// rewriting without regex
let lower = ent_name.to_lowercase();
if lower.starts_with("#") && lower.len() > 1 {
let parsed;
if lower.as_bytes()[1] == b'x' && lower.len() > 2 {
parsed = lower[2..].parse().ok();
} else {
parsed = lower[1..].parse().ok();
}
return parsed.and_then(|n| {
if n == 9 || n == 10 || n == 13 || n > 32 {
return char::from_u32(n);
}
return None;
});
}
None
} }
fn write_space(s: &mut String) { fn write_space(s: &mut String) {
@@ -64,15 +85,31 @@ fn html2text(html: &str) -> String {
} }
} }
if is_ent { if is_ent {
// parseHTMLentity TODO if let Some(ent) = parse_html_entity(&ent_name) {
out_buf.push(ent);
in_ent = true;
}
} }
} else if r == '<' { } else if r == '<' {
// start of tag // start of tag
tag_start = i + 1; tag_start = i + 1;
should_output = false; should_output = false;
continue; continue;
} else if r == '>' { // end of tag } else if r == '>' {
// TODO // end of tag
should_output = true;
let tag = &html[tag_start..i];
let tag_name_lower = tag.to_lowercase();
// match a few special tags
if tag_name_lower == "/ul" {
out_buf.push('\n');
} else if tag_name_lower == "li" || tag_name_lower == "li/" {
out_buf.push('\n');
}
// else if {
// headers re
// } else if //headers regex
// TODO
} }
if should_output && bad_tag_stack_depth == 0 && !in_ent { if should_output && bad_tag_stack_depth == 0 && !in_ent {