make several functions more concise

Make more use of try operator and standard library functions.
Do not operate on bytes directly.
This commit is contained in:
Johann150
2022-01-12 22:39:45 +01:00
parent 6e3fd37e73
commit 00a9b04d90

View File

@@ -14,25 +14,29 @@ const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"];
// awkward // awkward
/// Extracts the quoted `href` value from the inside of an anchor tag.
///
/// `l` is the text of the tag, e.g. `a href="https://example.org"`.
/// The returned slice borrows from `l` and excludes the surrounding quotes.
///
/// Returns `None` when:
/// * `l` does not start with `a`,
/// * there is no `href` followed (after optional whitespace) by `=`,
/// * the value is not wrapped in single or double quotes, or
/// * the opening quote is never closed.
fn parse_link(l: &str) -> Option<&str> {
    let href_value = l
        .strip_prefix('a')?
        // look for the attribute and drop everything in front of it
        .split_once("href")?
        .1
        // whitespace is allowed on both sides of the equal sign
        .trim_start()
        .strip_prefix('=')?
        .trim_start();

    // the value has to open with a quote; remember which kind so that the
    // other kind may appear inside the value
    let quote = href_value
        .chars()
        .next()
        .filter(|c| matches!(c, '\'' | '"'))?;

    // both quote characters are one byte long, so skipping one byte is safe
    let (value, _) = href_value[1..].split_once(quote)?;
    Some(value)
}
fn is_bad_tag(t: &str) -> bool { fn is_bad_tag(t: &str) -> bool {
@@ -45,16 +49,18 @@ fn is_bad_tag(t: &str) -> bool {
// replacing regex // replacing regex
/// Reports whether `h` is the name of an HTML heading tag.
///
/// Accepts exactly `h1` through `h6`, optionally preceded by a single `/`
/// (the closing-tag form). Anything else, including the empty string and
/// upper-case names, yields `false`.
fn is_header(h: &str) -> bool {
    // a closing tag differs only by its leading slash
    let mut name = h.strip_prefix('/').unwrap_or(h).chars();
    // exactly two characters: the letter `h` and a level digit from 1 to 6
    matches!(
        (name.next(), name.next(), name.next()),
        (Some('h'), Some('1'..='6'), None)
    )
}
fn parse_html_entity(ent_name: &str) -> Option<char> { fn parse_html_entity(ent_name: &str) -> Option<char> {
@@ -62,61 +68,49 @@ fn parse_html_entity(ent_name: &str) -> Option<char> {
if d.is_some() { if d.is_some() {
return d; return d;
} }
// rewriting without regex
let lower = ent_name.to_lowercase();
if lower.starts_with("#") && lower.len() > 1 {
let parsed;
if lower.as_bytes()[1] == b'x' && lower.len() > 2 {
parsed = u32::from_str_radix(&lower[2..], 16).ok();
} else {
parsed = u32::from_str_radix(&lower[1..], 10).ok();
}
return parsed.and_then(|n| {
if n == 9 || n == 10 || n == 13 || n > 32 {
return char::from_u32(n);
}
return None;
});
}
None let num = ent_name.strip_prefix("#")?;
if num.chars().next()? == 'x' {
u32::from_str_radix(&num[1..].to_lowercase(), 16)
} else {
// remaining string may be empty, but that will generate an Err(Empty)
u32::from_str_radix(num, 10)
}
.ok()
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
.and_then(|n| char::from_u32(n))
} }
fn html_entitities_to_text(s: &str) -> String { fn html_entitities_to_text(s: &str) -> String {
let mut out = String::new(); let mut out = String::new();
let mut in_ent = false;
for (i, r) in s.char_indices() { // except for the first part, every part will have started with an ampersand
if r == ';' && in_ent { // thus the start of the remaining parts is a HTML entity
in_ent = false; let mut parts = s.split('&');
continue; /*
} else if r == '&' { skip first part. if the string started with an ampersand, the first part
let mut ent_name = String::new(); will be an empty string
let mut is_ent = false;
let mut chars = 0; if the string was empty, the first part will also be an empty string so its
for er in s[i + 1..].chars() { safe to unwrap
if er == ';' { */
is_ent = true; out.push_str(parts.next().unwrap());
break;
for part in parts {
let end = part
// entity can be terminated by semicolon or whitespace
.find(|c: char| c.is_whitespace() || c == ';')
// entity can also terminated by end of string or start of
// another entity
.unwrap_or_else(|| part.len());
if let Some(entity) = parse_html_entity(&part[..end]) {
out.push(entity);
out.push_str(&part[end..]);
} else { } else {
ent_name.push(er); out.push_str(part)
}
chars += 1;
if chars == 10 {
break;
}
}
if is_ent {
if let Some(ent) = parse_html_entity(&ent_name) {
out.push(ent);
in_ent = true;
continue;
}
}
}
if !in_ent {
out.push(r);
} }
} }
out out
} }