make several functions more concise
Make more use of the try operator (`?`) and standard library functions. Do not operate on bytes directly.
This commit is contained in:
142
src/lib.rs
142
src/lib.rs
@@ -14,26 +14,30 @@ const BAD_TAGS: [&str; 4] = ["head", "script", "style", "a"];
|
|||||||
|
|
||||||
// awkward
|
// awkward
|
||||||
/// Extracts the quoted `href` value from the inside of an anchor tag.
///
/// `l` is the tag text without the angle brackets, for example
/// `a href="https://example.com"`. Returns the text between the quotes, or
/// `None` when `l` does not start with `a`, contains no `href` followed by
/// `=`, or the value is not a terminated single- or double-quoted string.
fn parse_link(l: &str) -> Option<&str> {
    let attrs = l.strip_prefix('a')?;

    // The text "href" can also occur inside another attribute's name or value,
    // so take the first occurrence that is really followed by `=`. Whitespace
    // is allowed on either side of the equal sign.
    let value = attrs
        .match_indices("href")
        .find_map(|(at, name)| attrs[at + name.len()..].trim_start().strip_prefix('='))?
        .trim_start();

    // The value has to open with a quote and close with the same quote character.
    let quote = value.chars().next().filter(|c| matches!(*c, '\'' | '"'))?;
    let quoted = &value[quote.len_utf8()..];
    quoted.find(quote).map(|end| &quoted[..end])
}
|
|
||||||
|
|
||||||
fn is_bad_tag(t: &str) -> bool {
|
fn is_bad_tag(t: &str) -> bool {
|
||||||
let t = t.split_whitespace().next().unwrap();
|
let t = t.split_whitespace().next().unwrap();
|
||||||
@@ -45,16 +49,18 @@ fn is_bad_tag(t: &str) -> bool {
|
|||||||
|
|
||||||
// replacing regex
|
// replacing regex
|
||||||
/// Reports whether `h` is a heading tag name: `h1` through `h6`, optionally
/// preceded by a single `/` (as in a closing tag).
fn is_header(h: &str) -> bool {
    // A closing tag carries one leading slash; drop it if present.
    let name = h.strip_prefix('/').unwrap_or(h);
    let mut level = match name.strip_prefix('h') {
        Some(rest) => rest.chars(),
        None => return false,
    };
    // Exactly one character may follow the `h`, and it must be a digit from 1 to 6.
    matches!((level.next(), level.next()), (Some('1'..='6'), None))
}
|
||||||
|
|
||||||
fn parse_html_entity(ent_name: &str) -> Option<char> {
|
fn parse_html_entity(ent_name: &str) -> Option<char> {
|
||||||
@@ -62,61 +68,49 @@ fn parse_html_entity(ent_name: &str) -> Option<char> {
|
|||||||
if d.is_some() {
|
if d.is_some() {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
// rewriting without regex
|
|
||||||
let lower = ent_name.to_lowercase();
|
|
||||||
if lower.starts_with("#") && lower.len() > 1 {
|
|
||||||
let parsed;
|
|
||||||
if lower.as_bytes()[1] == b'x' && lower.len() > 2 {
|
|
||||||
parsed = u32::from_str_radix(&lower[2..], 16).ok();
|
|
||||||
} else {
|
|
||||||
parsed = u32::from_str_radix(&lower[1..], 10).ok();
|
|
||||||
}
|
|
||||||
return parsed.and_then(|n| {
|
|
||||||
if n == 9 || n == 10 || n == 13 || n > 32 {
|
|
||||||
return char::from_u32(n);
|
|
||||||
}
|
|
||||||
return None;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
let num = ent_name.strip_prefix("#")?;
|
||||||
|
if num.chars().next()? == 'x' {
|
||||||
|
u32::from_str_radix(&num[1..].to_lowercase(), 16)
|
||||||
|
} else {
|
||||||
|
// remaining string may be empty, but that will generate an Err(Empty)
|
||||||
|
u32::from_str_radix(num, 10)
|
||||||
|
}
|
||||||
|
.ok()
|
||||||
|
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
|
||||||
|
.and_then(|n| char::from_u32(n))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn html_entitities_to_text(s: &str) -> String {
|
fn html_entitities_to_text(s: &str) -> String {
|
||||||
let mut out = String::new();
|
let mut out = String::new();
|
||||||
let mut in_ent = false;
|
|
||||||
for (i, r) in s.char_indices() {
|
// except for the first part, every part will have started with an ampersand
|
||||||
if r == ';' && in_ent {
|
// thus the start of the remaining parts is a HTML entity
|
||||||
in_ent = false;
|
let mut parts = s.split('&');
|
||||||
continue;
|
/*
|
||||||
} else if r == '&' {
|
skip first part. if the string started with an ampersand, the first part
|
||||||
let mut ent_name = String::new();
|
will be an empty string
|
||||||
let mut is_ent = false;
|
|
||||||
let mut chars = 0;
|
if the string was empty, the first part will also be an empty string so its
|
||||||
for er in s[i + 1..].chars() {
|
safe to unwrap
|
||||||
if er == ';' {
|
*/
|
||||||
is_ent = true;
|
out.push_str(parts.next().unwrap());
|
||||||
break;
|
|
||||||
|
for part in parts {
|
||||||
|
let end = part
|
||||||
|
// entity can be terminated by semicolon or whitespace
|
||||||
|
.find(|c: char| c.is_whitespace() || c == ';')
|
||||||
|
// entity can also terminated by end of string or start of
|
||||||
|
// another entity
|
||||||
|
.unwrap_or_else(|| part.len());
|
||||||
|
if let Some(entity) = parse_html_entity(&part[..end]) {
|
||||||
|
out.push(entity);
|
||||||
|
out.push_str(&part[end..]);
|
||||||
} else {
|
} else {
|
||||||
ent_name.push(er);
|
out.push_str(part)
|
||||||
}
|
|
||||||
chars += 1;
|
|
||||||
if chars == 10 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if is_ent {
|
|
||||||
if let Some(ent) = parse_html_entity(&ent_name) {
|
|
||||||
out.push(ent);
|
|
||||||
in_ent = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !in_ent {
|
|
||||||
out.push(r);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user