A ton of changes
Someone emailed me a patch with a lot of improvements here
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -4,4 +4,4 @@ version = 3
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nanohtml2text"
|
name = "nanohtml2text"
|
||||||
version = "0.1.3"
|
version = "0.2.0"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "nanohtml2text"
|
name = "nanohtml2text"
|
||||||
version = "0.1.5"
|
version = "0.2.0"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|||||||
@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
|
|||||||
("natur", '\u{00266E}'),
|
("natur", '\u{00266E}'),
|
||||||
("natural", '\u{00266E}'),
|
("natural", '\u{00266E}'),
|
||||||
("naturals", '\u{002115}'),
|
("naturals", '\u{002115}'),
|
||||||
("nbsp", '\u{0000A0}'),
|
("nbsp", ' '),
|
||||||
("ncap", '\u{002A43}'),
|
("ncap", '\u{002A43}'),
|
||||||
("ncaron", '\u{000148}'),
|
("ncaron", '\u{000148}'),
|
||||||
("ncedil", '\u{000146}'),
|
("ncedil", '\u{000146}'),
|
||||||
|
|||||||
489
src/lib.rs
489
src/lib.rs
@@ -2,64 +2,60 @@ mod entity;
|
|||||||
|
|
||||||
fn decode_named_entity(entity: &str) -> Option<char> {
|
fn decode_named_entity(entity: &str) -> Option<char> {
|
||||||
entity::ENTITIES
|
entity::ENTITIES
|
||||||
.binary_search_by_key(&entity, |t| t.0)
|
.binary_search_by_key(&entity, |&(name, _)| name)
|
||||||
.map(|idx| entity::ENTITIES[idx].1)
|
|
||||||
.ok()
|
.ok()
|
||||||
|
.map(|idx| entity::ENTITIES[idx].1)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_html_entity(ent_name: &str) -> Option<char> {
|
// Parse an HTML entity (named or numeric) and return the corresponding
|
||||||
let d = decode_named_entity(ent_name);
|
// character.
|
||||||
if d.is_some() {
|
|
||||||
return d;
|
fn parse_html_entity(entity: &str) -> Option<char> {
|
||||||
|
if let Some(c) = decode_named_entity(entity) {
|
||||||
|
return Some(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
let num = ent_name.strip_prefix("#")?;
|
let num = entity.strip_prefix('#')?;
|
||||||
if num.chars().next()? == 'x' {
|
|
||||||
u32::from_str_radix(&num[1..].to_lowercase(), 16)
|
let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
|
||||||
|
u32::from_str_radix(hex, 16).ok()?
|
||||||
} else {
|
} else {
|
||||||
// remaining string may be empty, but that will generate an Err(Empty)
|
u32::from_str_radix(num, 10).ok()?
|
||||||
u32::from_str_radix(num, 10)
|
};
|
||||||
|
|
||||||
|
// Exclude control characters and ensure valid Unicode code point
|
||||||
|
if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
|
||||||
|
char::from_u32(code_point)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
.ok()
|
|
||||||
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
|
|
||||||
.and_then(|n| char::from_u32(n))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn html_entitities_to_text(s: &str) -> String {
|
/// Convert HTML entities in a string to their corresponding characters.
|
||||||
|
|
||||||
|
fn html_entities_to_text(s: &str) -> String {
|
||||||
let mut out = String::new();
|
let mut out = String::new();
|
||||||
|
|
||||||
// except for the first part, every part will have started with an ampersand
|
|
||||||
// thus the start of the remaining parts is a HTML entity
|
|
||||||
let mut parts = s.split('&');
|
let mut parts = s.split('&');
|
||||||
/*
|
|
||||||
skip first part. if the string started with an ampersand, the first part
|
|
||||||
will be an empty string
|
|
||||||
|
|
||||||
if the string was empty, the first part will also be an empty string so its
|
// Add the first part (before any '&')
|
||||||
safe to unwrap
|
out.push_str(parts.next().unwrap_or_default());
|
||||||
*/
|
|
||||||
out.push_str(parts.next().unwrap());
|
|
||||||
|
|
||||||
for part in parts {
|
for part in parts {
|
||||||
let end = part
|
let end = part
|
||||||
// entity can be terminated by semicolon or whitespace
|
|
||||||
.find(|c: char| c.is_whitespace() || c == ';')
|
.find(|c: char| c.is_whitespace() || c == ';')
|
||||||
// entity can also terminated by end of string or start of
|
|
||||||
// another entity
|
|
||||||
.unwrap_or_else(|| part.len());
|
.unwrap_or_else(|| part.len());
|
||||||
|
|
||||||
if let Some(entity) = parse_html_entity(&part[..end]) {
|
if let Some(entity) = parse_html_entity(&part[..end]) {
|
||||||
out.push(entity);
|
out.push(entity);
|
||||||
// get byte length of the char we did `find` above
|
|
||||||
let real_end = if let Some(next) = &part[end..].chars().next() {
|
|
||||||
end + next.len_utf8()
|
|
||||||
} else {
|
|
||||||
// invalid html entity that doesn't end with `;`
|
|
||||||
end
|
|
||||||
};
|
|
||||||
|
|
||||||
out.push_str(&part[real_end..]);
|
// Advance past the entity and any following semicolon or whitespace
|
||||||
|
let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
|
||||||
|
let remaining = &part[end + next_char_len..];
|
||||||
|
|
||||||
|
out.push_str(remaining);
|
||||||
} else {
|
} else {
|
||||||
out.push('&');
|
out.push('&');
|
||||||
|
|
||||||
out.push_str(part);
|
out.push_str(part);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
|
|||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Function to parse and handle the individual tags.
|
// Handle individual HTML tags and convert them to text.
|
||||||
/// Assumes that there was a '<' before the given string
|
// Returns the generated text and the number of bytes to skip.
|
||||||
///
|
|
||||||
/// Returns the generated text and the byte length to skip.
|
|
||||||
fn handle_tag(s: &str) -> (String, usize) {
|
fn handle_tag(s: &str) -> (String, usize) {
|
||||||
let (tag, more) = match s.split_once('>') {
|
let (tag_content, rest) = match s.split_once('>') {
|
||||||
Some((tag, more)) if !tag.is_empty() => (tag, more),
|
Some((tag, rest)) if !tag.is_empty() => (tag, rest),
|
||||||
|
|
||||||
_ => {
|
_ => {
|
||||||
// was not actually a tag, so reinsert the '<'
|
// Not a valid tag, treat '<' as a regular character
|
||||||
return (String::from("<"), 0);
|
return ("<".to_string(), 0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) {
|
// Split the tag into name and attributes
|
||||||
(name, Some(attribs))
|
let (tag_name, attribs) = tag_content
|
||||||
} else {
|
.split_once(char::is_whitespace)
|
||||||
(tag, None)
|
.map_or((tag_content, ""), |(name, attrs)| (name, attrs));
|
||||||
};
|
|
||||||
|
|
||||||
match name.to_lowercase().as_str() {
|
match tag_name.to_lowercase().as_str() {
|
||||||
|
// Handle anchor tags
|
||||||
"a" => {
|
"a" => {
|
||||||
|
// Extract href attribute
|
||||||
let href = attribs
|
let href = attribs
|
||||||
.and_then(|attribs| {
|
.split_ascii_whitespace()
|
||||||
Some(
|
.find_map(|attr| {
|
||||||
attribs
|
let mut parts = attr.splitn(2, '=');
|
||||||
// check for the href and then discard everything before it
|
|
||||||
.split_once("href")?
|
if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
|
||||||
.1
|
if key.eq_ignore_ascii_case("href") {
|
||||||
// there might be whitespace between 'href' and '='
|
Some(value.trim_matches(['"', '\''].as_ref()))
|
||||||
.trim_start()
|
} else {
|
||||||
// check for and then discard the equal sign
|
None
|
||||||
.strip_prefix('=')?
|
|
||||||
// remove whitespace after the equal sign
|
|
||||||
.trim_start(),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.and_then(|href_value|
|
|
||||||
// find quoted string
|
|
||||||
match href_value.chars().next()? {
|
|
||||||
start @ '\'' | start @ '"' => {
|
|
||||||
let (end, _) = href_value
|
|
||||||
.char_indices()
|
|
||||||
.skip(1)
|
|
||||||
.find(|(_, c)| *c == start)?;
|
|
||||||
Some(href_value[1..end].to_string())
|
|
||||||
}
|
}
|
||||||
_ => None,
|
|
||||||
})
|
|
||||||
.filter(|href| !href.starts_with("javascript:"))
|
|
||||||
.map(|href| html_entitities_to_text(&href));
|
|
||||||
// only use to_ascii_lowercase here so the byte offsets dont get
|
|
||||||
// messed up from one uppercase symbol becoming two lowercase
|
|
||||||
// symbols or something like that
|
|
||||||
let more = more.to_ascii_lowercase();
|
|
||||||
|
|
||||||
let end_without_closing = more.find("</a");
|
|
||||||
let content = end_without_closing.map(|i| more[0..i].trim());
|
|
||||||
|
|
||||||
let end = end_without_closing
|
|
||||||
.map(|i| i + 3)
|
|
||||||
.and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
|
|
||||||
.unwrap_or_else(|| more.len());
|
|
||||||
|
|
||||||
let link = match (content, href) {
|
|
||||||
(Some(content_value), Some(href_value)) => {
|
|
||||||
if content_value == href_value {
|
|
||||||
href_value
|
|
||||||
} else {
|
} else {
|
||||||
let cleaned_content_value = html2text(content_value);
|
None
|
||||||
format!("{} ({})", cleaned_content_value, href_value)
|
|
||||||
}
|
}
|
||||||
|
})
|
||||||
|
.filter(|href| !href.starts_with("javascript:"))
|
||||||
|
.map(html_entities_to_text);
|
||||||
|
|
||||||
|
// Search for closing </a> tag
|
||||||
|
let lower_rest = rest.to_ascii_lowercase();
|
||||||
|
let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
|
||||||
|
let content = &rest[..end_tag_start];
|
||||||
|
|
||||||
|
// Calculate the total length to skip
|
||||||
|
let closing_tag_len = if end_tag_start < lower_rest.len() {
|
||||||
|
4
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
// Length of "</a>"
|
||||||
|
|
||||||
|
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
|
||||||
|
let content_text = html2text(content.trim());
|
||||||
|
let link = match (href, content_text.is_empty()) {
|
||||||
|
(Some(href_value), false) if content_text != href_value => {
|
||||||
|
format!("{} ({})", content_text, href_value)
|
||||||
}
|
}
|
||||||
(None, Some(href_value)) => href_value,
|
|
||||||
(Some(content_value), None) => content_value.to_string(),
|
(Some(href_value), _) => href_value,
|
||||||
(None, None) => "".to_string(),
|
|
||||||
|
(_, false) => content_text,
|
||||||
|
|
||||||
|
_ => String::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
(link, tag.len() + 1 + end)
|
(link, total_skip)
|
||||||
}
|
}
|
||||||
"br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
|
// Line breaks and list items
|
||||||
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
|
"br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
|
||||||
| "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
|
|
||||||
name @ "head" | name @ "script" | name @ "style" => {
|
|
||||||
// silence tags
|
|
||||||
|
|
||||||
// only use to_ascii_lowercase here so the byte offsets dont get
|
// Paragraphs and headings
|
||||||
// messed up from one uppercase symbol becoming two lowercase
|
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
|
||||||
// symbols or something like that
|
|
||||||
let more = more.to_ascii_lowercase();
|
// Tags to ignore along with their content
|
||||||
let end = more
|
name if ["head", "script", "style"].contains(&name) => {
|
||||||
.find(&format!("</{}", name))
|
// Search for the closing tag
|
||||||
.map(|i| i + 2 + name.len())
|
|
||||||
.and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1))
|
let closing_tag = format!("</{}>", name);
|
||||||
.unwrap_or_else(|| more.len());
|
let lower_rest = rest.to_ascii_lowercase();
|
||||||
(String::new(), tag.len() + 1 + end)
|
let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
|
||||||
|
let closing_tag_len = if end_tag_start < lower_rest.len() {
|
||||||
|
closing_tag.len()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
|
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
|
||||||
|
|
||||||
|
(String::new(), total_skip)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HTML comments
|
||||||
"!--" => {
|
"!--" => {
|
||||||
// HTML comment
|
let end = s.find("-->").map_or(s.len(), |n| n + 3);
|
||||||
(String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
|
|
||||||
|
(String::new(), end)
|
||||||
}
|
}
|
||||||
// other/unknown tags are just discarded
|
|
||||||
_ => (String::new(), tag.len() + 1),
|
// Discard other tags but keep their content
|
||||||
|
_ => (String::new(), tag_content.len() + 1),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert some HTML to plain text. Only some simple HTML tags are handled:
|
/// Convert an HTML string to plain text.
|
||||||
/// - `a` tags are transformed to their href attribute value
|
/// Handles basic HTML tags and entities, and collapses whitespace.
|
||||||
/// - paragraph, linebreak, heading, list, and list item tags insert different
|
|
||||||
/// amounts of line breaks.
|
|
||||||
/// - HTML comments as well as `head`, `script` and `style` are completely
|
|
||||||
/// discarded, including their content
|
|
||||||
/// - unknown tags are skipped, but their content is printed
|
|
||||||
///
|
|
||||||
/// HTML named entities will be replaced with the respecive Unicode code point,
|
|
||||||
/// and whitespace will be collapsed as is usual in HTML.
|
|
||||||
///
|
|
||||||
/// The resulting string will have CRLF line endings.
|
|
||||||
pub fn html2text(html: &str) -> String {
|
pub fn html2text(html: &str) -> String {
|
||||||
// collapse spaces
|
// Collapse multiple whitespace characters into a single space
|
||||||
let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
|
let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||||
|
|
||||||
let mut out = String::new();
|
let mut out = String::new();
|
||||||
|
let mut index = 0;
|
||||||
let mut i = 0;
|
while index < html.len() {
|
||||||
while i < html.len() {
|
if let Some(pos) = html[index..].find('<') {
|
||||||
match html[i..].find('<') {
|
if pos > 0 {
|
||||||
None => {
|
out.push_str(&html_entities_to_text(&html[index..index + pos]));
|
||||||
// no more tags in the input, done
|
index += pos;
|
||||||
out += &html_entitities_to_text(&html[i..]);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
Some(text_segment) => {
|
index += 1; // Skip the '<'
|
||||||
if text_segment > 0 {
|
let (parsed_text, advance) = handle_tag(&html[index..]);
|
||||||
out += &html_entitities_to_text(&html[i..i + text_segment]);
|
if !parsed_text.is_empty() {
|
||||||
i += text_segment;
|
if out.ends_with("\r\n\r\n") || out.is_empty() {
|
||||||
|
out.push_str(&parsed_text.trim_start());
|
||||||
|
} else {
|
||||||
|
out.push_str(&parsed_text);
|
||||||
}
|
}
|
||||||
i += 1; // skip the '<'
|
|
||||||
let (s, advance) = handle_tag(&html[i..]);
|
|
||||||
if !s.is_empty() {
|
|
||||||
if out.ends_with("\r\n\r\n") || out.is_empty() {
|
|
||||||
out += &s.trim_start();
|
|
||||||
} else {
|
|
||||||
out += &s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i += advance;
|
|
||||||
}
|
}
|
||||||
|
index += advance;
|
||||||
|
} else {
|
||||||
|
// No more tags, process the remaining text
|
||||||
|
out.push_str(&html_entities_to_text(&html[index..]));
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
macro_rules! test {
|
macro_rules! test {
|
||||||
($name:ident, $from:literal, $to:literal $(,)?) => {
|
($name:ident, $from:literal, $to:literal $(,)?) => {
|
||||||
#[test]
|
#[test]
|
||||||
fn $name() {
|
fn $name() {
|
||||||
assert_eq!(&html2text($from), $to);
|
assert_eq!(html2text($from), $to);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
|
($($name:ident: $from:literal to $to:literal,)*) => {
|
||||||
$(test!{$name, $from, $to})*
|
$(test!{$name, $from, $to})*
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
test! {
|
test! {
|
||||||
plaintext: "blah" to "blah",
|
plaintext: "blah" to "blah",
|
||||||
tag: "<div></div>" to "",
|
tag: "<div></div>" to "",
|
||||||
tag_contents: "<div>simple text</div>" to "simple text",
|
tag_contents: "<div>simple text</div>" to "simple text",
|
||||||
// links
|
// Links
|
||||||
link:
|
link: "click <a href=\"test\">here</a>" to "click here (test)",
|
||||||
"click <a href=\"test\">here</a>"
|
link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
|
||||||
to "click here (test)",
|
links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
|
||||||
link_href_equal_to_content:
|
link_entities_in_url: "click <a href=\"ents/'x'\">here</a>" to "click here (ents/'x')",
|
||||||
"click <a href=\"test\">test</a>"
|
link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
|
||||||
to "click test",
|
link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
|
||||||
links_ignore_attributes:
|
link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
|
||||||
"click <a class=\"x\" href=\"test\">here</a>"
|
link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
|
||||||
to "click here (test)",
|
// Inline elements
|
||||||
link_entities_in_url:
|
ignore_inline: "strong <strong>text</strong>" to "strong text",
|
||||||
"click <a href=\"ents/'x'\">here</a>"
|
ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
|
||||||
to "click here (ents/'x')",
|
// Line breaks and spaces
|
||||||
link_javascript:
|
collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
|
||||||
"click <a href=\"javascript:void(0)\">here</a>"
|
collapse_linebreaks: "a\nb\nc" to "a b c",
|
||||||
to "click here",
|
collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
|
||||||
link_ignore_content_tags:
|
br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
|
||||||
"click <a href=\"test\"><span>here</span> or here</a>"
|
paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
|
||||||
to "click here or here (test)",
|
// Headers
|
||||||
link_absolute_url:
|
h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
|
||||||
"click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
|
h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
|
||||||
to "click news (http://bit.ly/2n4wXRs)",
|
h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
|
||||||
link_ignore_attributes_2:
|
h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
|
||||||
"<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
|
h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
|
||||||
to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
|
h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
|
||||||
// inlines
|
h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
|
||||||
ignore_inline:
|
h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
|
||||||
"strong <strong>text</strong>"
|
h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
|
||||||
to "strong text",
|
h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
|
||||||
ignore_inline_attributes:
|
h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
|
||||||
"some <div id=\"a\" class=\"b\">div</div>"
|
no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
|
||||||
to "some div",
|
// HTML entities
|
||||||
// lines breaks and spaces
|
entity_nbsp: "two spaces" to "two spaces",
|
||||||
collapse_spaces:
|
entity_copy: "© 2017 K3A" to "© 2017 K3A",
|
||||||
"should ignore more spaces" to "should ignore more spaces",
|
entity_tag: "<printtag>" to "<printtag>",
|
||||||
collapse_linebreaks:
|
entity_currencies: "would you pay in ¢, £, ¥ or €?" to "would you pay in ¢, £, ¥ or €?",
|
||||||
"a\nb\nc" to "a b c",
|
ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
|
||||||
collapse_mixed:
|
entity_unknown: "this &neither; as you see" to "this &neither; as you see",
|
||||||
"should \nignore \r\nnew lines" to "should ignore new lines",
|
entity_amp: "fish & chips" to "fish & chips",
|
||||||
br_tag:
|
// Unordered list
|
||||||
"two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
|
unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
|
||||||
paragraph:
|
entity_quot: ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
|
||||||
"<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
|
entity_reg: "Google ®" to "Google ®",
|
||||||
// Headers
|
// Large entity
|
||||||
h1:
|
entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
|
||||||
"<h1>First</h1>main text" to "First\r\n\r\nmain text",
|
// Numeric HTML entities
|
||||||
h2_inline:
|
entity_numeric: "⁌ decimal and hex entities supported ⁍" to "⁌ decimal and hex entities supported ⁍",
|
||||||
"First<h2>Second</h2>next section"
|
entity_numeric_2: "'single quotes' and 츝" to "'single quotes' and 츝",
|
||||||
to "First\r\n\r\nSecond\r\n\r\nnext section",
|
// Full HTML structure
|
||||||
h2:
|
empty: "" to "",
|
||||||
"<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
|
full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
|
||||||
h3_inline:
|
ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
|
||||||
"Second<h3>Third</h3>next section"
|
// Custom HTML tags
|
||||||
to "Second\r\n\r\nThird\r\n\r\nnext section",
|
ignore_unknown_tag: "<aa>hello</aa>" to "hello",
|
||||||
h3:
|
ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
|
||||||
"<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
|
ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
|
||||||
h4_inline:
|
invalid_html_entity_without_semicolon: "&hellip" to "…",
|
||||||
"Third<h4>Fourth</h4>next section"
|
|
||||||
to "Third\r\n\r\nFourth\r\n\r\nnext section",
|
|
||||||
h4:
|
|
||||||
"<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
|
|
||||||
h5_inline:
|
|
||||||
"Fourth<h5>Fifth</h5>next section"
|
|
||||||
to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
|
|
||||||
h5:
|
|
||||||
"<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
|
|
||||||
h6_inline:
|
|
||||||
"Fifth<h6>Sixth</h6>next section"
|
|
||||||
to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
|
|
||||||
h6:
|
|
||||||
"<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
|
|
||||||
no_h7:
|
|
||||||
"<h7>Not Header</h7>next section" to "Not Headernext section",
|
|
||||||
// html entitites
|
|
||||||
entity_nbsp:
|
|
||||||
"two spaces" to "two spaces",
|
|
||||||
entity_copy:
|
|
||||||
"© 2017 K3A" to "© 2017 K3A",
|
|
||||||
entity_tag:
|
|
||||||
"<printtag>" to "<printtag>",
|
|
||||||
entity_currencies:
|
|
||||||
"would you pay in ¢, £, ¥ or €?"
|
|
||||||
to "would you pay in ¢, £, ¥ or €?",
|
|
||||||
ampersand_not_entity:
|
|
||||||
"Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
|
|
||||||
entity_unknown:
|
|
||||||
"this &neither; as you see" to "this &neither; as you see",
|
|
||||||
entity_amp:
|
|
||||||
"fish & chips" to "fish & chips",
|
|
||||||
unordered_list:
|
|
||||||
"list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
|
|
||||||
to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
|
|
||||||
entity_quot:
|
|
||||||
""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey"
|
|
||||||
to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
|
|
||||||
entity_reg:
|
|
||||||
"Google ®" to "Google ®",
|
|
||||||
// Large entity
|
|
||||||
entity_large_unknown:
|
|
||||||
"&abcdefghij;" to "&abcdefghij;",
|
|
||||||
// Numeric HTML entities
|
|
||||||
entity_numeric:
|
|
||||||
"⁌ decimal and hex entities supported ⁍"
|
|
||||||
to "⁌ decimal and hex entities supported ⁍",
|
|
||||||
entity_numeric_2:
|
|
||||||
"'single quotes' and 츝"
|
|
||||||
to "'single quotes' and 츝",
|
|
||||||
// full thml structure
|
|
||||||
empty: "" to "",
|
|
||||||
full_html:
|
|
||||||
"<html><head><title>Good</title></head><body>x</body>" to "x",
|
|
||||||
ignore_script:
|
|
||||||
"we are not <script type=\"javascript\"></script>interested in scripts"
|
|
||||||
to "we are not interested in scripts",
|
|
||||||
// custom html tags
|
|
||||||
ignore_unknown_tag:
|
|
||||||
"<aa>hello</aa>" to "hello",
|
|
||||||
ignore_unknown_tag_whitespace:
|
|
||||||
"<aa >hello</aa>" to "hello",
|
|
||||||
ignore_unknown_tag_attributes:
|
|
||||||
"<aa x=\"1\">hello</aa>" to "hello",
|
|
||||||
invalid_html_entity_without_semicolon: "&hellip" to "…",
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user