A ton of changes
Someone emailed me a patch with a lot of improvements here
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -4,4 +4,4 @@ version = 3
|
||||
|
||||
[[package]]
|
||||
name = "nanohtml2text"
|
||||
version = "0.1.3"
|
||||
version = "0.2.0"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "nanohtml2text"
|
||||
version = "0.1.5"
|
||||
version = "0.2.0"
|
||||
edition = "2018"
|
||||
readme = "README.md"
|
||||
license = "MIT"
|
||||
|
||||
@@ -1377,7 +1377,7 @@ pub static ENTITIES: &'static [(&'static str, char)] = &[
|
||||
("natur", '\u{00266E}'),
|
||||
("natural", '\u{00266E}'),
|
||||
("naturals", '\u{002115}'),
|
||||
("nbsp", '\u{0000A0}'),
|
||||
("nbsp", ' '),
|
||||
("ncap", '\u{002A43}'),
|
||||
("ncaron", '\u{000148}'),
|
||||
("ncedil", '\u{000146}'),
|
||||
|
||||
489
src/lib.rs
489
src/lib.rs
@@ -2,64 +2,60 @@ mod entity;
|
||||
|
||||
fn decode_named_entity(entity: &str) -> Option<char> {
|
||||
entity::ENTITIES
|
||||
.binary_search_by_key(&entity, |t| t.0)
|
||||
.map(|idx| entity::ENTITIES[idx].1)
|
||||
.binary_search_by_key(&entity, |&(name, _)| name)
|
||||
.ok()
|
||||
.map(|idx| entity::ENTITIES[idx].1)
|
||||
}
|
||||
|
||||
fn parse_html_entity(ent_name: &str) -> Option<char> {
|
||||
let d = decode_named_entity(ent_name);
|
||||
if d.is_some() {
|
||||
return d;
|
||||
// Parse an HTML entity (named or numeric) and return the corresponding
|
||||
// character.
|
||||
|
||||
fn parse_html_entity(entity: &str) -> Option<char> {
|
||||
if let Some(c) = decode_named_entity(entity) {
|
||||
return Some(c);
|
||||
}
|
||||
|
||||
let num = ent_name.strip_prefix("#")?;
|
||||
if num.chars().next()? == 'x' {
|
||||
u32::from_str_radix(&num[1..].to_lowercase(), 16)
|
||||
let num = entity.strip_prefix('#')?;
|
||||
|
||||
let code_point = if let Some(hex) = num.strip_prefix(|c| c == 'x' || c == 'X') {
|
||||
u32::from_str_radix(hex, 16).ok()?
|
||||
} else {
|
||||
// remaining string may be empty, but that will generate an Err(Empty)
|
||||
u32::from_str_radix(num, 10)
|
||||
u32::from_str_radix(num, 10).ok()?
|
||||
};
|
||||
|
||||
// Exclude control characters and ensure valid Unicode code point
|
||||
if matches!(code_point, 0x09 | 0x0A | 0x0D | 0x20..) {
|
||||
char::from_u32(code_point)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
.ok()
|
||||
.filter(|n| !matches!(n, 9 | 10 | 13 | 32))
|
||||
.and_then(|n| char::from_u32(n))
|
||||
}
|
||||
|
||||
fn html_entitities_to_text(s: &str) -> String {
|
||||
/// Convert HTML entities in a string to their corresponding characters.
|
||||
|
||||
fn html_entities_to_text(s: &str) -> String {
|
||||
let mut out = String::new();
|
||||
|
||||
// except for the first part, every part will have started with an ampersand
|
||||
// thus the start of the remaining parts is an HTML entity
|
||||
let mut parts = s.split('&');
|
||||
/*
|
||||
skip first part. if the string started with an ampersand, the first part
|
||||
will be an empty string
|
||||
|
||||
if the string was empty, the first part will also be an empty string so it's
|
||||
safe to unwrap
|
||||
*/
|
||||
out.push_str(parts.next().unwrap());
|
||||
// Add the first part (before any '&')
|
||||
out.push_str(parts.next().unwrap_or_default());
|
||||
|
||||
for part in parts {
|
||||
let end = part
|
||||
// entity can be terminated by semicolon or whitespace
|
||||
.find(|c: char| c.is_whitespace() || c == ';')
|
||||
// entity can also be terminated by end of string or start of
|
||||
// another entity
|
||||
.unwrap_or_else(|| part.len());
|
||||
|
||||
if let Some(entity) = parse_html_entity(&part[..end]) {
|
||||
out.push(entity);
|
||||
// get byte length of the char we did `find` above
|
||||
let real_end = if let Some(next) = &part[end..].chars().next() {
|
||||
end + next.len_utf8()
|
||||
} else {
|
||||
// invalid html entity that doesn't end with `;`
|
||||
end
|
||||
};
|
||||
|
||||
out.push_str(&part[real_end..]);
|
||||
// Advance past the entity and any following semicolon or whitespace
|
||||
let next_char_len = part[end..].chars().next().map_or(0, |c| c.len_utf8());
|
||||
let remaining = &part[end + next_char_len..];
|
||||
|
||||
out.push_str(remaining);
|
||||
} else {
|
||||
out.push('&');
|
||||
|
||||
out.push_str(part);
|
||||
}
|
||||
}
|
||||
@@ -67,153 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
|
||||
out
|
||||
}
|
||||
|
||||
/// Function to parse and handle the individual tags.
|
||||
/// Assumes that there was a '<' before the given string
|
||||
///
|
||||
/// Returns the generated text and the byte length to skip.
|
||||
// Handle individual HTML tags and convert them to text.
|
||||
// Returns the generated text and the number of bytes to skip.
|
||||
fn handle_tag(s: &str) -> (String, usize) {
|
||||
let (tag, more) = match s.split_once('>') {
|
||||
Some((tag, more)) if !tag.is_empty() => (tag, more),
|
||||
let (tag_content, rest) = match s.split_once('>') {
|
||||
Some((tag, rest)) if !tag.is_empty() => (tag, rest),
|
||||
|
||||
_ => {
|
||||
// was not actually a tag, so reinsert the '<'
|
||||
return (String::from("<"), 0);
|
||||
// Not a valid tag, treat '<' as a regular character
|
||||
return ("<".to_string(), 0);
|
||||
}
|
||||
};
|
||||
|
||||
let (name, attribs) = if let Some((name, attribs)) = tag.split_once(char::is_whitespace) {
|
||||
(name, Some(attribs))
|
||||
} else {
|
||||
(tag, None)
|
||||
};
|
||||
// Split the tag into name and attributes
|
||||
let (tag_name, attribs) = tag_content
|
||||
.split_once(char::is_whitespace)
|
||||
.map_or((tag_content, ""), |(name, attrs)| (name, attrs));
|
||||
|
||||
match name.to_lowercase().as_str() {
|
||||
match tag_name.to_lowercase().as_str() {
|
||||
// Handle anchor tags
|
||||
"a" => {
|
||||
// Extract href attribute
|
||||
let href = attribs
|
||||
.and_then(|attribs| {
|
||||
Some(
|
||||
attribs
|
||||
// check for the href and then discard everything before it
|
||||
.split_once("href")?
|
||||
.1
|
||||
// there might be whitespace between 'href' and '='
|
||||
.trim_start()
|
||||
// check for and then discard the equal sign
|
||||
.strip_prefix('=')?
|
||||
// remove whitespace after the equal sign
|
||||
.trim_start(),
|
||||
)
|
||||
})
|
||||
.and_then(|href_value|
|
||||
// find quoted string
|
||||
match href_value.chars().next()? {
|
||||
start @ '\'' | start @ '"' => {
|
||||
let (end, _) = href_value
|
||||
.char_indices()
|
||||
.skip(1)
|
||||
.find(|(_, c)| *c == start)?;
|
||||
Some(href_value[1..end].to_string())
|
||||
.split_ascii_whitespace()
|
||||
.find_map(|attr| {
|
||||
let mut parts = attr.splitn(2, '=');
|
||||
|
||||
if let (Some(key), Some(value)) = (parts.next(), parts.next()) {
|
||||
if key.eq_ignore_ascii_case("href") {
|
||||
Some(value.trim_matches(['"', '\''].as_ref()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
_ => None,
|
||||
})
|
||||
.filter(|href| !href.starts_with("javascript:"))
|
||||
.map(|href| html_entitities_to_text(&href));
|
||||
// only use to_ascii_lowercase here so the byte offsets don't get
|
||||
// messed up from one uppercase symbol becoming two lowercase
|
||||
// symbols or something like that
|
||||
let more = more.to_ascii_lowercase();
|
||||
|
||||
let end_without_closing = more.find("</a");
|
||||
let content = end_without_closing.map(|i| more[0..i].trim());
|
||||
|
||||
let end = end_without_closing
|
||||
.map(|i| i + 3)
|
||||
.and_then(|end_tag| more[end_tag..].find('>').map(|i| end_tag + i + 1))
|
||||
.unwrap_or_else(|| more.len());
|
||||
|
||||
let link = match (content, href) {
|
||||
(Some(content_value), Some(href_value)) => {
|
||||
if content_value == href_value {
|
||||
href_value
|
||||
} else {
|
||||
let cleaned_content_value = html2text(content_value);
|
||||
format!("{} ({})", cleaned_content_value, href_value)
|
||||
None
|
||||
}
|
||||
})
|
||||
.filter(|href| !href.starts_with("javascript:"))
|
||||
.map(html_entities_to_text);
|
||||
|
||||
// Search for closing </a> tag
|
||||
let lower_rest = rest.to_ascii_lowercase();
|
||||
let end_tag_start = lower_rest.find("</a>").unwrap_or(lower_rest.len());
|
||||
let content = &rest[..end_tag_start];
|
||||
|
||||
// Calculate the total length to skip
|
||||
let closing_tag_len = if end_tag_start < lower_rest.len() {
|
||||
4
|
||||
} else {
|
||||
0
|
||||
};
|
||||
// Length of "</a>"
|
||||
|
||||
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
|
||||
let content_text = html2text(content.trim());
|
||||
let link = match (href, content_text.is_empty()) {
|
||||
(Some(href_value), false) if content_text != href_value => {
|
||||
format!("{} ({})", content_text, href_value)
|
||||
}
|
||||
(None, Some(href_value)) => href_value,
|
||||
(Some(content_value), None) => content_value.to_string(),
|
||||
(None, None) => "".to_string(),
|
||||
|
||||
(Some(href_value), _) => href_value,
|
||||
|
||||
(_, false) => content_text,
|
||||
|
||||
_ => String::new(),
|
||||
};
|
||||
|
||||
(link, tag.len() + 1 + end)
|
||||
(link, total_skip)
|
||||
}
|
||||
"br" | "br/" | "li" | "/ol" | "/ul" => (String::from("\r\n"), tag.len() + 1),
|
||||
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5"
|
||||
| "/h6" => (String::from("\r\n\r\n"), tag.len() + 1),
|
||||
name @ "head" | name @ "script" | name @ "style" => {
|
||||
// silence tags
|
||||
// Line breaks and list items
|
||||
"br" | "br/" | "li" | "/ol" | "/ul" => ("\r\n".to_string(), tag_content.len() + 1),
|
||||
|
||||
// only use to_ascii_lowercase here so the byte offsets don't get
|
||||
// messed up from one uppercase symbol becoming two lowercase
|
||||
// symbols or something like that
|
||||
let more = more.to_ascii_lowercase();
|
||||
let end = more
|
||||
.find(&format!("</{}", name))
|
||||
.map(|i| i + 2 + name.len())
|
||||
.and_then(|end_tag| more[end_tag..].find('>').map(|i| i + end_tag + 1))
|
||||
.unwrap_or_else(|| more.len());
|
||||
(String::new(), tag.len() + 1 + end)
|
||||
// Paragraphs and headings
|
||||
"p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "/h1" | "/h2" | "/h3" | "/h4" | "/h5" | "/h6" => ("\r\n\r\n".to_string(), tag_content.len() + 1),
|
||||
|
||||
// Tags to ignore along with their content
|
||||
name if ["head", "script", "style"].contains(&name) => {
|
||||
// Search for the closing tag
|
||||
|
||||
let closing_tag = format!("</{}>", name);
|
||||
let lower_rest = rest.to_ascii_lowercase();
|
||||
let end_tag_start = lower_rest.find(&closing_tag).unwrap_or(lower_rest.len());
|
||||
let closing_tag_len = if end_tag_start < lower_rest.len() {
|
||||
closing_tag.len()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let total_skip = tag_content.len() + 1 + end_tag_start + closing_tag_len;
|
||||
|
||||
(String::new(), total_skip)
|
||||
}
|
||||
|
||||
// HTML comments
|
||||
"!--" => {
|
||||
// HTML comment
|
||||
(String::new(), s.find("-->").map_or(s.len(), |n| n + 3))
|
||||
let end = s.find("-->").map_or(s.len(), |n| n + 3);
|
||||
|
||||
(String::new(), end)
|
||||
}
|
||||
// other/unknown tags are just discarded
|
||||
_ => (String::new(), tag.len() + 1),
|
||||
|
||||
// Discard other tags but keep their content
|
||||
_ => (String::new(), tag_content.len() + 1),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert some HTML to plain text. Only some simple HTML tags are handled:
|
||||
/// - `a` tags are transformed to their href attribute value
|
||||
/// - paragraph, linebreak, heading, list, and list item tags insert different
|
||||
/// amounts of line breaks.
|
||||
/// - HTML comments as well as `head`, `script` and `style` are completely
|
||||
/// discarded, including their content
|
||||
/// - unknown tags are skipped, but their content is printed
|
||||
///
|
||||
/// HTML named entities will be replaced with the respective Unicode code point,
|
||||
/// and whitespace will be collapsed as is usual in HTML.
|
||||
///
|
||||
/// The resulting string will have CRLF line endings.
|
||||
/// Convert an HTML string to plain text.
|
||||
/// Handles basic HTML tags and entities, and collapses whitespace.
|
||||
pub fn html2text(html: &str) -> String {
|
||||
// collapse spaces
|
||||
// Collapse multiple whitespace characters into a single space
|
||||
let html = html.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||
|
||||
let mut out = String::new();
|
||||
|
||||
let mut i = 0;
|
||||
while i < html.len() {
|
||||
match html[i..].find('<') {
|
||||
None => {
|
||||
// no more tags in the input, done
|
||||
out += &html_entitities_to_text(&html[i..]);
|
||||
break;
|
||||
let mut index = 0;
|
||||
while index < html.len() {
|
||||
if let Some(pos) = html[index..].find('<') {
|
||||
if pos > 0 {
|
||||
out.push_str(&html_entities_to_text(&html[index..index + pos]));
|
||||
index += pos;
|
||||
}
|
||||
Some(text_segment) => {
|
||||
if text_segment > 0 {
|
||||
out += &html_entitities_to_text(&html[i..i + text_segment]);
|
||||
i += text_segment;
|
||||
index += 1; // Skip the '<'
|
||||
let (parsed_text, advance) = handle_tag(&html[index..]);
|
||||
if !parsed_text.is_empty() {
|
||||
if out.ends_with("\r\n\r\n") || out.is_empty() {
|
||||
out.push_str(&parsed_text.trim_start());
|
||||
} else {
|
||||
out.push_str(&parsed_text);
|
||||
}
|
||||
i += 1; // skip the '<'
|
||||
let (s, advance) = handle_tag(&html[i..]);
|
||||
if !s.is_empty() {
|
||||
if out.ends_with("\r\n\r\n") || out.is_empty() {
|
||||
out += &s.trim_start();
|
||||
} else {
|
||||
out += &s;
|
||||
}
|
||||
}
|
||||
i += advance;
|
||||
}
|
||||
index += advance;
|
||||
} else {
|
||||
// No more tags, process the remaining text
|
||||
out.push_str(&html_entities_to_text(&html[index..]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -223,144 +203,79 @@ pub fn html2text(html: &str) -> String {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
macro_rules! test {
|
||||
($name:ident, $from:literal, $to:literal $(,)?) => {
|
||||
#[test]
|
||||
#[test]
|
||||
fn $name() {
|
||||
assert_eq!(&html2text($from), $to);
|
||||
}
|
||||
};
|
||||
($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
|
||||
$(test!{$name, $from, $to})*
|
||||
assert_eq!(html2text($from), $to);
|
||||
}
|
||||
};
|
||||
($($name:ident: $from:literal to $to:literal,)*) => {
|
||||
$(test!{$name, $from, $to})*
|
||||
};
|
||||
}
|
||||
|
||||
test! {
|
||||
plaintext: "blah" to "blah",
|
||||
tag: "<div></div>" to "",
|
||||
tag_contents: "<div>simple text</div>" to "simple text",
|
||||
// links
|
||||
link:
|
||||
"click <a href=\"test\">here</a>"
|
||||
to "click here (test)",
|
||||
link_href_equal_to_content:
|
||||
"click <a href=\"test\">test</a>"
|
||||
to "click test",
|
||||
links_ignore_attributes:
|
||||
"click <a class=\"x\" href=\"test\">here</a>"
|
||||
to "click here (test)",
|
||||
link_entities_in_url:
|
||||
"click <a href=\"ents/'x'\">here</a>"
|
||||
to "click here (ents/'x')",
|
||||
link_javascript:
|
||||
"click <a href=\"javascript:void(0)\">here</a>"
|
||||
to "click here",
|
||||
link_ignore_content_tags:
|
||||
"click <a href=\"test\"><span>here</span> or here</a>"
|
||||
to "click here or here (test)",
|
||||
link_absolute_url:
|
||||
"click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
|
||||
to "click news (http://bit.ly/2n4wXRs)",
|
||||
link_ignore_attributes_2:
|
||||
"<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
|
||||
to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
|
||||
// inlines
|
||||
ignore_inline:
|
||||
"strong <strong>text</strong>"
|
||||
to "strong text",
|
||||
ignore_inline_attributes:
|
||||
"some <div id=\"a\" class=\"b\">div</div>"
|
||||
to "some div",
|
||||
// line breaks and spaces
|
||||
collapse_spaces:
|
||||
"should ignore more spaces" to "should ignore more spaces",
|
||||
collapse_linebreaks:
|
||||
"a\nb\nc" to "a b c",
|
||||
collapse_mixed:
|
||||
"should \nignore \r\nnew lines" to "should ignore new lines",
|
||||
br_tag:
|
||||
"two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
|
||||
paragraph:
|
||||
"<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
|
||||
// Headers
|
||||
h1:
|
||||
"<h1>First</h1>main text" to "First\r\n\r\nmain text",
|
||||
h2_inline:
|
||||
"First<h2>Second</h2>next section"
|
||||
to "First\r\n\r\nSecond\r\n\r\nnext section",
|
||||
h2:
|
||||
"<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
|
||||
h3_inline:
|
||||
"Second<h3>Third</h3>next section"
|
||||
to "Second\r\n\r\nThird\r\n\r\nnext section",
|
||||
h3:
|
||||
"<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
|
||||
h4_inline:
|
||||
"Third<h4>Fourth</h4>next section"
|
||||
to "Third\r\n\r\nFourth\r\n\r\nnext section",
|
||||
h4:
|
||||
"<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
|
||||
h5_inline:
|
||||
"Fourth<h5>Fifth</h5>next section"
|
||||
to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
|
||||
h5:
|
||||
"<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
|
||||
h6_inline:
|
||||
"Fifth<h6>Sixth</h6>next section"
|
||||
to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
|
||||
h6:
|
||||
"<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
|
||||
no_h7:
|
||||
"<h7>Not Header</h7>next section" to "Not Headernext section",
|
||||
// html entities
|
||||
entity_nbsp:
|
||||
"two spaces" to "two spaces",
|
||||
entity_copy:
|
||||
"© 2017 K3A" to "© 2017 K3A",
|
||||
entity_tag:
|
||||
"<printtag>" to "<printtag>",
|
||||
entity_currencies:
|
||||
"would you pay in ¢, £, ¥ or €?"
|
||||
to "would you pay in ¢, £, ¥ or €?",
|
||||
ampersand_not_entity:
|
||||
"Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
|
||||
entity_unknown:
|
||||
"this &neither; as you see" to "this &neither; as you see",
|
||||
entity_amp:
|
||||
"fish & chips" to "fish & chips",
|
||||
unordered_list:
|
||||
"list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
|
||||
to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
|
||||
entity_quot:
|
||||
""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey"
|
||||
to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
|
||||
entity_reg:
|
||||
"Google ®" to "Google ®",
|
||||
// Large entity
|
||||
entity_large_unknown:
|
||||
"&abcdefghij;" to "&abcdefghij;",
|
||||
// Numeric HTML entities
|
||||
entity_numeric:
|
||||
"⁌ decimal and hex entities supported ⁍"
|
||||
to "⁌ decimal and hex entities supported ⁍",
|
||||
entity_numeric_2:
|
||||
"'single quotes' and 츝"
|
||||
to "'single quotes' and 츝",
|
||||
// full html structure
|
||||
empty: "" to "",
|
||||
full_html:
|
||||
"<html><head><title>Good</title></head><body>x</body>" to "x",
|
||||
ignore_script:
|
||||
"we are not <script type=\"javascript\"></script>interested in scripts"
|
||||
to "we are not interested in scripts",
|
||||
// custom html tags
|
||||
ignore_unknown_tag:
|
||||
"<aa>hello</aa>" to "hello",
|
||||
ignore_unknown_tag_whitespace:
|
||||
"<aa >hello</aa>" to "hello",
|
||||
ignore_unknown_tag_attributes:
|
||||
"<aa x=\"1\">hello</aa>" to "hello",
|
||||
invalid_html_entity_without_semicolon: "&hellip" to "…",
|
||||
plaintext: "blah" to "blah",
|
||||
tag: "<div></div>" to "",
|
||||
tag_contents: "<div>simple text</div>" to "simple text",
|
||||
// Links
|
||||
link: "click <a href=\"test\">here</a>" to "click here (test)",
|
||||
link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
|
||||
links_ignore_attributes: "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
|
||||
link_entities_in_url: "click <a href=\"ents/'x'\">here</a>" to "click here (ents/'x')",
|
||||
link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
|
||||
link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here (test)",
|
||||
link_absolute_url: "click <a href=\"http://bit.ly/2n4wXRs\">news</a>" to "click news (http://bit.ly/2n4wXRs)",
|
||||
link_ignore_attributes_2: "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>" to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",
|
||||
// Inline elements
|
||||
ignore_inline: "strong <strong>text</strong>" to "strong text",
|
||||
ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
|
||||
// Line breaks and spaces
|
||||
collapse_spaces: "should ignore more spaces" to "should ignore more spaces",
|
||||
collapse_linebreaks: "a\nb\nc" to "a b c",
|
||||
collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
|
||||
br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
|
||||
paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
|
||||
// Headers
|
||||
h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
|
||||
h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
|
||||
h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
|
||||
h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
|
||||
h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
|
||||
h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
|
||||
h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
|
||||
h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
|
||||
h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
|
||||
h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
|
||||
h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
|
||||
no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
|
||||
// HTML entities
|
||||
entity_nbsp: "two spaces" to "two spaces",
|
||||
entity_copy: "© 2017 K3A" to "© 2017 K3A",
|
||||
entity_tag: "<printtag>" to "<printtag>",
|
||||
entity_currencies: "would you pay in ¢, £, ¥ or €?" to "would you pay in ¢, £, ¥ or €?",
|
||||
ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
|
||||
entity_unknown: "this &neither; as you see" to "this &neither; as you see",
|
||||
entity_amp: "fish & chips" to "fish & chips",
|
||||
// Unordered list
|
||||
unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
|
||||
entity_quot: ""I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
|
||||
entity_reg: "Google ®" to "Google ®",
|
||||
// Large entity
|
||||
entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
|
||||
// Numeric HTML entities
|
||||
entity_numeric: "⁌ decimal and hex entities supported ⁍" to "⁌ decimal and hex entities supported ⁍",
|
||||
entity_numeric_2: "'single quotes' and 츝" to "'single quotes' and 츝",
|
||||
// Full HTML structure
|
||||
empty: "" to "",
|
||||
full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
|
||||
ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
|
||||
// Custom HTML tags
|
||||
ignore_unknown_tag: "<aa>hello</aa>" to "hello",
|
||||
ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
|
||||
ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
|
||||
invalid_html_entity_without_semicolon: "&hellip" to "…",
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user