@@ -2,64 +2,60 @@ mod entity;
fn decode_named_entity ( entity : & str ) -> Option < char > {
fn decode_named_entity ( entity : & str ) -> Option < char > {
entity ::ENTITIES
entity ::ENTITIES
. binary_search_by_key ( & entity , | t | t . 0 )
. binary_search_by_key ( & entity , | & ( name , _ ) | name )
. map ( | idx | entity ::ENTITIES [ idx ] . 1 )
. ok ( )
. ok ( )
. map ( | idx | entity ::ENTITIES [ idx ] . 1 )
}
}
fn parse_html_entity ( ent_name : & str ) -> Option < char > {
// Parse an HTML entity (named or numeric) and return the corresponding
let d = decode_named_entity ( ent_name ) ;
// character.
if d . is_some ( ) {
return d ;
fn parse_html_entity ( entity : & str ) -> Option < char > {
if let Some ( c ) = decode_named_entity ( entity ) {
return Some ( c ) ;
}
}
let num = ent_name . strip_prefix ( " # " ) ? ;
let num = entity . strip_prefix ( '#' ) ? ;
if num . chars ( ) . next ( ) ? = = 'x' {
u32 ::from_str_rad ix ( & num [ 1 .. ] . to_lowercase ( ) , 16 )
let code_point = if let Some ( hex ) = num . strip_pref ix ( | c | c = = 'x' | | c = = 'X' ) {
u32 ::from_str_radix ( hex , 16 ) . ok ( ) ?
} else {
} else {
// remaining string may be empty, but that will generate an Err(Empty)
u32 ::from_str_radix ( num , 10 ) . ok ( ) ?
u32 ::from_str_radix ( num , 10 )
} ;
// Exclude control characters and ensure valid Unicode code point
if matches! ( code_point , 0x09 | 0x0A | 0x0D | 0x20 .. ) {
char ::from_u32 ( code_point )
} else {
None
}
}
. ok ( )
. filter ( | n | ! matches! ( n , 9 | 10 | 13 | 32 ) )
. and_then ( | n | char ::from_u32 ( n ) )
}
}
fn html_entitities_to_text ( s : & str ) -> String {
/// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text ( s : & str ) -> String {
let mut out = String ::new ( ) ;
let mut out = String ::new ( ) ;
// except for the first part, every part will have started with an ampersand
// thus the start of the remaining parts is a HTML entity
let mut parts = s . split ( '&' ) ;
let mut parts = s . split ( '&' ) ;
/*
skip first part. if the string started with an ampersand, the first part
will be an empty string
if the string was empty, the first part will also be an empty string so its
// Add the first part (before any '&')
safe to unwrap
out . push_str ( parts . next ( ) . unwrap_or_default ( ) ) ;
*/
out . push_str ( parts . next ( ) . unwrap ( ) ) ;
for part in parts {
for part in parts {
let end = part
let end = part
// entity can be terminated by semicolon or whitespace
. find ( | c : char | c . is_whitespace ( ) | | c = = ';' )
. find ( | c : char | c . is_whitespace ( ) | | c = = ';' )
// entity can also terminated by end of string or start of
// another entity
. unwrap_or_else ( | | part . len ( ) ) ;
. unwrap_or_else ( | | part . len ( ) ) ;
if let Some ( entity ) = parse_html_entity ( & part [ .. end ] ) {
if let Some ( entity ) = parse_html_entity ( & part [ .. end ] ) {
out . push ( entity ) ;
out . push ( entity ) ;
// get byte length of the char we did `find` above
let real_end = if let Some ( next ) = & part [ end .. ] . chars ( ) . next ( ) {
end + next . len_utf8 ( )
} else {
// invalid html entity that doesn't end with `;`
end
} ;
out . push_str ( & part [ real_end .. ] ) ;
// Advance past the entity and any following semicolon
let skip = if part [ end .. ] . starts_with ( ';' ) { 1 } else { 0 } ;
let remaining = & part [ end + skip .. ] ;
out . push_str ( remaining ) ;
} else {
} else {
out . push ( '&' ) ;
out . push ( '&' ) ;
out . push_str ( part ) ;
out . push_str ( part ) ;
}
}
}
}
@@ -67,149 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
out
out
}
}
/ // Function to parse and handle the individual tags .
// Handle individual HTML tags and convert them to text .
/// Assumes that there was a '<' before the given string
// Returns the generated text and the number of bytes to skip.
///
/// Returns the generated text and the byte length to skip.
fn handle_tag ( s : & str ) -> ( String , usize ) {
fn handle_tag ( s : & str ) -> ( String , usize ) {
let ( tag , mo re) = match s . split_once ( '>' ) {
let ( tag_content , rest ) = match s . split_once ( '>' ) {
Some ( ( tag , mo re) ) if ! tag . is_empty ( ) = > ( tag , mo re) ,
Some ( ( tag , rest ) ) if ! tag . is_empty ( ) = > ( tag , rest ) ,
_ = > {
_ = > {
// was not actually a tag, so reinsert the '<'
// Not a valid tag, treat '<' as a regular character
return ( String ::from ( " < " ) , 0 ) ;
return ( " < ". to_string ( ) , 0 ) ;
}
}
} ;
} ;
let ( name , attribs ) = if let Some ( ( name , attribs ) ) = tag . split_once ( char ::is_whitespace ) {
// Split the tag into name and attributes
( name , Some ( attribs ) )
let ( tag_ name, attribs ) = tag_content
} else {
. split_once ( char ::is_whitespace )
( tag , None )
. map_or ( ( tag_content , " " ) , | ( name , attrs ) | ( name , attrs ) ) ;
} ;
match name . to_lowercase ( ) . as_str ( ) {
match tag_ name. to_lowercase ( ) . as_str ( ) {
// Handle anchor tags
" a " = > {
" a " = > {
// Extract href attribute
let href = attribs
let href = attribs
. and_then ( | attribs | {
. split_ascii_whitespace ( )
Some (
. find_map ( | attr | {
attribs
let mut parts = attr . splitn ( 2 , '=' ) ;
// check for the href and then discard everything before it
. split_once ( " href " ) ?
if let ( Some ( key ) , Some ( value ) ) = ( parts . next ( ) , parts . next ( ) ) {
. 1
if key . eq_ignore_ascii_case ( " href " ) {
// there might be whitespace between 'href' and '='
Some ( value . trim_matches ( [ '"' , '\'' ] . as_ref ( ) ) )
. trim_start ( )
} else {
// check for and then discard the equal sign
None
. strip_prefix ( '=' ) ?
// remove whitespace after the equal sign
. trim_start ( ) ,
)
} )
. and_then ( | href_value |
// find quoted string
match href_value . chars ( ) . next ( ) ? {
start @ '\'' | start @ '"' = > {
let ( end , _ ) = href_value
. char_indices ( )
. skip ( 1 )
. find ( | ( _ , c ) | * c = = start ) ? ;
Some ( href_value [ 1 .. end ] . to_string ( ) )
}
}
_ = > None ,
} else {
} )
None
}
} )
. filter ( | href | ! href . starts_with ( " javascript: " ) )
. filter ( | href | ! href . starts_with ( " javascript: " ) )
. map ( | href | html_entititi es_to_text ( & href ) );
. map ( html_entities_to_text ) ;
// only use to_ascii_lowercase here so the byte offsets dont get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more . to_ascii_lowercase ( ) ;
let end_without_closing = more . find ( " </a " ) ;
// Search for closing </a> tag
let conten t = end_without_closing . map ( | i | more [ 0 .. i ] . trim ( ) ) ;
let lower_res t = rest . to_ascii_lowercase ( ) ;
let end_tag_start = lower_rest . find ( " </a> " ) . unwrap_or ( lower_rest . len ( ) ) ;
let content = & rest [ .. end_tag_start ] ;
let end = end_without_closing
// Calculate the total length to skip
. map ( | i | i + 3 )
let closing_tag_len = if end_tag_start < lower_rest . len ( ) {
. and_then ( | end_tag | more [ end_tag .. ] . find ( '>' ) . map ( | i | end_tag + i + 1 ) )
4
. unwrap_or_else ( | | more . len ( ) ) ;
} else {
0
} ;
// Length of "</a>"
let link = match ( content , href ) {
let total_skip = tag_content . len ( ) + 1 + end_tag_start + closing_tag_len ;
( Some ( content_value ) , Some ( href_value ) ) = > {
let content_text = html2text ( content . trim ( ) ) ;
let cleaned_content_value = html2text ( content_value ) ;
let link = match ( href , content_text . is_empty ( ) ) {
format! ( " {} ( {} ) " , cleaned_content_value , href_value )
( Some ( href_value ) , false ) if content_text ! = href_value = > {
format! ( " {} ( {} ) " , content_text , href_value )
}
}
( None , Some ( href_value ) ) = > href_value ,
( Some ( content _value) , None ) = > content_value . to_string ( ) ,
( Some ( href _value) , _ ) = > href_value ,
( None , None ) = > " " . to_string ( ) ,
( _ , false ) = > content_text ,
_ = > String ::new ( ) ,
} ;
} ;
( link , tag . len ( ) + 1 + end )
( link , total_skip )
}
}
" br " | " br/ " | " li " | " /ol " | " /ul " = > ( String ::from ( " \r \n " ) , tag . len ( ) + 1 ) ,
// Line breaks and list items
" p " | " h1 " | " h2 " | " h3 " | " h4 " | " h5 " | " h6 " | " /h1 " | " /h2 " | " /h3 " | " /h4 " | " /h5 "
" br " | " br/ " | " li " | " /ol " | " /ul " = > ( " \r \n " . to_string ( ) , tag_content . len ( ) + 1 ) ,
| " /h6 " = > ( String ::from ( " \r \n \r \n " ) , tag . len ( ) + 1 ) ,
name @ " head " | name @ " script " | name @ " style " = > {
// silence tags
// only use to_ascii_lowercase here so the byte offsets dont get
// Paragraphs and headings
// messed up from one uppercase symbol becoming two lowercase
" p " | " h1 " | " h2 " | " h3 " | " h4 " | " h5 " | " h6 " | " /h1 " | " /h2 " | " /h3 " | " /h4 " | " /h5 " | " /h6 " = > ( " \r \n \r \n " . to_string ( ) , tag_content . len ( ) + 1 ) ,
// symbols or something like that
let more = more . to_ascii_lowercase ( ) ;
// Tags to ignore along with their content
let end = more
name if [ " head " , " script " , " style " ] . contains ( & name ) = > {
. find ( & format! ( " </ {} " , name ) )
// Search for the closing tag
. map ( | i | i + 2 + name . len ( ) )
. and_then ( | end_tag | more [ end_tag .. ] . find ( '>' ) . map ( | i | i + end_tag + 1 ) )
let closing_tag = format! ( " </ {} > " , name ) ;
. unwrap_or_else ( | | mo re. len ( ) ) ;
let lower_rest = rest . to_ascii_lowercase ( ) ;
( String ::new ( ) , tag . len ( ) + 1 + end )
let end_tag_start = lower_rest . find ( & closing_tag ) . unwrap_or ( lower_rest . l en( ) ) ;
let closing_tag_len = if end_tag_start < lower_rest . len ( ) {
closing_tag . len ( )
} else {
0
} ;
let total_skip = tag_content . len ( ) + 1 + end_tag_start + closing_tag_len ;
( String ::new ( ) , total_skip )
}
}
// HTML comments
" !-- " = > {
" !-- " = > {
// HTML comment
let end = s . find ( " --> " ) . map_or ( s . len ( ) , | n | n + 3 ) ;
( String ::new ( ) , s . find ( " --> " ) . map_or ( s . len ( ) , | n | n + 3 ) )
( String ::new ( ) , end )
}
}
// other/unknown tags are just discarded
_ = > ( String ::new ( ) , tag . len ( ) + 1 ) ,
// Discard other tags but keep their content
_ = > ( String ::new ( ) , tag_content . len ( ) + 1 ) ,
}
}
}
}
/// Convert some HTML to plain text. Only some simple HTML tags are handled:
/// Convert an HTML string to plain text.
/// - `a` tags are transformed to their href attribute value
/// Handles basic HTML tags and entities, and collapses whitespace.
/// - paragraph, linebreak, heading, list, and list item tags insert different
/// amounts of line breaks.
/// - HTML comments as well as `head`, `script` and `style` are completely
/// discarded, including their content
/// - unknown tags are skipped, but their content is printed
///
/// HTML named entities will be replaced with the respecive Unicode code point,
/// and whitespace will be collapsed as is usual in HTML.
///
/// The resulting string will have CRLF line endings.
pub fn html2text ( html : & str ) -> String {
pub fn html2text ( html : & str ) -> String {
// c ollapse spaces
// C ollapse multiple whitespace characters into a single space
let html = html . split_whitespace ( ) . collect ::< Vec < _ > > ( ) . join ( " " ) ;
let html = html . split_whitespace ( ) . collect ::< Vec < _ > > ( ) . join ( " " ) ;
let mut out = String ::new ( ) ;
let mut out = String ::new ( ) ;
let mut index = 0 ;
let mut i = 0 ;
while index < html . len ( ) {
whi le i < html . len ( ) {
if let Some ( pos ) = html [ index .. ] . find ( '<' ) {
match html [ i .. ] . find ( '<' ) {
if pos > 0 {
None = > {
out . push_str ( & html_entities_to_text ( & html [ index .. index + pos ] ) ) ;
// no more tags in the input, done
index + = pos ;
out + = & html_entitities_to_text ( & html [ i .. ] ) ;
break ;
}
}
Some ( text_segment ) = > {
index + = 1 ; // Skip the '<'
if text_segment > 0 {
let ( parsed_text , advance ) = handle_tag ( & html [ index .. ] ) ;
out + = & html_entitities_to_text ( & html [ i .. i + text_segment ] ) ;
if ! parsed_text . is_empty ( ) {
i + = text_segment ;
if out . ends_with ( " \r \n \r \n " ) | | out . is_empty ( ) {
out . push_str ( & parsed_text . trim_start ( ) ) ;
} else {
out . push_str ( & parsed_text ) ;
}
}
i + = 1 ; // skip the '<'
let ( s , advance ) = handle_tag ( & html [ i .. ] ) ;
if ! s . is_empty ( ) {
if out . ends_with ( " \r \n \r \n " ) | | out . is_empty ( ) {
out + = & s . trim_start ( ) ;
} else {
out + = & s ;
}
}
i + = advance ;
}
}
index + = advance ;
} else {
// No more tags, process the remaining text
out . push_str ( & html_entities_to_text ( & html [ index .. ] ) ) ;
break ;
}
}
}
}
@@ -219,141 +203,80 @@ pub fn html2text(html: &str) -> String {
#[cfg(test)]
mod tests {
    use super::*;

    /// Generates one `#[test]` function per `name: input to expected` entry,
    /// each asserting that `html2text(input)` equals `expected`.
    macro_rules! test {
        ($name:ident, $from:literal, $to:literal $(,)?) => {
            #[test]
            fn $name() {
                assert_eq!(html2text($from), $to);
            }
        };
        ($($name:ident: $from:literal to $to:literal,)*) => {
            $(test! { $name, $from, $to })*
        };
    }

    test! {
        plaintext: "blah" to "blah",
        tag: "<div></div>" to "",
        tag_contents: "<div>simple text</div>" to "simple text",

        // Links
        link: "click <a href=\"test\">here</a>" to "click here (test)",
        link_href_equal_to_content: "click <a href=\"test\">test</a>" to "click test",
        links_ignore_attributes:
            "click <a class=\"x\" href=\"test\">here</a>" to "click here (test)",
        link_entities_in_url:
            "click <a href=\"ents/&apos;x&apos;\">here</a>" to "click here (ents/'x')",
        link_javascript: "click <a href=\"javascript:void(0)\">here</a>" to "click here",
        link_ignore_content_tags:
            "click <a href=\"test\"><span>here</span> or here</a>"
            to "click here or here (test)",
        link_absolute_url:
            "click <a href=\"http://bit.ly/2n4wXRs\">news</a>"
            to "click news (http://bit.ly/2n4wXRs)",
        link_ignore_attributes_2:
            "<a rel=\"mw:WikiLink\" href=\"/wiki/yet#English\" title=\"yet\">yet</a>, <a rel=\"mw:WikiLink\" href=\"/wiki/not_yet#English\" title=\"not yet\">not yet</a>"
            to "yet (/wiki/yet#English), not yet (/wiki/not_yet#English)",

        // Inline elements
        ignore_inline: "strong <strong>text</strong>" to "strong text",
        ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",

        // Line breaks and spaces
        collapse_spaces: "should    ignore more spaces" to "should ignore more spaces",
        collapse_linebreaks: "a\nb\nc" to "a b c",
        collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
        br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
        paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",

        // Headers
        h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
        h2_inline:
            "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
        h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
        h3_inline:
            "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
        h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
        h4_inline:
            "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
        h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
        h5_inline:
            "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
        h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
        h6_inline:
            "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
        h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
        no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",

        // HTML entities
        entity_nbsp: "two&nbsp;&nbsp;spaces" to "two\u{a0}\u{a0}spaces",
        entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
        entity_tag: "&lt;printtag&gt;" to "<printtag>",
        entity_currencies:
            "would you pay in &cent;, &pound;, &yen; or &euro;?"
            to "would you pay in ¢, £, ¥ or €?",
        ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
        entity_unknown: "this &neither; as you see" to "this &neither; as you see",
        entity_amp: "fish &amp; chips" to "fish & chips",

        // Unordered list
        unordered_list:
            "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>"
            to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
        entity_quot:
            "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey"
            to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
        entity_reg: "Google &reg;" to "Google ®",

        // Large entity
        entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",

        // Numeric HTML entities
        entity_numeric:
            "&#8268; decimal and hex entities supported &#x204D;"
            to "⁌ decimal and hex entities supported ⁍",
        entity_numeric_2:
            "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",

        // Full HTML structure
        empty: "" to "",
        full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
        ignore_script:
            "we are not <script type=\"javascript\"></script>interested in scripts"
            to "we are not interested in scripts",

        // Custom HTML tags
        ignore_unknown_tag: "<aa>hello</aa>" to "hello",
        ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
        ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",

        // Entities that are not terminated by a semicolon
        invalid_html_entity_without_semicolon: "&hellip" to "…",
        entity_whitespace_preserved: "&amp test" to "& test",
    }
}