@@ -2,64 +2,60 @@ mod entity;
fn decode_named_entity ( entity : & str ) -> Option < char > {
fn decode_named_entity ( entity : & str ) -> Option < char > {
entity ::ENTITIES
entity ::ENTITIES
. binary_search_by_key ( & entity , | t | t . 0 )
. binary_search_by_key ( & entity , | & ( name , _ ) | name )
. ok ( )
. map ( | idx | entity ::ENTITIES [ idx ] . 1 )
. map ( | idx | entity ::ENTITIES [ idx ] . 1 )
. ok ( )
}
}
fn parse_html_entity ( ent_name : & str ) -> Option < char > {
// Parse an HTML entity (named or numeric) and return the corresponding
let d = decode_named_entity ( ent_name ) ;
// character.
if d . is_some ( ) {
return d ;
fn parse_html_entity ( entity : & str ) -> Option < char > {
if let Some ( c ) = decode_named_entity ( entity ) {
return Some ( c ) ;
}
}
let num = ent_name . strip_prefix ( " # " ) ? ;
let num = entity . strip_prefix ( '#' ) ? ;
if num . chars ( ) . next ( ) ? = = 'x' {
u32 ::from_str_rad ix ( & num [ 1 .. ] . to_lowercase ( ) , 16 )
let code_point = if let Some ( hex ) = num . strip_pref ix ( | c | c = = 'x' | | c = = 'X' ) {
u32 ::from_str_radix ( hex , 16 ) . ok ( ) ?
} else {
} else {
// remaining string may be empty, but that will generate an Err(Empty)
u32 ::from_str_radix ( num , 10 ) . ok ( ) ?
u32 ::from_str_radix ( num , 10 )
} ;
// Exclude control characters and ensure valid Unicode code point
if matches! ( code_point , 0x09 | 0x0A | 0x0D | 0x20 .. ) {
char ::from_u32 ( code_point )
} else {
None
}
}
. ok ( )
. filter ( | n | ! matches! ( n , 9 | 10 | 13 | 32 ) )
. and_then ( | n | char ::from_u32 ( n ) )
}
}
fn html_entitities_to_text ( s : & str ) -> String {
/// Convert HTML entities in a string to their corresponding characters.
fn html_entities_to_text ( s : & str ) -> String {
let mut out = String ::new ( ) ;
let mut out = String ::new ( ) ;
// except for the first part, every part will have started with an ampersand
// thus the start of the remaining parts is a HTML entity
let mut parts = s . split ( '&' ) ;
let mut parts = s . split ( '&' ) ;
/*
skip first part. if the string started with an ampersand, the first part
will be an empty string
if the string was empty, the first part will also be an empty string so its
// Add the first part (before any '&')
safe to unwrap
out . push_str ( parts . next ( ) . unwrap_or_default ( ) ) ;
*/
out . push_str ( parts . next ( ) . unwrap ( ) ) ;
for part in parts {
for part in parts {
let end = part
let end = part
// entity can be terminated by semicolon or whitespace
. find ( | c : char | c . is_whitespace ( ) | | c = = ';' )
. find ( | c : char | c . is_whitespace ( ) | | c = = ';' )
// entity can also terminated by end of string or start of
// another entity
. unwrap_or_else ( | | part . len ( ) ) ;
. unwrap_or_else ( | | part . len ( ) ) ;
if let Some ( entity ) = parse_html_entity ( & part [ .. end ] ) {
if let Some ( entity ) = parse_html_entity ( & part [ .. end ] ) {
out . push ( entity ) ;
out . push ( entity ) ;
// get byte length of the char we did `find` above
let real_end = if let Some ( next ) = & part [ end .. ] . chars ( ) . next ( ) {
end + next . len_utf8 ( )
} else {
// invalid html entity that doesn't end with `;`
end
} ;
out . push_str ( & part [ real_end .. ] ) ;
// Advance past the entity and any following semicolon
let skip = if part [ end .. ] . starts_with ( ';' ) { 1 } else { 0 } ;
let remaining = & part [ end + skip .. ] ;
out . push_str ( remaining ) ;
} else {
} else {
out . push ( '&' ) ;
out . push ( '&' ) ;
out . push_str ( part ) ;
out . push_str ( part ) ;
}
}
}
}
@@ -67,149 +63,137 @@ fn html_entitities_to_text(s: &str) -> String {
out
out
}
}
/ // Function to parse and handle the individual tags .
// Handle individual HTML tags and convert them to text .
/// Assumes that there was a '<' before the given string
// Returns the generated text and the number of bytes to skip.
///
/// Returns the generated text and the byte length to skip.
fn handle_tag ( s : & str ) -> ( String , usize ) {
fn handle_tag ( s : & str ) -> ( String , usize ) {
let ( tag , mo re) = match s . split_once ( '>' ) {
let ( tag_content , rest ) = match s . split_once ( '>' ) {
Some ( ( tag , mo re) ) if ! tag . is_empty ( ) = > ( tag , mo re) ,
Some ( ( tag , rest ) ) if ! tag . is_empty ( ) = > ( tag , rest ) ,
_ = > {
_ = > {
// was not actually a tag, so reinsert the '<'
// Not a valid tag, treat '<' as a regular character
return ( String ::from ( " < " ) , 0 ) ;
return ( " < ". to_string ( ) , 0 ) ;
}
}
} ;
} ;
let ( name , attribs ) = if let Some ( ( name , attribs ) ) = tag . split_once ( char ::is_whitespace ) {
// Split the tag into name and attributes
( name , Some ( attribs ) )
let ( tag_ name, attribs ) = tag_content
} else {
. split_once ( char ::is_whitespace )
( tag , None )
. map_or ( ( tag_content , " " ) , | ( name , attrs ) | ( name , attrs ) ) ;
} ;
match name . to_lowercase ( ) . as_str ( ) {
match tag_ name. to_lowercase ( ) . as_str ( ) {
// Handle anchor tags
" a " = > {
" a " = > {
// Extract href attribute
let href = attribs
let href = attribs
. and_then ( | attribs | {
. split_ascii_whitespace ( )
Some (
. find_map ( | attr | {
attribs
let mut parts = attr . splitn ( 2 , '=' ) ;
// check for the href and then discard everything before it
. split_once ( " href " ) ?
if let ( Some ( key ) , Some ( value ) ) = ( parts . next ( ) , parts . next ( ) ) {
. 1
if key . eq_ignore_ascii_case ( " href " ) {
// there might be whitespace between 'href' and '='
Some ( value . trim_matches ( [ '"' , '\'' ] . as_ref ( ) ) )
. trim_start ( )
} else {
// check for and then discard the equal sign
None
. strip_prefix ( '=' ) ?
}
// remove whitespace after the equal sign
} else {
. trim_start ( ) ,
None
)
} )
. and_then ( | href_value |
// find quoted string
match href_value . chars ( ) . next ( ) ? {
start @ '\'' | start @ '"' = > {
let ( end , _ ) = href_value
. char_indices ( )
. skip ( 1 )
. find ( | ( _ , c ) | * c = = start ) ? ;
Some ( href_value [ 1 .. end ] . to_string ( ) )
}
}
_ = > None ,
} )
} )
. filter ( | href | ! href . starts_with ( " javascript: " ) )
. filter ( | href | ! href . starts_with ( " javascript: " ) )
. map ( | href | html_entititi es_to_text ( & href ) );
. map ( html_entities_to_text ) ;
// only use to_ascii_lowercase here so the byte offsets dont get
// messed up from one uppercase symbol becoming two lowercase
// symbols or something like that
let more = more . to_ascii_lowercase ( ) ;
let end_without_closing = more . find ( " </a " ) ;
// Search for closing </a> tag
let conten t = end_without_closing . map ( | i | more [ 0 .. i ] . trim ( ) ) ;
let lower_res t = rest . to_ascii_lowercase ( ) ;
let end_tag_start = lower_rest . find ( " </a> " ) . unwrap_or ( lower_rest . len ( ) ) ;
let content = & rest [ .. end_tag_start ] ;
let end = end_without_closing
// Calculate the total length to skip
. map ( | i | i + 3 )
let closing_tag_len = if end_tag_start < lower_rest . len ( ) {
. and_then ( | end_tag | more [ end_tag .. ] . find ( '>' ) . map ( | i | end_tag + i + 1 ) )
4
. unwrap_or_else ( | | more . len ( ) ) ;
} else {
0
} ;
// Length of "</a>"
let link = match ( content , href ) {
let total_skip = tag_content . len ( ) + 1 + end_tag_start + closing_tag_len ;
( Some ( content_value ) , Some ( href_value ) ) = > {
let content_text = html2text ( content . trim ( ) ) ;
let cleaned_content_value = html2text ( content_value ) ;
let link = match ( href , content_text . is_empty ( ) ) {
format! ( " {} ( {} ) " , cleaned_content_value , href_value )
( Some ( href_value ) , false ) if content_text ! = href_value = > {
format! ( " {} ( {} ) " , content_text , href_value )
}
}
( None , Some ( href_value ) ) = > href_value ,
( Some ( content _value) , None ) = > content_value . to_string ( ) ,
( Some ( href _value) , _ ) = > href_value ,
( None , None ) = > " " . to_string ( ) ,
( _ , false ) = > content_text ,
_ = > String ::new ( ) ,
} ;
} ;
( link , tag . len ( ) + 1 + end )
( link , total_skip )
}
}
" br " | " br/ " | " li " | " /ol " | " /ul " = > ( String ::from ( " \r \n " ) , tag . len ( ) + 1 ) ,
// Line breaks and list items
" p " | " h1 " | " h2 " | " h3 " | " h4 " | " h5 " | " h6 " | " /h1 " | " /h2 " | " /h3 " | " /h4 " | " /h5 "
" div " | " br " | " br/ " | " li " | " /ol " | " /ul " = > ( " \r \n " . to_string ( ) , tag_content . len ( ) + 1 ) ,
| " /h6 " = > ( String ::from ( " \r \n \r \n " ) , tag . len ( ) + 1 ) ,
name @ " head " | name @ " script " | name @ " style " = > {
// silence tags
// only use to_ascii_lowercase here so the byte offsets dont get
// Paragraphs and headings
// messed up from one uppercase symbol becoming two lowercase
" p " | " h1 " | " h2 " | " h3 " | " h4 " | " h5 " | " h6 " | " /h1 " | " /h2 " | " /h3 " | " /h4 " | " /h5 " | " /h6 " = > ( " \r \n \r \n " . to_string ( ) , tag_content . len ( ) + 1 ) ,
// symbols or something like that
let more = more . to_ascii_lowercase ( ) ;
// Tags to ignore along with their content
let end = more
name if [ " head " , " script " , " style " ] . contains ( & name ) = > {
. find ( & format! ( " </ {} " , name ) )
// Search for the closing tag
. map ( | i | i + 2 + name . len ( ) )
. and_then ( | end_tag | more [ end_tag .. ] . find ( '>' ) . map ( | i | i + end_tag + 1 ) )
let closing_tag = format! ( " </ {} > " , name ) ;
. unwrap_or_else ( | | mo re. len ( ) ) ;
let lower_rest = rest . to_ascii_lowercase ( ) ;
( String ::new ( ) , tag . len ( ) + 1 + end )
let end_tag_start = lower_rest . find ( & closing_tag ) . unwrap_or ( lower_rest . l en( ) ) ;
let closing_tag_len = if end_tag_start < lower_rest . len ( ) {
closing_tag . len ( )
} else {
0
} ;
let total_skip = tag_content . len ( ) + 1 + end_tag_start + closing_tag_len ;
( String ::new ( ) , total_skip )
}
}
// HTML comments
" !-- " = > {
" !-- " = > {
// HTML comment
let end = s . find ( " --> " ) . map_or ( s . len ( ) , | n | n + 3 ) ;
( String ::new ( ) , s . find ( " --> " ) . map_or ( s . len ( ) , | n | n + 3 ) )
( String ::new ( ) , end )
}
}
// other/unknown tags are just discarded
_ = > ( String ::new ( ) , tag . len ( ) + 1 ) ,
// Discard other tags but keep their content
_ = > ( String ::new ( ) , tag_content . len ( ) + 1 ) ,
}
}
}
}
/// Convert some HTML to plain text. Only some simple HTML tags are handled:
/// Convert an HTML string to plain text.
/// - `a` tags are transformed to their href attribute value
/// Handles basic HTML tags and entities, and collapses whitespace.
/// - paragraph, linebreak, heading, list, and list item tags insert different
/// amounts of line breaks.
/// - HTML comments as well as `head`, `script` and `style` are completely
/// discarded, including their content
/// - unknown tags are skipped, but their content is printed
///
/// HTML named entities will be replaced with the respecive Unicode code point,
/// and whitespace will be collapsed as is usual in HTML.
///
/// The resulting string will have CRLF line endings.
pub fn html2text ( html : & str ) -> String {
pub fn html2text ( html : & str ) -> String {
// c ollapse spaces
// C ollapse multiple whitespace characters into a single space
let html = html . split_whitespace ( ) . collect ::< Vec < _ > > ( ) . join ( " " ) ;
let html = html . split_whitespace ( ) . collect ::< Vec < _ > > ( ) . join ( " " ) ;
let mut out = String ::new ( ) ;
let mut out = String ::new ( ) ;
let mut index = 0 ;
let mut i = 0 ;
while index < html . len ( ) {
whi le i < html . len ( ) {
if let Some ( pos ) = html [ index .. ] . find ( '<' ) {
match html [ i .. ] . find ( '<' ) {
if pos > 0 {
None = > {
out . push_str ( & html_entities_to_text ( & html [ index .. index + pos ] ) ) ;
// no more tags in the input, done
index + = pos ;
out + = & html_entitities_to_text ( & html [ i .. ] ) ;
break ;
}
}
Some ( text_segment ) = > {
index + = 1 ; // Skip the '<'
if text_segment > 0 {
let ( parsed_text , advance ) = handle_tag ( & html [ index .. ] ) ;
out + = & html_entitities_to_text ( & html [ i .. i + text_segment ] ) ;
if ! parsed_text . is_empty ( ) {
i + = text_segment ;
}
i + = 1 ; // skip the '<'
let ( s , advance ) = handle_tag ( & html [ i .. ] ) ;
if ! s . is_empty ( ) {
if out . ends_with ( " \r \n \r \n " ) | | out . is_empty ( ) {
if out . ends_with ( " \r \n \r \n " ) | | out . is_empty ( ) {
out + = & s . trim_start ( ) ;
out . push_str ( & parsed_text . trim_start ( ) ) ;
} else {
} else {
out + = & s ;
out . push_str ( & parsed_text ) ;
}
}
}
}
i + = advance ;
index + = advance ;
}
} else {
// No more tags, process the remaining text
out . push_str ( & html_entities_to_text ( & html [ index .. ] ) ) ;
break ;
}
}
}
}
@@ -219,15 +203,14 @@ pub fn html2text(html: &str) -> String {
#[ cfg(test) ]
#[ cfg(test) ]
mod tests {
mod tests {
use super ::* ;
use super ::* ;
// Generates `#[test]` functions that compare `html2text(input)` with the
// expected output.
//
// Two forms are accepted:
// - `test!(name, "input", "expected")` defines a single test;
// - `test! { name: "input" to "expected", ... }` defines one test per entry.
macro_rules! test {
    ($name:ident, $from:literal, $to:literal $(,)?) => {
        #[test]
        fn $name() {
            assert_eq!(html2text($from), $to);
        }
    };
    ($($name:ident: $from:literal to $to:literal,)* $(,)?) => {
        $(test! { $name, $from, $to })*
    };
}
@@ -236,124 +219,64 @@ mod tests {
plaintext : " blah " to " blah " ,
plaintext : " blah " to " blah " ,
tag : " <div></div> " to " " ,
tag : " <div></div> " to " " ,
tag_contents : " <div>simple text</div> " to " simple text " ,
tag_contents : " <div>simple text</div> " to " simple text " ,
// l inks
// L inks
link :
link : " click <a href= \" test \" >here</a> " to " click here (test) " ,
" click <a href= \" test \" >here </a> "
link_href_equal_to_content : " click <a href= \" test \" >test </a> " to " click test " ,
to " click here (test) " ,
links_ignore_attributes : " click <a class= \" x \" href= \" test \" >here</a> " to " click here (test) " ,
links_ignore_attributes :
link_entities_in_url : " click <a href= \" ents/'x' \" >here</a> " to " click here (ents/'x') " ,
" click <a class= \" x \" href= \" test \" >here</a> "
link_javascript : " click <a href= \" javascript:void(0) \" >here</a> " to " click here " ,
to " click here (test) " ,
link_ignore_content_tags : " click <a href= \" test \" ><span>here</span> or here</a> " to " click here or here (test) " ,
link_entities_in_url :
link_absolute_url : " click <a href= \" http://bit.ly/2n4wXRs \" >news</a> " to " click news (http://bit.ly/2n4wXRs) " ,
" click <a href= \" ents/'x' \" >here</a> "
link_ignore_attributes_2 : " <a rel= \" mw:WikiLink \" href= \" /wiki/yet#English \" title= \" yet \" >yet</a>, <a rel= \" mw:WikiLink \" href= \" /wiki/not_yet#English \" title= \" not yet \" >not yet</a> " to " yet (/wiki/yet#English), not yet (/wiki/not_yet#English) " ,
to " click here (ents/'x') " ,
// Inline elements
link_javascript :
ignore_inline : " strong <strong>text</strong> " to " strong text " ,
" click <a href= \" javascript:void(0) \" >here</a> "
ignore_inline_attributes : " some <div id= \" a \" class= \" b \" >div</div> " to " some \r \n div " ,
to " click here " ,
// Line breaks and spaces
link_ignore_content_tags :
collapse_spaces : " should ignore more spaces " to " should ignore more spaces " ,
" click <a href= \" test \" ><span>here</span> or here</a> "
collapse_linebreaks : " a \n b \n c " to " a b c " ,
to " click here or here (test) " ,
collapse_mixed : " should \n ignore \r \n new lines " to " should ignore new lines " ,
link_absolute_url :
br_tag : " two<br>line<br/>breaks " to " two \r \n line \r \n breaks " ,
" click <a href= \" http://bit.ly/2n4wXRs \" >news</a> "
paragraph : " <p>two</p><p>paragraphs</p> " to " two \r \n \r \n paragraphs " ,
to " click news (http://bit.ly/2n4wXRs) " ,
link_ignore_attributes_2 :
" <a rel= \" mw:WikiLink \" href= \" /wiki/yet#English \" title= \" yet \" >yet</a>, <a rel= \" mw:WikiLink \" href= \" /wiki/not_yet#English \" title= \" not yet \" >not yet</a> "
to " yet (/wiki/yet#English), not yet (/wiki/not_yet#English) " ,
// inlines
ignore_inline :
" strong <strong>text</strong> "
to " strong text " ,
ignore_inline_attributes :
" some <div id= \" a \" class= \" b \" >div</div> "
to " some div " ,
// lines breaks and spaces
collapse_spaces :
" should ignore more spaces " to " should ignore more spaces " ,
collapse_linebreaks :
" a \n b \n c " to " a b c " ,
collapse_mixed :
" should \n ignore \r \n new lines " to " should ignore new lines " ,
br_tag :
" two<br>line<br/>breaks " to " two \r \n line \r \n breaks " ,
paragraph :
" <p>two</p><p>paragraphs</p> " to " two \r \n \r \n paragraphs " ,
// Headers
// Headers
h1 :
h1 : " <h1>First</h1>main text " to " First \r \n \r \n main text " ,
" <h1> First</h1>main text " to " First \r \n \r \n main text " ,
h2_inline : " First<h2>Second</h2>next section " to " First \r \n \r \n Second \r \n \r \n next section " ,
h2_inline :
h2 : " <h2>Second</h2>next section " to " Second \r \n \r \n next section " ,
" First<h2>Second</h2> next section "
h3_inline : " Second<h3>Third</h3>next section " to " Second \r \n \r \n Third \r \n \r \n next section" ,
to " First \r \n \r \n Secon d \r \n \r \n next section " ,
h3 : " <h3>Third</h3>next section " to " Thir d \r \n \r \n next section " ,
h2 :
h4_inline : " Third<h4>Fourth</h4>next section " to " Third \r \n \r \n Fourth \r \n \r \n next section " ,
" <h2>Second </h2 >next section " to " Second \r \n \r \n next section " ,
h4 : " <h4>Fourth </h4 >next section" to " Fourth \r \n \r \n next section " ,
h3 _inline :
h5 _inline : " Fourth<h5>Fifth</h5>next section " to " Fourth \r \n \r \n Fifth \r \n \r \n next section " ,
" Second<h3>Third </h3 >next section "
h5 : " <h5>Fifth </h5 >next section" to " Fifth \r \n \r \n next section " ,
to " Second \r \n \r \n Third \r \n \r \n next section " ,
h6_inline : " Fifth<h6>Sixth</h6>next section " to " Fifth \r \n \r \n Sixth \r \n \r \n next section " ,
h3 :
h6 : " <h6>Sixth</h6>next section " to " Sixth \r \n \r \n next section " ,
" <h3>Third </h3 >next section " to " Third \r \n \r \n next section" ,
no_h7 : " <h7>Not Header </h7 >next section" to " Not Header next section" ,
h4_inline :
// HTML entities
" Third<h4>Fourth</h4>next section "
entity_nbsp : " two spaces " to " two \u{a0} \u{a0} spaces " ,
to " Third \r \n \r \n Fourth \r \n \r \n next section " ,
entity_copy : " © 2017 K3A " to " © 2017 K3A " ,
h4 :
entity_tag : " <printtag> " to " <printtag> " ,
" <h4>Fourth</h4>next section " to " F ourth \r \n \r \n next section " ,
entity_currencies : " would you pay in ¢, £, ¥ or €? " to " w ould you pay in ¢, £, ¥ or €? " ,
h5_inline :
ampersand_not_entity : " Tom & Jerry is not an entity " to " Tom & Jerry is not an entity " ,
" Fourth<h5>Fifth</h5>next section "
entity_unknown : " this &neither; as you see " to " this &neither; as you see " ,
to " Fourth \r \n \r \n Fifth \r \n \r \n next section " ,
entity_amp : " fish & chips " to " fish & chips " ,
h5 :
// Unordered list
" <h5>Fifth</h5>next section " to " Fifth \r \n \r \n next sectio n" ,
unordered_list : " list of items<ul><li>One</li><li>Two</li><li>Three</li></ul> " to " list of items \r \n One \r \n Two \r \n Three \r \ n" ,
h6_inline :
entity_quot : " "I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey " to " \" I'm sorry, Dave. I'm afraid I can't do that. \" – HAL, 2001: A Space Odyssey " ,
" Fifth<h6>Sixth</h6>next section "
entity_reg : " Google ® " to " Google ® " ,
to " Fifth \r \n \r \n Sixth \r \n \r \n next section " ,
h6 :
" <h6>Sixth</h6>next section " to " Sixth \r \n \r \n next section " ,
no_h7 :
" <h7>Not Header</h7>next section " to " Not Headernext section " ,
// html entitites
entity_nbsp :
" two spaces " to " two spaces " ,
entity_copy :
" © 2017 K3A " to " © 2017 K3A " ,
entity_tag :
" <printtag> " to " <printtag> " ,
entity_currencies :
" would you pay in ¢, £, ¥ or €? "
to " would you pay in ¢, £, ¥ or €? " ,
ampersand_not_entity :
" Tom & Jerry is not an entity " to " Tom & Jerry is not an entity " ,
entity_unknown :
" this &neither; as you see " to " this &neither; as you see " ,
entity_amp :
" fish & chips " to " fish & chips " ,
unordered_list :
" list of items<ul><li>One</li><li>Two</li><li>Three</li></ul> "
to " list of items \r \n One \r \n Two \r \n Three \r \n " ,
entity_quot :
" "I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey "
to " \" I'm sorry, Dave. I'm afraid I can't do that. \" – HAL, 2001: A Space Odyssey " ,
entity_reg :
" Google ® " to " Google ® " ,
// Large entity
// Large entity
entity_large_unknown :
entity_large_unknown : " &abcdefghij; " to " &abcdefghij; " ,
" &abcdefghij; " to " &abcdefghij; " ,
// Numeric HTML entities
// Numeric HTML entities
entity_numeric :
entity_numeric : " ⁌ decimal and hex entities supported ⁍ " to " ⁌ decimal and hex entities supported ⁍ " ,
" ⁌ decimal and hex entities supported ⁍ "
entity_numeric_2 : " 'single quotes' and 츝 " to " 'single quotes' and 츝 " ,
to " ⁌ decimal and hex entities supported ⁍ " ,
// Full HTML structure
entity_numeric_2 :
" 'single quotes' and 츝 "
to " 'single quotes' and 츝 " ,
// full thml structure
empty : " " to " " ,
empty : " " to " " ,
full_html :
full_html : " <html><head><title>Good</title></head><body>x</body> " to " x " ,
" <html><head><title>Good</title></head><body>x</body> " to " x " ,
ignore_script : " we are not <script type= \" javascript \" ></script>interested in scripts " to " we are not interested in scripts " ,
ignore_script :
// Custom HTML tags
" we are not <script type= \" javascript \" ></script>interested in scripts "
ignore_unknown_tag : " <aa>hello</aa> " to " hello " ,
to " we are not interested in scripts " ,
ignore_unknown_tag_whitespace : " <aa >hello</aa> " to " hello " ,
// custom html tags
ignore_unknown_tag_attributes : " <aa x= \" 1 \" >hello</aa> " to " hello " ,
ignore_unknown_tag :
" <aa>hello</aa> " to " hello " ,
ignore_unknown_tag_whitespace :
" <aa >hello</aa> " to " hello " ,
ignore_unknown_tag_attributes :
" <aa x= \" 1 \" >hello</aa> " to " hello " ,
invalid_html_entity_without_semicolon : " &hellip " to " … " ,
invalid_html_entity_without_semicolon : " &hellip " to " … " ,
entity_whitespace_preserved : " & test " to " & test " ,
}
}
}
}