[libhtml-scrubber-perl] 14/30: Remove invalid end tags
Florian Schlichting
fsfs at moszumanska.debian.org
Sat Nov 11 13:46:19 UTC 2017
This is an automated email from the git hooks/post-receive script.
fsfs pushed a commit to annotated tag release/0.16
in repository libhtml-scrubber-perl.
commit cd2a2258b15078758c6f41b3aa9ab7acff1dc8cc
Author: Paul Cochrane <paul at liekut.de>
Date: Fri Jun 23 13:10:34 2017 +0200
Remove invalid end tags
... which solves the issue reported in RT#120384, namely that end tags
of empty elements (e.g. </br>, </hr>, </img>) are invalid HTML and
should be removed from the scrubbed output.
---
lib/HTML/Scrubber.pm | 7 +++
t/rt120384_remove_false_tags.t | 107 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 114 insertions(+)
diff --git a/lib/HTML/Scrubber.pm b/lib/HTML/Scrubber.pm
index 2a7b173..cba6945 100644
--- a/lib/HTML/Scrubber.pm
+++ b/lib/HTML/Scrubber.pm
@@ -61,6 +61,7 @@ use warnings;
use HTML::Parser 3.47 ();
use HTML::Entities;
use Scalar::Util ('weaken');
+use List::Util qw(any);
our ( @_scrub, @_scrub_fh );
@@ -429,6 +430,12 @@ sub _scrub_str {
}
}
elsif ( $e eq 'end' ) {
+ # empty tags list taken from
+ # https://developer.mozilla.org/en/docs/Glossary/empty_element
+ my @empty_tags =
+ qw(area base br col embed hr img input link meta param source track wbr);
+ return "" if $text ne '' && any { $t eq $_ } @empty_tags; # skip false closing empty tags
+
my $place = 0;
if ( exists $s->{_rules}->{$t} ) {
$place = 1 if $s->{_rules}->{$t};
diff --git a/t/rt120384_remove_false_tags.t b/t/rt120384_remove_false_tags.t
new file mode 100644
index 0000000..30ed554
--- /dev/null
+++ b/t/rt120384_remove_false_tags.t
@@ -0,0 +1,107 @@
+use warnings;
+use strict;
+
+use utf8;
+use Test::More;
+use Test::Differences;
+
+use HTML::Scrubber;
+use HTML::Parser;
+
+my $scrub = HTML::Scrubber->new(
+ allow => [
+ qw{ p b i area br base col colgroup embed hr img input
+ link map meta object param table track source video wbr }
+ ]
+);
+$scrub->default( undef, { '*' => 1 } ); # allow all attributes
+$scrub->comment(1);
+
+# html snippets adapted from https://developer.mozilla.org/en/docs/
+# except for the <br> snippet, which is taken from the example in the
+# original ticket
+my %snippets = (
+ "area" => '<map name="primary">
+ <area shape="circle" coords="75,75,75" href="left.html" alt="Click to go Left"></area>
+ </map>',
+ "base" =>
+ '<base target="_blank" href="http://www.example.com/page.html"></base>',
+ "br" => '<P STYLE="font-size: 300%">
+ <BLINK>"You may get to touch her<BR>
+ If your gloves are sterilized<BR></BR>
+ Rinse your mouth with Listerine</BR>
+ Blow disinfectant in her eyes"</BLINK><BR>
+ -- X-Ray Spex, <I>Germ-Free Adolescents<I>',
+ "col" => '<table>
+ <colgroup>
+ <col span="2"></col>
+ </colgroup>
+ </table>',
+ "embed" =>
+'<embed type="video/quicktime" src="movie.mov" width="640" height="480"></embed>',
+ "hr" => '<p>This is the first paragraph of text.</p>
+ <hr></hr>
+ <p>This is the second paragraph of text.</p>',
+ "img" => '<img src="image.png" alt="alt text"></img>',
+ "input" => '<input id="input1" type="text"></input>',
+ "link" => '<link href="style.css" rel="stylesheet"></link>',
+ "meta" => '<meta charset="utf-8"></meta>',
+ "param" => '<object data="movie.swf" type="application/x-shockwave-flash">
+ <param name="foo" value="bar"></param>
+ </object>',
+ "source" => '<video controls>
+ <source src="foo.ogg" type="video/ogg"></source>
+ </video>',
+ "track" => '<video controls>
+ <source src="sample.mp4" type="video/mp4">
+ <track kind="captions" src="sampleCaptions.vtt" srclang="en"></track>
+ </video>',
+ "wbr" => '<p>word<wbr>.break<wbr>.opportunity<wbr></wbr>.</p>',
+);
+
+plan tests => scalar keys %snippets;
+
+my %expected = (
+ "area" => '<map name="primary">
+ <area shape="circle" coords="75,75,75" href="left.html" alt="Click to go Left">
+ </map>',
+ "base" => '<base target="_blank" href="http://www.example.com/page.html">',
+ "br" => '<p style="font-size: 300%">
+ "You may get to touch her<br>
+ If your gloves are sterilized<br>
+ Rinse your mouth with Listerine
+ Blow disinfectant in her eyes"<br>
+ -- X-Ray Spex, <i>Germ-Free Adolescents<i>',
+ "col" => '<table>
+ <colgroup>
+ <col span="2">
+ </colgroup>
+ </table>',
+ "embed" =>
+ '<embed type="video/quicktime" src="movie.mov" width="640" height="480">',
+ "hr" => '<p>This is the first paragraph of text.</p>
+ <hr>
+ <p>This is the second paragraph of text.</p>',
+ "img" => '<img src="image.png" alt="alt text">',
+ "input" => '<input id="input1" type="text">',
+ "link" => '<link href="style.css" rel="stylesheet">',
+ "meta" => '<meta charset="utf-8">',
+ "param" => '<object data="movie.swf" type="application/x-shockwave-flash">
+ <param name="foo" value="bar">
+ </object>',
+ "source" => '<video controls>
+ <source src="foo.ogg" type="video/ogg">
+ </video>',
+ "track" => '<video controls>
+ <source src="sample.mp4" type="video/mp4">
+ <track kind="captions" src="sampleCaptions.vtt" srclang="en">
+ </video>',
+ "wbr" => "<p>word<wbr>.break<wbr>.opportunity<wbr>.</p>",
+);
+
+for my $tag_name ( sort keys %snippets ) {
+ eq_or_diff $scrub->scrub( $snippets{$tag_name} ), $expected{$tag_name},
+ "False ending <$tag_name> tags are removed";
+}
+
+# vim: expandtab shiftwidth=4
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libhtml-scrubber-perl.git
More information about the Pkg-perl-cvs-commits mailing list