r28944 - in /branches/upstream/libhtml-strip-perl: ./ current/ current/Changes current/MANIFEST current/Makefile.PL current/README current/Strip.pm current/Strip.xs current/strip_html.c current/strip_html.h current/test.pl current/typemap

dmn at users.alioth.debian.org dmn at users.alioth.debian.org
Mon Dec 29 15:57:13 UTC 2008


Author: dmn
Date: Mon Dec 29 15:57:10 2008
New Revision: 28944

URL: http://svn.debian.org/wsvn/pkg-perl/?sc=1&rev=28944
Log:
[svn-inject] Installing original source of libhtml-strip-perl

Added:
    branches/upstream/libhtml-strip-perl/
    branches/upstream/libhtml-strip-perl/current/
    branches/upstream/libhtml-strip-perl/current/Changes
    branches/upstream/libhtml-strip-perl/current/MANIFEST
    branches/upstream/libhtml-strip-perl/current/Makefile.PL
    branches/upstream/libhtml-strip-perl/current/README
    branches/upstream/libhtml-strip-perl/current/Strip.pm
    branches/upstream/libhtml-strip-perl/current/Strip.xs
    branches/upstream/libhtml-strip-perl/current/strip_html.c
    branches/upstream/libhtml-strip-perl/current/strip_html.h
    branches/upstream/libhtml-strip-perl/current/test.pl
    branches/upstream/libhtml-strip-perl/current/typemap

Added: branches/upstream/libhtml-strip-perl/current/Changes
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/Changes?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/Changes (added)
+++ branches/upstream/libhtml-strip-perl/current/Changes Mon Dec 29 15:57:10 2008
@@ -1,0 +1,50 @@
+Revision history for Perl extension HTML::Strip.
+
+1.06  Fri Feb 10 11:18:35 2006
+	- documented 'set_decode_entities' method
+
+1.05  Thu Feb  9 12:11:50 2006
+	- added 'set_decode_entities' method
+
+1.04  Mon Jan 24 16:41:51 2005
+	- Replaced all instances of strcmp with strcasecmp to make the
+	  module case-insensitive towards HTML tag names
+
+1.03  Wed Jul  7 13:42:26 2004
+	- Added 'emit_spaces' configuration option which can turn off
+	  attempted conversion of HTML tags into spaces
+	- Constructor options now passed in a hash
+
+1.02  Tue Feb 24 16:24:18 2004
+        - Yet more checks to prevent extraneous whitespace
+        - Added many more tests
+
+1.01  Mon Jul  7 18:15:59 2003
+        - Removed provision for escaped quotes in attributes values
+        - More checks to prevent the outputting of extraneous whitespace
+
+1.00  Wed Jun 11 12:05:47 2003
+        - rewritten in C, using a struct for each object to keep track
+          of state and striptags
+
+0.05  Thu May 22 19:49:25 2003
+        - removed "XSOPT => '-C++'" from Makefile.PL as it was
+          unnecessary and causing problems for some people
+        - added "#include <string.h>" to strip_html.cpp as its
+          absence was causing problems for some people
+
+0.04  Sun Mar 23 12:45:13 2003
+        - Tweaked docs, added FAQ explaining why 0.03 failed cpan testing
+
+0.03  Sat Mar 22 11:20:34 2003
+        - rewritten in C++ to make striptags an attribute of each
+          object
+
+0.02  Mon Mar 17 18:20:01 2003
+        - added set_striptags() method
+        - documented module
+
+0.01  Tue Mar  4 18:17:38 2003
+	- original version; created by h2xs 1.21 with options
+		-A -n HTML::Strip html_strip.h
+

Added: branches/upstream/libhtml-strip-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/MANIFEST?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/MANIFEST (added)
+++ branches/upstream/libhtml-strip-perl/current/MANIFEST Mon Dec 29 15:57:10 2008
@@ -1,0 +1,10 @@
+Changes
+Makefile.PL
+MANIFEST
+README
+Strip.pm
+Strip.xs
+strip_html.h
+strip_html.c
+typemap
+test.pl

Added: branches/upstream/libhtml-strip-perl/current/Makefile.PL
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/Makefile.PL?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/Makefile.PL (added)
+++ branches/upstream/libhtml-strip-perl/current/Makefile.PL Mon Dec 29 15:57:10 2008
@@ -1,0 +1,18 @@
+use ExtUtils::MakeMaker;
+
+# Build settings for the HTML::Strip extension. See the ExtUtils::MakeMaker
+# documentation for the meaning of each key.
+my %makefile_args = (
+    NAME         => 'HTML::Strip',
+    VERSION_FROM => 'Strip.pm',      # $VERSION is read from the module
+    PREREQ_PM    => {},              # e.g., Module::Name => 1.1
+    LIBS         => [''],            # e.g., '-lm'
+    DEFINE       => '',              # e.g., '-DHAVE_SOMETHING'
+    INC          => '',              # e.g., '-I/usr/include/other'
+    OBJECT       => '$(O_FILES)',    # link all the C files too
+);
+
+# ABSTRACT_FROM and AUTHOR are only understood by perl 5.005 and later.
+if ( $] >= 5.005 ) {
+    $makefile_args{ABSTRACT_FROM} = 'Strip.pm';   # abstract comes from the POD
+    $makefile_args{AUTHOR}        = 'Alex Bowley <kilinrax at cpan.org>';
+}
+
+WriteMakefile(%makefile_args);

Added: branches/upstream/libhtml-strip-perl/current/README
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/README?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/README (added)
+++ branches/upstream/libhtml-strip-perl/current/README Mon Dec 29 15:57:10 2008
@@ -1,0 +1,23 @@
+HTML::Strip
+===========
+
+This module strips HTML-like markup from text.
+It is written in XS, and thus about five times quicker than using
+regular expressions for the same task.
+
+INSTALLATION
+
+To install this module type the following:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+
+COPYRIGHT AND LICENCE
+
+Please report any bugs/suggestions to Alex Bowley <kilinrax at cpan.org>
+
+Copyright (c) 2003 Alex Bowley. All rights reserved.
+This program is free software; you can redistribute it and/or modify it under 
+the same terms as Perl itself.

Added: branches/upstream/libhtml-strip-perl/current/Strip.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/Strip.pm?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/Strip.pm (added)
+++ branches/upstream/libhtml-strip-perl/current/Strip.pm Mon Dec 29 15:57:10 2008
@@ -1,0 +1,242 @@
+package HTML::Strip;
+
+use 5.006;
+use warnings;
+use strict;
+
+use Carp qw( carp croak );
+
+require Exporter;
+require DynaLoader;
+
+our @ISA = qw(Exporter DynaLoader);
+
+# The module is used through its object interface, so nothing is exported:
+# the ':all' tag, @EXPORT_OK and @EXPORT below are all empty lists.
+our %EXPORT_TAGS = ( 'all' => [ qw(
+                                  ) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw();
+
+our $VERSION = '1.06';
+
+# Load the compiled (XS) half of the module.
+bootstrap HTML::Strip $VERSION;
+
+# True when HTML::Entities could be loaded. parse() only decodes entities
+# when this is set, so the module still works without HTML::Entities.
+my $_html_entities_p = eval 'require HTML::Entities';
+
+# Settings applied by new() unless the caller overrides them. Each key is
+# passed to the matching set_<key> method.
+my %defaults = (
+                striptags	=> [qw( title
+                                        style
+                                        script
+                                        applet )],
+                emit_spaces	=> 1,
+                decode_entities	=> 1,
+               );
+
+# Constructor. Takes an optional list of setting => value pairs; each one
+# is applied through the set_<setting> method, on top of %defaults.
+# An unknown setting only produces a warning (via carp).
+sub new {
+  my ($class, @settings) = @_;
+
+  # create() is the XS allocator; rebless its result into the requested class
+  my $self = bless create(), $class;
+
+  # caller-supplied pairs come last, so they win over the defaults
+  my %config = (%defaults, @settings);
+  for my $name (keys %config) {
+    my $setter = "set_${name}";
+    unless( $self->can($setter) ) {
+      carp "Invalid setting '$name'";
+      next;
+    }
+    $self->$setter( $config{$name} );
+  }
+
+  return $self;
+}
+
+# Replace the set of strip tags. Accepts either one array reference or a
+# plain list of tag names; both forms reach set_striptags_ref() as an
+# array reference.
+sub set_striptags {
+  my $self = shift;
+  my $tagref = ( ref($_[0]) eq 'ARRAY' ) ? $_[0] : [ @_ ];
+  $self->set_striptags_ref( $tagref );
+}
+
+# Strip the markup from one chunk of text and return the result. HTML
+# entities are decoded afterwards, but only when the decode_entities option
+# is on and HTML::Entities was loadable.
+sub parse {
+  my $self = shift;
+  my ($html) = @_;
+
+  my $text_only = $self->strip_html( $html );
+  return $text_only unless $self->decode_entities && $_html_entities_p;
+
+  my $decoded = HTML::Entities::decode($text_only);
+  return $decoded;
+}
+
+# Forget any half-parsed tag, quote or comment so that the next parse()
+# starts on a fresh document.
+sub eof {
+  my ($self) = @_;
+  return $self->reset();
+}
+
+1;
+__END__
+# Below is stub documentation for your module. You better edit it!
+
+=head1 NAME
+
+HTML::Strip - Perl extension for stripping HTML markup from text.
+
+=head1 SYNOPSIS
+
+  use HTML::Strip;
+
+  my $hs = HTML::Strip->new();
+
+  my $clean_text = $hs->parse( $raw_html );
+  $hs->eof;
+
+=head1 DESCRIPTION
+
+This module simply strips HTML-like markup from text in a very quick
+and brutal manner. It could quite easily be used to strip XML or SGML
+from text as well; but removing HTML markup is a much more common
+problem, hence this module lives in the HTML:: namespace.
+
+It is written in XS, and thus about five times quicker than using
+regular expressions for the same task.
+
+It does I<not> do any syntax checking (if you want that, use
+L<HTML::Parser>), instead it merely applies the following rules:
+
+=over 4
+
+=item 1
+
+Anything that looks like a tag, or group of tags will be replaced with
+a single space character. Tags are considered to be anything that
+starts with a C<E<lt>> and ends with a C<E<gt>>; with the caveat that a
+C<E<gt>> character may appear in either of the following without
+ending the tag:
+
+=over 4
+
+=item Quote
+
+Quotes are considered to start with either a C<'> or a C<"> character,
+and end with the next matching quote character. Backslash escapes are
+I<not> honoured (support was removed in 1.01), so C<\"> ends the quote.
+
+=item Comment
+
+If the tag starts with an exclamation mark, it is assumed to be a
+declaration or a comment. Within such tags, C<E<gt>> characters do not
+end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!--
+E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be
+stripped completely).
+
+=back
+
+=item 2
+
+Anything that appears within so-called I<strip tags> is stripped as
+well. By default, these tags are C<title>, C<script>, C<style> and
+C<applet>.
+
+=back
+
+HTML::Strip maintains state between calls, so you can parse a document
+in chunks should you wish. If one chunk ends half-way through a tag,
+quote, comment, or whatever; it will remember this, and expect the
+next call to parse to start with the remains of said tag.
+
+If this is not going to be the case, be sure to call $hs->eof()
+between calls to $hs->parse().
+
+=head2 METHODS
+
+=item new()
+
+Constructor. Can optionally take a hash of settings (with keys
+corresponding to the C<set_> methods below).
+
+For example, the following is a valid constructor:
+
+ my $hs = HTML::Strip->new(
+                           striptags   => [ 'script', 'iframe' ],
+                           emit_spaces => 0
+                          );
+
+=item parse()
+
+Takes a string as an argument, returns it stripped of HTML.
+
+=item eof()
+
+Resets the current state information, ready to parse a new block of HTML.
+
+=item clear_striptags()
+
+Clears the current set of strip tags.
+
+=item add_striptag()
+
+Adds the string passed as an argument to the current set of strip tags.
+
+=item set_striptags()
+
+Takes a reference to an array of strings, which replace the current
+set of strip tags.
+
+=item set_emit_spaces()
+
+Takes a boolean value. If set to false, HTML::Strip will not attempt
+any conversion of tags into spaces. Set to true by default.
+
+=item set_decode_entities()
+
+Takes a boolean value. If set to false, HTML::Strip will not decode HTML
+entities. Set to true by default.
+
+=head2 LIMITATIONS
+
+=over 4
+
+=item Whitespace
+
+Despite only outputting one space character per group of tags, and
+avoiding doing so when tags are bordered by spaces or the start or
+end of strings, HTML::Strip can often output more than desired; such
+as with the following HTML:
+
+ <h1> HTML::Strip </h1> <p> <em> <strong> fast, and brutal </strong> </em> </p>
+
+Which gives the following output:
+
+C<E<nbsp>HTML::StripE<nbsp>E<nbsp>E<nbsp>E<nbsp>fast, and brutalE<nbsp>E<nbsp>E<nbsp>>
+
+Thus, you may want to post-filter the output of HTML::Strip to remove
+excess whitespace (for example, using C<tr/ / /s;>).
+(This has been improved since previous releases, but is still an issue)
+
+=item HTML Entities
+
+HTML::Strip will only attempt decoding of HTML entities if
+L<HTML::Entities> is installed.
+
+=head2 EXPORT
+
+None by default.
+
+=head1 AUTHOR
+
+Alex Bowley E<lt>kilinrax at cpan.orgE<gt>
+
+=head1 SEE ALSO
+
+L<perl>, L<HTML::Parser>, L<HTML::Entities>
+
+=cut

Added: branches/upstream/libhtml-strip-perl/current/Strip.xs
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/Strip.xs?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/Strip.xs (added)
+++ branches/upstream/libhtml-strip-perl/current/Strip.xs Mon Dec 29 15:57:10 2008
@@ -1,0 +1,105 @@
+
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#include "strip_html.h"
+
+MODULE = HTML::Strip		PACKAGE = HTML::Strip		
+
+PROTOTYPES: ENABLE
+
+# Allocate a new Stripper and return it as a blessed HTML::Strip object.
+# The struct is zero-filled on allocation: reset() only assigns the parser
+# flags, so with a plain New() the strip-tag count, the option fields and
+# the tag-name buffers would start out holding garbage.
+Stripper *
+create()
+ PREINIT:
+  Stripper * stripper;
+ CODE:
+  Newz( 0, stripper, 1, Stripper );
+  reset( stripper );
+  RETVAL = stripper;
+ OUTPUT:
+  RETVAL
+
+# Destructor: releases the Stripper struct that create() allocated.
+void
+DESTROY( stripper )
+  Stripper * stripper
+ CODE:
+  Safefree( stripper );
+
+# Strip the markup from one chunk of text and return the cleaned copy.
+# The work is done by the C function strip_html(); this wrapper only
+# provides (and afterwards frees) the output buffer.
+char *
+strip_html( stripper, raw )
+  Stripper * stripper
+  char * raw
+ PREINIT:
+  char * clean;
+  /* The output can be one character LONGER than the input: every separator
+     space stands in for a '<' that was consumed, except that the first one
+     may belong to a tag that ended in the previous call (e.g. "a<b>" then
+     "x" yields " x"). So reserve strlen + 1 for that space + 1 for NUL. */
+  STRLEN size = strlen(raw) + 2;
+ INIT:
+  New( 0, clean, size, char );
+ CODE:
+  strip_html( stripper, raw, clean );
+  RETVAL = clean;
+ OUTPUT:
+  RETVAL  
+ CLEANUP:
+  Safefree( clean );
+
+# Wrapper for the C function reset(): puts the parser flags back to their
+# initial values. The Perl-level eof() method calls this.
+void
+reset( stripper )
+  Stripper * stripper
+
+# Wrapper for the C function clear_striptags(): empties the strip-tag list.
+void
+clear_striptags( stripper )
+  Stripper * stripper
+
+# Wrapper for the C function add_striptag(): appends one tag name to the
+# strip-tag list.
+void
+add_striptag( stripper, tag )
+  Stripper * stripper
+  char * tag
+
+# Store the emit_spaces option. When it is false, strip_html() no longer
+# puts a space where a group of tags was removed.
+void
+set_emit_spaces( stripper, emit )
+  Stripper * stripper
+  int emit
+ CODE:
+  stripper->o_emit_spaces = emit;
+
+# Store the decode_entities option. It is only kept here; the Perl-level
+# parse() reads it back through decode_entities() to decide whether to
+# decode.
+void
+set_decode_entities( stripper, decode )
+  Stripper * stripper
+  int decode
+ CODE:
+  stripper->o_decode_entities = decode;
+
+# Return the current value of the decode_entities option.
+int
+decode_entities( stripper )
+  Stripper * stripper
+ CODE:
+  RETVAL = stripper->o_decode_entities;
+ OUTPUT:
+  RETVAL
+
+# Replace the strip-tag list with the strings in the array that tagref
+# points to. Returns undef and leaves the list untouched when tagref is not
+# an array reference. An empty array empties the list (and, as before,
+# returns undef).
+void
+set_striptags_ref( stripper, tagref )
+  Stripper * stripper
+  SV * tagref
+ PREINIT:
+  AV * tags;
+  I32 last;
+  I32 n;
+ CODE:
+  if( !SvROK(tagref) || SvTYPE(SvRV(tagref)) != SVt_PVAV ) {
+    XSRETURN_UNDEF;
+  }
+  tags = (AV *) SvRV(tagref);
+  /* clear before looking at the length, so that passing [] really does
+     remove the previous tags instead of silently keeping them */
+  clear_striptags( stripper );
+  last = av_len(tags);   /* highest index; -1 for an empty array */
+  if( last < 0 ) {
+    XSRETURN_UNDEF;
+  }
+  for (n = 0; n <= last; n++) {
+     SV ** elem = av_fetch(tags, n, 0);
+     /* av_fetch() yields NULL for a slot that was never assigned */
+     if( elem != NULL ) {
+       add_striptag( stripper, SvPV_nolen(*elem) );
+     }
+  }

Added: branches/upstream/libhtml-strip-perl/current/strip_html.c
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/strip_html.c?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/strip_html.c (added)
+++ branches/upstream/libhtml-strip-perl/current/strip_html.c Mon Dec 29 15:57:10 2008
@@ -1,0 +1,196 @@
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "strip_html.h"
+
+
+/*
+ * strip_html - copy raw to output with the markup removed.
+ *
+ *   stripper  parser state. It persists between calls, so a document may be
+ *             fed in chunks and a tag, quote or comment may span two calls.
+ *   raw       NUL-terminated input chunk.
+ *   output    destination, written NUL-terminated. It must have room for
+ *             strlen(raw) + 2 bytes: the separator space for a tag that
+ *             ended in the previous chunk is emitted in this one, so the
+ *             output can be one character longer than the input.
+ */
+void
+strip_html( Stripper * stripper, const char * raw, char * output ) {
+  const char * p_raw = raw;
+  const char * raw_end = raw + strlen(raw);
+  char * p_output = output;
+    
+  while( p_raw < raw_end ) {
+    if( stripper->f_in_tag ) {
+      /* inside a tag */
+      /* check if we know either the tagname, or that we're in a declaration */
+      if( !stripper->f_full_tagname && !stripper->f_in_decl ) {
+        /* if this is the first character, check if it's a '!'; if so, we're in a declaration */
+        if( stripper->p_tagname == stripper->tagname && *p_raw == '!' ) {
+          stripper->f_in_decl = 1;
+        }
+        /* then check if the first character is a '/', in which case, this is a closing tag */
+        else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) {
+          stripper->f_closing = 1;
+        } else {
+          /* if we don't have the full tag name yet, add current character unless it's whitespace, a '/', or a '>';
+             otherwise null pad the string and set the full tagname flag, and check the tagname against stripped ones.
+             the name is truncated one short of the buffer size so that the terminating NUL always fits
+             (stopping at MAX_TAGNAMELENGTH wrote the NUL one byte past the array).
+             the cast keeps isspace() defined for bytes above 0x7f when char is signed */
+          if( (!isspace( (unsigned char) *p_raw ) && *p_raw != '/' && *p_raw != '>') &&
+              (stripper->p_tagname - stripper->tagname) < MAX_TAGNAMELENGTH - 1 ) {
+            *stripper->p_tagname++ = *p_raw;
+          } else {
+            *stripper->p_tagname = 0;
+            stripper->f_full_tagname = 1;
+            /* if we're in a stripped tag block, and this is a closing tag, check to see if it ends the stripped block */
+            if( stripper->f_in_striptag && stripper->f_closing ) {
+              if( strcasecmp( stripper->tagname, stripper->striptag ) == 0 ) {
+                stripper->f_in_striptag = 0;
+              }
+              /* if we're outside a stripped tag block, check tagname against stripped tag list */
+            } else if( !stripper->f_in_striptag && !stripper->f_closing ) {
+              int i;
+              /* only the first numstriptags slots hold tags; the slot after them is
+                 unset or empty, and comparing against it made an empty tag name
+                 (as in "a < b") start a stripped block */
+              for( i = 0; i < stripper->numstriptags; i++ ) {
+                if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) {
+                  stripper->f_in_striptag = 1;
+                  strcpy( stripper->striptag, stripper->tagname );
+                  break;
+                }
+              }
+            }
+            check_end( stripper, *p_raw );
+          }
+        }
+      } else {
+        if( stripper->f_in_quote ) {
+          /* inside a quote */
+          /* end of quote if current character matches the opening quote character */
+          if( *p_raw == stripper->quote ) {
+            stripper->quote = 0;
+            stripper->f_in_quote = 0;
+          }
+        } else {
+          /* not in a quote */
+          /* check for quote characters */
+          if( *p_raw == '\'' || *p_raw == '\"' ) {
+            stripper->f_in_quote = 1;
+            stripper->quote = *p_raw;
+            /* reset lastchar_* flags in case we have something perverse like '-"' or '/"' */
+            stripper->f_lastchar_minus = 0;
+            stripper->f_lastchar_slash = 0;
+          } else {
+            if( stripper->f_in_decl ) {
+              /* inside a declaration */
+              if( stripper->f_lastchar_minus ) {
+                /* last character was a minus, so if current one is, then we're either entering or leaving a comment */
+                if( *p_raw == '-' ) {
+                  stripper->f_in_comment = !stripper->f_in_comment;
+                }
+                stripper->f_lastchar_minus = 0;
+              } else {
+                /* if current character is a minus, we might be starting a comment marker */
+                if( *p_raw == '-' ) {
+                  stripper->f_lastchar_minus = 1;
+                }
+              }
+              /* a '>' inside a comment does not end the declaration */
+              if( !stripper->f_in_comment ) {
+                check_end( stripper, *p_raw );
+              }
+            } else {
+              check_end( stripper, *p_raw );
+            }
+          } /* quote character check */
+        } /* in quote check */
+      } /* full tagname check */
+    }
+    else {
+      /* not in a tag */
+      /* check for tag opening, and reset parameters if one has */
+      if( *p_raw == '<' ) {
+        stripper->f_in_tag = 1;
+        stripper->tagname[0] = 0;
+        stripper->p_tagname = stripper->tagname;
+        stripper->f_full_tagname = 0;
+        stripper->f_closing = 0;
+        stripper->f_just_seen_tag = 1;
+      }
+      else {
+        /* copy to stripped provided we're not in a stripped block */
+        if( !stripper->f_in_striptag ) {
+          /* only emit spaces if we're configured to do so (on by default) */
+          if( stripper->o_emit_spaces ){
+            /* output a space in place of tags we have previously parsed,
+               and set a flag so we only do this once for every group of tags.
+               done here to prevent unnecessary trailing spaces */
+            if( isspace( (unsigned char) *p_raw ) ) {
+              /* don't output a space if this character is one anyway */
+              stripper->f_outputted_space = 1;
+            } else {
+              if( !stripper->f_outputted_space &&
+                  stripper->f_just_seen_tag ) {
+                *p_output++ = ' ';
+                stripper->f_outputted_space = 1;
+              } else {
+                /* this character must not be a space */
+                stripper->f_outputted_space = 0;
+              }
+            }
+          }
+          *p_output++ = *p_raw;
+          /* reset 'just seen tag' flag */
+          stripper->f_just_seen_tag = 0;
+        }
+      }
+    } /* in tag check */
+    p_raw++;
+  } /* while loop */
+
+  *p_output = 0;
+}
+
+/*
+ * reset - put the parser back into its start-of-document state.
+ *
+ * Only parsing state is touched; the strip-tag list and the o_* options
+ * are left as they are.
+ *
+ * The name buffers and the quote character are cleared as well as the
+ * flags: this is the only initialisation a freshly allocated Stripper
+ * gets, and check_end() hands striptag to strcasecmp() on the first
+ * self-closing tag even if no stripped block was ever entered.
+ */
+void
+reset( Stripper * stripper ) {
+  stripper->f_in_tag = 0;
+  stripper->f_closing = 0;
+  stripper->f_lastchar_slash = 0;
+
+  stripper->tagname[0] = 0;
+  stripper->p_tagname = stripper->tagname;
+  stripper->f_full_tagname = 0;
+
+  /* hack to stop a space being output on strings starting with a tag */
+  stripper->f_outputted_space = 1;
+  stripper->f_just_seen_tag = 0;
+    
+  stripper->f_in_quote = 0;
+  stripper->quote = 0;
+
+  stripper->f_in_decl = 0;
+  stripper->f_in_comment = 0;
+  stripper->f_lastchar_minus = 0;
+    
+  stripper->f_in_striptag = 0;
+  stripper->striptag[0] = 0;
+}
+
+/* clear_striptags - forget every strip tag: the count goes back to zero
+   and the first slot becomes the empty string. */
+void
+clear_striptags( Stripper * stripper ) {
+  stripper->numstriptags = 0;
+  stripper->o_striptags[0][0] = '\0';
+}
+
+/*
+ * add_striptag - append one tag name to the strip-tag list.
+ *
+ * The name is cut to MAX_TAGNAMELENGTH-1 characters so it always fits its
+ * slot (the previous strcpy() overran the slot for longer names).
+ * At most MAX_STRIPTAGS-1 tags are accepted; further ones are dropped with
+ * a message on stderr, which now reports that real limit.
+ */
+void
+add_striptag( Stripper * stripper, char * striptag ) {
+  if( stripper->numstriptags < MAX_STRIPTAGS-1 ) {
+    char * slot = stripper->o_striptags[stripper->numstriptags++];
+    strncpy( slot, striptag, MAX_TAGNAMELENGTH-1 );
+    /* strncpy() does not terminate the copy when the source is too long */
+    slot[MAX_TAGNAMELENGTH-1] = 0;
+  } else {
+    fprintf( stderr, "Cannot have more than %i strip tags", MAX_STRIPTAGS-1 );
+  }
+}
+
+
+/*
+ * check_end - look at one character inside a tag and close the tag if it
+ * is a '>'.
+ *
+ *   stripper  parser state to update.
+ *   end       the character just read.
+ */
+void
+check_end( Stripper * stripper, char end ) {
+  /* a slash may be the first half of "/>" - remember it and wait */
+  if( end == '/' ) {
+    stripper->f_lastchar_slash = 1;
+    return;
+  }
+
+  if( end == '>' ) {
+    /* the tag is over, so leave every in-tag state */
+    stripper->f_in_tag = 0;
+    stripper->f_in_decl = 0;
+    stripper->f_in_comment = 0;
+    stripper->f_in_quote = 0;
+    /* a strip tag that closes itself, e.g. '<script src="foo" />',
+       must not start a stripped block */
+    if( stripper->f_lastchar_slash &&
+        !strcasecmp( stripper->striptag, stripper->tagname ) ) {
+      stripper->f_in_striptag = 0;
+    }
+  }
+
+  /* any character other than '/' cancels a pending slash */
+  stripper->f_lastchar_slash = 0;
+}

Added: branches/upstream/libhtml-strip-perl/current/strip_html.h
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/strip_html.h?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/strip_html.h (added)
+++ branches/upstream/libhtml-strip-perl/current/strip_html.h Mon Dec 29 15:57:10 2008
@@ -1,0 +1,37 @@
+
+/* Size of the tag-name buffers below, terminating NUL included. */
+#define MAX_TAGNAMELENGTH 20
+/* Number of rows in the o_striptags table. */
+#define MAX_STRIPTAGS 20
+
+/* Parser state plus options for one HTML::Strip object. The f_* members
+   are flags, the o_* members are user-settable options. */
+typedef struct Stripper {
+  int f_in_tag;           /* between a '<' and the '>' that ends the tag */
+  int f_closing;          /* the current tag began with '/' */
+  int f_lastchar_slash;   /* last character seen by check_end() was '/' */
+
+  char tagname[MAX_TAGNAMELENGTH];   /* name of the tag being read */
+  char * p_tagname;                  /* next write position in tagname */
+  char f_full_tagname;               /* tagname is complete */
+
+  int f_outputted_space;  /* a space was just written, or the last text
+                             character was whitespace */
+  int f_just_seen_tag;    /* a tag began since the last character was output */
+
+  int f_in_quote;         /* inside a quoted string within a tag */
+  char quote;             /* the character that opened that quote */
+
+  int f_in_decl;          /* the current tag began with '!' */
+  int f_in_comment;       /* toggled by each "--" inside a declaration */
+  int f_lastchar_minus;   /* previous character in a declaration was '-' */
+
+  int f_in_striptag;      /* inside a block whose text is discarded */
+  char striptag[MAX_TAGNAMELENGTH];  /* name of the tag that opened the block */
+  char o_striptags[MAX_STRIPTAGS][MAX_TAGNAMELENGTH];  /* the strip-tag list */
+  int numstriptags;       /* number of entries used in o_striptags */
+  int o_emit_spaces;      /* non-zero: put a space where tags were removed */
+  int o_decode_entities;  /* stored for the Perl side; not read by the C code */
+} Stripper;
+
+/* Copy raw to clean with the markup removed; state is kept in stripper. */
+void strip_html( Stripper * stripper, const char * raw, char * clean );
+/* Return the parser flags to their initial state. */
+void reset( Stripper * stripper );
+/* Empty the strip-tag list. */
+void clear_striptags( Stripper * stripper );
+/* Append one tag name to the strip-tag list. */
+void add_striptag( Stripper * stripper, char * tag );
+
+/* Examine one character inside a tag; a '>' ends the tag. */
+void check_end( Stripper * stripper, char );

Added: branches/upstream/libhtml-strip-perl/current/test.pl
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/test.pl?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/test.pl (added)
+++ branches/upstream/libhtml-strip-perl/current/test.pl Mon Dec 29 15:57:10 2008
@@ -1,0 +1,91 @@
+# Before `make install' is performed this script should be runnable with
+# `make test'. After `make install' it should work as `perl test.pl'
+
+#########################
+
+# The plan must equal the number of ok()/skip() calls in this file. It used
+# to say 17 although 23 tests are run; keep it in step when adding tests.
+
+use strict;
+use warnings;
+
+use Test;
+BEGIN { plan tests => 23 };
+use HTML::Strip;
+ok(1); # If we made it this far, we're ok.
+
+#########################
+
+# Basic stripping with the default settings. eof() is called after each
+# document so that no state leaks into the next test.
+
+my $hs = HTML::Strip->new;
+
+ok( $hs->parse( 'test' ), 'test' );
+$hs->eof;
+
+ok( $hs->parse( '<em>test</em>' ), 'test' );
+$hs->eof;
+
+ok( $hs->parse( 'foo<br>bar' ), 'foo bar' );
+$hs->eof;
+
+ok( $hs->parse( '<p align="center">test</p>' ), 'test' );
+$hs->eof;
+
+# an unterminated quote swallows the rest of the input
+ok( $hs->parse( '<p align="center>test</p>' ), '' );
+$hs->eof;
+
+# state is kept between calls when eof() is not called
+ok( $hs->parse( '<foo>bar' ), 'bar' );
+ok( $hs->parse( '</foo>baz' ), ' baz' );
+$hs->eof;
+
+ok( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz' );
+$hs->eof;
+
+ok( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar' );
+$hs->eof;
+
+ok( $hs->parse( '<script>if (a<b && a>c)</script>bar' ), 'bar' );
+$hs->eof;
+
+ok( $hs->parse( '<# just data #>bar' ), 'bar' );
+$hs->eof;
+
+#ok( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar' );
+#$hs->eof;
+
+ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
+$hs->eof;
+
+# Entity decoding; skipped when HTML::Entities is not installed.
+my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available';
+skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '<foo>' );
+$hs->eof;
+skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '<foo>' );
+$hs->eof;
+$hs->set_decode_entities(0);
+skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '&#060;foo&#062;' );
+$hs->eof;
+skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '&lt;foo&gt;' );
+$hs->eof;
+
+
+# Strip tags are per object: changing them on $hs2 must not affect $hs.
+my $hs2 = HTML::Strip->new;
+$hs2->set_striptags( [ 'foo' ] );
+
+ok( $hs2->parse( '<script>foo</script>bar' ), 'foo bar' );
+$hs2->eof;
+
+ok( $hs2->parse( '<foo>foo</foo>bar' ), 'bar' );
+$hs2->eof;
+
+ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
+$hs->eof;
+
+# set_striptags() also accepts a plain list
+my @striptags = qw(baz quux);
+$hs->set_striptags( @striptags );
+
+ok( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar' );
+$hs->eof;
+
+ok( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar' );
+$hs->eof;
+
+ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), '   baz ' );
+$hs->eof;

Added: branches/upstream/libhtml-strip-perl/current/typemap
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libhtml-strip-perl/current/typemap?rev=28944&op=file
==============================================================================
--- branches/upstream/libhtml-strip-perl/current/typemap (added)
+++ branches/upstream/libhtml-strip-perl/current/typemap Mon Dec 29 15:57:10 2008
@@ -1,0 +1,15 @@
+# Maps the C type "Stripper *" to and from a blessed Perl reference.
+TYPEMAP
+Stripper * O_STRIP
+
+# Perl -> C: accept only a blessed reference whose referent is an SVt_PVMG
+# scalar and read the struct pointer from its integer value; otherwise warn
+# and return undef.
+INPUT
+O_STRIP
+	if( sv_isobject($arg) && (SvTYPE(SvRV($arg)) == SVt_PVMG) )
+		$var = ($type)SvIV((SV*)SvRV( $arg ));
+	else{
+		warn( \"${Package}::$func_name() -- $var is not a blessed SV reference\" );
+		XSRETURN_UNDEF;
+	}
+
+# C -> Perl: store the pointer in a new reference blessed into HTML::Strip.
+OUTPUT
+O_STRIP
+	sv_setref_pv( $arg, "HTML::Strip", (void*)$var );




More information about the Pkg-perl-cvs-commits mailing list