r44445 - in /branches/upstream/libparse-mediawikidump-perl/current: ./ lib/Parse/ lib/Parse/MediaWikiDump/ t/

jawnsy-guest at users.alioth.debian.org
Sun Sep 20 19:28:33 UTC 2009


Author: jawnsy-guest
Date: Sun Sep 20 19:28:27 2009
New Revision: 44445

URL: http://svn.debian.org/wsvn/pkg-perl/?sc=1&rev=44445
Log:
[svn-upgrade] Integrating new upstream version, libparse-mediawikidump-perl (0.93)

Added:
    branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t   (with props)
Modified:
    branches/upstream/libparse-mediawikidump-perl/current/Changes
    branches/upstream/libparse-mediawikidump-perl/current/MANIFEST
    branches/upstream/libparse-mediawikidump-perl/current/META.yml
    branches/upstream/libparse-mediawikidump-perl/current/Makefile.PL
    branches/upstream/libparse-mediawikidump-perl/current/TODO
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/CategoryLinks.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Links.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Pages.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Revisions.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/XML.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/category_link.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/link.pm
    branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/page.pm
    branches/upstream/libparse-mediawikidump-perl/current/t/pages.t
    branches/upstream/libparse-mediawikidump-perl/current/t/pages_test.xml

Modified: branches/upstream/libparse-mediawikidump-perl/current/Changes
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/Changes?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/Changes (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/Changes Sun Sep 20 19:28:27 2009
@@ -1,4 +1,8 @@
 Revision history for Parse-MediaWikiDump
+
+0.93	Sep 15, 2009
+	* Made ::Pages a subclass of ::Revisions
+	* Discovered a regression: ::Pages and ::Revisions leak memory / are not properly garbage collected
 
 0.92	Apr 15, 2009
 	* Completed documentation for all modules

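Note: with ::Pages now a subclass of ::Revisions, both parsers share the same
next()-based iteration interface. A minimal usage sketch (the dump file name is
a placeholder; pages() is the factory method exercised by the new test below):

  use strict;
  use warnings;
  use Parse::MediaWikiDump;

  # pages() returns a Parse::MediaWikiDump::Pages instance, which now
  # inherits its iteration machinery from ::Revisions
  my $pages = Parse::MediaWikiDump->pages('pages_test.xml');

  while (defined(my $page = $pages->next)) {
      print $page->title, "\n";
  }
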
Modified: branches/upstream/libparse-mediawikidump-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/MANIFEST?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/MANIFEST (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/MANIFEST Sun Sep 20 19:28:27 2009
@@ -14,6 +14,7 @@
 t/revisions_test.xml
 t/revisions.t
 t/pre-factory.t
+t/pages-single-revision-only.t
 lib/Parse/MediaWikiDump/category_link.pm
 lib/Parse/MediaWikiDump/CategoryLinks.pm
 lib/Parse/MediaWikiDump/link.pm

Modified: branches/upstream/libparse-mediawikidump-perl/current/META.yml
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/META.yml?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/META.yml (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/META.yml Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 --- #YAML:1.0
 name:               Parse-MediaWikiDump
-version:            0.92
+version:            0.93
 abstract:           Tools to process MediaWiki dump files
 author:
     - Tyler Riddle <triddle at gmail.com>
@@ -11,6 +11,7 @@
 requires:
     List::Util:         0
     Object::Destroyer:  0
+    Test::Exception:    0
     Test::More:         0
     XML::Parser:        0
 no_index:

Modified: branches/upstream/libparse-mediawikidump-perl/current/Makefile.PL
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/Makefile.PL?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/Makefile.PL (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/Makefile.PL Sun Sep 20 19:28:27 2009
@@ -14,6 +14,7 @@
     'XML::Parser' => 0,
     'List::Util' => 0,
     'Object::Destroyer' => 0,
+    'Test::Exception' => 0,
     },
     dist                => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
     clean               => { FILES => 'Parse-MediaWikiDump-*' },

Modified: branches/upstream/libparse-mediawikidump-perl/current/TODO
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/TODO?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/TODO (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/TODO Sun Sep 20 19:28:27 2009
@@ -1,6 +1,3 @@
-  * Make tests for Pages as thorough as tests for Revisions
-  * Update revisions to make it easier to subclass and change the behavior 
-  * Port Pages over to being a subclass of Revisions that enforces only a single revision 
-    per article
+  * Fix memory leak bug
   * Investigate if using pop for removing parsed items from the buffer will make Pages and Revisions faster; 
     if so, add an option for such

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump.pm Sun Sep 20 19:28:27 2009
@@ -1,5 +1,5 @@
 package Parse::MediaWikiDump;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use Parse::MediaWikiDump::XML;
 use Parse::MediaWikiDump::Revisions;

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/CategoryLinks.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/CategoryLinks.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/CategoryLinks.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/CategoryLinks.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::CategoryLinks;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use strict;
 use warnings;

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Links.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Links.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Links.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Links.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::Links;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use strict;
 use warnings;

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Pages.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Pages.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Pages.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Pages.pm Sun Sep 20 19:28:27 2009
@@ -1,746 +1,87 @@
 package Parse::MediaWikiDump::Pages;
 
-our $VERSION = '0.92';
-
-#This parser works by placing all of the start, text, and end events into
-#a buffer as they come out of XML::Parser. On each call to page() the function
-#checks for a complete article in the buffer and calls for XML::Parser to add
-#more tokens if a complete article is not found. Once a complete article is 
-#found it is removed from the buffer, parsed, and an instance of the page
-#object is returned. 
-
-use 5.8.0;
+our $VERSION = '0.93';
+
+use base qw(Parse::MediaWikiDump::Revisions);
 
 use strict;
 use warnings;
-use List::Util;
-use XML::Parser;
 use Carp;
 
-sub new {
-	my ($class, $source) = @_;
-	my $self = {};
-	my $parser_state = {}; #Hash::NoRef->new;
-
-	bless ($self, $class);
-
-	$$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8');
-	$$self{PARSER}->setHandlers('Start', \&start_handler,
-					'End', \&end_handler);
-
-	$$self{GOOD_TAGS} = make_good_tags();
-	$$self{BUFFER} = []; 
-	$$self{CHUNK_SIZE} = 32768;
-	$$self{BUF_LIMIT} = 10000;
-	$$self{BYTE} = 0;
-
-	$parser_state->{GOOD_TAGS} = $$self{GOOD_TAGS};
-	$parser_state->{BUFFER} = $$self{BUFFER};
-
-	my $expat_bb = $$self{PARSER}->parse_start(state => $parser_state);
-	$$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done');
-
-	$self->open($source);
-	$self->init;
-
-	return $self;
+sub new_accumulator_engine {
+	my ($self) = @_;
+	my $f = Parse::MediaWikiDump::XML::Accumulator->new;
+	my $store_siteinfo = $self->{SITEINFO};
+	my $store_page = $self->{PAGE_LIST};
+	
+	my $root = $f->root;
+	my $mediawiki = $f->node('mediawiki', Start => \&validate_mediawiki_node);
+	
+	#stuff for siteinfo
+	my $siteinfo = $f->node('siteinfo', End => sub { %$store_siteinfo = %{ $_[1] } } );
+	my $sitename = $f->textcapture('sitename');
+	my $base = $f->textcapture('base');
+	my $generator = $f->textcapture('generator');
+	my $case = $f->textcapture('case');
+	my $namespaces = $f->node('namespaces', Start => sub { $_[1]->{namespaces} = []; } );
+	my $namespace = $f->node('namespace', Character => \&save_namespace_node);
+	
+	#stuff for page entries
+	my $page = $f->node('page', Start => sub { $_[0]->accumulator( {} ) } );
+	my $title = $f->textcapture('title');
+	my $id = $f->textcapture('id');
+	my $revision = $f->node('revision', 
+		Start => sub { $_[1]->{minor} = 0 }, 
+		End => sub { 
+			if (defined($_[1]->{seen_revision})) {
+				die "only one revision per page is allowed\n";
+			}
+			
+			$_[1]->{seen_revision} = 1;
+			
+			push(@$store_page, { %{ $_[1] } } );
+		} );
+	my $rev_id = $f->textcapture('id', 'revision_id');
+	my $minor = $f->node('minor', Start => sub { $_[1]->{minor} = 1 } );
+	my $time = $f->textcapture('timestamp');
+	my $contributor = $f->node('contributor');
+	my $username = $f->textcapture('username');
+	my $ip = $f->textcapture('ip');
+	my $contrib_id = $f->textcapture('id', 'userid');
+	my $comment = $f->textcapture('comment');
+	my $text = $f->textcapture('text');
+	my $restr = $f->textcapture('restrictions');
+	
+	#put together the tree
+	$siteinfo->add_child($sitename, $base, $generator, $case, $namespaces);
+	  $namespaces->add_child($namespace);
+	
+	$page->add_child($title, $id, $revision, $restr);
+	  $revision->add_child($rev_id, $time, $contributor, $minor, $comment, $text);
+	    $contributor->add_child($username, $ip, $contrib_id);
+	
+	$mediawiki->add_child($siteinfo, $page);
+	$root->add_child($mediawiki);
+	
+	my $engine = $f->engine($root, {});
+
+	return $engine;	
 }
 
-sub next {
-	my ($self) = @_;
-	my $buffer = $$self{BUFFER};
-	my $offset;
-	my @page;
-
-	#look through the contents of our buffer for a complete article; fill
-	#the buffer with more data if an entire article is not there
-	while(1) {
-		$offset = $self->search_buffer('/page');
-		last if $offset != -1;
-
-		#indicates EOF
-		return undef unless $self->parse_more;
-	}
-
-	#remove the entire page from the buffer
-	@page = splice(@$buffer, 0, $offset + 1);
-
-	if ($page[0][0] ne 'page') {
-		$self->dump($buffer);
-		die "expected <page>; got " . token2text($page[0]);
-	}
-
-	my $data = $self->parse_page(\@page);
-
-	return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR}, 
-		$$self{HEAD}{CASE}, $$self{HEAD}{namespaces});
+sub validate_mediawiki_node {
+	my ($engine, $a, $element, $attrs) = @_;
+	die "Only version 0.3 dump files are supported" unless $attrs->{version} eq '0.3';
 }
 
-#outputs a nicely formatted representation of the tokens on the buffer specified
-sub dump {
-	my ($self, $buffer) = @_;
-	my $offset = 0;
-
-	if (! defined($buffer)) {
-		$buffer = $$self{BUFFER};
-	}
-
-	foreach my $i (0 .. $#$buffer) {
-		my $token = $$buffer[$i];
-
-		print STDERR "$i ";
-
-		if (substr($$token[0], 0, 1) ne '/') {
-			my $attr = $$token[1];
-			print STDERR "  " x $offset;
-			print STDERR "START $$token[0] ";
-
-			foreach my $key (sort(keys(%$attr))) {
-				print STDERR "$key=\"$$attr{$key}\" ";
-			}
-
-			print STDERR "\n";
-			$offset++;
-		} elsif (ref $token eq 'ARRAY') {
-			$offset--;
-			print STDERR "  " x $offset;
-			print STDERR "END $$token[0]\n";
-		} elsif (ref $token eq 'SCALAR') {
-			my $ref = $token;
-			print STDERR "  " x $offset;
-			print STDERR "TEXT ";
-
-			my $len = length($$ref);
-
-			if ($len < 50) {
-				print STDERR "'$$ref'\n";
-			} else {
-				print STDERR "$len characters\n";
-			}
-		}
-	}
-	
-	return 1;
+sub save_namespace_node {
+	my ($parser, $accum, $text, $element, $attrs) = @_;
+	my $key = $attrs->{key};
+	my $namespaces = $accum->{namespaces};
+	
+	push(@{ $accum->{namespaces} }, [$key, $text] );
 }
 
-sub sitename {
-	my ($self) = @_;
-	return $$self{HEAD}{sitename};
-}
-
-sub base {
-	my ($self) = @_;
-	return $$self{HEAD}{base};
-}
-
-sub generator {
-	my ($self) = @_;
-	return $$self{HEAD}{generator};
-}
-
-sub case {
-	my ($self) = @_;
-	return $$self{HEAD}{case};
-}
-
-sub namespaces {
-	my ($self) = @_;
-	return $$self{HEAD}{namespaces};
-}
-
-sub namespaces_names {
-	my $self = shift;
-	return $$self{HEAD}{namespaces_names};
-}
-
-sub current_byte {
-	my ($self) = @_;
-	return $$self{BYTE};
-}
-
-sub size {
-	my ($self) = @_;
-	
-	return undef unless defined $$self{SOURCE_FILE};
-
-	my @stat = stat($$self{SOURCE_FILE});
-
-	return $stat[7];
-}
-
-#deprecated backwards-compatibility methods
-
-#replaced by next()
-sub page {
-	my ($self) = @_;
-	
-	carp("the page() method is deprecated and is going away in the future, use next() instead");
-	
-	return $self->next(@_);
-}
-
-#private functions with OO interface
-sub open {
-	my ($self, $source) = @_;
-
-	if (ref($source) eq 'GLOB') {
-		$$self{SOURCE} = $source;
-	} else {
-		if (! open($$self{SOURCE}, $source)) {
-			die "could not open $source: $!";
-		}
-
-		$$self{SOURCE_FILE} = $source;
-	}
-
-	binmode($$self{SOURCE}, ':utf8');
-
-	return 1;
-}
-
-sub init {
-	my ($self) = @_;
-	my $offset;
-	my @head;
-
-	#parse more XML until the entire siteinfo section is in the buffer
-	while(1) {
-		die "could not init" unless $self->parse_more;
-
-		$offset = $self->search_buffer('/siteinfo');
-
-		last if $offset != -1;
-	}
-
-	#pull the siteinfo section out of the buffer
-	@head = splice(@{$$self{BUFFER}}, 0, $offset + 1);
-
-	$self->parse_head(\@head);
-
-	return 1;
-}
-
-#feed data into expat and have it put more tokens onto the buffer
-sub parse_more {
-	my ($self) = @_;
-	my $buf;
-
-	my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE});
-
-	if (! defined($read)) {
-		die "error during read: $!";
-	} elsif ($read == 0) {
-		$$self{FINISHED} = 1;
-		$$self{EXPAT} = undef; #Object::Destroyer invokes parse_done()
-		return 0;
-	}
-
-	$$self{BYTE} += $read;
-	$$self{EXPAT}->parse_more($buf);
-
-	my $buflen = scalar(@{$$self{BUFFER}});
-
-	die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless
-		$buflen < $$self{BUF_LIMIT};
-
-	return 1;
-}
-
-#searches through a buffer for a specified token
-sub search_buffer {
-	my ($self, $search, $list) = @_;
-
-	$list = $$self{BUFFER} unless defined $list;
-
-	return -1 if scalar(@$list) == 0;
-
-	foreach my $i (0 .. $#$list) {
-		return $i if ref $$list[$i] eq 'ARRAY' && $list->[$i][0] eq $search;
-	}
-
-	return -1;
-}
-
-#this function is very frightening :-( 
-#a better alternative would be to have each part of the stack handled by a 
-#function that handles all the logic for that specific node in the tree
-sub parse_head {
-	my ($self, $buffer) = @_;
-	my $state = 'start';
-	my %data = (
-		namespaces			=> [],
-		namespaces_names	=> [],
-	);
-
-	for (my $i = 0; $i <= $#$buffer; $i++) {
-		my $token = $$buffer[$i];
-
-		if ($state eq 'start') {
-			my $version;
-			die "$i: expected <mediawiki> got " . token2text($token) unless
-				$$token[0] eq 'mediawiki';
-
-			die "$i: version is a required attribute" unless
-				defined($version = $$token[1]->{version});
-
-			die "$i: version $version unsupported" unless $version eq '0.3';
-
-			$token = $$buffer[++$i];
-
-			die "$i: expected <siteinfo> got " . token2text($token) unless
-				$$token[0] eq 'siteinfo';
-
-			$state = 'in_siteinfo';
-		} elsif ($state eq 'in_siteinfo') {
-			if ($$token[0] eq 'namespaces') {
-				$state = 'in_namespaces';
-				next;
-			} elsif ($$token[0] eq '/siteinfo') {
-				last;
-			} elsif ($$token[0] eq 'sitename') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					die "$i: expected TEXT but got " . token2text($token);
-				}
-
-				$data{sitename} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/sitename') {
-					die "$i: expected </sitename> but got " . token2text($token);
-				}
-			} elsif ($$token[0] eq 'base') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT but got " . token2text($token);
-				}
-
-				$data{base} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/base') {
-					$self->dump($buffer);
-					die "$i: expected </base> but got " . token2text($token);
-				}
-
-			} elsif ($$token[0] eq 'generator') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT but got " . token2text($token);
-				}
-
-				$data{generator} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/generator') {
-					$self->dump($buffer);
-					die "$i: expected </generator> but got " . token2text($token);
-				}
-
-			} elsif ($$token[0] eq 'case') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected </case> but got " . token2text($token);
-				}
-
-				$data{case} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/case') {
-					$self->dump($buffer);
-					die "$i: expected </case> but got " . token2text($token);
-				}
-			}
-
-		} elsif ($state eq 'in_namespaces') {
-			my $key;
-			my $name;
-
-			if ($$token[0] eq '/namespaces') {
-				$state = 'in_siteinfo';
-				next;
-			} 
-
-			if ($$token[0] ne 'namespace') {
-				die "$i: expected <namespace> or </namespaces>; got " . token2text($token);
-			}
-
-			die "$i: key is a required attribute" unless
-				defined($key = $$token[1]->{key});
-
-			$token = $$buffer[++$i];
-
-			#the default namespace has no text associated with it
-			if (ref $token eq 'SCALAR') {
-				$name = $$token;
-			} elsif ($$token[0] eq '/namespace') {
-				$name = '';
-				$i--; #move back one for below
-			} else {
-				die "$i: should never happen";	
-			}
-
-			push(@{$data{namespaces}}, [$key, $name]);
-			push(@{$data{namespaces_names}}, $name);
-
-			$token = $$buffer[++$i];
-
-			if ($$token[0] ne '/namespace') {
-				$self->dump($buffer);
-				die "$i: expected </namespace> but got " . token2text($token);
-			}
-
-		} else {
-			die "$i: unknown state '$state'";
-		}
-	}
-
-	$$self{HEAD} = \%data;
-
-	#locate the anchor that indicates what looks like a link is really a 
-	#category assignment ([[foo]] vs [[Category:foo]])
-	#fix for bug #16616
-	foreach my $ns (@{$data{namespaces}}) {
-		#namespace 14 is the category namespace
-		if ($$ns[0] == 14) {
-			$$self{CATEGORY_ANCHOR} = $$ns[1];
-			last;
-		}
-	}
-
-	if (! defined($$self{CATEGORY_ANCHOR})) {
-		die "Could not locate category indicator in namespace definitions";
-	}
-
-	return 1;
-}
-
-#this function is very frightening :-(
-#see the parse_head function comments for thoughts on improving these
-#awful functions
-sub parse_page {
-	my ($self, $buffer) = @_;
-	my %data;
-	my $state = 'start';
-
-	for (my $i = 0; $i <= $#$buffer; $i++) {
-		my $token = $$buffer[$i];
-
-
-		if ($state eq 'start') {
-			if ($$token[0] ne 'page') {
-				$self->dump($buffer);
-				die "$i: expected <page>; got " . token2text($token);
-			}
-
-			$state = 'in_page';
-		} elsif ($state eq 'in_page') {
-			next unless ref $token eq 'ARRAY';
-			if ($$token[0] eq 'revision') {
-				$state = 'in_revision';
-				next;
-			} elsif ($$token[0] eq '/page') {
-				last;
-			} elsif ($$token[0] eq 'title') {
-				$token = $$buffer[++$i];
-
-				if (ref $token eq 'ARRAY' && $$token[0] eq '/title') {
-					$data{title} = '';
-					next;
-				}
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{title} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/title') {
-					$self->dump($buffer);
-					die "$i: expected </title>; got " . token2text($token);
-				}
-			} elsif ($$token[0] eq 'id') {
-				$token = $$buffer[++$i];
-	
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{id} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/id') {
-					$self->dump($buffer);
-					die "$i: expected </id>; got " . token2text($token);
-				}
-			}
-		} elsif ($state eq 'in_revision') {
-			if ($$token[0] eq '/revision') {
-				#If a comprehensive dump file is parsed
-				#it can cause uncontrolled stack growth and the
-				#parser only returns one revision out of
-				#all revisions - if we run into a 
-				#comprehensive dump file, indicated by more
-				#than one <revision> section inside a <page>
-				#section then die with a message
-
-				#just peeking ahead, don't want to update
-				#the index
-				$token = $$buffer[$i + 1];
-
-				if ($$token[0] eq 'revision') {
-					die "unable to properly parse comprehensive dump files";
-				}
-
-				$state = 'in_page';
-				next;	
-			} elsif ($$token[0] eq 'contributor') {
-				$state = 'in_contributor';
-				next;
-			} elsif ($$token[0] eq 'id') {
-				$token = $$buffer[++$i];
-	
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{revision_id} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/id') {
-					$self->dump($buffer);
-					die "$i: expected </id>; got " . token2text($token);
-				}
-
-			} elsif ($$token[0] eq 'timestamp') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{timestamp} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/timestamp') {
-					$self->dump($buffer);
-					die "$i: expected </timestamp>; got " . token2text($token);
-				}
-			} elsif ($$token[0] eq 'minor') {
-				$data{minor} = 1;
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/minor') {
-					$self->dump($buffer);
-					die "$i: expected </minor>; got " . token2text($token);
-				}
-			} elsif ($$token[0] eq 'comment') {
-				$token = $$buffer[++$i];
-
-				#account for possible null-text 
-				if (ref $token eq 'ARRAY' && $$token[0] eq '/comment') {
-					$data{comment} = '';
-					next;
-				}
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{comment} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/comment') {
-					$self->dump($buffer);
-					die "$i: expected </comment>; got " . token2text($token);
-				}
-
-			} elsif ($$token[0] eq 'text') {
-				my $token = $$buffer[++$i];
-
-				if (ref $token eq 'ARRAY' && $$token[0] eq '/text') {
-					$data{text} = '';
-					next;
-				} elsif (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expected TEXT; got " . token2text($token);
-				}
-
-				$data{text} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/text') {
-					$self->dump($buffer);
-					die "$i: expected </text>; got " . token2text($token);
-				}
-			
-			}
-
-		} elsif ($state eq 'in_contributor') {
-			next unless ref $token eq 'ARRAY';
-			if ($$token[0] eq '/contributor') {
-				$state = 'in_revision';
-				next;
-			} elsif (ref $token eq 'ARRAY' && $$token[0] eq 'username') {
-				$token = $$buffer[++$i];
-
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expecting TEXT; got " . token2text($token);
-				}
-
-				$data{username} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/username') {
-					$self->dump($buffer);
-					die "$i: expected </username>; got " . token2text($token);
-				}
-
-			} elsif ($$token[0] eq 'id') {
-				$token = $$buffer[++$i];
-				
-				if (ref $token ne 'SCALAR') {
-					$self->dump($buffer);
-					die "$i: expecting TEXT; got " . token2text($token);
-				}
-
-				$data{userid} = $$token;
-
-				$token = $$buffer[++$i];
-
-				if ($$token[0] ne '/id') {
-					$self->dump($buffer);
-					die "$i: expecting </id>; got " . token2text($token);
-				}
-			}
-		} else {
-			die "unknown state: $state";
-		}
-	}
-
-	$data{namespace} = '';
-	# Many pages just have a : in the title, but it's not necessarily
-	# a namespace designation.
-	if ($data{title} =~ m/^([^:]+)\:/) {
-		my $possible_namespace = $1;
-		if (List::Util::first { $_ eq $possible_namespace }
-			@{ $self->namespaces_names() })
-		{
-			$data{namespace} = $possible_namespace;
-		}
-	}
-
-	$data{minor} = 0 unless defined($data{minor});
-
-	return \%data;
-}
-
-#private functions without an OO interface
-sub make_good_tags {
-	return {
-		sitename => 1,
-		base => 1,
-		generator => 1,
-		case => 1,
-		namespace => 1,
-		title => 1,
-		id => 1,
-		timestamp => 1,
-		username => 1,
-		comment => 1,
-		text => 1
-	};
-}
-
-sub token2text {
-	my ($token) = @_;
-
-	if (ref $token eq 'ARRAY') {
-		return "<$$token[0]>";
-	} elsif (ref $token eq 'SCALAR') {
-		return "!text_token!";
-	} else {
-		return "!unknown!";
-	}
-}
-
-#this function is where the majority of time is spent in this software
-#sub token_compare {
-#	my ($toke1, $toke2) = @_;
-#
-#	foreach my $i (0 .. $#$toke2) {
-#		if ($$toke1[$i] ne $$toke2[$i]) {
-#			return 0;
-#		}
-#	}
-#
-#	return 1;
-#}
-
-sub start_handler {
-	my ($p, $tag, %atts) = @_;	
-	my $self = $p->{state};
-	my $good_tags = $$self{GOOD_TAGS};
-
-	push @{ $$self{BUFFER} }, [$tag, \%atts];
-
-	if (defined($good_tags->{$tag})) {
-		$p->setHandlers(Char => \&char_handler);
-	}
-
-	return 1;
-}
-
-sub end_handler {
-	my ($p, $tag) = @_;
-	my $self = $p->{state};
-
-	push @{ $$self{BUFFER} }, ["/$tag"];
-
-	$p->setHandlers(Char => undef);
-	
-	return 1;
-}
-
-sub char_handler {
-	my ($p, $chars) = @_;
-	my $self = $p->{state};
-	my $buffer = $$self{BUFFER};
-	my $curent = $$buffer[-1];
-
-	if (ref $curent eq 'SCALAR') {
-		$$curent .= $chars;
-	} elsif (substr($$curent[0], 0, 1) ne '/') {
-		push(@$buffer, \$chars);
-	} 
-
-	return 1;
-}
+
 
 1;
 
@@ -946,3 +287,9 @@
     }
   }
 
+=head1 LIMITATIONS
+
+=head2 Memory Leak
+
+This class does not perform proper garbage collection at destruction and will leak memory like crazy if 
+multiple instances of it are created inside one Perl script. 

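Note: the single-revision check above raises a plain die() with the message
"only one revision per page is allowed", so callers that might be handed a
comprehensive (multi-revision) dump can trap it with eval. A sketch, assuming
the pages() factory used in the tests (file name is a placeholder):

  use Parse::MediaWikiDump;

  my $pages = Parse::MediaWikiDump->pages('dump.xml');

  while (1) {
      my $page = eval { $pages->next };

      if (my $err = $@) {
          # more than one <revision> inside a <page>: this dump needs
          # the ::Revisions parser instead
          die "comprehensive dump detected\n"
              if $err =~ /only one revision per page is allowed/;
          die $err;
      }

      last unless defined $page;
      # process $page here
  }
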
Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Revisions.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Revisions.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Revisions.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/Revisions.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::Revisions;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use 5.8.0;
 
@@ -28,6 +28,7 @@
 	$self->open($source);
 	$self->init;
 	
+	#return Object::Destroyer($self, 'cleanup');
 	return $self;
 }
 
@@ -105,14 +106,16 @@
 
 #private functions with OO interface
 
-#sub cleanup {
-#	my ($self) = @_;
-#	
-#	warn "executing cleanup";
-#	
-##	$self->{EXPAT} = undef;	
-##	$self->{XML} = undef;
-#}
+sub cleanup {
+	my ($self) = @_;
+	
+	warn "executing cleanup";
+	
+	$self->{EXPAT}->setHandlers(Init => undef, Final => undef, Start => undef, 
+		End => undef, Char => undef);
+	$self->{EXPAT}->parse_done;	
+	#$self->{XML} = undef;
+}
 
 sub open {
 	my ($self, $source) = @_;
@@ -137,12 +140,13 @@
 	
 	$self->{XML} = $self->new_accumulator_engine;
 	my $expat_bb = $$self{XML}->parser->parse_start();
-	$$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done');
+	#$$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done'); #causes exceptions not to be thrown
+	$$self{EXPAT} = $expat_bb;
 	
 	#load the information from the siteinfo section so it is available before
 	#someone calls ->next
 	while(1) {
-		if (scalar(@{$self->{PAGE_LIST}}) > 1) {
+		if (scalar(@{$self->{PAGE_LIST}}) > 0) {
 			last;
 		}	
 		
@@ -394,3 +398,10 @@
   
     return $title;
   }
+  
+=head1 LIMITATIONS
+
+=head2 Memory Leak
+
+This class does not perform proper garbage collection at destruction and will leak memory like crazy if 
+multiple instances of it are created inside one Perl script. 

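Note: until the leak documented above is fixed, the practical mitigation is to
avoid creating many parser instances inside one process. The cleanup() method
added in this revision tears down the Expat handlers by hand; it is an internal
method (and currently warns "executing cleanup"), so treat this as a hedged
workaround rather than supported API:

  use Parse::MediaWikiDump;

  # assuming revisions() mirrors the pages() factory used in the tests
  my $revisions = Parse::MediaWikiDump->revisions('dump.xml');

  while (defined(my $rev = $revisions->next)) {
      # process $rev here
  }

  # explicit teardown of the XML::Parser::ExpatNB handlers
  $revisions->cleanup;
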
Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/XML.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/XML.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/XML.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/XML.pm Sun Sep 20 19:28:27 2009
@@ -2,7 +2,7 @@
 #testing is done and documentation is written
 package Parse::MediaWikiDump::XML::Accumulator;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use warnings;
 use strict;
@@ -64,6 +64,7 @@
 	$self->{node_stack} = [ $root ];
 	
 	return Object::Destroyer->new($self, 'cleanup');
+	#return $self;
 }
 
 sub cleanup {
@@ -76,10 +77,12 @@
 sub init_parser {
 	my ($self) = @_;
 	
+	#warn "init_parser called";
+	
 	my $parser = XML::Parser->new(
 		Handlers => {
-			Init => sub { handle_init_event($self, @_) },
-			Final => sub { handle_final_event($self, @_) },
+			#Init => sub { handle_init_event($self, @_) },
+			#Final => sub { handle_final_event($self, @_) },
 			Start => sub { handle_start_event($self, @_) },
 			End => sub { handle_end_event($self, @_) },
 			Char => sub { handle_char_event($self, @_) },
@@ -120,7 +123,7 @@
 	my $element_stack = $self->{element_stack};
 	my $node = $self->node;
 	my $matched = $node->{children}->{$element};
-	my $handler;
+	my $handler; 
 	
 	if (! defined($matched)) {
 		die "fatal error - no match for element $element";

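Note: the accumulator changes above back the node/textcapture API that the new
Pages.pm builds its handler tree from. A minimal sketch of that API with
made-up element names (the real tree lives in new_accumulator_engine):

  use Parse::MediaWikiDump;  # loads Parse::MediaWikiDump::XML

  my $f    = Parse::MediaWikiDump::XML::Accumulator->new;
  my $root = $f->root;

  # a node fires Start/End/Character callbacks; a textcapture stores an
  # element's text into the accumulator hash under the given key
  my $item = $f->node('item', End => sub { print $_[1]->{name}, "\n" });
  my $name = $f->textcapture('name');

  $item->add_child($name);
  $root->add_child($item);

  # engine() pairs the tree with an initial accumulator; parser() hands
  # back the underlying XML::Parser instance
  my $engine = $f->engine($root, {});
  $engine->parser->parse('<item><name>example</name></item>');
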
Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/category_link.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/category_link.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/category_link.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/category_link.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::category_link;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 #you must pass in a fully populated link array reference
 sub new {

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/link.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/link.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/link.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/link.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::link;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 #you must pass in a fully populated link array reference
 sub new {

Modified: branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/page.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/page.pm?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/page.pm (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/lib/Parse/MediaWikiDump/page.pm Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 package Parse::MediaWikiDump::page;
 
-our $VERSION = '0.92';
+our $VERSION = '0.93';
 
 use strict;
 use warnings;

Added: branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t?rev=44445&op=file
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t (added)
+++ branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t Sun Sep 20 19:28:27 2009
@@ -1,0 +1,20 @@
+#!perl -w
+
+use strict;
+use warnings;
+
+use Test::Exception tests => 1;
+use Parse::MediaWikiDump;
+
+my $file = 't/revisions_test.xml';
+
+throws_ok { test() } qr/^only one revision per page is allowed$/, 'one revision per article ok';
+
+sub test {	
+	my $pages = Parse::MediaWikiDump->pages($file);
+	
+	while(defined($pages->next)) { };
+};
+
+
+

Propchange: branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t
------------------------------------------------------------------------------
    svn:executable = *

Modified: branches/upstream/libparse-mediawikidump-perl/current/t/pages.t
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/t/pages.t?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/t/pages.t (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/t/pages.t Sun Sep 20 19:28:27 2009
@@ -1,6 +1,6 @@
 #!perl -w
 
-use Test::Simple tests => 82;
+use Test::Simple tests => 94;
 use strict;
 use Parse::MediaWikiDump;
 
@@ -43,7 +43,7 @@
 	ok($pages->current_byte != 0);
 	
 	if ($mode eq 'file') {
-		ok($pages->size == 2872);
+		ok($pages->size == 2874);
 	} elsif ($mode eq 'handle') {
 		ok(! defined($pages->size))
 	} else {
@@ -53,24 +53,32 @@
 	
 	ok($page->title eq 'Talk:Title Test Value');
 	ok($page->id == 1);
-	ok($page->timestamp eq '2005-07-09T18:41:10Z');
+	ok($page->revision_id == 47084);
 	ok($page->username eq 'Username Test Value');
 	ok($page->userid == 1292);
+	ok($page->timestamp eq '2005-07-09T18:41:10Z');
+	ok($page->userid == 1292);
+	ok($page->minor);
 	ok($$text eq "Text Test Value\n");
 	ok($page->namespace eq 'Talk');
+	ok(! defined($page->redirect));
 	ok(! defined($page->categories));
 }
 
 sub test_two {
 	my $page = $pages->next;
+	my $text = $page->text;
 
-	ok(defined($page));
-	ok($page->redirect eq 'fooooo');
 	ok($page->title eq 'Title Test Value #2');
 	ok($page->id == 2);
+	ok($page->revision_id eq '47085');
+	ok($page->username eq 'Username Test Value 2');
 	ok($page->timestamp eq '2005-07-09T18:41:10Z');
-	ok($page->username eq 'Username Test Value');
 	ok($page->userid == 1292);
+	ok($page->minor);
+	ok($$text eq "#redirect : [[fooooo]]\n");
+	ok($page->namespace eq '');
+	ok($page->redirect eq 'fooooo');
 	ok(! defined($page->categories));
 }
 
@@ -78,13 +86,12 @@
 	my $page = $pages->next;
 
 	ok(defined($page));
-	ok($page->redirect eq 'fooooo');
+	ok($page->redirect);
 	ok($page->title eq 'Title Test Value #3');
 	ok($page->id == 3);
 	ok($page->timestamp eq '2005-07-09T18:41:10Z');
 	ok($page->username eq 'Username Test Value');
 	ok($page->userid == 1292);
-	ok(! defined($page->categories));
 }
 
 sub test_four {

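Note: the expanded tests above also pin down the per-revision accessors on the
page object. The accessor surface, as exercised by the test (sketch only):

  my $page = $pages->next;

  printf "%s (id %d, rev %s) by %s at %s%s\n",
      $page->title, $page->id, $page->revision_id,
      $page->username, $page->timestamp,
      $page->minor ? ' [minor]' : '';

  # redirect() returns the target for redirect pages, undef otherwise
  my $target = $page->redirect;
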
Modified: branches/upstream/libparse-mediawikidump-perl/current/t/pages_test.xml
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libparse-mediawikidump-perl/current/t/pages_test.xml?rev=44445&op=diff
==============================================================================
--- branches/upstream/libparse-mediawikidump-perl/current/t/pages_test.xml (original)
+++ branches/upstream/libparse-mediawikidump-perl/current/t/pages_test.xml Sun Sep 20 19:28:27 2009
@@ -45,7 +45,7 @@
     <revision>
       <id>47085</id>
       <timestamp>2005-07-09T18:41:10Z</timestamp>
-      <contributor><username>Username Test Value</username><id>1292</id></contributor>
+      <contributor><username>Username Test Value 2</username><id>1292</id></contributor>
       <minor/>
       <comment>Comment Test Value</comment>
       <text xml:space="preserve">#redirect : [[fooooo]]
