[med-svn] r2403 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sun Aug 10 23:36:05 UTC 2008


Author: tille
Date: 2008-08-10 23:36:05 +0000 (Sun, 10 Aug 2008)
New Revision: 2403

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Try to include alioth lists.  Not working now and some Debugging output added.


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-08-10 01:25:03 UTC (rev 2402)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-08-10 23:36:05 UTC (rev 2403)
@@ -10,8 +10,13 @@
 my @PROJECTS = ('med', 'edu', 'jr', 'accessibility', 'desktop', 'enterprise', 'lex',
                 'nonprofit', 'science', 'custom',
                 'i18n', 'devel', 'project') ; # ... just for the sake of interest
+
+## DEBUG
+ at PROJECTS = ('enterprise'); # Just find a very short list while testing Alioth ...
+
 # Well, there is also interest in alioth lists ...
-my @ALIOTHPRJ= ('debichem-devel', 'pkg-grass-general') ;
+my $BASEALIOTH = 'http://lists.alioth.debian.org/pipermail/';
+my @ALIOTHPRJ  = ('debichem-devel', 'pkg-grass-general') ;
 
 ## http://lists.alioth.debian.org/pipermail/debichem-devel/2008-August/thread.html
 ## http://lists.alioth.debian.org/pipermail/pkg-grass-general/2008-July/thread.html
@@ -42,7 +47,7 @@
 
 # if != 0 then extract of mailing list archives is stored in files in dirs
 # The prefered method is to use only the database
-my $storefiles = 0;
+my $storefiles = 1; # Just store the files again for debugging issues of alioth lists
 
 # Debian-Devel starts in 1995
 my $YEARSTART = 1995;
@@ -65,7 +70,25 @@
 my $datain = $dbh->prepare_cached($insert);
 my ( $robot, $robotflag );
 
+my %ALLPROJECTS;
+
 foreach $project (@PROJECTS) {
+    $ALLPROJECTS{$project} = { 'url'     => "${BASEURL}-${project}",
+			       'type'    => 0 # == lists.debian.org
+    };
+}
+
+foreach $project (@ALIOTHPRJ) {
+    $ALLPROJECTS{$project} = { 'url'     => "${BASEALIOTH}/${project}",
+			       'type'    => 1 # == lists.alioth.debian.org
+    };
+}
+
+# foreach $project (keys %ALLPROJECTS) {
+#    print "$project: $ALLPROJECTS{$project}{'url'}, $ALLPROJECTS{$project}{'type'}\n"
+#}
+
+foreach $project (keys %ALLPROJECTS) {
     # Remove database entries for this project
     my $query  = "DELETE FROM listarchive WHERE project = '$project'";
     my($daten) = $dbh->prepare_cached($query);
@@ -76,15 +99,19 @@
 	mkdir($project,0777);
 	chdir($project);
     }
-    my $URL="${BASEURL}-${project}";
-    my $year;
-    my $month;
+    my $URL="$ALLPROJECTS{$project}{'url'}";
+    my ( $year, $month, $url, @data, @lines ) ;
+    my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) ;
     for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
 	foreach $month (@MONTHES) {
 	    if ( $year == $YEAREND && $month == $MONTHEND ) {
 		last;
 	    }
-	    my $url = "${URL}/${year}/${month}/";
+	    if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+		$url = "${URL}/${year}/${month}/";
+	    } else {
+		$url = "${URL}/${year}-$monthdict{$month}/";
+	    }
 	    my $datafile = "${year}-${month}" ;
 	    if ( $storefiles ) {
 		unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
@@ -93,7 +120,7 @@
 	    my $spamlines    = 0;
 	    my $robotlines   = 0;
 	    while ( $url =~ /.+/ ) { # if only one page $url is set to ''
-		# print "$year-$month: $url\n";
+		print "DEBUG: $year-$month: $url\n";
 		my $uri = URI->new($url);
 		my $indexpage = $ua->get($url, Host => $uri->host );
 		unless ( $indexpage->is_success ) { # some mailing lists startet later ...
@@ -103,12 +130,36 @@
 		    unlink($datafile);
 		    next;
 		} ; 
-		(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-		my ($content, $msgurl, $subject, $author, $messages, $pages, $page) ;
+		if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+		    # make sure the loop will end in case of Alioth lists.  Seems these list do
+		    # not feature more than one page per Month so there is no point in looping over them
+		    $url = '';
+		}
+		if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+		    @data = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+		} else {
+		    my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> <i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message date:</b></a>.*#gs;
+		    @data = ();
+		    foreach $content (@tmpdata) {
+			@lines = split(/(\n)/, $content);
+			foreach $line (@lines) {
+			    if ( $line =~ /^\s*$/   || $line =~ /^<!--\d+ / ||
+				 $line =~ /^<\/I>$/ || $line =~ /^<UL>$/    ||
+				 $line =~ /^<\/A><A NAME="\d+">&nbsp;<\/A>$/ ) { next ; }
+			    if ( $line =~ /^<LI><A HREF="\d+.html">\[[-\w]+\]/ ) {
+				@data = (@data, $line) ;
+			    } else {
+				@data = (@data, "$line\n" ) ;
+			    }
+			}
+		    }
+		    if ( $storefiles ) {
+			print HTMLSNIP "@data\n";
+		    }
+		}
 		foreach $content (@data) {
-		    my @lines = split(/(\n)/, $content);
+		    @lines = split(/(\n)/, $content);
 		    # print "------> @lines\n" ;
-		    my $line;
 		    my $linestart = '';
 		    foreach $line (@lines) {
 			if ( $line =~ /^\s*$/) { next ; }




More information about the debian-med-commit mailing list