[med-svn] r2341 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sun Jul 27 17:33:45 UTC 2008


Author: tille
Date: 2008-07-27 17:33:45 +0000 (Sun, 27 Jul 2008)
New Revision: 2341

Added:
   trunk/community/talks/200808_debconf8/archives.sql
Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Store messages in database to be flexible when trying to obtain stats.


Added: trunk/community/talks/200808_debconf8/archives.sql
===================================================================
--- trunk/community/talks/200808_debconf8/archives.sql	                        (rev 0)
+++ trunk/community/talks/200808_debconf8/archives.sql	2008-07-27 17:33:45 UTC (rev 2341)
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+createdb cddlistarchives
+
+psql cddlistarchives <<EOT
+
+BEGIN;
+
+CREATE TABLE listarchive (
+   project   text,
+   yearmonth date,
+   author    text,
+   subject   text,
+   url       text,
+   ts        date
+);
+
+COMMIT;
+EOT


Property changes on: trunk/community/talks/200808_debconf8/archives.sql
___________________________________________________________________
Name: svn:executable
   + *

Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-27 17:14:24 UTC (rev 2340)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-27 17:33:45 UTC (rev 2341)
@@ -4,11 +4,15 @@
 use LWP::UserAgent;
 use URI;
 use Cwd;
+use DBI;
 
 my $BASEURL  = "http://lists.debian.org/debian" ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES  = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
-my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster');
+my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster',
+                'Debian-med-request', 'Debian testing watch', 'Debian Bug Tracking System',
+                'Skolelinux archive Installer');
+my @SPAMAUTHORS = ('Pls check this new site');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -16,13 +20,28 @@
 my ($sec,$min,$hour,$day,$MONTHEND,$YEAREND,$wday,$yday,$isdst) = localtime(time);
 $MONTHEND++;
 $YEAREND +=1900;
+$day++;
+my $today = "$YEAREND-$MONTHEND-$day";
 
+my $dbname = 'cddlistarchives';
+my $dbh    = DBI->connect("dbi:Pg:dbname=$dbname");
+
 my $ua = LWP::UserAgent->new( agent => 'varbot');
 $ua->env_proxy;
 
 my $cdw = getcwd;
 my $project;
+my $insert = "INSERT INTO listarchive (project, yearmonth, author, subject, url, ts) VALUES (?, ?, ?, ?, ?, '$today')";
+my $datain = $dbh->prepare_cached($insert);
+my ( $robot, $robotflag );
+
 foreach $project (@PROJECTS) {
+    # Remove database entries for this project
+    my $query  = "DELETE FROM listarchive WHERE project = '$project'";
+    my($daten) = $dbh->prepare_cached($query);
+    $daten->execute() ;
+    $daten->finish() ;
+
     mkdir($project,0777);
     chdir($project);
     my $URL="${BASEURL}-${project}";
@@ -37,7 +56,8 @@
 	    my $datafile = "${year}-${month}" ;
 	    unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
 	    my $messagelines = 0;
-	    my $spamlines = 0;
+	    my $spamlines    = 0;
+	    my $robotlines   = 0;
 	    while ( $url =~ /.+/ ) { # if only one page $url is set to ''
 		# print "$year-$month: $url\n";
 		my $uri = URI->new($url);
@@ -50,7 +70,7 @@
 		    next;
 		} ; 
 		(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-		my ($content, $subject, $author, $messages, $pages, $page) ;
+		my ($content, $msgurl, $subject, $author, $messages, $pages, $page) ;
 		foreach $content (@data) {
 		    my @lines = split(/(\n)/, $content);
 		    # print "------> @lines\n" ;
@@ -67,7 +87,6 @@
 				# Append next line
 				$line = $linestart . $line;
 			    }
-			    print "DEBUG: Whole line is $line\n" ;
 			    $linestart = '';
 			}
 			if ( $line =~ /^\s*<\/?ul>\s*$/ || 
@@ -76,7 +95,8 @@
 			     $line =~ /^\s*<li><em>Message not available<\/em>/ ||
 			     $line =~ /<em>\(continued\)<\/em>\s*$/ ||
 			     $line =~ /^\s*$/) { next ; }
-			if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+			if ( ($msgurl, $subject, $author) = 
+                              $line =~ m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
 			    $_ = $subject ;
 			    $_ =~ s/^Re:\s*//i ;       # Remove Re:
 			    $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
@@ -86,8 +106,31 @@
 				print "Potential SPAM line - strange subject: $project $year-$month: $subject\n";
 				$spamlines++ ;
 			    } else {
-				print HTMLSNIP "$subject ; $author\n";
-				$messagelines++ ;
+				if ( $author =~ /^[&#x\d;\sA-F\?]+$/ ||
+                                     $author =~ /info/i ) { # never had a non-spam message from an author whose name contains info
+				    print "Potential SPAM line - strange author: $project $year-$month: $author\n";
+				    $spamlines++ ;
+				} else {
+				    if ( $author =~ /^Tille, Andreas$/ )    { $author = 'Andreas Tille'; }
+				    if ( $author =~ /Steffen M&#xF6;ller/ ) { $author = 'Steffen Moeller'; }
+				    $_ = $author;
+				    $_ = s/&#xF6;/ö/g ; 
+				    $_ = s/&#xFC;/ü/g ; 
+				    $robotflag = 0;
+				    foreach $robot (@ROBOTS) {
+					if ( $author =~ /$robot/ ) { # we are not interested in automatic mails
+					    $robotlines++ ;
+					    $robotflag = 1 ;
+					    last;
+					}
+				    }
+				    if ( $robotflag == 0 ) {
+					print HTMLSNIP "$subject ; $author\n";
+					$datain->execute($project, "$year-$month-01", $author, $subject,
+							 "${URL}/${year}/${month}/$msgurl") ;
+					$messagelines++ ;
+				    }
+				}
 			    }
 			} else {
 			    if ( ($messages, $page, $pages) = $line 
@@ -100,12 +143,11 @@
 				    $url = '';
 				}
 				print HTMLSNIP "$messages Messages (counted $messagelines)\n";
-				if ( $messages != $messagelines + $spamlines ) {
-				    print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+				if ( $messages != $messagelines + $spamlines + $robotlines ) {
+				    print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
 				}
 			    } else {
 				unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
-				    print "DEBUG: Continued line $line\n" ;
 				    $linestart = $line;
 				    ##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
 				} else {
@@ -127,3 +169,4 @@
     chdir($cdw);
 }
 
+$datain->finish;




More information about the debian-med-commit mailing list