[Debtags-commits] [svn] r1412 - autodebtag/trunk/dbacl

Enrico Zini enrico at costa.debian.org
Thu Oct 27 18:31:55 UTC 2005


Author: enrico
Date: Thu Oct 27 18:31:54 2005
New Revision: 1412

Modified:
   autodebtag/trunk/dbacl/debtags-ai
Log:
Added proper commandline parsing


Modified: autodebtag/trunk/dbacl/debtags-ai
==============================================================================
--- autodebtag/trunk/dbacl/debtags-ai	(original)
+++ autodebtag/trunk/dbacl/debtags-ai	Thu Oct 27 18:31:54 2005
@@ -1,20 +1,25 @@
 #!/usr/bin/perl -w
 
+use Getopt::Long;
 use IPC::Open2;
 use strict;
 use warnings;
 
+##
+## Program runtime options
+##
+
 # 0: Be silent
 # 1: Print terse progress info
 # 2: Be annoying
-my $verbose = 1;
+my $verbose = -1;
 
 # Minimum cardinality a tag should have to be considered for testing when
 # generating patches
 my $tag_min_card = 800;
 
 # Matches tags we don't want to consider anyway
-my $tag_blacklist = qr/^special::/;
+my $tag_blacklist = "^special::";
 
 # Minimum percentage of a dbacl result to be considered good
 my $sure_perc = 85;
@@ -22,96 +27,153 @@
 # What should we read as the package cache
 my $pkgcache_source = "apt-cache dumpavail |";
 
+# Directory that contains the training files
+my $traindir = $ENV{HOME}.'/.debtags/debtags-ai';
+
+
+##
+## Function prototypes
+##
+
+sub main ();
 sub train_all ();
 sub read_apt ();
-sub patch_package ($);
+sub patch_all (@);
+
+
+##
+## Globally cached data
+##
 
 my %pkgdata;
 my $pkg_count = 1;
 my %tags;
 my @interesting_tags;
 
+##
+## Commandline parsing and function dispatching
+##
+
+main();
+exit 0;
+
 sub usage (;$)
 {
 	my ($status) = $_;
-	print STDERR "Usage: $0 train|patch\n";
+	print STDERR qq{Usage: $0 [options] train|patch
+Commands are:
+  train           Generates training data for dbacl.
+  patch [pkgs]    Generate a tag patch for the given packages (or all packages
+                  if none is given)
+Options are:
+  --help, -h      Print this help message.
+  --verbose, -v   Be verbose.  A number can be provided for more verbose
+                  levels.
+  --min-pkg=num   Minimum cardinality of a tag to be considered when
+                  generating patches.  Default: $tag_min_card.
+  --tag-bl=regex  Regexp matching tags not to consider when generating
+                  patches.  Default: "$tag_blacklist".
+  --min-sure=num  Minimum percentage dbacl should report for $0 to use the
+                  answer.  Default: $sure_perc.
+  --pkgdata=file  File (or command ending in |) to use to read the package
+                  database.  Default: "$pkgcache_source".
+  --datadir=dir   Directory where dbacl will store its training data.
+    or -d dir     Default: "$traindir".
+};
 	exit (defined $status ? $status : 1);
 }
 
-my $cmd = shift @ARGV or usage();
-
-if ($cmd eq 'train')
+sub main ()
 {
-	# Fixed random seed to have predictable output
-	srand(1);
+	my $want_help;
+	GetOptions(
+				"help|h" => \$want_help,
+				"verbose|v:i" => \$verbose,
+				"min-pkg=i" => \$tag_min_card,
+				"tag-bl=s" => \$tag_blacklist,
+				"min-sure=i" => \$sure_perc,
+				"pkgdata=s" => \$pkgcache_source,
+				"datadir|d=s" => \$traindir,
+	) or usage();
+
+	usage(0) if $want_help;
+
+	## Get the main command
+	my $cmd = shift @ARGV or usage();
+
+
+	## Fix the values after parsing
+
+	# if one just does -v, GetOptions sets it to 0
+	if ($verbose == 0) {
+		$verbose = 1;
+	} elsif ($verbose == -1) {
+		$verbose = 0;
+	}
 
-	read_apt();
-	train_all();
-} elsif ($cmd eq 'patch') {
-	read_apt();
+	# $tag_blacklist comes as a string
+	$tag_blacklist = qr/$tag_blacklist/;
 
-	# Filter out tags for which we know the bayesian doesn't do a good job
-	for my $tag (keys %tags)
+
+	## Run the main command
+
+	if ($cmd eq 'train')
 	{
-		push @interesting_tags, $tag 
-			if scalar keys %{$tags{$tag}} >= $tag_min_card
-			   and $tag !~ $tag_blacklist;
+		# Fixed random seed to have predictable output
+		srand(1);
+
+		read_apt();
+		train_all();
+	} elsif ($cmd eq 'patch') {
+		read_apt();
+		if (@ARGV) {
+			patch_all(@ARGV);
+		} else {
+			patch_all(keys %pkgdata);
+		}
+	} else {
+		usage();
 	}
+	exit 0;
+}
 
-	my %pkgs_done;
-	print STDERR "Generating patch.  Create a file called 'patch-stop' to interrupt (you can resume later).\n" if $verbose > 0;
-	if (-f 'patch-resume')
+##
+## General
+##
+
+# Read apt database
+sub read_apt ()
+{
+	open IN, "$pkgcache_source" or die "Can't read package cache from \"$pkgcache_source\": $!";
+	local $/ = "\n\n";
+	while (my $rec = <IN>)
 	{
-		# If asked to resume, read the partial patch from stdin
-		open IN, "patch-resume" or die "Can't open patch-resume: $!";
-		while (<IN>)
+		if ($rec =~ /^Package: (\S+).+Tag: (.+?)(?:\n|$)/so)
 		{
-			print;
-			if (/^(\S+): /)
+			my ($pkg, $tags) = ($1, $2);
+			$pkgdata{$pkg} = $rec;
+			if ($tags)
 			{
-				print STDERR "Reusing $1.\n" if $verbose > 0;
-				$pkgs_done{$1} = 1;
-			} else {
-				print STDERR "Could not parse $_\n";
+				for my $tag (split(', ', $tags))
+				{
+					$tags{$tag}{$pkg} = 1;
+				}
 			}
+			$pkg_count++;
 		}
-		close(IN);
-	}
-	my $count_total = scalar keys %pkgdata;
-	my $count_reallydone = 0;
-	my $count_done = scalar keys %pkgs_done;
-	my $time_used = 0;
-	for my $pkg (keys %pkgdata)
-	{
-		if (-f 'patch-stop')
-		{
-			print STDERR "Interrupted.  Put the partial results in a file called 'patch-resume' to resume the computation from this point.\n".
-			unlink ('patch-stop');
-			last;
-		}
-
-		$count_done++;
-		if (not exists $pkgs_done{$pkg})
+		elsif ($rec =~ /^Package: (\S+)/o)
 		{
-			#print STDERR "Doing: $pkg\n" if $verbose > 0;
-			my ($user, $system) = patch_package($pkg);
-			$time_used += $user + $system;
-			$count_reallydone++;
-			printf STDERR "$pkg done.  It took %f (user), %f (system), %f (total)\n", $user, $system, $user + $system
-				if ($verbose > 1);
-			if ($verbose == 1 && $count_reallydone % 20 == 0)
-			{
-				printf STDERR "Computing on %d tags. (%d/%d, %.3f seconds per package)\r",
-					scalar(@interesting_tags),
-					$count_done, $count_total, $time_used / $count_reallydone;
-			}
+			$pkgdata{$1} = $rec;
 		}
 	}
-	printf STDERR "\n" if $verbose == 1;
-} else {
-	usage();
+	close IN;
+	printf STDERR "%d packages and %d tags found.\n", $pkg_count, scalar(keys %tags) if $verbose > 0;
 }
-exit 0;
+
+
+##
+## Generating patches
+##
 
 # Get dbacl opinion about the likelyhood that $tag belongs to $pkg
 # Returns (bool, perc)
@@ -121,11 +183,11 @@
 sub testtag ($$)
 {
 	my ($pkg, $tag) = @_;
-	return (undef, undef) if not -f "data/$tag";
-	return (undef, undef) if not -f "data/not-$tag";
+	return (undef, undef) if not -f "$traindir/$tag";
+	return (undef, undef) if not -f "$traindir/not-$tag";
 
 	my($rdrfh, $wtrfh);
-	my $pid = open2($rdrfh, $wtrfh, "dbacl -m -U -c data/$tag -c data/not-$tag");
+	my $pid = open2($rdrfh, $wtrfh, "dbacl -m -U -c $traindir/$tag -c $traindir/not-$tag");
 	$pid or die "Can't invoke dbacl: $!";
 	print $wtrfh $pkgdata{$pkg};
 	close $wtrfh;
@@ -179,46 +241,82 @@
 	return $ela[0] + $ela[2], $ela[1] + $ela[3]
 }
 
+# Output a tag patch for all the packages, computed by dbacl
+sub patch_all (@)
+{
+	my @pkgs = @_;
 
+	# Filter out tags for which we know the bayesian doesn't do a good job
+	for my $tag (keys %tags)
+	{
+		push @interesting_tags, $tag 
+			if scalar keys %{$tags{$tag}} >= $tag_min_card
+			   and $tag !~ $tag_blacklist;
+	}
 
-
-
-
-# Read apt database
-sub read_apt ()
-{
-	open IN, "$pkgcache_source" or die "Can't read package cache from \"$pkgcache_source\": $!";
-	local $/ = "\n\n";
-	while (my $rec = <IN>)
+	my %pkgs_done;
+	print STDERR "Generating patch.  Create a file called 'patch-stop' to interrupt (you can resume later).\n" if $verbose > 0;
+	if (-f 'patch-resume')
 	{
-		if ($rec =~ /^Package: (\S+).+Tag: (.+?)(?:\n|$)/so)
+		# If asked to resume, read the partial patch from stdin
+		open IN, "patch-resume" or die "Can't open patch-resume: $!";
+		while (<IN>)
 		{
-			my ($pkg, $tags) = ($1, $2);
-			$pkgdata{$pkg} = $rec;
-			if ($tags)
+			print;
+			if (/^(\S+): /)
 			{
-				for my $tag (split(', ', $tags))
-				{
-					$tags{$tag}{$pkg} = 1;
-				}
+				print STDERR "Reusing $1.\n" if $verbose > 0;
+				$pkgs_done{$1} = 1;
+			} else {
+				print STDERR "Could not parse $_\n";
 			}
-			$pkg_count++;
 		}
-		elsif ($rec =~ /^Package: (\S+)/o)
+		close(IN);
+	}
+	my $count_total = scalar @pkgs;
+	my $count_reallydone = 0;
+	my $count_done = scalar keys %pkgs_done;
+	my $time_used = 0;
+	for my $pkg (@pkgs)
+	{
+		if (-f 'patch-stop')
 		{
-			$pkgdata{$1} = $rec;
+			print STDERR "Interrupted.  Put the partial results in a file called 'patch-resume' to resume the computation from this point.\n".
+			unlink ('patch-stop');
+			last;
+		}
+
+		$count_done++;
+		if (not exists $pkgs_done{$pkg})
+		{
+			#print STDERR "Doing: $pkg\n" if $verbose > 0;
+			my ($user, $system) = patch_package($pkg);
+			$time_used += $user + $system;
+			$count_reallydone++;
+			printf STDERR "$pkg done.  It took %f (user), %f (system), %f (total)\n", $user, $system, $user + $system
+				if ($verbose > 1);
+			if ($verbose == 1 && $count_reallydone % 20 == 0)
+			{
+				printf STDERR "Computing on %d tags. (%d/%d, %.3f seconds per package)\r",
+					scalar(@interesting_tags),
+					$count_done, $count_total, $time_used / $count_reallydone;
+			}
 		}
 	}
-	close IN;
-	printf STDERR "%d packages and %d tags found.\n", $pkg_count, scalar(keys %tags);
+	printf STDERR "\n" if $verbose == 1;
 }
 
+##
+## Training
+##
+
+
 # Train dbacl for one tag
 sub train_one ($$;$)
 {
 	my ($tag, $pkgs, $count) = @_;
 
-	open OUT, "| dbacl -h10 -H20 -l data/$tag" or die "Can't train for $tag: $!";
+	open OUT, "| dbacl -h10 -H20 -l $traindir/$tag" or die "Can't train for $tag: $!";
 	if (defined $count)
 	{
 		my $perc = $count / scalar(@$pkgs);
@@ -250,13 +348,18 @@
 {
 	print "Training.  Create a file called 'trainall-stop' to interrupt (you can resume later).\n";
 
+	if (not -d $traindir)
+	{
+		mkdir $traindir or die "Can't create directory $traindir: $!";
+	}
+
 	my $count = 0;
 	my $all = scalar keys %tags;
 	for my $tag (keys %tags)
 	{
 		$count++;
 		last if -f "trainall-stop";
-		next if -f "data/$tag";
+		next if -f "$traindir/$tag";
 
 		# Names of packages having the tag
 		my @with = keys %{$tags{$tag}};



More information about the Debtags-commits mailing list