[Debtags-commits] [svn] r1412 - autodebtag/trunk/dbacl
Enrico Zini
enrico at costa.debian.org
Thu Oct 27 18:31:55 UTC 2005
Author: enrico
Date: Thu Oct 27 18:31:54 2005
New Revision: 1412
Modified:
autodebtag/trunk/dbacl/debtags-ai
Log:
Added proper commandline parsing
Modified: autodebtag/trunk/dbacl/debtags-ai
==============================================================================
--- autodebtag/trunk/dbacl/debtags-ai (original)
+++ autodebtag/trunk/dbacl/debtags-ai Thu Oct 27 18:31:54 2005
@@ -1,20 +1,25 @@
#!/usr/bin/perl -w
+use Getopt::Long;
use IPC::Open2;
use strict;
use warnings;
+##
+## Program runtime options
+##
+
# 0: Be silent
# 1: Print terse progress info
# 2: Be annoying
-my $verbose = 1;
+my $verbose = -1;
# Minimum cardinality a tag should have to be considered for testing when
# generating patches
my $tag_min_card = 800;
# Matches tags we don't want to consider anyway
-my $tag_blacklist = qr/^special::/;
+my $tag_blacklist = "^special::";
# Minimum percentage of a dbacl result to be considered good
my $sure_perc = 85;
@@ -22,96 +27,153 @@
# What should we read as the package cache
my $pkgcache_source = "apt-cache dumpavail |";
+# Directory that contains the training files
+my $traindir = $ENV{HOME}.'/.debtags/debtags-ai';
+
+
+##
+## Function prototypes
+##
+
+sub main ();
sub train_all ();
sub read_apt ();
-sub patch_package ($);
+sub patch_all (@);
+
+
+##
+## Globally cached data
+##
my %pkgdata;
my $pkg_count = 1;
my %tags;
my @interesting_tags;
+##
+## Commandline parsing and function dispatching
+##
+
+main();
+exit 0;
+
sub usage (;$)
{
my ($status) = $_;
- print STDERR "Usage: $0 train|patch\n";
+ print STDERR qq{Usage: $0 [options] train|patch
+Commands are:
+ train Generates training data for dbacl.
+ patch [pkgs] Generate a tag patch for the given packages (or all packages
+ if none is given)
+Options are:
+ --help, -h Print this help message.
+ --verbose, -v Be verbose. A number can be provided for more verbose
+ levels.
+ --min-pkg=num Minimum cardinality of a tag to be considered when
+ generating patches. Default: $tag_min_card.
+ --tag-bl=regex Regexp matching tags not to consider when generating
+ patches. Default: "$tag_blacklist".
+ --min-sure=num Minimum percentage dbacl should report for $0 to use the
+ answer. Default: $sure_perc.
+ --pkgdata=file File (or command ending in |) to use to read the package
+ database. Default: "$pkgcache_source".
+ --datadir=dir Directory where dbacl will store its training data.
+ or -d dir Default: "$traindir".
+};
exit (defined $status ? $status : 1);
}
-my $cmd = shift @ARGV or usage();
-
-if ($cmd eq 'train')
+sub main ()
{
- # Fixed random seed to have predictable output
- srand(1);
+ my $want_help;
+ GetOptions(
+ "help|h" => \$want_help,
+ "verbose|v:i" => \$verbose,
+ "min-pkg=i" => \$tag_min_card,
+ "tag-bl=s" => \$tag_blacklist,
+ "min-sure=i" => \$sure_perc,
+ "pkgdata=s" => \$pkgcache_source,
+ "datadir|d=s" => \$traindir,
+ ) or usage();
+
+ usage(0) if $want_help;
+
+ ## Get the main command
+ my $cmd = shift @ARGV or usage();
+
+
+ ## Fix the values after parsing
+
+ # if one just does -v, GetOptions sets it to 0
+ if ($verbose == 0) {
+ $verbose = 1;
+ } elsif ($verbose == -1) {
+ $verbose = 0;
+ }
- read_apt();
- train_all();
-} elsif ($cmd eq 'patch') {
- read_apt();
+ # $tag_blacklist comes as a string
+ $tag_blacklist = qr/$tag_blacklist/;
- # Filter out tags for which we know the bayesian doesn't do a good job
- for my $tag (keys %tags)
+
+ ## Run the main command
+
+ if ($cmd eq 'train')
{
- push @interesting_tags, $tag
- if scalar keys %{$tags{$tag}} >= $tag_min_card
- and $tag !~ $tag_blacklist;
+ # Fixed random seed to have predictable output
+ srand(1);
+
+ read_apt();
+ train_all();
+ } elsif ($cmd eq 'patch') {
+ read_apt();
+ if (@ARGV) {
+ patch_all(@ARGV);
+ } else {
+ patch_all(keys %pkgdata);
+ }
+ } else {
+ usage();
}
+ exit 0;
+}
- my %pkgs_done;
- print STDERR "Generating patch. Create a file called 'patch-stop' to interrupt (you can resume later).\n" if $verbose > 0;
- if (-f 'patch-resume')
+##
+## General
+##
+
+# Read apt database
+sub read_apt ()
+{
+ open IN, "$pkgcache_source" or die "Can't read package cache from \"$pkgcache_source\": $!";
+ local $/ = "\n\n";
+ while (my $rec = <IN>)
{
- # If asked to resume, read the partial patch from stdin
- open IN, "patch-resume" or die "Can't open patch-resume: $!";
- while (<IN>)
+ if ($rec =~ /^Package: (\S+).+Tag: (.+?)(?:\n|$)/so)
{
- print;
- if (/^(\S+): /)
+ my ($pkg, $tags) = ($1, $2);
+ $pkgdata{$pkg} = $rec;
+ if ($tags)
{
- print STDERR "Reusing $1.\n" if $verbose > 0;
- $pkgs_done{$1} = 1;
- } else {
- print STDERR "Could not parse $_\n";
+ for my $tag (split(', ', $tags))
+ {
+ $tags{$tag}{$pkg} = 1;
+ }
}
+ $pkg_count++;
}
- close(IN);
- }
- my $count_total = scalar keys %pkgdata;
- my $count_reallydone = 0;
- my $count_done = scalar keys %pkgs_done;
- my $time_used = 0;
- for my $pkg (keys %pkgdata)
- {
- if (-f 'patch-stop')
- {
- print STDERR "Interrupted. Put the partial results in a file called 'patch-resume' to resume the computation from this point.\n".
- unlink ('patch-stop');
- last;
- }
-
- $count_done++;
- if (not exists $pkgs_done{$pkg})
+ elsif ($rec =~ /^Package: (\S+)/o)
{
- #print STDERR "Doing: $pkg\n" if $verbose > 0;
- my ($user, $system) = patch_package($pkg);
- $time_used += $user + $system;
- $count_reallydone++;
- printf STDERR "$pkg done. It took %f (user), %f (system), %f (total)\n", $user, $system, $user + $system
- if ($verbose > 1);
- if ($verbose == 1 && $count_reallydone % 20 == 0)
- {
- printf STDERR "Computing on %d tags. (%d/%d, %.3f seconds per package)\r",
- scalar(@interesting_tags),
- $count_done, $count_total, $time_used / $count_reallydone;
- }
+ $pkgdata{$1} = $rec;
}
}
- printf STDERR "\n" if $verbose == 1;
-} else {
- usage();
+ close IN;
+ printf STDERR "%d packages and %d tags found.\n", $pkg_count, scalar(keys %tags) if $verbose > 0;
}
-exit 0;
+
+
+##
+## Generating patches
+##
# Get dbacl opinion about the likelyhood that $tag belongs to $pkg
# Returns (bool, perc)
@@ -121,11 +183,11 @@
sub testtag ($$)
{
my ($pkg, $tag) = @_;
- return (undef, undef) if not -f "data/$tag";
- return (undef, undef) if not -f "data/not-$tag";
+ return (undef, undef) if not -f "$traindir/$tag";
+ return (undef, undef) if not -f "$traindir/not-$tag";
my($rdrfh, $wtrfh);
- my $pid = open2($rdrfh, $wtrfh, "dbacl -m -U -c data/$tag -c data/not-$tag");
+ my $pid = open2($rdrfh, $wtrfh, "dbacl -m -U -c $traindir/$tag -c $traindir/not-$tag");
$pid or die "Can't invoke dbacl: $!";
print $wtrfh $pkgdata{$pkg};
close $wtrfh;
@@ -179,46 +241,82 @@
return $ela[0] + $ela[2], $ela[1] + $ela[3]
}
+# Output a tag patch for all the packages, computed by dbacl
+sub patch_all (@)
+{
+ my @pkgs = @_;
+ # Filter out tags for which we know the bayesian doesn't do a good job
+ for my $tag (keys %tags)
+ {
+ push @interesting_tags, $tag
+ if scalar keys %{$tags{$tag}} >= $tag_min_card
+ and $tag !~ $tag_blacklist;
+ }
-
-
-
-# Read apt database
-sub read_apt ()
-{
- open IN, "$pkgcache_source" or die "Can't read package cache from \"$pkgcache_source\": $!";
- local $/ = "\n\n";
- while (my $rec = <IN>)
+ my %pkgs_done;
+ print STDERR "Generating patch. Create a file called 'patch-stop' to interrupt (you can resume later).\n" if $verbose > 0;
+ if (-f 'patch-resume')
{
- if ($rec =~ /^Package: (\S+).+Tag: (.+?)(?:\n|$)/so)
+ # If asked to resume, read the partial patch from stdin
+ open IN, "patch-resume" or die "Can't open patch-resume: $!";
+ while (<IN>)
{
- my ($pkg, $tags) = ($1, $2);
- $pkgdata{$pkg} = $rec;
- if ($tags)
+ print;
+ if (/^(\S+): /)
{
- for my $tag (split(', ', $tags))
- {
- $tags{$tag}{$pkg} = 1;
- }
+ print STDERR "Reusing $1.\n" if $verbose > 0;
+ $pkgs_done{$1} = 1;
+ } else {
+ print STDERR "Could not parse $_\n";
}
- $pkg_count++;
}
- elsif ($rec =~ /^Package: (\S+)/o)
+ close(IN);
+ }
+ my $count_total = scalar @pkgs;
+ my $count_reallydone = 0;
+ my $count_done = scalar keys %pkgs_done;
+ my $time_used = 0;
+ for my $pkg (@pkgs)
+ {
+ if (-f 'patch-stop')
{
- $pkgdata{$1} = $rec;
+ print STDERR "Interrupted. Put the partial results in a file called 'patch-resume' to resume the computation from this point.\n".
+ unlink ('patch-stop');
+ last;
+ }
+
+ $count_done++;
+ if (not exists $pkgs_done{$pkg})
+ {
+ #print STDERR "Doing: $pkg\n" if $verbose > 0;
+ my ($user, $system) = patch_package($pkg);
+ $time_used += $user + $system;
+ $count_reallydone++;
+ printf STDERR "$pkg done. It took %f (user), %f (system), %f (total)\n", $user, $system, $user + $system
+ if ($verbose > 1);
+ if ($verbose == 1 && $count_reallydone % 20 == 0)
+ {
+ printf STDERR "Computing on %d tags. (%d/%d, %.3f seconds per package)\r",
+ scalar(@interesting_tags),
+ $count_done, $count_total, $time_used / $count_reallydone;
+ }
}
}
- close IN;
- printf STDERR "%d packages and %d tags found.\n", $pkg_count, scalar(keys %tags);
+ printf STDERR "\n" if $verbose == 1;
}
+##
+## Training
+##
+
+
# Train dbacl for one tag
sub train_one ($$;$)
{
my ($tag, $pkgs, $count) = @_;
- open OUT, "| dbacl -h10 -H20 -l data/$tag" or die "Can't train for $tag: $!";
+ open OUT, "| dbacl -h10 -H20 -l $traindir/$tag" or die "Can't train for $tag: $!";
if (defined $count)
{
my $perc = $count / scalar(@$pkgs);
@@ -250,13 +348,18 @@
{
print "Training. Create a file called 'trainall-stop' to interrupt (you can resume later).\n";
+ if (not -d $traindir)
+ {
+ mkdir $traindir or die "Can't create directory $traindir: $!";
+ }
+
my $count = 0;
my $all = scalar keys %tags;
for my $tag (keys %tags)
{
$count++;
last if -f "trainall-stop";
- next if -f "data/$tag";
+ next if -f "$traindir/$tag";
# Names of packages having the tag
my @with = keys %{$tags{$tag}};
More information about the Debtags-commits
mailing list