[Debtags-commits] [svn] r1355 - autodebtag/trunk/ai-tagger

Benjamin Mesing bmesing-guest at costa.debian.org
Thu Sep 15 17:29:19 UTC 2005


Author: bmesing-guest
Date: Thu Sep 15 17:29:17 2005
New Revision: 1355

Added:
   autodebtag/trunk/ai-tagger/TestPackage.pl   (contents, props changed)
   autodebtag/trunk/ai-tagger/Tools.pm
Modified:
   autodebtag/trunk/ai-tagger/BayesianAccess.pm
   autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm
   autodebtag/trunk/ai-tagger/NamedEntryDB.pm
   autodebtag/trunk/ai-tagger/Package.pm
   autodebtag/trunk/ai-tagger/TestBayesianAccess.pl
   autodebtag/trunk/ai-tagger/TokenDB.pm
   autodebtag/trunk/ai-tagger/bayesian-tagger.pl
   autodebtag/trunk/ai-tagger/create-data.pl
   autodebtag/trunk/ai-tagger/perform-test-help.pl
   autodebtag/trunk/ai-tagger/perform-test.pl
Log:
- added test-all functionality to bayesian tagger
- don't use libapt-perl any more, now we are using libparseapt


Modified: autodebtag/trunk/ai-tagger/BayesianAccess.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/BayesianAccess.pm	(original)
+++ autodebtag/trunk/ai-tagger/BayesianAccess.pm	Thu Sep 15 17:29:17 2005
@@ -30,11 +30,13 @@
 );
 
 
-=item C<hasDatabase> I<tag> I<basedir>
+=item hasDatabase I<tag> I<basedir>
 
 This checks if a database is available for the given tag.
-For this it searches the directory for the given tag in the 
-basedir (non recursive).
+It searches the directory for the given tag in the 
+basedir (non recursive), and returns true if the directory 
+exists.
+
 
 B<Parameter>
 

Modified: autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm	(original)
+++ autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm	Thu Sep 15 17:29:17 2005
@@ -289,7 +289,7 @@
 	return $this->{goodDb}->addEntry($entryName, $_[2]);
 }
 
-=item C<removeGoodEntry> I<entryName> I<entry>
+=item removeGoodEntry I<entryName>
 
 Removes the given entry from the good ones.
 
@@ -307,11 +307,11 @@
 
 
 
-=item C<addBadEntry> I<entryName> I<entry>
+=item addBadEntry I<entryName>
 
 Adds the given entry to the bad ones.
 
-B<Returns> @returns if the adding was successfull, i.e. either
+B<Returns> if the adding was successfull, i.e. either
 the allowDuplicate option was set, or the object
 was not in the database before
 
@@ -325,7 +325,7 @@
 }
 
 
-=item C<removeBadEntry> I<entryName> I<entry>
+=item removeBadEntry I<entryName>
 
 Removes the given entry from the bad ones.
 
@@ -342,16 +342,28 @@
 }
 
 
-## Tests if the given entry is good or bad.
-##
-## @param entry the entry to be tested
-## @returns two values: if the entry was bad(0), good(1) or unsure (-1) 
-## as first, and the $posterior 
-## as second value
+=item testEntry I<entry>
+
+Tests if the given text is categorized as  good or bad.
+
+B<Returns> two values, first if the entry was bad (0), good (1) 
+or unsure (-1), and second the posterior of the calculation.
+
+B<Parameter>
+
+=over 6
+
+=item entry
+
+The entry to be tested
+
+=back 
+
+=cut
+
 sub testEntry
 {
-	my $this = shift @_;
-	my ($entry) = @_;
+	my ($this, $entry) = @_;
 	my $tokens = $this->{tokenizer}->tokenize($entry);
 	my $posterior = $this->calculatePosterior($tokens);
 	print "Good posterior: ".$posterior."\n" if ($main::verboseOption >= 2);
@@ -371,7 +383,7 @@
 	my %set;
 	foreach my $token (@$tokens)
 	{
-		# if the token was allready used
+		# if the token was already used
 		next if ( exists($set{$token}) );
 		$set{$token} = 1;	# mark this token as used
 		# TODO we can spare the token name here, but for debug purposes we keep them

Modified: autodebtag/trunk/ai-tagger/NamedEntryDB.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/NamedEntryDB.pm	(original)
+++ autodebtag/trunk/ai-tagger/NamedEntryDB.pm	Thu Sep 15 17:29:17 2005
@@ -52,10 +52,10 @@
 sub DESTROY {
 	my $this = shift;
 	# check for an overridden destructor...
-	$this->SUPER::DESTROY if $this->can("SUPER::DESTROY");
+#	$this->SUPER::DESTROY if $this->can("SUPER::DESTROY");
 	# now do your own thing before or after
 	
-	print STDERR "destroying NamedEntryDB\n";
+#	print STDERR "destroying NamedEntryDB\n";
 	$this->{nameCountDbObject} = undef;
 	 untie %{$this->{nameCount}};
 }

Modified: autodebtag/trunk/ai-tagger/Package.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/Package.pm	(original)
+++ autodebtag/trunk/ai-tagger/Package.pm	Thu Sep 15 17:29:17 2005
@@ -13,17 +13,38 @@
 use warnings;
 use strict;
 
+use YAML;
+use IO::File;
+use Parse::Debian::Packages;
 
-my $cache;
+# this maps each package to a map, mapping the section to its string
+my %cache;
 
 require Exporter;
 @Tokenizer::ISA = qw(Exporter);
+ at Tools::EXPORT = qw (
+	splitVersionPackages
+);
+
 
 1;
 
+
 sub init
 {
-	$cache = AptPkg::Cache->new();
+
+	my $fh = IO::File->new("apt-cache dumpavail |");
+
+	my $parser = Parse::Debian::Packages->new( $fh );
+	while (my %package = $parser->next) 
+	{
+		$cache{$package{Package}} = \%package;
+	}
+}
+
+sub destroy
+{
+	%cache = undef;
 }
 
 
@@ -33,7 +54,10 @@
 	my ($name) = @_;
 	my $instance = {	name => $name,
 							description => undef,
+							# a hash mapping the packages this package is related to, to the type
+							# of dependency ( "Depends", "PreDepends", "Suggests", "Recommends", "Conflicts", "Replaces" or "Obsoletes")
 							depends => undef,
+							# holds an array of the virtual package names provided by this package
 							provides => undef,
 							tags => undef,
 							maintainer => undef,
@@ -85,7 +109,7 @@
 
 ## Returns the depends for the given package mapped to the type of
 ## dependencies (Depend, PreDepend,...) as reference and a second reference
-## to the provides of the package.
+## to an array, holding the provides of the package.
 ##
 ## This value is lazy initialized, i.e. after being requested
 ## once, it will be accessible fast (and consume you precious memory).
@@ -99,12 +123,17 @@
 	return ( $this->{depends}, $this->{provides} );
 }
 
-## Returns the depends for the given package mapped to the type of
-## dependencies (Depend, PreDepend,...) as reference and a second reference
-## to the provides of the package.
-##
-## This value is lazy initialized, i.e. after being requested
-## once, it will be accessible fast (and consume you precious memory).
+
+
+=item C<getMaintainer>
+
+Returns the maintainer of the package.
+
+This value is lazy initialized, i.e. after being requested
+once, it will be accessible fast (and consume you precious memory).
+
+=cut
+
 sub getMaintainer
 {
 	my ($this) = @_;
@@ -121,8 +150,10 @@
 
 This gets the Debian section the package is assigned to.
 
-=cut
+This value is lazy initialized, i.e. after being requested
+once, it will be accessible fast (and consume you precious memory).
 
+=cut
 sub getSection
 {
 	my ($this) = @_;
@@ -143,51 +174,73 @@
 	$this->{maintainer} = "";
 	$this->{section} = "";
 	$this->{description} = "";
+		
+# 	# begin remove block
+# 	$this->{maintainer} = "ksajadsg";
+# 	$this->{section} = "sec";
+# 	$this->{description} = "snglkfj sdg sfdlg fsdlg dflsjeowtf dgkljsdg ert osdg eorit dfsgiue goidsf gljdfg erojg odsfji gfdgj erojt eroj oitj odfgjpewwer#tüp kplkdkgm osgkmöeroktitr0etitdskf-g,öfljktoiwtudfkgj flkgf hfdsjks gfd";
+# 	my %depends = ();
+# 	my @provides = ();
+# 	$this->{depends} = \%depends;
+# 	$this->{provides} = \@provides;
+# 	return;
+# 	# end remove block
+	
 	my %depends = ();
 	my @provides = ();
-	my $package = $cache->{$this->{name}};
+	my $package = $cache{$this->{name}};
 	if ($package)
 	{
-		# returns a reference to an array of Version packages
-		my $versionList = $package->{VersionList};
-		if ( $versionList && scalar( @$versionList) )
+		$this->{maintainer} = $package->{Maintainer};
+		$this->{section} = $package->{Section};
+		# the description entry contains the short description, the body entry
+		# the long one
+		$this->{description} = $package->{Description}."\n".$package->{body};
+		if (my $provides = $package->{Provides})
 		{
-			# take the first entry from the list (which should be the newest version)
-			# iterate over all dependencies 
-			# brr... I hate perls oo syntax, you get really mad with it!
-			my $version = ${@{$versionList}}[0];
-			if ($version->{DependsList})
-			{
-				foreach my $depend ( @{$version->{DependsList}} )
-				{
-					$depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
-				}
-			}
-			if ($version->{ProvidesList})
+			push @provides, split /\s*,\s*/, $provides;
+		}
+		foreach my $dependCategory ("PreDepends" , "Depends" , "PreDepends" , "Suggests" , "Recommends", "Conflicts", "Replaces", "Obsoletes")
+		{
+			my $entry = $package->{$dependCategory};
+			if ($entry)
 			{
-				foreach my $provide ( @{$version->{ProvidesList}} )
+				my @packages = splitVersionPackages($entry);
+				foreach (@packages)
 				{
-					push @provides, $provide->{Name};
+					$depends{$_} = $dependCategory;
 				}
 			}
-			# this dies if the package is not known, so keep it here
-			# were the existence was already checked
-			my $pkgRecord = $cache->packages()->lookup($this->{name});
-			if ($pkgRecord)
-			{
-				$this->{maintainer} = ${%{$pkgRecord}}{Maintainer};
-				$this->{section} = ${%{$pkgRecord}}{Section};
-				$this->{description} = ${%{$pkgRecord}}{LongDesc};
-			}
-		}
-		else
-		{	
-			print "WARNING: No information available for package $this->{name}\n" 
-				if ($main::verboseOption);
-			$this->{maintainer} = "";
-			$this->{section} = "";
-			$this->{description} = "";
 		}
+# 		# returns a reference to an array of Version packages
+# 		my $versionList = $package->{VersionList};
+# 		if ( $versionList && scalar( @$versionList) )
+# 		{
+# 			# take the first entry from the list (which should be the newest version)
+# 			# iterate over all dependencies 
+# 			# brr... I hate perls oo syntax, you get really mad with it!
+# 			my $version = ${@{$versionList}}[0];
+# 			if ($version->{DependsList})
+# 			{
+# 				foreach my $depend ( @{$version->{DependsList}} )
+# 				{
+# 					$depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
+# 				}
+# 			}
+# 			if ($version->{ProvidesList})
+# 			{
+# 				foreach my $provide ( @{$version->{ProvidesList}} )
+# 				{
+# 					push @provides, $provide->{Name};
+# 				}
+# 			}
+# 			# this dies if the package is not known, so keep it here
+# 			# were the existence was already checked
+# 			my $pkgRecord = $cache->packages()->lookup($this->{name});
+# 			if ($pkgRecord)
+# 			{
+# 			}
+# 		}
 	}
 	else
 	{	
@@ -201,4 +254,20 @@
 	$this->{provides} = \@provides;
 }
 
+=item C<splitVersionPackages>
+
+Splits the version list of packages as seen in the package database.
+
+Hand the list as a string, returns the list of the package names only 
+as a list.
+
+=cut
+sub splitVersionPackages
+{
+	my ($string) = @_;
+	# dismiss everything in () because this contains versions
+	$string =~ s/\s*\([^\)*]*\)\s*//g;
+	return split /\s*\|\s*|\s*,\s*/, $string; # split on ',' and '|'
+}
+
 __END__
\ No newline at end of file

Modified: autodebtag/trunk/ai-tagger/TestBayesianAccess.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/TestBayesianAccess.pl	(original)
+++ autodebtag/trunk/ai-tagger/TestBayesianAccess.pl	Thu Sep 15 17:29:17 2005
@@ -1,4 +1,5 @@
 #!/usr/bin/perl -w
+# Test case for BayesienAccess class
 
 use strict;
 use warnings;

Modified: autodebtag/trunk/ai-tagger/TokenDB.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/TokenDB.pm	(original)
+++ autodebtag/trunk/ai-tagger/TokenDB.pm	Thu Sep 15 17:29:17 2005
@@ -1,3 +1,17 @@
+=head1 DESCRIPTION
+
+This class mapps tokens to the number of occurences. There
+are the getTokens, addToken(s), removeToken(s) and the tokenCount
+methods available.
+
+The data is held in the DB file handed on construction.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
 package TokenDB;
 
 use warnings;
@@ -13,7 +27,7 @@
 1;
 
 ## This creates a database of simple string tokens mapped to
-## the number of ocurrences.
+## the number of occurences.
 ## 
 ## @param $filename the name of the file of this database
 sub new
@@ -33,7 +47,7 @@
 
 sub DESTROY
 {
-	print STDERR " destroying TokenDB\n";
+#	print STDERR " destroying TokenDB\n";
 	my ($this) = @_;
 	untie %{$this->{dbFile}};
 }

Modified: autodebtag/trunk/ai-tagger/bayesian-tagger.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/bayesian-tagger.pl	(original)
+++ autodebtag/trunk/ai-tagger/bayesian-tagger.pl	Thu Sep 15 17:29:17 2005
@@ -1,13 +1,76 @@
 #!/usr/bin/perl -w
 
 
+=head1 bayesian-tagger.pl
 
-=head1 DESCRIPTION
+bayesian-tagger.pl - tags or test for a given tag
+
+=head1 SYNOPSIS
+
+bayesian-tagger.pl [options] tag
+
+=head1 OPTIONS
+
+=over 6
+
+=item B<--help, -h> 
+
+Prints a brief help message and exits.
+
+=item B<--verbose, -v>    
+
+Toggle verbose mode, this may be given more than once to be even more verbose.
+
+=item B<--quiet, -q>      
+
+Run quiet (not fully supported).
+
+       
+=item B<--train, -t>      
+
+Train the tag-database with the traindata.
+
+=item B<--add-good, -g> I<packageName> 
+                     
+Add the given package as good package.
+
+=item B<--add-bad, -b> I<packageName>
+                     
+Add the given package as bad package.
 
+=item B<--remove-good, -G> I<packageName>
+                     
+Remove the given package from the good ones.
 
-=head1 METHODS
+=item B<--remove-bad, -B> I<packageName>
 
-=over 4
+Remove the given package from the bad ones.
+
+
+=item B<--perform-test, -o>
+
+Test the tag-database with the testdata.
+
+=item B<--stats, -s>      
+
+Show word probabilities.
+ 
+
+=item B<--test-package, -p> I<packageName> 
+                     
+Do not test the files but the given package.
+
+=item B<--test-all-packages, -a> 
+
+Test all packages for the given tag.
+
+=item B<--print-names-only, -no>
+
+Only names of good package will be printed when testing the packages.
+
+=back
+
+=head1 DESCRIPTION
 
 =cut
 
@@ -27,45 +90,34 @@
 use Heap::Priority;
 use Getopt::Long;
 use File::Spec;
+use Pod::Usage;
 
 Package::init();
 
+sub performPackageTest;
 
 
-my $usage = 
-"Usage:
-  bayesian-tagger.pl options tagname
-  Options:
-     --help|-h       print this message
-     --verbose|-v    toggle verbose mode, this may be given more than once
-     --quiet|-q      run quiet (not fully supported)
-     --no-train|-nt  do not train, only test (e.g. the database was allready
-                     filled)
-     --test-package|-t <packageName> 
-                     do not test the files but the given package
-     --stats|-s      show word probabilities
-     --add-good|-g <packageName> 
-                     add the given package as good package
-     --add-bad|-b <packageName>
-                     add the given package as bad package
-     --remove-good|-G <packageName>
-                     remove the given package from the good ones
-     --remove-bad|-B <packageName>
-                     remove the given package from the bad ones
-";
-
-my $noTrainOption;
-my $statsOption;
 $main::verboseOption=0;
-
 my $helpOption;
+my $manOption;
+my $quietOption;
 
-my $packageOption;
+# database modifying operations
+my $trainOption;
 my $addGoodOption;
 my $addBadOption;
 my $removeGoodOption;
 my $removeBadOption;
-my $quietOption;
+
+# database status display operations
+my $performTestOption;
+my $statsOption;
+
+# package testing operations
+my $testPackageOption;
+my $testAllPackagesOption;
+my $printNamesOnlyOption;
+
 
 #		print "Could not removed entry $name from database ", $this->{baseFilename}, 
 #			" as the entry is not in the database.\n"
@@ -74,27 +126,30 @@
 
 if ( !GetOptions(
 		"help|h" => \$helpOption,
+		"man|m" => \$manOption,
 		"verbose|v+" => \$main::verboseOption,
 		"quiet|q" => \$main::quietOption,
-		"no-train|nt" => \$noTrainOption,
-		"test-package|t=s" => \$packageOption,
-		"stats|s" => \$statsOption,
+		
+		"train|t" => \$trainOption,
 		"add-good|g=s" => \$addGoodOption,
 		"add-bad|b=s" => \$addBadOption,
 		"remove-good|G=s" => \$removeGoodOption,
 		"remove-bad|B=s" => \$removeBadOption,
-	) 
+		
+		"stats|s" => \$statsOption,
+		"perform-test|o" => \$performTestOption,
+		
+		"test-package|p=s" => \$testPackageOption,
+		"test-all-packages|a" => \$testAllPackagesOption,
+		"print-names-only|no" => \$printNamesOnlyOption
+	)  || (@ARGV != 1)
 )
 {
-	die $usage;
-}
-if ($helpOption)
-{
-	print "$usage\n";
-	exit 0;
+	pod2usage(1);
 }
 
-die($usage) if (@ARGV != 1);
+pod2usage(0) if $helpOption;
+pod2usage(-exitstatus => 0, -verbose => 2) if $manOption;
 
 my ($tag) = @ARGV;
 
@@ -109,11 +164,23 @@
 $filter->setAllowDuplicates(0);
 
 
-# set this if a single package is added/removed
-my $singlePackage = $packageOption || $addGoodOption || $addBadOption ||
-						  $removeGoodOption || $removeBadOption;
-
 
+if ($trainOption)
+{
+	my $badTrainFile = "$dirFromTag/bad_train.list";
+	my $goodTrainFile = "$dirFromTag/good_train.list";
+	my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
+	my @badTrainPackages = readPackagesFromFile($badTrainFile);
+	foreach(@goodTrainPackages)
+	{
+		trainPackage($filter, $_, 1);
+	}
+	foreach(@badTrainPackages)
+	{
+		trainPackage($filter, $_, 0);
+	}
+	$filter->save();
+}
 if ($removeGoodOption)
 {
 	removePackage($filter, $removeGoodOption, 1);
@@ -134,47 +201,13 @@
 	trainPackage($filter, $addBadOption, 0);
 	$filter->save();
 }
-# if a single package shall be tested
-if ($packageOption)
-{
-	my ( $good, $posterior ) = categorizePackage($filter, $packageOption);
-	print "Package $packageOption was categorized as ".($good ? "good" : "bad"). " with a ";
-	print "posterior to be good of $posterior\n";
-}
-# stats about the DB shall be shown
-if ($statsOption)
-{
-# we could use a stats consumer here, but it is much slower...
-#	my $statsConsumer = PrintStatsConsumer->new(*STDOUT);
-#	$filter->outputStats($statsConsumer, 1, 1);
-	$filter->printStats();
-}
-elsif (!$singlePackage)
+
+if ($performTestOption)
 {
-	my $badTrainFile = "$dirFromTag/bad_train.list";
 	my $badTestFile = "$dirFromTag/bad_test.list";
-	my $goodTrainFile = "$dirFromTag/good_train.list";
 	my $goodTestFile = "$dirFromTag/good_test.list";
-
-	my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
 	my @goodTestPackages = readPackagesFromFile($goodTestFile);
-	my @badTrainPackages = readPackagesFromFile($badTrainFile);
 	my @badTestPackages = readPackagesFromFile($badTestFile);
-
-	if (!$noTrainOption)
-	{
-		foreach(@goodTrainPackages)
-		{
-			trainPackage($filter, $_, 1);
-		}
-		foreach(@badTrainPackages)
-		{
-			trainPackage($filter, $_, 0);
-		}
-		$filter->save();
-	}
-	
-	
 	my $goodTests = scalar(@goodTestPackages);
 	my $badTests = scalar(@badTestPackages);
 
@@ -214,7 +247,68 @@
 	print "Expected bad, and wielded bad: $badMatches ^= ", 
 		$badMatches/scalar($badTests), "\n";
 }
+# stats about the DB shall be shown
+if ($statsOption)
+{
+# we could use a stats consumer here, but it is much slower...
+#	my $statsConsumer = PrintStatsConsumer->new(*STDOUT);
+#	$filter->outputStats($statsConsumer, 1, 1);
+	$filter->printStats();
+}
+
+# if a single package shall be tested
+if ($testPackageOption)
+{
+	performPackageTest($filter, $testPackageOption, $printNamesOnlyOption);
+}
+
+if ($testAllPackagesOption)
+{
+	open(FH, "apt-cache pkgnames --no-all-names |");
+	my $i;
+	while( <FH> )
+	{
+		++$i;
+		if ($i % 50 == 0)
+		{
+# 			Package::destroy();
+# 			Package::init();
+		}
+		chomp;
+		performPackageTest($filter, $_, $printNamesOnlyOption);
+	}
+	close(FH);
+}
+
 
+sub performPackageTest
+{
+	my ($filer, $package, $namesOnly) = @_;
+	my ( $good, $posterior ) = categorizePackage($filter, $package);
+	# if we shall print only the good packages
+	if ($namesOnly)
+	{
+		print "$package\n" if ($good==1);
+	}
+	else
+	{
+		my $categorization;
+		if ($good == 1)
+		{
+			$categorization = "good";
+		}
+		elsif ($good == -1)
+		{
+			$categorization = "unsure";
+		}
+		else
+		{
+			$categorization = "bad";
+		}
+		print "Package $package was categorized as $categorization  with a ";
+		print "posterior to be good of $posterior\n";
+	}
+}
 
 
 __END__

Modified: autodebtag/trunk/ai-tagger/create-data.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/create-data.pl	(original)
+++ autodebtag/trunk/ai-tagger/create-data.pl	Thu Sep 15 17:29:17 2005
@@ -70,24 +70,24 @@
 my $badRatioOption=1;
 my $directoryOption;	# current directory is default directory
 my $noRegenerateOption;
-my $help;
-my $man;
+my $helpOption;
+my $manOption;
 
 if ( !GetOptions(
  	"max-good|g=i" => \$maxGoodOption, 
  	"bad-ratio|r=i" => \$badRatioOption, 
 	"directory|d=s" => \$directoryOption,
 	"no-regenerate|n" => \$noRegenerateOption,
-	"help|h" => \$help,
-	"man|m" => \$man,
+	"help|h" => \$helpOption,
+	"man|m" => \$manOption,
 	) || (@ARGV != 1) 
 ) 
 {
 	pod2usage(1);
 }
 
-pod2usage(0) if $help;
-pod2usage(-exitstatus => 0, -verbose => 2) if $man;
+pod2usage(0) if $helpOption;
+pod2usage(-exitstatus => 0, -verbose => 2) if $manOption;
 
 my ($tag) = @ARGV;
 

Modified: autodebtag/trunk/ai-tagger/perform-test-help.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/perform-test-help.pl	(original)
+++ autodebtag/trunk/ai-tagger/perform-test-help.pl	Thu Sep 15 17:29:17 2005
@@ -26,18 +26,6 @@
 # $main::quietOption = 0;
 # 
 # 
-# # used to generate some random tags
-# # my @tags = getVocabulary(0);
-# # for (my $i=0; $i<20; ++$i)
-# # {
-# # 	print($tags[int(rand(scalar(@tags)))]."\n");
-# # }
-# # exit 0;
-# 
-# 
-# my $usage = "perform-test.pl taglistFile\n";
-# 
-# 
 # die $usage if ($#ARGV != 0);
 # my $taglistFile = $ARGV[0];
 # 
@@ -62,9 +50,9 @@
 	print "---- begin Processing $tag ----\n";
 	my $dirFromTag = $tag;
 	$dirFromTag  =~ s/:/_/g;	# remove all : from the tag and replace by an _
-	my $dataDir = File::Spec->catdir("test_data", $dirFromTag);
+	my $dataDir = File::Spec->catdir("test-data", $dirFromTag);
 	
-	if (hasDatabase($tag, "test_data"))
+	if (hasDatabase($tag, "test-data"))
 	{
 		print "database for $tag already existent - skipping\n";
 		return;
@@ -76,7 +64,7 @@
 	# create database
 	print "Creating test and training set...\n";
 	# create test sets - do not regenerate if already existent
-	system("./create-data.pl -d $dataDir -n $tag");
+	system("./create-data.pl -g 100 -r 4 -d $dataDir -n $tag");
 	my $goodTrainFile = "$dataDir/good_train.list";
 	my $badTrainFile = "$dataDir/bad_train.list";
 	my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
@@ -144,40 +132,3 @@
 	BayesianAccess::clearStatistics();
 	print "---- end Processing $tag ----\n";
 }
-
-
-=item C<getVocabulary> I<includeFacets>
-
-Returns the debtags vocabulary as list. 
-
-B<Parameter>
-
-=over 6
-
-=item includeFacets
-
-Includes the facets if this is true.
-
-=back
-
-=cut
-sub getVocabulary
-{
-	my ($includeFacets) = @_;
-	my $vocabularyFile = "/var/lib/debtags/vocabulary";
-	my @result;
-	open(VF, $vocabularyFile) || die "could not open $vocabularyFile";
-	foreach (<VF>)
-	{
-		if (/^Tag: ([^\s]*)/)
-		{
-			push @result, $1;
-		}
-		elsif ($includeFacets && /^Facet: ([^\s]*)/)
-		{
-			push @result, $1;
-		}
-	}
-	close VF;
-	return @result;
-}
\ No newline at end of file

Modified: autodebtag/trunk/ai-tagger/perform-test.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/perform-test.pl	(original)
+++ autodebtag/trunk/ai-tagger/perform-test.pl	Thu Sep 15 17:29:17 2005
@@ -1,7 +1,36 @@
 #!/usr/bin/perl -w
-# performs a bayesian tagging test for the tags in the
-# given taglistFile, outputs result to stdout.
-# The taglistFile should contain one tag per line
+
+
+
+
+=head1 perform-test.pl
+
+perform-test.pl - performs a bayesian tagging test for the tags in the
+given taglistFile
+
+=head1 SYNOPSIS
+
+perform-test.pl taglistFile
+
+
+=head1 DESCRIPTION
+
+Performs a bayesian tagging test for the tags in the
+given taglistFile, outputs result to stdout.
+The taglistFile should contain one tag per line
+
+A bayesian test first creates a test- and training set using 
+B<create-data.pl>. Afterwards uses this data to test and
+train a the bayesian tagger and prints the training results.
+
+The training data created will be stored in a test-data
+directory. If training data does already exist, this tag
+will be skipped.
+
+=head1 FUNCTIONS
+
+=cut
+
 
 use strict;
 
@@ -144,39 +173,3 @@
 # 	print "---- end Processing $tag ----\n";
 # }
 
-
-=item C<getVocabulary> I<includeFacets>
-
-Returns the debtags vocabulary as list. 
-
-B<Parameter>
-
-=over 6
-
-=item includeFacets
-
-Includes the facets if this is true.
-
-=back
-
-=cut
-sub getVocabulary
-{
-	my ($includeFacets) = @_;
-	my $vocabularyFile = "/var/lib/debtags/vocabulary";
-	my @result;
-	open(VF, $vocabularyFile) || die "could not open $vocabularyFile";
-	foreach (<VF>)
-	{
-		if (/^Tag: ([^\s]*)/)
-		{
-			push @result, $1;
-		}
-		elsif ($includeFacets && /^Facet: ([^\s]*)/)
-		{
-			push @result, $1;
-		}
-	}
-	close VF;
-	return @result;
-}
\ No newline at end of file



More information about the Debtags-commits mailing list