[Debtags-commits] [svn] r1355 - autodebtag/trunk/ai-tagger
Benjamin Mesing
bmesing-guest at costa.debian.org
Thu Sep 15 17:29:19 UTC 2005
Author: bmesing-guest
Date: Thu Sep 15 17:29:17 2005
New Revision: 1355
Added:
autodebtag/trunk/ai-tagger/TestPackage.pl (contents, props changed)
autodebtag/trunk/ai-tagger/Tools.pm
Modified:
autodebtag/trunk/ai-tagger/BayesianAccess.pm
autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm
autodebtag/trunk/ai-tagger/NamedEntryDB.pm
autodebtag/trunk/ai-tagger/Package.pm
autodebtag/trunk/ai-tagger/TestBayesianAccess.pl
autodebtag/trunk/ai-tagger/TokenDB.pm
autodebtag/trunk/ai-tagger/bayesian-tagger.pl
autodebtag/trunk/ai-tagger/create-data.pl
autodebtag/trunk/ai-tagger/perform-test-help.pl
autodebtag/trunk/ai-tagger/perform-test.pl
Log:
- added test-all functionality to bayesian tagger
- don't use libapt-perl any more, now we are using libparseapt
Modified: autodebtag/trunk/ai-tagger/BayesianAccess.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/BayesianAccess.pm (original)
+++ autodebtag/trunk/ai-tagger/BayesianAccess.pm Thu Sep 15 17:29:17 2005
@@ -30,11 +30,13 @@
);
-=item C<hasDatabase> I<tag> I<basedir>
+=item hasDatabase I<tag> I<basedir>
This checks if a database is available for the given tag.
-For this it searches the directory for the given tag in the
-basedir (non recursive).
+It searches the directory for the given tag in the
+basedir (non recursive), and returns true if the directory
+exists.
+
B<Parameter>
Modified: autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm (original)
+++ autodebtag/trunk/ai-tagger/NaiveBayesianFilter.pm Thu Sep 15 17:29:17 2005
@@ -289,7 +289,7 @@
return $this->{goodDb}->addEntry($entryName, $_[2]);
}
-=item C<removeGoodEntry> I<entryName> I<entry>
+=item removeGoodEntry I<entryName>
Removes the given entry from the good ones.
@@ -307,11 +307,11 @@
-=item C<addBadEntry> I<entryName> I<entry>
+=item addBadEntry I<entryName>
Adds the given entry to the bad ones.
-B<Returns> @returns if the adding was successfull, i.e. either
+B<Returns> if the adding was successfull, i.e. either
the allowDuplicate option was set, or the object
was not in the database before
@@ -325,7 +325,7 @@
}
-=item C<removeBadEntry> I<entryName> I<entry>
+=item removeBadEntry I<entryName>
Removes the given entry from the bad ones.
@@ -342,16 +342,28 @@
}
-## Tests if the given entry is good or bad.
-##
-## @param entry the entry to be tested
-## @returns two values: if the entry was bad(0), good(1) or unsure (-1)
-## as first, and the $posterior
-## as second value
+=item testEntry I<entry>
+
+Tests if the given text is categorized as good or bad.
+
+B<Returns> two values, first if the entry was bad (0), good (1)
+or unsure (-1), and second the posterior of the calculation.
+
+B<Parameter>
+
+=over 6
+
+=item entry
+
+The entry to be tested
+
+=back
+
+=cut
+
sub testEntry
{
- my $this = shift @_;
- my ($entry) = @_;
+ my ($this, $entry) = @_;
my $tokens = $this->{tokenizer}->tokenize($entry);
my $posterior = $this->calculatePosterior($tokens);
print "Good posterior: ".$posterior."\n" if ($main::verboseOption >= 2);
@@ -371,7 +383,7 @@
my %set;
foreach my $token (@$tokens)
{
- # if the token was allready used
+ # if the token was already used
next if ( exists($set{$token}) );
$set{$token} = 1; # mark this token as used
# TODO we can spare the token name here, but for debug purposes we keep them
Modified: autodebtag/trunk/ai-tagger/NamedEntryDB.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/NamedEntryDB.pm (original)
+++ autodebtag/trunk/ai-tagger/NamedEntryDB.pm Thu Sep 15 17:29:17 2005
@@ -52,10 +52,10 @@
sub DESTROY {
my $this = shift;
# check for an overridden destructor...
- $this->SUPER::DESTROY if $this->can("SUPER::DESTROY");
+# $this->SUPER::DESTROY if $this->can("SUPER::DESTROY");
# now do your own thing before or after
- print STDERR "destroying NamedEntryDB\n";
+# print STDERR "destroying NamedEntryDB\n";
$this->{nameCountDbObject} = undef;
untie %{$this->{nameCount}};
}
Modified: autodebtag/trunk/ai-tagger/Package.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/Package.pm (original)
+++ autodebtag/trunk/ai-tagger/Package.pm Thu Sep 15 17:29:17 2005
@@ -13,17 +13,38 @@
use warnings;
use strict;
+use YAML;
+use IO::File;
+use Parse::Debian::Packages;
-my $cache;
+# this maps each package to a map, mapping the section to its string
+my %cache;
require Exporter;
@Tokenizer::ISA = qw(Exporter);
+ at Tools::EXPORT = qw (
+ splitVersionPackages
+);
+
1;
+
sub init
{
- $cache = AptPkg::Cache->new();
+
+ my $fh = IO::File->new("apt-cache dumpavail |");
+
+ my $parser = Parse::Debian::Packages->new( $fh );
+ while (my %package = $parser->next)
+ {
+ $cache{$package{Package}} = \%package;
+ }
+}
+
+sub destroy
+{
+ %cache = undef;
}
@@ -33,7 +54,10 @@
my ($name) = @_;
my $instance = { name => $name,
description => undef,
+ # a hash mapping the packages this package is related to, to the type
+ # of dependency ( "Depends", "PreDepends", "Suggests", "Recommends", "Conflicts", "Replaces" or "Obsoletes")
depends => undef,
+ # holds an array of the virtual package names provided by this package
provides => undef,
tags => undef,
maintainer => undef,
@@ -85,7 +109,7 @@
## Returns the depends for the given package mapped to the type of
## dependencies (Depend, PreDepend,...) as reference and a second reference
-## to the provides of the package.
+## to an array, holding the provides of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
@@ -99,12 +123,17 @@
return ( $this->{depends}, $this->{provides} );
}
-## Returns the depends for the given package mapped to the type of
-## dependencies (Depend, PreDepend,...) as reference and a second reference
-## to the provides of the package.
-##
-## This value is lazy initialized, i.e. after being requested
-## once, it will be accessible fast (and consume you precious memory).
+
+
+=item C<getMaintainer>
+
+Returns the maintainer of the package.
+
+This value is lazy initialized, i.e. after being requested
+once, it will be accessible fast (and consume you precious memory).
+
+=cut
+
sub getMaintainer
{
my ($this) = @_;
@@ -121,8 +150,10 @@
This gets the Debian section the package is assigned to.
-=cut
+This value is lazy initialized, i.e. after being requested
+once, it will be accessible fast (and consume you precious memory).
+=cut
sub getSection
{
my ($this) = @_;
@@ -143,51 +174,73 @@
$this->{maintainer} = "";
$this->{section} = "";
$this->{description} = "";
+
+# # begin remove block
+# $this->{maintainer} = "ksajadsg";
+# $this->{section} = "sec";
+# $this->{description} = "snglkfj sdg sfdlg fsdlg dflsjeowtf dgkljsdg ert osdg eorit dfsgiue goidsf gljdfg erojg odsfji gfdgj erojt eroj oitj odfgjpewwer#tüp kplkdkgm osgkmöeroktitr0etitdskf-g,öfljktoiwtudfkgj flkgf hfdsjks gfd";
+# my %depends = ();
+# my @provides = ();
+# $this->{depends} = \%depends;
+# $this->{provides} = \@provides;
+# return;
+# # end remove block
+
my %depends = ();
my @provides = ();
- my $package = $cache->{$this->{name}};
+ my $package = $cache{$this->{name}};
if ($package)
{
- # returns a reference to an array of Version packages
- my $versionList = $package->{VersionList};
- if ( $versionList && scalar( @$versionList) )
+ $this->{maintainer} = $package->{Maintainer};
+ $this->{section} = $package->{Section};
+ # the description entry contains the short description, the body entry
+ # the long one
+ $this->{description} = $package->{Description}."\n".$package->{body};
+ if (my $provides = $package->{Provides})
{
- # take the first entry from the list (which should be the newest version)
- # iterate over all dependencies
- # brr... I hate perls oo syntax, you get really mad with it!
- my $version = ${@{$versionList}}[0];
- if ($version->{DependsList})
- {
- foreach my $depend ( @{$version->{DependsList}} )
- {
- $depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
- }
- }
- if ($version->{ProvidesList})
+ push @provides, split /\s*,\s*/, $provides;
+ }
+ foreach my $dependCategory ("PreDepends" , "Depends" , "PreDepends" , "Suggests" , "Recommends", "Conflicts", "Replaces", "Obsoletes")
+ {
+ my $entry = $package->{$dependCategory};
+ if ($entry)
{
- foreach my $provide ( @{$version->{ProvidesList}} )
+ my @packages = splitVersionPackages($entry);
+ foreach (@packages)
{
- push @provides, $provide->{Name};
+ $depends{$_} = $dependCategory;
}
}
- # this dies if the package is not known, so keep it here
- # were the existence was already checked
- my $pkgRecord = $cache->packages()->lookup($this->{name});
- if ($pkgRecord)
- {
- $this->{maintainer} = ${%{$pkgRecord}}{Maintainer};
- $this->{section} = ${%{$pkgRecord}}{Section};
- $this->{description} = ${%{$pkgRecord}}{LongDesc};
- }
- }
- else
- {
- print "WARNING: No information available for package $this->{name}\n"
- if ($main::verboseOption);
- $this->{maintainer} = "";
- $this->{section} = "";
- $this->{description} = "";
}
+# # returns a reference to an array of Version packages
+# my $versionList = $package->{VersionList};
+# if ( $versionList && scalar( @$versionList) )
+# {
+# # take the first entry from the list (which should be the newest version)
+# # iterate over all dependencies
+# # brr... I hate perls oo syntax, you get really mad with it!
+# my $version = ${@{$versionList}}[0];
+# if ($version->{DependsList})
+# {
+# foreach my $depend ( @{$version->{DependsList}} )
+# {
+# $depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
+# }
+# }
+# if ($version->{ProvidesList})
+# {
+# foreach my $provide ( @{$version->{ProvidesList}} )
+# {
+# push @provides, $provide->{Name};
+# }
+# }
+# # this dies if the package is not known, so keep it here
+# # were the existence was already checked
+# my $pkgRecord = $cache->packages()->lookup($this->{name});
+# if ($pkgRecord)
+# {
+# }
+# }
}
else
{
@@ -201,4 +254,20 @@
$this->{provides} = \@provides;
}
+=item C<splitVersionPackages>
+
+Splits the version list of packages as seen in the package database.
+
+Hand the list as a string, returns the list of the package names only
+as a list.
+
+=cut
+sub splitVersionPackages
+{
+ my ($string) = @_;
+ # dismiss everything in () because this contains versions
+ $string =~ s/\s*\([^\)*]*\)\s*//g;
+ return split /\s*\|\s*|\s*,\s*/, $string; # split on ',' and '|'
+}
+
__END__
\ No newline at end of file
Modified: autodebtag/trunk/ai-tagger/TestBayesianAccess.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/TestBayesianAccess.pl (original)
+++ autodebtag/trunk/ai-tagger/TestBayesianAccess.pl Thu Sep 15 17:29:17 2005
@@ -1,4 +1,5 @@
#!/usr/bin/perl -w
+# Test case for BayesienAccess class
use strict;
use warnings;
Modified: autodebtag/trunk/ai-tagger/TokenDB.pm
==============================================================================
--- autodebtag/trunk/ai-tagger/TokenDB.pm (original)
+++ autodebtag/trunk/ai-tagger/TokenDB.pm Thu Sep 15 17:29:17 2005
@@ -1,3 +1,17 @@
+=head1 DESCRIPTION
+
+This class mapps tokens to the number of occurences. There
+are the getTokens, addToken(s), removeToken(s) and the tokenCount
+methods available.
+
+The data is held in the DB file handed on construction.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
package TokenDB;
use warnings;
@@ -13,7 +27,7 @@
1;
## This creates a database of simple string tokens mapped to
-## the number of ocurrences.
+## the number of occurences.
##
## @param $filename the name of the file of this database
sub new
@@ -33,7 +47,7 @@
sub DESTROY
{
- print STDERR " destroying TokenDB\n";
+# print STDERR " destroying TokenDB\n";
my ($this) = @_;
untie %{$this->{dbFile}};
}
Modified: autodebtag/trunk/ai-tagger/bayesian-tagger.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/bayesian-tagger.pl (original)
+++ autodebtag/trunk/ai-tagger/bayesian-tagger.pl Thu Sep 15 17:29:17 2005
@@ -1,13 +1,76 @@
#!/usr/bin/perl -w
+=head1 bayesian-tagger.pl
-=head1 DESCRIPTION
+bayesian-tagger.pl - tags or test for a given tag
+
+=head1 SYNOPSIS
+
+bayesian-tagger.pl [options] tag
+
+=head1 OPTIONS
+
+=over 6
+
+=item B<--help, -h>
+
+Prints a brief help message and exits.
+
+=item B<--verbose, -v>
+
+Toggle verbose mode, this may be given more than once to be even more verbose.
+
+=item B<--quiet, -q>
+
+Run quiet (not fully supported).
+
+
+=item B<--train, -t>
+
+Train the tag-database with the traindata.
+
+=item B<--add-good, -g> I<packageName>
+
+Add the given package as good package.
+
+=item B<--add-bad, -b> I<packageName>
+
+Add the given package as bad package.
+=item B<--remove-good, -G> I<packageName>
+
+Remove the given package from the good ones.
-=head1 METHODS
+=item B<--remove-bad, -B> I<packageName>
-=over 4
+Remove the given package from the bad ones.
+
+
+=item B<--perform-test, -o>
+
+Test the tag-database with the testdata.
+
+=item B<--stats, -s>
+
+Show word probabilities.
+
+
+=item B<--test-package, -p> I<packageName>
+
+Do not test the files but the given package.
+
+=item B<--test-all-packages, -a>
+
+Test all packages for the given tag.
+
+=item B<--print-names-only, -no>
+
+Only names of good package will be printed when testing the packages.
+
+=back
+
+=head1 DESCRIPTION
=cut
@@ -27,45 +90,34 @@
use Heap::Priority;
use Getopt::Long;
use File::Spec;
+use Pod::Usage;
Package::init();
+sub performPackageTest;
-my $usage =
-"Usage:
- bayesian-tagger.pl options tagname
- Options:
- --help|-h print this message
- --verbose|-v toggle verbose mode, this may be given more than once
- --quiet|-q run quiet (not fully supported)
- --no-train|-nt do not train, only test (e.g. the database was allready
- filled)
- --test-package|-t <packageName>
- do not test the files but the given package
- --stats|-s show word probabilities
- --add-good|-g <packageName>
- add the given package as good package
- --add-bad|-b <packageName>
- add the given package as bad package
- --remove-good|-G <packageName>
- remove the given package from the good ones
- --remove-bad|-B <packageName>
- remove the given package from the bad ones
-";
-
-my $noTrainOption;
-my $statsOption;
$main::verboseOption=0;
-
my $helpOption;
+my $manOption;
+my $quietOption;
-my $packageOption;
+# database modifying operations
+my $trainOption;
my $addGoodOption;
my $addBadOption;
my $removeGoodOption;
my $removeBadOption;
-my $quietOption;
+
+# database status display operations
+my $performTestOption;
+my $statsOption;
+
+# package testing operations
+my $testPackageOption;
+my $testAllPackagesOption;
+my $printNamesOnlyOption;
+
# print "Could not removed entry $name from database ", $this->{baseFilename},
# " as the entry is not in the database.\n"
@@ -74,27 +126,30 @@
if ( !GetOptions(
"help|h" => \$helpOption,
+ "man|m" => \$manOption,
"verbose|v+" => \$main::verboseOption,
"quiet|q" => \$main::quietOption,
- "no-train|nt" => \$noTrainOption,
- "test-package|t=s" => \$packageOption,
- "stats|s" => \$statsOption,
+
+ "train|t" => \$trainOption,
"add-good|g=s" => \$addGoodOption,
"add-bad|b=s" => \$addBadOption,
"remove-good|G=s" => \$removeGoodOption,
"remove-bad|B=s" => \$removeBadOption,
- )
+
+ "stats|s" => \$statsOption,
+ "perform-test|o" => \$performTestOption,
+
+ "test-package|p=s" => \$testPackageOption,
+ "test-all-packages|a" => \$testAllPackagesOption,
+ "print-names-only|no" => \$printNamesOnlyOption
+ ) || (@ARGV != 1)
)
{
- die $usage;
-}
-if ($helpOption)
-{
- print "$usage\n";
- exit 0;
+ pod2usage(1);
}
-die($usage) if (@ARGV != 1);
+pod2usage(0) if $helpOption;
+pod2usage(-exitstatus => 0, -verbose => 2) if $manOption;
my ($tag) = @ARGV;
@@ -109,11 +164,23 @@
$filter->setAllowDuplicates(0);
-# set this if a single package is added/removed
-my $singlePackage = $packageOption || $addGoodOption || $addBadOption ||
- $removeGoodOption || $removeBadOption;
-
+if ($trainOption)
+{
+ my $badTrainFile = "$dirFromTag/bad_train.list";
+ my $goodTrainFile = "$dirFromTag/good_train.list";
+ my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
+ my @badTrainPackages = readPackagesFromFile($badTrainFile);
+ foreach(@goodTrainPackages)
+ {
+ trainPackage($filter, $_, 1);
+ }
+ foreach(@badTrainPackages)
+ {
+ trainPackage($filter, $_, 0);
+ }
+ $filter->save();
+}
if ($removeGoodOption)
{
removePackage($filter, $removeGoodOption, 1);
@@ -134,47 +201,13 @@
trainPackage($filter, $addBadOption, 0);
$filter->save();
}
-# if a single package shall be tested
-if ($packageOption)
-{
- my ( $good, $posterior ) = categorizePackage($filter, $packageOption);
- print "Package $packageOption was categorized as ".($good ? "good" : "bad"). " with a ";
- print "posterior to be good of $posterior\n";
-}
-# stats about the DB shall be shown
-if ($statsOption)
-{
-# we could use a stats consumer here, but it is much slower...
-# my $statsConsumer = PrintStatsConsumer->new(*STDOUT);
-# $filter->outputStats($statsConsumer, 1, 1);
- $filter->printStats();
-}
-elsif (!$singlePackage)
+
+if ($performTestOption)
{
- my $badTrainFile = "$dirFromTag/bad_train.list";
my $badTestFile = "$dirFromTag/bad_test.list";
- my $goodTrainFile = "$dirFromTag/good_train.list";
my $goodTestFile = "$dirFromTag/good_test.list";
-
- my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
my @goodTestPackages = readPackagesFromFile($goodTestFile);
- my @badTrainPackages = readPackagesFromFile($badTrainFile);
my @badTestPackages = readPackagesFromFile($badTestFile);
-
- if (!$noTrainOption)
- {
- foreach(@goodTrainPackages)
- {
- trainPackage($filter, $_, 1);
- }
- foreach(@badTrainPackages)
- {
- trainPackage($filter, $_, 0);
- }
- $filter->save();
- }
-
-
my $goodTests = scalar(@goodTestPackages);
my $badTests = scalar(@badTestPackages);
@@ -214,7 +247,68 @@
print "Expected bad, and wielded bad: $badMatches ^= ",
$badMatches/scalar($badTests), "\n";
}
+# stats about the DB shall be shown
+if ($statsOption)
+{
+# we could use a stats consumer here, but it is much slower...
+# my $statsConsumer = PrintStatsConsumer->new(*STDOUT);
+# $filter->outputStats($statsConsumer, 1, 1);
+ $filter->printStats();
+}
+
+# if a single package shall be tested
+if ($testPackageOption)
+{
+ performPackageTest($filter, $testPackageOption, $printNamesOnlyOption);
+}
+
+if ($testAllPackagesOption)
+{
+ open(FH, "apt-cache pkgnames --no-all-names |");
+ my $i;
+ while( <FH> )
+ {
+ ++$i;
+ if ($i % 50 == 0)
+ {
+# Package::destroy();
+# Package::init();
+ }
+ chomp;
+ performPackageTest($filter, $_, $printNamesOnlyOption);
+ }
+ close(FH);
+}
+
+sub performPackageTest
+{
+ my ($filer, $package, $namesOnly) = @_;
+ my ( $good, $posterior ) = categorizePackage($filter, $package);
+ # if we shall print only the good packages
+ if ($namesOnly)
+ {
+ print "$package\n" if ($good==1);
+ }
+ else
+ {
+ my $categorization;
+ if ($good == 1)
+ {
+ $categorization = "good";
+ }
+ elsif ($good == -1)
+ {
+ $categorization = "unsure";
+ }
+ else
+ {
+ $categorization = "bad";
+ }
+ print "Package $package was categorized as $categorization with a ";
+ print "posterior to be good of $posterior\n";
+ }
+}
__END__
Modified: autodebtag/trunk/ai-tagger/create-data.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/create-data.pl (original)
+++ autodebtag/trunk/ai-tagger/create-data.pl Thu Sep 15 17:29:17 2005
@@ -70,24 +70,24 @@
my $badRatioOption=1;
my $directoryOption; # current directory is default directory
my $noRegenerateOption;
-my $help;
-my $man;
+my $helpOption;
+my $manOption;
if ( !GetOptions(
"max-good|g=i" => \$maxGoodOption,
"bad-ratio|r=i" => \$badRatioOption,
"directory|d=s" => \$directoryOption,
"no-regenerate|n" => \$noRegenerateOption,
- "help|h" => \$help,
- "man|m" => \$man,
+ "help|h" => \$helpOption,
+ "man|m" => \$manOption,
) || (@ARGV != 1)
)
{
pod2usage(1);
}
-pod2usage(0) if $help;
-pod2usage(-exitstatus => 0, -verbose => 2) if $man;
+pod2usage(0) if $helpOption;
+pod2usage(-exitstatus => 0, -verbose => 2) if $manOption;
my ($tag) = @ARGV;
Modified: autodebtag/trunk/ai-tagger/perform-test-help.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/perform-test-help.pl (original)
+++ autodebtag/trunk/ai-tagger/perform-test-help.pl Thu Sep 15 17:29:17 2005
@@ -26,18 +26,6 @@
# $main::quietOption = 0;
#
#
-# # used to generate some random tags
-# # my @tags = getVocabulary(0);
-# # for (my $i=0; $i<20; ++$i)
-# # {
-# # print($tags[int(rand(scalar(@tags)))]."\n");
-# # }
-# # exit 0;
-#
-#
-# my $usage = "perform-test.pl taglistFile\n";
-#
-#
# die $usage if ($#ARGV != 0);
# my $taglistFile = $ARGV[0];
#
@@ -62,9 +50,9 @@
print "---- begin Processing $tag ----\n";
my $dirFromTag = $tag;
$dirFromTag =~ s/:/_/g; # remove all : from the tag and replace by an _
- my $dataDir = File::Spec->catdir("test_data", $dirFromTag);
+ my $dataDir = File::Spec->catdir("test-data", $dirFromTag);
- if (hasDatabase($tag, "test_data"))
+ if (hasDatabase($tag, "test-data"))
{
print "database for $tag already existent - skipping\n";
return;
@@ -76,7 +64,7 @@
# create database
print "Creating test and training set...\n";
# create test sets - do not regenerate if already existent
- system("./create-data.pl -d $dataDir -n $tag");
+ system("./create-data.pl -g 100 -r 4 -d $dataDir -n $tag");
my $goodTrainFile = "$dataDir/good_train.list";
my $badTrainFile = "$dataDir/bad_train.list";
my @goodTrainPackages = readPackagesFromFile($goodTrainFile);
@@ -144,40 +132,3 @@
BayesianAccess::clearStatistics();
print "---- end Processing $tag ----\n";
}
-
-
-=item C<getVocabulary> I<includeFacets>
-
-Returns the debtags vocabulary as list.
-
-B<Parameter>
-
-=over 6
-
-=item includeFacets
-
-Includes the facets if this is true.
-
-=back
-
-=cut
-sub getVocabulary
-{
- my ($includeFacets) = @_;
- my $vocabularyFile = "/var/lib/debtags/vocabulary";
- my @result;
- open(VF, $vocabularyFile) || die "could not open $vocabularyFile";
- foreach (<VF>)
- {
- if (/^Tag: ([^\s]*)/)
- {
- push @result, $1;
- }
- elsif ($includeFacets && /^Facet: ([^\s]*)/)
- {
- push @result, $1;
- }
- }
- close VF;
- return @result;
-}
\ No newline at end of file
Modified: autodebtag/trunk/ai-tagger/perform-test.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/perform-test.pl (original)
+++ autodebtag/trunk/ai-tagger/perform-test.pl Thu Sep 15 17:29:17 2005
@@ -1,7 +1,36 @@
#!/usr/bin/perl -w
-# performs a bayesian tagging test for the tags in the
-# given taglistFile, outputs result to stdout.
-# The taglistFile should contain one tag per line
+
+
+
+
+=head1 perform-test.pl
+
+perform-test.pl - performs a bayesian tagging test for the tags in the
+given taglistFile
+
+=head1 SYNOPSIS
+
+perform-test.pl taglistFile
+
+
+=head1 DESCRIPTION
+
+Performs a bayesian tagging test for the tags in the
+given taglistFile, outputs result to stdout.
+The taglistFile should contain one tag per line
+
+A bayesian test first creates a test- and training set using
+B<create-data.pl>. Afterwards uses this data to test and
+train a the bayesian tagger and prints the training results.
+
+The training data created will be stored in a test-data
+directory. If training data does already exist, this tag
+will be skipped.
+
+=head1 FUNCTIONS
+
+=cut
+
use strict;
@@ -144,39 +173,3 @@
# print "---- end Processing $tag ----\n";
# }
-
-=item C<getVocabulary> I<includeFacets>
-
-Returns the debtags vocabulary as list.
-
-B<Parameter>
-
-=over 6
-
-=item includeFacets
-
-Includes the facets if this is true.
-
-=back
-
-=cut
-sub getVocabulary
-{
- my ($includeFacets) = @_;
- my $vocabularyFile = "/var/lib/debtags/vocabulary";
- my @result;
- open(VF, $vocabularyFile) || die "could not open $vocabularyFile";
- foreach (<VF>)
- {
- if (/^Tag: ([^\s]*)/)
- {
- push @result, $1;
- }
- elsif ($includeFacets && /^Facet: ([^\s]*)/)
- {
- push @result, $1;
- }
- }
- close VF;
- return @result;
-}
\ No newline at end of file
More information about the Debtags-commits
mailing list