[Debtags-commits] [svn] r1215 - autodebtag/trunk/ai-tagger
Benjamin Mesing
bmesing-guest at costa.debian.org
Tue Aug 16 18:52:29 UTC 2005
Author: bmesing-guest
Date: Tue Aug 16 18:52:28 2005
New Revision: 1215
Modified:
autodebtag/trunk/ai-tagger/create-data.pl
Log:
- create-data.pl script can be handed a number of maximum good packages
used for training now
- the number of bad packages used for training depends on that of the
good ones now
Modified: autodebtag/trunk/ai-tagger/create-data.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/create-data.pl (original)
+++ autodebtag/trunk/ai-tagger/create-data.pl Tue Aug 16 18:52:28 2005
@@ -9,37 +9,36 @@
create-data.pl [options] tag
-Options:
-
- --help brief help message
- --man full documentation
- --keep=NUMBER keeps only every ith package
- --directory=DIRECTORY
- the directory where to save the files
- --no-regenerate don't create anything if files are already existing
-
=head1 OPTIONS
=over 6
-=item B<--help>
+=item B<-h, --help>
-Print a brief help message and exits.
+Prints a brief help message and exits.
-=item B<--man>
+=item B<-m, --man>
Prints the manual page and exits.
-=item B<--keep> I<number>
+=item B<-g, --max-good> I<number>
+
+Consider a maximum of NUMBER good packages
+
+=item B<-r, --bad-ratio> I<number>
-When collecting the packages for the tag, keep only every i'th package.
+Use at most NUMBER times as many bad packages as good packages.
+E.g. if 56 good packages are available and the ratio is 2,
+112 bad packages are used for training and testing. (Use an integer here.)
-=item B<--directory> I<directory>
+Default value is 1.
+
+=item B<-d, --directory> I<directory>
Specify the directory where to save the files here. If no directory is given
the name of the tag is used to create a directory (replacing : by _)
-=item B<--no-regenerate>
+=item B<-n, --no-regenerate>
Don't regenerate if the files already exist. Note that only
good-train.list will be checked and if this is available nothing will
@@ -65,17 +64,18 @@
use Getopt::Long;
use File::Spec;
use Pod::Usage;
+use Tools;
-# my $usage = "./createTrainingSet.pl [--keep|k value] [-d|--directory directory] [-n|--no-regenerate] tag\n";
-
-my $keepOption = 1;
+my $maxGoodOption;
+my $badRatioOption=1;
my $directoryOption; # current directory is default directory
my $noRegenerateOption;
my $help;
my $man;
if ( !GetOptions(
- "keep|k=i" => \$keepOption,
+ "max-good|g=i" => \$maxGoodOption,
+ "bad-ratio|r=i" => \$badRatioOption,
"directory|d=s" => \$directoryOption,
"no-regenerate|n" => \$noRegenerateOption,
"help|h" => \$help,
@@ -120,10 +120,15 @@
exit 0;
}
-my $goodTestCount = 0;
+my $goodCount = 0;
# fill the good data
{
my @goodPackages = getPackagesWithTag($tag);
+ if ($maxGoodOption)
+ {
+ @goodPackages = getRandomElements(\@goodPackages, $maxGoodOption);
+ }
+
open(OUT_TRAIN_FILE, "> $goodTrainFile") || die "Could not open $goodTrainFile for writing.";
open(OUT_TEST_FILE, "> $goodTestFile") || die "Could not open $goodTestFile for writing.";
# add alternating to train and test file
@@ -142,14 +147,18 @@
}
close (OUT_TRAIN_FILE);
close (OUT_TEST_FILE);
- $goodTestCount = scalar(@goodPackages) / 2;
+ $goodCount = scalar(@goodPackages);
}
# fill bad data
{
# fill the bad train data
{
- my @badTrainPackages = getPackagesFromFacetWithoutTag($tag, 1);
+ my @badTrainPackages = getPackagesFromFacetWithoutTag($tag);
+ if ($maxGoodOption)
+ {
+ @badTrainPackages = getRandomElements(\@badTrainPackages, int($goodCount*$badRatioOption/2));
+ }
open(OUT_TRAIN_FILE, "> $badTrainFile") || die "Could not open $badTrainFile for writing.";
foreach(@badTrainPackages)
{
@@ -159,22 +168,17 @@
}
# fill the bad test data
{
- my @badTestPackages = getPackagesWithoutTag($tag, 1);
- my $goodToBad = $goodTestCount/scalar(@badTestPackages);
- $goodToBad = 1 if ($goodToBad > 1);
+ my @badTestPackages = getPackagesWithoutTag($tag);
+ if ($maxGoodOption)
+ {
+ @badTestPackages = getRandomElements(\@badTestPackages, int($goodCount*$badRatioOption/2));
+ }
open(OUT_TEST_FILE, "> $badTestFile") || die "Could not open $badTestFile for writing.";
- # only keep as many bad as good packages
- my $sum = 0;
foreach(@badTestPackages)
{
- $sum += $goodToBad;
- if ($sum >= 1)
- {
- --$sum;
- print(OUT_TEST_FILE "$_\n");
- }
+ print(OUT_TEST_FILE "$_\n");
}
- close (OUT_TEST_FILE);
+ close(OUT_TEST_FILE);
}
}
@@ -185,35 +189,25 @@
##
## @param $tag the tag to use
-#<!--
-## @param $keepOption the packages you want to keep
-## 1 means every package, 2 every second, 3 every third and so on
-#-->
sub getPackagesFromFacetWithoutTag
{
- my ($tag, $keepOption) = @_;
+ my ($tag) = @_;
# do not consider packages which are not yet tagged, as the
# probability is relatively high that they will be tagged with $tag
# TODO we need the completelyTagged tag here!
my($facet) = split(':', $tag);
-# train with the tags in the same facets, as this wields better separation
-# because packages in common facets often share common words, so to get a better
-# distinction train with packages near each other
+ # train with the tags in the same facets, as this yields better separation
+ # because packages in common facets often share common words, so to get a better
+ # distinction train with packages near each other
open BAD_FILE, "debtags grep \"! $tag && ${facet}::*\" |";
-
my @result;
- my $i = 0;
while (<BAD_FILE>)
{
- # the output of debtags looks like this:
- # gnats: devel::bugtracker
- if ($i==0)
- {
- my $package = substr($_, 0, index($_, ":")); # get the package name
- my @packages = split /,\s*/, $package;
- push(@result, @packages);
- }
- $i = 0 if (++$i==$keepOption);
+ # the output of "debtags grep" looks like this:
+ # package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
+ my $package = substr($_, 0, index($_, ":")); # get the package name
+ my @packages = split /,\s*/, $package;
+ push(@result, @packages);
}
close BAD_FILE;
return @result;
@@ -222,35 +216,20 @@
## @brief This gets packages without the handed tag.
##
## @param $tag the tag to use
-## @param $keepOption the packages you want to keep
-## 1 means every package, 2 every second, 3 every third and so on
sub getPackagesWithoutTag
{
- my ($tag, $keepOption) = @_;
- # do not consider packages which are not yet tagged, as the
- # probability is relativly high, that they will be tagged with $tag
+ my ($tag) = @_;
# TODO we need the completelyTagged tag here!
- my($facet) = split(':', $tag);
- open BAD_FILE, "debtags grep \"! $tag && ! special::not-yet-tagged\" |";
-# use this e.g. for uitoolkit qt,
-# another approach to get a better separation
-# because packages in common facets often share common words, so to get a better
-# distinction train with packages near each other
-# open BAD_FILE, "debtags grep \"! $tag && ${facet}::*\" |";
+ open BAD_FILE, "debtags grep \"!$tag && !special::not-yet-tagged\" |";
my @result;
- my $i = 0;
while (<BAD_FILE>)
{
- # the output of debtags looks like this:
- # gnats: devel::bugtracker
- if ($i==0)
- {
- my $package = substr($_, 0, index($_, ":")); # get the package name
- my @packages = split /,\s*/, $package;
- push(@result, @packages);
- }
- $i = 0 if (++$i==$keepOption);
+ # the output of "debtags grep" looks like this:
+ # package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
+ my $package = substr($_, 0, index($_, ":")); # get the package name
+ my @packages = split /,\s*/, $package;
+ push(@result, @packages);
}
close BAD_FILE;
return @result;
@@ -269,8 +248,8 @@
my @result;
while (<GOOD_FILE>)
{
- # the output of debtags looks like this:
- # gnats: devel::bugtracker
+ # the output of "debtags grep" looks like this:
+ # package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
my $package = substr($_, 0, index($_, ":")); # get the package name
# in certain cases the output of debtags grep is compressed (i.e.
# multiple packages are grouped for the same tagset), this happens
More information about the Debtags-commits
mailing list