[Debtags-commits] [svn] r1215 - autodebtag/trunk/ai-tagger

Benjamin Mesing bmesing-guest at costa.debian.org
Tue Aug 16 18:52:29 UTC 2005


Author: bmesing-guest
Date: Tue Aug 16 18:52:28 2005
New Revision: 1215

Modified:
   autodebtag/trunk/ai-tagger/create-data.pl
Log:
- create-data.pl script can now be given a maximum number of good packages
  to use for training
- the number of bad packages used for training depends on that of the
  good ones now


Modified: autodebtag/trunk/ai-tagger/create-data.pl
==============================================================================
--- autodebtag/trunk/ai-tagger/create-data.pl	(original)
+++ autodebtag/trunk/ai-tagger/create-data.pl	Tue Aug 16 18:52:28 2005
@@ -9,37 +9,36 @@
 
 create-data.pl [options] tag
 
-Options:
-
-  --help            brief help message
-  --man             full documentation
-  --keep=NUMBER     keeps only every ith package
-  --directory=DIRECTORY
-                    the directory where to save the files
-  --no-regenerate   don't create anything if files are already existing
-
 =head1 OPTIONS
 
 =over 6
 
-=item B<--help>
+=item B<-h, --help>
 
-Print a brief help message and exits.
+Prints a brief help message and exits.
 
-=item B<--man>
+=item B<-m, --man>
 
 Prints the manual page and exits.
 
-=item B<--keep> I<number>
+=item B<-g, --max-good> I<number>
+
+Consider a maximum of NUMBER good packages
+
+=item B<-r, --bad-ratio> I<number>
 
-When collecting the packages for the tag, keep only every i'th package.
+Use at most NUMBER times as many bad packages as good packages.
+E.g. if 56 good packages are available and the ratio is 2,
+112 bad packages are used for training and testing. (Use an integer here.)
 
-=item B<--directory> I<directory>
+Default value is 1.
+
+=item B<-d, --directory> I<directory>
 
 Specify the directory where to save the files here. If no directory is given
 the name of the tag is used to create a directory (replacing : by _)
 
-=item B<--no-regenerate>
+=item B<-n, --no-regenerate>
 
 Don't regenerate if the files are already existent. Note that only 
 good-train.list will be checked and if this is available nothing will 
@@ -65,17 +64,18 @@
 use Getopt::Long;
 use File::Spec;
 use Pod::Usage;
+use Tools;
 
-# my $usage = "./createTrainingSet.pl [--keep|k value] [-d|--directory directory] [-n|--no-regenerate] tag\n";
-
-my $keepOption = 1;
+my $maxGoodOption;
+my $badRatioOption=1;
 my $directoryOption;	# current directory is default directory
 my $noRegenerateOption;
 my $help;
 my $man;
 
 if ( !GetOptions(
- 	"keep|k=i" => \$keepOption, 
+ 	"max-good|g=i" => \$maxGoodOption, 
+ 	"bad-ratio|r=i" => \$badRatioOption, 
 	"directory|d=s" => \$directoryOption,
 	"no-regenerate|n" => \$noRegenerateOption,
 	"help|h" => \$help,
@@ -120,10 +120,15 @@
 	exit 0;
 }
 
-my $goodTestCount = 0;
+my $goodCount = 0;
 # fill the good data
 {
 	my @goodPackages = getPackagesWithTag($tag);
+	if ($maxGoodOption)
+	{
+		@goodPackages = getRandomElements(\@goodPackages, $maxGoodOption);
+	}
+	
 	open(OUT_TRAIN_FILE, "> $goodTrainFile") || die "Could not open $goodTrainFile for writing.";
 	open(OUT_TEST_FILE, "> $goodTestFile") || die "Could not open $goodTestFile for writing.";
 	# add alternating to train and test file
@@ -142,14 +147,18 @@
 	}
 	close (OUT_TRAIN_FILE);
 	close (OUT_TEST_FILE);
-	$goodTestCount = scalar(@goodPackages) / 2;
+	$goodCount = scalar(@goodPackages);
 }
 
 # fill bad data
 {
 	# fill the bad train data
 	{
-		my @badTrainPackages = getPackagesFromFacetWithoutTag($tag, 1);
+		my @badTrainPackages = getPackagesFromFacetWithoutTag($tag);
+		if ($maxGoodOption)
+		{
+			@badTrainPackages = getRandomElements(\@badTrainPackages, int($goodCount*$badRatioOption/2));
+		}
 		open(OUT_TRAIN_FILE, "> $badTrainFile") || die "Could not open $badTrainFile for writing.";
 		foreach(@badTrainPackages)
 		{
@@ -159,22 +168,17 @@
 	}
 	# fill the bad test data
 	{
-		my @badTestPackages = getPackagesWithoutTag($tag, 1);
-		my $goodToBad = $goodTestCount/scalar(@badTestPackages);
-		$goodToBad = 1 if ($goodToBad > 1);
+		my @badTestPackages = getPackagesWithoutTag($tag);
+		if ($maxGoodOption)
+		{
+			@badTestPackages = getRandomElements(\@badTestPackages, int($goodCount*$badRatioOption/2));
+		}
 		open(OUT_TEST_FILE, "> $badTestFile") || die "Could not open $badTestFile for writing.";
-		# only keep as many bad as good packages
-		my $sum = 0;
 		foreach(@badTestPackages)
 		{
-			$sum += $goodToBad;
-			if ($sum >= 1)
-			{
-				--$sum;
-				print(OUT_TEST_FILE "$_\n");
-			}
+			print(OUT_TEST_FILE "$_\n");
 		}
-		close (OUT_TEST_FILE);
+		close(OUT_TEST_FILE);
 	}
 }
 
@@ -185,35 +189,25 @@
 ## 
 ## @param $tag the tag to use
 
-#<!--
-## @param $keepOption the packages you want to keep
-## 1 means every package, 2 every second, 3 every third and so on
-#-->
 sub getPackagesFromFacetWithoutTag
 {
-	my ($tag, $keepOption) = @_;
+	my ($tag) = @_;
 	# do not consider packages which are not yet tagged, as the
 	# probability is relatively high, that they will be tagged with $tag
 	# TODO we need the completelyTagged tag here!
 	my($facet) = split(':', $tag);
-# train with the tags in the same facets, as this wields better separation
-# because packages in common facets often share common words, so to get a better
-# distinction train with packages near each other
+	# train with the tags in the same facets, as this yields better separation
+	# because packages in common facets often share common words, so to get a better
+	# distinction train with packages near each other
 	open BAD_FILE, "debtags grep \"! $tag && ${facet}::*\" |";
-
 	my @result;
-	my $i = 0;
 	while (<BAD_FILE>)
 	{
-		# the output of debtags looks like this:  
-		# gnats: devel::bugtracker
-		if ($i==0)
-		{
-			my $package = substr($_, 0, index($_, ":"));	# get the package name
-			my @packages = split /,\s*/, $package;
-			push(@result, @packages);
-		}
-		$i = 0 if (++$i==$keepOption);
+		# the output of "debtags grep" looks like this:  
+		# package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
+		my $package = substr($_, 0, index($_, ":"));	# get the package name
+		my @packages = split /,\s*/, $package;
+		push(@result, @packages);
 	}
 	close BAD_FILE;
 	return @result;
@@ -222,35 +216,20 @@
 ## @brief This gets packages without the handed tag.
 ## 
 ## @param $tag the tag to use
-## @param $keepOption the packages you want to keep
-## 1 means every package, 2 every second, 3 every third and so on
 sub getPackagesWithoutTag
 {
-	my ($tag, $keepOption) = @_;
-	# do not consider packages which are not yet tagged, as the
-	# probability is relativly high, that they will be tagged with $tag
+	my ($tag) = @_;
 	# TODO we need the completelyTagged tag here!
-	my($facet) = split(':', $tag);
-	open BAD_FILE, "debtags grep \"! $tag && ! special::not-yet-tagged\" |";
-# use this e.g. for uitoolkit qt, 
-# another approach to get a better separation
-# because packages in common facets often share common words, so to get a better
-# distinction train with packages near each other
-#	open BAD_FILE, "debtags grep \"! $tag && ${facet}::*\" |";
+	open BAD_FILE, "debtags grep \"!$tag && !special::not-yet-tagged\" |";
 
 	my @result;
-	my $i = 0;
 	while (<BAD_FILE>)
 	{
-		# the output of debtags looks like this:  
-		# gnats: devel::bugtracker
-		if ($i==0)
-		{
-			my $package = substr($_, 0, index($_, ":"));	# get the package name
-			my @packages = split /,\s*/, $package;
-			push(@result, @packages);
-		}
-		$i = 0 if (++$i==$keepOption);
+		# the output of "debtags grep" looks like this:  
+		# package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
+		my $package = substr($_, 0, index($_, ":"));	# get the package name
+		my @packages = split /,\s*/, $package;
+		push(@result, @packages);
 	}
 	close BAD_FILE;
 	return @result;
@@ -269,8 +248,8 @@
 	my @result;
 	while (<GOOD_FILE>)
 	{
-		# the output of debtags looks like this:  
-		# gnats: devel::bugtracker
+		# the output of "debtags grep" looks like this:  
+		# package1, package2, ..., packageN: facet::tag1, facet::tag2, ... , facet::tagN
 		my $package = substr($_, 0, index($_, ":"));	# get the package name
 		# in certain cases the output of debtags grep is compressed (i.e.
 		# multiple packages are grouped for the same tagset), this happens



More information about the Debtags-commits mailing list