rev 9251 - in people/modax/copyright-helper/trunk: . parsers
Modestas Vainius
modax-guest at alioth.debian.org
Sat Feb 2 14:14:49 UTC 2008
Author: modax-guest
Date: 2008-02-02 14:14:48 +0000 (Sat, 02 Feb 2008)
New Revision: 9251
Modified:
people/modax/copyright-helper/trunk/CHCopyright.pm
people/modax/copyright-helper/trunk/CHCore.pm
people/modax/copyright-helper/trunk/copyright-helper.pl
people/modax/copyright-helper/trunk/parsers/c_cpp.pm
Log:
* v0.2
* More fixes to parsers & stuff
* Locale fixes
* Eliminated all false positives for konversation source.
Modified: people/modax/copyright-helper/trunk/CHCopyright.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCopyright.pm 2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/CHCopyright.pm 2008-02-02 14:14:48 UTC (rev 9251)
@@ -18,6 +18,35 @@
use encoding "utf8";
use utf8;
+# Perl does not seem to match (regex) © without this trick in utf8 mode
+our $__copyright_symbol = '©';
+utf8::encode($__copyright_symbol);
+
+our %blacklisted_names = (
+ "jan" => [ "jan", "january" ],
+ "feb" => [ "feb", "february" ],
+ "mar" => [ "mar", "march" ],
+ "apr" => [ "apr", "april" ],
+ "may" => [ "may" ],
+ "jun" => [ "jun", "june" ],
+ "jul" => [ "jul", "july" ],
+ "aug" => [ "aug", "august" ],
+ "sep" => [ "sep", "september" ],
+ "oct" => [ "oct", "october" ],
+ "nov" => [ "nov", "november" ],
+ "dec" => [ "dec", "december" ],
+
+ "mon" => [ "mon", "monday" ],
+ "tue" => [ "tue", "tuesday" ],
+ "wed" => [ "wed", "wednesday" ],
+ "thu" => [ "thu", "thursday" ],
+ "fri" => [ "fri", "friday" ],
+ "sat" => [ "sat", "saturday" ],
+ "sun" => [ "sun", "sunday" ],
+
+ "inc" => [ "inc." ], # Rule out Free Software Foundation, **Inc.**
+);
+
sub isUcFirst($) {
my $str = shift;
return $str eq ucfirst($str);
@@ -28,6 +57,23 @@
return $str eq lcfirst($str);
}
+sub isWordCharFirst($) {
+ my $word = shift;
+ return $word =~ m/^\w/;
+}
+
+sub isNameBlacklisted($) {
+ our %blacklisted_names;
+ my $name = lc(shift);
+ my $key = substr($name, 0, 3);
+
+ if (exists $blacklisted_names{$key}) {
+ for my $black (@{$blacklisted_names{$key}}) {
+ return 1 if $black eq $name;
+ }
+ }
+}
+
sub CHCopyright::Author::new {
my $self = {
"names" => [],
@@ -65,12 +111,17 @@
return 0;
}
+ # Skip blacklisted words
+ if (isNameBlacklisted($name)) {
+ return 0;
+ }
+
if ($closeName) {
$self->closeName();
}
- push @$names , $name;
- return 1;
+ push @$names, $name;
+ return $self->isNameClosed() + 1;
}
sub CHCopyright::Author::isNameClosed($) {
@@ -99,12 +150,14 @@
sub CHCopyright::Author::addYear($$) {
my ($self, $year) = @_;
+ my @years = split(/,/, $year);
+
if ($self->{autoYears}) {
# Replace years
- $self->{years} = [ $year ];
+ $self->{years} = \@years;
$self->{autoYears} = 0;
} else {
- push @{$self->{years}}, $year;
+ push @{$self->{years}}, @years;
}
}
@@ -128,33 +181,45 @@
sub CHCopyright::Author::isComplete($) {
my ($self) = @_;
return (@{$self->{names}} && $self->{email}) ||
- (@{$self->{names}} && @{$self->{years}});
+ (scalar(@{$self->{names}}) > 1 && @{$self->{years}});
}
-sub CHCopyright::Author::complete($) {
- my ($self) = @_;
+sub CHCopyright::Author::calcQualityOfNames($) {
+ my $self = shift;
my $names = $self->{names};
- my $otherWords = $self->{otherWords};
- # Count lowercase and uppercase names
+ my $ucount = 0;
+ my $uwcount = 0;
my $lcount = 0;
- my $ucount = 0;
my $dcount = 0;
for my $n (@$names) {
if (isUcFirst($n)) {
$ucount++; # non-word characters go here too
+ $uwcount++ if (isWordCharFirst($n));
# Increate "dot count" if the name ends with a dot.
$dcount++ if ($n =~ /\.$/);
} else {
$lcount++;
}
}
+ return ($ucount, $uwcount, $lcount, $dcount);
+}
- if (!@$names && @$otherWords) {
+sub CHCopyright::Author::complete($$) {
+ my ($self, $strict) = @_;
+ my $names = $self->{names};
+ my $otherWords = $self->{otherWords};
+
+ # Calculate "names" quality
+ my ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
+
+# print @{$self->{names}}, " -o ", @{$self->{otherWords}}, " -e ", $self->getEmail(), " -y ", $self->getAllYears(), "\n";
+
+ if (!$strict && !@$names && @$otherWords) {
# Assume the author was a bit lazy and wrote his
# firstname/lastname/nickname starting with a
# lowercase letter. Credit him anyway
- $self->{names} = $self->{otherWords};
+ $self->{names} = $otherWords;
} elsif ($lcount > 2 || $lcount >= $ucount) {
# Then probably this name is not a real name
$self->{names} = [];
@@ -163,9 +228,28 @@
# from the end of the names array
my $i = scalar(@$names) - 1;
for (; $i >= 0 && isLcFirst($$names[$i]); $i--) {;}
- my @newArray = splice(@$names, 0, $i+1);
- $self->{names} = \@newArray;
+ if ($i < scalar(@$names) - 1) {
+ my @removed = splice(@$names, -(scalar(@$names)-$i-1));
+ push (@$otherWords, @removed);
+ }
}
+ $names = $self->{names};
+
+ if ($strict) {
+ # If mode is strict, do not accept anything with too many
+ # otherWords
+ if (scalar(@$otherWords) > scalar(@$names)) {
+ $self->{names} = [];
+ } else {
+ # Recalc names quality
+ ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
+ # Ensure high quality standards
+ if (!($lcount <= 1 && $dcount <= 1 && $uwcount == 0)) {
+ $self->{names} = [];
+ }
+ }
+ }
+ $names = $self->{names};
if ($ucount == $dcount && !$self->getEmail() &&
(!@{$self->{years}} || $self->{autoYears})) {
@@ -216,19 +300,29 @@
########################### AuthorCollection ###################################
sub CHCopyright::AuthorCollection::new {
+ my ($cls, $strict) = @_;
+ $strict = 0 unless(defined($strict));
+
my $self = {
"authors" => [ new CHCopyright::Author ],
- "lastAddition" => "",
+ "additions" => [],
+ "strictMode" => $strict,
};
- return bless($self, $_[0]);
+ return bless($self, $cls);
}
+sub CHCopyright::AuthorCollection::isModeStrict($) {
+ $_[0]->{strictMode};
+}
+
sub CHCopyright::AuthorCollection::getAuthors($) {
$_[0]->{authors};
}
sub CHCopyright::AuthorCollection::getLastAddition($) {
- return $_[0]->{lastAddition};
+ my $array = $_[0]->{additions};
+ my $last = scalar(@$array) - 1;
+ return ($last > 0) ? $$array[$last] : "";
}
sub CHCopyright::AuthorCollection::setLastAddition($$) {
@@ -242,7 +336,7 @@
$self->closeYears();
}
- $self->{lastAddition} = $newAddition;
+ push @{$self->{additions}}, $newAddition;
}
sub CHCopyright::AuthorCollection::getAuthorCount($) {
@@ -257,6 +351,17 @@
return 0;
}
+sub CHCopyright::AuthorCollection::getLastNameClosedOrNonEmpty($$) {
+ my ($self) = @_;
+ my $prev = 0;
+ for my $a (@{$self->{authors}}) {
+ last if ($prev && !$a->{nameClosed} && !@{$a->{names}});
+ $prev = $a;
+ }
+
+ return $prev;
+}
+
sub CHCopyright::AuthorCollection::getLastAuthor($) {
return $_[0]->{authors}->[ $_[0]->getAuthorCount()-1 ];
}
@@ -276,14 +381,16 @@
my $author = $self->getCurrentByField("nameClosed");
$author = $self->addNewAuthor() unless ($author);
- $author->addName($name);
+ my $ret = $author->addName($name);
$self->setLastAddition("name");
+
+ return $ret;
}
sub CHCopyright::AuthorCollection::addOtherWord($$) {
my ($self, $word) = @_;
- my $author = $self->getCurrentByField("nameClosed");
+ my $author = $self->getLastNameClosedOrNonEmpty();
$author = $self->addNewAuthor() unless ($author);
$author->addOtherWord($word);
@@ -313,28 +420,65 @@
sub CHCopyright::AuthorCollection::closeYears($) {
my ($self) = @_;
for my $a (@{$self->{authors}}) {
- return $a->closeYears() if ($a->{years});
+ return $a->closeYears() if (!$a->isYearsClosed() && $a->{years});
}
}
-sub CHCopyright::AuthorCollection::complete($) {
+sub CHCopyright::AuthorCollection::complete($) {
+ my $self = shift;
+ my $authors = $self->getAuthors();
+
+ for my $author (@$authors) {
+ $author->complete($self->isModeStrict());
+ }
+
+ # If the last author is without email, check the order
+ # how words were added. We might need to lift emails down
+ # if the email came before name
+ my $author = $self->getLastAuthor();
+ if ($self->getAuthorCount() > 1 && !$author->getEmail()) {
+ my $additions = $self->{additions};
+ my $email = 0;
+ my $name = 0;
+ for (my $i = scalar(@$additions)-1; $i >= 0; $i--) {
+ my $el = $$additions[$i];
+ if ($el eq "email") {
+ $email++;
+ } elsif ($el eq "name") {
+ $name++;
+ last;
+ }
+ }
+
+ if ($email == 1 && $name == 1) {
+ # Then lift email down by 1
+ for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
+ $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
+ }
+ $$authors[0]->setEmail("");
+ }
+ }
}
-sub __process_copyright_statement(\@$$) {
- my ($results, $p, $uniqOnly) = @_;
+sub __process_copyright_statement(\@$$$) {
+ my ($results, $p, $uniqOnly, $strict) = @_;
# Initialize author collection
- my $authorCollection = new CHCopyright::AuthorCollection;
+ my $authorCollection = new CHCopyright::AuthorCollection($strict);
my $is_name = 0;
my @words = split(/[\s]+/, $p);
for (my $i = 0; $i <= $#words; $i++) {
my $w = $words[$i];
+ my $added = 0;
- if ($w =~ m/^([0-9-]{2,})\W?$/) {
+# print "----- $w -- $is_name", "\n";
+
+ if ($w =~ m/^(\d{4}[\d,-]*)\W?$/) {
# Then probably it's a year
$authorCollection->addYear($1);
$is_name = 0;
+ $added = 1;
} else {
if ($w =~ m/@/) {
$w =~ m/^<?([^>]*)>?/;
@@ -343,20 +487,24 @@
# Then probably it's an e-mail address
$authorCollection->addEmail($w);
$is_name = 0;
- } elsif (($w =~ m/\w/) && isUcFirst($w)) { # if has any word chars and start with an uppercase
- $authorCollection->addName($w);
- $is_name = 1;
+ $added = 1;
+ } elsif (!$is_name && isWordCharFirst($w) && isUcFirst($w)) {
+ # if it starts with an uppercase word char
+ $added = $is_name = $authorCollection->addName($w);
} elsif ($is_name) {
- $authorCollection->addName($w);
- } else {
+ $added = $authorCollection->addName($w);
+ $is_name = 0 if ($added == 2);
+ }
+
+ if (!$added) {
$authorCollection->addOtherWord($w);
}
}
}
-
+
+ $authorCollection->complete();
nextAuthor:
for my $author (@{$authorCollection->getAuthors()}) {
- $author->complete();
if ($author->isComplete()) {
if ($uniqOnly) {
for my $a (@$results) {
@@ -371,17 +519,19 @@
sub getCopyright(\@$$) {
my ($results, $p, $separator) = @_;
+ our $__copyright_symbol;
- use bytes; # That's an ugly workaround for perl to recognize ©
- $separator = qr/copyright|\(C\)\s*|©\s*/i if (!$separator);
+ $separator = qr/copyright|\(C\)\s*|$__copyright_symbol\s*/i if (!$separator);
# Copyright statement usually starts with Copyright or (C)
my @copyrights = split($separator, $p);
if (scalar(@copyrights) > 1) {
+ my $i = -1;
for my $c (@copyrights) {
+ $i++;
next if ($c =~ m/^\s*$/);
- __process_copyright_statement(@$results, $c, 0);
+ __process_copyright_statement(@$results, $c, 0, ($i == 0));
}
return 1;
} else {
@@ -392,21 +542,23 @@
sub getFuzzyCopyright(\@$) {
my ($results, $p) = @_;
- if (($p =~ /\W\d{4}\W/) && ($p =~ /@/)) {
- my @sentenses = split(/\s*\.\s+/, $p);
+ my @sentenses = split(/\s*\.\s+/, $p);
+ if ($p =~ /(^|[\s.,-])\d{4}([\s.,-]|$)/) {
for my $s (@sentenses) {
$s =~ s/copyright\s*|\(C\)\s*//i;
- # Count words starting with a upper case letter
- my @words = split(/\s+/, $s);
- my $count = 0;
- for my $w (@words) {
- $count++ if (($w =~ m/\w/) && isUcFirst($w));
+ if (($s =~ /@/)) {
+# # Count words starting with a upper case letter (turned off)
+# my @words = split(/\s+/, $s);
+# my $count = 0;
+# for my $w (@words) {
+# $count++ if (isWordCharFirst($w) && isUcFirst($w));
+# }
+# if ($count >= 2) {
+ # OK. Treat it like copyright statement then
+ __process_copyright_statement(@$results, $s, 1, 0);
+# }
}
- if ($count >= 2) {
- # OK. Treat it like copyright statement then
- __process_copyright_statement(@$results, $s, 1);
- }
}
}
}
Modified: people/modax/copyright-helper/trunk/CHCore.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCore.pm 2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/CHCore.pm 2008-02-02 14:14:48 UTC (rev 9251)
@@ -20,9 +20,14 @@
@CHCore::Directory::ISA = qw( CHCore::File );
-#use CHParsers("fileparsers/*");
-#use CHLicenses("licenses/*");
-#use CHCopyright;
+sub __dump_array(\@$) {
+ my ($array, $name) = @_;
+ my $i = 0;
+ for my $a (@$array) {
+ $i++;
+ print "$name $i |", $a, "\n";
+ }
+}
sub CHCore::File::new($$$) {
my ($cls, $directory, $filename) = @_;
@@ -125,16 +130,13 @@
# Extract comments from file and split text to paragraphs
my @comments = $parser->filter($arg);
my @parags = $parser->toParagraphs(\@comments);
-
close FILE;
- #my $i = 0;
- #for my $c (@comments) {
- # $i++;
- # print "Parag $i |", $c, "\n";
- #}
-
- # Get copyright holders
+ # Array dumps for debugging purposes
+ #__dump_array(@comments, "Comment");
+ #__dump_array(@parags, "Paragraph");
+
+ # Get copyright holders and license information
my @copyrights = $parser->getCopyrights(\@parags);
my $license = $parser->getLicense(\@parags);
Modified: people/modax/copyright-helper/trunk/copyright-helper.pl
===================================================================
--- people/modax/copyright-helper/trunk/copyright-helper.pl 2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/copyright-helper.pl 2008-02-02 14:14:48 UTC (rev 9251)
@@ -169,7 +169,7 @@
}
# Entry point
-${main::VERSION}='0.1';
+${main::VERSION}='0.2';
print_msg "\n";
print_msg "Copyright Helper v${main::VERSION}\n";
print_msg "Extracts copyright and license information from source code\n\n";
Modified: people/modax/copyright-helper/trunk/parsers/c_cpp.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/c_cpp.pm 2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/parsers/c_cpp.pm 2008-02-02 14:14:48 UTC (rev 9251)
@@ -19,9 +19,9 @@
use utf8;
our @ISA = qw( CHParsers::ParserBase );
-sub __push_comment(\@$$) {
- my ($comments, $comment, $before) = @_;
- if ($comment && !($before =~ /^\s*$/)) {
+sub __push_comment(\@$) {
+ my ($comments, $comment) = @_;
+ if ($comment) {
push @$comments, $comment;
$_[1] = "";
}
@@ -42,6 +42,9 @@
$compbound = 0;
$_ = $2 . "\n";
$more = 1;
+ # End of the compbound comment always means
+ # the end of the comment
+ __push_comment @comments, $comment;
} else {
$comment .= $_;
$more = 0;
@@ -49,22 +52,32 @@
} else {
# Beginning of the comment
if (m%^(.*?)/\*(.*)$%) {
- __push_comment @comments, $comment, $1;
+ # Start of the compbound comment always starts
+ # a new comment. Push old one and continue to
+ # the upper block
+ __push_comment @comments, $comment;
$_ = $2 . "\n";
$compbound = 1;
$more = 1;
} elsif (m%^(.*?)//(.*)$%) {
- __push_comment @comments, $comment, $1;
- $comment .= $2;
+ # Push _previous comment_ only if there is
+ # something (non-commentish before it)
+ my $before = $1;
+ my $after = $2;
+ __push_comment(@comments, $comment) unless($before =~ /^\s*$/);
+ $comment .= $after;
$more = 0;
} else {
- __push_comment @comments, $comment, $_;
+ __push_comment @comments, $comment;
$more = 0;
}
}
} while ($more);
}
- __push_comment @comments, $comment, "force";
+
+ # Push the rest if there anything to push
+ __push_comment @comments, $comment;
+
return @comments;
}
@@ -78,14 +91,14 @@
my @parag;
my $i = 0;
for my $line (@lines) {
- my $p = "";
- if ($line =~ m%^(\s|[/*])*(.*?)(\s|[*/])*$%) {
+ my $p;
+ if ($line =~ m%^([\s*/])*(.*?)([\s*/])*$%) {
$p = $2;
} else {
$p = $line;
}
if ($i == $#lines && $p ne "") {
- push @parags, $p;
+ push @parag, $p;
$p = "";
}
if ($p eq "") {
More information about the pkg-kde-commits
mailing list