rev 9402 - in people/modax/copyright-helper/trunk: . licenses parsers
Modestas Vainius
modax-guest at alioth.debian.org
Tue Feb 12 21:09:57 UTC 2008
Author: modax-guest
Date: 2008-02-12 21:09:56 +0000 (Tue, 12 Feb 2008)
New Revision: 9402
Modified:
people/modax/copyright-helper/trunk/CHCopyright.pm
people/modax/copyright-helper/trunk/CHCore.pm
people/modax/copyright-helper/trunk/CHParsers.pm
people/modax/copyright-helper/trunk/copyright-helper.pl
people/modax/copyright-helper/trunk/licenses/gnugpl.pm
people/modax/copyright-helper/trunk/parsers/c_cpp.pm
people/modax/copyright-helper/trunk/parsers/po.pm
Log:
v0.3.9 (v0.4.0 will be out in a few couple of minutes)
* __Tons__ of fixes to copyright parser.
* Copyright parser is now able to parse amarok2 source code without obvious false positives. Noone knows how many holders are not recognized though :)
* Support for --short-copyright-report
* Tons of stuff I don't remember
Modified: people/modax/copyright-helper/trunk/CHCopyright.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCopyright.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHCopyright.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -22,6 +22,12 @@
our $__copyright_symbol = '©';
utf8::encode($__copyright_symbol);
+our $__copyright_separator = qr/copyright\W|(?:^|[^@])authors?:|(?:\(C\)|$__copyright_symbol)\s+(?!\d)/io;
+our $__copyright_match = qr/\(C\)|$__copyright_symbol/io;
+our $__email_regexp = qr/([a-zA-Z0-9_\+-.]+@[a-zA-Z0-9_\+-.]+)/o;
+our $__fuzzy_emailpart_regexp = qr/(?:[a-z0-9_\\+-]+(?:\s+dot\s+|[.\s]))+/io;
+our $__fuzzy_email_regexp = qr/($__fuzzy_emailpart_regexp)\s*at\s+($__fuzzy_emailpart_regexp[a-z]{2,4})/io;
+
our %blacklisted_names = (
"jan" => [ "jan", "january" ],
"feb" => [ "feb", "february" ],
@@ -44,9 +50,19 @@
"sat" => [ "sat", "saturday" ],
"sun" => [ "sun", "sunday" ],
- "inc" => [ "inc." ], # Rule out Free Software Foundation, **Inc.**
+ "mai" => [ "maintainer", "maintainers" ],
+ "dev" => [ "developer", "developers" ],
+ "con" => [ "contributor", "contributors" ],
+ "roo" => [ "root" ],
);
+our %reopen_criteria_words = (
+ "inc" => 1,
+ "inc." => 1,
+ "ltd" => 1,
+ "ltd." => 1,
+);
+
sub isUcFirst($) {
my $str = shift;
return $str eq ucfirst($str);
@@ -62,9 +78,16 @@
return $word =~ m/^\w/;
}
+sub ucCount($) {
+ my $count = 0;
+ map { $count ++ if ($_ eq uc($_)) } split(//, $_[0]);
+ return $count;
+}
+
sub isNameBlacklisted($) {
our %blacklisted_names;
my $name = lc(shift);
+ $name =~ s/\W+$//;
my $key = substr($name, 0, 3);
if (exists $blacklisted_names{$key}) {
@@ -74,6 +97,12 @@
}
}
+sub isReopenCriteria($) {
+ our %reopen_criteria_words;
+ my $name = lc(shift);
+ return exists($reopen_criteria_words{$name});
+}
+
sub CHCopyright::Author::new {
my $self = {
"names" => [],
@@ -87,17 +116,18 @@
return bless($self, $_[0]);
}
-sub CHCopyright::Author::addName($$) {
- my ($self, $name) = @_;
+sub CHCopyright::Author::NAME_ADDED {
+ 1;
+}
+
+sub CHCopyright::Author::NAME_CLOSED {
+ 2;
+}
+
+sub CHCopyright::Author::addName($$$) {
+ my ($self, $name, $closeName) = @_;
my $names = $self->{'names'};
- my $prevName = (scalar(@$names) > 0) ? $names->[ scalar(@$names)-1 ] : "";
- # It's logical to assume that firstname and
- # lastname should start with a uppercase letter.
- # There are exceptions (e.g middle names). So treat
- # anything between two strings starting with
- # uppercase as part of the full name as well.
- my $closeName = $prevName && $name && isUcFirst($name) && isLcFirst($prevName);
if (!$closeName && ($name =~ m/(.*?)[,]+$/)) {
# If the name ends with a comma (,), close it
$name = $1;
@@ -108,20 +138,36 @@
# as separators
if (scalar(@$names) > 0 && ($name =~ m/^(and|by|[&])$/)) {
$self->closeName();
- return 0;
+ return ($self->isNameClosed()) ? CHCopyright::Author::NAME_CLOSED : 0;
}
+ my $prevName = (scalar(@$names) > 0) ? $names->[ scalar(@$names)-1 ] : "";
+ # It's logical to assume that firstname and
+ # lastname should start with a uppercase letter.
+ # There are exceptions (e.g middle names). So treat
+ # anything between two strings starting with
+ # uppercase as part of the full name as well.
+ if (!$closeName) {
+ $closeName = $prevName && $name && isUcFirst($name) && isLcFirst($prevName);
+ }
+
# Skip blacklisted words
if (isNameBlacklisted($name)) {
- return 0;
+ $self->closeName() if ($closeName);
+ return ($self->isNameClosed()) ? CHCopyright::Author::NAME_CLOSED : 0;
}
+ # Add and close name if needed
+ my $ret = 0;
+ if ($name) {
+ push @$names, $name;
+ $ret = CHCopyright::Author::NAME_ADDED;
+ }
if ($closeName) {
$self->closeName();
+ $ret |= $self->isNameClosed() ? CHCopyright::Author::NAME_CLOSED : 0;
}
-
- push @$names, $name;
- return $self->isNameClosed() + 1;
+ return $ret;
}
sub CHCopyright::Author::isNameClosed($) {
@@ -129,9 +175,13 @@
}
sub CHCopyright::Author::closeName($) {
- $_[0]->{nameClosed} = 1 if ($_[0]->{names})
+ $_[0]->{nameClosed} = 1 if (@{$_[0]->{names}})
}
+sub CHCopyright::Author::openName($) {
+ $_[0]->{nameClosed} = 0 if (@{$_[0]->{names}})
+}
+
sub CHCopyright::Author::getFullName($) {
return join(" ", @{$_[0]->{names}})
}
@@ -192,6 +242,7 @@
my $uwcount = 0;
my $lcount = 0;
my $dcount = 0;
+
for my $n (@$names) {
if (isUcFirst($n)) {
$ucount++; # non-word characters go here too
@@ -202,13 +253,31 @@
$lcount++;
}
}
+# ($q{'ufirst'}, $q{'uwfirst'}, $q{'lfirst'}, $q{'dotend'}, $q{'uwletters'}) = ($ucount, $uwcount, $lcount, $dcount);
return ($ucount, $uwcount, $lcount, $dcount);
}
-sub CHCopyright::Author::complete($$) {
- my ($self, $strict) = @_;
+sub CHCopyright::Author::COMPLETED_SUCCESS {
+ 0;
+}
+
+sub CHCopyright::Author::COMPLETED_LOWER {
+ 1;
+}
+
+sub CHCopyright::Author::COMPLETED_OTHERWORDS {
+ 2;
+}
+
+sub CHCopyright::Author::COMPLETED_LOWQUALITY {
+ 3;
+}
+
+sub CHCopyright::Author::complete($$$) {
+ my ($self, $strict, $prevCompletedCode) = @_;
my $names = $self->{names};
my $otherWords = $self->{otherWords};
+ my $ret = $self->COMPLETED_SUCCESS;
# Calculate "names" quality
my ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
@@ -216,13 +285,23 @@
# print @{$self->{names}}, " -o ", @{$self->{otherWords}}, " -e ", $self->getEmail(), " -y ", $self->getAllYears(), "\n";
if (!$strict && !@$names && @$otherWords) {
+ my $quality = 1;
+ for my $o (@$otherWords) {
+ if (!($o =~ /(?:^|[\w.,;])$/o)) {
+ $quality = 0;
+ last;
+ }
+ }
# Assume the author was a bit lazy and wrote his
# firstname/lastname/nickname starting with a
# lowercase letter. Credit him anyway
- $self->{names} = $otherWords;
+ if ($quality) {
+ $self->{names} = $otherWords;
+ }
} elsif ($lcount > 2 || $lcount >= $ucount) {
# Then probably this name is not a real name
$self->{names} = [];
+ $ret = $self->COMPLETED_LOWER;
} else {
# Drop words starting with a lowercase letter
# from the end of the names array
@@ -240,22 +319,46 @@
# otherWords
if (scalar(@$otherWords) > scalar(@$names)) {
$self->{names} = [];
+ $ret = $self->COMPLETED_OTHERWORDS;
} else {
# Recalc names quality
($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
- # Ensure high quality standards
- if (!($lcount <= 1 && $dcount <= 1 && $uwcount == 0)) {
+
+ # Ensure high quality standards of the name
+ if ($lcount > 1 || $dcount > 1 || $uwcount != $ucount) {
+ # No more than one word starting with lowercase
+ # No more than one word ending with a dot
+ # Names starting with a non-word character are not allowed
$self->{names} = [];
- }
+ $ret = $self->COMPLETED_LOWQUALITY;
+ } #elsif ($ucount > 0) {
+ # # Try counting uppercase characters in a word. This way
+ # # we might rule out variable names quite reliably.
+ # my $uccount = 0;
+ # map { $uccount += ucCount($_) } @$names;
+ # if (($uccount * 100 / $ucount) > 125) {
+ # $self->{names} = [];
+ # $ret = $self->COMPLETED_LOWQUALITY;
+ # }
+ #}
}
}
$names = $self->{names};
- if ($ucount == $dcount && !$self->getEmail() &&
- (!@{$self->{years}} || $self->{autoYears})) {
- # Something is wrong, drop this name
- $self->{names} = [];
+ if (!$self->getEmail() && (!@{$self->{years}} || $self->{autoYears})) {
+ if ($ucount == $dcount) {
+ # Something is wrong, drop this name
+ $self->{names} = [];
+ $ret = $self->COMPLETED_LOWQUALITY;
+ } elsif ($prevCompletedCode == CHCopyright::Author::COMPLETED_LOWER) {
+ # Assume that the sentence started which is unrelated to
+ # copyright in any way
+ $self->{names} = [];
+ $ret = $prevCompletedCode;
+ }
}
+
+ return $ret;
}
sub CHCopyright::Author::initializeBasedOn($$) {
@@ -299,6 +402,17 @@
}
}
+sub CHCopyright::Author::isCorporation($) {
+ my $self = shift;
+ my $names = $self->{names};
+
+ for my $name (@$names) {
+ return 1 if ($name =~ m/corp(ation)?/i || $name =~ m/ltd/i);
+ }
+
+ return 0;
+}
+
########################### AuthorCollection ###################################
sub CHCopyright::AuthorCollection::new {
my ($cls, $strict) = @_;
@@ -308,6 +422,8 @@
"authors" => [ new CHCopyright::Author ],
"additions" => [],
"strictMode" => $strict,
+ "lastNamedAuthor" => 0,
+ "emailFromIndex" => 0,
};
return bless($self, $cls);
}
@@ -323,16 +439,16 @@
sub CHCopyright::AuthorCollection::getLastAddition($) {
my $array = $_[0]->{additions};
my $last = scalar(@$array) - 1;
- return ($last > 0) ? $$array[$last] : "";
+ return ($last >= 0) ? $$array[$last] : "";
}
sub CHCopyright::AuthorCollection::setLastAddition($$) {
my ($self, $newAddition) = @_;
my $oldAddition = $self->getLastAddition();
- if ($oldAddition eq "name" && $newAddition ne "name") {
+ if ($oldAddition eq "name" && !($newAddition =~ m/^name/)) {
my $author = $self->getCurrentByField("nameClosed");
- $author->closeName() if ($author);
+ $self->closeAuthorName($author);
} elsif ($oldAddition eq "year" && $newAddition ne "year") {
$self->closeYears();
}
@@ -367,6 +483,18 @@
return $_[0]->{authors}->[ $_[0]->getAuthorCount()-1 ];
}
+sub CHCopyright::AuthorCollection::reopenPreviousAuthor($) {
+ my $self = shift;
+ my $author = $self->{lastNamedAuthor};
+ if ($author && $self->getLastAddition() eq "nameClose") {
+ $author->openName();
+ my $additions = $self->{additions};
+ pop @$additions;
+ push @$additions, "name";
+ }
+ return $author && !$author->isNameClosed();
+}
+
sub CHCopyright::AuthorCollection::addNewAuthor($$) {
my ($self, $forceInit) = @_;
my $newAuthor = new CHCopyright::Author;
@@ -377,18 +505,43 @@
return $newAuthor;
}
-sub CHCopyright::AuthorCollection::addName($$) {
- my ($self, $name) = @_;
+sub CHCopyright::AuthorCollection::getAuthorForName($) {
+ my $self = shift;
my $author = $self->getCurrentByField("nameClosed");
$author = $self->addNewAuthor() unless ($author);
- my $ret = $author->addName($name);
+ return $author;
+}
- $self->setLastAddition("name");
+sub CHCopyright::AuthorCollection::addName($$$) {
+ my ($self, $name, $closeName) = @_;
+ my $author = $self->getAuthorForName();
+ my $ret = $author->addName($name, $closeName);
+
+ $self->setLastAddition(($ret & CHCopyright::Author::NAME_CLOSED) ? "nameClose" : "name");
+ $self->{lastNamedAuthor} = $author;
+
return $ret;
}
+sub CHCopyright::AuthorCollection::closeAuthorName {
+ my ($self, $author) = @_;
+ my $closed = 0;
+
+ if ($author) {
+ if (!$author->isNameClosed()) {
+ $author->closeName();
+ $closed = $author->isNameClosed();
+ }
+ }
+ if ($closed) {
+ $self->setLastAddition("nameClose");
+ }
+ return $closed;
+}
+
+
sub CHCopyright::AuthorCollection::addOtherWord($$) {
my ($self, $word) = @_;
my $author = $self->getLastNameClosedOrNonEmpty();
@@ -397,12 +550,52 @@
$author->addOtherWord($word);
}
+sub CHCopyright::AuthorCollection::addCopyrightSymbol($$) {
+ my ($self, $symbol) = @_;
+
+ # Order of the two calls below are important because the
+ # 2nd one might add a new author!
+
+# my $author = $self->getAuthorForName();
+ my $author = $self->{lastNamedAuthor};
+ my $closed = $self->closeAuthorName($author);
+ if ($closed) {
+ $self->{emailFromIndex} = $self->getAuthorCount();
+ } else {
+ $self->{emailFromIndex} = $self->getAuthorCount() - 1;
+ }
+# $self->closeEmail() if ($closed);
+ $self->addOtherWord($symbol);
+
+ return $closed;
+}
+
+sub CHCopyright::AuthorCollection::closeEmail($) {
+ my ($self) = @_;
+ $self->{emailFromIndex} = $self->getAuthorCount();
+}
+
sub CHCopyright::AuthorCollection::addEmail($$) {
my ($self, $email) = @_;
- my $author = $self->getCurrentByField("email");
+ my $author = 0;
+ my $authors = $self->getAuthors();
+ my $authorCount = $self->getAuthorCount();
- $author = $self->addNewAuthor() unless ($author);
+ for (my $i = $self->{emailFromIndex}; $i < $authorCount; $i++) {
+ my $a = $$authors[$i];
+ if (!$a->{email} && !$a->isCorporation()) {
+ $author = $a;
+ last;
+ }
+ }
+
+ if (!$author || $author->{email}) {
+ # Do not create a new author if corporation matched
+ $author = $self->addNewAuthor();
+ }
+
$author->setEmail($email);
+ $self->{emailFromIndex}++;
$self->setLastAddition("email");
}
@@ -410,11 +603,20 @@
sub CHCopyright::AuthorCollection::addYear($$) {
my ($self, $year) = @_;
+ my $added = 0;
for my $a (@{$self->{authors}}) {
- $a->addYear($year) if (!$a->isYearsClosed());
+ if (!$a->isYearsClosed()) {
+ $a->addYear($year);
+ $added++;
+ }
}
+ if (!$added) {
+ my $author = $self->addNewAuthor();
+ $author->addYear($year);
+ }
+
$self->setLastAddition("year");
}
@@ -428,36 +630,55 @@
sub CHCopyright::AuthorCollection::complete($) {
my $self = shift;
my $authors = $self->getAuthors();
+ my $completeCode = 0;
for my $author (@$authors) {
- $author->complete($self->isModeStrict());
+ $completeCode = $author->complete($self->isModeStrict(), $completeCode);
}
# If the last author is without email, check the order
# how words were added. We might need to lift emails down
# if the email came before name
my $author = $self->getLastAuthor();
- if ($self->getAuthorCount() > 1 && !$author->getEmail()) {
+ $self->closeAuthorName($self->{lastNamedAuthor});
+
+ if ($self->getAuthorCount() > 1 && !$author->getEmail() && !$author->isCorporation()) {
my $additions = $self->{additions};
my $email = 0;
my $name = 0;
- for (my $i = scalar(@$additions)-1; $i >= 0; $i--) {
+ my $author_index = $self->getAuthorCount() - 1;
+
+ for (my $i = scalar(@$additions)-1; $i >= 0 && $author_index > 0; $i--) {
my $el = $$additions[$i];
if ($el eq "email") {
$email++;
- } elsif ($el eq "name") {
+ } elsif ($el eq "nameClose") {
$name++;
- last;
+ if ($email >= $name) {
+ # Then lift email down by 1
+ $$authors[$author_index]->setEmail(
+ $$authors[$author_index-1]->getEmail()
+ );
+ $author_index--;
+ $$authors[$author_index]->setEmail("");
+ } else {
+ last;
+ }
}
}
- if ($email == 1 && $name == 1) {
- # Then lift email down by 1
- for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
- $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
- }
- $$authors[0]->setEmail("");
- }
+
+ # Rebuild authors array wiping out incomplete authors
+# $authors = [];
+# map { push @$authors, $_ if $_->isComplete() } (@{$self->getAuthors()});
+# $self->{authors} = $authors;
+
+# if ($email == 1 && $name == 1 && !$self->getLastAuthor()->isCorporation()) {
+# for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
+# $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
+# }
+# $$authors[0]->setEmail("");
+# }
}
}
@@ -468,37 +689,55 @@
my $authorCollection = new CHCopyright::AuthorCollection($strict);
my $is_name = 0;
- my @words = split(/[\s]+/, $p);
- for (my $i = 0; $i <= $#words; $i++) {
- my $w = $words[$i];
- my $added = 0;
+ my @lines = split(/\n+/, $p);
+ for my $line (@lines) {
+ # Do not permit the word license
+ # Such copyright statement is probably invalid
+ next if ($line =~ /licen[sc]es?/i);
-# print "----- $w -- $is_name", "\n";
+ my @words = split(/\s+/, $line);
+ for (my $i = 0; $i <= $#words; $i++) {
+ my $w = $words[$i];
+ my $added = 0;
- if ($w =~ m/^(\d{4}[\d,-]*)\W?$/) {
- # Then probably it's a year
- $authorCollection->addYear($1);
- $is_name = 0;
- $added = 1;
- } else {
- if ($w =~ m/@/) {
- $w =~ m/^<?([^>]*)>?/;
- $w = $1;
+# print "----- $w -- $is_name -- $strict", "\n";
- # Then probably it's an e-mail address
- $authorCollection->addEmail($w);
+ if (($w =~ m/^((\d{4})[\d,-]*)\W?$/) &&
+ ($2 > 1970) && ($2 < 2050)) {
+
+ # Then probably it's a year
+ $authorCollection->addYear($1);
$is_name = 0;
$added = 1;
- } elsif (!$is_name && isWordCharFirst($w) && isUcFirst($w)) {
- # if it starts with an uppercase word char
- $added = $is_name = $authorCollection->addName($w);
- } elsif ($is_name) {
- $added = $authorCollection->addName($w);
- $is_name = 0 if ($added == 2);
- }
+ } else {
+ if ($w =~ $__email_regexp) {
+ $w = $1;
+ $w =~ m/^<?([^>]*)>?/;
+ $w = $1;
- if (!$added) {
- $authorCollection->addOtherWord($w);
+ # Then probably it's an e-mail address
+ $authorCollection->addEmail($w);
+ $is_name = 0;
+ $added = 1;
+ } else {
+ if (isReopenCriteria($w)) {
+ $is_name = $authorCollection->reopenPreviousAuthor();
+ }
+ # Special handling for (C) and ©. They close email etc.
+ if (lc($w) eq "(c)" || $w eq "©") {
+ $is_name = !$authorCollection->addCopyrightSymbol($w) && $is_name;
+ $added = 1;
+ } elsif ($is_name || (!$is_name && isWordCharFirst($w) && isUcFirst($w))) {
+ # if is_name or starts with an uppercase word char
+ my $ret = $authorCollection->addName($w, $i == $#words);
+ $added = $ret & CHCopyright::Author::NAME_ADDED;
+ $is_name = !($ret & CHCopyright::Author::NAME_CLOSED);
+ }
+
+ if (!$added) {
+ $authorCollection->addOtherWord($w);
+ }
+ }
}
}
}
@@ -518,21 +757,56 @@
# push (@$results, @{$authorCollection->getAuthors()});
}
+sub __preprocessStatement($) {
+ our $__fuzzy_emailpart_regexp;
+ our $__fuzzy_email_regexp;
+ my $in = shift;
+
+ # Handle cases when year range has whitespaces, e.g. 2005 - 2007
+ # We need to remove them
+ my $out = $in;
+ $out =~ s/(\d{4})\s*-\s*(\d{4})/$1-$2/g;
+
+ # Handle cases when e-mail address is expressed as e.g.
+ # "localpart at domain.com" or "localpart at domain dot com"
+ # Convert them to normal form, i.e. localpart at domain.com
+ if ($out =~ /<($__fuzzy_email_regexp)>/io) {
+ do {
+ my $full = $1;
+ my $localpart = $2;
+ my $domainpart = $3;
+ $localpart =~ s/([a-z0-9_\\+-]+)(?:\s+dot\s+|[.\s])/$1./ig;
+ $localpart =~ s/.$//o;
+ $domainpart =~ s/([a-z0-9_\\+-]+)(?:\s+dot\s+|[.\s])/$1./ig;
+ $out =~ s/\Q$full\E/${localpart}\@${domainpart}/;
+ } while ($out =~/\G<($__fuzzy_email_regexp)>/io);
+ }
+
+ # Capitalize "All Rights Reserved" statement so it can be captured
+ # properly.
+ $out =~ s/all rights reserved/All Rights Reserved/ig;
+
+ return $out;
+}
+
sub getCopyright(\@$$) {
my ($results, $p, $separator) = @_;
- our $__copyright_symbol;
+ our $__copyright_match;
+ our $__copyright_separator;
+ our $__email_regexp;
- $separator = qr/copyright|\(C\)\s*|$__copyright_symbol\s*/i if (!$separator);
+ $separator = $__copyright_separator if (!$separator);
- # Copyright statement usually starts with Copyright or (C)
- my @copyrights = split($separator, $p);
- if (scalar(@copyrights) > 1) {
+ my @copyrights = split($__copyright_separator, $p);
+ if (scalar(@copyrights) > 1 || $p =~ $__copyright_match) {
my $i = -1;
for my $c (@copyrights) {
$i++;
next if ($c =~ m/^\s*$/);
- __process_copyright_statement(@$results, $c, 0, ($i == 0));
+ __process_copyright_statement(@$results,
+ __preprocessStatement($c),
+ 0, ($i == 0));
}
return 1;
} else {
@@ -542,24 +816,34 @@
sub getFuzzyCopyright(\@$) {
my ($results, $p) = @_;
+ our $__email_regexp;
+ my $ok = 0;
- my @sentenses = split(/\s*\.\s+/, $p);
- if ($p =~ /(^|[\s.,-])\d{4}([\s.,-]|$)/) {
- for my $s (@sentenses) {
- $s =~ s/copyright\s*|\(C\)\s*//i;
+ if ($p =~ /(?:^|[\s.,-])(\d{4})(?:[\s.,-]|$)/ && $1 > 1950 && $1 < 2050) {
+ $ok = 1;
+ }
- if (($s =~ /@/)) {
-# # Count words starting with a upper case letter (turned off)
-# my @words = split(/\s+/, $s);
-# my $count = 0;
-# for my $w (@words) {
-# $count++ if (isWordCharFirst($w) && isUcFirst($w));
-# }
-# if ($count >= 2) {
- # OK. Treat it like copyright statement then
- __process_copyright_statement(@$results, $s, 1, 0);
-# }
+ my @sentenses = split(/\n|\w{4,}\.\s+/, $p);
+ for my $s (@sentenses) {
+ $s =~ s/(copyright|author)\S*\s*|\(C\)\s*//i;
+ $s = __preprocessStatement($s);
+ if ($s =~ $__email_regexp) {
+ if (!$ok) {
+ # Count words starting with an upper case letter
+ # Ignore words having more than one (i.e. first)
+ # uppercase
+ my @words = split(/\s+/, $s);
+ my $count = 0;
+ for my $w (@words) {
+ $count++ if (isWordCharFirst($w) && isUcFirst($w) && ucCount($w) == 1);
+ }
+ $ok = 2 if ($count >= 2);
}
+
+ if ($ok) {
+ # OK. Treat it like copyright statement then
+ __process_copyright_statement(@$results, $s, 1, 0);
+ }
}
}
}
Modified: people/modax/copyright-helper/trunk/CHCore.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCore.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHCore.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -133,8 +133,8 @@
close FILE;
# Array dumps for debugging purposes
- #__dump_array(@comments, "Comment");
- #__dump_array(@parags, "Paragraph");
+# __dump_array(@comments, "Comment");
+# __dump_array(@parags, "Paragraph");
# Get copyright holders and license information
my @copyrights = $parser->getCopyrights(\@parags);
@@ -195,7 +195,6 @@
} else {
return "";
}
-
}
sub CHCore::File::printAll($) {
@@ -213,7 +212,7 @@
print $self->toStringLicense();
print "\n";
} else {
- print " Copyright and license information not available or not applicable";
+ print "Copyright-And-License-N/A\n\n";
}
}
@@ -753,6 +752,21 @@
return $str;
}
+sub CHCore::CopyrightSummary::toStringShort($) {
+ my $self = shift;
+ my $years = $self->getYears();
+ my $emails = $self->getEmails();
+
+ my $str = "Copyright: © ";
+ $str .= join(", ", @$years);
+ $str .= " " . $self->getKeyName();
+
+ for my $email (@$emails) {
+ $str .= " <$email>";
+ }
+ return $str;
+}
+
sub __handle_indent_args {
my ($lvl1, $lvl2, $lvl3) = @_;
Modified: people/modax/copyright-helper/trunk/CHParsers.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHParsers.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHParsers.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -190,8 +190,8 @@
foreach $_ (@text) {
my @lines = split(/\n/);
- my $line = join(" ", @lines);
- $line =~ s/\s+/ /g;
+ map s/\s+/ /g, @lines;
+ my $line = join("\n", @lines);
push @parags, $line;
}
return @parags;
@@ -206,16 +206,18 @@
}
if ($fuzzy >= 0) {
- # Try finding a completely "full copyright", i.e. with
- # a complete one with an e-mail address
- for $_ (@copyrights) {
- if ($_->getEmail()) {
- $fuzzy = -1;
- last;
+ if ($fuzzy == 0) { # auto
+ # Try finding a completely "full copyright", i.e.
+ # a complete one with an e-mail address
+ for $_ (@copyrights) {
+ if ($_->getEmail()) {
+ $fuzzy = -1;
+ last;
+ }
}
}
- if ($fuzzy == 0) {
+ if ($fuzzy >= 0) {
# Try fuzzy search
for $_ (@$text) {
CHCopyright::getFuzzyCopyright(\@copyrights, $_);
Modified: people/modax/copyright-helper/trunk/copyright-helper.pl
===================================================================
--- people/modax/copyright-helper/trunk/copyright-helper.pl 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/copyright-helper.pl 2008-02-12 21:09:56 UTC (rev 9402)
@@ -101,24 +101,54 @@
$fileobj->printAll();
}
-sub copyright_report($) {
+sub copyright_prepare_for_report($) {
my $fileobj = shift;
print_msg "Calculating copyright summaries ... ";
my ($summaries, $cr_count) = $fileobj->getCopyrightSummaries();
print_msg "done.\n";
+ my @summaries = sort { -($a->getTimesCredited() <=> $b->getTimesCredited()) } @$summaries;
+
+ for my $summary (@summaries) {
+ $summary->cleanup();
+ }
+
+ return (\@summaries, $cr_count);
+}
+
+sub copyright_short_report(\@$) {
+ my ($summaries, $cr_count) = @_;
+ my $count = scalar(@$summaries);
+
if ($opt_show_headers) {
print_header_sep;
+ print_header "Short Copyright Holders Report";
+ print_header "($count holders out of $cr_count copyright statements)";
+ print_header_sep;
+ print "\n";
+ }
+
+ for my $summary (@$summaries) {
+ print $summary->toStringShort();
+ print "\n";
+ }
+ print "\n";
+}
+
+sub copyright_report(\@$) {
+ my ($summaries, $cr_count) = @_;
+ my $count = scalar(@$summaries);
+
+ if ($opt_show_headers) {
+ print_header_sep;
print_header "Copyright Holders Report";
- print_header "(Out of $cr_count copyright statements)";
+ print_header "($count holders out of $cr_count copyright statements)";
print_header_sep;
print "\n";
}
- my @summaries = sort { -($a->getTimesCredited() <=> $b->getTimesCredited()) } @$summaries;
- for my $summary (@summaries) {
- $summary->cleanup();
+ for my $summary (@$summaries) {
print $summary->toString();
print $summary->toStringLicenses();
print $summary->toStringFiles();
@@ -143,6 +173,8 @@
for my $arg (@ARGV) {
if ($arg eq "-c" || $arg eq "--copyright" || $arg eq "--copyright-report") {
push @reports, "copyright";
+ } elsif ($arg eq "-sc" || $arg eq "--short-copyright" || $arg eq "--short-copyright-report") {
+ push @reports, "copyright-short";
} elsif ($arg eq "-f" || $arg eq "--file" || $arg eq "--file-report") {
push @reports, "file";
} elsif ($arg eq "-l" || $arg eq "--license" || $arg eq "--license-report") {
@@ -160,11 +192,18 @@
} else {
my $fileobj = scan($filename);
return -1 if (!$fileobj);
+ my ($cr_summaries, $cr_count) = (undef, 0);
# Show reports
for my $report (@reports) {
if ($report eq "copyright") {
- copyright_report($fileobj);
+ ($cr_summaries, $cr_count) = copyright_prepare_for_report($fileobj)
+ unless defined($cr_summaries);
+ copyright_report(@$cr_summaries, $cr_count);
+ } elsif ($report eq "copyright-short") {
+ ($cr_summaries, $cr_count) = copyright_prepare_for_report($fileobj)
+ unless defined($cr_summaries);
+ copyright_short_report(@$cr_summaries, $cr_count);
} elsif ($report eq "file") {
file_report($fileobj);
} elsif ($report eq "license") {
@@ -178,12 +217,12 @@
}
# Entry point
-${main::VERSION}='0.3.3';
+${main::VERSION}='0.3.9';
print_msg "\n";
print_msg "Copyright Helper v${main::VERSION}\n";
print_msg "Extracts copyright and license information from source code\n\n";
-Getopt::Long::Configure ("pass_through");
+Getopt::Long::Configure (qw(pass_through no_bundling));
GetOptions(
"headers!" => \$opt_show_headers,
"max-header-length=i" => \$max_header,
Modified: people/modax/copyright-helper/trunk/licenses/gnugpl.pm
===================================================================
--- people/modax/copyright-helper/trunk/licenses/gnugpl.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/licenses/gnugpl.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -26,15 +26,15 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
- if ($p =~ m/redistribute.*modify.*GNU General Public License as published(.*)$/) {
+ if ($p =~ m/redistribute.*modify.*GNU General Public License as published(.*)/so) {
my $version = $1;
my $license = new CHLicenses::gnugpl;
# Check for version and later clause (optional)
- if ($version =~ m/version ([\d.]*[\d]+) of the License(.*)$/i) {
+ if ($version =~ m/version ([\d.]*[\d]+) of the License(.*)/soi) {
$license->{"version"} = $1;
$version = $2;
- $license->{"later"} = 1 if ($version =~ m/any later version/);
+ $license->{"later"} = 1 if ($version =~ m/any\slater\sversion/);
}
$license->{"foundInText"} = $p;
return $license;
@@ -52,9 +52,9 @@
for my $p (@$text) {
if (!$m_title) {
# text is post-splitting to paragraphs
- if ($m_title = ($p =~ m/^\s*GNU GENERAL PUBLIC LICENSE(.*)$/)) {
+ if ($m_title = ($p =~ m/^\s*GNU GENERAL PUBLIC LICENSE(.*)$/so)) {
my $verstr = $1;
- if ($verstr =~ m/^\s*Version (\d), (.+?)\s*$/) {
+ if ($verstr =~ m/^\s*Version (\d), (.+?)\s*$/o) {
$m_version = $1;
my $date = $2;
if (($m_version eq '2' && $date eq "June 1991") ||
Modified: people/modax/copyright-helper/trunk/parsers/c_cpp.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/c_cpp.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/parsers/c_cpp.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -22,6 +22,7 @@
sub __push_comment(\@$) {
my ($comments, $comment) = @_;
if ($comment) {
+ chomp($comment);
push @$comments, $comment;
$_[1] = "";
}
@@ -65,7 +66,7 @@
my $before = $1;
my $after = $2;
__push_comment(@comments, $comment) unless($before =~ /^\s*$/);
- $comment .= $after;
+ $comment .= $after . "\n";
$more = 0;
} else {
__push_comment @comments, $comment;
@@ -92,7 +93,7 @@
my $i = 0;
for my $line (@lines) {
my $p;
- if ($line =~ m%^([\s*/])*(.*?)([\s*/])*$%) {
+ if ($line =~ m%^([\s*/])*(.*?)([\s*/])*\s*$%) {
$p = $2;
} else {
$p = $line;
@@ -104,7 +105,7 @@
if ($p eq "") {
# Treat like the end of the paragraph
if (scalar(@parag) > 0) {
- push @parags, join(" ", @parag);
+ push @parags, join("\n", @parag);
@parag = ();
}
} else {
Modified: people/modax/copyright-helper/trunk/parsers/po.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/po.pm 2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/parsers/po.pm 2008-02-12 21:09:56 UTC (rev 9402)
@@ -23,7 +23,7 @@
sub __push_comment(\@\@) {
my ($comments, $comment) = @_;
if (scalar(@$comment) > 0) {
- push @$comments, join(" ", @$comment);
+ push @$comments, join("\n", @$comment);
@{$_[1]} = ();
}
}
@@ -48,6 +48,24 @@
return @comments;
}
+sub toParagraphs($\@) {
+ my $self = shift;
+ my @text = @{shift()};
+ my @parags = ();
+
+ foreach $_ (@text) {
+ my @lines = split(/\n/);
+ map s/\s+/ /g, @lines;
+ push @parags, @lines;
+ }
+ return @parags;
+}
+
+sub getCopyrights($\@) {
+ # Fuzzy search is forced
+ return $_[0]->getStandardCopyrights($_[1], 1);
+}
+
sub extensions {
return qw( po );
}
More information about the pkg-kde-commits
mailing list