rev 9402 - in people/modax/copyright-helper/trunk: . licenses parsers

Modestas Vainius modax-guest at alioth.debian.org
Tue Feb 12 21:09:57 UTC 2008


Author: modax-guest
Date: 2008-02-12 21:09:56 +0000 (Tue, 12 Feb 2008)
New Revision: 9402

Modified:
   people/modax/copyright-helper/trunk/CHCopyright.pm
   people/modax/copyright-helper/trunk/CHCore.pm
   people/modax/copyright-helper/trunk/CHParsers.pm
   people/modax/copyright-helper/trunk/copyright-helper.pl
   people/modax/copyright-helper/trunk/licenses/gnugpl.pm
   people/modax/copyright-helper/trunk/parsers/c_cpp.pm
   people/modax/copyright-helper/trunk/parsers/po.pm
Log:
v0.3.9 (v0.4.0 will be out in a few couple of minutes)
* __Tons__ of fixes to copyright parser.
* Copyright parser is now able to parse amarok2 source code without obvious false positives. Noone knows how many holders are not recognized though :)
* Support for --short-copyright-report
* Tons of stuff I don't remember


Modified: people/modax/copyright-helper/trunk/CHCopyright.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCopyright.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHCopyright.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -22,6 +22,12 @@
 our $__copyright_symbol = '©';
 utf8::encode($__copyright_symbol);
 
+our $__copyright_separator = qr/copyright\W|(?:^|[^@])authors?:|(?:\(C\)|$__copyright_symbol)\s+(?!\d)/io;
+our $__copyright_match = qr/\(C\)|$__copyright_symbol/io;
+our $__email_regexp = qr/([a-zA-Z0-9_\+-.]+@[a-zA-Z0-9_\+-.]+)/o;
+our $__fuzzy_emailpart_regexp = qr/(?:[a-z0-9_\\+-]+(?:\s+dot\s+|[.\s]))+/io;
+our $__fuzzy_email_regexp = qr/($__fuzzy_emailpart_regexp)\s*at\s+($__fuzzy_emailpart_regexp[a-z]{2,4})/io;
+
 our %blacklisted_names = (
     "jan" => [ "jan", "january" ],
     "feb" => [ "feb", "february" ],
@@ -44,9 +50,19 @@
     "sat" => [ "sat", "saturday" ],
     "sun" => [ "sun", "sunday" ],
 
-    "inc" => [ "inc." ], # Rule out Free Software Foundation, **Inc.**
+    "mai" => [ "maintainer", "maintainers" ],
+    "dev" => [ "developer", "developers" ],
+    "con" => [ "contributor", "contributors" ],
+    "roo" => [ "root" ],
 );
 
+our %reopen_criteria_words = (
+    "inc" => 1,
+    "inc." => 1,
+    "ltd" => 1,
+    "ltd." => 1,
+);
+
 sub isUcFirst($) {
     my $str = shift;
     return $str eq ucfirst($str);
@@ -62,9 +78,16 @@
     return $word =~ m/^\w/;
 }
 
+sub ucCount($) {
+    my $count = 0;
+    map { $count ++ if ($_ eq uc($_)) } split(//, $_[0]);
+    return $count;
+}
+
 sub isNameBlacklisted($) {
     our %blacklisted_names;
     my $name = lc(shift);
+    $name =~ s/\W+$//;
     my $key = substr($name, 0, 3);
 
     if (exists $blacklisted_names{$key}) {
@@ -74,6 +97,12 @@
     }
 }
 
+sub isReopenCriteria($) {
+    our %reopen_criteria_words;
+    my $name = lc(shift);
+    return exists($reopen_criteria_words{$name});
+}
+
 sub CHCopyright::Author::new {
     my $self = { 
         "names" => [],
@@ -87,17 +116,18 @@
     return bless($self, $_[0]);
 }
 
-sub CHCopyright::Author::addName($$) {
-    my ($self, $name) = @_;
+sub CHCopyright::Author::NAME_ADDED {
+    1;
+}
+
+sub CHCopyright::Author::NAME_CLOSED {
+    2;
+}
+
+sub CHCopyright::Author::addName($$$) {
+    my ($self, $name, $closeName) = @_;
     my $names = $self->{'names'};
-    my $prevName = (scalar(@$names) > 0) ? $names->[ scalar(@$names)-1 ] : "";
 
-    # It's logical to assume that firstname and
-    # lastname should start with a uppercase letter.
-    # There are exceptions (e.g middle names). So treat
-    # anything between two strings starting with 
-    # uppercase as part of the full name as well.
-    my $closeName = $prevName && $name && isUcFirst($name) && isLcFirst($prevName);
     if (!$closeName && ($name =~ m/(.*?)[,]+$/)) {
         # If the name ends with a comma (,), close it
         $name = $1;
@@ -108,20 +138,36 @@
     # as separators
     if (scalar(@$names) > 0 && ($name =~ m/^(and|by|[&])$/)) {
         $self->closeName();
-        return 0;
+        return ($self->isNameClosed()) ? CHCopyright::Author::NAME_CLOSED : 0;
     }
 
+    my $prevName = (scalar(@$names) > 0) ? $names->[ scalar(@$names)-1 ] : "";
+    # It's logical to assume that firstname and
+    # lastname should start with a uppercase letter.
+    # There are exceptions (e.g middle names). So treat
+    # anything between two strings starting with 
+    # uppercase as part of the full name as well.
+    if (!$closeName) {
+        $closeName =  $prevName && $name && isUcFirst($name) && isLcFirst($prevName);
+    }
+
     # Skip blacklisted words
     if (isNameBlacklisted($name)) {
-        return 0;
+        $self->closeName() if ($closeName);
+        return ($self->isNameClosed()) ? CHCopyright::Author::NAME_CLOSED : 0;
     }
 
+    # Add and close name if needed
+    my $ret = 0;
+    if ($name) {
+        push @$names, $name;
+        $ret = CHCopyright::Author::NAME_ADDED;
+    }
     if ($closeName) {
         $self->closeName();
+        $ret |= $self->isNameClosed() ? CHCopyright::Author::NAME_CLOSED : 0;
     }
-
-    push @$names, $name;
-    return $self->isNameClosed() + 1;
+    return $ret;
 }
 
 sub CHCopyright::Author::isNameClosed($) {
@@ -129,9 +175,13 @@
 }
 
 sub CHCopyright::Author::closeName($) {
-    $_[0]->{nameClosed} = 1 if ($_[0]->{names})
+    $_[0]->{nameClosed} = 1 if (@{$_[0]->{names}})
 }
 
+sub CHCopyright::Author::openName($) {
+    $_[0]->{nameClosed} = 0 if (@{$_[0]->{names}})
+}
+
 sub CHCopyright::Author::getFullName($) {
     return join(" ", @{$_[0]->{names}})
 }
@@ -192,6 +242,7 @@
     my $uwcount = 0;
     my $lcount = 0;
     my $dcount = 0;
+
     for my $n (@$names) {
         if (isUcFirst($n)) {
             $ucount++; # non-word characters go here too
@@ -202,13 +253,31 @@
             $lcount++;
         }
     }
+#    ($q{'ufirst'}, $q{'uwfirst'}, $q{'lfirst'}, $q{'dotend'}, $q{'uwletters'}) = ($ucount, $uwcount, $lcount, $dcount);
     return ($ucount, $uwcount, $lcount, $dcount);
 }
 
-sub CHCopyright::Author::complete($$) {
-    my ($self, $strict) = @_;
+sub CHCopyright::Author::COMPLETED_SUCCESS {
+    0;
+}
+
+sub CHCopyright::Author::COMPLETED_LOWER {
+    1;
+}
+
+sub CHCopyright::Author::COMPLETED_OTHERWORDS {
+    2;
+}
+
+sub CHCopyright::Author::COMPLETED_LOWQUALITY  {
+    3;
+}
+
+sub CHCopyright::Author::complete($$$) {
+    my ($self, $strict, $prevCompletedCode) = @_;
     my $names = $self->{names};
     my $otherWords = $self->{otherWords};
+    my $ret = $self->COMPLETED_SUCCESS;
 
     # Calculate "names" quality
     my ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
@@ -216,13 +285,23 @@
 #    print @{$self->{names}}, " -o ", @{$self->{otherWords}}, " -e ", $self->getEmail(), " -y ", $self->getAllYears(), "\n";
 
     if (!$strict && !@$names && @$otherWords) {
+        my $quality = 1;
+        for my $o (@$otherWords) {
+            if (!($o =~ /(?:^|[\w.,;])$/o)) {
+                $quality = 0;
+                last;
+            }
+        }
         # Assume the author was a bit lazy and wrote his
         # firstname/lastname/nickname starting with a 
         # lowercase letter. Credit him anyway
-        $self->{names} = $otherWords;
+        if ($quality) {
+            $self->{names} = $otherWords;
+        }
     } elsif ($lcount > 2 || $lcount >= $ucount) {
         # Then probably this name is not a real name
         $self->{names} = [];
+        $ret = $self->COMPLETED_LOWER;
     } else {
         # Drop words starting with a lowercase letter
         # from the end of the names array
@@ -240,22 +319,46 @@
         # otherWords 
         if (scalar(@$otherWords) > scalar(@$names)) {
             $self->{names} = [];
+            $ret = $self->COMPLETED_OTHERWORDS;
         } else {
             # Recalc names quality
             ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
-            # Ensure high quality standards
-            if (!($lcount <= 1 && $dcount <= 1 && $uwcount == 0)) {
+
+            # Ensure high quality standards of the name
+            if ($lcount > 1 || $dcount > 1 || $uwcount != $ucount) {
+                # No more than one word starting with lowercase
+                # No more than one word ending with a dot
+                # Names starting with a non-word character are not allowed
                 $self->{names} = [];
-            }
+                $ret = $self->COMPLETED_LOWQUALITY;
+            } #elsif ($ucount > 0) {
+              #  # Try counting uppercase characters in a word. This way
+              #  # we might rule out variable names quite reliably.
+              #  my $uccount = 0;
+              #  map { $uccount += ucCount($_) } @$names;
+              #  if (($uccount * 100 / $ucount) > 125) {
+              #      $self->{names} = [];
+              #      $ret = $self->COMPLETED_LOWQUALITY;
+              #  }
+            #}
         }
     }
     $names = $self->{names};
 
-    if ($ucount == $dcount && !$self->getEmail() &&
-        (!@{$self->{years}} || $self->{autoYears})) {
-        # Something is wrong, drop this name
-        $self->{names} = [];
+    if (!$self->getEmail() && (!@{$self->{years}} || $self->{autoYears})) {
+        if ($ucount == $dcount) {
+            # Something is wrong, drop this name
+            $self->{names} = [];
+            $ret = $self->COMPLETED_LOWQUALITY;
+        } elsif ($prevCompletedCode == CHCopyright::Author::COMPLETED_LOWER) {
+            # Assume that the sentence started which is unrelated to
+            # copyright in any way
+            $self->{names} = [];
+            $ret = $prevCompletedCode;
+        }
     }
+
+    return $ret;
 }
 
 sub CHCopyright::Author::initializeBasedOn($$) {
@@ -299,6 +402,17 @@
     }
 }
 
+sub CHCopyright::Author::isCorporation($) {
+    my $self = shift;
+    my $names = $self->{names};
+
+    for my $name (@$names) {
+        return 1 if ($name =~ m/corp(ation)?/i || $name =~ m/ltd/i);
+    }
+
+    return 0;
+}
+
 ########################### AuthorCollection ###################################
 sub CHCopyright::AuthorCollection::new {
     my ($cls, $strict) = @_;
@@ -308,6 +422,8 @@
         "authors" => [ new CHCopyright::Author ],
         "additions" => [],
         "strictMode" => $strict,
+        "lastNamedAuthor" => 0,
+        "emailFromIndex" => 0,
     };
     return bless($self, $cls);
 }
@@ -323,16 +439,16 @@
 sub CHCopyright::AuthorCollection::getLastAddition($) {
     my $array = $_[0]->{additions};
     my $last = scalar(@$array) - 1;
-    return ($last > 0) ? $$array[$last] : "";
+    return ($last >= 0) ? $$array[$last] : "";
 }
 
 sub CHCopyright::AuthorCollection::setLastAddition($$) {
     my ($self, $newAddition) = @_;
     my $oldAddition = $self->getLastAddition();
 
-    if ($oldAddition eq "name" && $newAddition ne "name") {
+    if ($oldAddition eq "name" && !($newAddition =~ m/^name/)) {
         my $author = $self->getCurrentByField("nameClosed");
-        $author->closeName() if ($author);
+        $self->closeAuthorName($author);
     } elsif ($oldAddition eq "year" && $newAddition ne "year") {
         $self->closeYears();
     }
@@ -367,6 +483,18 @@
     return $_[0]->{authors}->[ $_[0]->getAuthorCount()-1 ]; 
 }
 
+sub CHCopyright::AuthorCollection::reopenPreviousAuthor($) {
+    my $self = shift;
+    my $author = $self->{lastNamedAuthor};
+    if ($author && $self->getLastAddition() eq "nameClose") {
+        $author->openName();
+        my $additions = $self->{additions};
+        pop @$additions;
+        push @$additions, "name";
+    }
+    return $author && !$author->isNameClosed();
+}
+
 sub CHCopyright::AuthorCollection::addNewAuthor($$) {
     my ($self, $forceInit) = @_;
     my $newAuthor = new CHCopyright::Author;
@@ -377,18 +505,43 @@
     return $newAuthor;
 }
 
-sub CHCopyright::AuthorCollection::addName($$) {
-    my ($self, $name) = @_;
+sub CHCopyright::AuthorCollection::getAuthorForName($) {
+    my $self = shift;
     my $author = $self->getCurrentByField("nameClosed");
 
     $author = $self->addNewAuthor() unless ($author);
-    my $ret = $author->addName($name);
+    return $author;
+}
 
-    $self->setLastAddition("name");
+sub CHCopyright::AuthorCollection::addName($$$) {
+    my ($self, $name, $closeName) = @_;
+    my $author = $self->getAuthorForName();
 
+    my $ret = $author->addName($name, $closeName);
+
+    $self->setLastAddition(($ret & CHCopyright::Author::NAME_CLOSED) ? "nameClose" : "name");
+    $self->{lastNamedAuthor} = $author;
+
     return $ret;
 }
 
+sub CHCopyright::AuthorCollection::closeAuthorName {
+    my ($self, $author) = @_;
+    my $closed = 0;
+
+    if ($author) {
+        if (!$author->isNameClosed()) {
+            $author->closeName();
+            $closed = $author->isNameClosed();
+        }
+    }
+    if ($closed) {
+        $self->setLastAddition("nameClose");
+    }
+    return $closed;
+}
+
+
 sub CHCopyright::AuthorCollection::addOtherWord($$) {
     my ($self, $word) = @_;
     my $author = $self->getLastNameClosedOrNonEmpty();
@@ -397,12 +550,52 @@
     $author->addOtherWord($word);
 }
 
+sub CHCopyright::AuthorCollection::addCopyrightSymbol($$) {
+    my ($self, $symbol) = @_;
+
+    # Order of the two calls below are important because the
+    # 2nd one might add a new author!
+
+#    my $author = $self->getAuthorForName();
+    my $author = $self->{lastNamedAuthor};
+    my $closed = $self->closeAuthorName($author);
+    if ($closed) {
+        $self->{emailFromIndex} = $self->getAuthorCount();
+    } else {
+        $self->{emailFromIndex} = $self->getAuthorCount() - 1;
+    }
+#    $self->closeEmail() if ($closed);
+    $self->addOtherWord($symbol);
+
+    return $closed;
+}
+
+sub CHCopyright::AuthorCollection::closeEmail($) {
+    my ($self) = @_;
+    $self->{emailFromIndex} = $self->getAuthorCount();
+}
+
 sub CHCopyright::AuthorCollection::addEmail($$) {
     my ($self, $email) = @_;
-    my $author = $self->getCurrentByField("email");
+    my $author = 0;
+    my $authors = $self->getAuthors();
+    my $authorCount = $self->getAuthorCount();
 
-    $author = $self->addNewAuthor() unless ($author);
+    for (my $i = $self->{emailFromIndex}; $i < $authorCount; $i++) {
+        my $a = $$authors[$i];
+        if (!$a->{email} && !$a->isCorporation()) {
+            $author = $a;
+            last;
+        }
+    }
+
+    if (!$author || $author->{email}) {
+        # Do not create a new author if corporation matched
+        $author = $self->addNewAuthor();
+    }
+
     $author->setEmail($email);
+    $self->{emailFromIndex}++;
 
     $self->setLastAddition("email");
 }
@@ -410,11 +603,20 @@
 
 sub CHCopyright::AuthorCollection::addYear($$) {
     my ($self, $year) = @_;
+    my $added = 0;
 
     for my $a (@{$self->{authors}}) {
-        $a->addYear($year) if (!$a->isYearsClosed());
+        if (!$a->isYearsClosed()) {
+            $a->addYear($year);
+            $added++;
+        }
     }
 
+    if (!$added) {
+        my $author = $self->addNewAuthor();
+        $author->addYear($year);
+    }
+
     $self->setLastAddition("year");
 }
 
@@ -428,36 +630,55 @@
 sub CHCopyright::AuthorCollection::complete($) {
     my $self = shift;
     my $authors = $self->getAuthors();
+    my $completeCode = 0;
 
     for my $author (@$authors) {
-        $author->complete($self->isModeStrict());
+        $completeCode = $author->complete($self->isModeStrict(), $completeCode);
     }
 
     # If the last author is without email, check the order
     # how words were added. We might need to lift emails down
     # if the email came before name
     my $author = $self->getLastAuthor();
-    if ($self->getAuthorCount() > 1 && !$author->getEmail()) {
+    $self->closeAuthorName($self->{lastNamedAuthor});
+
+    if ($self->getAuthorCount() > 1 && !$author->getEmail() && !$author->isCorporation()) {
         my $additions = $self->{additions};
         my $email = 0;
         my $name = 0;
-        for (my $i = scalar(@$additions)-1; $i >= 0; $i--) {
+        my $author_index = $self->getAuthorCount() - 1;
+
+        for (my $i = scalar(@$additions)-1; $i >= 0 && $author_index > 0; $i--) {
             my $el = $$additions[$i];
             if ($el eq "email") {
                 $email++;
-            } elsif ($el eq "name") {
+            } elsif ($el eq "nameClose") {
                 $name++;
-                last;
+                if ($email >= $name) {
+                    # Then lift email down by 1
+                    $$authors[$author_index]->setEmail(
+                        $$authors[$author_index-1]->getEmail()
+                    );
+                    $author_index--;
+                    $$authors[$author_index]->setEmail("");
+                } else {
+                    last;
+                }
             }
         }
 
-        if ($email == 1 && $name == 1) {
-            # Then lift email down by 1
-            for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
-                $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
-            }
-            $$authors[0]->setEmail("");
-        }
+    
+    # Rebuild authors array wiping out incomplete authors
+#    $authors = [];
+#    map { push @$authors, $_ if $_->isComplete() } (@{$self->getAuthors()});
+#    $self->{authors} = $authors;
+
+#        if ($email == 1 && $name == 1 && !$self->getLastAuthor()->isCorporation()) {
+#            for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
+#                $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
+#            }
+#            $$authors[0]->setEmail("");
+#        }
     }
 }
 
@@ -468,37 +689,55 @@
     my $authorCollection = new CHCopyright::AuthorCollection($strict);
     my $is_name = 0;
 
-    my @words = split(/[\s]+/, $p);
-    for (my $i = 0; $i <= $#words; $i++) {
-        my $w = $words[$i];
-        my $added = 0;
+    my @lines = split(/\n+/, $p);
+    for my $line (@lines) {
+        # Do not permit the word license
+        # Such copyright statement is probably invalid
+        next if ($line =~ /licen[sc]es?/i);
 
-#       print "----- $w -- $is_name", "\n";
+        my @words = split(/\s+/, $line);
+        for (my $i = 0; $i <= $#words; $i++) {
+            my $w = $words[$i];
+            my $added = 0;
 
-        if ($w =~ m/^(\d{4}[\d,-]*)\W?$/) {
-            # Then probably it's a year
-            $authorCollection->addYear($1);
-            $is_name = 0;
-            $added = 1;
-        } else {
-            if ($w =~ m/@/) {
-                $w =~ m/^<?([^>]*)>?/;
-                $w = $1;
+#            print "----- $w -- $is_name -- $strict", "\n";
 
-                # Then probably it's an e-mail address
-                $authorCollection->addEmail($w);
+            if (($w =~ m/^((\d{4})[\d,-]*)\W?$/) &&
+                ($2 > 1970) && ($2 < 2050)) {
+
+                # Then probably it's a year
+                $authorCollection->addYear($1);
                 $is_name = 0;
                 $added = 1;
-            } elsif (!$is_name && isWordCharFirst($w) && isUcFirst($w)) {
-                # if it starts with an uppercase word char
-                $added = $is_name = $authorCollection->addName($w);
-            } elsif ($is_name) {
-                $added = $authorCollection->addName($w);
-                $is_name = 0 if ($added == 2);
-            }
+            } else {
+                if ($w =~ $__email_regexp) {
+                    $w = $1;
+                    $w =~ m/^<?([^>]*)>?/;
+                    $w = $1;
 
-            if (!$added) {
-                $authorCollection->addOtherWord($w);
+                    # Then probably it's an e-mail address
+                    $authorCollection->addEmail($w);
+                    $is_name = 0;
+                    $added = 1;
+                } else {
+                    if (isReopenCriteria($w)) {
+                        $is_name = $authorCollection->reopenPreviousAuthor();
+                    }
+                    # Special handling for (C) and ©. They close email etc.
+                    if (lc($w) eq "(c)" || $w eq "©") {
+                        $is_name = !$authorCollection->addCopyrightSymbol($w) && $is_name;
+                        $added = 1;
+                    } elsif ($is_name || (!$is_name && isWordCharFirst($w) && isUcFirst($w))) {
+                        # if is_name or starts with an uppercase word char
+                        my $ret = $authorCollection->addName($w, $i == $#words);
+                        $added = $ret & CHCopyright::Author::NAME_ADDED;
+                        $is_name = !($ret & CHCopyright::Author::NAME_CLOSED);
+                    }
+
+                    if (!$added) {
+                        $authorCollection->addOtherWord($w);
+                    }
+                }
             }
         }
     }
@@ -518,21 +757,56 @@
 #    push (@$results, @{$authorCollection->getAuthors()});
 }
 
+sub __preprocessStatement($) {
+    our $__fuzzy_emailpart_regexp;
+    our $__fuzzy_email_regexp;
+    my $in = shift;
+
+    # Handle cases when year range has whitespaces, e.g. 2005 - 2007
+    # We need to remove them
+    my $out = $in;
+    $out =~ s/(\d{4})\s*-\s*(\d{4})/$1-$2/g;
+
+    # Handle cases when e-mail address is expressed as e.g.
+    # "localpart at domain.com" or "localpart at domain dot com"
+    # Convert them to normal form, i.e. localpart at domain.com
+    if ($out =~ /<($__fuzzy_email_regexp)>/io) {
+        do {
+            my $full = $1;
+            my $localpart = $2;
+            my $domainpart = $3;
+            $localpart =~ s/([a-z0-9_\\+-]+)(?:\s+dot\s+|[.\s])/$1./ig;
+            $localpart =~ s/.$//o;
+            $domainpart =~ s/([a-z0-9_\\+-]+)(?:\s+dot\s+|[.\s])/$1./ig;
+            $out =~ s/\Q$full\E/${localpart}\@${domainpart}/;
+        } while ($out =~/\G<($__fuzzy_email_regexp)>/io);
+    }
+
+    # Capitalize "All Rights Reserved" statement so it can be captured
+    # properly.
+    $out =~ s/all rights reserved/All Rights Reserved/ig;
+
+    return $out;
+}
+
 sub getCopyright(\@$$) {
     my ($results, $p, $separator) = @_;
-    our $__copyright_symbol;
+    our $__copyright_match;
+    our $__copyright_separator;
+    our $__email_regexp;
 
-    $separator = qr/copyright|\(C\)\s*|$__copyright_symbol\s*/i if (!$separator);
+    $separator = $__copyright_separator if (!$separator);
 
-    # Copyright statement usually starts with Copyright or (C)
-    my @copyrights = split($separator, $p);
-    if (scalar(@copyrights) > 1) {
+    my @copyrights = split($__copyright_separator, $p);
+    if (scalar(@copyrights) > 1 || $p =~ $__copyright_match) {
         my $i = -1;
         for my $c (@copyrights) {
             $i++;
             next if ($c =~ m/^\s*$/);
 
-            __process_copyright_statement(@$results, $c, 0, ($i == 0));
+            __process_copyright_statement(@$results,
+                __preprocessStatement($c),
+                0, ($i == 0));
         }
         return 1;
     } else {
@@ -542,24 +816,34 @@
 
 sub getFuzzyCopyright(\@$) {
     my ($results, $p) = @_;
+    our $__email_regexp;
+    my $ok = 0;
 
-    my @sentenses = split(/\s*\.\s+/, $p);
-    if ($p =~ /(^|[\s.,-])\d{4}([\s.,-]|$)/) {
-        for my $s (@sentenses) {
-            $s =~ s/copyright\s*|\(C\)\s*//i;
+    if ($p =~ /(?:^|[\s.,-])(\d{4})(?:[\s.,-]|$)/ && $1 > 1950 && $1 < 2050) {
+        $ok = 1;
+    }
 
-            if (($s =~ /@/)) {
-#               # Count words starting with a upper case letter (turned off)
-#               my @words = split(/\s+/, $s);
-#               my $count = 0;
-#               for my $w (@words) {
-#                   $count++ if (isWordCharFirst($w) && isUcFirst($w));
-#               }
-#               if ($count >= 2) {
-                    # OK. Treat it like copyright statement then
-                __process_copyright_statement(@$results, $s, 1, 0);
-#               }
+    my @sentenses = split(/\n|\w{4,}\.\s+/, $p);
+    for my $s (@sentenses) {
+        $s =~ s/(copyright|author)\S*\s*|\(C\)\s*//i;
+        $s = __preprocessStatement($s);
+        if ($s =~ $__email_regexp) {
+            if (!$ok) {
+                # Count words starting with an upper case letter
+                # Ignore words having more than one (i.e. first)
+                # uppercase
+                my @words = split(/\s+/, $s);
+                my $count = 0;
+                for my $w (@words) {
+                    $count++ if (isWordCharFirst($w) && isUcFirst($w) && ucCount($w) == 1);
+                }
+                $ok = 2 if ($count >= 2);
             }
+
+            if ($ok) {
+                 # OK. Treat it like copyright statement then
+                 __process_copyright_statement(@$results, $s, 1, 0);
+            }
         }
     }
 }

Modified: people/modax/copyright-helper/trunk/CHCore.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCore.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHCore.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -133,8 +133,8 @@
     close FILE;
 
     # Array dumps for debugging purposes
-    #__dump_array(@comments, "Comment");
-    #__dump_array(@parags, "Paragraph");
+#    __dump_array(@comments, "Comment");
+#    __dump_array(@parags, "Paragraph");
 
     # Get copyright holders and license information
     my @copyrights = $parser->getCopyrights(\@parags);
@@ -195,7 +195,6 @@
     } else {
         return "";
     }
-    
 }
 
 sub CHCore::File::printAll($) {
@@ -213,7 +212,7 @@
         print $self->toStringLicense();
         print "\n";
     } else {
-        print " Copyright and license information not available or not applicable";
+        print "Copyright-And-License-N/A\n\n";
     }
 }
 
@@ -753,6 +752,21 @@
     return $str;
 }
 
+sub CHCore::CopyrightSummary::toStringShort($) {
+    my $self = shift;
+    my $years = $self->getYears();
+    my $emails = $self->getEmails();
+
+    my $str = "Copyright: © ";
+    $str .= join(", ", @$years);
+    $str .= " " . $self->getKeyName();
+
+    for my $email (@$emails) {
+        $str .= " <$email>";
+    }
+    return $str;
+}
+
 sub __handle_indent_args {
     my ($lvl1, $lvl2, $lvl3) = @_;
 

Modified: people/modax/copyright-helper/trunk/CHParsers.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHParsers.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/CHParsers.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -190,8 +190,8 @@
 
     foreach $_ (@text) {
         my @lines = split(/\n/);
-        my $line = join(" ", @lines);
-        $line =~ s/\s+/ /g;
+        map s/\s+/ /g, @lines;
+        my $line = join("\n", @lines);
         push @parags, $line;
     }
     return @parags;
@@ -206,16 +206,18 @@
     }
 
     if ($fuzzy >= 0) {
-        # Try finding a completely "full copyright", i.e. with
-        # a complete one with an e-mail address
-        for $_ (@copyrights) {
-            if ($_->getEmail()) {
-                $fuzzy = -1;
-                last;
+        if ($fuzzy == 0) { # auto
+            # Try finding a completely "full copyright", i.e.
+            # a complete one with an e-mail address
+            for $_ (@copyrights) {
+                if ($_->getEmail()) {
+                    $fuzzy = -1;
+                    last;
+                }
             }
         }
 
-        if ($fuzzy == 0) {
+        if ($fuzzy >= 0) {
             # Try fuzzy search
             for $_ (@$text) {
                 CHCopyright::getFuzzyCopyright(\@copyrights, $_);

Modified: people/modax/copyright-helper/trunk/copyright-helper.pl
===================================================================
--- people/modax/copyright-helper/trunk/copyright-helper.pl	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/copyright-helper.pl	2008-02-12 21:09:56 UTC (rev 9402)
@@ -101,24 +101,54 @@
     $fileobj->printAll();
 }
 
-sub copyright_report($) {
+sub copyright_prepare_for_report($) {
     my $fileobj = shift;
 
     print_msg "Calculating copyright summaries ... ";
     my ($summaries, $cr_count) = $fileobj->getCopyrightSummaries();
     print_msg "done.\n";
 
+    my @summaries = sort { -($a->getTimesCredited() <=>  $b->getTimesCredited()) } @$summaries;
+
+    for my $summary (@summaries) {
+        $summary->cleanup();
+    }
+
+    return (\@summaries, $cr_count);
+}
+
+sub copyright_short_report(\@$) {
+    my ($summaries, $cr_count) = @_;
+    my $count = scalar(@$summaries);
+
     if ($opt_show_headers) {
         print_header_sep;
+        print_header "Short Copyright Holders Report";
+        print_header "($count holders out of $cr_count copyright statements)";
+        print_header_sep;
+        print "\n";
+    }
+
+    for my $summary (@$summaries) {
+        print $summary->toStringShort();
+        print "\n";
+    }
+    print "\n";
+}
+
+sub copyright_report(\@$) {
+    my ($summaries, $cr_count) = @_;
+    my $count = scalar(@$summaries);
+
+    if ($opt_show_headers) {
+        print_header_sep;
         print_header "Copyright Holders Report";
-        print_header "(Out of $cr_count copyright statements)";
+        print_header "($count holders out of $cr_count copyright statements)";
         print_header_sep;
         print "\n";
     }
 
-    my @summaries = sort { -($a->getTimesCredited() <=>  $b->getTimesCredited()) } @$summaries;
-    for my $summary (@summaries) {
-        $summary->cleanup();
+    for my $summary (@$summaries) {
         print $summary->toString();
         print $summary->toStringLicenses();
         print $summary->toStringFiles();
@@ -143,6 +173,8 @@
     for my $arg (@ARGV) {
         if ($arg eq "-c" || $arg eq "--copyright" || $arg eq "--copyright-report") {
             push @reports, "copyright";
+        } elsif ($arg eq "-sc" || $arg eq "--short-copyright" || $arg eq "--short-copyright-report") {
+            push @reports, "copyright-short";
         } elsif ($arg eq "-f" || $arg eq "--file" || $arg eq "--file-report") {
             push @reports, "file";
         } elsif ($arg eq "-l" || $arg eq "--license" || $arg eq "--license-report") {
@@ -160,11 +192,18 @@
     } else {
         my $fileobj = scan($filename);
         return -1 if (!$fileobj);
+        my ($cr_summaries, $cr_count) = (undef, 0);
 
         # Show reports
         for my $report (@reports) {
             if ($report eq "copyright") {
-                copyright_report($fileobj);
+                ($cr_summaries, $cr_count) = copyright_prepare_for_report($fileobj)
+                    unless defined($cr_summaries);
+                copyright_report(@$cr_summaries, $cr_count);
+            } elsif ($report eq "copyright-short") {
+                ($cr_summaries, $cr_count) = copyright_prepare_for_report($fileobj)
+                    unless defined($cr_summaries);
+                copyright_short_report(@$cr_summaries, $cr_count);
             } elsif ($report eq "file") {            
                 file_report($fileobj);
             } elsif ($report eq "license") {
@@ -178,12 +217,12 @@
 }
 
 # Entry point
-${main::VERSION}='0.3.3';
+${main::VERSION}='0.3.9';
 print_msg "\n";
 print_msg "Copyright Helper v${main::VERSION}\n";
 print_msg "Extracts copyright and license information from source code\n\n";
 
-Getopt::Long::Configure ("pass_through");
+Getopt::Long::Configure (qw(pass_through no_bundling));
 GetOptions(
     "headers!" => \$opt_show_headers, 
     "max-header-length=i" => \$max_header,

Modified: people/modax/copyright-helper/trunk/licenses/gnugpl.pm
===================================================================
--- people/modax/copyright-helper/trunk/licenses/gnugpl.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/licenses/gnugpl.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -26,15 +26,15 @@
         # it under the terms of the GNU General Public License as published by
         # the Free Software Foundation; either version 2 of the License, or
         # (at your option) any later version.
-        if ($p =~ m/redistribute.*modify.*GNU General Public License as published(.*)$/) {
+        if ($p =~ m/redistribute.*modify.*GNU General Public License as published(.*)/so) {
             my $version = $1;
             my $license = new CHLicenses::gnugpl;
 
             # Check for version and later clause (optional)
-            if ($version =~ m/version ([\d.]*[\d]+) of the License(.*)$/i) {
+            if ($version =~ m/version ([\d.]*[\d]+) of the License(.*)/soi) {
                 $license->{"version"} = $1;
                 $version = $2;
-                $license->{"later"} = 1 if ($version =~ m/any later version/);
+                $license->{"later"} = 1 if ($version =~ m/any\slater\sversion/);
             }
             $license->{"foundInText"} = $p;
             return $license;
@@ -52,9 +52,9 @@
     for my $p (@$text) {
         if (!$m_title) {
             # text is post-splitting to paragraphs
-            if ($m_title = ($p =~ m/^\s*GNU GENERAL PUBLIC LICENSE(.*)$/)) {
+            if ($m_title = ($p =~ m/^\s*GNU GENERAL PUBLIC LICENSE(.*)$/so)) {
                 my $verstr = $1;
-                if ($verstr =~ m/^\s*Version (\d), (.+?)\s*$/) {
+                if ($verstr =~ m/^\s*Version (\d), (.+?)\s*$/o) {
                     $m_version = $1;
                     my $date = $2;
                     if (($m_version eq '2' && $date eq "June 1991") ||

Modified: people/modax/copyright-helper/trunk/parsers/c_cpp.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/c_cpp.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/parsers/c_cpp.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -22,6 +22,7 @@
 sub __push_comment(\@$) {
     my ($comments, $comment) = @_;
     if ($comment) {
+        chomp($comment);
         push @$comments, $comment;
         $_[1] = "";
     }
@@ -65,7 +66,7 @@
                     my $before = $1;
                     my $after = $2;
                     __push_comment(@comments, $comment) unless($before =~ /^\s*$/);
-                    $comment .= $after;
+                    $comment .= $after . "\n";
                     $more = 0;
                 } else {
                     __push_comment @comments, $comment;
@@ -92,7 +93,7 @@
         my $i = 0;
         for my $line (@lines) {
             my $p;
-            if ($line =~ m%^([\s*/])*(.*?)([\s*/])*$%) {
+            if ($line =~ m%^([\s*/])*(.*?)([\s*/])*\s*$%) {
                 $p = $2;
             } else {
                 $p = $line;
@@ -104,7 +105,7 @@
             if ($p eq "") {
                 # Treat like the end of the paragraph
                 if (scalar(@parag) > 0) {
-                    push @parags, join(" ", @parag);
+                    push @parags, join("\n", @parag);
                     @parag = ();
                 }
             } else {

Modified: people/modax/copyright-helper/trunk/parsers/po.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/po.pm	2008-02-11 23:54:38 UTC (rev 9401)
+++ people/modax/copyright-helper/trunk/parsers/po.pm	2008-02-12 21:09:56 UTC (rev 9402)
@@ -23,7 +23,7 @@
 sub __push_comment(\@\@) {
     my ($comments, $comment) = @_;
     if (scalar(@$comment) > 0) {
-        push @$comments, join(" ", @$comment);
+        push @$comments, join("\n", @$comment);
         @{$_[1]} = ();
     }
 }
@@ -48,6 +48,24 @@
     return @comments;
 }
 
+sub toParagraphs($\@) {
+    my $self = shift;
+    my @text = @{shift()};
+    my @parags = ();
+
+    foreach $_ (@text) {
+        my @lines = split(/\n/);
+        map s/\s+/ /g, @lines;
+        push @parags, @lines;
+    }
+    return @parags;
+}
+
+sub getCopyrights($\@) {
+    # Fuzzy search is forced
+    return $_[0]->getStandardCopyrights($_[1], 1);
+}
+
 sub extensions { 
     return qw( po );
 }




More information about the pkg-kde-commits mailing list