rev 9251 - in people/modax/copyright-helper/trunk: . parsers

Sat Feb 2 14:14:49 UTC 2008

Author: modax-guest
Date: 2008-02-02 14:14:48 +0000 (Sat, 02 Feb 2008)
New Revision: 9251

Modified:
   people/modax/copyright-helper/trunk/CHCopyright.pm
   people/modax/copyright-helper/trunk/CHCore.pm
   people/modax/copyright-helper/trunk/copyright-helper.pl
   people/modax/copyright-helper/trunk/parsers/c_cpp.pm
Log:
* v0.2
* More fixes to parsers & stuff
* Locale fixes
* Eliminated all false positives for konversation source.


Modified: people/modax/copyright-helper/trunk/CHCopyright.pm
===================================================================

--- people/modax/copyright-helper/trunk/CHCopyright.pm	2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/CHCopyright.pm	2008-02-02 14:14:48 UTC (rev 9251)
@@ -18,6 +18,35 @@
 use encoding "utf8";
 use utf8;
 
+# Perl does not seem to match (regex) © without this trick in utf8 mode
+our $__copyright_symbol = '©';
+utf8::encode($__copyright_symbol);
+
+our %blacklisted_names = (
+    "jan" => [ "jan", "january" ],
+    "feb" => [ "feb", "february" ],
+    "mar" => [ "mar", "march" ],
+    "apr" => [ "apr", "april" ],
+    "may" => [ "may" ],
+    "jun" => [ "jun", "june" ],
+    "jul" => [ "jul", "july" ],
+    "aug" => [ "aug", "august" ],
+    "sep" => [ "sep", "september" ],
+    "oct" => [ "oct", "october" ],
+    "nov" => [ "nov", "november" ],
+    "dec" => [ "dec", "december" ],
+
+    "mon" => [ "mon", "monday" ],
+    "tue" => [ "tue", "tuesday" ],
+    "wed" => [ "wed", "wednesday" ],
+    "thu" => [ "thu", "thursday" ],
+    "fri" => [ "fri", "friday" ],
+    "sat" => [ "sat", "saturday" ],
+    "sun" => [ "sun", "sunday" ],
+
+    "inc" => [ "inc." ], # Rule out Free Software Foundation, **Inc.**
+);
+
 sub isUcFirst($) {
     my $str = shift;
     return $str eq ucfirst($str);
@@ -28,6 +57,23 @@
     return $str eq lcfirst($str);
 }
 
+sub isWordCharFirst($) {
+    my $word = shift;
+    return $word =~ m/^\w/;
+}
+
+sub isNameBlacklisted($) {
+    our %blacklisted_names;
+    my $name = lc(shift);
+    my $key = substr($name, 0, 3);
+
+    if (exists $blacklisted_names{$key}) {
+        for my $black (@{$blacklisted_names{$key}}) {
+            return 1 if $black eq $name;
+        }
+    }
+}
+
 sub CHCopyright::Author::new {
     my $self = { 
         "names" => [],
@@ -65,12 +111,17 @@
         return 0;
     }
 
+    # Skip blacklisted words
+    if (isNameBlacklisted($name)) {
+        return 0;
+    }
+
     if ($closeName) {
         $self->closeName();
     }
 
-    push @$names , $name;
-    return 1;
+    push @$names, $name;
+    return $self->isNameClosed() + 1;
 }
 
 sub CHCopyright::Author::isNameClosed($) {
@@ -99,12 +150,14 @@
 
 sub CHCopyright::Author::addYear($$) {
     my ($self, $year) = @_;
+    my @years = split(/,/, $year);
+
     if ($self->{autoYears}) {
         # Replace years
-        $self->{years} = [ $year ];
+        $self->{years} = \@years;
         $self->{autoYears} = 0;
     } else {
-        push @{$self->{years}}, $year;
+        push @{$self->{years}}, @years;
     }
 }
 
@@ -128,33 +181,45 @@
 sub CHCopyright::Author::isComplete($) {
     my ($self) = @_;
     return (@{$self->{names}} && $self->{email}) ||
-        (@{$self->{names}} && @{$self->{years}});
+        (scalar(@{$self->{names}}) > 1 && @{$self->{years}});
 }
 
-sub CHCopyright::Author::complete($) {
-    my ($self) = @_;
+sub CHCopyright::Author::calcQualityOfNames($) {
+    my $self = shift;
     my $names = $self->{names};
-    my $otherWords = $self->{otherWords};
 
-    # Count lowercase and uppercase names
+    my $ucount = 0;
+    my $uwcount = 0;
     my $lcount = 0;
-    my $ucount = 0;
     my $dcount = 0;
     for my $n (@$names) {
         if (isUcFirst($n)) {
             $ucount++; # non-word characters go here too
+            $uwcount++ if (isWordCharFirst($n));
             # Increate "dot count" if the name ends with a dot.
             $dcount++ if ($n =~ /\.$/);
         } else {
             $lcount++;
         }
     }
+    return ($ucount, $uwcount, $lcount, $dcount);
+}
 
-    if (!@$names && @$otherWords) {
+sub CHCopyright::Author::complete($$) {
+    my ($self, $strict) = @_;
+    my $names = $self->{names};
+    my $otherWords = $self->{otherWords};
+
+    # Calculate "names" quality
+    my ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
+
+#    print @{$self->{names}}, " -o ", @{$self->{otherWords}}, " -e ", $self->getEmail(), " -y ", $self->getAllYears(), "\n";
+
+    if (!$strict && !@$names && @$otherWords) {
         # Assume the author was a bit lazy and wrote his
         # firstname/lastname/nickname starting with a 
         # lowercase letter. Credit him anyway
-        $self->{names} = $self->{otherWords};
+        $self->{names} = $otherWords;
     } elsif ($lcount > 2 || $lcount >= $ucount) {
         # Then probably this name is not a real name
         $self->{names} = [];
@@ -163,9 +228,28 @@
         # from the end of the names array
         my $i = scalar(@$names) - 1;
         for (; $i >= 0 && isLcFirst($$names[$i]); $i--) {;}
-        my @newArray = splice(@$names, 0, $i+1);
-        $self->{names} = \@newArray;
+        if ($i < scalar(@$names) - 1) {
+            my @removed = splice(@$names, -(scalar(@$names)-$i-1));
+            push (@$otherWords, @removed);
+        }
     }
+    $names = $self->{names};
+    
+    if ($strict) {
+        # If mode is strict, do not accept anything with too many
+        # otherWords 
+        if (scalar(@$otherWords) > scalar(@$names)) {
+            $self->{names} = [];
+        } else {
+            # Recalc names quality
+            ($ucount, $uwcount, $lcount, $dcount) = $self->calcQualityOfNames();
+            # Ensure high quality standards
+            if (!($lcount <= 1 && $dcount <= 1 && $uwcount == 0)) {
+                $self->{names} = [];
+            }
+        }
+    }
+    $names = $self->{names};
 
     if ($ucount == $dcount && !$self->getEmail() &&
         (!@{$self->{years}} || $self->{autoYears})) {
@@ -216,19 +300,29 @@
 
 ########################### AuthorCollection ###################################
 sub CHCopyright::AuthorCollection::new {
+    my ($cls, $strict) = @_;
+    $strict = 0 unless(defined($strict));
+
     my $self = {
         "authors" => [ new CHCopyright::Author ],
-        "lastAddition" => "",
+        "additions" => [],
+        "strictMode" => $strict,
     };
-    return bless($self, $_[0]);
+    return bless($self, $cls);
 }
 
+sub CHCopyright::AuthorCollection::isModeStrict($) {
+    $_[0]->{strictMode};
+}
+
 sub CHCopyright::AuthorCollection::getAuthors($) {
     $_[0]->{authors};
 }
 
 sub CHCopyright::AuthorCollection::getLastAddition($) {
-    return $_[0]->{lastAddition};
+    my $array = $_[0]->{additions};
+    my $last = scalar(@$array) - 1;
+    return ($last > 0) ? $$array[$last] : "";
 }
 
 sub CHCopyright::AuthorCollection::setLastAddition($$) {
@@ -242,7 +336,7 @@
         $self->closeYears();
     }
 
-    $self->{lastAddition} = $newAddition;
+    push @{$self->{additions}}, $newAddition;
 }
 
 sub CHCopyright::AuthorCollection::getAuthorCount($) {
@@ -257,6 +351,17 @@
     return 0;
 }
 
+sub CHCopyright::AuthorCollection::getLastNameClosedOrNonEmpty($$) {
+    my ($self) = @_;
+    my $prev = 0;
+    for my $a (@{$self->{authors}}) {
+        last if ($prev && !$a->{nameClosed} && !@{$a->{names}});
+        $prev = $a;
+    }
+
+    return $prev;
+}
+
 sub CHCopyright::AuthorCollection::getLastAuthor($) {
     return $_[0]->{authors}->[ $_[0]->getAuthorCount()-1 ]; 
 }
@@ -276,14 +381,16 @@
     my $author = $self->getCurrentByField("nameClosed");
 
     $author = $self->addNewAuthor() unless ($author);
-    $author->addName($name);
+    my $ret = $author->addName($name);
 
     $self->setLastAddition("name");
+
+    return $ret;
 }
 
 sub CHCopyright::AuthorCollection::addOtherWord($$) {
     my ($self, $word) = @_;
-    my $author = $self->getCurrentByField("nameClosed");
+    my $author = $self->getLastNameClosedOrNonEmpty();
 
     $author = $self->addNewAuthor() unless ($author);
     $author->addOtherWord($word);
@@ -313,28 +420,65 @@
 sub CHCopyright::AuthorCollection::closeYears($) {
     my ($self) = @_;
     for my $a (@{$self->{authors}}) {
-        return $a->closeYears() if ($a->{years});
+        return $a->closeYears() if (!$a->isYearsClosed() && $a->{years});
     }
 }
 
-sub CHCopyright::AuthorCollection::complete($) { 
+sub CHCopyright::AuthorCollection::complete($) {
+    my $self = shift;
+    my $authors = $self->getAuthors();
+
+    for my $author (@$authors) {
+        $author->complete($self->isModeStrict());
+    }
+
+    # If the last author is without email, check the order
+    # how words were added. We might need to lift emails down
+    # if the email came before name
+    my $author = $self->getLastAuthor();
+    if ($self->getAuthorCount() > 1 && !$author->getEmail()) {
+        my $additions = $self->{additions};
+        my $email = 0;
+        my $name = 0;
+        for (my $i = scalar(@$additions)-1; $i >= 0; $i--) {
+            my $el = $$additions[$i];
+            if ($el eq "email") {
+                $email++;
+            } elsif ($el eq "name") {
+                $name++;
+                last;
+            }
+        }
+
+        if ($email == 1 && $name == 1) {
+            # Then lift email down by 1
+            for (my $i = $self->getAuthorCount()-1; $i >= 1; $i--) {
+                $$authors[$i]->setEmail($$authors[$i-1]->getEmail());
+            }
+            $$authors[0]->setEmail("");
+        }
+    }
 }
 
-sub __process_copyright_statement(\@$$) {
-    my ($results, $p, $uniqOnly) = @_;
+sub __process_copyright_statement(\@$$$) {
+    my ($results, $p, $uniqOnly, $strict) = @_;
 
     # Initialize author collection
-    my $authorCollection = new CHCopyright::AuthorCollection;
+    my $authorCollection = new CHCopyright::AuthorCollection($strict);
     my $is_name = 0;
 
     my @words = split(/[\s]+/, $p);
     for (my $i = 0; $i <= $#words; $i++) {
         my $w = $words[$i];
+        my $added = 0;
 
-        if ($w =~ m/^([0-9-]{2,})\W?$/) {
+#       print "----- $w -- $is_name", "\n";
+
+        if ($w =~ m/^(\d{4}[\d,-]*)\W?$/) {
             # Then probably it's a year
             $authorCollection->addYear($1);
             $is_name = 0;
+            $added = 1;
         } else {
             if ($w =~ m/@/) {
                 $w =~ m/^<?([^>]*)>?/;
@@ -343,20 +487,24 @@
                 # Then probably it's an e-mail address
                 $authorCollection->addEmail($w);
                 $is_name = 0;
-            } elsif (($w =~ m/\w/) && isUcFirst($w)) { # if has any word chars and start with an uppercase
-                $authorCollection->addName($w);
-                $is_name = 1;
+                $added = 1;
+            } elsif (!$is_name && isWordCharFirst($w) && isUcFirst($w)) {
+                # if it starts with an uppercase word char
+                $added = $is_name = $authorCollection->addName($w);
             } elsif ($is_name) {
-                $authorCollection->addName($w);
-            } else {
+                $added = $authorCollection->addName($w);
+                $is_name = 0 if ($added == 2);
+            }
+
+            if (!$added) {
                 $authorCollection->addOtherWord($w);
             }
         }
     }
-
+    
+    $authorCollection->complete();
 nextAuthor:
     for my $author (@{$authorCollection->getAuthors()}) {
-        $author->complete();
         if ($author->isComplete()) {
             if ($uniqOnly) {
                 for my $a (@$results) {
@@ -371,17 +519,19 @@
 
 sub getCopyright(\@$$) {
     my ($results, $p, $separator) = @_;
+    our $__copyright_symbol;
 
-    use bytes; # That's an ugly workaround for perl to recognize ©
-    $separator = qr/copyright|\(C\)\s*|©\s*/i if (!$separator);
+    $separator = qr/copyright|\(C\)\s*|$__copyright_symbol\s*/i if (!$separator);
 
     # Copyright statement usually starts with Copyright or (C)
     my @copyrights = split($separator, $p);
     if (scalar(@copyrights) > 1) {
+        my $i = -1;
         for my $c (@copyrights) {
+            $i++;
             next if ($c =~ m/^\s*$/);
 
-            __process_copyright_statement(@$results, $c, 0);
+            __process_copyright_statement(@$results, $c, 0, ($i == 0));
         }
         return 1;
     } else {
@@ -392,21 +542,23 @@
 sub getFuzzyCopyright(\@$) {
     my ($results, $p) = @_;
 
-    if (($p =~ /\W\d{4}\W/) && ($p =~ /@/)) {
-        my @sentenses = split(/\s*\.\s+/, $p);
+    my @sentenses = split(/\s*\.\s+/, $p);
+    if ($p =~ /(^|[\s.,-])\d{4}([\s.,-]|$)/) {
         for my $s (@sentenses) {
             $s =~ s/copyright\s*|\(C\)\s*//i;
 
-            # Count words starting with a upper case letter
-            my @words = split(/\s+/, $s);
-            my $count = 0;
-            for my $w (@words) {
-                $count++ if (($w =~ m/\w/) && isUcFirst($w));
+            if (($s =~ /@/)) {
+#               # Count words starting with a upper case letter (turned off)
+#               my @words = split(/\s+/, $s);
+#               my $count = 0;
+#               for my $w (@words) {
+#                   $count++ if (isWordCharFirst($w) && isUcFirst($w));
+#               }
+#               if ($count >= 2) {
+                    # OK. Treat it like copyright statement then
+                __process_copyright_statement(@$results, $s, 1, 0);
+#               }
             }
-            if ($count >= 2) {
-                # OK. Treat it like copyright statement then
-                __process_copyright_statement(@$results, $s, 1);
-            }
         }
     }
 }

Modified: people/modax/copyright-helper/trunk/CHCore.pm
===================================================================
--- people/modax/copyright-helper/trunk/CHCore.pm	2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/CHCore.pm	2008-02-02 14:14:48 UTC (rev 9251)
@@ -20,9 +20,14 @@
 
 @CHCore::Directory::ISA = qw( CHCore::File );
 
-#use CHParsers("fileparsers/*");
-#use CHLicenses("licenses/*");
-#use CHCopyright;
+sub __dump_array(\@$) {
+    my ($array, $name) = @_;
+    my $i = 0;
+    for my $a (@$array) {
+        $i++;
+        print "$name $i |", $a, "\n";
+    }
+}
 
 sub CHCore::File::new($$$) {
     my ($cls, $directory, $filename) = @_;
@@ -125,16 +130,13 @@
     # Extract comments from file and split text to paragraphs
     my @comments = $parser->filter($arg);
     my @parags = $parser->toParagraphs(\@comments);
-
     close FILE;
 
-    #my $i = 0;
-    #for my $c (@comments) {
-    #   $i++;
-    #    print "Parag $i |", $c, "\n";
-    #}
-    
-    # Get copyright holders
+    # Array dumps for debugging purposes
+    #__dump_array(@comments, "Comment");
+    #__dump_array(@parags, "Paragraph");
+
+    # Get copyright holders and license information
     my @copyrights = $parser->getCopyrights(\@parags);
     my $license = $parser->getLicense(\@parags);
 

Modified: people/modax/copyright-helper/trunk/copyright-helper.pl
===================================================================
--- people/modax/copyright-helper/trunk/copyright-helper.pl	2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/copyright-helper.pl	2008-02-02 14:14:48 UTC (rev 9251)
@@ -169,7 +169,7 @@
 }
 
 # Entry point
-${main::VERSION}='0.1';
+${main::VERSION}='0.2';
 print_msg "\n";
 print_msg "Copyright Helper v${main::VERSION}\n";
 print_msg "Extracts copyright and license information from source code\n\n";

Modified: people/modax/copyright-helper/trunk/parsers/c_cpp.pm
===================================================================
--- people/modax/copyright-helper/trunk/parsers/c_cpp.pm	2008-02-02 06:23:25 UTC (rev 9250)
+++ people/modax/copyright-helper/trunk/parsers/c_cpp.pm	2008-02-02 14:14:48 UTC (rev 9251)
@@ -19,9 +19,9 @@
 use utf8;
 our @ISA = qw( CHParsers::ParserBase );
 
-sub __push_comment(\@$$) {
-    my ($comments, $comment, $before) = @_;
-    if ($comment && !($before =~ /^\s*$/)) {
+sub __push_comment(\@$) {
+    my ($comments, $comment) = @_;
+    if ($comment) {
         push @$comments, $comment;
         $_[1] = "";
     }
@@ -42,6 +42,9 @@
                     $compbound = 0;
                     $_ = $2 . "\n";
                     $more = 1;
+                    # End of the compbound comment always means
+                    # the end of the comment
+                    __push_comment @comments, $comment;
                 } else {
                     $comment .= $_;
                     $more = 0;
@@ -49,22 +52,32 @@
             } else {
                 # Beginning of the comment
                 if (m%^(.*?)/\*(.*)$%) {
-                    __push_comment @comments, $comment, $1;
+                    # Start of the compbound comment always starts
+                    # a new comment. Push old one and continue to
+                    # the upper block
+                    __push_comment @comments, $comment;
                     $_ = $2 . "\n";
                     $compbound = 1;
                     $more = 1;
                 } elsif (m%^(.*?)//(.*)$%) {
-                    __push_comment @comments, $comment, $1;
-                    $comment .= $2;
+                    # Push _previous comment_ only if there is
+                    # something (non-commentish before it)
+                    my $before = $1;
+                    my $after = $2;
+                    __push_comment(@comments, $comment) unless($before =~ /^\s*$/);
+                    $comment .= $after;
                     $more = 0;
                 } else {
-                    __push_comment @comments, $comment, $_;
+                    __push_comment @comments, $comment;
                     $more = 0;
                 }
             }
         } while ($more);
     }
-    __push_comment @comments, $comment, "force";
+
+    # Push the rest if there anything to push
+    __push_comment @comments, $comment;
+
     return @comments;
 }
 
@@ -78,14 +91,14 @@
         my @parag;
         my $i = 0;
         for my $line (@lines) {
-            my $p = "";
-            if ($line =~ m%^(\s|[/*])*(.*?)(\s|[*/])*$%) {
+            my $p;
+            if ($line =~ m%^([\s*/])*(.*?)([\s*/])*$%) {
                 $p = $2;
             } else {
                 $p = $line;
             }
             if ($i == $#lines && $p ne "") {
-                push @parags, $p;
+                push @parag, $p;
                 $p = "";
             }
             if ($p eq "") {