[med-svn] [roary] 01/07: New upstream version 3.11.0+dfsg

Sascha Steinbiss satta at debian.org
Wed Oct 11 08:51:48 UTC 2017


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository roary.

commit 622661bfc8d75dd459351df08730a74ea63c24ed
Author: Sascha Steinbiss <satta at debian.org>
Date:   Wed Oct 11 10:17:28 2017 +0200

    New upstream version 3.11.0+dfsg
---
 .travis.yml                                        | 11 ++-
 dist.ini                                           |  4 +-
 install_dependencies.sh                            | 12 ++--
 lib/Bio/Roary/CommandLine/Roary.pm                 |  2 +-
 lib/Bio/Roary/External/CheckTools.pm               |  8 +--
 lib/Bio/Roary/MergeMultifastaAlignments.pm         |  2 +-
 lib/Bio/Roary/PrepareInputFiles.pm                 | 81 +++++++++++++++-------
 lib/Bio/Roary/ReformatInputGFFs.pm                 | 72 +++++++++++++++----
 t/Bio/Roary/ReformatInputGFFs.t                    | 12 ++--
 t/data/expected_core_gene_alignment_core0.66.aln   |  2 +-
 t/data/overall_gene_presence_absence.csv           | 42 +++++------
 .../reformat_input_gffs/expected_fixed_query_2.gff | 10 +--
 12 files changed, 168 insertions(+), 90 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 8896bae..5789ffe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,20 +8,19 @@ addons:
 cache:
   directories:
   - build/parallel-20160722
-  - build/parallel-20130922
+  - build/parallel-20170822
   - build/bedtools2
-  - build/cd-hit-v4.6.6-2016-0711
+  - build/cd-hit-v4.6.8-2017-0621
   - build/prank-msa-master
-  - build/ncbi-blast-2.4.0+
+  - build/ncbi-blast-2.6.0+
   - build/mcl-14-137
   - build/fasttree
 perl:
   - "5.14"
-  - "5.20"
-  - "5.24"
+  - "5.26"
 env:
   - PARALLEL_VERSION=20160722
-  - PARALLEL_VERSION=20130922
+  - PARALLEL_VERSION=20170822
 install:
   - "source ./install_dependencies.sh"
 script: "ROARY_FULL_TESTS=1 dzil test"
diff --git a/dist.ini b/dist.ini
index 4d8b414..e673cd7 100644
--- a/dist.ini
+++ b/dist.ini
@@ -1,9 +1,9 @@
 name    = Bio-Roary
-version = 3.9.1
 author  = Andrew J. Page <ap13 at sanger.ac.uk>
 license = GPL_3
 copyright_holder = Wellcome Trust Sanger Institute
 copyright_year   = 2013
+version = 3.11.0
 main_module = lib/Bio/Roary.pm
 
 [MetaResources]
@@ -12,7 +12,7 @@ repository.web  = http://sanger-pathogens.github.io/Roary/
 repository.url  = https://github.com/sanger-pathogens/Roary.git
 repository.type = git
 
-[@Basic]
+[@Starter]
 [PruneCruft]
 [ExtraTests]
 [AutoPrereqs]
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 41aecc2..253ed7b 100755
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -15,16 +15,16 @@ BEDTOOLS_VERSION="2.26.0"
 BEDTOOLS_DOWNLOAD_FILENAME="bedtools-${BEDTOOLS_VERSION}.tar.gz"
 BEDTOOLS_URL="https://github.com/arq5x/bedtools2/releases/download/v${BEDTOOLS_VERSION}/${BEDTOOLS_DOWNLOAD_FILENAME}"
 
-CDHIT_SHORT_VERSION="4.6.6"
-CDHIT_LONG_VERSION="4.6.6-2016-0711"
+CDHIT_SHORT_VERSION="4.6.8"
+CDHIT_LONG_VERSION="4.6.8-2017-0621"
 CDHIT_DOWNLOAD_FILENAME="cd-hit-${CDHIT_SHORT_VERSION}.tar.gz"
-CDHIT_URL="https://github.com/weizhongli/cdhit/releases/download/V${CDHIT_SHORT_VERSION}/cd-hit-v${CDHIT_LONG_VERSION}.tar.gz"
+CDHIT_URL="https://github.com/weizhongli/cdhit/releases/download/V${CDHIT_SHORT_VERSION}/cd-hit-v${CDHIT_LONG_VERSION}-source.tar.gz"
 
 PRANK_VERSION="0.140603"
 PRANK_DOWNLOAD_FILENAME="prank-msa-master.tar.gz"
 PRANK_URL="https://github.com/ariloytynoja/prank-msa/archive/master.tar.gz"
 
-BLAST_VERSION="2.4.0"
+BLAST_VERSION="2.6.0"
 BLAST_DOWNLOAD_FILENAME="ncbi-blast-${BLAST_VERSION}+-x64-linux.tar.gz"
 BLAST_URL="ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/${BLAST_VERSION}/${BLAST_DOWNLOAD_FILENAME}"
 
@@ -32,11 +32,11 @@ MCL_VERSION="14-137"
 MCL_DOWNLOAD_FILENAME="mcl-${MCL_VERSION}.tar.gz"
 MCL_URL="http://micans.org/mcl/src/mcl-${MCL_VERSION}.tar.gz"
 
-FASTTREE_VERSION="2.1.9"
+FASTTREE_VERSION="2.1.10"
 FASTTREE_DOWNLOAD_FILENAME="FastTree-${FASTTREE_VERSION}.c"
 FASTTREE_URL="http://microbesonline.org/fasttree/FastTree-${FASTTREE_VERSION}.c"
 
-MAFFT_VERSION="7.271"
+MAFFT_VERSION="7.310"
 MAFFT_DOWNLOAD_FILENAME="mafft-${MAFFT_VERSION}-without-extensions-src.tgz"
 MAFFT_URL="http://mafft.cbrc.jp/alignment/software/${MAFFT_DOWNLOAD_FILENAME}"
 
diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm
index 3052b6c..e2335a0 100644
--- a/lib/Bio/Roary/CommandLine/Roary.pm
+++ b/lib/Bio/Roary/CommandLine/Roary.pm
@@ -154,7 +154,7 @@ sub BUILD {
         $self->perc_identity($perc_identity);
         if ( $perc_identity < 50 ) {
             $self->logger->error(
-"The percentage identity is too low. Either somethings wrong with your data, like contamination, or your doing something that the software isnt designed to support."
+"The percentage identity is too low. Either something is wrong with your data, like contamination, or your doing something that the software isnt designed to support."
             );
         }
     }
diff --git a/lib/Bio/Roary/External/CheckTools.pm b/lib/Bio/Roary/External/CheckTools.pm
index b05593a..dc01fd5 100644
--- a/lib/Bio/Roary/External/CheckTools.pm
+++ b/lib/Bio/Roary/External/CheckTools.pm
@@ -57,17 +57,17 @@ my %tools = (
     },
     'mafft' => {
         GETVER => "mafft --version < /dev/null 2>&1",
-        REGEXP => qr/v($BIDEC) /,
+        REGEXP => qr/(\d+\.\d+) /,
         NEEDED => 1,
     },
     'kraken' => {
         GETVER => "kraken --version | head -n 1",
-        REGEXP => qr/Kraken version kraken-(\d+\.\d+\.\d+.*)/,
+        REGEXP => qr/(\d+\.\d+\.\d+.*)/,
         NEEDED => 0,
     },
     'kraken-report' => {
         GETVER => "kraken-report --version | head -n 1",
-        REGEXP => qr/Kraken version kraken-(\d+\.\d+\.\d+.*)/,
+        REGEXP => qr/(\d+\.\d+\.\d+.*)/,
         NEEDED => 0,
     },
 	'Rscript'  => {
@@ -78,7 +78,7 @@ my %tools = (
     },
 
     # prank version also performs an update check so cant use it
-    'prank' => { NEEDED => 1 },
+    'prank' => { NEEDED => 0 },
 
     # now just the standard unix tools we need
     'grep' => { NEEDED => 1 },
diff --git a/lib/Bio/Roary/MergeMultifastaAlignments.pm b/lib/Bio/Roary/MergeMultifastaAlignments.pm
index f74d4ac..1355cf5 100644
--- a/lib/Bio/Roary/MergeMultifastaAlignments.pm
+++ b/lib/Bio/Roary/MergeMultifastaAlignments.pm
@@ -83,7 +83,7 @@ sub _sequence_for_sample_from_gene_file {
 sub _padded_string_for_gene_file {
     my ( $self, $gene_file ) = @_;
     return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
-    return 'N' x ( $self->_gene_lengths->{$gene_file} );
+    return '-' x ( $self->_gene_lengths->{$gene_file} );
 }
 
 sub _create_merged_sequence_for_sample {
diff --git a/lib/Bio/Roary/PrepareInputFiles.pm b/lib/Bio/Roary/PrepareInputFiles.pm
index bedb9bb..aaf2a19 100644
--- a/lib/Bio/Roary/PrepareInputFiles.pm
+++ b/lib/Bio/Roary/PrepareInputFiles.pm
@@ -18,32 +18,39 @@ use Moose;
 use Bio::Roary::Exceptions;
 use Bio::Roary::ExtractProteomeFromGFFs;
 use Bio::Roary::FilterUnknownsFromFasta;
-use Cwd qw(getcwd); 
+use Cwd qw(getcwd);
 use File::Temp;
-
-has 'input_files'      => ( is => 'ro', isa => 'ArrayRef',        required => 1 );
-has 'job_runner'       => ( is => 'ro', isa => 'Str',             default  => 'Local' );
-has 'cpus'             => ( is => 'ro', isa => 'Int',      default => 1 );
-has '_input_gff_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy     => 1, builder => '_build__input_gff_files' );
-has '_input_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files' );
-has '_input_fasta_files_filtered' =>
-  ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files_filtered' );
+use Log::Log4perl qw(:easy);
+
+has 'input_files'        => ( is => 'ro', isa => 'ArrayRef',        required => 1 );
+has 'job_runner'         => ( is => 'ro', isa => 'Str',             default  => 'Local' );
+has 'cpus'               => ( is => 'ro', isa => 'Int',             default  => 1 );
+has '_input_gff_files'   => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy     => 1, builder => '_build__input_gff_files' );
+has '_input_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy     => 1, builder => '_build__input_fasta_files' );
+has '_input_fasta_files_filtered' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files_filtered' );
 has '_input_fasta_files_filtered_obj' =>
-    ( is => 'ro', isa => 'Bio::Roary::FilterUnknownsFromFasta', lazy => 1, builder => '_build__input_fasta_files_filtered_obj' );
+  ( is => 'ro', isa => 'Bio::Roary::FilterUnknownsFromFasta', lazy => 1, builder => '_build__input_fasta_files_filtered_obj' );
 
-has '_derived_fasta_files' =>
-  ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__derived_fasta_files' );
+has '_derived_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__derived_fasta_files' );
 has '_extract_proteome_obj' => (
     is      => 'ro',
     isa     => 'Bio::Roary::ExtractProteomeFromGFFs',
     lazy    => 1,
     builder => '_build__extract_proteome_obj'
 );
-has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
-has 'translation_table'     => ( is => 'rw', isa => 'Int',  default => 11 );
-has 'verbose'               => ( is => 'rw', isa => 'Bool', default => 0 );
-has '_fasta_filter_obj'     =>  ( is => 'ro', isa => 'Bio::Roary::FilterUnknowsFromFasta', lazy => 1, builder => '_fasta_filter_obj' );
-has 'working_directory'    => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
+has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool',                               default => 1 );
+has 'translation_table'     => ( is => 'rw', isa => 'Int',                                default => 11 );
+has 'verbose'               => ( is => 'rw', isa => 'Bool',                               default => 0 );
+has '_fasta_filter_obj'     => ( is => 'ro', isa => 'Bio::Roary::FilterUnknowsFromFasta', lazy    => 1, builder => '_fasta_filter_obj' );
+has 'working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
+has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
+
+sub _build_logger {
+    my ($self) = @_;
+    Log::Log4perl->easy_init($ERROR);
+    my $logger = get_logger();
+    return $logger;
+}
 
 sub _build__input_gff_files {
     my ($self) = @_;
@@ -54,6 +61,33 @@ sub _build__input_gff_files {
 sub _build__input_fasta_files {
     my ($self) = @_;
     my @fasta_files = grep( !/\.gff$/, @{ $self->input_files } );
+
+    my @validated_fasta_files;
+
+    for my $fasta_file (@fasta_files) {
+        eval {
+            my $inseq = Bio::SeqIO->new(
+                -file     => $fasta_file,
+                -format   => 'fasta',
+                -alphabet => 'protein'
+            );
+            while ( my $seq = $inseq->next_seq ) {
+
+                # do something to force the reading.
+                $seq->seq;
+            }
+        };
+        if ($@) {
+            $self->logger->warn(
+                "Input file doesnt have a .gff extension and isnt a protein FASTA file so excluding it from further analysis: $fasta_file"
+            );
+        }
+        else {
+            push( @validated_fasta_files, $fasta_file );
+        }
+
+    }
+
     return \@fasta_files;
 }
 
@@ -62,11 +96,10 @@ sub _build__input_fasta_files_filtered_obj {
     return Bio::Roary::FilterUnknownsFromFasta->new( fasta_files => $self->_input_fasta_files );
 }
 
-sub _build__input_fasta_files_filtered
-{
-  my ($self) = @_;
-  return undef if ( !defined( $self->_input_fasta_files ) );
-  return $self->_input_fasta_files_filtered_obj->filtered_fasta_files();
+sub _build__input_fasta_files_filtered {
+    my ($self) = @_;
+    return undef if ( !defined( $self->_input_fasta_files ) );
+    return $self->_input_fasta_files_filtered_obj->filtered_fasta_files();
 }
 
 sub _build__extract_proteome_obj {
@@ -77,8 +110,8 @@ sub _build__extract_proteome_obj {
         apply_unknowns_filter => $self->apply_unknowns_filter,
         translation_table     => $self->translation_table,
         cpus                  => $self->cpus,
-		verbose               => $self->verbose,
-        working_directory    => $self->working_directory,
+        verbose               => $self->verbose,
+        working_directory     => $self->working_directory,
     );
 }
 
diff --git a/lib/Bio/Roary/ReformatInputGFFs.pm b/lib/Bio/Roary/ReformatInputGFFs.pm
index 0ae52ce..4d1dd03 100644
--- a/lib/Bio/Roary/ReformatInputGFFs.pm
+++ b/lib/Bio/Roary/ReformatInputGFFs.pm
@@ -18,10 +18,12 @@ Take in gff files and add suffix where a gene id is seen twice
 use Moose;
 use Bio::Roary::Exceptions;
 use Cwd;
+use File::Copy;
 use Log::Log4perl qw(:easy);
 use Bio::Tools::GFF;
 use File::Path qw(make_path);
 use File::Basename;
+use Digest::MD5::File qw(file_md5_hex);
 
 has 'gff_files'        => ( is => 'ro', isa  => 'ArrayRef', required => 1 );
 has 'logger'           => ( is => 'ro', lazy => 1,          builder  => '_build_logger' );
@@ -42,47 +44,93 @@ sub fix_duplicate_gene_ids {
     my ($self) = @_;
 
     my %gene_ids_seen_before;
+	
+	my %file_md5s;
+	
     for my $file ( @{ $self->gff_files } ) {
-
+        my $digest = file_md5_hex($file);
+		
+		if(defined($file_md5s{$digest}))
+		{
+            $self->logger->warn(
+                "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
+            );
+			next;
+		}
+		else
+		{
+			$file_md5s{$digest} = $file;
+		}
+		
         my $ids_seen      = 0;
         my $ids_from_file = $self->_get_ids_for_gff_file($file);
 
         if ( @{$ids_from_file} < 1 ) {
-            $self->logger->warn(
+            $self->logger->error(
                 "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
             );
         }
         else {
             for my $gene_id ( @{$ids_from_file} ) {
                 if ( $gene_ids_seen_before{$gene_id} ) {
-                    $self->logger->warn(
-  "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix.  New GFF in the fixed_input_files directory.  $file "
+                    $self->logger->error(
+  "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
                     );
-                    my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file);
+                    my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
                     push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
                     $ids_seen = 1;
                     last;
                 }
                 $gene_ids_seen_before{$gene_id}++;
             }
+			
+			# We know its a valid GFF file since we could open it and extract IDs. 
+			# We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but 
+			# if they didnt, then we have to double check and repair if necessary.			
             if ( $ids_seen == 0 ) {
-                push( @{ $self->fixed_gff_files }, $file );
+				
+				
+                push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
             }
         }
     }
     return 1;
 }
 
+sub _fix_gff_file_extension
+{
+	my ( $self, $input_file ) = @_;
+	
+	my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
+	return $input_file if($suffix eq '.gff');
+	
+	
+    make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
+    my $output_file = $self->output_directory . '/' . $filename . '.gff';
+	copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
+	return $output_file;
+}
+
+
 sub _add_suffix_to_gene_ids_and_return_new_file {
-    my ( $self, $input_file ) = @_;
+    my ( $self, $input_file, $digest ) = @_;
     my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
     make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
-    my $output_file = $self->output_directory . '/' . $filename . $suffix;
+    my $output_file = $self->output_directory . '/' . $filename . '.gff';
 
     open( my $input_gff_fh, $input_file );
     open( my $out_gff_fh, '>', $output_file );
+ 
+    # There is a chance that there can be a collision here, but its remote.
+	my $random_locus_tag = "".$digest;
+	
+    $self->logger->warn(
+        "Renamed GFF file from: $input_file -> $output_file" );
+    $self->logger->warn(
+        "Locus tag used is '$random_locus_tag' for file: $input_file" );
 
     my $found_fasta = 0;
+	my $gene_counter = 1;
     while (<$input_gff_fh>) {
         my $line = $_;
 
@@ -102,15 +150,15 @@ sub _add_suffix_to_gene_ids_and_return_new_file {
             if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
                 my $current_id = $2;
                 $current_id .= '___' . $self->suffix_counter;
-                $tags[$i] = $1 . $current_id . $3;
-                $self->suffix_counter( $self->suffix_counter + 1 );
+                $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
+				$gene_counter++;
                 $found_id++;
                 last;
             }
         }
         if ( $found_id == 0 ) {
-            unshift( @tags, 'ID=id___' . $self->suffix_counter );
-            $self->suffix_counter( $self->suffix_counter + 1 );
+            unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
+			$gene_counter++;
         }
         $cells[8] = join( ';', @tags );
         print {$out_gff_fh} join( "\t", @cells );
diff --git a/t/Bio/Roary/ReformatInputGFFs.t b/t/Bio/Roary/ReformatInputGFFs.t
index 16ad53b..dd68fc1 100755
--- a/t/Bio/Roary/ReformatInputGFFs.t
+++ b/t/Bio/Roary/ReformatInputGFFs.t
@@ -45,21 +45,19 @@ ok(( -e 'fixed_input_files/query_2.gff'), 'fixed file should exist');
 compare_ok('fixed_input_files/query_2.gff', 't/data/reformat_input_gffs/expected_fixed_query_2.gff',  'fixed file should have expected changes');
 remove_tree('fixed_input_files');
 
-
-ok($obj = Bio::Roary::ReformatInputGFFs->new(gff_files => ['t/data/reformat_input_gffs/query_1.gff', 't/data/reformat_input_gffs/query_2.gff', 't/data/reformat_input_gffs/query_3.gff']), 'initialise with 3 input gffs');
+ok($obj = Bio::Roary::ReformatInputGFFs->new(gff_files => ['t/data/reformat_input_gffs/query_1.gff', 't/data/reformat_input_gffs/query_2.gff', 't/data/reformat_input_gffs/query_3.gff']), 'initialise with 3 input gffs, 2 identical duplicates');
 ok(!( -d 'fixed_input_files'), 'Directory shouldnt exist before running');
 ok($obj->fix_duplicate_gene_ids, 'fix duplicates with 3 input gffs');
-ok(( -d 'fixed_input_files'), 'Directory should exist because there is 2 gffs thats fixed');
-is_deeply($obj->fixed_gff_files, ['t/data/reformat_input_gffs/query_1.gff','fixed_input_files/query_2.gff','fixed_input_files/query_3.gff' ] ,'list of gff files 2 in the fixed directory');
+ok(( -d 'fixed_input_files'), 'Directory should exist because there are 2 gffs thats fixed');
+is_deeply($obj->fixed_gff_files, ['t/data/reformat_input_gffs/query_1.gff','fixed_input_files/query_2.gff' ] ,'list of gff files 2 in the fixed directory');
 ok(( -e 'fixed_input_files/query_2.gff'), 'fixed file should exist');
-ok(( -e 'fixed_input_files/query_3.gff'), 'fixed file should exist');
+ok(!( -e 'fixed_input_files/query_3.gff'), 'fixed file should exist');
 compare_ok('fixed_input_files/query_2.gff','t/data/reformat_input_gffs/expected_fixed_query_2.gff',  'fixed file should have expected changes');
-compare_ok('fixed_input_files/query_3.gff', 't/data/reformat_input_gffs/expected_fixed_query_3.gff',  'fixed file should have expected changes');
 remove_tree('fixed_input_files');
 	
 
 ok($obj = Bio::Roary::ReformatInputGFFs->new(gff_files => ['t/data/reformat_input_gffs/real_1.gff']), 'initialise with 1 gff that has shown to have a bug');
-ok(my $fixed_file = $obj->_add_suffix_to_gene_ids_and_return_new_file('t/data/reformat_input_gffs/real_1.gff'), 'fix duplicates');
+ok(my $fixed_file = $obj->_add_suffix_to_gene_ids_and_return_new_file('t/data/reformat_input_gffs/real_1.gff', 'id__'), 'fix duplicates');
 ok(( -e 'fixed_input_files/real_1.gff'), 'fixed file should exist');
 compare_ok('fixed_input_files/real_1.gff', 't/data/reformat_input_gffs/expected_real_1.gff',  'fixed file should have expected changes');
 remove_tree('fixed_input_files');
diff --git a/t/data/expected_core_gene_alignment_core0.66.aln b/t/data/expected_core_gene_alignment_core0.66.aln
index e542299..9309965 100644
--- a/t/data/expected_core_gene_alignment_core0.66.aln
+++ b/t/data/expected_core_gene_alignment_core0.66.aln
@@ -3,4 +3,4 @@ TTTTT
 >query_2
 GGGGG
 >query_3
-NNNNN
+-----
diff --git a/t/data/overall_gene_presence_absence.csv b/t/data/overall_gene_presence_absence.csv
index d828265..fd16e85 100644
--- a/t/data/overall_gene_presence_absence.csv
+++ b/t/data/overall_gene_presence_absence.csv
@@ -1,22 +1,22 @@
 "Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","query_1","query_2","query_5"
-"hly","","Alpha-toxin","2","2","1","1","1","1","7","","959","959","959","1_1","2_1___1",""
-"group_10","","hypothetical protein","2","2","1","1","10","1","6","","227","227","227","abc_00010","abc_00010___10",""
-"group_11","","C4-dicarboxylate transporter/malic acid transport protein","2","2","1","1","11","1","5","","947","947","947","abc_00011","abc_00011___11",""
-"group_12","","hypothetical protein","2","2","1","1","12","1","4","","188","188","188","abc_00012","abc_00012___12",""
-"group_13","","Gonococcal growth inhibitor III","2","2","1","1","13","1","3","","134","134","134","abc_00014","abc_00014___14",""
-"group_14","","Gonococcal growth inhibitor III","2","2","1","1","14","1","2","","134","134","134","1_6","2_7___15",""
-"yfnB","","Putative HAD-hydrolase yfnB","2","2","1","1","15","1","1","","686","686","686","abc_00016","abc_00016___16",""
-"group_2","","hypothetical protein","2","2","1","1","2","1","8","","146","146","146","abc_00002","abc_00002___2",""
-"group_3","","hypothetical protein","2","2","1","1","3","1","9","","197","197","197","abc_00003","abc_00003___3",""
-"group_4","","superantigen-like protein","2","2","1","1","4","1","10","","716","716","716","abc_00004","abc_00004___4",""
-"speH","","hypothetical protein","2","2","1","1","5","1","11","","725","725","725","1_2","2_2___5",""
-"group_6","","superantigen-like protein","2","2","1","1","6","1","12","","725","725","725","abc_00006","abc_00006___6",""
-"argF","","Ornithine carbamoyltransferase","2","2","1","1","7","1","13","","1001","1001","1001","1_3","2_3___7",""
-"arcC1","","Carbamate kinase 1","2","2","1","1","8","1","14","","935","935","935","abc_00008","abc_00008___8",""
-"group_9","","16S ribosomal RNA","2","2","1","1","9","1","15","","1556","1556","1556","abc_01705","abc_01705___9",""
-"group_16","","hypothetical protein","1","1","1","2","1","2","6","","146","146","146","","","abc_50002"
-"group_17","argF","Ornithine carbamoyltransferase","1","1","1","2","6","2","5","","1001","1001","1001","","","3_3"
-"group_18","","hypothetical protein","1","1","1","2","5","2","4","","227","227","227","","","abc_50010"
-"group_19","","hypothetical protein","1","1","1","2","4","2","3","","188","188","188","","","abc_50012"
-"group_20","","Gonococcal growth inhibitor III","1","1","1","2","3","2","2","","134","134","134","","","abc_50014"
-"group_21","yfnB","Putative HAD-hydrolase yfnB","1","1","1","2","2","2","1","","686","686","686","","","3_5"
+"hly","","Alpha-toxin","2","2","1","1","14","1","1","","959","959","959","1_1","05a85fcc1cbac7027ac3689992006154_1",""
+"group_10","","hypothetical protein","2","2","1","1","6","1","11","","227","227","227","abc_00010","05a85fcc1cbac7027ac3689992006154_10",""
+"group_11","","C4-dicarboxylate transporter/malic acid transport protein","2","2","1","1","5","1","10","","947","947","947","abc_00011","05a85fcc1cbac7027ac3689992006154_11",""
+"group_12","","hypothetical protein","2","2","1","1","4","1","9","","188","188","188","abc_00012","05a85fcc1cbac7027ac3689992006154_12",""
+"group_13","","Gonococcal growth inhibitor III","2","2","1","1","3","1","8","","134","134","134","abc_00014","05a85fcc1cbac7027ac3689992006154_14",""
+"group_14","","Gonococcal growth inhibitor III","2","2","1","1","2","1","7","","134","134","134","1_6","05a85fcc1cbac7027ac3689992006154_15",""
+"yfnB","","Putative HAD-hydrolase yfnB","2","2","1","1","1","1","6","","686","686","686","abc_00016","05a85fcc1cbac7027ac3689992006154_16",""
+"group_2","","hypothetical protein","2","2","1","1","15","1","2","","146","146","146","abc_00002","05a85fcc1cbac7027ac3689992006154_2",""
+"group_3","","hypothetical protein","2","2","1","1","13","1","3","","197","197","197","abc_00003","05a85fcc1cbac7027ac3689992006154_3",""
+"group_4","","superantigen-like protein","2","2","1","1","12","1","4","","716","716","716","abc_00004","05a85fcc1cbac7027ac3689992006154_4",""
+"speH","","hypothetical protein","2","2","1","1","11","1","5","","725","725","725","1_2","05a85fcc1cbac7027ac3689992006154_5",""
+"group_6","","superantigen-like protein","2","2","1","1","10","1","15","","725","725","725","abc_00006","05a85fcc1cbac7027ac3689992006154_6",""
+"argF","","Ornithine carbamoyltransferase","2","2","1","1","9","1","14","","1001","1001","1001","1_3","05a85fcc1cbac7027ac3689992006154_7",""
+"arcC1","","Carbamate kinase 1","2","2","1","1","8","1","13","","935","935","935","abc_00008","05a85fcc1cbac7027ac3689992006154_8",""
+"group_9","","16S ribosomal RNA","2","2","1","1","7","1","12","","1556","1556","1556","abc_01705","05a85fcc1cbac7027ac3689992006154_9",""
+"group_16","","hypothetical protein","1","1","1","2","6","2","3","","146","146","146","","","abc_50002"
+"group_17","argF","Ornithine carbamoyltransferase","1","1","1","2","5","2","4","","1001","1001","1001","","","3_3"
+"group_18","","hypothetical protein","1","1","1","2","4","2","5","","227","227","227","","","abc_50010"
+"group_19","","hypothetical protein","1","1","1","2","3","2","6","","188","188","188","","","abc_50012"
+"group_20","","Gonococcal growth inhibitor III","1","1","1","2","2","2","2","","134","134","134","","","abc_50014"
+"group_21","yfnB","Putative HAD-hydrolase yfnB","1","1","1","2","1","2","1","","686","686","686","","","3_5"
diff --git a/t/data/reformat_input_gffs/expected_fixed_query_2.gff b/t/data/reformat_input_gffs/expected_fixed_query_2.gff
index 1cabef6..a425bb4 100644
--- a/t/data/reformat_input_gffs/expected_fixed_query_2.gff
+++ b/t/data/reformat_input_gffs/expected_fixed_query_2.gff
@@ -1,10 +1,10 @@
 ##gff-version 3
 ##sequence-region abc|SC|contig000001 1 15000
-abc|SC|contig000001	Prodigal:2.60	CDS	172	1131	.	-	0	ID=1_1___1;gene=different;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:Q2G1X0,protein motif:TIGRFAMs:TIGR01002,protein motif:Pfam:PF07968.6;locus_tag=abc_00001;product=Alpha-toxin,beta-channel forming cytolysin,Leukocidin/Hemolysin toxin family protein;protein_id=gnl|SC|abc_00001
-abc|SC|contig000001	Prodigal:2.60	CDS	1804	1950	.	+	0	ID=abc_00002___2;inference=ab initio prediction:Prodigal:2.60;locus_tag=abc_00002;product=hypothetical protein;protein_id=gnl|SC|abc_00002
-abc|SC|contig000001	Prodigal:2.60	CDS	1934	2131	.	+	0	inference=ab initio prediction:Prodigal:2.60;locus_tag=abc_00003;ID=abc_00003___3;product=hypothetical protein;protein_id=gnl|SC|abc_00003
-abc|SC|contig000001	Prodigal:2.60	CDS	2621	3337	.	-	0	ID=abc_00004___4;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK13350,protein motif:Pfam:PF02876.11;locus_tag=abc_00004;product=superantigen-like protein,Staphylococcal/Streptococcal toxin%2C beta-grasp domain;protein_id=gnl|SC|abc_00004
-abc|SC|contig000001	Prodigal:2.60	CDS	3445	4170	.	-	0	gene=speH;ID=1_2___5;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P0C0I6,protein motif:CLUSTERS:PRK13349,protein motif:Pfam:PF02876.11;locus_tag=abc_00005;note=SPE H;product=hypothetical protein,superantigen-like protein,Staphylococcal/Streptococcal toxin%2C beta-grasp domain;protein_id=gnl|SC|abc_00005
+abc|SC|contig000001	Prodigal:2.60	CDS	172	1131	.	-	0	ID=5d3897f59edf296200f1c7de895509e1_1;gene=different;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:Q2G1X0,protein motif:TIGRFAMs:TIGR01002,protein motif:Pfam:PF07968.6;locus_tag=abc_00001;product=Alpha-toxin,beta-channel forming cytolysin,Leukocidin/Hemolysin toxin family protein;protein_id=gnl|SC|abc_00001
+abc|SC|contig000001	Prodigal:2.60	CDS	1804	1950	.	+	0	ID=5d3897f59edf296200f1c7de895509e1_2;inference=ab initio prediction:Prodigal:2.60;locus_tag=abc_00002;product=hypothetical protein;protein_id=gnl|SC|abc_00002
+abc|SC|contig000001	Prodigal:2.60	CDS	1934	2131	.	+	0	inference=ab initio prediction:Prodigal:2.60;locus_tag=abc_00003;ID=5d3897f59edf296200f1c7de895509e1_3;product=hypothetical protein;protein_id=gnl|SC|abc_00003
+abc|SC|contig000001	Prodigal:2.60	CDS	2621	3337	.	-	0	ID=5d3897f59edf296200f1c7de895509e1_4;inference=ab initio prediction:Prodigal:2.60,protein motif:CLUSTERS:PRK13350,protein motif:Pfam:PF02876.11;locus_tag=abc_00004;product=superantigen-like protein,Staphylococcal/Streptococcal toxin%2C beta-grasp domain;protein_id=gnl|SC|abc_00004
+abc|SC|contig000001	Prodigal:2.60	CDS	3445	4170	.	-	0	gene=speH;ID=5d3897f59edf296200f1c7de895509e1_5;inference=ab initio prediction:Prodigal:2.60,similar to AA sequence:UniProtKB:P0C0I6,protein motif:CLUSTERS:PRK13349,protein motif:Pfam:PF02876.11;locus_tag=abc_00005;note=SPE H;product=hypothetical protein,superantigen-like protein,Staphylococcal/Streptococcal toxin%2C beta-grasp domain;protein_id=gnl|SC|abc_00005
 ##FASTA
 >abc|SC|contig000001
 ACTGGCCGCCTAATAATAAAAACTCTAAAAGTTGTAATTTAAAATAGTTCTTTAAATTAT

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/roary.git



More information about the debian-med-commit mailing list