[med-svn] [gubbins] 01/03: New upstream version 2.2.0

Thu Nov 3 14:29:02 UTC 2016

This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository gubbins.

commit 99e8d84dcd3105985748d95c1ae5b5f8abbd8dbd
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date:   Thu Nov 3 15:25:11 2016 +0100

    New upstream version 2.2.0
---
 CHANGELOG                                          |  4 ++++
 Dockerfile                                         | 19 +++++++++++++++++++
 README.md                                          | 10 +++++++++-
 VERSION                                            |  2 +-
 python/gubbins/PreProcessFasta.py                  |  9 +++++++--
 python/gubbins/common.py                           |  2 +-
 python/gubbins/tests/test_external_dependancies.py |  1 +
 python/gubbins/tests/test_pre_process_fasta.py     | 22 ++++++++++++++++++----
 python/scripts/run_gubbins.py                      |  1 +
 9 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 223b398..91401a5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+v2.2.0 - 31 Oct 2016
+------
+By default don't filter out sequences which are 100% identical.
+
 v2.1.0 - 22 July 2016
 ------
 Use GTRCAT model by default in RAxML instead of GTRGAMMA (massive speedup).
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5b322c7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+#
+#  From this base-image / starting-point
+#
+FROM debian:testing
+
+#
+#  Authorship
+#
+MAINTAINER ap13 at sanger.ac.uk
+
+#
+# Pull in packages from testing
+#
+RUN apt-get update -qq
+
+#
+# Install Gubbins
+#
+RUN apt-get install gubbins
diff --git a/README.md b/README.md
index c9ade1c..77095e1 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,10 @@ Print debugging messages. Default is off.
     --no_cleanup, -n
     
 Do not remove files from intermediate iterations. This option will also keep other files created by RAxML and fasttree, which would otherwise be deleted. Default is to only keep files from the final iteration.
+
+    --raxml_model, -r
+
+Change the model used by RAxML. The default is GTRCAT (with -V). You can set it to GTRGAMMA.
     
 Output files    
 ==========
@@ -125,8 +129,12 @@ Data from the paper
 ===================
 * ftp://ftp.sanger.ac.uk/pub/project/pathogens/gubbins/PMEN1.aln.gz
 * ftp://ftp.sanger.ac.uk/pub/project/pathogens/gubbins/ST239.aln.gz
-* 
 
 Midpoint rerooting
 ==================
 From version 1.3.5 (25/6/15) to version 1.4.6 (29/2/16) trees were not midpoint rerooted by default. This doesnt have any effect on the recombination detection, but the output trees may not look as expected. Users are advised to upgrade to the latest version.
+
+Ancestral sequence reconstruction
+==================
+From version 2.0.0 onwards, RAxML is used to reconstruct ancestral sequences instead of fastML. RAxML doesn't always produce results as you would expect, so the results can be lower quality than fastML. If you would like to stick with fastML for ancestral sequence reconstruction, please checkout and install v1.4.9.
+
diff --git a/VERSION b/VERSION
index 7ec1d6d..ccbccc3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.0
+2.2.0
diff --git a/python/gubbins/PreProcessFasta.py b/python/gubbins/PreProcessFasta.py
index 1dbbc85..5b5f98b 100644
--- a/python/gubbins/PreProcessFasta.py
+++ b/python/gubbins/PreProcessFasta.py
@@ -71,8 +71,13 @@ class PreProcessFasta(object):
      
      return taxa_to_remove
 
-  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename):
-      taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
+  def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
+	  
+      taxa_to_remove = []
+      if remove_identical_sequences < 1:	  
+          taxa_to_remove = self.taxa_missing_too_much_data()
+      else:
+          taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()
       
       with open(self.input_filename) as input_handle:
           with open(output_filename, "w+") as output_handle:
diff --git a/python/gubbins/common.py b/python/gubbins/common.py
index b1f2e4e..c89fd7f 100644
--- a/python/gubbins/common.py
+++ b/python/gubbins/common.py
@@ -151,7 +151,7 @@ class GubbinsCommon():
     temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
     
     pre_process_fasta = PreProcessFasta(self.args.alignment_filename,self.args.verbose,self.args.filter_percentage)
-    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename)
+    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(temp_working_dir+"/"+starting_base_filename, self.args.remove_identical_sequences)
     
     self.args.alignment_filename = temp_working_dir+"/"+starting_base_filename
 
diff --git a/python/gubbins/tests/test_external_dependancies.py b/python/gubbins/tests/test_external_dependancies.py
old mode 100644
new mode 100755
index 6066171..6711d3b
--- a/python/gubbins/tests/test_external_dependancies.py
+++ b/python/gubbins/tests/test_external_dependancies.py
@@ -213,6 +213,7 @@ class TestExternalDependancies(unittest.TestCase):
       parser.add_argument('--converge_method',  '-z', help='Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]',  default = 'weighted_robinson_foulds')
       parser.add_argument('--version',                action='version', version=str(pkg_resources.get_distribution("gubbins").version))
       parser.add_argument('--raxml_model',      '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT',  default = 'GTRCAT')
+      parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
       return parser
       
   def default_arg_parse(self):
diff --git a/python/gubbins/tests/test_pre_process_fasta.py b/python/gubbins/tests/test_pre_process_fasta.py
index 10e5954..07d5b8a 100644
--- a/python/gubbins/tests/test_pre_process_fasta.py
+++ b/python/gubbins/tests/test_pre_process_fasta.py
@@ -28,7 +28,7 @@ class TestPreProcessFasta(unittest.TestCase):
         
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),[])
 
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/no_duplicates.aln'))
 
   def test_input_file_with_one_duplicate_sequences(self):   
@@ -40,7 +40,7 @@ class TestPreProcessFasta(unittest.TestCase):
         
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1'])
       
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_one_duplicate.aln'))
  
   def test_input_file_with_multiple_duplicate_sequences(self):   
@@ -51,8 +51,22 @@ class TestPreProcessFasta(unittest.TestCase):
         
       self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
       
-      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
       self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/expected_multiple_duplicates.aln'))
+      
+      
+  def test_dont_filter_input_file_with_multiple_duplicate_sequences(self):   
+      preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/multiple_duplicates.aln')
+      self.assertEqual(preprocessfasta.hash_sequences(), 
+       {b"\x840\x89L\xfe\xb5J6%\xf1\x8f\xe2O\xce'.": ['sample1', 'sample3'],
+        b'\x9c\xe6\x8b\xf7\xae\xe2\x1f\xf5j\xcfu\xf4\xfdO\x8b\xec': ['sample2', 'sample4']})
+        
+      self.assertEqual(preprocessfasta.taxa_of_duplicate_sequences(),['sample1','sample2'])
+      
+      preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',0)
+      self.assertTrue(filecmp.cmp('output.aln', 'gubbins/tests/data/preprocessfasta/multiple_duplicates.aln'))    
+      
+      
  
   def test_input_file_with_all_duplicate_sequences(self):   
       preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/all_same_sequence.aln')
@@ -68,7 +82,7 @@ class TestPreProcessFasta(unittest.TestCase):
                                                       
   def test_filter_out_alignments_with_too_much_missing_data(self):
     preprocessfasta = PreProcessFasta('gubbins/tests/data/preprocessfasta/missing_data.aln', False, 5)
-    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln')
+    preprocessfasta.remove_duplicate_sequences_and_sequences_missing_too_much_data('output.aln',1)
     self.assertTrue(filecmp.cmp('output.aln','gubbins/tests/data/preprocessfasta/expected_missing_data.aln'))        
       
   def tearDown(self):
diff --git a/python/scripts/run_gubbins.py b/python/scripts/run_gubbins.py
index 35fc07c..809f244 100755
--- a/python/scripts/run_gubbins.py
+++ b/python/scripts/run_gubbins.py
@@ -46,6 +46,7 @@ parser.add_argument('--version',                action='version', version=str(pk
 parser.add_argument('--min_window_size',  '-a', help='Minimum window size, default 100', type=int,  default = 100)
 parser.add_argument('--max_window_size',  '-b', help='Maximum window size, default 10000', type=int,  default = 10000)
 parser.add_argument('--raxml_model',      '-r', help='RAxML model [GTRGAMMA|GTRCAT], default GTRCAT',  default = 'GTRCAT')
+parser.add_argument('--remove_identical_sequences', '-d', action='count', help='Remove identical sequences', default = 0)
 
 gubbins_runner  = common.GubbinsCommon(parser.parse_args())
 gubbins_runner.parse_and_run()

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/gubbins.git