[med-svn] [spades] 01/06: Imported Upstream version 3.8.2+dfsg

Sascha Steinbiss satta at debian.org
Mon Jul 11 10:02:05 UTC 2016


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository spades.

commit cba6d9c7b86edd43a3d71a191f02cc3ee98c45b1
Author: Sascha Steinbiss <satta at debian.org>
Date:   Mon Jul 11 08:36:20 2016 +0000

    Imported Upstream version 3.8.2+dfsg
---
 VERSION                                            |   2 +-
 changelog.html                                     |   4 +
 configs/debruijn/path_extend/pe_params.info        |   8 +-
 ext/src/ssw/ssw.c                                  |   3 +-
 manual.html                                        |  40 ++++----
 metaspades.py                                      |   4 +-
 plasmidspades.py                                   |   4 +-
 spades.py                                          |   4 +-
 spades_init.py                                     |  29 ++++--
 src/cmake/pack.cmake                               |   4 +-
 .../algorithms/path_extend/extension_chooser.hpp   | 108 +++++++++++++--------
 .../algorithms/path_extend/loop_traverser.hpp      |  15 ++-
 .../algorithms/path_extend/overlap_analysis.hpp    |   8 +-
 .../algorithms/path_extend/path_extend_launch.hpp  |   2 +-
 src/modules/algorithms/path_extend/pe_io.hpp       |  28 +++++-
 .../simplification/complex_tip_clipper.hpp         |  11 ++-
 src/modules/paired_info/pair_info_improver.hpp     |  15 +--
 src/projects/hammer/hammer_tools.cpp               |   1 +
 src/projects/spades/distance_estimation.cpp        |   4 +-
 src/projects/spades/main.cpp                       |   3 +-
 src/spades_pipeline/spades_logic.py                |  24 +++--
 src/spades_pipeline/support.py                     |   3 +
 22 files changed, 207 insertions(+), 117 deletions(-)

diff --git a/VERSION b/VERSION
index 1693986..0418bab 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1,2 @@
-3.8.1
+3.8.2
 
diff --git a/changelog.html b/changelog.html
index a11ed47..e83ce50 100644
--- a/changelog.html
+++ b/changelog.html
@@ -3,6 +3,10 @@
 
 <h2>SPAdes Genome Assembler changelog</h2>
 
+<h3>SPAdes 3.8.2, 10 July 2016</h3>
+
+<p> FIX: Several minor bug-fixes for metaSPAdes and SPAdes pipelines.</p>
+
 <h3>SPAdes 3.8.1, 8 June 2016</h3>
 
 <p>FIX: plasmidSPAdes now works with PacBio/Nanopore reads.</p>
diff --git a/configs/debruijn/path_extend/pe_params.info b/configs/debruijn/path_extend/pe_params.info
index 13e04ad..86f1cd6 100644
--- a/configs/debruijn/path_extend/pe_params.info
+++ b/configs/debruijn/path_extend/pe_params.info
@@ -2,7 +2,7 @@ default_pe {
 
 ; output options
 
-debug_output    false
+debug_output    true
 
 output {
     write_overlaped_paths   true
@@ -20,7 +20,7 @@ output_broken_scaffolds     break_gaps
 params {
     multi_path_extend   false
     ; old | 2015 | combined | old_pe_2015
-    scaffolding_mode old
+    scaffolding_mode old_pe_2015
 
     remove_overlaps     true
     cut_all_overlaps  false
@@ -93,8 +93,8 @@ params {
     }
 
     scaffold_graph {
-        construct    false
-        output       false
+        construct    true
+        output       true
         always_add   40         ; connection with read count >= always_add are always added to the graph
         never_add     5         ; connection with read count < never_add are never added to the graph
         relative_threshold 0.25 ; connection with read count >= max_read_count * relative_threshod are added to the graph if satisfy condition above, max_read_count is calculated amond all alternatives
diff --git a/ext/src/ssw/ssw.c b/ext/src/ssw/ssw.c
index a77f39a..39682f2 100755
--- a/ext/src/ssw/ssw.c
+++ b/ext/src/ssw/ssw.c
@@ -470,7 +470,8 @@ static alignment_end* sw_sse2_word (const int8_t* ref,
 			for (j = 0; LIKELY(j < segLen); ++j) {
 				vH = _mm_load_si128(pvHStore + j);
 				vH = _mm_max_epi16(vH, vF);
-				_mm_store_si128(pvHStore + j, vH);
+        vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+        _mm_store_si128(pvHStore + j, vH);
 				vH = _mm_subs_epu16(vH, vGapO);
 				vF = _mm_subs_epu16(vF, vGapE);
 				if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
diff --git a/manual.html b/manual.html
index 9e7e799..e6cbad1 100644
--- a/manual.html
+++ b/manual.html
@@ -1,6 +1,6 @@
 <html>
 <head>
-    <title>SPAdes 3.8.1 Manual</title>
+    <title>SPAdes 3.8.2 Manual</title>
     <style type="text/css">
         .code {
             background-color: lightgray;
@@ -8,7 +8,7 @@
     </style>
 </head>
 <body>
-<h1>SPAdes 3.8.1 Manual</h1>
+<h1>SPAdes 3.8.2 Manual</h1>
 
 1. <a href="#sec1">About SPAdes</a><br>
     1.1. <a href="#sec1.1">Supported data types</a><br>
@@ -35,18 +35,18 @@
 <h2>1. About SPAdes</h2>
 <p>
     SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. This manual will help you to install and run SPAdes. 
-SPAdes version 3.8.1 was released under GPLv2 on June 8, 2016 and can be downloaded from  <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
+SPAdes version 3.8.2 was released under GPLv2 on July 10, 2016 and can be downloaded from  <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
 
 <a name="sec1.1"></a>
 <h3>1.1 Supported data types</h3>
 <p>
     The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.
 <p>
-    Version 3.8.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+    Version 3.8.2 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
 <p>
-    SPAdes 3.8.1 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
+    SPAdes 3.8.2 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
 <p>
-    Also, SPAdes 3.8.1 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
+    Also, SPAdes 3.8.2 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
 <p>
     Additionally, SPAdes has a separate modules for assembling highly polymorphic diploid genomes and for TruSeq barcode assembly. For more information see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a> and <a href="truspades_manual.html" target="_blank">truSPAdes manual</a> .
 
@@ -143,7 +143,7 @@ SPAdes comes in several separate modules:
         <li> Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory. </li>
         <li> Each module removes its temporary files as soon as it finishes. </li>
         <li> SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM. </li>
-        <li> Performance statistics is given for SPAdes version 3.8.1. </li>
+        <li> Performance statistics is given for SPAdes version 3.8.2. </li>
     </ul>
 
 
@@ -157,13 +157,13 @@ SPAdes comes in several separate modules:
 <h3>2.1 Downloading SPAdes Linux binaries</h3>
 
 <p>
-    To download <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
+    To download <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
 
 <pre  class="code">
 <code>
-    wget http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Linux.tar.gz
-    tar -xzf SPAdes-3.8.1-Linux.tar.gz
-    cd SPAdes-3.8.1-Linux/bin/
+    wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz
+    tar -xzf SPAdes-3.8.2-Linux.tar.gz
+    cd SPAdes-3.8.2-Linux/bin/
 </code>
 </pre>
 
@@ -189,13 +189,13 @@ SPAdes comes in several separate modules:
 <h3>2.2 Downloading SPAdes binaries for Mac</h3>
 
 <p>
-    To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
+    To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
 
 <pre  class="code">
 <code>
-    curl http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Darwin.tar.gz -o SPAdes-3.8.1-Darwin.tar.gz
-    tar -zxf SPAdes-3.8.1-Darwin.tar.gz
-    cd SPAdes-3.8.1-Darwin/bin/
+    curl http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz -o SPAdes-3.8.2-Darwin.tar.gz
+    tar -zxf SPAdes-3.8.2-Darwin.tar.gz
+    cd SPAdes-3.8.2-Darwin/bin/
 </code>
 </pre>
 
@@ -230,13 +230,13 @@ SPAdes comes in several separate modules:
     </ul>
 
 <p>
-    If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1.tar.gz">SPAdes source code</a>: 
+    If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz">SPAdes source code</a>: 
 
 <pre class="code">
 <code>
-    wget http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1.tar.gz
-    tar -xzf SPAdes-3.8.1.tar.gz
-    cd SPAdes-3.8.1
+    wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz
+    tar -xzf SPAdes-3.8.2.tar.gz
+    cd SPAdes-3.8.2
 </code>
 </pre>
 
@@ -340,7 +340,7 @@ Thank you for using SPAdes!
     SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.
 
 <p>
-    To run SPAdes 3.8.1 you need at least one library of the following types:
+    To run SPAdes 3.8.2 you need at least one library of the following types:
     <ul>
         <li>Illumina paired-end/high-quality mate-pairs/unpaired reads</li>
         <li>IonTorrent paired-end/high-quality mate-pairs/unpaired reads</li>
diff --git a/metaspades.py b/metaspades.py
index d06205f..9f5034b 100755
--- a/metaspades.py
+++ b/metaspades.py
@@ -756,7 +756,7 @@ def main(args):
                     dataset_file.close()
                 spades_cfg.__dict__["dataset"] = dataset_filename
 
-                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+                used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
 
                 if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                     shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
                     if k_str.find(":") != -1:
                         k_str = k_str[:k_str.find(":")]
                     support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
-                log.info("\n===== %s finished. \n" % STAGE_NAME)
+                log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
             if not options_storage.run_completed:
                 if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                     support.finish_here(log)
diff --git a/plasmidspades.py b/plasmidspades.py
index d06205f..9f5034b 100755
--- a/plasmidspades.py
+++ b/plasmidspades.py
@@ -756,7 +756,7 @@ def main(args):
                     dataset_file.close()
                 spades_cfg.__dict__["dataset"] = dataset_filename
 
-                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+                used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
 
                 if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                     shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
                     if k_str.find(":") != -1:
                         k_str = k_str[:k_str.find(":")]
                     support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
-                log.info("\n===== %s finished. \n" % STAGE_NAME)
+                log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
             if not options_storage.run_completed:
                 if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                     support.finish_here(log)
diff --git a/spades.py b/spades.py
index d06205f..9f5034b 100755
--- a/spades.py
+++ b/spades.py
@@ -756,7 +756,7 @@ def main(args):
                     dataset_file.close()
                 spades_cfg.__dict__["dataset"] = dataset_filename
 
-                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+                used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
 
                 if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                     shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
                     if k_str.find(":") != -1:
                         k_str = k_str[:k_str.find(":")]
                     support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
-                log.info("\n===== %s finished. \n" % STAGE_NAME)
+                log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
             if not options_storage.run_completed:
                 if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                     support.finish_here(log)
diff --git a/spades_init.py b/spades_init.py
old mode 100644
new mode 100755
index ce55e1a..4baebdd
--- a/spades_init.py
+++ b/spades_init.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 ############################################################################
 # Copyright (c) 2015 Saint Petersburg State University
 # Copyright (c) 2011-2014 Saint Petersburg Academic University
@@ -7,16 +9,18 @@
 
 import os
 import sys
+from os.path import abspath, dirname, realpath, join, isfile
 
 source_dirs = ["", "truspades", "common"]
 
 # developers configuration
-spades_home = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
-bin_home = os.path.join(spades_home, 'bin')
-python_modules_home = os.path.join(spades_home, 'src')
-ext_python_modules_home = os.path.join(spades_home, 'ext', 'src', 'python_libs')
+spades_home = abspath(dirname(realpath(__file__)))
+bin_home = join(spades_home, 'bin')
+python_modules_home = join(spades_home, 'src')
+ext_python_modules_home = join(spades_home, 'ext', 'src', 'python_libs')
 spades_version = ''
 
+
 def init():
     global spades_home
     global bin_home
@@ -25,14 +29,19 @@ def init():
     global ext_python_modules_home
 
     # users configuration (spades_init.py and spades binary are in the same directory)
-    if os.path.isfile(os.path.join(spades_home, 'spades')):
-        install_prefix = os.path.dirname(spades_home)
-        bin_home = os.path.join(install_prefix, 'bin')
-        spades_home = os.path.join(install_prefix, 'share', 'spades')
+    if isfile(os.path.join(spades_home, 'spades')):
+        install_prefix = dirname(spades_home)
+        bin_home = join(install_prefix, 'bin')
+        spades_home = join(install_prefix, 'share', 'spades')
         python_modules_home = spades_home
         ext_python_modules_home = spades_home
 
     for dir in source_dirs:
-        sys.path.append(os.path.join(python_modules_home, 'spades_pipeline', dir))
+        sys.path.append(join(python_modules_home, 'spades_pipeline', dir))
+
+    spades_version = open(join(spades_home, 'VERSION'), 'r').readline().strip()
+
 
-    spades_version = open(os.path.join(spades_home, 'VERSION'), 'r').readline().strip()
+if __name__ == '__main__':
+    spades_py_path = join(dirname(realpath(__file__)), 'spades.py')
+    sys.stderr.write('Please use ' + spades_py_path + ' for running SPAdes genome assembler\n')
\ No newline at end of file
diff --git a/src/cmake/pack.cmake b/src/cmake/pack.cmake
index 560c2cb..6b72b71 100644
--- a/src/cmake/pack.cmake
+++ b/src/cmake/pack.cmake
@@ -12,10 +12,10 @@ set(CPACK_PACKAGE_NAME "SPAdes")
 set(CPACK_PACKAGE_VENDOR "Saint Petersburg Academic University")
 set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README")
 set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE")
-set(CPACK_PACKAGE_VERSION "3.8.1")
+set(CPACK_PACKAGE_VERSION "3.8.2")
 set(CPACK_PACKAGE_VERSION_MAJOR "3")
 set(CPACK_PACKAGE_VERSION_MINOR "8")
-set(CPACK_PACKAGE_VERSION_PATCH "1")
+set(CPACK_PACKAGE_VERSION_PATCH "2")
 set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction)
 
 # Source stuff
diff --git a/src/modules/algorithms/path_extend/extension_chooser.hpp b/src/modules/algorithms/path_extend/extension_chooser.hpp
index 13f197c..b00f944 100644
--- a/src/modules/algorithms/path_extend/extension_chooser.hpp
+++ b/src/modules/algorithms/path_extend/extension_chooser.hpp
@@ -1401,8 +1401,8 @@ private:
             std::queue<VertexId>& can_be_processed, double path_coverage) const {
         DEBUG("Updating can be processed");
         for (EdgeId e : g_.OutgoingEdges(v)) {
-            VertexId neighbour_v = this->g_.EdgeEnd(e);
-            if (g_.length(e) < max_edge_length_in_repeat_ && GoodExtension(e, path_coverage)) {
+            VertexId neighbour_v = g_.EdgeEnd(e);
+            if (g_.length(e) <= max_edge_length_in_repeat_ && CompatibleEdge(e, path_coverage)) {
                 DEBUG("Adding vertex " << neighbour_v.int_id()
                                 << "through edge " << g_.str(e));
                 can_be_processed.push(neighbour_v);
@@ -1443,61 +1443,91 @@ private:
         return result;
     }
 
-    bool GoodExtension(EdgeId e, double path_coverage) const {
+    bool CompatibleEdge(EdgeId e, double path_coverage) const {
         return math::ge(g_.coverage(e), path_coverage * delta_);
     }
 
-    EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
-        set<EdgeId> good_extensions;
-        for(auto edge : edges) {
+    //returns lowest coverage among long compatible edges ahead of e
+    //if std::numeric_limits<double>::max() -- no such edges were detected
+    //if negative -- abort at once
+    double AnalyzeExtension(EdgeId ext, double path_coverage) const {
+        double answer = std::numeric_limits<double>::max();
 
-            if(!GoodExtension(edge.e_, path_coverage)) {
-                continue;
-            }
+        if (!CompatibleEdge(ext, path_coverage)) {
+            DEBUG("Extension coverage too low");
+            return answer;
+        }
 
-            if (g_.length(edge.e_) > max_edge_length_in_repeat_) {
-                if (GoodExtension(edge.e_, path_coverage)) {
-                    good_extensions.insert(edge.e_);
-                }
-                continue;
-            }
+        if (g_.length(ext) > max_edge_length_in_repeat_) {
+            DEBUG("Long extension");
+            return g_.coverage(ext);
+        } 
 
-            GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(edge.e_), path_coverage);
-            if (gc.v_size() == 0) {
-                return EdgeContainer();
-            }
+        DEBUG("Short extension, launching repeat component analysis");
+        GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(ext), path_coverage);
+        if (gc.v_size() == 0) {
+            DEBUG("Component search failed");
+            return -1.;
+        }
 
-            for (auto e : gc.edges()) {
-                if (g_.length(e) > max_edge_length_in_repeat_) {
-                    DEBUG("Repeat component contains long edges");
-                    return EdgeContainer();
-                }
+        for (auto e : gc.edges()) {
+            if (g_.length(e) > max_edge_length_in_repeat_) {
+                DEBUG("Repeat component contains long edges");
+                return -1.;
             }
+        }
 
-            for (auto v : gc.sinks()) {
-                for (auto e : g_.OutgoingEdges(v)) {
-                    if (GoodExtension(e, path_coverage)) {
-                        good_extensions.insert(edge.e_);
-                    }
+        DEBUG("Checking long sinks");
+        for (auto v : gc.sinks()) {
+            for (auto e : g_.OutgoingEdges(v)) {
+                if (g_.length(e) > max_edge_length_in_repeat_ && 
+                        CompatibleEdge(e, path_coverage) &&
+                        math::ls(g_.coverage(e), answer)) {
+                    DEBUG("Updating answer to coverage of edge " << g_.str(e));
+                    answer = g_.coverage(e);
                 }
             }
         }
 
-        DEBUG("Number of good extensions is " << good_extensions.size());
+        return answer;
+    }
 
-        if (good_extensions.size() != 1) {
-            DEBUG("Returning");
-            return EdgeContainer();
+    EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
+        static EdgeContainer EMPTY_CONTAINER;
+
+        map<EdgeId, double> good_extension_to_ahead_cov;
+
+        for (auto edge : edges) {
+            DEBUG("Processing candidate extension " << g_.str(edge.e_));
+            double analysis_res = AnalyzeExtension(edge.e_, path_coverage);
+
+            if (analysis_res == std::numeric_limits<double>::max()) {
+                DEBUG("Ignoring extension");
+            } else if (math::ls(analysis_res, 0.)) {
+                DEBUG("Troubles detected, abort mission");
+                return EMPTY_CONTAINER;
+            } else {
+                good_extension_to_ahead_cov[edge.e_] = analysis_res;
+                DEBUG("Extension mapped to ahead coverage of " << analysis_res);
+            }
         }
-        auto extension = *good_extensions.begin();
 
-        if(math::ls(path_coverage, g_.coverage(extension) * delta_)) {
-            DEBUG("Extension coverage too high");
-            return EdgeContainer();
+        DEBUG("Number of good extensions is " << good_extension_to_ahead_cov.size());
+
+        if (good_extension_to_ahead_cov.size() == 1) {
+            auto extension_info = *good_extension_to_ahead_cov.begin();
+            DEBUG("Single extension candidate " << g_.str(extension_info.first));
+            if (math::le(extension_info.second, path_coverage / delta_)) {
+                DEBUG("Extending");
+                return FinalFilter(edges, extension_info.first);
+            } else {
+                DEBUG("Predicted ahead coverage is too high");
+            }
+        } else {
+            DEBUG("Multiple extension candidates");
         }
 
-        DEBUG("Filtering... Extend with edge " << extension.int_id());
-        return FinalFilter(edges, extension);
+        return EMPTY_CONTAINER;
     }
     
     CoverageAwareIdealInfoProvider provider_;
diff --git a/src/modules/algorithms/path_extend/loop_traverser.hpp b/src/modules/algorithms/path_extend/loop_traverser.hpp
index 048615f..57eda57 100644
--- a/src/modules/algorithms/path_extend/loop_traverser.hpp
+++ b/src/modules/algorithms/path_extend/loop_traverser.hpp
@@ -26,6 +26,7 @@ class LoopTraverser {
     const Graph& g_;
     GraphCoverageMap& covMap_;
     shared_ptr<ContigsMaker> extender_;
+    static const size_t MAX_EDGE_LENGTH = 1000;
 private:
     EdgeId FindStart(const set<VertexId>& component_set) const{
         EdgeId result;
@@ -164,7 +165,6 @@ private:
                     for (size_t i = 0; i < shortest_path.size(); ++i) {
                         nLen += g_.length(shortest_path[i]);
                     }
-                    nLen += g_.k();
                 }
             }
         }
@@ -182,6 +182,15 @@ private:
         endPath->Clear();
     }
 
+    bool ContainsLongEdges(const GraphComponent<Graph>& component) const {
+        for(auto e : component.edges()) {
+            if(g_.length(e) > MAX_EDGE_LENGTH) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 public:
     LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, shared_ptr<ContigsMaker> extender) :
             g_(g), covMap_(coverageMap), extender_(extender) {
@@ -189,11 +198,13 @@ public:
 
     void TraverseAllLoops() {
         DEBUG("TraverseAllLoops");
-        shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, 1000);
+        shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, MAX_EDGE_LENGTH);
         while (splitter->HasNext()) {
             GraphComponent<Graph> component = splitter->Next();
             if (component.v_size() > 10)
                 continue;
+            if(ContainsLongEdges(component))
+                continue;
             set<VertexId> component_set(component.v_begin(), component.v_end());
             EdgeId start = FindStart(component_set);
             EdgeId finish = FindFinish(component_set);
diff --git a/src/modules/algorithms/path_extend/overlap_analysis.hpp b/src/modules/algorithms/path_extend/overlap_analysis.hpp
index 279dc4a..d773b57 100644
--- a/src/modules/algorithms/path_extend/overlap_analysis.hpp
+++ b/src/modules/algorithms/path_extend/overlap_analysis.hpp
@@ -85,10 +85,10 @@ class SWOverlapAnalyzer {
 public:
     SWOverlapAnalyzer(size_t flank_length)
             : flank_length_(flank_length),
-              aligner_(/*match_score*/2,
-              /*mismatch_penalty*/6,
-                       /*gap_opening_penalty*/8,
-                       /*gap_extending_penalty*/8) {
+              aligner_(/*match_score*/1,
+                       /*mismatch_penalty*/3,
+                       /*gap_opening_penalty*/4,
+                       /*gap_extending_penalty*/3) {
     }
 
 
diff --git a/src/modules/algorithms/path_extend/path_extend_launch.hpp b/src/modules/algorithms/path_extend/path_extend_launch.hpp
index 7acdeff..5b11bc7 100644
--- a/src/modules/algorithms/path_extend/path_extend_launch.hpp
+++ b/src/modules/algorithms/path_extend/path_extend_launch.hpp
@@ -351,7 +351,7 @@ inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(const conj_gr
     VERIFY(cfg::get().mode == config::pipeline_type::meta);
     VERIFY(!lib->IsMp());
     shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g, lib, /*read_length*/cfg::get().ds.RL(),
-        /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 300);
+        /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 0);
     return make_shared<SimpleExtensionChooser>(gp.g, wc,
                                                pset.extension_options.weight_threshold,
                                                pset.extension_options.priority_coeff);
diff --git a/src/modules/algorithms/path_extend/pe_io.hpp b/src/modules/algorithms/path_extend/pe_io.hpp
index 60c22f1..4aa9ffd 100644
--- a/src/modules/algorithms/path_extend/pe_io.hpp
+++ b/src/modules/algorithms/path_extend/pe_io.hpp
@@ -43,20 +43,37 @@ protected:
             ss << constructor_.construct(path[0]).first.substr(0, k_);
         }
 
-        for (size_t i = 0; i < path.Size(); ++i) {
+
+        size_t i = 0;
+        while (i < path.Size()) {
             int gap = i == 0 ? 0 : path.GapAt(i);
             if (gap > (int) k_) {
                 for (size_t j = 0; j < gap - k_; ++j) {
                     ss << "N";
                 }
                 ss << constructor_.construct(path[i]).first;
-            } else {
+            }
+            else {
                 int overlapLen = (int) k_ - gap;
                 if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
-                    if(overlapLen > (int) g_.length(path[i]) + (int) k_) {
-                        WARN("Such scaffolding logic leads to local misassemblies");
+                    overlapLen -= (int) g_.length(path[i]) + (int) k_;
+                    ++i;
+                    //skipping overlapping edges
+                    while (i < path.Size() && overlapLen >= (int) g_.length(path[i]) + path.GapAt(i)) {
+                        overlapLen -= (int) g_.length(path[i]) + path.GapAt(i);
+                        ++i;
+                    }
+                    if (i == path.Size()) {
+                        break;
+                    }
+
+                    overlapLen = overlapLen + (int) k_ - path.GapAt(i);
+                    if(overlapLen < 0) {
+                        for (size_t j = 0; j < abs(overlapLen); ++j) {
+                            ss << "N";
+                        }
+                        overlapLen = 0;
                     }
-                    continue;
                 }
                 auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
                 if(i != path.Size() - 1) {
@@ -69,6 +86,7 @@ protected:
                 }
                 ss << temp_str;
             }
+            ++i;
         }
         return ss.str();
     }
diff --git a/src/modules/algorithms/simplification/complex_tip_clipper.hpp b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
index a5fd240..984cfd5 100644
--- a/src/modules/algorithms/simplification/complex_tip_clipper.hpp
+++ b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
@@ -107,7 +107,12 @@ public:
             DEBUG("Processing vertex " << g_.str(*it));
 
             DominatedSetFinder<Graph> dom_finder(g_, *it, max_path_length_ * 2);
-            dom_finder.FillDominated();
+
+            if(!dom_finder.FillDominated()) {
+                DEBUG("Tip contains too long paths");
+                continue;
+            }
+
             auto component = dom_finder.AsGraphComponent();
 
             if(!CheckEdgeLenghts(component)) {
@@ -142,8 +147,8 @@ public:
             RemoveComplexTip(component);
         }
         CompressAllVertices(g_);
-        INFO("Complex tip clipper finished");
-        INFO("Tips processed " << cnt);
+        DEBUG("Complex tip clipper finished");
+        DEBUG("Tips processed " << cnt);
         return something_done_flag;
     }
 private:
diff --git a/src/modules/paired_info/pair_info_improver.hpp b/src/modules/paired_info/pair_info_improver.hpp
index 89c8945..f1b392a 100644
--- a/src/modules/paired_info/pair_info_improver.hpp
+++ b/src/modules/paired_info/pair_info_improver.hpp
@@ -84,8 +84,8 @@ class PairInfoImprover {
   public:
     PairInfoImprover(const Graph& g,
                      Index& clustered_index,
-                     const io::SequencingLibrary<config::DataSetData> &lib)
-            : graph_(g), index_(clustered_index), lib_(lib) { }
+                     const io::SequencingLibrary<config::DataSetData> &lib, size_t max_repeat_length)
+            : graph_(g), index_(clustered_index), lib_(lib), max_repeat_length_(max_repeat_length) { }
 
     void ImprovePairedInfo(unsigned num_threads = 1) {
         CorrectPairedInfo(num_threads);
@@ -107,13 +107,13 @@ class PairInfoImprover {
       public:
         ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
                                const Graph &g,
-                               omnigraph::de::PairedInfoIndexT<Graph>& index)
-                : to_remove_(to_remove), graph_(g), index_(index) {}
+                               omnigraph::de::PairedInfoIndexT<Graph>& index, size_t max_repeat_length)
+                : to_remove_(to_remove), graph_(g), index_(index), max_repeat_length_(max_repeat_length) {}
 
         bool operator()(EdgeId e) {
             omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
 
-            if (graph_.length(e)>= cfg::get().max_repeat_length && index_.contains(e))
+            if (graph_.length(e)>= max_repeat_length_ && index_.contains(e))
                 FindInconsistent(e, to_remove);
 
             return false;
@@ -176,6 +176,7 @@ class PairInfoImprover {
         omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
         const Graph &graph_;
         Index& index_;
+        size_t max_repeat_length_;
     };
 
     size_t RemoveContradictional(unsigned nthreads) {
@@ -184,7 +185,7 @@ class PairInfoImprover {
         omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
 
         // FIXME: Replace with lambda
-        ContradictionalRemover remover(to_remove, graph_, index_);
+        ContradictionalRemover remover(to_remove, graph_, index_, max_repeat_length_);
         ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
 
         DEBUG("ParallelRemoveContraditional: Threads finished");
@@ -272,7 +273,7 @@ class PairInfoImprover {
     const Graph& graph_;
     Index& index_;
     const io::SequencingLibrary<config::DataSetData>& lib_;
-
+    size_t max_repeat_length_;
     DECL_LOGGER("PairInfoImprover")
 };
 
diff --git a/src/projects/hammer/hammer_tools.cpp b/src/projects/hammer/hammer_tools.cpp
index 7298097..1fa2461 100644
--- a/src/projects/hammer/hammer_tools.cpp
+++ b/src/projects/hammer/hammer_tools.cpp
@@ -185,6 +185,7 @@ void CorrectPairedReadFiles(const KMerData &data,
     INFO("Written batch " << buffer_no);
     ++buffer_no;
   }
+  VERIFY_MSG(irsl.eof() && irsr.eof(), "Pair of read files " + fnamel + " and " + fnamer + " contain unequal amount of reads");
 }
 
 std::string getLargestPrefix(const std::string &str1, const std::string &str2) {
diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp
index 4444f90..481a457 100644
--- a/src/projects/spades/distance_estimation.cpp
+++ b/src/projects/spades/distance_estimation.cpp
@@ -176,7 +176,9 @@ void estimate_distance(conj_graph_pack& gp,
     INFO("The refining of clustered pair information has been finished ");    // if so, it resolves such conflicts.
 
     INFO("Improving paired information");
-    PairInfoImprover<Graph> improver(gp.g, clustered_index, lib);
+    PairInfoImprover<Graph> improver(gp.g, clustered_index, lib,
+                                     config.mode == debruijn_graph::config::pipeline_type::meta  ? std::numeric_limits<size_t>::max() : config.max_repeat_length);
+
     improver.ImprovePairedInfo((unsigned) config.max_threads);
 
     if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) {
diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp
index 0f28162..5b66301 100644
--- a/src/projects/spades/main.cpp
+++ b/src/projects/spades/main.cpp
@@ -72,7 +72,7 @@ int main(int argc, char **argv) {
 
         for (const auto& cfg_fn : cfg_fns)
             INFO("Loading config from " << cfg_fn);
-        
+
         VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
         VERIFY(cfg::get().K % 2 != 0);
 
@@ -85,6 +85,7 @@ int main(int argc, char **argv) {
                      SPADES_GIT_REFSPEC
                      ", git revision "
                      SPADES_GIT_SHA1);
+        INFO("Maximum k-mer length: " << runtime_k::MAX_K);
         INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
 
         spades::assemble_genome();
diff --git a/src/spades_pipeline/spades_logic.py b/src/spades_pipeline/spades_logic.py
index e9237cc..c04f5e9 100644
--- a/src/spades_pipeline/spades_logic.py
+++ b/src/spades_pipeline/spades_logic.py
@@ -88,19 +88,19 @@ def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False):
     if options_storage.auto_K_allowed():
         if RL >= 250:
             if not silent:
-                support.warning("Default k-mer sizes were set to %s because estimated "
-                                "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL), log)
+                log.info("Default k-mer sizes were set to %s because estimated "
+                         "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL))
             return options_storage.K_MERS_250
         if RL >= 150:
             if not silent:
-                support.warning("Default k-mer sizes were set to %s because estimated "
-                                "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
+                log.info("Default k-mer sizes were set to %s because estimated "
+                         "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
             return options_storage.K_MERS_150
     if RL <= max(cur_k_mers):
         new_k_mers = [k for k in cur_k_mers if k < RL]
         if not silent:
-            support.warning("K-mer sizes were set to %s because estimated "
-                            "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
+            log.info("K-mer sizes were set to %s because estimated "
+                     "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
         return new_k_mers
     return cur_k_mers
 
@@ -201,6 +201,7 @@ def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, K):
     #todo
     process_cfg.substitute_params(filename, subst_dict, log)
 
+
 def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K):
     data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K)
     saves_dir = os.path.join(data_dir, 'saves')
@@ -232,6 +233,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
     if not isinstance(cfg.iterative_K, list):
         cfg.iterative_K = [cfg.iterative_K]
     cfg.iterative_K = sorted(cfg.iterative_K)
+    used_K = []
 
     # checking and removing conflicting K-mer directories
     if options_storage.restart_from:
@@ -249,7 +251,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
             k_to_delete = []
             for id, k in enumerate(needed_K):
                 if len(processed_K) == id:
-                    if processed_K[-1] == original_K[-1]: # the last K in the original run was processed in "last_one" mode
+                    if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                         k_to_delete = [original_K[-1]]
                     break
                 if processed_K[id] != k:
@@ -272,8 +274,10 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
     K = cfg.iterative_K[0]
     if len(cfg.iterative_K) == 1:
         run_iteration(configs_dir, execution_home, cfg, log, K, None, True)
+        used_K.append(K)
     else:
         run_iteration(configs_dir, execution_home, cfg, log, K, None, False)
+        used_K.append(K)
         if options_storage.stop_after == "k%d" % K:
             finished_on_stop_after = True
         else:
@@ -290,6 +294,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
                                         "Rerunning for the first value of K (%d) with Repeat Resolving" %
                                         (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                     run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
+                    used_K.append(cfg.iterative_K[0])
                     K = cfg.iterative_K[0]
             else:
                 rest_of_iterative_K = cfg.iterative_K
@@ -299,6 +304,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
                     count += 1
                     last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                     run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
+                    used_K.append(K)
                     prev_K = K
                     if last_one:
                         break
@@ -354,8 +360,6 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
             if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
                 shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
 
-
-
     if cfg.developer_mode:
         # saves
         saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
@@ -368,4 +372,4 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
     if os.path.isdir(cfg.tmp_dir):
         shutil.rmtree(cfg.tmp_dir)
 
-    return latest
+    return used_K
diff --git a/src/spades_pipeline/support.py b/src/spades_pipeline/support.py
index 77b8fa0..da7e65a 100644
--- a/src/spades_pipeline/support.py
+++ b/src/spades_pipeline/support.py
@@ -104,6 +104,7 @@ def recreate_dir(dirname):
         shutil.rmtree(dirname)
     os.makedirs(dirname)
 
+
 def check_files_duplication(filenames, log):
     for filename in filenames:
         if filenames.count(filename) != 1:
@@ -847,6 +848,8 @@ def read_fasta(filename, gzipped=False):
         file_handler = open(filename)
     for line in file_handler:
         line = process_readline(line, gzipped and sys.version.startswith('3.'))
+        if not line:
+            continue
         if line[0] == '>':
             res_name.append(line.strip())
             if not first:

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/spades.git



More information about the debian-med-commit mailing list