[med-svn] [Git][med-team/ssake][master] 11 commits: Adapt watch file to new tarball versioning

Sun Feb 18 15:50:04 UTC 2018

Andreas Tille pushed to branch master at Debian Med / ssake


Commits:
29c19132 by Andreas Tille at 2018-02-18T16:26:41+01:00
Adapt watch file to new tarball versioning

- - - - -
83012ff7 by Andreas Tille at 2018-02-18T16:27:02+01:00
New upstream version 4.0
- - - - -
ee1d2a66 by Andreas Tille at 2018-02-18T16:27:13+01:00
Update upstream source from tag 'upstream/4.0'

Update to upstream version '4.0'
with Debian dir cda641a463a96c45c14f209691008f4f971d9073
- - - - -
99574d39 by Andreas Tille at 2018-02-18T16:29:58+01:00
New upstream version

- - - - -
1a77fbdf by Andreas Tille at 2018-02-18T16:30:26+01:00
cme fix dpkg-control

- - - - -
18025e3e by Andreas Tille at 2018-02-18T16:30:42+01:00
debhelper 11

- - - - -
a9147550 by Andreas Tille at 2018-02-18T16:31:33+01:00
Update copyright

- - - - -
1f59bfb9 by Andreas Tille at 2018-02-18T16:37:34+01:00
README is now delivered in md format

- - - - -
a0abce15 by Andreas Tille at 2018-02-18T16:40:08+01:00
spelling

- - - - -
40e8f0a0 by Andreas Tille at 2018-02-18T16:45:19+01:00
Remove potentially privacy breaching logo from readme

- - - - -
7494ba05 by Andreas Tille at 2018-02-18T16:49:02+01:00
Upload to unstable

- - - - -


25 changed files:

- SSAKE
- − SSAKE-readme.pdf
- debian/changelog
- debian/compat
- debian/control
- debian/copyright
- debian/patches/do_not_send_mail_after_testing.patch
- + debian/patches/no_privacy_breach_logo.patch
- debian/patches/series
- debian/rules
- debian/ssake.docs
- debian/watch
- SSAKE-readme.txt → readme.md
- + test/CelegansLinkedReadsAssembly.sh
- test/Herpesvirus_3.60kb.reads.fa
- test/Herpesvirus_3.60kb.seed.fa
- test/HiSeqFusobacteriumAssembly.sh
- test/MiSeqCampylobacterAssembly.sh
- test/MiSeqCampylobacterAssemblyPIPELINE.sh
- test/MiSeqEbolaAssemblyPIPELINE.sh
- test/MiSeqEcoliAssembly250XPE300.sh
- test/README.txt
- + test/assembly_metrics_tests.txt
- + test/runSSAKE.sh
- test/runSSAKE.sh → tools/runSSAKE.sh


Changes:

=====================================
SSAKE
=====================================

--- a/SSAKE
+++ b/SSAKE
@@ -2,13 +2,14 @@
 
 
 #AUTHOR
-#   Rene Warren (c) 2006-2017
+#   Rene Warren (c) 2006-2018
 #   Short Sequence Assembly by K-mer search and 3' Extension (SSAKE)
 #   rwarren at bcgsc.ca
 
 #NAME
-#   SSAKE Rene Warren, April 2017
-#   v3.8.5+ Implements targeted de novo assembly. Bug fix (only affected targeted de novo assembly -s mode)
+#   SSAKE Rene Warren, Jan 2018
+#   v4.0+ Initial support for linked reads (reads co-located to genomic locus via barcode or other means)
+#   v3.8.5+ Bug fix (only affected targeted de novo assembly -s mode)
 #   v3.8.4+ Improvements to the TASR modules, recruiting pairs when reads have a k-mer match
 #   v3.8.3+ Included option to ignore reads making up the consensus base extension (-y)
 #   v3.8.3+ Included tie-breaker option (-q) when determining consensus from equal-coverage bases 
@@ -37,7 +38,7 @@
 #   If you use SSAKE, the SSAKE code or ideas, please cite our work
 
 #LICENSE
-#   SSAKE Copyright (c) 2006-2017 Canada's Michael Smith Genome Science Centre.  All rights reserved.
+#   SSAKE Copyright (c) 2006-2018 Canada's Michael Smith Genome Science Centre.  All rights reserved.
 
 #   This program is free software; you can redistribute it and/or
 #   modify it under the terms of the GNU General Public License
@@ -55,29 +56,32 @@
 use strict;
 use Getopt::Std;
 use Net::SMTP;
-use vars qw($opt_f $opt_m $opt_o $opt_v $opt_r $opt_p $opt_d $opt_k $opt_a $opt_z $opt_e $opt_g $opt_s $opt_t $opt_b $opt_c $opt_x $opt_n $opt_h $opt_i $opt_w $opt_j $opt_q $opt_y $opt_u);
-#l
-getopts('f:m:o:v:r:p:d:k:a:z:e:g:s:t:b:c:x:n:h:u:w:j:q:y:i:');
-my ($targetwordlen,$base_overlap,$min_overlap,$verbose,$MIN_READ_LENGTH,$SEQ_SLIDE,$min_base_ratio,$paired,$min_links,$max_link_ratio,$contig_size_cutoff,$insert_stdev,$unpaired_file,$seed_file,$max_trim,$base_name,$tracked,$forcetrack,$max_count_trim,$min_tig_overlap,$npad_gaps,$ignorehead,$space_restriction,$min_depth_of_coverage,$tie_breaker,$ignore_read,$independent)=(15,2,20,0,21,1,0.7,0,4,0.50,100,0.75,"no-g","no-targetSeed",0,"",0,0,10,20,0,0,0,0,0,0,1);
+use vars qw($opt_f $opt_m $opt_o $opt_v $opt_r $opt_p $opt_d $opt_l $opt_a $opt_z $opt_e $opt_g $opt_s $opt_t $opt_b $opt_c $opt_x $opt_n $opt_h $opt_i $opt_w $opt_j $opt_q $opt_y $opt_u);
+#k unused letters
+getopts('f:m:o:v:r:p:d:l:a:z:e:g:s:t:b:c:x:n:h:u:w:j:q:y:i:');
+my ($targetwordlen,$base_overlap,$min_overlap,$verbose,$MIN_READ_LENGTH,$SEQ_SLIDE,$min_base_ratio,$paired,$min_links,$max_link_ratio,$contig_size_cutoff,$insert_stdev,$unpaired_file,$seed_file,$max_trim,$base_name,$tracked,$forcetrack,$max_count_trim,$min_tig_overlap,$npad_gaps,$ignorehead,$space_restriction,$min_depth_of_coverage,$tie_breaker,$ignore_read,$independent)=(15,2,20,0,21,1,0.7,0,5,0.3,100,0.75,"no-g","no-targetSeed",0,"",0,0,10,20,0,0,0,0,0,0,1);
 
-my $version = "[v3.8.5]";
+my $version = "[v4.0]";
 my $dev = "rwarren\@bcgsc.ca";
 my $per;
 my $MAX = 0;
 my $MAX_TOP = 1500; # this is the very maximum anchoring edge sequence that will be searched (designed for use with -s to prevent long searches)
 my $TRACK_COUNT = 0;
+my $LR_MAXDIST_APART = 60000;
 my $illuminaLengthCutoff = 500; ### means all sequence reads greater than this are not illumina sequences
+my $LOW_COVERAGE_TIG_IN_A_ROW = 100; ### SEE THIS MANY CONTIGS IN A ROW HAVING < -w COVERAGE TO TERMINATE
 
 #-------------------------------------------------
 
 if(! $opt_f || ! $opt_w){
-   print "Usage: $0 $version\n";
+   print "\nUsage: $0 $version\n";
    print "-f  File containing all the [paired (-p 1)] reads (required)\n";
    print "\t  With -p 1:\n";
-   print "\t! Target insert size must be indicated at the end of the header line (e.g. :200 for a 200bp insert)\n";
+   print "\t! Target insert size must be indicated at the end of the header line (e.g. :400 for a 400bp fragment/insert size)\n";
    print "\t! Paired reads must be separated by \":\"\n";
-   print "\t  >template_name:200\n\t  ACGACACTATGCATAAGCAGACGAGCAGCGACGCAGCACG:GCGCACGACGCAGCACAGCAGCAGACGAC\n";
-   print "-w  Minimum depth of coverage allowed for contigs (e.g. -w 1 = process all reads [v3.7 behavior], required)\n";
+   print "\t  >header:400 (or >header_barcode:400)\n\t  ACGACACTATGCATAAGCAGACGAGCAGCGACGCAGCACG:GCGCACGACGCAGCACAGCAGCAGACGAC\n";
+   print "-g  Fasta file containing unpaired sequence reads (optional)\n";
+   print "-w  Minimum depth of coverage allowed for contigs (e.g. -w 1 = process all reads [v3.7 behavior], required, recommended -w 5)\n";
    print "    *The assembly will stop when 50+ contigs with coverage < -w have been seen.*\n";
    print "-s  Fasta file containing sequences to use as seeds exclusively (specify only if different from read set, optional)\n";
    print "\t-i Independent (de novo) assembly  i.e Targets used to recruit reads for de novo assembly, not guide/seed reference-based assemblies (-i 1 = yes (default), 0 = no, optional)\n";
@@ -95,13 +99,13 @@ if(! $opt_f || ! $opt_w){
    print "-q  Break tie when no consensus base at position, pick random base (-q 1 = yes, default = no, optional)\n"; 
    print "-p  Paired-end reads used? (-p 1 = yes, default = no, optional)\n";
    print "-v  Runs in verbose mode (-v 1 = yes, default = no, optional)\n";
-   print "============ Options below only considered with -p 1 ============\n";
+   print "============ scaffolding options below only considered with -p 1 ============\n";
    print "-e  Error (%) allowed on mean distance   e.g. -e 0.75  == distance +/- 75% (default -e $insert_stdev, optional)\n";
-   print "-k  Minimum number of links (read pairs) to compute scaffold (default -k $min_links, optional)\n";
+   print "-l  Minimum number of links (read pairs) to compute scaffold (default -k $min_links, optional)\n";
    print "-a  Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n";
-   print "-x  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -x $min_tig_overlap, optional)\n";
-   print "-n  N-pad gaps (-n 1 = yes, default = no $npad_gaps, optional)\n";
-   die "-g  Fasta file containing unpaired sequence reads (optional)\n";
+   #print "-x  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -x $min_tig_overlap, optional)\n";
+   #print "-n  N-pad gaps (-n 1 = yes, default = no $npad_gaps, optional)\n";
+   die "\nError: Missing mandatory options -f and -w.\n\n";
 }
 
 my $file = $opt_f;
@@ -111,7 +115,7 @@ $min_base_ratio = $opt_r if($opt_r);
 $max_trim = $opt_t if($opt_t);
 $verbose = $opt_v if($opt_v);
 $paired = $opt_p if($opt_p);
-$min_links = $opt_k if($opt_k);
+$min_links = $opt_l if($opt_l);
 $max_link_ratio = $opt_a if($opt_a);
 $contig_size_cutoff = $opt_z if($opt_z);
 $insert_stdev = $opt_e if($opt_e);
@@ -146,7 +150,7 @@ if($opt_s && ! -e $opt_s){### seed file specified, but does not exist
    $independent = $opt_i;
    if($independent >= 1 || $independent < 0 || $independent eq ""){
       $independent = 1;
-      $space_restriction = 1;###restrict the space to that of the target when doing targeted de novo assembly, -i mode
+      $space_restriction = 1;###restrict the space to that of the target when doing targeted de novo assembly -i mode
    }
 }
 
@@ -156,25 +160,26 @@ if ($base_name eq ""){
    $base_name = $file . ".ssake_m" . $min_overlap . "_o" . $base_overlap . "_r" . $min_base_ratio . "_t" . $max_trim . "_w" . $min_depth_of_coverage . "_q" . $tie_breaker . "_y" . $ignore_read;
 
    if($paired){
-      $base_name .= "_e" . $insert_stdev . "_k" . $min_links . "_a" . $max_link_ratio . "_z" . $contig_size_cutoff . "_x" . $min_tig_overlap  . "_g-" . $display_unpaired_file;
+      $base_name .= "_e" . $insert_stdev . "_l" . $min_links . "_a" . $max_link_ratio . "_z" . $contig_size_cutoff  . "_g-" . $display_unpaired_file;
    }
    if($opt_s){
-      $base_name .= "_s-" . $display_seed_file . "_i-" . $independent . "_j-" . $targetwordlen . "_u-" . $space_restriction;
+      $base_name .= "_s-" . $display_seed_file . "_i" . $independent . "_j-" . $targetwordlen . "_u-" . $space_restriction;
    }
 
    my $pid_num = getpgrp(0);
    $base_name .= "_pid" . $pid_num;
 }
 
-my $contig = $base_name .  ".contigs";
-my $singlet = $base_name . ".singlets";
-my $short = $base_name . ".short";
+my $contig = $base_name .  "_contigs.fa";
+my $singlet = $base_name . "_singlets.fa";
+my $short = $base_name . "_short.txt";
 my $log = $base_name . ".log";
 my $scaffold = $base_name . ".scaffolds" if ($paired);
-my $mergedtigs = $base_name . ".mergedcontigs" if ($paired);
-my $issues = $base_name . ".pairing_issues" if ($paired);
-my $distribution = $base_name . ".pairing_distribution.csv" if ($paired);
-my $covfile = $base_name . ".coverage.csv" if ($tracked);
+my $scaffold_fasta = $base_name . "_scaffolds.fa" if ($paired);
+my $mergedtigs = $base_name . "_mergedcontigs.fa" if ($paired);#deprecated Jan 2018
+my $issues = $base_name . "_pairing-issues.txt" if ($paired);
+my $distribution = $base_name . "_pairing-distribution.csv" if ($paired);
+my $covfile = $base_name . "_coverage.csv" if ($tracked);
 my $rdpositionfile = $base_name . ".readposition" if ($tracked);
 my $pileupfile = $base_name . ".pileup" if ($space_restriction);
 
@@ -210,8 +215,10 @@ if($min_base_ratio <= 0.5 || $min_base_ratio > 1){
 my $init_message = "\nRunning: $0 $version\n-f $file\n-s $seed_file\n\t-i $independent\n\t-j $targetwordlen\n\t-u $space_restriction\n-h $ignorehead\n-w $min_depth_of_coverage\n-m $min_overlap\n-o $base_overlap\n-r $min_base_ratio\n-t $max_trim\n-q $tie_breaker\n-y $ignore_read\n";
 if($tracked){$init_message .= "-c $tracked\nCoverage: $covfile\nRead position: $rdpositionfile\n";$init_message .= "Pileup: $pileupfile\n" if(! $independent && $space_restriction);}
 if($forcetrack){$init_message .= "-z $contig_size_cutoff\n";}
-if($paired){$init_message .= "-p $paired\n-e $insert_stdev\n-k $min_links\n-a $max_link_ratio\n-x $min_tig_overlap\nUnpaired reads (optional) -g $unpaired_file\nScaffolds: $scaffold\nMerged contigs: $mergedtigs\nPairing issues: $issues\nPairing distance distribution: $distribution\n";}
-$init_message .= "\nContigs: $contig\nSinglets: $singlet\n\nExcluded reads: $short\nLog: $log\n";
+if($paired){$init_message .= "-p $paired\n-e $insert_stdev\n-l $min_links\n-a $max_link_ratio\nUnpaired reads (optional) -g $unpaired_file\nScaffold layout: $scaffold\nPairing issues: $issues\nPairing distance distribution: $distribution\n";}
+$init_message .= "\nSinglets: $singlet\nContigs: $contig\n";
+if($paired){$init_message .= "Scaffolds: $scaffold_fasta\n";}
+$init_message .= "\nExcluded reads: $short\nLog: $log\n";
 
 print $init_message;
 print LOG $init_message;
@@ -238,23 +245,33 @@ if(-e $opt_s){
    print LOG $use_seed_message;
    print $use_seed_message if ($verbose);
    ($seed,$seedsplit) = &loadSeed($opt_s,$targetwordlen); 
+
+
+
 }
 
 my($set,$bin,$matepair);
-($set,$bin,$matepair) = &readFasta($matepair,$set,$bin,$file,$short,$paired,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead);
-($set,$bin,$matepair) = &readFasta($matepair,$set,$bin,$unpaired_file,$short,0,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead) if (-e $opt_g);
-
-if(! $opt_s){### not file provided as seed sequences
-   $seed = $set;     ### the sequencing read set
-   $TRACK_COUNT = 0; 
-}else{### file provided as seed
-   if($independent){### de novo targeted assembly
+my($sumall,$ctall)=(0,0);
+
+($set,$bin,$matepair,$sumall,$ctall) = &readFasta($matepair,$set,$bin,$file,$short,$paired,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead,$sumall,$ctall);
+($set,$bin,$matepair,$sumall,$ctall) = &readFasta($matepair,$set,$bin,$unpaired_file,$short,0,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead,$sumall,$ctall) if (-e $opt_g);
+
+my $fc = $sumall / 3000000000;
+my $statslog = "Total reads interrogated (total bases): $ctall ($sumall) -- human genome equivalent(s) = %.4f\n";
+printf $statslog, $fc;
+printf LOG $statslog, $fc;
+
+if(! $opt_s){
+   $seed = $set;
+   $TRACK_COUNT = 0;
+}else{
+   if($independent){
       my $ind_message = "-i has been set to 1, which means the target sequences are USED for recruiting reads for de novo assembly, NOT target-based reference guided assemblies\n";
       printf $ind_message;
       print LOG $ind_message;
       $seed = {};
-      $seed = $set;  ### the sequencing read set
-   }else{### seed-guided assembly extension
+      $seed = $set;
+   }else{
       my $ind_message = "-i has been set to 0, which means the target sequences are USED for recruiting reads for target-based reference guided assemblies (only de novo extension on the ends\n";
       printf $ind_message;
       print LOG $ind_message;
@@ -300,11 +317,13 @@ print "$status_bar\n.";
 my $keys_start = keys ( %$seed );
 
 my $low_total = 0;
+my $prev_cov = 0;
 
 #--------------------------------------------
 ASSEMBLY:
-foreach my $seq (sort {$seed->{$b}{'count'}<=>$seed->{$a}{'count'}} keys %$seed){#cycle through the input [normalized] reads
-
+foreach my $seq (keys %$seed){#cycle through the input [normalized] reads
+#foreach my $seq (sort {$seed->{$a}{'count'}<=>$seed->{$b}{'count'}} keys %$seed){#cycle through the input [normalized] reads
+#foreach my $seq (sort {$seed->{$b}{'count'}<=>$seed->{$a}{'count'}} keys %$seed){#cycle through the input [normalized] reads
    my $track;
 
    my ($pu_seed_name,$pu_seed_ori)=("","");
@@ -335,25 +354,27 @@ foreach my $seq (sort {$seed->{$b}{'count'}<=>$seed->{$a}{'count'}} keys %$seed)
       ($bin,$set,$seed) = deleteData($bin,$set,$seed,$seq,$encoded);   #remove k-mer from hash table and prefix tree
      
       print "\n\n>>> START SEED SEQUENCE :: $seq <<<\n\n" if ($verbose);
-
-      ($seq, $set, $bin, $reads_needed, $total_bases, $track) = doExtension("3 prime", $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read);
+      my $barcodeList;
+      ($seq, $set, $bin, $reads_needed, $total_bases, $track, $barcodeList) = doExtension("3 prime", $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read,$barcodeList);
       ####end of 3' extension, beginning of 5' extension  (via 3' RC)
       my $seqrc = reverseComplement($seq);
-      ($seqrc, $set, $bin, $reads_needed, $total_bases, $track) = doExtension("5 prime", $orig_mer, $seqrc, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read);
+      ($seqrc, $set, $bin, $reads_needed, $total_bases, $track, $barcodeList) = doExtension("5 prime", $orig_mer, $seqrc, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read,$barcodeList);
       ####end of 5' extension
       my $leng = length($seqrc);
       my $reversetig = reverseComplement($seqrc);                   ### return to sequence, as provided 
       my $trimmed_length = length($start_sequence) - 2*($max_trim);
 
       if($leng > $trimmed_length || ($reads_needed > 1 && $opt_s)){ ### second conditional is similar to TASR, output contig if targeted assembly mode and more reads have been recruited.
+         last ASSEMBLY if ($low_total >= $LOW_COVERAGE_TIG_IN_A_ROW); ### sudden termination based on expected depth ov coverage
 	 ### commented out: && $start_sequence ne $seqrc && $start_sequence ne $reversetig
-         $tig_length->{$tig_count} = $leng;
-         my $cov =  $total_bases / $leng;
-         $low_total++ if ($cov < $min_depth_of_coverage);
-         last ASSEMBLY if ($low_total >= 50);             ### sudden termination based on expected depth ov coverage
 
+         my $cov =  $total_bases / $leng;
          printf TIG ">contig%i|size%i|read%i|cov%.2f$seed_name\n%s\n", ($tig_count,$leng,$reads_needed,$cov,$reversetig);    #print contigs to file
 
+         $tig_length->{$tig_count} = $leng;
+         #if($prev_cov >= $min_depth_of_coverage){ $low_total=0; }###JAN2018 NEED $LOW_COVERAGE_TIG_IN_A_ROW to TERMINATE
+         $low_total++ if ($cov < $min_depth_of_coverage);
+
          if($forcetrack && $leng >= $contig_size_cutoff){
 
             if($tracked){ ### only execute & report if user specifies -c
@@ -477,7 +498,7 @@ foreach my $seq (sort {$seed->{$b}{'count'}<=>$seed->{$a}{'count'}} keys %$seed)
             }
             ($track_all,$alternate) = &trackReads($track,$track_all,$alternate,$tig_count);  ### all pairs from all contigs (track for scaffolding)
          }
-
+         $prev_cov = $cov;
          $tig_count++;
       }else{
          my $cov = $reads_needed;
@@ -551,20 +572,40 @@ if($paired){
    print LOG $sc_end_message;
    $assemblyruninfo.= $sc_end_message . "\n";
 
-   my $me_start_message = "\n=>Merging contigs initiated $date\n";
-   print $me_start_message;
-   print LOG $me_start_message;
-   $assemblyruninfo.= $me_start_message . "\n";
-
-
-   &forcefillGaps($scaffold, $contig, $mergedtigs, $verbose, $min_tig_overlap, $max_count_trim, $npad_gaps, $alternate, $matepair, $min_overlap, $base_overlap, $min_base_ratio, $forcetrack, $max_trim, $encoded,$seedsplit,$space_restriction,$targetwordlen,$insert_stdev);
-
    $date = `date`;
    chomp($date);
-   my $me_end_message = "\nMerging contigs ended $date\n";
-   print $me_end_message;
-   print LOG $me_end_message;
-   $assemblyruninfo.=  $me_end_message . "\n";
+
+   my $sc_fasta_message = "\n\n=>Making scaffold FASTA file: $date\n";
+   print $sc_fasta_message;
+   print LOG $sc_fasta_message;
+   $assemblyruninfo.= $sc_fasta_message . "\n";
+
+   ### reading contigs to make fasta file and get additional needed info
+   my ($tighash, $tignames, $tig_length) = &readContigsMemory($contig);
+   ### building scaffold fasta 
+   &buildScaffoldFasta($scaffold,$tighash,$scaffold_fasta);
+
+   my $sc_fasta_done_message = "Scaffolds FASTA in: $scaffold_fasta\n";
+   print $sc_fasta_done_message;
+   print LOG $sc_fasta_done_message;
+   $assemblyruninfo.= $sc_fasta_done_message . "\n";
+
+   ### DEPRECATED FEATURE JAN2018 (force-fill gap)
+   #$date = `date`;
+   #chomp($date);
+   #my $me_start_message = "\n=>Merging contigs initiated $date\n";
+   #print $me_start_message;
+   #print LOG $me_start_message;
+   #$assemblyruninfo.= $me_start_message . "\n";
+
+   #&forcefillGaps($scaffold, $contig, $mergedtigs, $verbose, $min_tig_overlap, $max_count_trim, $npad_gaps, $alternate, $matepair, $min_overlap, $base_overlap, $min_base_ratio, $forcetrack, $max_trim, $encoded,$seedsplit,$space_restriction,$targetwordlen,$insert_stdev);
+
+   #$date = `date`;
+   #chomp($date);
+   #my $me_end_message = "\nMerging contigs ended $date\n";
+   #print $me_end_message;
+   #print LOG $me_end_message;
+   #$assemblyruninfo.=  $me_end_message . "\n";
 
 }
 
@@ -1010,7 +1051,7 @@ sub pairContigs{
 # SSAKE contig extension
 sub doExtension{
 
-   my ($direction, $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $paired, $tig_count, $max_trim, $e, $matepair, $tie_breaker, $ignore_read) = @_;
+   my ($direction, $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $paired, $tig_count, $max_trim, $e, $matepair, $tie_breaker, $ignore_read, $barcodeList) = @_;
 
    my $extended = 1;
    my $trim_ct = 0;     #trim counter - keeps track of 3'-end trim
@@ -1023,7 +1064,43 @@ sub doExtension{
          my $growing_tig_length = length($seq);
          my ($pos,$span) = (0,"");
 
+         ### Added 15January18 =====================================
+         if($growing_tig_length >= $MAX){   # $seq is length of contig being extended -- if larger than largest read, make sure the largest read could align and all subsequent rds.
+            $span = $MAX - $TRACK_COUNT;
+         }else{
+            $span = $growing_tig_length - $TRACK_COUNT;
+         }
+
+         ### NEW ADD JAN 2018
+         while ($span >= $min_overlap){  # will slide the subseq, until the user-defined min overlap size
+            $pos = $growing_tig_length - $span;
+            print "MAX:$MAX, SPAN:$span, POS:$pos" if ($verbose);
+
+            my $subseq = substr($seq, $pos, $span);              #make a sub-sequence of length l-(1..i) for searching
+            my @s = $subseq =~ /\S{4}/g;
+            my $subset = $bin->{$e->{$s[0]}}{$e->{$s[1]}}{$e->{$s[2]}}{$e->{$s[3]}}; #Will grab everything even the reverse complement ones
+
+            print "####$direction SEARCH Position:$pos Span:$span - Subseq:$subseq Previous:$seq\n" if ($verbose);
+
+            ### SEARCH -- this cycles through limited k-mer space
+            foreach my $pass (sort {$subset->{$b} <=> $subset->{$a}} keys %$subset){
+               if($pass =~ /^$subseq([ACGT]+)/ || $subseq =~ /$pass/){#### OVERHANG || EMBEDDED
+                  #print "BC $subset->{$pass}\n";
+                  if(defined $barcodeList->{$subset->{$pass}}){### barcode seen before
+                     if((length($seq) - $barcodeList->{$subset->{$pass}}{'lastlength'}) > $LR_MAXDIST_APART){
+                        $barcodeList->{$subset->{$pass}}{'count'}=0; ### reset barcode to zero, too far apart, new molecule
+                     }                     
+                  }
+                  $barcodeList->{$subset->{$pass}}{'lastlength'}=length($seq);### new tracks all barcodes encountered
+                  $barcodeList->{$subset->{$pass}}{'count'}++;### new tracks all barcodes encountered
+               }
+            } 
+            $span--;
+         }#while overlap >= user-defined -m minimum
+
+         ### END NEW ADDITION==========================================
          ### Added 19March08
+         ($pos,$span) = (0,"");
          if($growing_tig_length >= $MAX){   # $seq is length of contig being extended -- if larger than largest read, make sure the largest read could align and all subsequent rds.
             $span = $MAX - $TRACK_COUNT;
          }else{
@@ -1052,7 +1129,6 @@ sub doExtension{
 
             ### SEARCH -- this cycles through limited k-mer space
             foreach my $pass (sort {$subset->{$b} <=> $subset->{$a}} keys %$subset){
-
                if($pass =~ /^$subseq([ACGT]+)/){#### OVERHANG 
                   #can we align perfectly that subseq to another rd start?
                   my $dangle = $1;
@@ -1074,8 +1150,7 @@ sub doExtension{
                   ###############################################
                   # CONSIDER CERTAIN READS FOR OVERLAP, PREFERABLY THOSE WITH LOGICAL MATES AND FWD READS IN TIG LARGE ENOUGH TO HAVE SUCH PAIRS
                   foreach my $newpass(keys %$psr){
-                     if($paired && defined $matepair->{$newpass} && ($psr->{$newpass}{'end'} < $psr->{$newpass}{'start'}) && ($psr->{$newpass}{'start'} >= $set->{$newpass}{'grace'})){#paired, pairingRds, <---, outside grace
-
+                     if((length($seq)<=2000 || ((length($seq)>2000 || length($seq)<=10000) && $barcodeList->{$subset->{$newpass}}{'count'}>4) || (length($seq)>10000 && $barcodeList->{$subset->{$newpass}}{'count'}>=9 ) )  && $paired && defined $matepair->{$newpass} && ($psr->{$newpass}{'end'} < $psr->{$newpass}{'start'}) && ($psr->{$newpass}{'start'} >= $set->{$newpass}{'grace'})){#paired, pairingRds, <---, outside grace
                         #print "$B_end <-- $B_start *** [upper limit is grace]***\n";
                         #     <--B
                         #========
@@ -1100,7 +1175,7 @@ sub doExtension{
                               }
                            }
                         }
-                     }else{### not paired or paired (B-->): collect all overlaps
+                     }elsif(length($seq)<=2000 || ((length($seq)>2000 || length($seq)<=10000) && $barcodeList->{$subset->{$newpass}}{'count'}>4) || (length($seq)>10000 && $barcodeList->{$subset->{$newpass}}{'count'}>=9 ) ){### not paired or paired (B-->): collect all overlaps
                         $overhang = collectOverhang($overhang,$newpass,$dangle,$set,$verbose);
                         $overlapping_reads->{$pass}++;
                         $long=1 if(length($newpass) > $illuminaLengthCutoff);
@@ -1108,7 +1183,6 @@ sub doExtension{
                   }###for $newpass
                   #----------------------------------
                }elsif($subseq =~ /$pass/){     #### EMBEDDED
-
                   my $complement_pass = reverseComplement($pass);
 
                   print "$pass found in $subseq ($set->{$pass}{'count'}) - deleting read: $pass and complement ($set->{$complement_pass}): $complement_pass\n\n" if ($verbose);
@@ -1133,7 +1207,7 @@ sub doExtension{
                      my $current_bases = length($complement_pass) * $current_reads;
                      $reads_needed += $current_reads;
                      $total_bases += $current_bases;
-                     if($paired){ 
+                     if($paired){
                         $track->{$complement_pass}{'end'} = $pos + 1;
                         $track->{$complement_pass}{'start'} = $pos + length($complement_pass);
                         $track->{$complement_pass}{'cov'} = $current_reads;
@@ -1316,7 +1390,7 @@ sub doExtension{
    
    print "\n*** NOTHING ELSE TO BE DONE IN $direction - PERHAPS YOU COULD DECREASE THE MINIMUM OVERLAP -m (currently set to -m $min_overlap) ***\n\n" if ($verbose);
 
-   return $seq, $set, $bin, $reads_needed, $total_bases, $track;
+   return $seq, $set, $bin, $reads_needed, $total_bases, $track, $barcodeList;
 }
 
 
@@ -1347,7 +1421,7 @@ sub reverseComplement{
 
 #-----------------
 sub readFasta{
-   my ($matepair,$set,$bin,$file,$short,$paired,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead) = @_;
+   my ($matepair,$set,$bin,$file,$short,$paired,$encoded,$seedsplit,$space_restriction,$targetwordlen,$ignorehead,$sumall,$ctall) = @_;
 
    my $ctrd = 0;
    my $ctline = 0;
@@ -1372,6 +1446,10 @@ sub readFasta{
                my ($rd1,$rd2) = ($1,$2);
                my $head1 = $head . "1";
                my $head2 = $head . "2";
+               ####num
+               my $len = length($rd1) + length($rd2);
+               $sumall+=$len;
+               $ctall+=2;
 
                $head1="" if($ignorehead);
                $head2="" if($ignorehead);
@@ -1395,6 +1473,10 @@ sub readFasta{
          }else{
             $head="" if($ignorehead);
             ($set,$bin,$ctrd,$recruit_mate) = &loadSequence($set,$bin,$ctrd,$sdna,$encoded,$head,$up_iz,$seedsplit,$space_restriction,$targetwordlen,$recruit_mate) if ($sdna =~ /^([ACGT]*)$/i);
+            ####num
+            my $len = length($sdna);
+            $sumall+=$len;
+            $ctall++;
          }
       }elsif(/^\>(\S+)/){
          $head = $1;
@@ -1422,7 +1504,7 @@ sub readFasta{
    print LOG $read_number_message;
    $assemblyruninfo.=$read_number_message . "\n";
 
-   return $set,$bin,$matepair;
+   return $set,$bin,$matepair,$sumall,$ctall;
 }
 
 #-----------------
@@ -1522,8 +1604,12 @@ sub loadSequence{
          $set->{$orig}{'count'}++;
          $set->{$orig}{'names'}{$head}="";
          $set->{$orig}{'grace'} = $up_iz;
-         $bin->{$e->{$f[0]}}{$e->{$f[1]}}{$e->{$f[2]}}{$e->{$f[3]}}{$orig}++;
-         $bin->{$e->{$r[0]}}{$e->{$r[1]}}{$e->{$r[2]}}{$e->{$r[3]}}{$rc}++;
+         my $barcode = "";
+         $barcode = $1 if($head =~ /\_(\w+)/); ### Jan 2018 THERE SHOULD ONLY BE ONE _ FOLLOWED BY BARCODE/INDEX/NUMBER
+         $bin->{$e->{$f[0]}}{$e->{$f[1]}}{$e->{$f[2]}}{$e->{$f[3]}}{$orig} = $barcode;### Jan 2018
+         $bin->{$e->{$r[0]}}{$e->{$r[1]}}{$e->{$r[2]}}{$e->{$r[3]}}{$rc} = $barcode;### Jan 2018
+         #was $bin->{$e->{$f[0]}}{$e->{$f[1]}}{$e->{$f[2]}}{$e->{$f[3]}}{$orig}++;
+         #was $bin->{$e->{$r[0]}}{$e->{$r[1]}}{$e->{$r[2]}}{$e->{$r[3]}}{$rc}++;
  
          $recruit_mate++;
          $ctrd++;
@@ -1643,7 +1729,8 @@ sub forcefillGaps{
             $seq = reverseComplement($seq) if($orient eq "f");  ###turn tig around for extension with previous reads
             my $track;
             my ($reads_needed,$total_bases,$tig_count)=(0,0,0);
-            ($seq, $prev_miniset, $prev_minibin, $reads_needed, $total_bases, $track) = doExtension("GAP left", $MAX, $seq, $prev_miniset, $prev_minibin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read) if($ct);###no need to extend first ($ct=0) contig on the left
+            my $barcodeList;
+            ($seq, $prev_miniset, $prev_minibin, $reads_needed, $total_bases, $track, $barcodeList) = doExtension("GAP left", $MAX, $seq, $prev_miniset, $prev_minibin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read,$barcodeList) if($ct);###no need to extend first ($ct=0) contig on the left
             ($prev_miniset, $prev_minibin)=({},{});
             $seq = reverseComplement($seq);  ###re-change orientation for right extension (both fwd/rev tigs)
 
@@ -1758,7 +1845,7 @@ sub forcefillGaps{
                   my $up_iz = $collect_candidates->{$mateseq} - $min_allowed;
                   ($miniset,$minibin,$ctrd,$recruit_mate) = &loadSequence($miniset,$minibin,$ctrd,$mateseq,$encoded,$head,$up_iz,$seedsplit,$space_restriction,$targetwordlen,$recruit_mate) if(length($mateseq) <= $illuminaLengthCutoff);
                }
-               ($seq, $miniset, $minibin, $reads_needed, $total_bases, $track) = doExtension("GAP right", $MAX, $seq, $miniset, $minibin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read);### extend first contig on the right
+               ($seq, $miniset, $minibin, $reads_needed, $total_bases, $track, $barcodeList) = doExtension("GAP right", $MAX, $seq, $miniset, $minibin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose, $track, $forcetrack, $tig_count, $max_trim, $encoded, $matepair,$tie_breaker,$ignore_read,$barcodeList);### extend first contig on the right
                $prev_miniset = $miniset;###used reads have been deleted
                $prev_minibin = $minibin;
                ($miniset,$minibin)=({},{});### flush custom prefix tree minibin and miniset
@@ -1850,4 +1937,121 @@ sub forcefillGaps{
    close OUT;
 }
 
+#-------------------
+sub readContigsMemory{
+   my $file = shift;
+
+   my $tig_length;
+   my $tignames;
+   my $fh;
+   my $prevhead="";
+   my $seq="";
+   my $cttig=0;
+
+   ###Support for compressed files MAR2016
+   if($file=~/zip$/i){
+      open(IN,"unzip -p $file|") || die "Error reading $file -- fatal\n";
+   }elsif($file=~/gz$/i || $file=~/gzip$/i){
+      open(IN,"gunzip -c $file|") || die "Error reading $file -- fatal\n";
+   }else{
+      open(IN,$file) || die "Error reading $file -- fatal\n";
+   }
+
+   while(<IN>){
+      chomp;
+      if(/^\>(.*)/){
+         my $head=$1;
+
+         if ($head ne $prevhead && $seq ne '' && $prevhead ne ''){
+            $cttig++;
+            $tignames->{$cttig} = $prevhead;
+            $fh->{$cttig} = $seq;
+            $tig_length->{$cttig} = length($seq);
+         }
+         $seq = '';
+         $prevhead = $head;
+      }else{
+         $seq .= uc($_);
+      }
+   }
+   $cttig++;
+   $tignames->{$cttig} = $prevhead;
+   $fh->{$cttig} = $seq;
+   $tig_length->{$cttig} = length($seq);
+
+   return $fh,$tignames,$tig_length;
+}
+
+
+#-------------------
+sub buildScaffoldFasta{
+
+   my ($dotscaffold,$fh,$scaffold_fasta) = @_;
+
+   open(IN,$dotscaffold) || die "Cannot open $dotscaffold for reading -- fatal.\n";
+
+   open(OUT,">$scaffold_fasta") || die "can't write to $scaffold_fasta -- fatal\n";
+
+   my $tot=0;
+   my $ct=0;
+   my $sct=0;
+   while(<IN>){### each line
+      chomp;   
+      my $sc="";
+      my @a = split(/\,/);
+      my @tig;
+   
+      if($a[2]=~/\_/){
+         @tig = split(/\_/,$a[2]);
+      }else{
+         push @tig, $a[2];
+      }
+
+      $sct++;
+      my $tigsum=0;
+      print OUT ">$_\n";
+      foreach my $t (@tig){
+         $ct++;
+    
+         if($t=~/([fr])(\d+)z(\d+)(\S+)?/i){
+
+            my $orient = $1;
+            my $tnum=$2;
+            my $head = $orient . $tnum;
+            my $search = $tnum;
+            my $other = $4;
+            $tot+= $3;
+            $tigsum +=$3;
+
+            my $gap = "NA"; 
+            my $numlinks = "NA";
+            my $linksratio = "NA";
+
+            my $gapseq = "";
+
+            $numlinks = $1 if($other=~/k(\d+)/);
+            $linksratio = $1 if($other=~/a(\d+.*)m/);
+	    $gap = $1 if($other=~/m(\-?\d+)/);
+
+            my $seq = $fh->{$search};
+            $seq = reverseComplement($seq) if($orient eq "r");
+
+            $gapseq = "N" x $gap if($gap > 0);
+            $gapseq = "n" if($gap ne "NA" && $gap <= 0 );
+            $seq .= $gapseq;
+
+            print OUT "$seq";         
+                
+         }#tig regex
+   
+      }#each tig
+      print OUT "\n";
+   }
+
+   close CORROUT;
+   close IN;
+   close OUT;
+
+}
+
 ## We hope this code is useful to you -- Please send comments & suggestions to rwarren at bcgsc.ca


=====================================
SSAKE-readme.pdf deleted
=====================================
Binary files a/SSAKE-readme.pdf and /dev/null differ


=====================================
debian/changelog
=====================================
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,10 +1,20 @@
-ssake (3.8.5-2) UNRELEASED; urgency=medium
+ssake (4.0-1) unstable; urgency=medium
 
+  * New upstream version
+
+  [ Steffen Moeller ]
   * debian/upstream/metadata
     - yamllint cleanliness
     - added references to registries
 
- -- Steffen Moeller <moeller at debian.org>  Sun, 10 Sep 2017 12:10:09 +0200
+  [ Andreas Tille ]
+  * Adapt watch file to new tarball versioning
+  * cme fix dpkg-control
+  * debhelper 11
+  * README is now delivered in md format
+  * Remove potentially privacy breaching logo from readme
+
+ -- Andreas Tille <tille at debian.org>  Sun, 18 Feb 2018 16:45:52 +0100
 
 ssake (3.8.5-1) unstable; urgency=medium
 


=====================================
debian/compat
=====================================
--- a/debian/compat
+++ b/debian/compat
@@ -1 +1 @@
-10
+11


=====================================
debian/control
=====================================
--- a/debian/control
+++ b/debian/control
@@ -4,8 +4,9 @@ Uploaders: Andreas Tille <tille at debian.org>,
            Charles Plessy <plessy at debian.org>
 Section: science
 Priority: optional
-Build-Depends: debhelper (>= 10)
-Standards-Version: 4.0.0
+Build-Depends: debhelper (>= 11~),
+               python-markdown
+Standards-Version: 4.1.3
 Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/ssake.git
 Vcs-Git: https://anonscm.debian.org/git/debian-med/ssake.git
 Homepage: http://www.bcgsc.ca/platform/bioinfo/software/ssake
@@ -15,7 +16,7 @@ Architecture: all
 Depends: ${misc:Depends},
          ${perl:Depends},
          python,
-         libperl4-corelibs-perl | perl (<< 5.12.3-7)
+         libperl4-corelibs-perl
 Suggests: ssake-examples
 Description: genomics application for assembling millions of very short DNA sequences
  The Short Sequence Assembly by K-mer search and 3′ read Extension


=====================================
debian/copyright
=====================================
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,9 +1,9 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 Source: http://www.bcgsc.ca/platform/bioinfo/software/ssake/releases/3.8/ssake_v3-8-tar.gz
 
 Files: *
-Copyright: © 2006-2011  Canada's Michael Smith Genome Science Centre
-           © 2006–2011 Rene Warren <rwarren at bcgsc.ca>
+Copyright: © 2006-2018  Canada's Michael Smith Genome Science Centre
+           © 2006–2018 Rene Warren <rwarren at bcgsc.ca>
 License: GPL-2+
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -21,7 +21,7 @@ Comment: On Debian systems, the complete text of the GNU General Public
  License version 2 can be found in ‘/usr/share/common-licenses/GPL-2’.
 
 Files: debian/*
-Copyright: © 2008 Andreas Tille <tille at debian.org>
+Copyright: © 2008-2018 Andreas Tille <tille at debian.org>
            © 2009 Charles Plessy <plessy at debian.org>
 License: Same-as-SSAKE-itelf
  (see above)


=====================================
debian/patches/do_not_send_mail_after_testing.patch
=====================================
--- a/debian/patches/do_not_send_mail_after_testing.patch
+++ b/debian/patches/do_not_send_mail_after_testing.patch
@@ -1,10 +1,10 @@
 Author: Andreas Tille <tille at debian.org>
 Last-Update: Sat, 19 Jul 2014 00:09:21 +0200
-Description: Prevent sending e-mail to upstream in case of successfull test
- 
+Description: Prevent sending e-mail to upstream in case of successful test
+
 --- a/SSAKE
 +++ b/SSAKE
-@@ -574,15 +574,15 @@ exit;
+@@ -615,15 +615,15 @@ exit;
  eval{
     my $wdir = `pwd`;
     chomp($wdir);


=====================================
debian/patches/no_privacy_breach_logo.patch
=====================================
--- /dev/null
+++ b/debian/patches/no_privacy_breach_logo.patch
@@ -0,0 +1,13 @@
+Description: Remove potentially privacy breaching logo from readme
+Author: Andreas Tille <tille at debian.org>
+Last-Update: Sun, 18 Feb 2018 16:26:04 +0100
+
+--- a/readme.md
++++ b/readme.md
+@@ -1,6 +1,3 @@
+-![Logo](https://github.com/warrenlr/ssake/blob/master/ssake-logo.png)
+-
+-
+ # SSAKE
+ ## Short Sequence Assembly by K-mer search and 3' read Extension
+ ## SSAKE v4.0 Rene L. Warren, 2006-2018


=====================================
debian/patches/series
=====================================
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,2 @@
 do_not_send_mail_after_testing.patch
+no_privacy_breach_logo.patch


=====================================
debian/rules
=====================================
--- a/debian/rules
+++ b/debian/rules
@@ -24,6 +24,10 @@ override_dh_installexamples:
 	    sed -i '1 i #!/bin/sh' $${sh} ; \
 	done
 
+override_dh_installdocs:
+	dh_installdocs
+	markdown_py -f $(CURDIR)/debian/$(DEB_SOURCE)/usr/share/doc/$(DEB_SOURCE)/README.html readme.md
+
 ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
 TESTDIR := $(shell mktemp -d)
 override_dh_auto_test:


=====================================
debian/ssake.docs
=====================================
--- a/debian/ssake.docs
+++ b/debian/ssake.docs
@@ -1,2 +1 @@
-SSAKE.readme*
 tools/TQS.readme


=====================================
debian/watch
=====================================
--- a/debian/watch
+++ b/debian/watch
@@ -1,6 +1,6 @@
-version=3
+version=4
 
-opts=downloadurlmangle=s{([\d.]+)$}{$1/ssake_v$1-tar.gz};s{_v(\d)\.(\d)\.}{_v$1-$2-}g,filenamemangle=s{.+/([\d.]+)$}{ssake_$1.tar.gz} \
+opts=downloadurlmangle=s{(\d)\.(\d+)$}{$1.$2/ssake_v$1-$2.tar.gz};s{_v(\d)\.(\d)\.}{_v$1-$2-}g,filenamemangle=s{.+/([\d.]+)$}{ssake_$1.tar.gz} \
      http://www.bcgsc.ca/platform/bioinfo/software/ssake/releases/([\d.]*)
 
 # http://www.bcgsc.ca/platform/bioinfo/software/ssake/releases/3.8.2/ssake_v3-8-2-tar.gz


=====================================
SSAKE-readme.txt → readme.md
=====================================
--- a/SSAKE-readme.txt
+++ b/readme.md
@@ -1,70 +1,96 @@
-Short Sequence Assembly by K-mer search and 3' read Extension (SSAKE) 
+![Logo](https://github.com/warrenlr/ssake/blob/master/ssake-logo.png)
 
-SSAKE v3.8.5 Rene L. Warren, 2006-2017
-email: rwarren at bcgsc.ca
 
+# SSAKE
+## Short Sequence Assembly by K-mer search and 3' read Extension
+## SSAKE v4.0 Rene L. Warren, 2006-2018
+## email: rwarren [at] bcgsc [dot] ca
+## Visit www.bcgsc.ca/bioinfo/software/ssake for additional information
 
-Description
+
+### Description
 -----------
 
 SSAKE is a genomics application for de novo assembly of millions of very short DNA sequences.
 It is an easy-to-use, robust, reliable and tractable assembly algorithm for short sequence reads, such as those generated by Illumina Ltd. 
 
 
-For best performance
+### For best performance
 --------------------
 
 Best assembly results are achieved with quality-trimmed reads.  When dealing with Solexa/Illumina sequences, remove low quality bases, whenever possible, with:
 
-TQS.py -f _seq.txt -q _prb.txt -t 5 -d 5 -l #CYCLES -c 20  OR OTHER SETTINGS 
-TQSfastq.py -f myfile.fq -t 20 -c 30 -e 64
-
-example:
-~/ssake_v3.8.5/tools/TQSfastq.py -f ../myIlluminaSeqLane_1.fq -c 30 -t 20  ##mate no.1
-~/ssake_v3.8.5/tools/TQSfastq.py -f ../myIlluminaSeqLane_2.fq -c 30 -t 20  ##mate no.2
-~/ssake_v3.8.5/tools/makePairedOutput2UNEQUALfiles.pl ../myIlluminaSeqLane_1.fq_T20C20E64.trim.fa ../myIlluminaSeqLane_2.fq_T20C20E64.trim.fa
+<pre>
+~/ssake_v4.0/tools/
+TQS.py 
+eg. TQS.py -f _seq.txt -q _prb.txt -t 5 -d 5 -l #CYCLES -c 20  OR OTHER SETTINGS 
+TQSfastq.py
+eg. TQSfastq.py -f myfile.fq -t 20 -c 30 -e 64
+TQSfastq.pl
+eg. TQSfastq.pl -f human.fof -q 20 -n 70 -e 33
+where human.fof contains fastq files to trim
+make sure the trimmed fastq ends with 1 and 2 for the first and second reads of a pair
+eg. makePairedOutput2UNEQUALfiles.pl NA24143_genome_phased_namesorted.bam1_1.fqc70q20e33.fa NA24143_genome_phased_namesorted.bam1_2.fqc70q20e33.fa 350
 will produced "paired.fa" and "unpaired.fa"
-~/ssake_v3.8.5/SSAKE -f paired.fa -g unpaired.fa -p 1 -m 17 -o 4 -c 1 -w 5
+~/ssake_v4.0/SSAKE -f paired.fa -g unpaired.fa -p 1 -m 20 -o 3 -c 1 -w 5
+
+A BASIC PIPELINE TO HELP YOU PREPARE YOUR INPUT PAIRED READS EXISTS (./tools/runSSAKE.sh)
+
+USAGE: ./runSSAKE.sh read1.fq read2.fq libraryFragmentLength basename
+
+</pre>
 
 The scripts are located in ./tools subdirectory included with this release.
 It is recommended that you run TQS.py/TQSfastq.py for every tile (batch job) and cat the outputted fasta file, especially if your data set is large (e.g. entire flowcell)
 
 For trimming paired-end sequences (using _seq.txt and _prb.txt from Illumina), please refer to TRIMMING_PAIRED_READS.README located in the ./tools subdirectory
 
-
-What's new in v3.8.5 ?
+### What's new in v4.0 ?
 ----------------------
-
-Implements targeted de novo assembly. Fixed a bug that manifested when running SSAKE in targeted (-s) de novo assembly mode. Contig sequences as long as the seed sequence were previously misclassified as singlets. (thanks Gilberto Vaughan for reporting)
-
-
-What's new in v3.8.4 ?
+<pre>
+-Initial support for linked reads, such as those produced by the 10x Genomics Chromium technology
+-Linked reads are supplied in the original SSAKE format, but the barcode information is indicated by sequence following a single underscore "_"
+-Added a C. elegans linked read data assembly script in the test folder
+-Scaffolding options consistent with that of the LINKS scaffolder
+-SSAKE no longer sorts by/prioritizes high multiplicity reads for assembly
+-Streamlined file names
+-Deprecated force-fill gap feature
+-Bug fixes
+</pre>
+
+### What's new in v3.8.5 ?
 ----------------------
+<pre>
+Implements targeted de novo assembly. Fixed a bug that manifested when running SSAKE in targeted (-s) de novo assembly mode. Contig sequences as long as the seed sequence were previously misclassified as singlets.
+</pre>
 
+### What's new in v3.8.4 ?
+----------------------
+<pre>
 Improvements to the targeted assembly functionality, recruiting whole read pairs for de novo assembly when at least one read has a k-mer match. This has the potential to extend the reconstructed contigs by 2X the library fragment size (upstream and downstream) of the target sequence when run in targeted de novo assembly mode -s and -i 1 (default).
+</pre>
 
-
-What's new in v3.8.3 ?
+### What's new in v3.8.3 ?
 ----------------------
+<pre>
+Included tie-breaker option (-q) when determining consensus from equal-coverage bases. When set (-q 1), the parameter r is overridden. Note that even though the base is "randomly" chosen in such situations, SSAKE will only extend contigs in cases where there is read support over the chosen base. Included option to ignore reads making up the consensus base extension (-y).
+</pre>
 
-Included tie-breaker option (-q) when determining consensus from equal-coverage bases. When set (-q 1), the parameter r is overridden. Note that even though a "random" base is chosen in such situations, SSAKE will only extend contigs in cases where there is read supportover the chosen base. Included option to ignore reads making up the consensus base extension (-y).
-
-
-What's new in v3.8.2 ?
+### What's new in v3.8.2 ?
 ----------------------
-
+<pre>
 Included target word length (-j option) - TASR behaviour (equivalent to -k option in TASR)
+</pre>
 
-
-What's new in v3.8.1 ?
+### What's new in v3.8.1 ?
 ----------------------
-
+<pre>
 fixed SSAKE for Perl >= 5.16.0, where deprecated getopts.pl has been removed. Thanks to Nicola Soranzo for sending the fix.
+</pre>
 
-
-What's new in v3.8+ ?
+### What's new in v3.8+ ?
 ---------------------
-
+<pre>
 This release is 30% faster than v3.7, made possible by 4-base encoding of the first 16 bases of each read while populating a 4-nodes (16/4) prefix tree.  Also, the newly implemented (and required) -w option, gives users more control over the assembly, focusing on higher-depth contigs and ignoring short, low-depth contigs comprised of NGS reads having errors, contaminating reads or any other (perhaps unwanted) sequences. 
 
 *The assembly of 25M high-quality (100bp x 2 x Q30) bacterial NGS reads (1 lane, Illumina HiSeq) with SSAKE v3.7 in single-end mode took 2h50.  SSAKE v3.8 ran for 2h on the same data.  A SSAKE v3.8 assembly restricted to high-depth contigs (-w 10) ran in 33 minutes, and eliminated thousands of short, low depth of coverage contigs that would otherwise arise from contaminating reads and reads with base errors.    
@@ -75,6 +101,7 @@ Campylobacter showae CC57C colorectal cancer tumor isolate (~2Mbp genome) Illumi
 ftp://ftp.bcgsc.ca/supplementary/SSAKE/CC57C_paired.fa and CC57C_unpaired.fa
  
 SSAKE ASSEMBLY PIPELINE:
+./test/runSSAKE.sh
 
 ./tools/TQSfastq.py -f Assemble_1_R1.fastq -t 30 -c 100 -e 33
 ./tools/TQSfastq.py -f Assemble_1_R2.fastq -t 30 -c 100 -e 33
@@ -82,125 +109,82 @@ cat Assemble_1_R2.fastq_T30C100E33.trim.fa |perl -ne 'if(/^(\>\@\S+)/){print "$1
 cat Assemble_1_R1.fastq_T30C100E33.trim.fa |perl -ne 'if(/^(\>\@\S+)/){print "$1a\n";}else{print;}' >Assemble_1_R1.fastq_T30C100E33.trimFIX.fa
 ./tools/makePairedOutput2UNEQUALfiles.pl Assemble_1_R1.fastq_T30C100E33.trimFIX.fa Assemble_1_R2.fastq_T30C100E33.trimFIX.fa 400
 ./Syrupy-1.4.0/scripts/syrupy.py ./SSAKE -f CC57C_paired.fa -p 1 -g CC57C_unpaired.fa -m 20 -w 5 -b run2014
-
-./ssake_v3.8.5/tools/getStats.pl run2014.contigs
+./ssake_v4.0/tools/getStats.pl run2014_contigs.fa
+./ssake_v4.0/tools/getStats.pl run2014_scaffolds.fa
 
 TRY IT OUT BY SIMPLY RUNNING:
 
 cd test;./MiSeqCampylobacterAssemblyPIPELINE.sh
 cd test;./MiSeqCampylobacterAssembly.sh
+</pre>
 
 
-CONTIG SEQUENCE STATS
---------------------------------------------------------------------------------
-Mean (nt),10476.89
-Max (nt),119107
-Min (nt),200
-n,215
-Stdev (nt),19021.54
-Variance (nt),361818996.83
-TrimmedMean (nt),2763.44
-Median (nt),412.00
-Sum (nt),2252531.00
-N20,64969
-N50,41436
-N80,17465
-Size Range,#bases,#sequences
->=100000,119107,1
-10000-100000,1951445,63
-200-1000,36330,119
-1000-10000,145649,32
-
-./ssake_v3.8.5/tools/makeFastaFileFromScaffolds.pl run2014.scaffolds
-./ssake_v3.8.5/tools/getStats.pl run2014.scaffolds.fa
-
-SCAFFOLD SEQUENCE STATS
---------------------------------------------------------------------------------
-Mean (nt),14931.05
-Max (nt),252560
-Min (nt),200
-n,151
-Stdev (nt),42086.22
-Variance (nt),1771250106.52
-TrimmedMean (nt),349.95
-Median (nt),288.00
-Sum (nt),2254589.00
-N20,227805
-N50,124387
-N80,59173
-Size Range,#bases,#sequences
->=100000,1350704,8
-10000-100000,805378,18
-200-1000,33684,113
-1000-10000,63291,12
-
-
-What's new in v3.7+ ?
+### What's new in v3.7+ ?
 ---------------------
-
+<pre>
 v3.7+ Improved support for seed-based -s assemblies, notably read-space restriction option -u (TASR behavior, without fastq support)
+</pre>
 
-
-What's new in v3.6+ ?
+### What's new in v3.6+ ?
 ---------------------
-
+<pre>
 v3.6+ supports various insert size sequence libraries.  To work with paired data, users must add ":insert_size" (e.g. >SLXA23-1-1-2-13:200 for a 200bp library) at the end of the fasta header (>) for each pairs. v3.6 also has preliminary support for Sanger, paired-end reads.
+</pre>
 
-
-What's new in v3.5+ ?
+### What's new in v3.5+ ?
 ---------------------
-
+<pre>
 In v3.5, the read pairing logic is used in the extension process.  More specifically, passed the upper bound insert size, only forward reads AND reverse reads having their assembled mate already assembled in the contig being built will be considered for extension.  This has for effect to mitigate contig misassemblies due to repeats.  It will also extend the end of adjacent contigs in a scaffold in an effort to fill gaps - the resulting contigs are placed in the .mergedcontigs file.
+</pre>
 
-
-What's new in v3.4+ ?
+### What's new in v3.4+ ?
 ---------------------
+<pre>
+Version 3.4 exploits paired-end reads to explore possible contig merges within scaffolds (Consecutive contigs >= -z bases must overlap by -m bases or more). Version v3.4.1 allows a user to merge all contigs of a scaffold by padding predicted gaps with Ns (-n 1) and predicted but undetected overlaps with a single (n).  Merged contigs are outputted in the _mergedcontigs.fa file.  The default behaviour (-n 0) is to NOT pad the gaps with Ns (v3.4 behaviour).  In the v3.4.1, the _readposition.txt file tracks read names instead of read sequences as the latter can be inferred from the start and end coordinates. THIS IS NOW (v4.0) DEPRECATED.
+</pre>
 
-Version 3.4 exploits paired-end reads to explore possible contig merges within scaffolds (Consecutive contigs >= -z bases must overlap by -m bases or more). Version v3.4.1 allows a user to merge all contigs of a scaffold by padding predicted gaps with Ns (-n 1) and predicted but undetected overlaps with a single (n).  Merged contigs are outputted in the .mergedcontigs file.  The default behaviour (-n 0) is to NOT pad the gaps with Ns (v3.4 behaviour).  In the v3.4.1, the .readposition file tracks read names instead of read sequences as the latter can be inferred from the start and end coordinates.
-
-
-What's new in v3.3+ ?
+### What's new in v3.3+ ?
 ---------------------
-
+<pre>
 Fixed a bug in PET routine.  User can now track read position and individual base coverage for reads *fully embedded* within contigs, using the -c option.
+</pre>
 
-
-What's new in v3.2.1+ ?
+### What's new in v3.2.1+ ?
 -----------------------
-
+<pre>
 This release runs ~30% faster and requires ~33% less RAM compared to 15-node prefix tree SSAKE v3.2.0.1 beta.
 Compared to the previous v3.2 release (11-node prefix tree), it will run at ca. double the speed, requiring ~20% more RAM.
+</pre>
 
-
-What's new in v3.2+ ?
+### What's new in v3.2+ ?
 ---------------------
-
+<pre>
 The -t option, first introduced in SSAKE 1.3, is back in this release.  The option allows you to trim your contigs in 3', 1 base at a time until a maximum base trim value (-t) is reached.  This option yields longer contigs, but increases assembly run time and, at high t values, might introduce contig misassemblies if your run parameters (i.e. -m, -o & -r) are not stringent enough.  At -m 16 -o 3 -r 0.7, best results were obtained with -t 1.  That's because it removes bases that cause premature breaks during the fragment assembly.  If set, end-trimming kicks in only when all possibilities have been exhausted for a contig extension.
 This release also fixes a major bug that prevented SSAKE from exploring the entire read space for contig extensions seeded by the shorter reads.
+</pre>
 
-
-What's new in v3.1+ ?
+### What's new in v3.1+ ?
 ---------------------
-
+<pre>
 SSAKE now allows users to input a fasta file with DNA sequences for use as seeds to nucleate contig extension.
 This feature can be used to extend existing/known DNA sequences using millions of short reads.
 There's a new input format for paired-end reads, which allows reads of variable length to be considered (such as quality-trimmed reads)
+</pre>
 
-
-What's new in v3.0+ ?
+### What's new in v3.0+ ?
 ---------------------
-
+<pre>
 SSAKE supports Illumina paired-end read data to build scaffolds. 
+</pre>
 
-
-What's new in v2.0+ ?
+### What's new in v2.0+ ?
 ---------------------
-
+<pre>
 SSAKE can now handle error-rich data sets by looking through the overlapping k-mer space for consensus bases overhanging a seed sequence or contig.
 SSAKE now runs on reads of various lengths.  That means quality base trimming of individual sequences can be achieved (using TQS.py/TQSfastq.py supplied in ./tools directory).
+</pre>
 
-
-Implementation and requirements
+### Implementation and requirements
 -------------------------------
 
 SSAKE is implemented in PERL and runs on any OS where PERL is installed.
@@ -222,18 +206,18 @@ PLEASE READ:
 SSAKE might not be suited to work with 454-type reads.  Simply because recurring base insertions/deletions errors, such as those commonly seen in homopolymeric regions, will not assemble well in the context of the SSAKE algorithm scheme. Sanger reads are ok, as long as reads are quality-trimmed.
 
 
-Install
+### Install
 -------
 
 Download the tar ball, gunzip and extract the files on your system using:
-
-gunzip ssake_v3-8-5.tar.gz
-tar -xvf ssake_v3-8-5.tar
-
+<pre>
+gunzip ssake_v4-0.tar.gz
+tar -xvf ssake_v4-0.tar
+</pre>
 Change the shebang line of SSAKE to point to the version of perl installed on your system and you're good to go.
 
 
-Documentation
+### Documentation
 -------------
 
 Refer to the SSAKE.readme file on how to run SSAKE and the SSAKE web site for information about the software and its performance 
@@ -242,33 +226,35 @@ www.bcgsc.ca/bioinfo/software/ssake
 Questions or comments?  We would love to hear from you!
 
 
-Citing SSAKE
+### Citing SSAKE
 ------------
 
 Thank you for using, developing and promoting this free software.
 If you use SSAKE for you research, please cite:
 
+<pre>
 Warren RL, Sutton GG, Jones SJM, Holt RA.  2007.  Assembling millions of short DNA sequences using SSAKE.  Bioinformatics. 23(4):500-501
+</pre>
 
-
-Running SSAKE
+### Running SSAKE
 -------------
+<pre>
+e.g. SSAKE -f paired.fa -m 20 -o 3 -r 0.7 -p 1 -c 1 -e 0.75 -l 5 -a 0.3 -z 50 -w 5 -g unpaired.fa 
 
-e.g. SSAKE -f paired.fa -m 17 -o 4 -r 0.7 -p 1 -c 1 -e 0.75 -k 2 -a 0.6 -z 50 -w 5 -g unpaired.fa 
-
-Usage: ./SSAKE [v3.8.5]
+Usage: ./SSAKE [v4.0]
 -f  File containing all the [paired (-p 1)] reads (required)
-	  With -p 1:
-	! Target insert size must be indicated at the end of the header line (e.g. :200 for a 200bp insert)
-	! Paired reads must be separated by ":"
-	  >template_name:200
-	  ACGACACTATGCATAAGCAGACGAGCAGCGACGCAGCACG:GCGCACGACGCAGCACAGCAGCAGACGAC
--w  Minimum depth of coverage allowed for contigs (e.g. -w 1 = process all reads [v3.7 behavior], required)
+          With -p 1:
+        ! Target insert size must be indicated at the end of the header line (e.g. :400 for a 400bp fragment/insert size)
+        ! Paired reads must be separated by ":"
+          >header:400 (or >header_barcode:400)
+          ACGACACTATGCATAAGCAGACGAGCAGCGACGCAGCACG:GCGCACGACGCAGCACAGCAGCAGACGAC
+-g  Fasta file containing unpaired sequence reads (optional)
+-w  Minimum depth of coverage allowed for contigs (e.g. -w 1 = process all reads [v3.7 behavior], required, recommended -w 5)
     *The assembly will stop when 50+ contigs with coverage < -w have been seen.*
 -s  Fasta file containing sequences to use as seeds exclusively (specify only if different from read set, optional)
-	-i Independent (de novo) assembly  i.e Targets used to recruit reads for de novo assembly, not guide/seed reference-based assemblies (-i 1 = yes (default), 0 = no, optional)
-	-j Target sequence word size to hash (default -j 15)
-	-u Apply read space restriction to seeds while -s option in use (-u 1 = yes, default = no, optional)
+        -i Independent (de novo) assembly  i.e Targets used to recruit reads for de novo assembly, not guide/seed reference-based assemblies (-i 1 = yes (default), 0 = no, optional)
+        -j Target sequence word size to hash (default -j 15)
+        -u Apply read space restriction to seeds while -s option in use (-u 1 = yes, default = no, optional)
 -m  Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 20)
 -o  Minimum number of reads needed to call a base during an extension (default -o 2)
 -r  Minimum base ratio used to accept a overhang consensus base (default -r 0.7)
@@ -281,16 +267,19 @@ Usage: ./SSAKE [v3.8.5]
 -q  Break tie when no consensus base at position, pick random base (-q 1 = yes, default = no, optional)
 -p  Paired-end reads used? (-p 1 = yes, default = no, optional)
 -v  Runs in verbose mode (-v 1 = yes, default = no, optional)
-============ Options below only considered with -p 1 ============
+============ scaffolding options below only considered with -p 1 ============
 -e  Error (%) allowed on mean distance   e.g. -e 0.75  == distance +/- 75% (default -e 0.75, optional)
--k  Minimum number of links (read pairs) to compute scaffold (default -k 4, optional)
--a  Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a 0.5, optional)
--x  Minimum overlap required between contigs to merge adjacent contigs in a scaffold (default -x 20, optional)
--n  N-pad gaps (-n 1 = yes, default = no 0, optional)
--g  Fasta file containing unpaired sequence reads (optional)
+-l  Minimum number of links (read pairs) to compute scaffold (default -k 5, optional)
+-a  Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a 0.3, optional)
 
+A BASIC PIPELINE TO HELP YOU PREPARE YOUR INPUT PAIRED READS EXISTS (./test/runSSAKE.sh)
 
-Test data
+USAGE: ./runSSAKE.sh read1.fq read2.fq libraryFragmentLength basename
+
+
+</pre>
+
+### Test data
 ---------
 
 Go to the test folder, (cd test)
@@ -305,7 +294,7 @@ The assembly above uses a single seed sequence located in (Herpesvirus_3.60kb.se
 compare your results with Herpesvirus_3.60kb.reference.fa to see how successful the assembly was
 
 TEST DATA / SSAKE ASSEMBLIES
-
+<pre>
 A) Testing the distribution with very short reads:
 ../SSAKE -f Herpesvirus_3.60kb.reads.fa -m 16 -o 2 -w 5 -c 1 
 
@@ -328,15 +317,15 @@ C) Testing SSAKE on real (experimental) Illumina sequence data
 or 
 
 ./MiSeqCampylobacterAssembly.sh
-(just the assembly)
+(read download, assembly)
 
 
 3) Escherichia coli / 2014 Illumina MiSeq data
 
 ./MiSeqEcoliAssembly250XPE300.sh
-(just the assembly)
+(read download, assembly)
 
-This is illumina MiSeq base space data (one tenth of 2500-fold coverage run
+This is illumina MiSeq base space data (one tenth of 2500-fold coverage run)
 sequence ~ 250X, 550bp fragments PE300
 
 *compare your assembly to:
@@ -346,7 +335,7 @@ coliMiSeq300m80.scaffolds.stats1
 
 4) Fusobacterium nucleatum - CRC tumor isolate / Illumina HiSeq 2000 data
 ./HiSeqFusobacteriumAssembly.sh
-(just the assembly)
+(read download, assembly)
 
 
 5) De Novo Targeted assembly of a TMPRSS2:ERG fusion using a prostate adenocarcinoma RNA-seq dataset
@@ -354,7 +343,43 @@ coliMiSeq300m80.scaffolds.stats1
 (read download,trimming,formatting,assembly)
 
 
-How it works
+D) Testing SSAKE on LRsim linked-read data
+
+1) C. elegans linked-read assembly
+./CelegansLinkedReadsAssembly.sh
+(read download, assembly)
+</pre>
+
+
+SSAKE v4.0 RUN TESTS (all tests provided in the test folder)
+January 2018
+
+Contigs:
+
+species|read type|n|n:500|L50|min|N80|N50|N20|E-size|max|sum|time (h:mm:ss)|RAM (GB)|file name
+---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
+Herpes virus|Illumina single-end|7|1|1|60000|60000|60000|60000|60000|60000|60000|0:00:22|0.2|HStestInstall_contigs.fa
+Ebola virus|Illumina paired-end|122|18|1|501|688|18917|18917|12139|18917|30101|0:01:22|0.4|ebola_contigs.fa
+F. nucleatum|HiSeq paired-end|697|560|131|501|2540|4641|8019|5631|19764|1937271|0:10:43|6.0|fusoCC53_contigs.fa
+C. showae|MiSeq paired-end|206|109|21|562|18061|32736|64004|40613|107028|2222124|0:06:58|3.5|CC57C_contigs.fa
+E. coli|MiSeq paired-end|101|97|12|687|80150|127399|210819|146845|318067|4606049|0:21:50|9.3|coliMiSeq300m80_contigs.fa
+C. elegans|LRsim linked-read (paired) |5352|4949|613|502|19304|44626|90857|58011|305563|95.57e6|4:05:34|79.1|celegansLR_contigs.fa
+
+Scaffolds:
+
+species|read type|n|n:500|L50|min|N80|N50|N20|E-size|max|sum|name
+---|---|---|---|---|---|---|---|---|---|---|---|---
+F. nucleatum|HiSeq paired-end|638|524|120|501|2756|5067|9273|6141|19764|1943677|fusoCC53_scaffolds.fa
+C. showae|MiSeq paired-end|135|42|6|585|47914|128053|233500|159643|382374|2223249|Cshowae_scaffolds.fa
+E. coli|MiSeq paired-end|86|82|5|687|91172|204889|1193524|437907|1193524|4606049|coliMiSeq300m80_scaffolds.fa
+C. elegans|LRsim linked-read (paired)|4637|4309|491|502|22274|53866|112496|74063|357928|95.59e6|celegansLR_scaffolds.fa
+
+stats generated with abyss-fac
+
+benchmark: Intel(R) Xeon(R) CPU E7-8867 v3 @ 2.50GHz 128CPU 2TB RAM CentOS7/ 1 thread per assembly
+
+
+### How it works
 ------------
 
 1. Sequence Overlap
@@ -366,9 +391,11 @@ Once the search complete, a consensus sequence is derived from the hash table c,
 The process of progressively cycling through longer to shorter 3'-most k-mer is repeated after every sequence extension until nothing else can be done on that side.  Since only left-most searches are possible with a prefix tree, when all possibilities have been exhausted for the 3' extension, the complementary strand of the contiguous sequence generated is used to extend the contig on the 5' end.  The DNA prefix tree is used to limit the search space by segregating sequence reads and their reverse-complemented counterparts by their first eleven 5' end bases.  
 
 There are three ways to control the stringency in SSAKE:
-1) Disallow read/contig extension if the coverage is too low (-o).  Higher -o values lead to shorter contigs, but minimizes sequence misassemblies.
-2) Adjust the minimum overlap -m allowed between the seed/contig and short sequence reads.  Higher m values lead to more accurate contigs at the cost of decreased contiguity.  
-3) Set the minimum base ratio -r to higher values
+<pre>
+i) Disallow read/contig extension if the coverage is too low (-o).  Higher -o values lead to shorter contigs, but minimizes sequence misassemblies.
+ii) Adjust the minimum overlap -m allowed between the seed/contig and short sequence reads.  Higher m values lead to more accurate contigs at the cost of decreased contiguity.  
+iii) Set the minimum base ratio -r to higher values
+</pre>
 
 
 2. Building scaffolds with SSAKE
@@ -384,6 +411,7 @@ For each read pairs, putative contig pairs (pre-scaffolding stage) are tallied b
 Please note that this stage accepts redundancy of contig pairs (i.e. a given contig may link to multiple contigs, and the number of links (spanning pairs) between any given contig pair is recorded, along with a mean putative gap or overlap(-)).
 Once pairing between contigs is complete, the scaffolds are built using contigs (-z or larger) as seeds.  Every contig is used in turn until all have been incorporated into a scaffold.
 
+<pre>
 Consider the following contig pairs (AB, AC and rAD):
 
     A         B
@@ -403,6 +431,7 @@ Consider the following contig pairs (AB, AC and rAD):
       ->   <-
      ->   <-
        ->   <-
+</pre>
 
 Two parameters control scaffolding (-k and -a).  The -k option specifies the minimum number of links (read pairs) a valid contig pair MUST have to be considered.  The -a option specifies the maximum ratio between the best two contig pairs for a given seed/contig being extended.  For example, contig A shares 4 links with B and 2 links with C, in this orientation.  contig rA (reverse) also shares 3 links with D.   When it's time to extend contig A (with the options -k and -a set to 2 and 0.7, respectively), both contig pairs AB and AC are considered.  Since C (second-best) has 2 links and B (best) has 4 (2/4) = 0.5 below the maximum ratio of 0.7, A will be linked with B in the scaffold and C will be kept for another extension. If AC had 3 links the resulting ratio (0.75), above the user-defined maximum 0.7 would have caused the extension to terminate at A, with both B and C considered for a different scaffold.  A maximum links ratio of 1 (not recommended) means that the best two candidate contig pairs have the same number of links -- SSAKE will accept the first one since both have a valid gap/overlap.  When a scaffold extension is terminated on one side, the scaffold is extended on the "left", by looking for contig pairs that involve the reverse of the seed (in this example, rAD).  With AB and AC having 4 and 2 links, respectively and rAD being the only pair on the left, the final scaffolds outputted by SSAKE would be:
 
@@ -417,7 +446,7 @@ Accurate scaffolding depends on many factors.  Number and nature of repeats in y
 
 If the -s option is set and points to a valid fasta file, the DNA sequences comprised in that file will populate the hash table and be used exclusively as seeds to nucleate contig extensions (they will not be utilized to build the prefix tree).  In that scheme, every unique seed will be used in turn to nucleate an extension, using short reads found in the tree (specified in -f).  This feature might be useful if you already have characterized sequences & want to increase their length using short reads.  That said, since the short reads are not used as seeds when -s is set, they will not co-assemble with one another WITHOUT a seed sequence file - unless you run SSAKE in targeted de novo assembly mode (see below).  Also, to speed up the assembly, no imbedded reads (i.e. those aligning to the seed in their entirety) are considered.  Only reads that contribute to extending a seed sequence are noted.
 
-When -s is set, the .contigs file lists all extended seeds, even if it's by a single base.  The .singlets will ONLY list seeds that could not be extended.  Unassembled microreads will NOT be outputted. 
+When -s is set, the _contigs.fa file lists all extended seeds, even if it's by a single base.  The .singlets will ONLY list seeds that could not be extended.  Unassembled microreads will NOT be outputted. 
 
 Support for sequence target-independent de novo assemblies:
 
@@ -434,29 +463,51 @@ If more than one seed is supplied in the -s file and you're providing paired-end
 
 
 
-Input sequences
+### Input sequences
 ---------------
 
 UNPAIRED:
 
-DNA sequences can be in lower caps as well
-
->PX1CG_29
+DNA sequences can be in lower caps as well (NO UNDERSCORE _ CHARACTER ALLOWED)
+<pre>
+>PX1CG29
 TTAACACTTTCGGATATTTCTGATG
->PX1CG_35
+>PX1CG35
 CTTTCGGATATTTCTGATGAGTCGA
->PX1CG_64
+>PX1CG64
 TTATCTTGATAAAGCAGGAATTACT
 ...
+</pre>
 
 PAIRED:
-
+<pre>
 >2-1-464-197:200
 TGGCTCACCCCTGTAATCCCAGCACT:CTCCCAGGTTCAAGCGATTCTCCTGC
 >2-1-783-425:300
 GTCTGAGGGTCCCAGGAACCAG:TGCCCCAGAGGTGGGAGCAGGGGA
 >2-1-662-655:1000
 TGAATCCCCACCAGGCGCCTTCGG:CACTTTATTATTAATGTACAAAAT
+...
+</pre>
+
+PAIRED, LINKED READ:
+<pre>
+>2-1-464-197_ACGATGCATGCAGTAG:200
+TGGCTCACCCCTGTAATCCCAGCACT:CTCCCAGGTTCAAGCGATTCTCCTGC
+>2-1-783-425_ACGATGCATGCAGTAG:300
+GTCTGAGGGTCCCAGGAACCAG:TGCCCCAGAGGTGGGAGCAGGGGA
+>2-1-662-655_ATGCATGCATGCTAGC:1000
+TGAATCCCCACCAGGCGCCTTCGG:CACTTTATTATTAATGTACAAAAT
+...
+</pre>
+
+-Note: For linked read, a barcode sequence of any length will do. In fact, any [A-Za-z0-9] characters after the underscore, before : or /\ will be used as barcode sequence to filter the reads. Note that when linked reads are not used, but regular reads are used instead, the original ssake behaviour will take precedence. Note that under NO circumstances should your read contain underscore characters (_) if your reads are NOT linked.
+
+Example input from the C. elegans test data available with the v4.0 distribution:
+<pre>
+>gi|453232919|ref|NCI003284.9|I11747444I11747218I1I0I0I0I0:0:0I0:0:0I98920/1_AAACCTGAGCTTTCAG/:350
+CACATACGAGGGCGTTATTTGAAAAATTTAAAAATCAACATGTTCAAGCGTGCGAAGTGTCAAAATAAAAAAGAAAAAAAAAACGAAAAAAAAAACAGAAAAGGCTGATAAGAGGACGCGTCAAGTTA:ACTGCTCATTTGTCAATCAGCAAGGTACATGAAAACACAGAGCAGGAACCAAAATGCACACAATAAAACTCCCCGTACCCATTGTGTGGTACGCAGTACAAAATGACTGACAATAAGAAAGGGAGAGAGGGATTGAGGCGCCGAATACTTG
+</pre>
 
 -Paired sequences must be concatenated together in one fasta-like entry, separated by ":".  For example, TGGCTCACCCCTGTAATCCCAGCACT:CTCCCAGGTTCAAGCGATTCTCCTGC consists of two paired reads.  Changes to the input was made to allow reads of variable length (e.g. quality-trimmed reads) to be considered by SSAKE.  As of v3-6, the header line [>] must have [:insert_size] at the very end (see above example)
 
@@ -472,31 +523,36 @@ General points:
 -Spaces in fasta file are NOT permitted and will either not be considered or result in execution failure
 
 
-Output files
+### Output files
 ------------
 
-.contigs   :: fasta file; All sequence contigs
-.log       :: text file; Logs execution time / errors / pairing stats (if -p is set to 1)
-.short     :: text file; Lists sequence reads shorter than a set, acceptable, minimum
-.singlets  :: fasta file; All unassembled sequence reads
+Output file (-p 0 and -p 1) | Description
+---|---
+_contigs.fa   | fasta file; All sequence contigs
+_scaffolds.fa |fasta file; All sequence scaffolds
+.log       | text file; Logs execution time / errors / pairing stats (if -p is set to 1)
+_short.txt     | text file; Lists sequence reads shorter than a set, acceptable, minimum
+_singlets.fa  | fasta file; All unassembled sequence reads
 
--p 1
-.pairing_distribution.csv :: comma-separated file; 1st column is the calculated distance for each pair (template) with reads that assembled logically within the same contig.  2nd column is the number of pairs at that distance
-.pairing_issues           :: text file; Lists all pairing issues encountered between contig pairs and illogical/out-of-bounds pairing
-.scaffolds                :: comma-separated file; see below
-.mergedcontigs            :: fasta file; All merged/unmerged contigs >= -z bases within scaffolds are listed.  The overlap sequence between contigs (>= -x bases) will be shown in lower case within the merged contig.  Note that *perfect* sequence overlap has to occur between 2 predicted adjacent contigs of a scaffold in order to merge.  It is possible that two contigs merge even though they are NOT predicted to do so (perhaps because insert size range supplied is off or mate pairs are misassembled).  When two consecutive contigs do not physically overlap and the -n option is set to 1, then gaps will be padded with Ns of length corresponding to the predicted gap size m (refer to Understanding the .scaffolds csv file below) and predicted but undetected overlaps with a single (n).
+Output file (-p 1) | Description
+---|---
+_pairing-distribution.csv | comma-separated file; 1st column is the calculated distance for each pair (template) with reads that assembled logically within the same contig.  2nd column is the number of pairs at that distance
+_pairing-issues.txt           | text file; Lists all pairing issues encountered between contig pairs and illogical/out-of-bounds pairing
+.scaffolds                | comma-separated file; see below
 
--c 1  (WARNING: ASSOCIATED FILES CAN BECOME VERY LARGE!)
-.readposition             :: this is a text file listing all whole (fully embedded) reads, start and end coordinate onto the contig (in this order).  For reads aligning on the minus strand, end coordinate is < start coordinate
-.coverage.csv             :: this is a comma-separated values file showing the base coverage at every position for any given contig   >  -z
+Output file (-c 1*) | Description
+---|---
+_readposition.txt             | this is a text file listing all whole (fully embedded) reads, start and end coordinate onto the contig (in this order).  For reads aligning on the minus strand, end coordinate is < start coordinate
+_coverage.csv             | this is a comma-separated values file showing the base coverage at every position for any given contig   >  -z
+*WARNING: ASSOCIATED FILES CAN BECOME VERY LARGE!
 
 
-Understanding the .contigs fasta header
+#### Understanding the _contigs.fa fasta header
 ---------------------------------------
-
+<pre>
 e.g.
 >contig27|size52|read193|cov92.79
-
+</pre>
 contig id# = 27
 size (G) = 52 nt
 number of reads (N) = 193
@@ -507,11 +563,12 @@ the coverage (C) is calculated using the total number (T) of consensus bases [su
 C = T / G
 
 
-Understanding the .scaffolds csv file
+#### Understanding the .scaffolds layout csv file
 -------------------------------------
-
+<pre>
+e.g.
 scaffold1,7484,f127Z7068k12a0.58m42_f3090z62k7a0.14m76_f1473z354
-
+</pre>
 column 1: a unique scaffold identifier
 column 2: the sum of all contig sizes that made it to the scaffold/supercontig
 column 3: a contig chain representing the layout:
@@ -525,19 +582,19 @@ Negative m values imply that there's a possible overlap between the contigs.  Bu
 Use makeFastaFileFromScaffolds.pl included in this distribution to make a scaffold fasta file (ordered and oriented contig sequences) using the layout recipe (contig chain) shown above.
 
 
-Understanding the .coverage.csv file
+#### Understanding the _coverage.csv file
 ------------------------------------
-
+<pre>
 e.g.
 >contig1|size60000|read74001|cov37.00
 12,12,13,13,13,14,14,15,16,16,20,21,22,23,25,26,27,28,27 ...
-
+</pre>
 Each number represents the number of reads covering that base at that position.
 
 
-Understanding the .readposition file
+#### Understanding the _readposition.txt file
 ------------------------------------
-
+<pre>
 e.g.
 >contig1|size60000|read74001|cov37.00
 READ_85952,3,32
@@ -552,23 +609,23 @@ READ_9721,15,44
 READ_49141,16,45
 READ_43328,18,1
 READ_94449,18,1
-
+</pre>
 In this order: read name [template th -p 1 :: name followed with 1 or 2, corresponds to the order in the sequence input (1:2)], start coordinate, end coordinate.  end < start indicates read is on minus strand
 
 
-SSAKE does not
+### SSAKE does not
 --------------
 
--Take into consideration base quality scores.  It is up to the user to process the sequence data before assembling with SSAKE.
-	Python scripts (TQS.py, TQSfastq.py, TQSexport.fq) are provided to help trim poor quality bases off Illumina sequences.  
-	Refer to TQS.readme and TRIMMING_PAIRED_READS.README included in this distribution (in the ./tools subdirectory) for information on how to run those programs
--Consider sequence read having any character other than A,C,G,T and will skip these reads entirely while reading the fasta file. 
+1. Take into consideration base quality scores.  It is up to the user to process the sequence data before assembling with SSAKE.
+2. Consider sequence read having any character other than A,C,G,T and will skip these reads entirely while reading the fasta file. 
+
+Note: Python scripts (TQS.py, TQSfastq.py, TQSexport.fq) are provided to help trim poor quality bases off Illumina sequences. Refer to TQS.readme and TRIMMING_PAIRED_READS.README included in this distribution (in the ./tools subdirectory) for information on how to run those programs
 
 
-License
+### License
 -------
 
-SSAKE Copyright (c) 2006-2016 Canada's Michael Smith Genome Science Centre.  All rights reserved.
+SSAKE Copyright (c) 2006-2018 Canada's Michael Smith Genome Science Centre.  All rights reserved.
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License


=====================================
test/CelegansLinkedReadsAssembly.sh
=====================================
--- /dev/null
+++ b/test/CelegansLinkedReadsAssembly.sh
@@ -0,0 +1,21 @@
+d=`date`
+echo ----------------------------------------------------------------------------------- 
+echo Running SSAKE assembly pipeline 
+echo ------------------------------------------------------------------------------------ 
+echo Downloading trimmed/formatted data on $d ...
+echo ------------------------------------------------------------------------------------
+rm -rf celegans_paired.fa
+wget ftp://ftp.bcgsc.ca/supplementary/SSAKE/celegans_paired.fa.gz
+gunzip celegans_paired.fa.gz
+echo ------------------------------------------------------------------------------------
+echo done. Initiating SSAKE assembly
+echo ------------------------------------------------------------------------------------
+time ../SSAKE -f celegans_paired.fa -p 1 -m 20 -w 5 -b celegansLR
+echo ------------------------------------------------------------------------------------
+echo done. Computing stats from contigs
+echo ------------------------------------------------------------------------------------
+../tools/getStats.pl celegansLR_contigs.fa 500 > celegansLR_contigs_stats.txt
+echo done. Computing stats from scaffolds 
+echo ------------------------------------------------------------------------------------
+../tools/getStats.pl celegansLR_scaffolds.fa 500 > celegansLR_scaffolds_stats.txt
+echo assembly pipeline complete. Results are under celegansLR.


=====================================
test/Herpesvirus_3.60kb.reads.fa
=====================================
The diff for this file was not included because it is too large.

=====================================
test/Herpesvirus_3.60kb.seed.fa
=====================================
--- a/test/Herpesvirus_3.60kb.seed.fa
+++ b/test/Herpesvirus_3.60kb.seed.fa
@@ -1,2 +1,2 @@
->seed_test
+>seedTest
 TGACCAATACAAGAACATGTTCGATATATACTGGAATGAGTACGCCCCGGATATAGGGTTTTGTACATTTCCGGAGGAAGATGGCTGGATGTTAATACACCCAACCACGCAAAGTATGTTGTTTCGAAAAATCCTAGCCGGTGACTTTGGATATACCGATGGACAAGGCATATATAGCGCTGTACGGTCTACGGAAACTGTAATTCGCCAAGTTCAGGCAACCGTTTTGATGAACGCGTTGGATGCAACTCGGTATGAGGACCTAGCAGCAGATTGGGAACACCACATCCAACAATGTAACCTGCATGCCGGGGCTCTAGCGGAACGTTATGGGCTATGTGGAGAATCAGAAGCCGTACGGCTTGCACATCAGGTTTTTGAAACCTGGCGTCAAACATTACAGTCATCGTTACTTGAGTTTCTGCGTGGAATAACCGGTTGTCTCTATACCAGTGGTTTAAATGGAAGGGTCGGTTTTGCCAAATACGTGGACTGGATAGCCTGTGTAGGTATTGTGCCCGTTGTAAGAAAGGTACGATCAGAACAGAATGGAACCCCTGCACCATTAAATACGTATATGGGTCAAGCGGCAGAACTGTCCCAGATGTTAAAAGTTGCCGATGCAACGTTGGCCAGAGGAGCGGCGGTTGTCACAAGCCTAGTTGAGTGTATGCAAAATGTTGCTATTATGGATTATGATAGGACGCGTCTTTATTATAATTATAACCGAAGATTAATTATGGCAAAGGATGATGTAACGGGCATGAAGGGAGAGTGTTTGGTCGTGTGGCCGCCCGTTGTATGTGGGGAGGGTGTAGTATTTGACTCACCCTTACAGCGGCTTTCTGGGGAGGTGTTGGCCTGTTATGCATTACGTGAACATGCTCGCGTCTGCCAAGTTTTAAATACAGCCCCTTTGCGCGTGTTAATAGGTCGCCGGAATGAAGATGATAGATCTCACAGCACACGTGCGGTTGATCGTATAATGGGCGAGAACGATACAACACGGGCTGGATCGGCCGCGTCTAGACTTGTAAAGCTAATAGTTAACTTAAAAAACATGAGACATGTTGGAGATATTACCGAAACCGTACGTTCCTATCTAGAAGAAACGGGCAATCACATTCTGGAAGGAAGTGGATCGGTGGACACATCACAACCGGGGTTTGGCAAGGCCAACCAATCCTTTAACGGGGGGGCAATGTCCGGAACAACAAACGTTCAAAGTGCGTTTAAAACTTCGGTGGTTAACAGTATCAACGGCATGCTCGAGGGTTATGTGAAT


=====================================
test/HiSeqFusobacteriumAssembly.sh
=====================================
--- a/test/HiSeqFusobacteriumAssembly.sh
+++ b/test/HiSeqFusobacteriumAssembly.sh
@@ -1,6 +1,6 @@
 d=`date`
 echo ----------------------------------------------------------------------------------- 
-echo Running SSAKE assembly pipeline on bacterial sequence data. It will need ca.4GB RAM
+echo Running SSAKE assembly pipeline on bacterial sequence data.
 echo ------------------------------------------------------------------------------------ 
 echo Downloading trimmed/formatted data for Fusobacterium nucleatum CC53 on $d ...
 echo ------------------------------------------------------------------------------------
@@ -9,17 +9,12 @@ wget ftp://ftp.bcgsc.ca/supplementary/SSAKE/CC53_2million.fa
 echo ------------------------------------------------------------------------------------
 echo done. Initiating SSAKE assembly ETA 10-20min depending on system...
 echo ------------------------------------------------------------------------------------
-time ../SSAKE -f CC53_2million.fa -p 1 -m 20 -w 5 -b fusoCC53-1
+time ../SSAKE -f CC53_2million.fa -p 1 -m 20 -w 5 -b fusoCC53
 echo ------------------------------------------------------------------------------------
-echo done. Converting scaffold .csv into fasta file...
+echo done. Computing stats from contigs
 echo ------------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl fusoCC53-1.scaffolds
+../tools/getStats.pl fusoCC53_contigs.fa 500 > fusoCC53_contigs_stats.txt
+echo done. Computing stats from scaffolds 
 echo ------------------------------------------------------------------------------------
-echo done. Computing stats from fusoCC53-1.contigs 
-echo ------------------------------------------------------------------------------------
-../tools/getStats.pl fusoCC53-1.contigs 500 > fusoCC53-1.contigs.stats
-echo ------------------------------------------------------------------------------------
-echo done. Computing stats from fusoCC53-1.scaffolds.fa
-echo ------------------------------------------------------------------------------------
-../tools/getStats.pl fusoCC53-1.scaffolds.fa 500 > fusoCC53-1.scaffolds.stats
-echo assembly pipeline complete. Results are under fusoCC53-1.
+../tools/getStats.pl fusoCC53_scaffolds.fa 500 > fusoCC53_scaffolds_stats.txt
+echo assembly pipeline complete. Results are under fusoCC53.


=====================================
test/MiSeqCampylobacterAssembly.sh
=====================================
--- a/test/MiSeqCampylobacterAssembly.sh
+++ b/test/MiSeqCampylobacterAssembly.sh
@@ -1,6 +1,6 @@
 d=`date`
 echo ----------------------------------------------------------------------------------- 
-echo Running SSAKE assembly pipeline on bacterial sequence data. It will need ca.4GB RAM
+echo Running SSAKE assembly pipeline on bacterial sequence data.
 echo ------------------------------------------------------------------------------------ 
 echo Downloading trimmed/formatted data for Campylobacter showae CC57C on $d ...
 echo ------------------------------------------------------------------------------------
@@ -12,15 +12,11 @@ echo done. Initiating SSAKE assembly ETA 10-20min depending on system...
 echo ------------------------------------------------------------------------------------
 time ../SSAKE -f CC57C_paired.fa -p 1 -g CC57C_unpaired.fa -m 20 -w 5 -b CC57C
 echo ------------------------------------------------------------------------------------
-echo done. Converting scaffold .csv into fasta file...
+echo done. Computing stats from contigs 
 echo ------------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl CC57C.scaffolds
+../tools/getStats.pl CC57C_contigs.fa 500 > CC57C_contigs_stats.txt
 echo ------------------------------------------------------------------------------------
-echo done. Computing stats from CC57C.contigs 
+echo done. Computing stats from scaffolds
 echo ------------------------------------------------------------------------------------
-../tools/getStats.pl CC57C.contigs 500 > CC57C.contigs.stats
-echo ------------------------------------------------------------------------------------
-echo done. Computing stats from CC57C.scaffolds.fa
-echo ------------------------------------------------------------------------------------
-../tools/getStats.pl CC57C.scaffolds.fa 500 > CC57C.scaffolds.stats
+../tools/getStats.pl CC57C_scaffolds.fa 500 > CC57C_scaffolds_stats.txt
 echo assembly pipeline complete. Results are under CC57C.


=====================================
test/MiSeqCampylobacterAssemblyPIPELINE.sh
=====================================
--- a/test/MiSeqCampylobacterAssemblyPIPELINE.sh
+++ b/test/MiSeqCampylobacterAssemblyPIPELINE.sh
@@ -1,6 +1,6 @@
 d=`date`
 echo ----------------------------------------------------------------------------------- 
-echo Running SSAKE assembly pipeline on bacterial sequence data. It will need ca.4GB RAM
+echo Running SSAKE assembly pipeline on bacterial sequence data.
 echo ----------------------------------------------------------------------------------- 
 echo Downloading MiSeq data for Campylobacter showae CC57C on $d ...
 echo -----------------------------------------------------------------------------------
@@ -22,15 +22,10 @@ echo done. Initiating SSAKE assembly ETA 10-20min depending on system...
 echo -----------------------------------------------------------------------------------
 time ../SSAKE -f paired.fa -p 1 -g unpaired.fa -m 20 -w 5 -b Cshowae
 echo -----------------------------------------------------------------------------------
-echo done. Converting scaffold .csv into fasta file...
+echo done. Computing stats from contigs 
 echo -----------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl Cshowae.scaffolds
+../tools/getStats.pl Cshowae_contigs.fa 500 > Cshowae_contigs_stats.fa
+echo done. Computing stats from scaffolds
 echo -----------------------------------------------------------------------------------
-echo done. Computing stats from Cshowae.contigs 
-echo -----------------------------------------------------------------------------------
-../tools/getStats.pl Cshowae.contigs 500 > Cshowae.contigs.stats
-echo -----------------------------------------------------------------------------------
-echo done. Computing stats from Cshowae.scaffolds.fa
-echo -----------------------------------------------------------------------------------
-../tools/getStats.pl Cshowae.scaffolds.fa 500 > Cshowae.scaffolds.stats
+../tools/getStats.pl Cshowae_scaffolds.fa 500 > Cshowae_scaffolds_stats.txt
 echo assembly pipeline complete. Results are under Cshowae.


=====================================
test/MiSeqEbolaAssemblyPIPELINE.sh
=====================================
--- a/test/MiSeqEbolaAssemblyPIPELINE.sh
+++ b/test/MiSeqEbolaAssemblyPIPELINE.sh
@@ -19,19 +19,14 @@ echo done. Formatting fasta input for SSAKE...
 echo -----------------------------------------------------------------------------------
 ../tools/makePairedOutput2UNEQUALfiles.pl SRR2019530_1.fastqc70q20e33.trimFIX.fa SRR2019530_2.fastqc70q20e33.trimFIX.fa 600
 echo -----------------------------------------------------------------------------------
-echo done. Initiating SSAKE assembly ETA 10-20min depending on system...
+echo done. Initiating SSAKE assembly.. 
 echo -----------------------------------------------------------------------------------
 time ../SSAKE -f paired.fa -p 1 -g unpaired.fa -m 20 -w 5 -b ebola
 echo -----------------------------------------------------------------------------------
-echo done. Converting scaffold .csv into fasta file...
+echo done. Computing stats from ebola_contigs.fa 
 echo -----------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl ebola.scaffolds
+../tools/getStats.pl ebola_contigs.fa 500 > ebola_contigs_stats.txt
+echo done. Computing stats from ebola_scaffolds.fa
 echo -----------------------------------------------------------------------------------
-echo done. Computing stats from ebola.contigs 
-echo -----------------------------------------------------------------------------------
-../tools/getStats.pl ebola.contigs 500 > ebola.contigs.stats
-echo -----------------------------------------------------------------------------------
-echo done. Computing stats from ebola.scaffolds.fa
-echo -----------------------------------------------------------------------------------
-../tools/getStats.pl ebola.scaffolds.fa 500 > ebola.scaffolds.stats
+../tools/getStats.pl ebola_scaffolds.fa 500 > ebola_scaffolds_stats.txt
 echo assembly pipeline complete. Results are under ebola.


=====================================
test/MiSeqEcoliAssembly250XPE300.sh
=====================================
--- a/test/MiSeqEcoliAssembly250XPE300.sh
+++ b/test/MiSeqEcoliAssembly250XPE300.sh
@@ -2,7 +2,6 @@ d=`date`
 echo ----------------------------------------------------------------------------------- 
 echo Running SSAKE assembly pipeline on bacterial sequence data 250X E.coli
 echo ----------------------------------------------------------------------------------- 
-echo -----------------------------------------------------------------------------------
 rm -rf ecoli_miseqTrimmed*fa
 wget ftp://ftp.bcgsc.ca/supplementary/SSAKE/ecoli_miseqTrimmed_paired.fa
 wget ftp://ftp.bcgsc.ca/supplementary/SSAKE/ecoli_miseqTrimmed_unpaired.fa
@@ -11,16 +10,12 @@ echo done. Initiating SSAKE assembly ETA 20-30min depending on system...
 echo -----------------------------------------------------------------------------------
 /usr/bin/time -v -o coliMiSeq300m80.time ../SSAKE -f ecoli_miseqTrimmed_paired.fa -g ecoli_miseqTrimmed_unpaired.fa -p 1 -m 80 -w 100 -b coliMiSeq300m80
 echo -----------------------------------------------------------------------------------
-echo done. Converting scaffold .csv into fasta file...
-echo -----------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl coliMiSeq300m80.scaffolds
-echo -----------------------------------------------------------------------------------
-echo done. Computing stats from Ecoli.contigs 
+echo done. Computing stats from contigs 
 echo -----------------------------------------------------------------------------------
-../tools/getStats.pl coliMiSeq300m80.contigs > coliMiSeq300m80.contigs.stats
+../tools/getStats.pl coliMiSeq300m80_contigs.fa > coliMiSeq300m80_contigs_stats.txt
 echo -----------------------------------------------------------------------------------
-echo done. Computing stats from Ecoli.scaffolds.fa
+echo done. Computing stats from scaffolds
 echo -----------------------------------------------------------------------------------
-../tools/getStats.pl coliMiSeq300m80.scaffolds.fa 200 > coliMiSeq300m80.scaffolds.200stats
-../tools/getStats.pl coliMiSeq300m80.scaffolds.fa 500 > coliMiSeq300m80.scaffolds.500stats
+../tools/getStats.pl coliMiSeq300m80_scaffolds.fa 200 > coliMiSeq300m80_scaffolds_200bpstats.txt
+../tools/getStats.pl coliMiSeq300m80_scaffolds.fa 500 > coliMiSeq300m80_scaffolds_500bpstats.txt
 echo assembly pipeline complete. Results are under coliMiSeq300m80.


=====================================
test/README.txt
=====================================
--- a/test/README.txt
+++ b/test/README.txt
@@ -1,11 +1,11 @@
-January 2016
+January 2018
 ------------
 TEST DATA / SSAKE ASSEMBLIES
 
 
 A) Testing the distribution with very short reads:
 
-../SSAKE -f Herpesvirus_3.60kb.reads.fa -m 16 -o 2 -w 5 -c 1 
+../SSAKE -f Herpesvirus_3.60kb.reads.fa -m 16 -o 2 -w 5 -c 1 -b HStestInstall 
 
 
 B) Testing the targeted assembly using a seed/target sequence:
@@ -51,7 +51,7 @@ coliMiSeq300m80.scaffolds.stats1
 
 ./HiSeqFusobacteriumAssembly.sh
 
-(just the assembly)
+(read download, assembly)
 
 
 5) De Novo Targeted assembly of a TMPRSS2:ERG fusion using a prostate adenocarcinoma RNA-seq dataset
@@ -59,3 +59,10 @@ coliMiSeq300m80.scaffolds.stats1
 ./runSSAKEtargeted.sh
 
 (read download,trimming,formatting,assembly)
+
+
+6) C. elegans linked-read assembly
+
+./CelegansLinkedReadsAssembly.sh
+
+(read download, assembly)


=====================================
test/assembly_metrics_tests.txt
=====================================
--- /dev/null
+++ b/test/assembly_metrics_tests.txt
@@ -0,0 +1,24 @@
+SSAKE v4.0
+January 2018
+
+Contigs:
+n	n:500	L50	min	N80	N50	N20	E-size	max	sum	time (h:mm:ss)	RAM (GB)	name
+206	109	21	562	18061	32736	64004	40613	107028	2222124	0:06:58	3.5	CC57C_contigs.fa
+5352	4949	613	502	19304	44626	90857	58011	305563	95.57e6	4:05:34	79.1	celegansLR_contigs.fa
+101	97	12	687	80150	127399	210819	146845	318067	4606049	0:21:50	9.3	coliMiSeq300m80_contigs.fa
+206	109	21	562	18061	32736	64004	40613	107028	2222124	0:07:22	3.5	Cshowae_contigs.fa
+122	18	1	501	688	18917	18917	12139	18917	30101	0:01:22	0.4	ebola_contigs.fa
+697	560	131	501	2540	4641	8019	5631	19764	1937271	0:10:43	6.0	fusoCC53_contigs.fa
+7	1	1	60000	60000	60000	60000	60000	60000	60000	0:00:22	0.2	HStestInstall_contigs.fa
+
+Scaffolds:
+n	n:500	L50	min	N80	N50	N20	E-size	max	sum	name
+135	42	6	585	47914	128053	233500	159643	382374	2223249	CC57C_scaffolds.fa
+4637	4309	491	502	22274	53866	112496	74063	357928	95.59e6	celegansLR_scaffolds.fa
+86	82	5	687	91172	204889	1193524	437907	1193524	4606049	coliMiSeq300m80_scaffolds.fa
+135	42	6	585	47914	128053	233500	159643	382374	2223249	Cshowae_scaffolds.fa
+122	18	1	501	688	18917	18917	12139	18917	30101	ebola_scaffolds.fa
+638	524	120	501	2756	5067	9273	6141	19764	1943677	fusoCC53_scaffolds.fa
+
+stats generated with abyss-fac
+benchmark: Intel(R) Xeon(R) CPU E7-8867 v3 @ 2.50GHz 128CPU 2TB RAM CentOS7/ 1 thread per assembly


=====================================
test/runSSAKE.sh
=====================================
--- /dev/null
+++ b/test/runSSAKE.sh
@@ -0,0 +1 @@
+../tools/runSSAKE.sh
\ No newline at end of file


=====================================
test/runSSAKE.sh → tools/runSSAKE.sh
=====================================
--- a/test/runSSAKE.sh
+++ b/tools/runSSAKE.sh
@@ -33,19 +33,14 @@ echo ---------------------------------------------------------------------------
 /usr/bin/time -v -o $4.time ../SSAKE -f paired.fa -p 1 -g unpaired.fa -m 20 -w 5 -b $4
 d=`date`
 echo -----------------------------------------------------------------------------------
-echo $d : Converting scaffold .csv into fasta file...
-echo -----------------------------------------------------------------------------------
-../tools/makeFastaFileFromScaffolds.pl $4.scaffolds
-d=`date`
-echo -----------------------------------------------------------------------------------
 echo $d : Computing stats 
 echo -----------------------------------------------------------------------------------
-../tools/getStats.pl $4.contigs > $4.contigs.stats
+../tools/getStats.pl $4_contigs.fa > $4_contigs_stats.txt
 d=`date`
 echo -----------------------------------------------------------------------------------
-echo $d : Computing stats from $4.scaffolds.fa
+echo $d : Computing stats from $4_scaffolds.fa
 echo -----------------------------------------------------------------------------------
-../tools/getStats.pl $4.scaffolds.fa > $4.scaffolds.stats
+../tools/getStats.pl $4_scaffolds.fa > $4_scaffolds.stats.txt
 d=`date`
 echo -----------------------------------------------------------------------------------
 echo $d : assembly pipeline complete. Results are under $4.



View it on GitLab: https://salsa.debian.org/med-team/ssake/compare/11dc84ae26a86ac7188229d6d3f29d7cfba616f5...7494ba05d9bb3e02801deceebc4fa2b65adc138a

---
View it on GitLab: https://salsa.debian.org/med-team/ssake/compare/11dc84ae26a86ac7188229d6d3f29d7cfba616f5...7494ba05d9bb3e02801deceebc4fa2b65adc138a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alioth.debian.org/pipermail/debian-med-commit/attachments/20180218/1789bdf1/attachment-0001.html>