[med-svn] [abyss] 01/07: New upstream version 2.0.1

Andreas Tille tille at debian.org
Thu Oct 6 04:57:19 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository abyss.

commit 95e61ae091136a9f5c0e9f513aa330e54098308d
Author: Andreas Tille <tille at debian.org>
Date:   Thu Oct 6 06:40:38 2016 +0200

    New upstream version 2.0.1
---
 .gitignore                                         |   12 +-
 Assembly/Options.cc                                |    8 +-
 Bloom/Bloom.h                                      |   10 +-
 Bloom/BloomFilter.h                                |   21 +-
 Bloom/BloomFilterWindow.h                          |   21 +-
 Bloom/CascadingBloomFilter.h                       |    9 +-
 Bloom/CascadingBloomFilterWindow.h                 |   14 +-
 Bloom/ConcurrentBloomFilter.h                      |   10 +-
 Bloom/bloom.cc                                     |  470 ++++++-
 BloomDBG/HashAgnosticCascadingBloom.h              |  145 +++
 BloomDBG/LightweightKmer.h                         |   96 ++
 BloomDBG/Makefile.am                               |   23 +
 BloomDBG/MaskedKmer.h                              |  121 ++
 BloomDBG/RollingBloomDBG.h                         |  486 ++++++++
 BloomDBG/RollingHash.h                             |  289 +++++
 BloomDBG/RollingHashIterator.h                     |  234 ++++
 BloomDBG/SpacedSeed.h                              |   79 ++
 BloomDBG/bloom-dbg.cc                              |  345 ++++++
 BloomDBG/bloom-dbg.h                               | 1276 ++++++++++++++++++++
 COPYRIGHT                                          |   84 +-
 ChangeLog                                          |   57 +
 Common/Kmer.h                                      |    8 +-
 Common/Sequence.h                                  |    9 +-
 DataBase/Makefile.am                               |    3 +
 DataBase/db-csv.cc                                 |    3 +-
 DataLayer/fac.cc                                   |   27 +-
 Dockerfile                                         |   20 +
 Graph/BreadthFirstSearch.h                         |    3 +
 Graph/ExtendPath.h                                 |  482 ++++++--
 Graph/Path.h                                       |   13 +
 .../Konnector/integration-tests.mk                 |   24 +-
 Konnector/DBGBloomAlgorithms.h                     |   11 +-
 Konnector/README.md                                |  176 +++
 Konnector/konnector.cc                             |  690 +++++++----
 Konnector/konnector.h                              |  239 +++-
 LICENSE                                            |   22 +-
 LogKmerCount/CountingBloomFilter.h                 |    4 +-
 LogKmerCount/plc.h                                 |    6 +-
 Makefile.am                                        |   15 +-
 ParseAligns/abyss-fixmate.cc                       |    1 -
 README.css                                         |   39 -
 README.md                                          |  156 +--
 Scaffold/drawgraph.cc                              |    2 +-
 Scaffold/scaffold.cc                               |   21 +-
 Sealer/Makefile.am                                 |    2 +-
 Sealer/README.md                                   |   16 +-
 Sealer/sealer.cc                                   |   92 +-
 SimpleGraph/SimpleGraph.cpp                        |    2 +-
 Unittest/BloomDBG/BloomDBGTest.cpp                 |  155 +++
 .../BloomDBG/HashAgnosticCascadingBloomTest.cpp    |   46 +
 Unittest/BloomDBG/MaskedKmerTest.cpp               |   26 +
 Unittest/BloomDBG/RollingBloomDBGTest.cpp          |  275 +++++
 Unittest/BloomDBG/RollingHashIteratorTest.cpp      |  116 ++
 Unittest/BloomDBG/RollingHashTest.cpp              |  195 +++
 Unittest/BloomDBG/SpacedSeedTest.cpp               |   26 +
 Unittest/Graph/ExtendPathTest.cpp                  |   98 +-
 Unittest/Makefile.am                               |  191 +--
 bin/abyss-adjtodot.pl                              |    2 +-
 bin/abyss-cstont                                   |    2 +-
 bin/abyss-dida                                     |    2 +-
 bin/abyss-fac.pl                                   |    2 +-
 bin/abyss-fatoagp                                  |   23 +-
 bin/abyss-joindist                                 |    2 +-
 bin/abyss-pe                                       |  123 +-
 bin/abyss-samtoafg                                 |    2 +-
 configure.ac                                       |   29 +-
 doc/ABYSS.1                                        |    2 +-
 doc/abyss-pe.1                                     |   19 +-
 doc/abyss-tofastq.1                                |    2 +-
 doc/flowchart.graffle                              |    2 +-
 lib/bloomfilter/BloomFilter.hpp                    |  446 +++++++
 lib/bloomfilter/Makefile.am                        |    1 +
 lib/bloomfilter/README.md                          |    4 +
 lib/rolling-hash/Makefile.am                       |    1 +
 lib/rolling-hash/README.md                         |    2 +
 lib/rolling-hash/rolling.h                         |  316 +++++
 76 files changed, 7156 insertions(+), 850 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5253d26..dd0a13a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,23 @@
-*/Makefile.in
+Makefile
 Makefile.in
+README.html
 aclocal.m4
 autom4te.cache
+config.guess
+config.h
 config.h.in
+config.log
+config.status
+config.sub
 configure
 depcomp
 install-sh
+lib*.a
 missing
+stamp-h1
 test-driver
 _*
+*.o
 *.swp
 *.swo
 *.swn
@@ -16,3 +25,4 @@ _*
 tags
 compile
 *.orig
+.deps
diff --git a/Assembly/Options.cc b/Assembly/Options.cc
index 379dd1f..8ae5d6f 100644
--- a/Assembly/Options.cc
+++ b/Assembly/Options.cc
@@ -51,16 +51,16 @@ static const char USAGE_MESSAGE[] =
 "  -k, --kmer=N          the length of a k-mer (when -K is not set)\n"
 "                        or the span of a k-mer pair (when -K is set)\n"
 "  -K, --single-kmer=N   the length of a single k-mer in a k-mer pair\n"
-"  -t, --trim-length=N   maximum length of dangling edges to trim\n"
+"  -t, --trim-length=N   maximum length of blunt contigs to trim [k]\n"
 "  -c, --coverage=FLOAT  remove contigs with mean k-mer coverage\n"
 "                        less than this threshold\n"
 "  -b, --bubbles=N       pop bubbles shorter than N bp [3*k]\n"
 "  -b0, --no-bubbles     do not pop bubbles\n"
-"  -e, --erode=N         erode bases at the ends of blunt contigs\n"
-"                        with coverage less than this threshold\n"
+"  -e, --erode=N         erode bases at the ends of blunt contigs with coverage\n"
+"                        less than this threshold [round(sqrt(median))]\n"
 "  -E, --erode-strand=N  erode bases at the ends of blunt contigs\n"
 "                        with coverage less than this threshold on\n"
-"                        either strand\n"
+"                        either strand [1 if sqrt(median) > 2 else 0]\n"
 "  --coverage-hist=FILE  write the k-mer coverage histogram to FILE\n"
 "  -m, --mask-cov        do not include kmers containing masked bases in\n"
 "                        coverage calculations [experimental]\n"
diff --git a/Bloom/Bloom.h b/Bloom/Bloom.h
index 3b1c6fe..cd90dfb 100644
--- a/Bloom/Bloom.h
+++ b/Bloom/Bloom.h
@@ -40,12 +40,13 @@ namespace Bloom {
 		size_t fullBloomSize;
 		size_t startBitPos;
 		size_t endBitPos;
+		size_t hashSeed;
 	};
 
 	/** Print a progress message after loading this many seqs */
 	static const unsigned LOAD_PROGRESS_STEP = 100000;
 	/** file format version number */
-	static const unsigned BLOOM_VERSION = 2;
+	static const unsigned BLOOM_VERSION = 4;
 
 	/** Return the hash value of this object. */
 	inline static size_t hash(const key_type& key)
@@ -55,7 +56,7 @@ namespace Bloom {
 
 		key_type copy(key);
 		copy.reverseComplement();
-		return hashmem(&copy, sizeof copy);
+		return hashmem(&copy, sizeof copy, 0);
 	}
 
 	/** Return the hash value of this object given seed. */
@@ -137,13 +138,12 @@ namespace Bloom {
 		(void)writeHeader;
 
 		out << BLOOM_VERSION << '\n';
-		assert(out);
 		out << Kmer::length() << '\n';
-		assert(out);
 		out << header.fullBloomSize
 			<< '\t' << header.startBitPos
 			<< '\t' << header.endBitPos
 			<< '\n';
+		out << header.hashSeed << '\n';
 		assert(out);
 	}
 
@@ -180,6 +180,8 @@ namespace Bloom {
 		   >> expect("\t") >> header.endBitPos
 		   >> expect("\n");
 
+		in >> header.hashSeed >> expect("\n");
+
 		assert(in);
 		assert(header.startBitPos < header.fullBloomSize);
 		assert(header.endBitPos < header.fullBloomSize);
diff --git a/Bloom/BloomFilter.h b/Bloom/BloomFilter.h
index 620f045..91b1aa4 100644
--- a/Bloom/BloomFilter.h
+++ b/Bloom/BloomFilter.h
@@ -20,10 +20,11 @@ class BloomFilter
   public:
 
 	/** Constructor. */
-	BloomFilter() : m_size(0), m_array(NULL) { }
+	BloomFilter() : m_size(0), m_hashSeed(0), m_array(NULL) { }
 
 	/** Constructor. */
-	BloomFilter(size_t n) : m_size(n)
+	BloomFilter(size_t n, size_t hashSeed=0) : m_size(n),
+		m_hashSeed(hashSeed)
 	{
 		m_array = new char[(n + 7)/8]();
 	}
@@ -70,7 +71,7 @@ class BloomFilter
 	/** Return whether the object is present in this set. */
 	bool operator[](const Bloom::key_type& key) const
 	{
-		return (*this)[Bloom::hash(key) % m_size];
+		return (*this)[Bloom::hash(key, m_hashSeed) % m_size];
 	}
 
 	/** Add the object with the specified index to this set. */
@@ -83,7 +84,7 @@ class BloomFilter
 	/** Add the object to this set. */
 	void insert(const Bloom::key_type& key)
 	{
-		insert(Bloom::hash(key) % m_size);
+		insert(Bloom::hash(key, m_hashSeed) % m_size);
 	}
 
 	/** Operator for reading a bloom filter from a stream. */
@@ -106,6 +107,16 @@ class BloomFilter
 		Bloom::FileHeader header = Bloom::readHeader(in);
 		assert(in);
 
+		if (m_hashSeed != header.hashSeed) {
+			if (readOp == BITWISE_OVERWRITE) {
+				m_hashSeed = header.hashSeed;
+			} else {
+				std::cerr << "error: can't union/intersect bloom filters with "
+					<< "different hash seeds\n";
+				exit(EXIT_FAILURE);
+			}
+		}
+
 		if (m_size != header.fullBloomSize) {
 			if (readOp == BITWISE_OVERWRITE) {
 				resize(header.fullBloomSize);
@@ -128,6 +139,7 @@ class BloomFilter
 		header.fullBloomSize = m_size;
 		header.startBitPos = 0;
 		header.endBitPos = m_size - 1;
+		header.hashSeed = m_hashSeed;
 
 		Bloom::writeHeader(out, header);
 		assert(out);
@@ -149,6 +161,7 @@ class BloomFilter
   protected:
 
 	size_t m_size;
+	size_t m_hashSeed;
 	char* m_array;
 };
 
diff --git a/Bloom/BloomFilterWindow.h b/Bloom/BloomFilterWindow.h
index c16f8dd..521d2a3 100644
--- a/Bloom/BloomFilterWindow.h
+++ b/Bloom/BloomFilterWindow.h
@@ -27,8 +27,9 @@ public:
 	 * @param startBitPos index of first bit in the window
 	 * @param endBitPos index of last bit in the window
 	 */
-	BloomFilterWindow(size_t fullBloomSize, size_t startBitPos, size_t endBitPos) :
-		BloomFilter(endBitPos - startBitPos + 1),
+	BloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
+			size_t endBitPos, size_t hashSeed=0) :
+		BloomFilter(endBitPos - startBitPos + 1, hashSeed),
 		m_fullBloomSize(fullBloomSize),
 		m_startBitPos(startBitPos),
 		m_endBitPos(endBitPos)
@@ -88,7 +89,7 @@ public:
 	/** Return whether the object is present in this set. */
 	bool operator[](const Bloom::key_type& key) const
 	{
-		return (*this)[Bloom::hash(key) % m_fullBloomSize];
+		return (*this)[Bloom::hash(key, m_hashSeed) % m_fullBloomSize];
 	}
 
 	/** Add the object with the specified index to this set. */
@@ -101,7 +102,7 @@ public:
 	/** Add the object to this set. */
 	void insert(const Bloom::key_type& key)
 	{
-		insert(Bloom::hash(key) % m_fullBloomSize);
+		insert(Bloom::hash(key, m_hashSeed) % m_fullBloomSize);
 	}
 
 	/** Operator for reading a bloom filter from a stream. */
@@ -128,6 +129,16 @@ public:
 		m_startBitPos = header.startBitPos;
 		m_endBitPos = header.endBitPos;
 
+		if (m_hashSeed != header.hashSeed) {
+			if (readOp == BITWISE_OVERWRITE) {
+				m_hashSeed = header.hashSeed;
+			} else {
+				std::cerr << "error: can't union/intersect bloom filters with "
+					<< "different hash seed values\n";
+				exit(EXIT_FAILURE);
+			}
+		}
+
 		size_t bits = header.endBitPos - header.startBitPos + 1;
 
 		if (m_size != bits) {
@@ -152,6 +163,8 @@ public:
 		header.fullBloomSize = m_fullBloomSize;
 		header.startBitPos = m_startBitPos;
 		header.endBitPos = m_endBitPos;
+		header.hashSeed = m_hashSeed;
+
 		Bloom::writeHeader(out, header);
 		assert(out);
 
diff --git a/Bloom/CascadingBloomFilter.h b/Bloom/CascadingBloomFilter.h
index 1cf3f93..30b9844 100644
--- a/Bloom/CascadingBloomFilter.h
+++ b/Bloom/CascadingBloomFilter.h
@@ -18,11 +18,11 @@ class CascadingBloomFilter
 	CascadingBloomFilter() {}
 
 	/** Constructor */
-	CascadingBloomFilter(size_t n, size_t max_count)
+	CascadingBloomFilter(size_t n, size_t max_count, size_t hashSeed=0) : m_hashSeed(hashSeed)
 	{
 		m_data.reserve(max_count);
 		for (unsigned i = 0; i < max_count; i++)
-			m_data.push_back(new BloomFilter(n));
+			m_data.push_back(new BloomFilter(n, hashSeed));
 	}
 
 	/** Destructor */
@@ -68,7 +68,7 @@ class CascadingBloomFilter
 	bool operator[](const Bloom::key_type& key) const
 	{
 		assert(m_data.back() != NULL);
-		return (*m_data.back())[Bloom::hash(key) % m_data.back()->size()];
+		return (*m_data.back())[Bloom::hash(key, m_hashSeed) % m_data.back()->size()];
 	}
 
 	/** Add the object with the specified index to this multiset. */
@@ -87,7 +87,7 @@ class CascadingBloomFilter
 	void insert(const Bloom::key_type& key)
 	{
 		assert(m_data.back() != NULL);
-		insert(Bloom::hash(key) % m_data.back()->size());
+		insert(Bloom::hash(key, m_hashSeed) % m_data.back()->size());
 	}
 
 	/** Get the Bloom filter for a given level */
@@ -111,6 +111,7 @@ class CascadingBloomFilter
 	}
 
   private:
+	size_t m_hashSeed;
 	std::vector<BloomFilter*> m_data;
 
 };
diff --git a/Bloom/CascadingBloomFilterWindow.h b/Bloom/CascadingBloomFilterWindow.h
index 8aa4106..fa33910 100644
--- a/Bloom/CascadingBloomFilterWindow.h
+++ b/Bloom/CascadingBloomFilterWindow.h
@@ -17,13 +17,14 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
 	 * @param endBitPos index of last bit in the window
 	 * @param max_count the maximum count value of the Bloom filter
 	 */
-	CascadingBloomFilterWindow(size_t fullBloomSize, size_t startBitPos, size_t endBitPos,
-			unsigned max_count)
-		: m_fullBloomSize(fullBloomSize)
+	CascadingBloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
+			size_t endBitPos, unsigned max_count, size_t hashSeed=0)
+		: m_fullBloomSize(fullBloomSize), m_hashSeed(hashSeed)
 	{
 		m_data.reserve(max_count);
-		for (unsigned i = 0; i < max_count; ++i)
-			m_data.push_back(new BloomFilterWindow(fullBloomSize, startBitPos, endBitPos));
+		for (unsigned i = 0; i < max_count; i++)
+			m_data.push_back(new BloomFilterWindow(fullBloomSize,
+						startBitPos, endBitPos, hashSeed));
 	}
 
 	/** Return the size of the bit array. */
@@ -62,7 +63,7 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
 	void insert(const Bloom::key_type& key)
 	{
 		assert(m_data.back() != NULL);
-		insert(Bloom::hash(key) % m_fullBloomSize);
+		insert(Bloom::hash(key, m_hashSeed) % m_fullBloomSize);
 	}
 
 	void write(std::ostream& out) const
@@ -87,6 +88,7 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
 
   private:
 	size_t m_fullBloomSize;
+	size_t m_hashSeed;
 	std::vector<BloomFilterWindow*> m_data;
 };
 
diff --git a/Bloom/ConcurrentBloomFilter.h b/Bloom/ConcurrentBloomFilter.h
index 3dcc68f..18e45a3 100644
--- a/Bloom/ConcurrentBloomFilter.h
+++ b/Bloom/ConcurrentBloomFilter.h
@@ -20,8 +20,9 @@ class ConcurrentBloomFilter
 public:
 
 	/** Constructor */
-	ConcurrentBloomFilter(BloomFilterType& bloom, size_t numLocks) :
-		m_bloom(bloom), m_locks(numLocks)
+	ConcurrentBloomFilter(BloomFilterType& bloom, size_t numLocks,
+		size_t hashSeed=0) : m_bloom(bloom), m_locks(numLocks),
+		m_hashSeed(hashSeed)
 	{
 		m_windowSize = bloom.size() / numLocks;
 		// round down to the nearest byte boundary,
@@ -54,7 +55,7 @@ public:
 	/** Return whether the object is present in this set. */
 	bool operator[](const Bloom::key_type& key) const
 	{
-		return *this[Bloom::hash(key) % m_bloom.size()];
+		return *this[Bloom::hash(key, m_hashSeed) % m_bloom.size()];
 	}
 
 	/** Add the object with the specified index to this set. */
@@ -69,7 +70,7 @@ public:
 	/** Add the object to this set. */
 	void insert(const Bloom::key_type& key)
 	{
-		insert(Bloom::hash(key) % m_bloom.size());
+		insert(Bloom::hash(key, m_hashSeed) % m_bloom.size());
 	}
 
 private:
@@ -90,6 +91,7 @@ private:
 
 	BloomFilterType& m_bloom;
 	std::vector<omp_lock_t> m_locks;
+	size_t m_hashSeed;
 	size_t m_windowSize;
 };
 
diff --git a/Bloom/bloom.cc b/Bloom/bloom.cc
index 31f7eaa..d8efc2d 100644
--- a/Bloom/bloom.cc
+++ b/Bloom/bloom.cc
@@ -6,7 +6,12 @@
 #include "Common/Options.h"
 #include "Common/Kmer.h"
 #include "Common/BitUtil.h"
+#include "Common/KmerIterator.h"
+#include "Graph/Path.h"
+#include "Graph/ExtendPath.h"
+#include "Konnector/DBGBloom.h"
 #include "DataLayer/Options.h"
+#include "DataLayer/FastaReader.h"
 #include "Common/StringUtil.h"
 #include "Bloom/Bloom.h"
 #include "Bloom/BloomFilter.h"
@@ -19,6 +24,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include <cmath>
 
 #if _OPENMP
 # include <omp.h>
@@ -41,6 +47,9 @@ static const char USAGE_MESSAGE[] =
 "Usage 2: " PROGRAM " union [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
 "Usage 3: " PROGRAM " intersect [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
 "Usage 4: " PROGRAM " info [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
+"Usage 5: " PROGRAM " compare [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE_1> <BLOOM_FILE_2>\n"
+"Usage 6: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
+"Usage 7: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
 "Build and manipulate bloom filter files.\n"
 "\n"
 " Global options:\n"
@@ -55,6 +64,7 @@ static const char USAGE_MESSAGE[] =
 "  -b, --bloom-size=N         size of bloom filter [500M]\n"
 "  -B, --buffer-size=N        size of I/O buffer for each thread, in bytes [100000]\n"
 "  -j, --threads=N            use N parallel threads [1]\n"
+"  -h, --hash-seed=N          seed for hash function [0]\n"
 "  -l, --levels=N             build a cascading bloom filter with N levels\n"
 "                             and output the last level\n"
 "  -L, --init-level='N=FILE'  initialize level N of cascading bloom filter\n"
@@ -76,8 +86,23 @@ static const char USAGE_MESSAGE[] =
 " Options for `" PROGRAM " union': (none)\n"
 " Options for `" PROGRAM " intersect': (none)\n"
 " Options for `" PROGRAM " info': (none)\n"
+" Options for `" PROGRAM " compare':\n"
 "\n"
-"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
+"  -m, --method=`String'      choose distance calculation method \n"
+"                             [`jaccard'(default), `forbes', `czekanowski']\n"
+"\n"
+" Options for `" PROGRAM " kmers':\n"
+"\n"
+"  -r, --inverse              get k-mers that are *NOT* in the bloom filter\n"
+"  --bed                      output k-mers in BED format\n"
+"  --fasta                    output k-mers in FASTA format [default]\n"
+"  --raw                      output k-mers in raw format (one per line)\n"
+"\n"
+" Options for `" PROGRAM " trim': (none)\n"
+"\n"
+"Report bugs to <" PACKAGE_BUGREPORT ">.\n";;
+
+enum OutputFormat { BED, FASTA, RAW };
 
 namespace opt {
 
@@ -90,6 +115,9 @@ namespace opt {
 	/** The number of parallel threads. */
 	unsigned threads = 1;
 
+	/** Seed for Bloom filter hash function. */
+	size_t hashSeed = 0;
+
 	/** The size of a k-mer. */
 	unsigned k;
 
@@ -115,31 +143,49 @@ namespace opt {
 	/** Number of windows in complete bloom filter.
 	  ("N" for -w option) */
 	unsigned windows = 0;
+
+	/* Method for similarity or distance calculation.
+	 -m option
+	 */
+	string method("jaccard");
+
+	/* Inverse option to retrieve kmers which are not
+	 in the filter
+	 */
+	bool inverse = false;
+
+	OutputFormat format = FASTA;
 }
 
-static const char shortopts[] = "b:B:j:k:l:L:n:q:vw:";
+static const char shortopts[] = "b:B:h:j:k:l:L:m:n:q:rvw:";
 
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_BED, OPT_FASTA, OPT_RAW };
 
 static const struct option longopts[] = {
-	{ "bloom-size",       required_argument, NULL, 'b' },
-	{ "buffer-size",      required_argument, NULL, 'B' },
-	{ "threads",          required_argument, NULL, 'j' },
-	{ "kmer",             required_argument, NULL, 'k' },
-	{ "levels",           required_argument, NULL, 'l' },
-	{ "init-level",       required_argument, NULL, 'L' },
-	{ "chastity",         no_argument, &opt::chastityFilter, 1 },
-	{ "no-chastity",      no_argument, &opt::chastityFilter, 0 },
-	{ "trim-masked",      no_argument, &opt::trimMasked, 1 },
-	{ "no-trim-masked",   no_argument, &opt::trimMasked, 0 },
-	{ "num-locks",        required_argument, NULL, 'n' },
-	{ "trim-quality",     required_argument, NULL, 'q' },
+	{ "bloom-size", required_argument, NULL, 'b' },
+	{ "buffer-size", required_argument, NULL, 'B' },
+	{ "hash-seed",        required_argument, NULL, 'h' },
+	{ "threads", required_argument, NULL, 'j' },
+	{ "kmer", required_argument, NULL, 'k' },
+	{ "levels", required_argument, NULL, 'l' },
+	{ "init-level", required_argument, NULL, 'L' },
+	{ "chastity", no_argument, &opt::chastityFilter, 1 },
+	{ "no-chastity", no_argument, &opt::chastityFilter, 0 },
+	{ "trim-masked", no_argument, &opt::trimMasked, 1 },
+	{ "no-trim-masked", no_argument, &opt::trimMasked, 0 },
+	{ "num-locks", required_argument, NULL, 'n' },
+	{ "trim-quality", required_argument, NULL, 'q' },
 	{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
 	{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
-	{ "verbose",          no_argument, NULL, 'v' },
-	{ "help",             no_argument, NULL, OPT_HELP },
-	{ "version",          no_argument, NULL, OPT_VERSION },
-	{ "window",           required_argument, NULL, 'w' },
+	{ "verbose", no_argument, NULL, 'v' },
+	{ "help", no_argument, NULL, OPT_HELP },
+	{ "version", no_argument, NULL, OPT_VERSION },
+	{ "window", required_argument, NULL, 'w' },
+	{ "method", required_argument, NULL, 'm' },
+	{ "inverse", required_argument, NULL, 'r' },
+	{ "bed", no_argument, NULL, OPT_BED },
+	{ "fasta", no_argument, NULL, OPT_FASTA },
+	{ "raw", no_argument, NULL, OPT_RAW },
 	{ NULL, 0, NULL, 0 }
 };
 
@@ -318,6 +364,8 @@ int build(int argc, char** argv)
 			opt::bloomSize = SIToBytes(arg); break;
 		  case 'B':
 			arg >> opt::bufferSize; break;
+		  case 'h':
+			arg >> opt::hashSeed; break;
 		  case 'j':
 			arg >> opt::threads; break;
 		  case 'l':
@@ -404,10 +452,10 @@ int build(int argc, char** argv)
 	if (opt::windows == 0) {
 
 		if (opt::levels == 1) {
-			BloomFilter bloom(bits);
+			BloomFilter bloom(bits, opt::hashSeed);
 #ifdef _OPENMP
 			ConcurrentBloomFilter<BloomFilter>
-				cbf(bloom, opt::numLocks);
+				cbf(bloom, opt::numLocks, opt::hashSeed);
 			loadFilters(cbf, argc, argv);
 #else
 			loadFilters(bloom, argc, argv);
@@ -416,11 +464,11 @@ int build(int argc, char** argv)
 			writeBloom(bloom, outputPath);
 		}
 		else {
-			CascadingBloomFilter cascadingBloom(bits, opt::levels);
+			CascadingBloomFilter cascadingBloom(bits, opt::levels, opt::hashSeed);
 			initBloomFilterLevels(cascadingBloom);
 #ifdef _OPENMP
 			ConcurrentBloomFilter<CascadingBloomFilter>
-				cbf(cascadingBloom, opt::numLocks);
+				cbf(cascadingBloom, opt::numLocks, opt::hashSeed);
 			loadFilters(cbf, argc, argv);
 #else
 			loadFilters(cascadingBloom, argc, argv);
@@ -441,14 +489,16 @@ int build(int argc, char** argv)
 			endBitPos = bits - 1;
 
 		if (opt::levels == 1) {
-			BloomFilterWindow bloom(bits, startBitPos, endBitPos);
+			BloomFilterWindow bloom(bits, startBitPos,
+					endBitPos, opt::hashSeed);
 			loadFilters(bloom, argc, argv);
 			printBloomStats(cerr, bloom);
 			writeBloom(bloom, outputPath);
 		}
 		else {
 			CascadingBloomFilterWindow cascadingBloom(
-				bits, startBitPos, endBitPos, opt::levels);
+				bits, startBitPos, endBitPos, opt::levels,
+				opt::hashSeed);
 			initBloomFilterLevels(cascadingBloom);
 			loadFilters(cascadingBloom, argc, argv);
 			printCascadingBloomStats(cerr, cascadingBloom);
@@ -544,6 +594,365 @@ int info(int argc, char** argv)
 	return 0;
 }
 
+int compare(int argc, char ** argv){
+	parseGlobalOpts(argc, argv);
+	// Arg parser to get `m' option in case set
+	for (int c; (c = getopt_long(argc, argv,
+								 shortopts, longopts, NULL)) != -1;) {
+		istringstream arg(optarg != NULL ? optarg : "");
+		switch (c) {
+			case '?':
+				cerr << PROGRAM ": unrecognized option: `-" << optopt
+					<< "'" << endl;
+				dieWithUsageError();
+			case 'm':
+				arg >> opt::method; break;
+				break;
+		}
+		if (optarg != NULL && (!arg.eof() || arg.fail())) {
+			cerr << PROGRAM ": invalid option: `-"
+			<< (char)c << optarg << "'\n";
+			exit(EXIT_FAILURE);
+		}
+		if (opt::method != "jaccard" && opt::method != "czekanowski" && opt::method != "forbes")
+			std::cerr << "Invalid method: " << opt::method << std::endl;
+	}
+
+
+	// Set method strin
+	string method(opt::method);
+	if (opt::verbose)
+	std::cerr << "Computing distance for 2"
+			  << " samples...\n";
+	// Get both paths and open istreams
+	BloomFilter bloomA;
+	string pathA(argv[optind]);
+	BloomFilter bloomB;
+	string pathB(argv[optind+1]);
+	if (opt::verbose)
+	  std::cerr << "Loading bloom filters from "
+		<< pathA << " and " << pathB << "...\n";
+	istream* inA = openInputStream(pathA);
+	istream* inB = openInputStream(pathB);
+	// Assert state of streams
+	assert_good(*inA, pathA);
+	assert_good(*inB, pathB);
+	// Not sure this conversion is needed, check docs
+	std::istream & tA = *inA;
+	std::istream & tB = *inB;
+	// Need to read header for bit start and end info
+	Bloom::FileHeader headerA = Bloom::readHeader(tA);
+	Bloom::FileHeader headerB = Bloom::readHeader(tB);
+	// Need to assert after every read operation
+	assert(tA);
+	assert(tB);
+
+	const size_t IO_BUFFER_SIZE = 32 * 1024;
+	unsigned char mask = 1;
+	// The number of total bits in the vector
+	size_t bitsA = headerA.endBitPos - headerA.startBitPos + 1;
+	size_t bitsB = headerB.endBitPos - headerB.startBitPos + 1;
+	// They need to be the same size to be comparable
+	if(bitsA != bitsB ) {
+		std::cerr << "Bit sizes of arrays not equal" << std::endl;
+		exit(EXIT_FAILURE);
+	}
+	if (opt::verbose)
+	std::cerr << "Bits: " << bitsA << std::endl;
+	/* As in Choi et al. (2010),
+	 a - cases where both bits are set (1/1)
+	 b - cases where bits are set in the first but nor the second (1/0)
+	 c - cases where bits are set in the second but not the first (0/1)
+	 d - cases where bits are not set in either (0/0)
+	 */
+	unsigned long a = 0;
+	unsigned long b = 0;
+	unsigned long c = 0;
+	unsigned long d = 0;
+	// Iteratively compare bits
+	for(size_t i = 0; i < bitsA;){
+		char bufferA[IO_BUFFER_SIZE];
+		char bufferB[IO_BUFFER_SIZE];
+		// The number of bits in the buffer is its size * 8 except for the last iteration
+		size_t bitsRead = std::min(IO_BUFFER_SIZE * 8, bitsA - i);
+		size_t bytesRead = (bitsRead + 7)/8;
+		// Read bytes from the the istream and immediately assert
+		tA.read(bufferA, bytesRead);
+		tB.read(bufferB, bytesRead);
+		assert(tA);
+		assert(tB);
+		// For each byte in the buffer, compare bits
+		for(size_t j = 0; j < IO_BUFFER_SIZE; j++){
+			// Compare bit-wise
+			for(int bit = 0; bit < 8; bit++){
+				bool f = (bufferA[j] & (mask << bit)) != 0;
+				bool s = (bufferB[j] & (mask << bit)) != 0;
+				if( f == 1 && s == 1 ) {
+					a++;
+				} else if( f == 1 && s == 0) {
+					b++;
+				} else if( f == 0 && s == 1) {
+					c++;
+				} else d++;
+			}
+		}
+		i += bitsRead;
+	}
+	assert(tA);
+	assert(tB);
+	// Result output:
+	std::cout << "1/1: " << a << "\n1/0: " << b << "\n0/1: " << c << "\n0/0: " << d << std::endl;
+	if(method == "jaccard"){
+		float Dist = (float)a/(float)(a+b+c);
+		std::cout << "Jaccard similarity: " << Dist << std::endl;
+	}
+	if(method == "czekanowski"){
+		float Dist = (2*(float)a)/(float)((2*a)+b+c);
+		std::cout << "Czekanowski similarity: " << Dist << std::endl;
+	}
+	if(method == "forbes"){
+		float n = (float)(a + b + c + d);
+		float Dist = (n*a - ((a+b)*(a+c))) / (n*std::min(a+b,a+c) - ((a+b) * (a+c)));
+		std::cout << "Forbes similarity: " << Dist << std::endl;
+	}
+	// Check and clean up
+	assert_good(tA, pathA);
+	assert_good(tA, pathB);
+	closeInputStream(inA, pathA);
+	closeInputStream(inB, pathB);
+
+  return 1;
+}
+
+int memberOf(int argc, char ** argv){
+	// Initalise bloom and get globals
+	BloomFilter bloom;
+	parseGlobalOpts(argc, argv);
+	// Arg parser to get `m' option in case set
+	for (int c; (c = getopt_long(argc, argv,
+								 shortopts, longopts, NULL)) != -1;) {
+		istringstream arg(optarg != NULL ? optarg : "");
+		switch (c) {
+			case '?':
+				cerr << PROGRAM ": unrecognized option: `-" << optopt
+					<< "'" << endl;
+				dieWithUsageError();
+			case 'r':
+				opt::inverse = true; break;
+				break;
+			case OPT_BED:
+				opt::format = BED;
+				break;
+			case OPT_FASTA:
+				opt::format = FASTA;
+				break;
+			case OPT_RAW:
+				opt::format = RAW;
+				break;
+		}
+		if (optarg != NULL && (!arg.eof() || arg.fail())) {
+			cerr << PROGRAM ": invalid option: `-"
+			<< (char)c << optarg << "'\n";
+			exit(EXIT_FAILURE);
+		}
+	}
+	string path = argv[optind];
+	string fasta = argv[++optind];
+	unsigned k = opt::k;
+	if (opt::verbose)
+		std::cerr << "Loading bloom filter from `"
+		<< path << "'...\n";
+
+	istream* in = openInputStream(path);
+	assert_good(*in, path);
+	*in >> bloom;
+
+	assert(!fasta.empty());
+	if (opt::verbose)
+		std::cerr << "Reading `" << fasta << "'...\n";
+	FastaReader _in(fasta.c_str(), FastaReader::FOLD_CASE);
+
+	size_t seqCount=0;
+	for (FastaRecord rec; _in >> rec; ++seqCount) {
+		string& seq = rec.seq;
+		if (seq.size() < k)
+			continue;
+		for (size_t i = 0; i < seq.size() - k + 1; ++i) {
+			string kmer = seq.substr(i, k);
+			size_t pos = kmer.find_last_not_of("ACGTacgt");
+			if (pos != string::npos) {
+				i += pos;
+				continue;
+			}
+			if (bloom[Kmer(kmer)] || opt::inverse) {
+				if (opt::format == FASTA) {
+					cout << ">" << rec.id << ":seq:" << seqCount
+						<< ":kmer:" << i << "\n";
+				} else if (opt::format == BED) {
+					cout << rec.id
+						<< "\t" << i
+						<< "\t" << i + k - 1
+						<< "\t";
+				}
+				cout << kmer << "\n";
+			}
+		}
+		if (opt::verbose && seqCount % 1000 == 0)
+			cerr << "processed " << seqCount << " sequences" << endl;
+	}
+	assert(_in.eof());
+	if (opt::verbose)
+		cerr << "processed " << seqCount << " sequences" << endl;
+
+	return 0;
+}
+
+/**
+ * Calculate number of bases to trim from left end of sequence.
+ */
+int calcLeftTrim(const Sequence& seq, unsigned k, const BloomFilter& bloom,
+	size_t minBranchLen)
+{
+	// Boost graph interface for Bloom filter
+	DBGBloom<BloomFilter> g(bloom);
+
+	// if this is the first k-mer we have found in
+	// Bloom filter, starting from the left end
+	// of the sequence
+	bool firstKmerMatch = true;
+
+	KmerIterator it(seq, k);
+	for (; it != KmerIterator::end(); ++it) {
+
+		const Kmer& kmer = *it;
+
+		// assume k-mers not present in Bloom filter are
+		// due to sequencing errors and should be trimmed
+		if (!bloom[kmer])
+			continue;
+
+		// in degree, disregarding false branches
+		unsigned inDegree = trueBranches(kmer, REVERSE, g,
+				minBranchLen).size();
+		// out degree, disregarding false branches
+		unsigned outDegree = trueBranches(kmer, FORWARD, g,
+				minBranchLen).size();
+
+		if (firstKmerMatch) {
+			bool leftTip = (inDegree == 0 && outDegree == 1);
+			bool rightTip = (inDegree == 1 && outDegree == 0);
+			if (!leftTip && !rightTip)
+				break;
+		} else if (inDegree != 1 || outDegree != 1) {
+			// end of linear path
+			break;
+		}
+
+		firstKmerMatch = false;
+
+	} // for each k-mer (left to right)
+
+	if (it.pos() == 0)
+		return 0;
+
+	return k + it.pos() - 1;
+}
+
+/**
+ * Trim reads that corresponds to tips in the Bloom filter
+ * de Bruijn graph.
+ */
+int trim(int argc, char** argv)
+{
+	// parse command line opts
+	parseGlobalOpts(argc, argv);
+	unsigned k = opt::k;
+
+	// arg 1: Bloom filter
+	// args 2-n: FASTA/FASTQ files
+	if (argc - optind < 2) {
+		cerr << PROGRAM ": missing arguments\n";
+		dieWithUsageError();
+	}
+
+	// load Bloom filter de Bruijn graph
+	string bloomPath(argv[optind++]);
+	if (opt::verbose)
+		cerr << "Loading bloom filter from `"
+			<< bloomPath << "'...\n";
+
+	BloomFilter bloom;
+	istream *in = openInputStream(bloomPath);
+	assert_good(*in, bloomPath);
+	bloom.read(*in);
+	assert_good(*in, bloomPath);
+
+	if (opt::verbose)
+		printBloomStats(cerr, bloom);
+
+	// Calculate min length threshold for a "true branch"
+	// (not due to Bloom filter false positives)
+	const double falseBranchProbability = 0.0001;
+	const size_t minBranchLen =
+		(size_t)ceil(log(falseBranchProbability)/log(bloom.FPR()));
+
+	if (opt::verbose >= 2)
+		cerr << "min length threshold for true branches (k-mers): "
+			<< minBranchLen << endl;
+
+	size_t readCount = 0;
+
+	// trim reads and print to STDOUT
+	for (int i = optind; i < argc; ++i) {
+
+		if (opt::verbose)
+			cerr << "Reading `" << argv[i] << "'..." << endl;
+
+		FastaReader in(argv[i], FastaReader::FOLD_CASE);
+		for (FastqRecord rec; in >> rec; ++readCount) {
+
+			Sequence& seq = rec.seq;
+			string& qual = rec.qual;
+
+			// can't trim if read length < k; just echo
+			// back to STDOUT
+			if (seq.size() < k) {
+				cout << rec;
+				continue;
+			}
+
+			// start pos for trimmed read
+			unsigned startPos = calcLeftTrim(seq, k, bloom, minBranchLen);
+			// end pos for trimmed read
+			unsigned endPos = seq.length() - 1 -
+				calcLeftTrim(reverseComplement(seq), k, bloom, minBranchLen);
+
+			// if whole read was trimmed away
+			if (endPos < startPos)
+				continue;
+
+			// output trimmed read
+			unsigned trimmedLen = endPos - startPos + 1;
+			seq = seq.substr(startPos, trimmedLen);
+			qual = qual.substr(startPos, trimmedLen);
+			cout << rec;
+
+			if (opt::verbose && (readCount+1) % 100000 == 0)
+				cerr << "Processed " << (readCount+1) << " reads"
+					<< endl;
+
+		} // for each read
+		assert(in.eof());
+
+	} // for each input FASTA/FASTQ file
+
+	if (opt::verbose)
+		cerr << "Processed " << readCount << " reads" << endl;
+
+	// success
+	return 0;
+}
+
 int main(int argc, char** argv)
 {
 	if (argc < 2)
@@ -572,6 +981,17 @@ int main(int argc, char** argv)
 	else if (command == "info") {
 		return info(argc, argv);
 	}
+	else if (command == "compare") {
+		return compare(argc, argv);
+	}
+	else if (command == "kmers" || command == "getKmers") {
+		return memberOf(argc, argv);
+	}
+	else if (command == "trim") {
+		return trim(argc, argv);
+	}
 
+	cerr << PROGRAM ": unrecognized command: `" << command
+		<< "'" << endl;
 	dieWithUsageError();
 }
diff --git a/BloomDBG/HashAgnosticCascadingBloom.h b/BloomDBG/HashAgnosticCascadingBloom.h
new file mode 100644
index 0000000..559a4c6
--- /dev/null
+++ b/BloomDBG/HashAgnosticCascadingBloom.h
@@ -0,0 +1,145 @@
+/**
+ * A cascading Bloom filter
+ * Copyright 2015 Shaun Jackman, Ben Vandervalk.
+ */
+#ifndef HASH_AGNOSTIC_CASCADING_BLOOM_H
+#define HASH_AGNOSTIC_CASCADING_BLOOM_H 1
+
+#include "lib/bloomfilter/BloomFilter.hpp"
+#include <vector>
+
+/**
+ * An implementation of a Cascading Bloom filter.
+ * A Cascading Bloom filter implements a crude
+ * counting mechanism using an array of _l_ Bloom
+ * filters; we say that such a Bloom filter has
+ * l _levels_. Each time an element is inserted, we
+ * check for its presence in each level, and then
+ * insert the element into the first Bloom filter
+ * where the element is not already present.
+ *
+ * We use the Cascading Bloom filter to filter
+ * out error k-mers from the de Bruijn graph, since
+ * these k-mers typically only occur once in the
+ * data.
+ */
+class HashAgnosticCascadingBloom
+{
+  public:
+
+	/**
+	 * Default constructor. Leaves the filter with zero levels;
+	 * size(), popcount(), contains(), and FPR() must not be called
+	 * on a default-constructed object, since they invoke
+	 * m_data.back() on an empty vector.
+	 */
+	HashAgnosticCascadingBloom() : m_k(0), m_hashes(0) {}
+
+	/**
+	 * Constructor.
+	 * @param size size of the Bloom filters (in bits)
+	 * @param hashes number of hash functions
+	 * @param levels number of levels in Cascading Bloom filter
+	 * @param k k-mer size
+	 */
+	HashAgnosticCascadingBloom(size_t size, unsigned hashes,
+		size_t levels, unsigned k) : m_k(k), m_hashes(hashes)
+	{
+		m_data.reserve(levels);
+		for (unsigned i = 0; i < levels; i++)
+			m_data.push_back(new BloomFilter(size, hashes, k));
+	}
+
+	/**
+	 * Destructor. Frees the owned BloomFilter objects.
+	 * NOTE(review): the class owns raw pointers but neither defines
+	 * nor disables copy construction/assignment, so copying an
+	 * instance would lead to a double delete -- confirm instances
+	 * are never copied.
+	 */
+	~HashAgnosticCascadingBloom()
+	{
+		typedef std::vector<BloomFilter*>::iterator Iterator;
+		for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
+			assert(*i != NULL);
+			delete *i;
+		}
+	}
+
+	/** Return k-mer size used by Bloom filter. */
+	unsigned getKmerSize() const { return m_k; }
+
+	/** Return number of hash functions used by Bloom filter */
+	unsigned getHashNum() const { return m_hashes; }
+
+	/** Return the size of the bit array (of the last level). */
+	size_t size() const
+	{
+		assert(m_data.back() != NULL);
+		return m_data.back()->getFilterSize();
+	}
+
+	/** Return the number of elements with count >= levels. */
+	size_t popcount() const
+	{
+		assert(m_data.back() != NULL);
+		return m_data.back()->getPop();
+	}
+
+	/** Return the estimated false positive rate */
+	double FPR() const
+	{
+		/* standard approximation: (occupied bits / total bits) ^ (number
+		 * of hash functions), computed on the last (deepest) level */
+		return pow((double)popcount()/size(), m_hashes);
+	}
+
+	/**
+	 * Return true if the element with the given hash values
+	 * has count >= levels.
+	 */
+	bool contains(const std::vector<size_t>& hashes) const
+	{
+		assert(m_data.back() != NULL);
+		return m_data.back()->contains(hashes);
+	}
+
+	/**
+	 * Return true if the element with the given hash values
+	 * has count >= levels.
+	 */
+	bool contains(const size_t hashes[]) const
+	{
+		assert(m_data.back() != NULL);
+		return m_data.back()->contains(hashes);
+	}
+
+	/** Add the object with the specified index to this multiset.
+	 * The element is inserted into the first level that does not
+	 * already contain it; once present in all levels, further
+	 * inserts are no-ops. */
+	void insert(const std::vector<size_t>& hashes)
+	{
+		for (unsigned i = 0; i < m_data.size(); ++i) {
+			assert(m_data.at(i) != NULL);
+			if (!(*m_data[i]).contains(hashes)) {
+				m_data[i]->insert(hashes);
+				break;
+			}
+		}
+	}
+
+	/** Add the object with the specified index to this multiset.
+	 * Same cascading semantics as the vector overload above. */
+	void insert(const size_t hashes[])
+	{
+		for (unsigned i = 0; i < m_data.size(); ++i) {
+			assert(m_data.at(i) != NULL);
+			if (!(*m_data[i]).contains(hashes)) {
+				m_data[i]->insert(hashes);
+				break;
+			}
+		}
+	}
+
+	/** Get the Bloom filter for a given level */
+	BloomFilter& getBloomFilter(unsigned level)
+	{
+		assert(m_data.at(level) != NULL);
+		return *m_data.at(level);
+	}
+
+  private:
+
+	/** k-mer length */
+	unsigned m_k;
+	/** number of hash functions */
+	unsigned m_hashes;
+	/** the array of Bloom filters (owned; freed by the destructor) */
+	std::vector<BloomFilter*> m_data;
+};
+
+#endif
diff --git a/BloomDBG/LightweightKmer.h b/BloomDBG/LightweightKmer.h
new file mode 100644
index 0000000..72c5723
--- /dev/null
+++ b/BloomDBG/LightweightKmer.h
@@ -0,0 +1,96 @@
+#ifndef LIGHTWEIGHT_KMER_H
+#define LIGHTWEIGHT_KMER_H 1
+
+#include <algorithm>
+#include <cstring>
+#include <boost/shared_array.hpp>
+
+/**
+ * Class that stores a shared pointer to a k-mer (char array).
+ *
+ * I implemented this class because I observed that storing and
+ * copying the full char array between data structures was hurting
+ * performance and using a lot of memory.
+ *
+ * Having a lightweight k-mer representation is particularly
+ * important when using it as the `vertex_descriptor` in a Boost graph.
+ *
+ * NOTE(review): this header uses Kmer::length(), extDirection/SENSE,
+ * and MaskedKmer::mask() without including the headers that declare
+ * them; it relies on the including file to provide them -- confirm
+ * this is intended.
+ */
+class LightweightKmer
+{
+private:
+
+	/** Shared pointer to k-mer data. The buffer is Kmer::length()
+	 * chars and is NOT NUL-terminated. Copies of LightweightKmer
+	 * share this buffer, so mutations through one copy are visible
+	 * through all of them; only the const char* constructor makes
+	 * a deep copy. */
+	boost::shared_array<char> m_kmer;
+
+public:
+
+	/** Default constructor (null k-mer; must not be dereferenced) */
+	LightweightKmer() {}
+
+	/** Constructor. Deep-copies the first Kmer::length() chars of
+	 * `kmer` into a freshly allocated buffer. */
+	LightweightKmer(const char* kmer) : m_kmer(new char[Kmer::length()])
+	{
+		const unsigned k = Kmer::length();
+		std::copy(kmer, kmer + k, m_kmer.get());
+	}
+
+	/** Get pointer to raw char array for k-mer */
+	char* c_str() { return (char*)m_kmer.get(); }
+
+	/** Get pointer to raw char array for k-mer (read-only) */
+	const char* c_str() const { return (const char*)m_kmer.get(); }
+
+	/** Shift the k-mer and set the value of the new incoming base. */
+	void shift(extDirection dir, char charIn = 'A')
+	{
+		const unsigned k = Kmer::length();
+		assert(k >= 2);
+		if (dir == SENSE) {
+			/* drop the first base; make room at the end */
+			memmove(m_kmer.get(), m_kmer.get() + 1, k - 1);
+		} else {
+			/* drop the last base; make room at the front */
+			memmove(m_kmer.get() + 1, m_kmer.get(), k - 1);
+		}
+		setLastBase(dir, charIn);
+	}
+
+	/** Change the first/last base of the k-mer */
+	void setLastBase(extDirection dir, char base)
+	{
+		const unsigned k = Kmer::length();
+		unsigned pos = (dir == SENSE) ? k - 1 : 0;
+		setBase(pos, base);
+	}
+
+	/** Change a base within the k-mer */
+	void setBase(unsigned pos, char base)
+	{
+		assert(pos < Kmer::length());
+		*(m_kmer.get() + pos) = base;
+	}
+
+	/** Get the base (ACGT) at a given position */
+	char getBase(unsigned pos) const
+	{
+		return *(m_kmer.get() + pos);
+	}
+
+	/** Equality operator. Positions marked '0' ("don't care") in the
+	 * global spaced seed (MaskedKmer::mask()) are ignored in the
+	 * comparison. */
+	bool operator==(const LightweightKmer& o) const
+	{
+		unsigned k = Kmer::length();
+		const std::string& spacedSeed = MaskedKmer::mask();
+
+		if (spacedSeed.empty()) {
+			/* empty spaced seed is equivalent to a string of '1's */
+			return !memcmp(m_kmer.get(), o.m_kmer.get(), k);
+		} else {
+			assert(spacedSeed.length() == k);
+			for (unsigned i = 0; i < k; ++i) {
+				if (spacedSeed.at(i) != '0' && getBase(i) != o.getBase(i))
+					return false;
+			}
+			return true;
+		}
+	}
+};
+
+#endif
diff --git a/BloomDBG/Makefile.am b/BloomDBG/Makefile.am
new file mode 100644
index 0000000..652c48e
--- /dev/null
+++ b/BloomDBG/Makefile.am
@@ -0,0 +1,23 @@
+bin_PROGRAMS = abyss-bloom-dbg
+
+abyss_bloom_dbg_CPPFLAGS = -I$(top_srcdir) \
+	-I$(top_srcdir)/Common \
+	-I$(top_srcdir)/DataLayer
+
+abyss_bloom_dbg_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+
+abyss_bloom_dbg_LDADD = \
+	$(top_builddir)/DataLayer/libdatalayer.a \
+	$(top_builddir)/Common/libcommon.a
+
+abyss_bloom_dbg_SOURCES = bloom-dbg.cc \
+	bloom-dbg.h \
+	MaskedKmer.h \
+	SpacedSeed.h \
+	HashAgnosticCascadingBloom.h \
+	LightweightKmer.h \
+	RollingBloomDBG.h \
+	RollingHash.h \
+	RollingHashIterator.h \
+	$(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
+	$(top_srcdir)/lib/rolling-hash/rolling.h
diff --git a/BloomDBG/MaskedKmer.h b/BloomDBG/MaskedKmer.h
new file mode 100644
index 0000000..4336c04
--- /dev/null
+++ b/BloomDBG/MaskedKmer.h
@@ -0,0 +1,121 @@
+#ifndef MASKED_KMER_H
+#define MASKED_KMER_H 1
+
+#include "Common/Kmer.h"
+#include "Common/Hash.h"
+#include "Common/Sequence.h"
+#include <iostream>
+#include <string>
+#include <cstdlib>
+
+/**
+ * A Kmer whose comparison and masking operations respect a global
+ * k-mer bitmask (a.k.a. spaced seed): positions marked '0' in the
+ * mask are "don't care" positions that are ignored when comparing.
+ */
+class MaskedKmer : public Kmer
+{
+public:
+
+	/** Default constructor */
+	MaskedKmer() : Kmer() {}
+
+	/**
+	 * Constructor.
+	 * @param seq k-mer sequence
+	 */
+	explicit MaskedKmer(const Sequence& seq) : Kmer(seq) {}
+
+	/** Set global k-mer mask (a.k.a. spaced seed).
+	 * Exits the program if the mask is not a valid spaced seed.
+	 * @param kmerMask string of '0'/'1' chars of length k, or empty
+	 */
+	static void setMask(const std::string& kmerMask)
+	{
+		/* setLength() must be called before setMask() */
+		assert(length() > 0);
+
+		/* set global bitmask */
+		mask() = kmerMask;
+
+		/* empty mask is equivalent to string of '1's */
+		if (kmerMask.empty())
+			return;
+
+		/* check for valid spaced seed pattern */
+		if (mask().length() != length()) {
+			std::cerr << "error: spaced seed must be exactly k bits long\n";
+			exit(EXIT_FAILURE);
+		} else if (mask().find_first_not_of("01") != std::string::npos) {
+			std::cerr << "error: spaced seed must contain only '0's or '1's\n";
+			exit(EXIT_FAILURE);
+		} else if (*mask().begin() != '1' || *mask().rbegin() != '1') {
+			std::cerr << "error: spaced seed must begin and end with '1's\n";
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/** Get global k-mer mask (process-wide mutable state shared by
+	 * all MaskedKmer/LightweightKmer/RollingHash operations) */
+	static std::string& mask()
+	{
+		static std::string s_kmerMask;
+		return s_kmerMask;
+	}
+
+	/** Compare this k-mer to another, ignoring masked-out positions */
+	int compare(const Kmer& other) const
+	{
+		if (!mask().empty()) {
+			/* mask both operands, then do an ordinary Kmer comparison */
+			Kmer kmer1(*this), kmer2(other);
+			maskKmer(kmer1);
+			maskKmer(kmer2);
+			return kmer1.compare(kmer2);
+		}
+		return Kmer::compare(other);
+	}
+
+	/** Equality operator */
+	bool operator==(const MaskedKmer& other) const
+	{
+		return compare(other) == 0;
+	}
+
+	/** Inequality operator */
+	bool operator!=(const MaskedKmer& other) const
+	{
+		return compare(other) != 0;
+	}
+
+	/** Less-than operator */
+	bool operator<(const MaskedKmer& other) const
+	{
+		return compare(other) < 0;
+	}
+
+	/** Mask out don't care positions by changing them to 'A', so
+	 * that any two k-mers differing only at masked positions
+	 * compare (and hash) as equal */
+	static void maskKmer(Kmer& kmer)
+	{
+		if (mask().empty())
+			return;
+
+		assert(mask().length() == length());
+		for(size_t i = 0; i < mask().length(); ++i) {
+			if (mask().at(i) == '0')
+				kmer.set(i, baseToCode('A'));
+		}
+	}
+};
+
+/** Return a new MaskedKmer that is the reverse complement of seq. */
+static inline MaskedKmer reverseComplement(const MaskedKmer& seq)
+{
+	MaskedKmer revComp(seq);
+	revComp.reverseComplement();
+	return revComp;
+}
+
+/** Define default hash function for use with STL containers */
+NAMESPACE_STD_HASH_BEGIN
+template <> struct hash<MaskedKmer> {
+	/** Hash the k-mer after zeroing its don't-care positions, so
+	 * k-mers that compare equal under the mask hash identically. */
+	size_t operator()(const MaskedKmer& kmer) const
+	{
+		MaskedKmer masked(kmer);
+		MaskedKmer::maskKmer(masked);
+		return masked.getHashCode();
+	}
+};
+NAMESPACE_STD_HASH_END
+
+#endif
diff --git a/BloomDBG/RollingBloomDBG.h b/BloomDBG/RollingBloomDBG.h
new file mode 100644
index 0000000..b671522
--- /dev/null
+++ b/BloomDBG/RollingBloomDBG.h
@@ -0,0 +1,486 @@
+/**
+ * de Bruijn Graph data structure using a Bloom filter
+ * Copyright 2015 Shaun Jackman, Ben Vandervalk
+ */
+
+#ifndef ROLLING_BLOOM_DBG_H
+#define ROLLING_BLOOM_DBG_H 1
+
+#include "Assembly/SeqExt.h" // for NUM_BASES
+#include "Common/Hash.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "Graph/Properties.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/LightweightKmer.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdlib> // for abort
+#include <fstream>
+#include <string>
+#include <utility> // for std::pair
+#include <vector>
+#include <iostream>
+
+#define BASE_CHARS "ACGT"
+
+using boost::graph_traits;
+
+/**
+ * Represents a vertex in the de Bruijn graph: a k-mer sequence
+ * together with its precomputed rolling hash state.
+ */
+struct RollingBloomDBGVertex
+{
+private:
+
+	/* the k-mer sequence (shared buffer -- see LightweightKmer) */
+	LightweightKmer m_kmer;
+	/* rolling hash state for m_kmer */
+	RollingHash m_rollingHash;
+
+public:
+
+	/** Default constructor (null k-mer) */
+	RollingBloomDBGVertex() {}
+
+	/** Constructor; deep-copies the k-mer chars */
+	RollingBloomDBGVertex(const char* kmer, const RollingHash rollingHash)
+		: m_kmer(kmer), m_rollingHash(rollingHash) {}
+
+	const LightweightKmer& kmer() const { return m_kmer; };
+	const RollingHash& rollingHash() const { return m_rollingHash; }
+
+	/** Return a deep copy of this vertex. The implicit copy
+	 * constructor is NOT a deep copy: it shares the k-mer buffer
+	 * through LightweightKmer's shared_array. */
+	RollingBloomDBGVertex clone() const {
+		return RollingBloomDBGVertex(m_kmer.c_str(), m_rollingHash);
+	}
+
+	/** Move to a neighbour k-mer in the given direction. The hash is
+	 * rolled BEFORE the k-mer buffer is shifted, because the roll
+	 * functions read the outgoing k-mer. */
+	void shift(extDirection dir, char charIn = 'A')
+	{
+		if (dir == SENSE) {
+			m_rollingHash.rollRight(m_kmer.c_str(), charIn);
+		} else {
+			m_rollingHash.rollLeft(charIn, m_kmer.c_str());
+		}
+		m_kmer.shift(dir, charIn);
+	}
+
+	/** Replace the last (SENSE) or first (ANTISENSE) base and update
+	 * the hash. NOTE(review): the k-mer buffer itself is only changed
+	 * if RollingHash::setBase writes the base back through the passed
+	 * pointer -- confirm against lib/rolling-hash/rolling.h;
+	 * LightweightKmer::setLastBase is not called here. */
+	void setLastBase(extDirection dir, char base)
+	{
+		const unsigned k = Kmer::length();
+		if (dir == SENSE) {
+			m_rollingHash.setBase(m_kmer.c_str(), k-1, base);
+		} else {
+			m_rollingHash.setBase(m_kmer.c_str(), 0, base);
+		}
+	}
+
+	/**
+	 * Comparison operator that takes spaced seed bitmask into account.
+	 */
+	bool operator==(const RollingBloomDBGVertex& o) const
+	{
+		/* do fast comparison first: unequal hashes => unequal k-mers */
+		if (m_rollingHash != o.m_rollingHash)
+			return false;
+
+		return m_kmer == o.m_kmer;
+	}
+
+	/**
+	 * Inequality operator that takes spaced seed bitmask into account.
+	 */
+	bool operator!=(const RollingBloomDBGVertex& o) const
+	{
+		return !(*this == o);
+	}
+};
+
+NAMESPACE_STD_HASH_BEGIN
+template <> struct hash<RollingBloomDBGVertex> {
+	/**
+	 * Hash function for graph vertex type (vertex_descriptor):
+	 * reuses the precomputed canonical rolling-hash seed.
+	 */
+	size_t operator()(const RollingBloomDBGVertex& vertex) const
+	{
+		const RollingHash& rollingHash = vertex.rollingHash();
+		return rollingHash.getHashSeed();
+	}
+};
+NAMESPACE_STD_HASH_END
+
+/**
+ * A Bloom-filter-backed de Bruijn graph. Vertices (k-mers) are
+ * implicit: a vertex "exists" iff its hash values are found in the
+ * Bloom filter. Models the Boost Graph concepts declared in
+ * graph_traits< RollingBloomDBG<BF> > below.
+ * @tparam BF the Bloom filter type
+ */
+template <typename BF>
+class RollingBloomDBG: public BF {
+  public:
+	/** The bundled vertex properties. */
+	typedef no_property vertex_bundled;
+	typedef no_property vertex_property_type;
+
+	/** The bundled edge properties. */
+	typedef no_property edge_bundled;
+	typedef no_property edge_property_type;
+
+	/** The bloom filter (public so that free functions such as
+	 * vertex_exists() can query it) */
+	const BF& m_bloom;
+
+	RollingBloomDBG(const BF& bloom) : m_bloom(bloom) {}
+
+  private:
+	/** Copy constructor (declared private to prevent copying). */
+	RollingBloomDBG(const RollingBloomDBG<BF>&);
+
+}; // class RollingBloomDBG
+
+// Graph
+
+namespace boost {
+
+/** Graph traits */
+template <typename BF>
+struct graph_traits< RollingBloomDBG<BF> > {
+	// Graph
+
+	/**
+	 * Identifier for accessing a vertex in the graph:
+	 * the k-mer itself together with its rolling hash state.
+	 */
+	typedef RollingBloomDBGVertex vertex_descriptor;
+	typedef boost::directed_tag directed_category;
+	struct traversal_category
+		: boost::adjacency_graph_tag,
+		boost::bidirectional_graph_tag,
+		boost::vertex_list_graph_tag
+		{ };
+	typedef boost::disallow_parallel_edge_tag edge_parallel_category;
+
+	// IncidenceGraph
+	typedef std::pair<vertex_descriptor, vertex_descriptor>
+		edge_descriptor;
+	typedef unsigned degree_size_type;
+
+	// VertexListGraph
+	typedef size_t vertices_size_type;
+	typedef void vertex_iterator;
+
+	// EdgeListGraph
+	typedef size_t edges_size_type;
+	typedef void edge_iterator;
+
+// AdjacencyGraph
+
+/** Iterate through the adjacent vertices of a vertex.
+ * Successor candidates are enumerated by substituting each of
+ * A,C,G,T as the incoming base; candidates absent from the Bloom
+ * filter are skipped. Note: operator== compares only the base index
+ * m_i, so iterators are only meaningfully compared against the end
+ * iterator (m_i == NUM_BASES) of the same traversal. */
+struct adjacency_iterator
+	: public std::iterator<std::input_iterator_tag, vertex_descriptor>
+{
+	/** Skip to the next edge that is present. */
+	void next()
+	{
+		for (; m_i < NUM_BASES; ++m_i) {
+			m_v.setLastBase(SENSE, BASE_CHARS[m_i]);
+			if (vertex_exists(m_v, *m_g))
+				break;
+		}
+	}
+
+  public:
+
+	/** Default constructor (singular iterator) */
+	adjacency_iterator() { }
+
+	/** End-iterator constructor */
+	adjacency_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+	/** Begin-iterator constructor: position at the first neighbour of u */
+	adjacency_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+		: m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+	{
+		m_v.shift(SENSE);
+		next();
+	}
+
+	const vertex_descriptor& operator*() const
+	{
+		assert(m_i < NUM_BASES);
+		return m_v;
+	}
+
+	bool operator==(const adjacency_iterator& it) const
+	{
+		return m_i == it.m_i;
+	}
+
+	bool operator!=(const adjacency_iterator& it) const
+	{
+		return !(*this == it);
+	}
+
+	adjacency_iterator& operator++()
+	{
+		assert(m_i < NUM_BASES);
+		++m_i;
+		next();
+		return *this;
+	}
+
+	adjacency_iterator operator++(int)
+	{
+		adjacency_iterator it = *this;
+		++*this;
+		return it;
+	}
+
+  private:
+	/* the graph being traversed */
+	const RollingBloomDBG<BF>* m_g;
+	/* the vertex whose neighbours are enumerated */
+	vertex_descriptor m_u;
+	/* current candidate neighbour */
+	vertex_descriptor m_v;
+	/* index into BASE_CHARS; NUM_BASES == end.
+	 * NOTE(review): `short unsigned` here vs `unsigned` in the other
+	 * two iterators -- harmless but inconsistent. */
+	short unsigned m_i;
+}; // adjacency_iterator
+
+/** IncidenceGraph */
+struct out_edge_iterator
+	: public std::iterator<std::input_iterator_tag, edge_descriptor>
+{
+	/** Skip to the next edge that is present. */
+	void next()
+	{
+		for (; m_i < NUM_BASES; ++m_i) {
+			m_v.setLastBase(SENSE, BASE_CHARS[m_i]);
+			if (vertex_exists(m_v, *m_g))
+				break;
+		}
+	}
+
+  public:
+	/** Default constructor (singular iterator) */
+	out_edge_iterator() { }
+
+	/** End-iterator constructor */
+	out_edge_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+	/** Begin-iterator constructor: position at the first out-edge of u */
+	out_edge_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+		: m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+	{
+		m_v.shift(SENSE);
+		next();
+	}
+
+	edge_descriptor operator*() const
+	{
+		assert(m_i < NUM_BASES);
+		return edge_descriptor(m_u, m_v.clone());
+	}
+
+	bool operator==(const out_edge_iterator& it) const
+	{
+		return m_i == it.m_i;
+	}
+
+	bool operator!=(const out_edge_iterator& it) const
+	{
+		return !(*this == it);
+	}
+
+	out_edge_iterator& operator++()
+	{
+		assert(m_i < NUM_BASES);
+		++m_i;
+		next();
+		return *this;
+	}
+
+	out_edge_iterator operator++(int)
+	{
+		out_edge_iterator it = *this;
+		++*this;
+		return it;
+	}
+
+  private:
+	const RollingBloomDBG<BF>* m_g;
+	vertex_descriptor m_u;
+	vertex_descriptor m_v;
+	unsigned m_i;
+}; // out_edge_iterator
+
+/** BidirectionalGraph */
+struct in_edge_iterator
+	: public std::iterator<std::input_iterator_tag, edge_descriptor>
+{
+	/** Skip to the next edge that is present. */
+	void next()
+	{
+		for (; m_i < NUM_BASES; ++m_i) {
+			m_v.setLastBase(ANTISENSE, BASE_CHARS[m_i]);
+			if (vertex_exists(m_v, *m_g))
+				break;
+		}
+	}
+
+  public:
+	/** Default constructor (singular iterator) */
+	in_edge_iterator() { }
+
+	/** End-iterator constructor */
+	in_edge_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+	/** Begin-iterator constructor: position at the first in-edge of u */
+	in_edge_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+		: m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+	{
+		m_v.shift(ANTISENSE);
+		next();
+	}
+
+	edge_descriptor operator*() const
+	{
+		assert(m_i < NUM_BASES);
+		return edge_descriptor(m_v.clone(), m_u);
+	}
+
+	bool operator==(const in_edge_iterator& it) const
+	{
+		return m_i == it.m_i;
+	}
+
+	bool operator!=(const in_edge_iterator& it) const
+	{
+		return !(*this == it);
+	}
+
+	in_edge_iterator& operator++()
+	{
+		assert(m_i < NUM_BASES);
+		++m_i;
+		next();
+		return *this;
+	}
+
+	in_edge_iterator operator++(int)
+	{
+		in_edge_iterator it = *this;
+		++*this;
+		return it;
+	}
+
+  private:
+	const RollingBloomDBG<BF>* m_g;
+	vertex_descriptor m_u;
+	vertex_descriptor m_v;
+	unsigned m_i;
+}; // in_edge_iterator
+
+}; // graph_traits<RollingBloomDBG>
+
+} // namespace boost
+
+// Subgraph
+
+/** Return whether vertex u is present in the Bloom filter of graph g. */
+template <typename Graph>
+static inline bool
+vertex_exists(const typename graph_traits<Graph>::vertex_descriptor& u, const Graph& g)
+{
+	/* NOTE(review): assumes the configured number of hash functions
+	 * is <= MAX_HASHES (the stack buffer size) -- confirm against
+	 * lib/rolling-hash/rolling.h */
+	size_t hashes[MAX_HASHES];
+	u.rollingHash().getHashes(hashes);
+	return g.m_bloom.contains(hashes);
+}
+
+/** AdjacencyGraph: return the range of vertices adjacent to u. */
+template <typename Graph>
+static inline
+std::pair<typename graph_traits<Graph>::adjacency_iterator,
+		typename graph_traits<Graph>::adjacency_iterator>
+adjacent_vertices(
+		const typename graph_traits<Graph>::vertex_descriptor& u, const Graph& g)
+{
+	typedef typename graph_traits<Graph>::adjacency_iterator Ait;
+	return std::make_pair(Ait(g, u), Ait(g));
+}
+
+// IncidenceGraph
+/** IncidenceGraph: count the outgoing edges of vertex u. */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::degree_size_type
+out_degree(
+		const typename graph_traits<Graph>::vertex_descriptor& u,
+		const Graph& g)
+{
+	typedef typename graph_traits<Graph>::adjacency_iterator Ait;
+	std::pair<Ait, Ait> range = adjacent_vertices(u, g);
+	return std::distance(range.first, range.second);
+}
+
+/** IncidenceGraph: return the range of out-edges of vertex u. */
+template <typename Graph>
+static inline
+std::pair<typename graph_traits<Graph>::out_edge_iterator,
+	typename graph_traits<Graph>::out_edge_iterator>
+out_edges(
+		const typename graph_traits<Graph>::vertex_descriptor& u,
+		const Graph& g)
+{
+	typedef typename graph_traits<Graph>::out_edge_iterator Oit;
+	return std::make_pair(Oit(g, u), Oit(g));
+}
+
+// BidirectionalGraph
+/** BidirectionalGraph: return the range of in-edges of vertex u. */
+template <typename Graph>
+static inline
+std::pair<typename graph_traits<Graph>::in_edge_iterator,
+	typename graph_traits<Graph>::in_edge_iterator>
+in_edges(
+		const typename graph_traits<Graph>::vertex_descriptor& u,
+		const Graph& g)
+{
+	typedef typename graph_traits<Graph>::in_edge_iterator InIt;
+	return std::make_pair(InIt(g, u), InIt(g));
+}
+
+/** BidirectionalGraph: count the incoming edges of vertex u. */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::degree_size_type
+in_degree(const typename graph_traits<Graph>::vertex_descriptor& u,
+		  const Graph& g)
+{
+	typedef typename graph_traits<Graph>::in_edge_iterator Iit;
+	std::pair<Iit, Iit> it = in_edges(u, g);
+	return std::distance(it.first, it.second);
+}
+
+// PropertyGraph
+
+/** Return the reverse complement of the specified vertex. */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::vertex_descriptor
+get(vertex_complement_t, const Graph&,
+		typename graph_traits<Graph>::vertex_descriptor u)
+{
+	typedef typename graph_traits<Graph>::vertex_descriptor V;
+	/* NOTE(review): `u.first`/`u.second` look stale -- the current
+	 * vertex_descriptor (RollingBloomDBGVertex) has no such members.
+	 * This only compiles because the template is never instantiated;
+	 * confirm and port to kmer()/rollingHash(). */
+	return V(reverseComplement(u.first), u.second);
+}
+
+/** Return the name (k-mer) of the specified vertex. */
+template <typename Graph>
+static inline
+MaskedKmer get(vertex_name_t, const Graph&,
+		typename graph_traits<Graph>::vertex_descriptor u)
+{
+	/* NOTE(review): `u.first` looks stale -- the current
+	 * vertex_descriptor (RollingBloomDBGVertex) has no member
+	 * `first`; compiles only while uninstantiated. Confirm. */
+	return u.first;
+}
+
+/** PropertyGraph: vertices of a Bloom filter graph are never
+ * marked "removed". */
+template <typename Graph>
+static inline
+bool
+get(vertex_removed_t, const Graph&,
+		typename graph_traits<Graph>::vertex_descriptor)
+{
+	return false;
+}
+
+/** PropertyGraph: vertices carry no bundled properties.
+ * NOTE(review): the parameter type is edge_descriptor, which looks
+ * like a copy-paste from the edge_bundle_t overload below --
+ * confirm whether vertex_descriptor was intended. */
+template <typename Graph>
+static inline
+no_property
+get(vertex_bundle_t, const Graph&,
+		typename graph_traits<Graph>::edge_descriptor)
+{
+	return no_property();
+}
+
+/** PropertyGraph: edges carry no bundled properties. */
+template <typename Graph>
+static inline
+no_property
+get(edge_bundle_t, const Graph&,
+		typename graph_traits<Graph>::edge_descriptor)
+{
+	return no_property();
+}
+
+#endif
diff --git a/BloomDBG/RollingHash.h b/BloomDBG/RollingHash.h
new file mode 100644
index 0000000..f7f4215
--- /dev/null
+++ b/BloomDBG/RollingHash.h
@@ -0,0 +1,289 @@
+#ifndef ABYSS_ROLLING_HASH_H
+#define ABYSS_ROLLING_HASH_H 1
+
+#include "config.h"
+#include "lib/rolling-hash/rolling.h"
+#include "BloomDBG/MaskedKmer.h"
+#include <string>
+#include <vector>
+#include <cassert>
+#include <boost/dynamic_bitset.hpp>
+#include <cstring>
+
+/**
+ * Rolling (recursive) hash state for a single canonical k-mer.
+ *
+ * Maintains the hash of the k-mer and of its reverse complement, and
+ * supports O(1) updates when the k-mer is shifted left/right or a
+ * single base is changed. The "canonical" hash is the minimum of the
+ * forward and reverse-complement hashes, so a k-mer and its reverse
+ * complement hash identically. When a global spaced seed
+ * (MaskedKmer::mask()) is active, masked variants of the update
+ * functions are used.
+ */
+class RollingHash
+{
+private:
+
+	/**
+	 * Determine the canonical hash value, given hash values for
+	 * forward and reverse-complement of the same k-mer.
+	 */
+	uint64_t canonicalHash(uint64_t hash, uint64_t rcHash) const
+	{
+		return (rcHash < hash) ? rcHash : hash;
+	}
+
+public:
+
+	/**
+	 * Default constructor.
+	 * (m_hash is zero-initialized here so that getHashSeed() and
+	 * operator== never read an indeterminate value.)
+	 */
+	RollingHash() : m_numHashes(0), m_k(0), m_hash1(0), m_rcHash1(0),
+		m_hash(0) {}
+
+	/**
+	 * Constructor. Construct RollingHash object when initial k-mer
+	 * is unknown.
+	 * @param numHashes number of pseudo-independent hash values to compute
+	 * for each k-mer
+	 * @param k k-mer length
+	 */
+	RollingHash(unsigned numHashes, unsigned k) : m_numHashes(numHashes),
+		m_k(k), m_hash1(0), m_rcHash1(0), m_hash(0) {}
+
+	/**
+	 * Constructor. Construct RollingHash object while specifying
+	 * initial k-mer to be hashed.
+	 * @param kmer initial k-mer for initializing hash value(s)
+	 * @param numHashes number of pseudo-independent hash values to compute
+	 * for each k-mer
+	 * @param k k-mer length
+	 */
+	RollingHash(const std::string& kmer, unsigned numHashes, unsigned k)
+		: m_numHashes(numHashes), m_k(k), m_hash1(0), m_rcHash1(0),
+		m_hash(0)
+	{
+		/* init rolling hash state */
+		reset(kmer);
+	}
+
+	/**
+	 * Initialize hash state from sequence.
+	 * @param kmer k-mer used to initialize hash state
+	 */
+	void reset(const std::string& kmer)
+	{
+		if (!MaskedKmer::mask().empty())
+			resetMasked(kmer.c_str());
+		else
+			resetUnmasked(kmer);
+	}
+
+	/**
+	 * Initialize hash values from current k-mer. When computing the hash
+	 * value, mask out "don't care" positions as per the active
+	 * k-mer mask.
+	 */
+	void resetMasked(const char* kmer)
+	{
+		const std::string& spacedSeed = MaskedKmer::mask();
+		assert(spacedSeed.length() == m_k);
+
+		/* compute first hash function for k-mer */
+		/* NOTE(review): these getFhval/getRhval overloads appear to set
+		 * m_hash1/m_rcHash1 via out-param and return the masked hash --
+		 * confirm against lib/rolling-hash/rolling.h */
+		uint64_t hash1 = getFhval(m_hash1, spacedSeed.c_str(), kmer, m_k);
+
+		/* compute first hash function for reverse complement of k-mer */
+		uint64_t rcHash1 = getRhval(m_rcHash1, spacedSeed.c_str(), kmer, m_k);
+
+		m_hash = canonicalHash(hash1, rcHash1);
+	}
+
+	/**
+	 * Initialize hash values from sequence.
+	 * @param kmer k-mer used to initialize hash state
+	 */
+	void resetUnmasked(const std::string& kmer)
+	{
+		/* compute first hash function for k-mer */
+		m_hash1 = getFhval(kmer.c_str(), m_k);
+
+		/* compute first hash function for reverse complement
+		 * of k-mer */
+		m_rcHash1 = getRhval(kmer.c_str(), m_k);
+
+		m_hash = canonicalHash(m_hash1, m_rcHash1);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the right and
+	 * update internal state.
+	 * @param kmer current k-mer
+	 * @param charIn base rolled in on the right
+	 */
+	void rollRight(const char* kmer, char charIn)
+	{
+		if (!MaskedKmer::mask().empty())
+			rollRightMasked(kmer, charIn);
+		else
+			rollRightUnmasked(kmer, charIn);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the right and
+	 * update internal state.  When computing the new hash, mask
+	 * out "don't care" positions according to the active
+	 * k-mer mask.
+	 * @param kmer current k-mer
+	 * @param charIn base rolled in on the right
+	 */
+	void rollRightMasked(const char* kmer, char charIn)
+	{
+		const std::string& spacedSeed = MaskedKmer::mask();
+		m_hash = rollHashesRight(m_hash1, m_rcHash1, spacedSeed.c_str(),
+			kmer, charIn, m_k);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the right and
+	 * update internal state.
+	 * @param kmer current k-mer
+	 * @param charIn base rolled in on the right
+	 */
+	void rollRightUnmasked(const char* kmer, char charIn)
+	{
+		/* update first hash function */
+		rollHashesRight(m_hash1, m_rcHash1, kmer[0], charIn, m_k);
+		m_hash = canonicalHash(m_hash1, m_rcHash1);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the left and
+	 * update internal state.
+	 * @param charIn base rolled in on the left
+	 * @param kmer current k-mer
+	 */
+	void rollLeft(char charIn, const char* kmer)
+	{
+		if (!MaskedKmer::mask().empty())
+			rollLeftMasked(charIn, kmer);
+		else
+			rollLeftUnmasked(charIn, kmer);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the left and
+	 * update internal state.  When computing the new hash, mask
+	 * out "don't care" positions according to the active
+	 * k-mer mask.
+	 * @param charIn base rolled in on the left
+	 * @param kmer current k-mer
+	 */
+	void rollLeftMasked(char charIn, const char* kmer)
+	{
+		const std::string& spacedSeed = MaskedKmer::mask();
+		m_hash = rollHashesLeft(m_hash1, m_rcHash1, spacedSeed.c_str(),
+			kmer, charIn, m_k);
+	}
+
+	/**
+	 * Compute hash values for next k-mer to the left and
+	 * update internal state.
+	 * @param charIn base rolled in on the left
+	 * @param kmer current k-mer
+	 */
+	void rollLeftUnmasked(char charIn, const char* kmer)
+	{
+		/* update first hash function */
+		rollHashesLeft(m_hash1, m_rcHash1, charIn, kmer[m_k-1], m_k);
+		m_hash = canonicalHash(m_hash1, m_rcHash1);
+	}
+
+	/**
+	 * Get the seed hash value for the current k-mer. The seed hash
+	 * value is used to calculate multiple pseudo-independent
+	 * hash functions.
+	 */
+	size_t getHashSeed() const
+	{
+		return (size_t)m_hash;
+	}
+
+	/**
+	 * Get hash values for current k-mer.
+	 *
+	 * @param hashes array for returned hash values; must have room
+	 * for at least m_numHashes (<= MAX_HASHES) entries
+	 */
+	void getHashes(size_t hashes[]) const
+	{
+		uint64_t tmpHashes[MAX_HASHES];
+		multiHash(tmpHashes, m_hash, m_numHashes, m_k);
+		for (unsigned i = 0; i < m_numHashes; ++i) {
+			hashes[i] = (size_t)tmpHashes[i];
+		}
+	}
+
+	/** Equality operator */
+	bool operator==(const RollingHash& o) const
+	{
+		/**
+		 * Note: If hash seeds are equal, then the values
+		 * for all hash functions will also be equal, since
+		 * the hash values are calculated from the
+		 * seed in a deterministic manner. In practice seed
+		 * collision is very unlikely, though!
+		 */
+		return m_k == o.m_k && getHashSeed() == o.getHashSeed();
+	}
+
+	/** Inequality operator */
+	bool operator!=(const RollingHash& o) const
+	{
+		return !(*this == o);
+	}
+
+	/**
+	 * Set the base at a given position in the k-mer and update the hash
+	 * value accordingly.
+	 * @param kmer pointer to the k-mer char array
+	 * @param pos position of the base to be changed
+	 * @param base new value for the base
+	 */
+	void setBase(char* kmer, unsigned pos, char base)
+	{
+		if (!MaskedKmer::mask().empty())
+			setBaseMasked(kmer, pos, base);
+		else
+			setBaseUnmasked(kmer, pos, base);
+	}
+
+	/**
+	 * Set the base at a given position in the k-mer and update the hash
+	 * value accordingly, respecting the active k-mer mask.
+	 * @param kmer pointer to the k-mer char array
+	 * @param pos position of the base to be changed
+	 * @param base new value for the base
+	 */
+	void setBaseMasked(char* kmer, unsigned pos, char base)
+	{
+		const std::string& spacedSeed = MaskedKmer::mask();
+		assert(spacedSeed.length() == m_k);
+		m_hash = ::setBase(m_hash1, m_rcHash1, spacedSeed.c_str(), kmer,
+			pos, base, m_k);
+	}
+
+	/**
+	 * Set the base at a given position in the k-mer and update the hash
+	 * value accordingly.
+	 * @param kmer pointer to the k-mer char array
+	 * @param pos position of the base to be changed
+	 * @param base new value for the base
+	 */
+	void setBaseUnmasked(char* kmer, unsigned pos, char base)
+	{
+		m_hash = ::setBase(m_hash1, m_rcHash1, kmer, pos, base, m_k);
+	}
+
+private:
+
+	/** number of hash functions */
+	unsigned m_numHashes;
+	/** k-mer length */
+	unsigned m_k;
+	/** value of first hash function for current k-mer */
+	uint64_t m_hash1;
+	/** value of first hash function for current k-mer, after
+	 * reverse-complementing */
+	uint64_t m_rcHash1;
+	/** current canonical hash value */
+	uint64_t m_hash;
+};
+
+#endif
diff --git a/BloomDBG/RollingHashIterator.h b/BloomDBG/RollingHashIterator.h
new file mode 100644
index 0000000..9a03163
--- /dev/null
+++ b/BloomDBG/RollingHashIterator.h
@@ -0,0 +1,234 @@
+#ifndef ROLLING_HASH_ITERATOR_H
+#define ROLLING_HASH_ITERATOR_H 1
+
+#include <cstring>
+#include <vector>
+#include <cassert>
+#include <limits>
+#include <string>
+#include <algorithm>
+#include <cctype>
+#include <deque>
+#include "BloomDBG/RollingHash.h"
+
+/**
+ * Permitted characters in k-mers. All k-mers containing
+ * other characters will be skipped.
+ */
+#define ACGT_CHARS "ACGT"
+
+/**
+ * Iterate over hash values for k-mers in a
+ * given DNA sequence.
+ *
+ * This implementation uses a rolling hash
+ * function to efficiently calculate
+ * hash values for successive k-mers. K-mers containing
+ * non-ACGT characters in unmasked positions are skipped.
+ */
+class RollingHashIterator
+{
+private:
+
+	/**
+	 * Advance iterator right to the next valid k-mer.
+	 * Sets m_pos to SIZE_MAX (the end sentinel) when no
+	 * valid k-mer remains.
+	 */
+	void next()
+	{
+		/* sequence too short to contain a single k-mer */
+		if (m_seq.length() < m_k) {
+			m_pos = std::numeric_limits<std::size_t>::max();
+			return;
+		}
+
+		const std::string& spacedSeed = MaskedKmer::mask();
+
+		while(m_pos < m_seq.length() - m_k + 1) {
+
+			/* skip k-mers with non-ACGT chars in unmasked positions */
+
+			/* drop recorded bad-char positions left of the window */
+			while (!m_badCharPos.empty() && m_badCharPos.front() < m_pos)
+				m_badCharPos.pop_front();
+
+			if (!m_badCharPos.empty() && m_badCharPos.front() < m_pos + m_k) {
+				/* empty spaced seed is equivalent to a string of '1's */
+				if (spacedSeed.empty()) {
+					m_rollNextHash = false;
+					/* jump past the offending character */
+					m_pos = m_badCharPos.front() + 1;
+					continue;
+				}
+				/* with a mask, bad chars at '0' (ignored) positions are OK */
+				bool goodKmer = true;
+				assert(spacedSeed.length() == m_k);
+				for (size_t i = 0; i < m_badCharPos.size() &&
+					m_badCharPos.at(i) < m_pos + m_k; ++i) {
+					size_t kmerPos = m_badCharPos.at(i) - m_pos;
+					if (spacedSeed.at(kmerPos) == '1') {
+						goodKmer = false;
+						break;
+					}
+				}
+				if (!goodKmer) {
+					m_rollNextHash = false;
+					++m_pos;
+					continue;
+				}
+			}
+
+			/* we are positioned at the next valid k-mer */
+
+			if (!m_rollNextHash) {
+				/* we don't have hash values for the
+				 * preceding k-mer, so we must compute
+				 * the hash values from scratch */
+				m_rollingHash.reset(m_seq.substr(m_pos, m_k));
+				m_rollNextHash = true;
+			} else {
+				/* compute new hash values based on
+				 * hash values of preceding k-mer */
+				assert(m_pos > 0);
+				m_rollingHash.rollRight(m_seq.c_str() + m_pos - 1,
+					m_seq[m_pos + m_k - 1]);
+			}
+			m_rollingHash.getHashes(m_hashes);
+			return;
+
+		}
+
+		/* there are no more valid k-mers */
+		m_pos = std::numeric_limits<std::size_t>::max();
+	}
+
+public:
+
+	/**
+	 * Default constructor. Creates an iterator pointing to
+	 * the end of the iterator range.
+	 */
+	RollingHashIterator() : m_numHashes(0), m_k(0),
+		m_rollingHash(m_numHashes, m_k),
+		m_pos(std::numeric_limits<std::size_t>::max()) {}
+
+	/**
+	 * Constructor.
+	 * @param seq DNA sequence to be hashed
+	 * @param numHashes number of hash values to compute
+	 * for each k-mer
+	 * @param k k-mer size
+	 */
+	RollingHashIterator(const std::string& seq, unsigned numHashes, unsigned k)
+		: m_seq(seq), m_numHashes(numHashes), m_k(k),
+		m_rollingHash(m_numHashes, m_k), m_rollNextHash(false), m_pos(0)
+	{
+		init();
+	}
+
+	/**
+	 * Initialize internal state of iterator: normalize case,
+	 * index non-ACGT positions, and advance to the first valid k-mer.
+	 */
+	void init()
+	{
+		/* note: empty spaced seed indicates no masking (string of '1's) */
+		assert(MaskedKmer::mask().empty() || MaskedKmer::mask().length() == m_k);
+
+		/* convert sequence to upper case */
+		std::transform(m_seq.begin(), m_seq.end(), m_seq.begin(), ::toupper);
+
+		/* record positions of non-ACGT chars */
+		size_t i = m_seq.find_first_not_of(ACGT_CHARS);
+		while (i != std::string::npos) {
+			m_badCharPos.push_back(i);
+			i = m_seq.find_first_not_of(ACGT_CHARS, i + 1);
+		}
+
+		/* find first "good" k-mer in sequence */
+		next();
+	}
+
+	/** get reference to hash values for current k-mer */
+	const size_t* operator*() const
+	{
+		assert(m_pos + m_k <= m_seq.length());
+		return m_hashes;
+	}
+
+	/** test equality with another iterator (position only) */
+	bool operator==(const RollingHashIterator& it) const
+	{
+		return m_pos == it.m_pos;
+	}
+
+	/** test inequality with another iterator */
+	bool operator!=(const RollingHashIterator& it) const
+	{
+		return !(*this == it);
+	}
+
+	/** pre-increment operator */
+	RollingHashIterator& operator++()
+	{
+		++m_pos;
+		next();
+		return *this;
+	}
+
+	/** post-increment operator */
+	RollingHashIterator operator++(int)
+	{
+		RollingHashIterator it = *this;
+		++*this;
+		return it;
+	}
+
+	/** iterator pointing to one past last element */
+	static const RollingHashIterator end()
+	{
+		return RollingHashIterator();
+	}
+
+	/** return position of current k-mer */
+	/* NOTE(review): m_pos is size_t but the return type is unsigned;
+	 * this narrows for sequences > UINT_MAX bp -- confirm acceptable */
+	unsigned pos() const
+	{
+		return m_pos;
+	}
+
+	/** return k-mer at current position
+	 * @param mask when true and a spaced seed is set, replace bases at
+	 * ignored ('0') seed positions with 'N' */
+	std::string kmer(bool mask=false) const
+	{
+		std::string kmer(m_seq, m_pos, m_k);
+		const std::string& spacedSeed = MaskedKmer::mask();
+		if (mask && !spacedSeed.empty()) {
+			assert(spacedSeed.length() == m_k);
+			for(size_t i = 0; i < spacedSeed.length(); ++i) {
+				if (spacedSeed.at(i) == '0')
+					kmer.at(i) = 'N';
+			}
+		}
+		return kmer;
+	}
+
+	/** return RollingHash object for current state */
+	/* NOTE(review): returns a copy; could be declared const */
+	RollingHash rollingHash()
+	{
+		return m_rollingHash;
+	}
+
+private:
+
+	/** DNA sequence being hashed */
+	std::string m_seq;
+	/** number of hash functions */
+	unsigned m_numHashes;
+	/** hash values for the current k-mer */
+	size_t m_hashes[MAX_HASHES];
+	/** k-mer size */
+	unsigned m_k;
+	/** internal state for rolling hash */
+	RollingHash m_rollingHash;
+	/** true whenever we can "roll" the hash values for
+	 * the current k-mer to compute the hash values for the
+	 * next k-mer */
+	bool m_rollNextHash;
+	/** position of current k-mer (SIZE_MAX == end sentinel) */
+	size_t m_pos;
+	/** positions of non-ACGT chars in sequence, in ascending order */
+	std::deque<size_t> m_badCharPos;
+};
+
+#endif
diff --git a/BloomDBG/SpacedSeed.h b/BloomDBG/SpacedSeed.h
new file mode 100644
index 0000000..80848e6
--- /dev/null
+++ b/BloomDBG/SpacedSeed.h
@@ -0,0 +1,79 @@
+#ifndef SPACED_SEED_H
+#define SPACED_SEED_H
+
+#include <string>
+#include <cassert>
+#include <algorithm>
+
+namespace SpacedSeed {
+
+	/**
+	 * Build a spaced seed pattern (bitmask) for two equal-size
+	 * k-mers separated by a gap of '0's.
+	 *
+	 * @param k width of spaced seed pattern
+	 * @param K size of the individual k-mers. K must be <= k/2.
+	 * @return spaced seed pattern for gapped k-mer pair
+	 */
+	static inline std::string kmerPair(unsigned k, unsigned K)
+	{
+		assert(K <= k/2);
+		const std::string ones(K, '1');
+		const std::string gap(k - 2 * K, '0');
+		return ones + gap + ones;
+	}
+
+	/**
+	 * Generate a Quadratic Residue (QR) seed. The background theory
+	 * for QR seeds is described in:
+	 *
+	 * Egidi, Lavinia, and Giovanni Manzini. "Multiple seeds
+	 * sensitivity using a single seed with threshold." Journal of
+	 * bioinformatics and computational biology 13.04 (2015): 1550011.
+	 *
+	 * @param len desired length of QR seed. `len` must
+	 * be prime and >= 11.
+	 * @return a QR seed represented as a std::string
+	 * of 0's and 1's
+	 */
+	static inline std::string qrSeed(unsigned len)
+	{
+		assert(len >= 11);
+		std::string seed(len, '1');
+		/* mark each quadratic residue mod len with '0'.  This is
+		 * O(len), replacing the previous O(len^2) search that tested
+		 * every position against every square. */
+		for (size_t j = 1; j < len; ++j)
+			seed.at(j * j % len) = '0';
+		return seed;
+	}
+
+	/**
+	 * Generate a spaced seed pattern (bitmask) for two equal-length
+	 * Quadratic Residue (QR) seeds separated by a gap.  The first
+	 * QR seed is in the usual orientation and the second QR is reversed,
+	 * so that the overall pattern is symmetric.
+	 *
+	 * @param k width of the spaced seed pattern
+	 * @param qrSeedLen width of the individual QR seeds.
+	 * qrSeedLen must be a prime number >= 11 and must also be <= k/2.
+	 * @return spaced seed pattern for gapped QR seed pair
+	 */
+	static inline std::string qrSeedPair(unsigned k, unsigned qrSeedLen)
+	{
+		assert(qrSeedLen <= k/2);
+		std::string seed(k, '0');
+		std::string qrSeed = SpacedSeed::qrSeed(qrSeedLen);
+		/* prefix of `seed` = QR seed in forward orientation */
+		std::copy(qrSeed.begin(), qrSeed.end(), seed.begin());
+		/* suffix of `seed` = QR seed reversed; the reverse() followed
+		 * by a reverse-iterator copy writes reverse(qrSeed) into the
+		 * tail, making the overall pattern mirror-symmetric */
+		std::reverse(qrSeed.begin(), qrSeed.end());
+		std::copy(qrSeed.rbegin(), qrSeed.rend(), seed.rbegin());
+		return seed;
+	}
+
+}
+
+#endif
diff --git a/BloomDBG/bloom-dbg.cc b/BloomDBG/bloom-dbg.cc
new file mode 100644
index 0000000..2c6a05b
--- /dev/null
+++ b/BloomDBG/bloom-dbg.cc
@@ -0,0 +1,345 @@
+#include "config.h"
+
+#include "BloomDBG/bloom-dbg.h"
+#include "BloomDBG/HashAgnosticCascadingBloom.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/SpacedSeed.h"
+#include "Common/StringUtil.h"
+#include "Common/Options.h"
+#include "DataLayer/Options.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <getopt.h>
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+#include <iomanip>
+#include <cstring>
+#include <limits>
+#include <string>
+
+#if _OPENMP
+# include <omp.h>
+#endif
+
+using namespace std;
+
+#define PROGRAM "abyss-bloom-dbg"
+
+static const char VERSION_MESSAGE[] =
+	PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
+	"Written by Ben Vandervalk, Shaun Jackman, Hamid Mohamadi,\n"
+	"Justin Chu, and Anthony Raymond.\n"
+	"\n"
+	"Copyright 2015 Canada's Michael Smith Genome Science Centre\n";
+
+static const char USAGE_MESSAGE[] =
+"Usage: " PROGRAM " -b <bloom_size> -H <bloom_hashes> -k <kmer_size> \\\n"
+"    -G <genome_size> [options] <FASTQ> [FASTQ]... > assembly.fasta\n"
+"\n"
+"Perform a de Bruijn graph assembly of the given FASTQ files.\n"
+"\n"
+"Basic Options:\n"
+"\n"
+"  -b  --bloom-size=N         Bloom filter memory size with unit suffix\n"
+"                             'k', 'M', or 'G' [required]\n"
+"      --chastity             discard unchaste reads [default]\n"
+"      --no-chastity          do not discard unchaste reads\n"
+"  -g  --graph=FILE           write de Bruijn graph to FILE (GraphViz)\n"
+"      --help                 display this help and exit\n"
+"  -H  --num-hashes=N         number of Bloom filter hash functions [1]\n"
+"  -j, --threads=N            use N parallel threads [1]\n"
+"      --trim-masked          trim masked bases from the ends of reads\n"
+"      --no-trim-masked       do not trim masked bases from the ends\n"
+"                             of reads [default]\n"
+"  -k, --kmer=N               the size of a k-mer [required]\n"
+"      --kc=N                 use a cascading Bloom filter with N levels,\n"
+"                             instead of a counting Bloom filter [2]\n"
+"  -o, --out=FILE             write the contigs to FILE [STDOUT]\n"
+"  -q, --trim-quality=N       trim bases from the ends of reads whose\n"
+"                             quality is less than the threshold\n"
+"  -Q, --mask-quality=N       mask all low quality bases as `N'\n"
+"      --standard-quality     zero quality is `!' (33), typically\n"
+"                             for FASTQ and SAM files [default]\n"
+"      --illumina-quality     zero quality is `@' (64), typically\n"
+"                             for qseq and export files\n"
+"  -t, --trim-length          max branch length to trim, in k-mers [k]\n"
+"  -v, --verbose              display verbose output\n"
+"      --version              output version information and exit\n"
+"\n"
+"Spaced Seed Options:\n"
+"\n"
+"  -K, --single-kmer=N        use a spaced seed that consists of two k-mers\n"
+"                             separated by a gap. K must be chosen such that\n"
+"                             K <= k/2\n"
+"      --qr-seed=N            use a spaced seed than consists of two mirrored\n"
+"                             QR seeds separated by a gap.  The following must\n"
+"                             hold: (a) N must be prime, (b) N >= 11,\n"
+"                             (c) N <= k/2\n"
+"  -s, --spaced-seed=STR      bitmask indicating k-mer positions to be\n"
+"                             ignored during hashing. The pattern must be\n"
+"                             symmetric\n"
+"\n"
+"Debugging Options:\n"
+"\n"
+"  -C, --cov-track=FILE       WIG track with 0/1 indicating k-mers with\n"
+"                             coverage above the -c threshold. A reference\n"
+"                             must also be specified with -R.\n"
+"  -T, --trace-file=FILE      write debugging info about extension of\n"
+"                             each read to FILE\n"
+"  -R, --ref=FILE             specify a reference genome. FILE may be\n"
+"                             FASTA, FASTQ, SAM, or BAM and may be gzipped."
+"\n"
+"Example:\n"
+"\n"
+"  Assemble a genome using a k-mer size of 50bp. Allocate a 1GB\n"
+"  Bloom filter with 2 hash functions and require that a k-mer\n"
+"  occurs 3 times or more to be included in the assembly. (The k-mer\n"
+"  count threshold filters out k-mers containing sequencing errors.)\n"
+"\n"
+"  $ " PROGRAM " -k50 -b1G -H2 --kc=3 reads1.fq.gz reads2.fq.gz > assembly.fa\n"
+"\n"
+"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
+
+/** Assembly params (stores command-line options) */
+BloomDBG::AssemblyParams params;
+
+static const char shortopts[] = "b:C:g:H:j:k:K:o:q:Q:R:s:t:T:v";
+
+enum { OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV };
+
+/** Long option table. `has_arg` must agree with `shortopts` above. */
+static const struct option longopts[] = {
+	{ "bloom-size",       required_argument, NULL, 'b' },
+	/* NOTE(review): 'c' has no entry in shortopts and no case in
+	 * main()'s switch; --min-coverage is currently rejected as an
+	 * invalid option -- confirm whether it should map to MIN_KMER_COV */
+	{ "min-coverage",     required_argument, NULL, 'c' },
+	{ "cov-track",        required_argument, NULL, 'C' },
+	{ "chastity",         no_argument, &opt::chastityFilter, 1 },
+	{ "no-chastity",      no_argument, &opt::chastityFilter, 0 },
+	{ "graph",            required_argument, NULL, 'g' },
+	{ "num-hashes",       required_argument, NULL, 'H' },
+	{ "help",             no_argument, NULL, OPT_HELP },
+	{ "threads",          required_argument, NULL, 'j' },
+	{ "trim-masked",      no_argument, &opt::trimMasked, 1 },
+	{ "no-trim-masked",   no_argument, &opt::trimMasked, 0 },
+	{ "kmer",             required_argument, NULL, 'k' },
+	{ "kc",               required_argument, NULL, MIN_KMER_COV },
+	{ "single-kmer",      required_argument, NULL, 'K' },
+	{ "out",              required_argument, NULL, 'o' },
+	{ "trim-quality",     required_argument, NULL, 'q' },
+	{ "mask-quality",     required_argument, NULL, 'Q' },
+	{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
+	{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
+	{ "qr-seed",          required_argument, NULL, QR_SEED },
+	{ "ref",              required_argument, NULL, 'R' },
+	/* bug fix: these three options take a value (shortopts declares
+	 * "s:", "t:", and "T:"); with no_argument, getopt_long rejects
+	 * `--spaced-seed=STR` and never sets optarg for the long form */
+	{ "spaced-seed",      required_argument, NULL, 's' },
+	{ "trim-length",      required_argument, NULL, 't' },
+	{ "trace-file",       required_argument, NULL, 'T'},
+	{ "verbose",          no_argument, NULL, 'v' },
+	{ "version",          no_argument, NULL, OPT_VERSION },
+	{ NULL, 0, NULL, 0 }
+};
+
+/**
+ * Create a de novo genome assembly using a Bloom filter de
+ * Bruijn graph.
+ *
+ * Phases: (1) parse/validate command line, (2) load reads into a
+ * cascading Bloom filter to drop low-coverage (error) k-mers,
+ * (3) extend reads through the Bloom filter de Bruijn graph and
+ * emit contigs, (4) optionally dump the graph in GraphViz format.
+ *
+ * @return EXIT_SUCCESS on success; exits with EXIT_FAILURE on
+ * invalid command-line arguments
+ */
+int main(int argc, char** argv)
+{
+	bool die = false;
+
+	for (int c; (c = getopt_long(argc, argv,
+					shortopts, longopts, NULL)) != -1;) {
+		istringstream arg(optarg != NULL ? optarg : "");
+		switch (c) {
+		  case '?':
+			die = true; break;
+		  case 'b':
+			params.bloomSize = SIToBytes(arg); break;
+		  case 'C':
+			arg >> params.covTrackPath; break;
+		  case 'g':
+			arg >> params.graphPath; break;
+		  case 'H':
+			arg >> params.numHashes; break;
+		  case 'j':
+			arg >> params.threads; break;
+		  case 'k':
+			arg >> params.k; break;
+		  case 'K':
+			params.resetSpacedSeedParams();
+			arg >> params.K;
+			break;
+		  case 'o':
+			arg >> params.outputPath; break;
+		  case 'q':
+			arg >> opt::qualityThreshold; break;
+		  case 'R':
+			arg >> params.refPath; break;
+		  case 's':
+			params.resetSpacedSeedParams();
+			arg >> params.spacedSeed;
+			break;
+		  case 't':
+			arg >> params.trim; break;
+		  case 'T':
+			arg >> params.tracePath; break;
+		  case 'Q':
+			arg >> opt::internalQThreshold; break;
+		  case 'v':
+			++params.verbose; break;
+		  case OPT_HELP:
+			cout << USAGE_MESSAGE;
+			exit(EXIT_SUCCESS);
+		  case MIN_KMER_COV:
+			arg >> params.minCov; break;
+		  case OPT_VERSION:
+			cout << VERSION_MESSAGE;
+			exit(EXIT_SUCCESS);
+		  case QR_SEED:
+			params.resetSpacedSeedParams();
+			arg >> params.qrSeedLen;
+			break;
+		}
+		if (optarg != NULL && (!arg.eof() || arg.fail())) {
+			cerr << PROGRAM ": invalid option: `-"
+				<< (char)c << optarg << "'\n";
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	if (params.bloomSize == 0) {
+		cerr << PROGRAM ": missing mandatory option `-b'\n";
+		die = true;
+	}
+
+	if (params.k == 0) {
+		cerr << PROGRAM ": missing mandatory option `-k'\n";
+		die = true;
+	}
+
+	if (params.k > 0 && params.K > 0 && params.K > params.k/2) {
+		cerr << PROGRAM ": value of `-K' must be <= k/2\n";
+		die = true;
+	}
+
+	if (params.numHashes > MAX_HASHES) {
+		cerr << PROGRAM ": number of hash functions (`-H`) must "
+			"be <= " << MAX_HASHES << " (set by `configure` option "
+			"--enable-max-hashes=N)\n";
+		die = true;
+	}
+
+	if (params.k > 0 && params.qrSeedLen > 0 &&
+		(params.qrSeedLen < 11 || params.qrSeedLen > params.k/2)) {
+		cerr << PROGRAM ": value of `--qr-seed' must be >= 11 and <= k/2\n";
+		die = true;
+	}
+
+	/* bug fix: the reference option is `-R' (uppercase), not `-r' */
+	if (!params.covTrackPath.empty() && params.refPath.empty()) {
+		cerr << PROGRAM ": you must specify a reference with `-R' "
+			"when using `-C'\n";
+		die = true;
+	}
+
+	if (params.trim == std::numeric_limits<unsigned>::max()) {
+		params.trim = params.k;
+	}
+
+	if (argc - optind < 1) {
+		cerr << PROGRAM ": missing input file arguments\n";
+		die = true;
+	}
+
+	if (die) {
+		cerr << "Try `" << PROGRAM
+			<< " --help' for more information.\n";
+		exit(EXIT_FAILURE);
+	}
+
+	assert(params.initialized());
+
+#if _OPENMP
+	if (params.threads > 0)
+		omp_set_num_threads(params.threads);
+#endif
+
+	/* set global variable for k-mer length */
+	MaskedKmer::setLength(params.k);
+
+	/* set global variable for spaced seed */
+	if (params.K > 0)
+		MaskedKmer::setMask(SpacedSeed::kmerPair(params.k, params.K));
+	else if (params.qrSeedLen > 0)
+		MaskedKmer::setMask(SpacedSeed::qrSeedPair(params.k, params.qrSeedLen));
+	else
+		MaskedKmer::setMask(params.spacedSeed);
+
+	if (params.verbose && !MaskedKmer::mask().empty())
+		cerr << "Using spaced seed " << MaskedKmer::mask() << endl;
+
+	/* print contigs to STDOUT unless -o option was set */
+	ofstream outputFile;
+	if (!params.outputPath.empty()) {
+		outputFile.open(params.outputPath.c_str());
+		assert_good(outputFile, params.outputPath);
+	}
+	ostream& out = params.outputPath.empty() ? cout : outputFile;
+
+	/* BloomFilter class requires size to be a multiple of 64 */
+	const size_t bitsPerByte = 8;
+	/*
+	 * Note: it is (params.minCov + 1) here because we use an additional
+	 * Bloom filter in BloomDBG::assemble() to track the set of
+	 * assembled k-mers.
+	 */
+	size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
+		params.bloomSize * bitsPerByte / (params.minCov + 1), (size_t)64);
+
+	/* use cascading Bloom filter to remove error k-mers */
+	HashAgnosticCascadingBloom cascadingBloom(
+		bloomLevelSize, params.numHashes, params.minCov, params.k);
+
+	/* load reads into Bloom filter */
+	for (int i = optind; i < argc; ++i) {
+		/*
+		 * Debugging feature: If there is a ':'
+		 * separating the list of input read files into
+		 * two parts, use the first set of files
+		 * to load the Bloom filter and the second
+		 * set of files for the assembly (read extension).
+		 */
+		if (strcmp(argv[i],":") == 0) {
+			optind = i + 1;
+			break;
+		}
+		BloomDBG::loadFile(cascadingBloom, argv[i], params.verbose);
+	}
+	if (params.verbose)
+		cerr << "Bloom filter FPR: " << setprecision(3)
+			<< cascadingBloom.FPR() * 100 << "%" << endl;
+
+	if (!params.covTrackPath.empty()) {
+		assert(!params.refPath.empty());
+		BloomDBG::writeCovTrack(cascadingBloom, params);
+	}
+
+	/* second pass through FASTA files for assembling */
+	BloomDBG::assemble(argc - optind, argv + optind,
+		cascadingBloom, params, out);
+
+	/* generate de Bruijn graph in GraphViz format (optional) */
+	if (!params.graphPath.empty()) {
+		ofstream graphOut(params.graphPath.c_str());
+		assert_good(graphOut, params.graphPath);
+		BloomDBG::outputGraph(argc - optind, argv + optind,
+			cascadingBloom, params, graphOut);
+		assert_good(graphOut, params.graphPath);
+		graphOut.close();
+		assert_good(graphOut, params.graphPath);
+	}
+
+	/* cleanup */
+	if (!params.outputPath.empty())
+		outputFile.close();
+
+	return EXIT_SUCCESS;
+}
diff --git a/BloomDBG/bloom-dbg.h b/BloomDBG/bloom-dbg.h
new file mode 100644
index 0000000..819dd0a
--- /dev/null
+++ b/BloomDBG/bloom-dbg.h
@@ -0,0 +1,1276 @@
+#ifndef BLOOM_DBG_H
+#define BLOOM_DBG_H 1
+
+#include "BloomDBG/RollingHashIterator.h"
+#include "Common/Uncompress.h"
+#include "Common/IOUtil.h"
+#include "DataLayer/FastaReader.h"
+#include "Graph/Path.h"
+#include "Graph/ExtendPath.h"
+#include "Graph/BreadthFirstSearch.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/RollingBloomDBG.h"
+#include "Common/UnorderedSet.h"
+#include "DataLayer/FastaConcat.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <limits>
+#include <string>
+
+#if _OPENMP
+# include <omp.h>
+#endif
+
+namespace BloomDBG {
+
+	/**
+	 * Type for a vertex in the de Bruijn graph.
+	 */
+	typedef RollingBloomDBGVertex Vertex;
+
+	/**
+	 * Parameters controlling assembly (populated from the command line
+	 * by abyss-bloom-dbg's main()).
+	 */
+	struct AssemblyParams
+	{
+		/** Bloom filter size (in bits) */
+		size_t bloomSize;
+
+		/** minimum k-mer coverage threshold */
+		unsigned minCov;
+
+		/** WIG track containing 0/1 for sufficient k-mer cov */
+		std::string covTrackPath;
+
+		/** path for output GraphViz file */
+		/* consistency fix: was unqualified `string`, which relied on a
+		 * `using namespace std` leaking from an included header */
+		std::string graphPath;
+
+		/** num Bloom filter hash functions */
+		unsigned numHashes;
+
+		/** the number of parallel threads. */
+		unsigned threads;
+
+		/** the size of a k-mer. */
+		unsigned k;
+
+		/** the size of a single k-mer in a k-mer pair */
+		unsigned K;
+
+		/** reference genome */
+		std::string refPath;
+
+		/** Quadratic Residue (QR) seed length */
+		unsigned qrSeedLen;
+
+		/** spaced seed */
+		std::string spacedSeed;
+
+		/** maximum length of branches to trim */
+		unsigned trim;
+
+		/** verbose level for progress messages */
+		int verbose;
+
+		/** output contigs path (empty string indicates STDOUT) */
+		std::string outputPath;
+
+		/** output path for trace file (-T) option */
+		std::string tracePath;
+
+		/** Default constructor. `trim` is left at UINT_MAX as a
+		 * "not set" sentinel; main() replaces it with `k`. */
+		AssemblyParams() : bloomSize(0), minCov(2), graphPath(),
+			numHashes(1), threads(1), k(0), K(0), qrSeedLen(0),
+			spacedSeed(), trim(std::numeric_limits<unsigned>::max()),
+			verbose(0), outputPath(), tracePath() {}
+
+		/** Return true if all required members are initialized */
+		bool initialized() const {
+			return bloomSize > 0 && k > 0 &&
+				trim != std::numeric_limits<unsigned>::max();
+		}
+
+		/** Reset all spaced seed params to their default values
+		 * (the -K / -s / --qr-seed options are mutually exclusive) */
+		void resetSpacedSeedParams() {
+			spacedSeed.clear();
+			K = 0;
+			qrSeedLen = 0;
+		}
+	};
+
+	/**
+	 * Round `num` up to the nearest multiple of `base`.
+	 * A `base` of zero returns `num` unchanged.
+	 */
+	template <typename T>
+	inline static T roundUpToMultiple(T num, T base)
+	{
+		if (base == 0)
+			return num;
+		const T r = num % base;
+		return r == 0 ? num : num + (base - r);
+	}
+
+	/**
+	 * Load DNA sequence into Bloom filter using rolling hash.
+	 *
+	 * @param bloom target Bloom filter
+	 * @param seq DNA sequence
+	 */
+	template <typename BF>
+	inline static void loadSeq(BF& bloom, const std::string& seq)
+	{
+		RollingHashIterator it(seq, bloom.getHashNum(), bloom.getKmerSize());
+		while (it != RollingHashIterator::end()) {
+			bloom.insert(*it);
+			++it;
+		}
+	}
+
+	/**
+	 * Load sequences contents of FASTA file into Bloom filter using
+	 * rolling hash. Reads are pulled from the file in batches inside
+	 * an OpenMP critical section and hashed in parallel.
+	 * @param bloom target Bloom filter
+	 * @param path path to FASTA file
+	 * @param verbose if true, print progress messages to STDERR
+	 */
+	template <typename BF>
+	inline static void loadFile(BF& bloom, const std::string& path,
+		bool verbose = false)
+	{
+		/* batch size, measured in total sequence bases per thread */
+		const size_t BUFFER_SIZE = 1000000;
+		const size_t LOAD_PROGRESS_STEP = 10000;
+
+		assert(!path.empty());
+		if (verbose)
+			std::cerr << "Reading `" << path << "'..." << std::endl;
+
+		FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
+		uint64_t readCount = 0;
+#pragma omp parallel
+		/* NOTE(review): the vector is constructed with BUFFER_SIZE
+		 * empty elements and then cleared immediately below; a plain
+		 * default-constructed vector would likely suffice -- verify */
+		for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
+			buffer.clear();
+			size_t bufferSize = 0;
+			bool good = true;
+			/* only one thread at a time may read from the file */
+#pragma omp critical(in)
+			for (; good && bufferSize < BUFFER_SIZE;) {
+				std::string seq;
+				good = in >> seq;
+				if (good) {
+					buffer.push_back(seq);
+					bufferSize += seq.length();
+				}
+			}
+			if (buffer.size() == 0)
+				break;
+			for (size_t j = 0; j < buffer.size(); j++) {
+				loadSeq(bloom, buffer.at(j));
+				if (verbose)
+					/* serialize both the counter update and the message */
+#pragma omp critical(cerr)
+				{
+					readCount++;
+					if (readCount % LOAD_PROGRESS_STEP == 0)
+						std::cerr << "Loaded " << readCount
+							<< " reads into Bloom filter\n";
+				}
+			}
+		}
+		assert(in.eof());
+		if (verbose) {
+			std::cerr << "Loaded " << readCount << " reads from `"
+					  << path << "` into Bloom filter\n";
+		}
+	}
+
+	/**
+	 * Return true if all of the k-mers in `seq` are contained in `bloom`
+	 * and false otherwise. K-mers skipped by the iterator (those with
+	 * non-ACGT characters) also cause a false result.
+	 */
+	template <typename BloomT>
+	inline static bool allKmersInBloom(const Sequence& seq, const BloomT& bloom)
+	{
+		const unsigned k = bloom.getKmerSize();
+		const unsigned numHashes = bloom.getHashNum();
+		assert(seq.length() >= k);
+		unsigned seen = 0;
+		RollingHashIterator it(seq, numHashes, k);
+		for (; it != RollingHashIterator::end(); ++it) {
+			if (!bloom.contains(*it))
+				return false;
+			++seen;
+		}
+		/* false if any k-mer was skipped over (non-ACGT chars) */
+		return seen >= seq.length() - k + 1;
+	}
+
+	/**
+	 * Add all k-mers of a DNA sequence to a Bloom filter.
+	 * @param seq sequence whose k-mers are inserted
+	 * @param bloom target Bloom filter
+	 */
+	template <typename BloomT>
+	inline static void addKmersToBloom(const Sequence& seq, BloomT& bloom)
+	{
+		RollingHashIterator it(seq, bloom.getHashNum(), bloom.getKmerSize());
+		for (; it != RollingHashIterator::end(); ++it)
+			bloom.insert(*it);
+	}
+
+	/**
+	 * Translate a DNA sequence to an equivalent path in the
+	 * de Bruijn graph. K-mers containing non-ACGT characters are
+	 * skipped by the iterator and therefore omitted from the path.
+	 * @param seq sequence to translate (must be >= k bases)
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @return path of de Bruijn graph vertices, one per k-mer
+	 */
+	inline static Path<Vertex>
+	seqToPath(const Sequence& seq, unsigned k, unsigned numHashes)
+	{
+		Path<Vertex> path;
+		assert(seq.length() >= k);
+		for (RollingHashIterator it(seq, numHashes, k);
+			 it != RollingHashIterator::end(); ++it) {
+			path.push_back(Vertex(it.kmer().c_str(), it.rollingHash()));
+		}
+		return path;
+	}
+
+	/**
+	 * Translate a path in the de Bruijn graph to an equivalent
+	 * DNA sequence. Successive k-mers are overlaid at one-base
+	 * offsets; positions covered only by masked ('0') spaced-seed
+	 * columns remain 'N'.
+	 * @param path non-empty path of de Bruijn graph vertices
+	 * @param k k-mer size
+	 * @return reconstructed sequence of length path.size() + k - 1
+	 */
+	inline static Sequence pathToSeq(const Path<Vertex>& path, unsigned k)
+	{
+		assert(path.size() > 0);
+		assert(k > 0);
+
+		const std::string& spacedSeed = MaskedKmer::mask();
+		assert(spacedSeed.empty() || spacedSeed.length() == k);
+		Sequence seq;
+		seq.resize(path.size() + k - 1, 'N');
+
+		for (size_t i = 0; i < path.size(); ++i) {
+			std::string kmer(path.at(i).kmer().c_str());
+			for (size_t j = 0; j < k; ++j) {
+				/* only unmasked positions contribute bases */
+				if (spacedSeed.empty() || spacedSeed.at(j) == '1') {
+					/* overlapping k-mers should agree; a mismatch
+					 * indicates an inconsistent path and is reported
+					 * (the later k-mer's base wins) */
+					if (seq.at(i + j) != 'N' && seq.at(i + j) != kmer.at(j)) {
+						std::cerr
+							<< "warning: inconsistent DBG path detected "
+							"at position " << i + j << ": "
+							<< seq.substr(0, i + j)
+							<< " (orig base: '" << seq.at(i + j) << "'"
+							<< ", new base: '" << kmer.at(j) << "')"
+							<< std::endl;
+					}
+					seq.at(i + j) = kmer.at(j);
+				}
+			}
+		}
+
+		return seq;
+	}
+
+	/**
+	 * Results for the extension of a read segment.
+	 * Each instance represents a row in the trace file generated
+	 * by the '-T' option for abyss-bloom-dbg.
+	 */
+	struct SeqExtensionResult
+	{
+		/** FASTA ID for origin read */
+		std::string readId;
+		/**
+		 * Index of this segment within the read. (Prior to extension,
+		 * each read is split into segments at branching k-mers.)
+		 */
+		unsigned readSegmentId;
+		/** Total number of segments after splitting the read */
+		unsigned numReadSegments;
+		/** True if leftwards sequence extension was attempted */
+		bool extendedLeft;
+		/** True if rightwards sequence extension was attempted */
+		bool extendedRight;
+		/** Result code for attempted left sequence extension (e.g. DEAD END) */
+		PathExtensionResult leftExtensionResult;
+		/** Result code for attempted right sequence extension (e.g. DEAD END) */
+		PathExtensionResult rightExtensionResult;
+		/** Original length of the read segment prior to extension */
+		unsigned origLength;
+		/** length of left extension (bp) */
+		unsigned leftExtensionLength;
+		/** length of right extension (bp) */
+		unsigned rightExtensionLength;
+		/** total length of extended sequence (bp) */
+		unsigned extendedLength;
+		/**
+		 * True if the extended sequence was excluded from the output contigs
+		 * because it was redundant. (An identical sequence was generated
+		 * when extending a previous read.)
+		 */
+		bool redundantContig;
+		/** Contig ID assigned to extended segment */
+		size_t contigID;
+
+		/** Default constructor; sentinel max() values mean "unset" */
+		SeqExtensionResult() :
+			readId(),
+			readSegmentId(std::numeric_limits<unsigned>::max()),
+			numReadSegments(std::numeric_limits<unsigned>::max()),
+			extendedLeft(false),
+			extendedRight(false),
+			leftExtensionResult(DEAD_END),
+			rightExtensionResult(DEAD_END),
+			origLength(std::numeric_limits<unsigned>::max()),
+			leftExtensionLength(std::numeric_limits<unsigned>::max()),
+			rightExtensionLength(std::numeric_limits<unsigned>::max()),
+			extendedLength(std::numeric_limits<unsigned>::max()),
+			redundantContig(false),
+			contigID(std::numeric_limits<size_t>::max()) {}
+
+		/** Return true once every required field has been assigned */
+		bool initialized() const
+		{
+			return !readId.empty() &&
+				readSegmentId != std::numeric_limits<unsigned>::max() &&
+				numReadSegments != std::numeric_limits<unsigned>::max() &&
+				origLength != std::numeric_limits<unsigned>::max() &&
+				leftExtensionLength != std::numeric_limits<unsigned>::max() &&
+				rightExtensionLength != std::numeric_limits<unsigned>::max() &&
+				extendedLength != std::numeric_limits<unsigned>::max();
+		}
+
+		/** Print the trace-file column headers (tab-separated) */
+		static std::ostream& printHeaders(std::ostream& out)
+		{
+			out << "read_id\t"
+				<< "read_segment_id\t"
+				<< "num_read_segments\t"
+				<< "left_extension_result\t"
+				<< "right_extension_result\t"
+				<< "orig_length\t"
+				<< "left_extension_len\t"
+				<< "right_extension_len\t"
+				<< "extended_length\t"
+				<< "redundant_contig\t"
+				<< "contig_id\n";
+			return out;
+		}
+
+		/** Print one trace-file row; '-' marks not-applicable columns */
+		friend std::ostream& operator <<(std::ostream& out,
+			const SeqExtensionResult& o)
+		{
+			if (o.redundantContig) {
+				out << o.readId << '\t'
+					<< o.readSegmentId << '\t'
+					<< o.numReadSegments << '\t'
+					<< "-\t"
+					<< "-\t"
+					<< o.origLength << '\t'
+					<< "-\t"
+					<< "-\t"
+					<< "-\t"
+					<< "true" << '\t'
+					<< "-\n";
+			} else {
+				out << o.readId << '\t'
+					<< o.readSegmentId << '\t'
+					<< o.numReadSegments << '\t';
+				if (o.extendedLeft)
+					out << pathExtensionResultStr(o.leftExtensionResult) << '\t';
+				else
+					out << "-\t";
+				if (o.extendedRight)
+					out << pathExtensionResultStr(o.rightExtensionResult) << '\t';
+				else
+					out << "-\t";
+				out << o.origLength << '\t';
+				if (o.extendedLeft)
+					out << o.leftExtensionLength << '\t';
+				else
+					out << "-\t";
+				if (o.extendedRight)
+					out << o.rightExtensionLength << '\t';
+				else
+					out << "-\t";
+				out << o.extendedLength << '\t'
+					<< "false" << '\t'
+					<< o.contigID << '\n';
+			}
+			return out;
+		}
+	};
+
+	/**
+	 * Extend a sequence left (REVERSE) or right (FORWARD) within the de Bruijn
+	 * graph until either a branching point or a dead-end is encountered.
+	 * @param seq sequence to extend; replaced in place with the
+	 * extended version
+	 * @param dir direction of extension (FORWARD or REVERSE)
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @param minBranchLen minimum path length for a branch to be
+	 * considered real (shorter branches are treated as noise)
+	 * @param graph the Bloom filter de Bruijn graph
+	 * @return result code describing why extension stopped
+	 */
+	template <typename GraphT>
+	inline static PathExtensionResult extendSeq(Sequence& seq, Direction dir,
+		unsigned k, unsigned numHashes, unsigned minBranchLen, const GraphT& graph)
+	{
+		assert(seq.length() >= k);
+
+		/* Convert sequence to path in DBG */
+		Path<Vertex> path = seqToPath(seq, k, numHashes);
+
+		/* Extend path */
+		ExtendPathParams params;
+		params.trimLen = minBranchLen - 1;
+		params.maxLen = NO_LIMIT;
+		PathExtensionResult result =
+			extendPath(path, dir, graph, params);
+
+		/* Convert extended path back to sequence */
+		Sequence extendedSeq = pathToSeq(path, k);
+
+		/*
+		 * If a spaced seed is in effect, short paths may result in
+		 * sequences containing 'N's.  However, since we only extend
+		 * "perfect reads", we can replace the 'N's with the correct
+		 * bases by overlaying the seed sequence.
+		 */
+		if (dir == FORWARD) {
+			overlaySeq(seq, extendedSeq, 0);
+		} else {
+			assert(dir == REVERSE);
+			overlaySeq(seq, extendedSeq, extendedSeq.length() - seq.length());
+		}
+
+		/*
+		 * Replace orig seq with extended version.
+		 */
+		seq = extendedSeq;
+
+		/* Return the path extension result code (not a bool) */
+		return result;
+	}
+
+
+	/**
+	 * Counters for tracking assembly statistics and producing
+	 * progress messages.
+	 */
+	struct AssemblyCounters
+	{
+		/* number of reads successfully extended into contigs */
+		size_t readsExtended;
+		/* total number of reads examined so far */
+		size_t readsProcessed;
+		/* total bases in contigs output so far */
+		size_t basesAssembled;
+		/* next contig ID to assign (also a count of contigs output) */
+		size_t contigID;
+
+		/* all counters start at zero */
+		AssemblyCounters() : readsExtended(0), readsProcessed(0),
+			basesAssembled(0), contigID(0) {}
+	};
+
+	/** Print an intermediate progress message during assembly.
+	 * Takes the counters by value so the snapshot printed is
+	 * internally consistent even if other threads keep updating.
+	 * NOTE(review): if readsProcessed is 0 the percentage is a float
+	 * division by zero (prints "inf"/"nan") -- callers appear to only
+	 * invoke this after processing at least one read; verify. */
+	void printProgressMessage(AssemblyCounters counters)
+	{
+#pragma omp critical(cerr)
+		std::cerr
+			<< "Extended " << counters.readsExtended
+			<< " of " << counters.readsProcessed
+			<< " reads (" << std::setprecision(3) << (float)100
+			* counters.readsExtended / counters.readsProcessed
+			<< "%), assembled " << counters.basesAssembled
+			<< " bp so far" << std::endl;
+	}
+
+	/**
+	 * Split a path at branching k-mers (degree > 2).
+	 * A branching vertex ends the current sub-path and also begins
+	 * the next one, so branch vertices are shared between the
+	 * resulting sub-paths.
+	 *
+	 * @param path path to split
+	 * @param dbg the de Bruijn graph
+	 * @param minBranchLen minimum length of a "true" branch (shorter
+	 * branches are assumed to be Bloom filter false positives)
+	 * @return list of sub-paths (always at least one)
+	 */
+	template <typename GraphT>
+	inline static std::vector<
+		Path<typename boost::graph_traits<GraphT>::vertex_descriptor> >
+	splitPath(const Path<typename boost::graph_traits<GraphT>::vertex_descriptor>& path,
+		const GraphT& dbg, unsigned minBranchLen)
+	{
+		assert(path.size() > 0);
+
+		typedef typename boost::graph_traits<GraphT>::vertex_descriptor V;
+		typedef typename Path<V>::const_iterator PathIt;
+
+		std::vector< Path<V> > splitPaths;
+		Path<V> currentPath;
+		for (PathIt it = path.begin(); it != path.end(); ++it) {
+			currentPath.push_back(*it);
+			/* count incoming/outgoing branches that are long enough
+			 * to be considered real (not false positives) */
+			unsigned inDegree =
+				trueBranches(*it, REVERSE, dbg, minBranchLen).size();
+			unsigned outDegree =
+				trueBranches(*it, FORWARD, dbg, minBranchLen).size();
+			if (inDegree > 1 || outDegree > 1) {
+				/* we've hit a branching point -- end the current
+				 * path and start a new one */
+				splitPaths.push_back(currentPath);
+				currentPath.clear();
+				currentPath.push_back(*it);
+			}
+		}
+		/* flush trailing sub-path (or the whole path if no branch was hit) */
+		if (currentPath.size() > 1 || splitPaths.empty())
+			splitPaths.push_back(currentPath);
+
+		assert(splitPaths.size() >= 1);
+		return splitPaths;
+	}
+
+	/**
+	 * Split a sequence at branching k-mers (degree > 2).
+	 * Branching k-mers are shared between the resulting sequence
+	 * segments.
+	 *
+	 * @param seq sequence to split
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @param dbg the de Bruijn graph
+	 * @param minBranchLen minimum length of a "true" branch
+	 * @return list of sequence segments (always at least one)
+	 *
+	 * NOTE(review): this function passes minBranchLen-1 to
+	 * trueBranches() whereas splitPath() passes minBranchLen --
+	 * confirm the off-by-one difference is intentional.
+	 */
+	template <typename GraphT>
+	inline static std::vector<Sequence>
+	splitSeq(const Sequence& seq, unsigned k, unsigned numHashes,
+		const GraphT& dbg, unsigned minBranchLen)
+	{
+		assert(seq.length() >= k);
+
+		typedef typename boost::graph_traits<GraphT>::vertex_descriptor V;
+		typedef typename Path<V>::const_iterator PathIt;
+
+		std::vector<Sequence> segments;
+		Path<V> path = seqToPath(seq, k, numHashes);
+		PathIt start = path.begin();
+		PathIt end = path.begin();
+
+		for (; end != path.end(); ++end) {
+			std::vector<V> inBranches = trueBranches(*end, REVERSE, dbg,
+				minBranchLen-1);
+			unsigned inDegree = inBranches.size();
+			/*
+			 * Tricky: Include the read itself in the list of valid
+			 * incoming branches, even if it is shorter than trimLen.
+			 */
+			if (end > path.begin() && std::find(inBranches.begin(),
+				inBranches.end(), *(end - 1)) == inBranches.end()) {
+				inDegree++;
+			}
+			std::vector<V> outBranches = trueBranches(*end, FORWARD, dbg,
+				minBranchLen-1);
+			unsigned outDegree = outBranches.size();
+			/*
+			 * Tricky: Include the read itself in the list of valid
+			 * outgoing branches, even if it is shorter than trimLen.
+			 */
+			if (end < path.end() - 1 && std::find(outBranches.begin(),
+				outBranches.end(), *(end + 1)) == outBranches.end()) {
+				outDegree++;
+			}
+			if (inDegree > 1 || outDegree > 1) {
+				/* we've hit a branching point -- end the current
+				 * segment and start a new one */
+				Sequence segment = seq.substr(start - path.begin(),
+					end - start + k);
+				segments.push_back(segment);
+				start = end;
+			}
+		}
+		/* flush final segment, unless it would duplicate a lone
+		 * branching k-mer already emitted as the previous segment's end */
+		if (segments.empty() || segments.back().length() > k) {
+			Sequence segment = seq.substr(start - path.begin(),
+				end - start + k);
+			segments.push_back(segment);
+		}
+
+		assert(segments.size() >= 1);
+		return segments;
+	}
+
+	/**
+	 * Trim a sequence down to the longest contiguous subsequence
+	 * of "good" k-mers.  If the sequence has length < k or contains
+	 * no good k-mers, the trimmed sequence will be the empty string.
+	 *
+	 * @param seq the DNA sequence to be trimmed (modified in place)
+	 * @param goodKmerSet Bloom filter containing "good" k-mers
+	 */
+	template <typename BloomT>
+	static inline void trimSeq(Sequence& seq, const BloomT& goodKmerSet)
+	{
+		const unsigned k = goodKmerSet.getKmerSize();
+		const unsigned numHashes = goodKmerSet.getHashNum();
+
+		/* too short to contain any k-mer at all */
+		if (seq.length() < k) {
+			seq.clear();
+			return;
+		}
+
+		/* sentinel meaning "no position recorded yet" */
+		const unsigned UNSET = UINT_MAX;
+		unsigned prevPos = UNSET;
+		unsigned matchStart = UNSET;
+		unsigned matchLen = 0;
+		unsigned maxMatchStart = UNSET;
+		unsigned maxMatchLen = 0;
+
+		/* note: RollingHashIterator skips over k-mer
+		 * positions with non-ACGT chars */
+		for (RollingHashIterator it(seq, numHashes, k);
+			it != RollingHashIterator::end(); prevPos=it.pos(),++it) {
+			/* a gap in positions (>1) means the iterator skipped a
+			 * non-ACGT char, which also breaks the current run */
+			if (!goodKmerSet.contains(*it) ||
+				(prevPos != UNSET && it.pos() - prevPos > 1)) {
+				/* end any previous match */
+				if (matchStart != UNSET && matchLen > maxMatchLen) {
+					maxMatchLen = matchLen;
+					maxMatchStart = matchStart;
+				}
+				matchStart = UNSET;
+				matchLen = 0;
+			}
+			if (goodKmerSet.contains(*it)) {
+				/* initiate or extend match */
+				if (matchStart == UNSET)
+					matchStart = it.pos();
+				matchLen++;
+			}
+		}
+		/* handles case last match extends to end of seq */
+		if (matchStart != UNSET && matchLen > maxMatchLen) {
+			maxMatchLen = matchLen;
+			maxMatchStart = matchStart;
+		}
+		/* if there were no matching k-mers */
+		if (maxMatchLen == 0) {
+			seq.clear();
+			return;
+		}
+		/* trim read down to longest matching subseq; a run of L
+		 * consecutive k-mers spans L + k - 1 bases */
+		seq = seq.substr(maxMatchStart, maxMatchLen + k - 1);
+	}
+
+	/**
+	 * Ensure that branching k-mers are not repeated in the output
+	 * contigs by selectively trimming contig ends.
+	 *
+	 * The idea is to keep a branch k-mer if the edge leading to it
+	 * is unambiguous. For example, in the diagram below the contig
+	 * generated from the right side would include the branching k-mer
+	 * k5, whereas the two contigs entering on the left would discard it:
+	 *
+	 * ...-k1-k2
+	 *          \
+	 *           k5-k6-...
+	 *          /
+	 * ...-k3-k4
+	 *
+	 * @param seq the contig to trim (modified in place)
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @param minBranchLen minimum length of a "true" branch (shorter
+	 * branches are assumed to be caused by sequencing errors or
+	 * Bloom filter false positives).
+	 */
+	template <typename GraphT>
+		inline static void trimBranchKmers(Sequence& seq,
+			unsigned k, unsigned numHashes, unsigned minBranchLen,
+			const GraphT& dbg)
+	{
+		assert(seq.length() >= k);
+
+		/* a single k-mer cannot be trimmed any further */
+		if (seq.length() == k)
+			return;
+
+		/* drop the first base if the first k-mer has ambiguous
+		 * outgoing edges (it belongs to the branch contig instead) */
+		Sequence firstKmer = seq.substr(0, k);
+		Vertex vFirst(firstKmer.c_str(), RollingHash(firstKmer, numHashes, k));
+		unsigned outDegree = trueDegree(vFirst, FORWARD, dbg, minBranchLen - 1);
+		if (outDegree > 1)
+			seq.erase(0, 1);
+
+		if (seq.length() == k)
+			return;
+
+		/* symmetrically, drop the last base if the last k-mer has
+		 * ambiguous incoming edges */
+		Sequence lastKmer = seq.substr(seq.length()-k);
+		Vertex vLast(lastKmer.c_str(), RollingHash(lastKmer, numHashes, k));
+		unsigned inDegree = trueDegree(vLast, REVERSE, dbg, minBranchLen - 1);
+		if (inDegree > 1)
+			seq.erase(seq.length()-1, 1);
+	}
+
+	/**
+	 * Append a contig to the output FASTA stream.
+	 *
+	 * @param seq contig sequence (written in canonical orientation,
+	 * i.e. the lexicographically smaller of seq and its reverse
+	 * complement)
+	 * @param contigID numeric ID used as the FASTA record ID
+	 * @param readID ID of the read the contig was extended from
+	 * (recorded in the FASTA comment)
+	 * @param k k-mer size (used only to sanity-check seq length)
+	 * @param out output FASTA stream
+	 */
+    inline static void printContig(const Sequence& seq,
+		size_t contigID, const std::string& readID, unsigned k,
+		std::ostream& out)
+	{
+		assert(seq.length() >= k);
+
+		FastaRecord contig;
+
+		/* set FASTA id */
+		std::ostringstream id;
+		id << contigID;
+
+		/* add FASTA comment indicating extended read id */
+		std::ostringstream comment;
+		comment << "read:" << readID;
+		assert(id.good());
+		contig.id = id.str();
+		contig.comment = comment.str();
+
+		/* set seq (in canonical orientation) */
+		Sequence rcSeq = reverseComplement(seq);
+		contig.seq = (seq < rcSeq) ? seq : rcSeq;
+
+		/* output FASTA record */
+		out << contig;
+		assert(out);
+	}
+
+	/**
+	 * Trim contiguous stretches of previously-assembled k-mers from
+	 * both ends of a contig.
+	 *
+	 * @param seq contig to be trimmed (modified in place)
+	 * @param assembledKmerSet Bloom filter of k-mers from previously
+	 * assembled contigs
+	 *
+	 * NOTE(review): the assert at the end implies seq must contain at
+	 * least one k-mer NOT in assembledKmerSet; the caller appears to
+	 * guarantee this by checking allKmersInBloom() first -- verify.
+	 */
+	template <typename BloomT>
+	inline static void trimContigOverlaps(Sequence &seq,
+		const BloomT& assembledKmerSet)
+	{
+		const unsigned k = assembledKmerSet.getKmerSize();
+		const unsigned numHashes = assembledKmerSet.getHashNum();
+
+		/* trim previously assembled k-mers from start of sequence */
+		RollingHashIterator fwd(seq, numHashes, k);
+		for (; fwd != RollingHashIterator::end(); ++fwd) {
+			if (!assembledKmerSet.contains(*fwd))
+				break;
+		}
+		if (fwd.pos() > 0)
+			seq.erase(0, fwd.pos());
+
+		/* trim previously assembled k-mers from end of sequence
+		 * by scanning the reverse complement from its start */
+		Sequence rcSeq = reverseComplement(seq);
+		RollingHashIterator rev(rcSeq, numHashes, k);
+		for (; rev != RollingHashIterator::end(); ++rev) {
+			if (!assembledKmerSet.contains(*rev))
+				break;
+		}
+		if (rev.pos() > 0)
+			rcSeq.erase(0, rev.pos());
+
+		/* flip seq back to original orientation */
+		seq = reverseComplement(rcSeq);
+
+		assert(seq.length() >= k);
+	}
+
+	/**
+	 * Split a read at branching points in the de Bruijn graph and
+	 * then extend each segment left and right, up to the next
+	 * branching point or dead end.
+	 *
+	 * @param read read to be assembled
+	 * @param dbg Boost graph interface to de Bruijn graph
+	 * @param assembledKmerSet Bloom filter containing k-mers of
+	 * previously assembled contigs
+	 * @param params command line options for the assembly
+	 * (e.g. k-mer coverage threshold)
+	 * @param counters counter variables used for generating assembly
+	 * progress messages.
+	 * @param out output stream for contigs
+	 * @param traceOut output stream for trace file (-T option)
+	 */
+	template <typename GraphT, typename BloomT>
+	inline static void extendRead(const FastaRecord& read,
+		const GraphT& dbg, BloomT& assembledKmerSet,
+		const AssemblyParams& params, AssemblyCounters& counters,
+		std::ostream& out, std::ostream& traceOut)
+	{
+		const unsigned k = params.k;
+		const unsigned numHashes = params.numHashes;
+		/* branches of length <= trim are considered false positives */
+		const unsigned minBranchLen = params.trim + 1;
+
+		if (params.verbose >= 2) {
+#pragma omp critical(cerr)
+			std::cerr << "Extending read: " << read.id << std::endl;
+		}
+
+		/* split read at branching points (prevents over-assembly) */
+		std::vector<Sequence> segments = splitSeq(read.seq, k,
+			numHashes, dbg, minBranchLen);
+
+		for (std::vector<Sequence>::iterator it = segments.begin();
+			 it != segments.end(); ++it) {
+
+			Sequence& seq = *it;
+
+			/*
+			 * track results of sequence extension attempt for
+			 * trace file ('-T' option).
+			 */
+			SeqExtensionResult traceResult;
+			traceResult.readId = read.id;
+			traceResult.readSegmentId = it - segments.begin() + 1;
+			traceResult.numReadSegments = segments.size();
+			traceResult.origLength = seq.length();
+			traceResult.leftExtensionLength = 0;
+			traceResult.rightExtensionLength = 0;
+			/* assume redundant until the contig is actually output */
+			traceResult.redundantContig = true;
+
+			/*
+			 * extend first and last segments only, since
+			 * internal segments are bounded by branching
+			 * points.
+			 */
+			if (it == segments.begin()) {
+				traceResult.extendedLeft = true;
+				traceResult.leftExtensionResult = extendSeq(seq,
+					REVERSE, k, numHashes, minBranchLen, dbg);
+				traceResult.leftExtensionLength =
+					seq.length() - traceResult.origLength;
+			}
+			if (it == segments.end() - 1) {
+				/* measure from the post-left-extension length, since
+				 * a single segment may be extended in both directions */
+				unsigned origLength = seq.length();
+				traceResult.extendedRight = true;
+				traceResult.rightExtensionResult = extendSeq(seq,
+					FORWARD, k, numHashes, minBranchLen, dbg);
+				traceResult.rightExtensionLength =
+					seq.length() - origLength;
+			}
+			traceResult.extendedLength = seq.length();
+
+			/* ensure branching k-mers are included only once in output */
+			trimBranchKmers(seq, k, numHashes, minBranchLen, dbg);
+
+			/*
+			 * check assembledKmerSet again to prevent race
+			 * condition. (Otherwise, the same contig may be
+			 * generated multiple times.)
+			 */
+#pragma omp critical(assembledKmerSet)
+			if (!allKmersInBloom(seq, assembledKmerSet)) {
+
+				/* trim previously assembled k-mers from both ends */
+				trimContigOverlaps(seq, assembledKmerSet);
+
+				/* mark remaining k-mers as assembled */
+				addKmersToBloom(seq, assembledKmerSet);
+
+				/* add contig to output FASTA */
+				printContig(seq, counters.contigID, read.id, k, out);
+
+				/* update counters / trace results */
+				traceResult.redundantContig = false;
+				traceResult.contigID = counters.contigID;
+				counters.basesAssembled += seq.length();
+				counters.contigID++;
+			}
+
+			/* trace file output ('-T' option) */
+#pragma omp critical(traceOut)
+			if (!params.tracePath.empty()) {
+				assert(traceResult.initialized());
+				traceOut << traceResult;
+				assert_good(traceOut, params.tracePath);
+			}
+
+		}  /* for each read segment */
+	}
+
+	/**
+	 * Perform a Bloom-filter-based de Bruijn graph assembly.
+	 * Contigs are generated by extending reads left/right within
+	 * the de Bruijn graph, up to the next branching point or dead end.
+	 * Short branches due to Bloom filter false positives are
+	 * ignored.
+	 *
+	 * @param argc number of input FASTA files
+	 * @param argv array of input FASTA filenames
+	 * @param goodKmerSet Bloom filter containing k-mers that
+	 * occur more than once in the input data
+	 * @param params encapsulates the command line options for the
+	 * assembly (k-mer size, verbosity, trace path, etc.)
+	 * @param out output stream for contigs (FASTA)
+	 */
+	template <typename BloomT>
+	inline static void assemble(int argc, char** argv, const BloomT& goodKmerSet,
+		const AssemblyParams& params, std::ostream& out)
+	{
+		assert(params.initialized());
+
+		/* per-thread I/O buffer (size is in bases) */
+		const size_t SEQ_BUFFER_SIZE = 1000000;
+
+		/* print progress message after processing this many reads */
+		const unsigned progressStep = 1000;
+		const unsigned k = goodKmerSet.getKmerSize();
+
+		/* trace file output ('-T' option) */
+		std::ofstream traceOut;
+		if (!params.tracePath.empty()) {
+			traceOut.open(params.tracePath.c_str());
+			assert_good(traceOut, params.tracePath);
+			SeqExtensionResult::printHeaders(traceOut);
+			assert_good(traceOut, params.tracePath);
+		}
+
+		/* k-mers in previously assembled contigs */
+		BloomFilter assembledKmerSet(goodKmerSet.size(),
+			goodKmerSet.getHashNum(), goodKmerSet.getKmerSize());
+		/* counters for progress messages */
+		AssemblyCounters counters;
+
+		/* Boost graph API over Bloom filter */
+		RollingBloomDBG<BloomT> graph(goodKmerSet);
+
+		if (params.verbose)
+			std::cerr << "Trimming branches " << params.trim
+				<< " k-mers or shorter" << std::endl;
+
+		FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
+#pragma omp parallel
+		for (std::vector<FastaRecord> buffer;;) {
+
+			/* read sequences in batches to reduce I/O contention */
+			buffer.clear();
+			size_t bufferSize;
+			bool good = true;
+#pragma omp critical(in)
+			for (bufferSize = 0; bufferSize < SEQ_BUFFER_SIZE;) {
+				FastaRecord rec;
+				good = in >> rec;
+				if (!good)
+					break;
+				buffer.push_back(rec);
+				bufferSize += rec.seq.length();
+			}
+			/* empty batch => input exhausted; this thread is done */
+			if (buffer.size() == 0)
+				break;
+
+			for (std::vector<FastaRecord>::iterator it = buffer.begin();
+				 it != buffer.end(); ++it) {
+
+				const FastaRecord& rec = *it;
+				bool skip = false;
+
+				/* we can't extend reads shorter than k */
+				if (rec.seq.length() < k)
+					skip = true;
+
+				/* only extend error-free reads */
+				if (!skip && !allKmersInBloom(rec.seq, goodKmerSet))
+					skip = true;
+
+				/* skip reads in previously assembled regions */
+				if (!skip && allKmersInBloom(rec.seq, assembledKmerSet))
+					skip = true;
+
+				/* extend the read left and right within the DBG */
+				if (!skip) {
+					extendRead(rec, graph, assembledKmerSet, params,
+						counters, out, traceOut);
+#pragma omp atomic
+					counters.readsExtended++;
+				}
+
+#pragma omp atomic
+				counters.readsProcessed++;
+				/* NOTE(review): counters are read here without
+				 * synchronization, so the printed snapshot may be
+				 * slightly stale -- acceptable for progress output */
+				if (params.verbose && counters.readsProcessed % progressStep == 0)
+					printProgressMessage(counters);
+
+			} /* for each read */
+
+		} /* for each batch of reads (parallel) */
+
+		assert(in.eof());
+		if (!params.tracePath.empty()) {
+			traceOut.close();
+			assert_good(traceOut, params.tracePath);
+		}
+
+		if (params.verbose) {
+			printProgressMessage(counters);
+			std::cerr << "Assembly complete" << std::endl;
+		}
+	}
+
+	/**
+	 * Visitor class that outputs visited nodes/edges in GraphViz format during
+	 * a breadth first traversal. An instance of this class may be passed
+	 * as an argument to the `breadthFirstSearch` function.
+	 */
+	template <typename GraphT>
+	class GraphvizBFSVisitor
+	{
+		typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
+		typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
+
+	public:
+
+		/** Constructor; opens the GraphViz digraph on `out` */
+		GraphvizBFSVisitor(std::ostream& out) :
+			m_out(out), m_nodesVisited(0), m_edgesVisited(0)
+		{
+			/* start directed graph (GraphViz) */
+			m_out << "digraph g {\n";
+		}
+
+		/** Destructor; closes the GraphViz digraph */
+		~GraphvizBFSVisitor()
+		{
+			/* end directed graph (GraphViz) */
+			m_out << "}\n";
+		}
+
+		/** Invoked when a vertex is initialized */
+		void initialize_vertex(const VertexT&, const GraphT&) {}
+
+		/** Invoked when a vertex is visited for the first time */
+		void discover_vertex(const VertexT& v, const GraphT&)
+		{
+			++m_nodesVisited;
+			/* declare vertex (GraphViz); the node id is the k-mer string */
+			m_out << '\t' << v.kmer().c_str() << ";\n";
+		}
+
+		/** Invoked each time a vertex is visited */
+		void examine_vertex(const VertexT&, const GraphT&) {}
+
+		/**
+		 * Invoked when all of a vertex's outgoing edges have been
+		 * traversed.
+		 */
+		void finish_vertex(const VertexT&, const GraphT&) {}
+
+		/**
+		 * Invoked when an edge is traversed. (Each edge
+		 * in the graph is traversed exactly once.)
+		 */
+		void examine_edge(const EdgeT& e, const GraphT& g)
+		{
+			++m_edgesVisited;
+			const VertexT& u = source(e, g);
+			const VertexT& v = target(e, g);
+
+			/* declare edge (GraphViz) */
+			m_out << '\t' << u.kmer().c_str() << " -> "
+				<< v.kmer().c_str() << ";\n";
+		}
+
+		/**
+		 * Invoked when an edge is traversed to a "gray" vertex.
+		 * A vertex is gray when some but not all of its outgoing edges
+		 * have been traversed.
+		 */
+		void gray_target(const EdgeT&, const GraphT&) {}
+
+		/**
+		 * Invoked when an edge is traversed to a "black" vertex.
+		 * A vertex is black when all of its outgoing edges have
+		 * been traversed.
+		 */
+		void black_target(const EdgeT&, const GraphT&) {}
+
+		/**
+		 * Invoked when an edge is traversed to a "gray" or
+		 * "black" vertex.
+		 */
+		void non_tree_edge(const EdgeT&, const GraphT&) {}
+
+		/**
+		 * Invoked when an edge is traversed to a "white" vertex.
+		 * A vertex is white if it is previously unvisited.
+		 */
+		void tree_edge(const EdgeT&, const GraphT&) {}
+
+		/** Return number of distinct nodes visited */
+		size_t getNumNodesVisited() const
+		{
+			return m_nodesVisited;
+		}
+
+		/** Get number of distinct edges visited */
+		size_t getNumEdgesVisited() const
+		{
+			return m_edgesVisited;
+		}
+
+	protected:
+
+		/** output stream for GraphViz serialization */
+		std::ostream& m_out;
+		/** number of nodes visited so far */
+		size_t m_nodesVisited;
+		/** number of edges visited so far */
+		size_t m_edgesVisited;
+	};
+
+	/**
+	 * Output a GraphViz serialization of the de Bruijn graph
+	 * using FASTA files and a Bloom filter as input.
+	 *
+	 * @param argc number of input FASTA files
+	 * @param argv array of input FASTA filenames
+	 * @param kmerSet Bloom filter containing valid k-mers
+	 * @param params command line options (k-mer size, verbosity, ...)
+	 * @param out output stream for GraphViz serialization
+	 */
+	template <typename BloomT>
+	static inline void outputGraph(int argc, char** argv,
+		const BloomT& kmerSet, const AssemblyParams& params,
+		std::ostream& out)
+	{
+		assert(params.initialized());
+
+		typedef RollingBloomDBG<BloomT> GraphT;
+
+		/* interval for progress messages */
+		const unsigned progressStep = 1000;
+		const unsigned k = kmerSet.getKmerSize();
+		const unsigned numHashes = kmerSet.getHashNum();
+
+		/* counter for progress messages */
+		size_t readsProcessed = 0;
+
+		/* Boost graph API over rolling hash Bloom filter */
+		GraphT dbg(kmerSet);
+
+		/* Marks visited nodes in breadth-first traversal;
+		 * shared across reads so each node is emitted only once */
+		DefaultColorMap<GraphT> colorMap;
+
+		/* BFS Visitor -- generates GraphViz output as nodes
+		 * and edges are traversed. */
+		GraphvizBFSVisitor<GraphT> visitor(out);
+
+		if (params.verbose)
+			std::cerr << "Generating GraphViz output..." << std::endl;
+
+		FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
+		for (FastaRecord rec;;) {
+			bool good;
+			good = in >> rec;
+			if (!good)
+				break;
+			Sequence& seq = rec.seq;
+
+			/* Trim down to longest subsequence of "good" k-mers */
+			trimSeq(seq, kmerSet);
+			if (seq.length() > 0) {
+
+				/* BFS traversal in forward dir */
+				std::string startKmer = seq.substr(0, k);
+				Vertex start(startKmer.c_str(),
+					RollingHash(startKmer, numHashes, k));
+				breadthFirstSearch(dbg, start, visitor, colorMap);
+
+				/* BFS traversal in reverse dir */
+				Sequence rcSeq = reverseComplement(seq);
+				std::string rcStartKmer = rcSeq.substr(0, k);
+				Vertex rcStart(rcStartKmer.c_str(),
+					RollingHash(rcStartKmer, numHashes, k));
+				breadthFirstSearch(dbg, rcStart, visitor, colorMap);
+
+			}
+
+			if (++readsProcessed % progressStep == 0 && params.verbose) {
+				std::cerr << "processed " << readsProcessed
+					<< " (k-mers visited: " << visitor.getNumNodesVisited()
+					<< ", edges visited: " << visitor.getNumEdgesVisited()
+					<< ")" << std::endl;
+			}
+		}
+		assert(in.eof());
+		if (params.verbose) {
+			std::cerr << "processed " << readsProcessed
+				<< " reads (k-mers visited: " << visitor.getNumNodesVisited()
+				<< ", edges visited: " << visitor.getNumEdgesVisited()
+				<< ")" << std::endl;
+			std::cerr <<  "GraphViz generation complete" << std::endl;
+		}
+	}
+
+	/**
+	 * Write a single block of a 'variableStep' WIG file.
+	 *
+	 * @param chr chromosome name
+	 * @param start start coordinate of block (1-based, per WIG spec)
+	 * @param length length of block
+	 * @param val value of block
+	 * @param out output stream for WIG file
+	 * @param outPath path for output WIG file (for error reporting)
+	 */
+	static inline void outputWigBlock(const std::string& chr, size_t start,
+		size_t length, unsigned val, ostream& out, const std::string& outPath)
+	{
+		assert(length > 0);
+		out << "variableStep chrom=" << chr
+			<< " span=" << length << "\n";
+		out << start << ' ' << val << '\n';
+		assert_good(out, outPath);
+	}
+
+	/**
+	 * Write a WIG file for a reference genome, using the values 0 and 1
+	 * to indicate whether or not a given k-mer had sufficient coverage
+	 * in the reads to exceed the minimum coverage threshold.
+	 *
+	 * @param goodKmerSet Bloom filter of k-mers that exceed the
+	 * minimum coverage threshold
+	 * @param params encapsulates all command line options for the
+	 * assembly, including the reference genome and the output path
+	 * for the WIG file.
+	 *
+	 * NOTE(review): RollingHashIterator skips positions containing
+	 * non-ACGT chars, but blockLength is incremented by 1 per k-mer;
+	 * across skipped regions the emitted WIG coordinates could drift
+	 * -- verify behavior on references containing 'N's.
+	 */
+    template <class BloomT>
+	static inline void writeCovTrack(const BloomT& goodKmerSet,
+		const AssemblyParams& params)
+	{
+		assert(!params.covTrackPath.empty());
+		assert(!params.refPath.empty());
+
+		const unsigned k = goodKmerSet.getKmerSize();
+		const unsigned numHashes = goodKmerSet.getHashNum();
+
+		std::ofstream covTrack(params.covTrackPath.c_str());
+		assert_good(covTrack, params.covTrackPath);
+
+		if (params.verbose)
+			std::cerr << "Writing 0/1 k-mer coverage track for `"
+				<< params.refPath << "` to `"
+				<< params.covTrackPath << "`" << std::endl;
+
+		FastaReader ref(params.refPath.c_str(), FastaReader::FOLD_CASE);
+		for (FastaRecord rec; ref >> rec;) {
+			std::string chr = rec.id;
+			/* run-length encode consecutive k-mers with equal 0/1 value */
+			bool firstVal = true;
+			size_t blockStart = 1;
+			size_t blockLength = 0;
+			uint8_t blockVal = 0;
+			for (RollingHashIterator it(rec.seq, numHashes, k);
+				 it != RollingHashIterator::end(); ++it) {
+				uint8_t val = goodKmerSet.contains(*it) ? 1 : 0;
+				if (firstVal) {
+					firstVal = false;
+					/* WIG standard uses 1-based coords */
+					blockStart = it.pos() + 1;
+					blockLength = 1;
+					blockVal = val;
+				} else if (val != blockVal) {
+					assert(firstVal == false);
+					/* value changed: flush block, start a new one */
+					outputWigBlock(chr, blockStart, blockLength, blockVal,
+						covTrack, params.covTrackPath);
+					/* WIG standard uses 1-based coords */
+					blockStart = it.pos() + 1;
+					blockLength = 1;
+					blockVal = val;
+				} else {
+					blockLength++;
+				}
+			}
+			/* output last block */
+			if (blockLength > 0) {
+				outputWigBlock(chr, blockStart, blockLength, blockVal,
+					covTrack, params.covTrackPath);
+			}
+		}
+		assert(ref.eof());
+
+		assert_good(covTrack, params.covTrackPath);
+		covTrack.close();
+	}
+
+} /* BloomDBG namespace */
+
+#endif
diff --git a/COPYRIGHT b/COPYRIGHT
index 0c4d7a2..8dcf6fd 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -3,14 +3,10 @@ Upstream-Name: ABySS
 Upstream-Contact: Shaun Jackman <sjackman at gmail.com>
 Source: https://github.com/bcgsc/abyss
 
-License: GPL-NC-3+
- You may use, redistribute and modify this software for non-commercial
- purposes under the terms of the GNU General Public License as
- published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
- .
- To license ABySS for commercial purposes, please contact
- Patrick Rebstein <prebstein at bccancer.bc.ca>
+License: GPL-3
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, version 3.
  .
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -18,17 +14,26 @@ License: GPL-NC-3+
  GNU General Public License for more details.
  .
  You should have received a copy of the GNU General Public License
- along with this software. If not, see <http://www.gnu.org/licenses/>.
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
  .
- Debian may redistribute this software package.
+ For commercial licensing options, please contact
+ Patrick Rebstein <prebstein at bccancer.bc.ca>
 
 Files: *
-Copyright: Copyright 2014 Canada's Michael Smith Genome Sciences Centre
-License: GPL-NC-3+
+Copyright: Copyright 2016 British Columbia Cancer Agency Branch
+License: GPL-3
 
-Files: Common/* DataLayer/* DistanceEst/* FMIndex/* Map/* ParseAligns/*
-Copyright: Copyright 2014 Canada's Michael Smith Genome Sciences Centre
-License: GPL-3+
+Files: Layout/*
+Copyright: Copyright 2012 Shaun Jackman
+License: GPL-3
+
+Files: lib/bloomfilter/*
+Copyright: Copyright 2016 Justin Chu
+License: GPL-3
+
+Files: lib/rolling-hash/*
+Copyright: Copyright 2016 Hamid Mohamadi
+License: GPL-3
 
 Files: Common/cholesky.hpp
 Copyright: Copyright 2005 Gunter Winkler, Konstantin Kutzkow
@@ -38,10 +43,6 @@ Files: Common/city.cc Common/city.h
 Copyright: Copyright 2011 Google, Inc.
 License: Expat
 
-Files: Layout/*
-Copyright: Copyright 2012 Shaun Jackman
-License: GPL-3+
-
 Files: dialign/*
 Copyright: Copyright 2008 Amarendran R. Subramanian
 License: LGPL-2.1+
@@ -127,22 +128,31 @@ License: Expat
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  OTHER DEALINGS IN THE SOFTWARE.
 
-License: GPL-3+
- This program is free software; you can redistribute it
- and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later
- version.
- .
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- PURPOSE. See the GNU General Public License for more
- details.
- .
- You should have received a copy of the GNU General Public License
- along with this software. If not, see <http://www.gnu.org/licenses/>.
+Files: lib/gtest-*/*
+Copyright: Copyright 2008 Google Inc.
+License: BSD-3-clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
  .
- On Debian systems, the full text of the GNU General Public
- License version 3 can be found in the file
- `/usr/share/common-licenses/GPL-3'.
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ChangeLog b/ChangeLog
index 8316e1c..368e132 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,60 @@
+2016-09-14  Ben Vandervalk <benv at bcgsc.ca>
+
+	* Release version 2.0.1
+	* Resolve licensing issues by switching to standard GPL-3 license
+
+2016-08-30  Ben Vandervalk <benv at bcgsc.ca>
+
+	* Release version 2.0.0
+	* New Bloom filter mode for assembly => assemble large genomes
+	with minimal memory (e.g. 34G for H. sapiens)
+	* Update param defaults for modern Illumina data
+	* Make sqlite3 an optional dependency
+
+	abyss-bloom:
+	* New 'compare' command for bitwise comparison of Bloom filters
+	(thanks to @bschiffthaler!)
+	* New 'kmers' command for printing k-mers that match a Bloom filter
+	(thanks to @bschiffthaler!)
+
+	abyss-bloom-dbg:
+	* New preunitig assembler that uses Bloom filter
+	* Add 'B' param (Bloom filter size) to 'abyss-pe' command to enable
+	Bloom filter mode
+	* See README.md and '--help' for further instructions
+
+	abyss-fatoagp:
+	* Mask scaftigs shorter than 50bp with 'N's (short scaftigs
+	were causing problems with NCBI submission)
+
+	abyss-pe:
+	* Update default parameter values for modern Illumina data
+	* Change 'l=k' => 'l=40'
+	* Change 's=200' => 's=1000'
+	* Change 'S=s' => 'S=1000-10000' (do a param sweep of 'S')
+	* Use 'DistanceEst --mean' for scaffolding stage, instead of
+	the default '--mle'
+
+	abyss-sealer:
+	* New '--max-gap-length' ('-G') option to replace unintuitive
+	'--max-frag'; use of '--max-frag' is now deprecated
+	* Require user to explicitly specify Bloom filter size (e.g.
+	'-b40G')
+	* Report false positive rate (FPR) when building/loading Bloom
+	filters
+	* Don't require input FASTQ files when using pre-built Bloom
+	filter files
+
+	konnector:
+	* Fix bug causing output read 2 file to be empty
+	* New percent sequence identity options ('-x' and '-X')
+	* New '--alt-paths-mode' option to output alternate connecting
+	paths between read pairs
+
+	README.md:
+	* Fixes to documentation of ABYSS and abyss-pe parameters
+	(thanks to @nsoranzo!)
+
 2015-05-28  Ben Vandervalk <benv at bcgsc.ca>
 
 	* Release version 1.9.0
diff --git a/Common/Kmer.h b/Common/Kmer.h
index f1a58a3..017f73c 100644
--- a/Common/Kmer.h
+++ b/Common/Kmer.h
@@ -109,13 +109,13 @@ class Kmer
 		return out << o.str();
 	}
 
-  private:
-	uint8_t shiftAppend(uint8_t base);
-	uint8_t shiftPrepend(uint8_t base);
-
 	uint8_t at(unsigned i) const;
 	void set(unsigned i, uint8_t base);
 
+  protected:
+	uint8_t shiftAppend(uint8_t base);
+	uint8_t shiftPrepend(uint8_t base);
+
 	static uint8_t leftShiftByte(char* pSeq,
 			unsigned byteNum, unsigned index, uint8_t base);
 	static uint8_t rightShiftByte(char* pSeq,
diff --git a/Common/Sequence.h b/Common/Sequence.h
index b9283b9..1c3a152 100644
--- a/Common/Sequence.h
+++ b/Common/Sequence.h
@@ -104,10 +104,10 @@ static inline bool ambiguityIsSubset(char a, char b)
  * @param maskNew output bases that have been changed or added
  * to target in lowercase.
  */
-static inline void overlaySeq(Sequence& overlay, Sequence& target,
+static inline void overlaySeq(const Sequence& overlay, Sequence& target,
 	int shift, bool maskNew = false)
 {
-	Sequence::iterator src = overlay.begin();
+	Sequence::const_iterator src = overlay.begin();
 	Sequence::iterator dest;
 
 	if (shift < 0) {
@@ -125,8 +125,9 @@ static inline void overlaySeq(Sequence& overlay, Sequence& target,
 	for (; src != overlay.end(); ++src, ++dest) {
 		assert(dest != target.end());
 		if (maskNew && *src != *dest)
-			*src = tolower(*src);
-		*dest = *src;
+			*dest = tolower(*src);
+		else
+			*dest = *src;
 	}
 }
 
diff --git a/DataBase/Makefile.am b/DataBase/Makefile.am
index e4c7cca..6a055ce 100644
--- a/DataBase/Makefile.am
+++ b/DataBase/Makefile.am
@@ -3,7 +3,10 @@ libdb_a_SOURCES = DB.cc DB.h Options.h
 libdb_a_CPPFLAGS = -I$(top_srcdir)
 libdb_a_LIBADD = $(top_builddir)/Common/libcommon.a
 
+if HAVE_SQLITE3
 bin_PROGRAMS = abyss-db-csv
+endif
+
 abyss_db_csv_SOURCES = DB.cc DB.h db-csv.cc
 abyss_db_csv_CPPFLAGS = -I$(top_srcdir)
 abyss_db_csv_LDADD = -lsqlite3
diff --git a/DataBase/db-csv.cc b/DataBase/db-csv.cc
index 05cac19..fc43c2f 100644
--- a/DataBase/db-csv.cc
+++ b/DataBase/db-csv.cc
@@ -19,8 +19,7 @@ typedef vector<string> vs;
 
 static bool existFile(const char* f)
 {
-	ifstream file(f);
-	return file;
+	return (bool)ifstream(f);
 }
 
 template <typename D>
diff --git a/DataLayer/fac.cc b/DataLayer/fac.cc
index f03af8f..fdf3232 100644
--- a/DataLayer/fac.cc
+++ b/DataLayer/fac.cc
@@ -29,8 +29,8 @@ static const char USAGE_MESSAGE[] =
 "\n"
 " Options:\n"
 "\n"
-"  -e, --exp-size=N        expected genome size. Will calculate NG50\n"
-"                          and associated stats\n"
+"  -G, -e, --genome-size=N expected genome size. Used to calculate NG50\n"
+"                          and associated stats [disabled]\n"
 "  -s, -t, --min-length=N  ignore sequences shorter than N bp [500]\n"
 "  -d, --delimiter=S       use S for the field delimiter [\\t]\n"
 "  -j, --jira              output JIRA format\n"
@@ -50,7 +50,7 @@ static const char USAGE_MESSAGE[] =
 
 namespace opt {
 	static unsigned minLength = 500;
-	static long long unsigned expSize = 0;
+	static long long unsigned genomeSize;
 	static string delimiter = "\t";
 	static int format;
 	static int verbose;
@@ -58,12 +58,12 @@ namespace opt {
 }
 enum { TAB, JIRA, MMD };
 
-static const char shortopts[] = "d:jms:t:e:v";
+static const char shortopts[] = "d:e:G:jms:t:v";
 
 enum { OPT_HELP = 1, OPT_VERSION };
 
 static const struct option longopts[] = {
-	{ "exp-size", no_argument, NULL, 'e' },
+	{ "genome-size", required_argument, NULL, 'G' },
 	{ "min-length", no_argument, NULL, 's' },
 	{ "delimiter", required_argument, NULL, 'd' },
 	{ "jira", no_argument, NULL, 'j' },
@@ -109,7 +109,7 @@ static void printContiguityStatistics(const char* path)
 			<< "n" << sep
 			<< "n:" << opt::minLength << sep
 			<< "L50" << sep;
-		if (opt::expSize > 0)
+		if (opt::genomeSize > 0)
 			cout << "n:NG50" << sep
 				<< "NG50" << sep;
 		cout << "min" << sep
@@ -126,7 +126,7 @@ static void printContiguityStatistics(const char* path)
 		cout << "n" << sep
 			<< "n:" << opt::minLength << sep
 			<< "L50" << sep;
-		if (opt::expSize > 0)
+		if (opt::genomeSize > 0)
 			cout << "n:NG50" << sep
 				<< "NG50" << sep;
 		cout << "min" << sep
@@ -137,7 +137,7 @@ static void printContiguityStatistics(const char* path)
 			<< "max" << sep
 			<< "sum" << sep
 			<< "name" << '\n';
-		if (opt::expSize > 0)
+		if (opt::genomeSize > 0)
 			cout << "------" << sep
 				<< "------" << sep;
 		cout << "------" << sep
@@ -157,7 +157,7 @@ static void printContiguityStatistics(const char* path)
 	if (opt::format == JIRA)
 		cout << '|';
 	printContiguityStats(cout, h, opt::minLength,
-			printHeader, opt::delimiter, opt::expSize)
+			printHeader, opt::delimiter, opt::genomeSize)
 		<< opt::delimiter << path;
 	if (opt::format == JIRA)
 		cout << opt::delimiter;
@@ -189,9 +189,14 @@ int main(int argc, char** argv)
 			opt::delimiter = "\t|";
 			opt::format = MMD;
 			break;
+		  case 'G':
 		  case 'e':
-			arg >> opt::expSize;
-			break;
+			{
+				double x;
+				arg >> x;
+				opt::genomeSize = x;
+				break;
+			}
 		  case 's': case 't':
 			arg >> opt::minLength;
 			break;
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..dbcf383
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM ubuntu:latest
+MAINTAINER Shaun Jackman <sjackman at gmail.com>
+
+RUN apt-get update \
+	&& apt-get install -y --no-install-recommends \
+		make openmpi-bin ssh
+ADD . /tmp/abyss
+RUN apt-get install -y --no-install-recommends \
+		automake g++ libboost-dev libopenmpi-dev libsparsehash-dev \
+	&& cd /tmp/abyss \
+	&& ./autogen.sh \
+	&& mkdir build && cd build \
+	&& ../configure --with-mpi=/usr/lib/openmpi \
+	&& make install-strip \
+	&& rm -rf /tmp/abyss \
+	&& apt-get autoremove -y binutils \
+		automake g++ libboost-dev libopenmpi-dev libsparsehash-dev
+ENV SHELL=/bin/bash
+ENTRYPOINT ["abyss-pe"]
+CMD ["help"]
diff --git a/Graph/BreadthFirstSearch.h b/Graph/BreadthFirstSearch.h
index a36a567..40a04f0 100644
--- a/Graph/BreadthFirstSearch.h
+++ b/Graph/BreadthFirstSearch.h
@@ -29,6 +29,9 @@ template <class IncidenceGraph, class Buffer, class BFSVisitor,
     typedef color_traits<ColorValue> Color;
     typename GTraits::out_edge_iterator ei, ei_end;
 
+	if (get(color, s) == Color::black())
+		return;
+
     put(color, s, Color::gray());             vis.discover_vertex(s, g);
     Q.push(s);
     while (! Q.empty()) {
diff --git a/Graph/ExtendPath.h b/Graph/ExtendPath.h
index acef178..50d471a 100644
--- a/Graph/ExtendPath.h
+++ b/Graph/ExtendPath.h
@@ -1,3 +1,4 @@
+
 #ifndef _EXTENDPATH_H_
 #define _EXTENDPATH_H_
 
@@ -10,22 +11,95 @@
 #include <cassert>
 #include <cstdio>
 #include <iostream>
+#include <algorithm>
+
+/**
+ * Parameters for path extension.
+ */
+struct ExtendPathParams
+{
+	/* ignore branches shorter than or equal to this length */
+	unsigned trimLen;
+	/* maximum length after extension */
+	unsigned maxLen;
+	/*
+	 * if true, multiple incoming branches > trimLen
+	 * will cause a path extension to halt
+	 */
+	bool lookBehind;
+
+	/* constructor */
+	ExtendPathParams() : trimLen(0), maxLen(NO_LIMIT), lookBehind(true) {}
+};
 
 /**
  * The result of attempting to extend a path.
  */
 enum PathExtensionResult {
+	/** path could not be extended because of a dead end */
 	DEAD_END,
+	/** path could not be extended because of a branching point */
 	BRANCHING_POINT,
+	/** path could not be extended because of a cycle */
 	CYCLE,
+	/** path could not be extended because of caller-specified length limit */
 	LENGTH_LIMIT,
+	/** path was extended up to a dead end */
 	EXTENDED_TO_DEAD_END,
+	/** path was extended up to a branching point */
 	EXTENDED_TO_BRANCHING_POINT,
+	/** path was extended up to a cycle */
 	EXTENDED_TO_CYCLE,
+	/** path was extended up to caller-specified length limit */
 	EXTENDED_TO_LENGTH_LIMIT
 };
 
 /**
+ * Translate path extension result code to a string.
+ */
+static inline const char* pathExtensionResultStr(PathExtensionResult result)
+{
+	switch(result) {
+	case DEAD_END:
+		return "DEAD_END";
+	case BRANCHING_POINT:
+		return "BRANCHING_POINT";
+	case CYCLE:
+		return "CYCLE";
+	case LENGTH_LIMIT:
+		return "LENGTH_LIMIT";
+	case EXTENDED_TO_DEAD_END:
+		return "EXTENDED_TO_DEAD_END";
+	case EXTENDED_TO_BRANCHING_POINT:
+		return "EXTENDED_TO_BRANCHING_POINT";
+	case EXTENDED_TO_CYCLE:
+		return "EXTENDED_TO_CYCLE";
+	case EXTENDED_TO_LENGTH_LIMIT:
+		return "EXTENDED_TO_LENGTH_LIMIT";
+	default:
+		assert(false);
+	}
+}
+
+/**
+ * Return true if the path extension result code indicates
+ * that the path was successfully extended by one or more nodes.
+ */
+static inline bool pathExtended(PathExtensionResult result)
+{
+	switch(result) {
+	case DEAD_END:
+	case BRANCHING_POINT:
+	case CYCLE:
+	case LENGTH_LIMIT:
+		return false;
+	default:
+		return true;
+	}
+	assert(false);
+}
+
+/**
  * The result of attempting to extend a path
  * by a single neighbouring vertex.
  */
@@ -36,9 +110,65 @@ enum SingleExtensionResult {
 };
 
 /**
+ * Return true if there is a path of at least depthLimit vertices
+ * that extends from given vertex u, otherwise return false.
+ * Implemented using a bounded depth first search.
+ *
+ * @param u starting vertex for traversal
+ * @param dir direction for traversal (FORWARD or REVERSE)
+ * @param depth depth of current vertex u
+ * @param depthLimit maximum depth to probe
+ * @param g graph to use for traversal
+ * @param visited vertices that have already been visited by the DFS
+ * @return true if at least one path with length >= depthLimit
+ * extends from u in direction dir, false otherwise
+ */
+template <class Graph>
+static inline bool lookAhead(
+	const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+	Direction dir, unsigned depth, unsigned depthLimit,
+	unordered_set< typename boost::graph_traits<Graph>::vertex_descriptor,
+	hash<typename boost::graph_traits<Graph>::vertex_descriptor> >& visited, const Graph& g)
+{
+    typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+    typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+    typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
+
+	OutEdgeIter oei, oei_end;
+	InEdgeIter iei, iei_end;
+
+	visited.insert(u);
+	if (depth == depthLimit)
+		return true;
+
+	if (dir == FORWARD) {
+		for (boost::tie(oei, oei_end) = out_edges(u, g);
+			oei != oei_end; ++oei) {
+			const V& v = target(*oei, g);
+			if (visited.find(v) == visited.end()) {
+				if(lookAhead(v, dir, depth+1, depthLimit, visited, g))
+					return true;
+			}
+		}
+	} else {
+		assert(dir == REVERSE);
+		for (boost::tie(iei, iei_end) = in_edges(u, g);
+			 iei != iei_end; ++iei) {
+			const V& v = source(*iei, g);
+			if (visited.find(v) == visited.end()) {
+				if(lookAhead(v, dir, depth+1, depthLimit, visited, g))
+					return true;
+			}
+		}
+	}
+
+	return false;
+}
+
+/**
  * Return true if there is a path of at least 'depth' vertices
  * that extends from given vertex v, otherwise return false.
- * Implemented using a bounded breadth first search.
+ * Implemented using a bounded depth first search.
  *
  * @param start starting vertex for traversal
  * @param dir direction for traversal (FORWARD or REVERSE)
@@ -47,39 +177,141 @@ enum SingleExtensionResult {
  * @return true if at least one path with length >= len
  * extends from v in direction dir, false otherwise
  */
-template <class BidirectionalGraph>
+template <class Graph>
 static inline bool lookAhead(
-	typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor start,
-	Direction dir, unsigned depth, const BidirectionalGraph& g)
+	const typename boost::graph_traits<Graph>::vertex_descriptor& start,
+	Direction dir, unsigned depth, const Graph& g)
+{
+	typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+	unordered_set< V, hash<V> > visited;
+	return lookAhead(start, dir, 0, depth, visited, g);
+}
+
+/**
+ * Return neighbour vertices that begin branches that are longer than trimLen.
+ *
+ * @param u root vertex
+ * @param dir direction for neighbours (FORWARD or REVERSE)
+ * @param g graph
+ * @param trimLen ignore all branches less than or equal to this length
+ * @return std::vector of neighbour vertices that start branches that are
+ * greater than trimLen vertices in length
+ */
+template <class BidirectionalGraph>
+static inline std::vector<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>
+trueBranches(const typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor& u,
+	Direction dir, const BidirectionalGraph& g, unsigned trimLen=0)
+{
+	typedef BidirectionalGraph G;
+	typedef boost::graph_traits<G> graph_traits;
+	typedef typename graph_traits::vertex_descriptor V;
+
+	typename graph_traits::out_edge_iterator oei, oei_end;
+	typename graph_traits::in_edge_iterator iei, iei_end;
+
+	std::vector<V> branchRoots;
+
+	if (dir == FORWARD) {
+		for (boost::tie(oei, oei_end) = out_edges(u, g);
+			oei != oei_end; ++oei) {
+			const V& v = target(*oei, g);
+			if (lookAhead(v, dir, trimLen, g))
+				branchRoots.push_back(v);
+		}
+	} else {
+		assert(dir == REVERSE);
+		for (boost::tie(iei, iei_end) = in_edges(u, g);
+			iei != iei_end; ++iei) {
+			const V& v = source(*iei, g);
+			if (lookAhead(v, dir, trimLen, g)) {
+				branchRoots.push_back(v);
+			}
+		}
+	}
+
+	return branchRoots;
+}
+
+/**
+ * Return the in/out degree of a vertex, disregarding branches
+ * <= trimLen.
+ *
+ * @param u the vertex of interest
+ * @param dir FORWARD for out degree, REVERSE for in degree
+ * @param g the graph
+ * @param trimLen branches less than or equal to this length
+ * are ignored (unless they are the only option)
+ * @return the in/out degree of u, ignoring branches <= trimLen
+ */
+template <typename Graph>
+static inline unsigned trueDegree(
+	const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+	Direction dir, const Graph& g, unsigned trimLen=0)
+{
+	typedef boost::graph_traits<Graph> graph_traits;
+	typedef typename graph_traits::vertex_descriptor V;
+
+	unsigned degree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
+	if (degree <= 1)
+		return degree;
+
+	std::vector<V> branches = trueBranches(u, dir, g, trimLen);
+	/*
+	 * Note: If branches.size() == 0, we know from above that
+	 * we must have 2 or more short branches. This situation typically occurs
+	 * near coverage gaps, where one of the branches is the correct choice.
+	 * (During path extension, our heuristic is to choose the longest branch
+	 * and to continue extending.)
+	 */
+	if (branches.size() == 0)
+		return 1;
+
+	return branches.size();
+}
+
+/**
+ * Return the depth of the graph from the given source vertex,
+ * i.e. the distance of the furthest node.  The depth is measured
+ * by means of an exhaustive breadth first search.
+ *
+ * @param root starting vertex for traversal
+ * @param dir direction for traversal (FORWARD or REVERSE)
+ * @param g graph to use for traversal
+ * @return the distance of the furthest vertex from root
+ */
+template <typename Graph>
+static inline size_t depth(
+	typename boost::graph_traits<Graph>::vertex_descriptor root,
+	Direction dir, const Graph& g)
 {
-    typedef typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor V;
-    typedef typename boost::graph_traits<BidirectionalGraph>::out_edge_iterator OutEdgeIter;
-    typedef typename boost::graph_traits<BidirectionalGraph>::in_edge_iterator InEdgeIter;
+    typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+    typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+    typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
 
 	OutEdgeIter oei, oei_end;
 	InEdgeIter iei, iei_end;
 
 	unordered_set<V, hash<V> > visited;
-	typedef unordered_map<V, unsigned> DepthMap;
+	typedef unordered_map<V, size_t> DepthMap;
 	DepthMap depthMap;
 	std::deque<V> q;
 
-	q.push_back(start);
+	q.push_back(root);
 
-	visited.insert(start);
+	visited.insert(root);
 	std::pair<typename DepthMap::iterator, bool> inserted =
-		depthMap.insert(std::make_pair(start, 0));
+		depthMap.insert(std::make_pair(root, 0));
 	assert(inserted.second);
 
+	size_t maxDepth = 0;
 	while (!q.empty()) {
-		V u = q.front();
-		q.pop_front();
+		V& u = q.front();
 		visited.insert(u);
 		typename DepthMap::const_iterator it = depthMap.find(u);
 		assert(it != depthMap.end());
-		unsigned uDepth = it->second;
-		if (uDepth == depth)
-			return true;
+		size_t depth = it->second;
+		if (depth > maxDepth)
+			maxDepth = depth;
 		if (dir == FORWARD) {
 			for (boost::tie(oei, oei_end) = out_edges(u, g);
 				oei != oei_end; ++oei) {
@@ -87,7 +319,7 @@ static inline bool lookAhead(
 				if (visited.find(v) == visited.end()) {
 					visited.insert(v);
 					std::pair<typename DepthMap::iterator, bool> inserted =
-						depthMap.insert(std::make_pair(v, uDepth+1));
+						depthMap.insert(std::make_pair(v, depth+1));
 					assert(inserted.second);
 					q.push_back(v);
 				}
@@ -100,50 +332,69 @@ static inline bool lookAhead(
 				if (visited.find(v) == visited.end()) {
 					visited.insert(v);
 					std::pair<typename DepthMap::iterator, bool> inserted =
-						depthMap.insert(std::make_pair(v, uDepth+1));
+						depthMap.insert(std::make_pair(v, depth+1));
 					assert(inserted.second);
 					q.push_back(v);
 				}
 			}
 		}
+		q.pop_front();
 	}
 
-	return false;
+	return maxDepth;
 }
 
-template <class BidirectionalGraph>
-static inline std::vector<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>
-trueBranches(typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor& u,
-	Direction dir, const BidirectionalGraph& g, unsigned trimLen=0)
+/**
+ * Return the neighbour vertex corresponding to the longest branch.  If there
+ * are no neighbour vertices, an assertion will be thrown. If there
+ * is a tie between branch lengths, the "winning" branch is chosen arbitrarily.
+ *
+ * @param u root vertex
+ * @param dir direction of branches to consider (FORWARD or REVERSE)
+ * @param g the graph
+ * @return the vertex at the head of the longest branch
+ */
+template <typename Graph>
+inline static typename boost::graph_traits<Graph>::vertex_descriptor
+longestBranch(const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+	Direction dir, const Graph& g)
 {
-	typedef BidirectionalGraph G;
-	typedef boost::graph_traits<G> graph_traits;
-	typedef typename graph_traits::vertex_descriptor V;
-
-	typename graph_traits::out_edge_iterator oei, oei_end;
-	typename graph_traits::in_edge_iterator iei, iei_end;
-
-	std::vector<V> branchRoots;
+	typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+    typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+    typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
 
+	OutEdgeIter oei, oei_end;
+	InEdgeIter iei, iei_end;
+	size_t maxDepth = 0;
+	unsigned degree = 0;
+	/* note: had to initialize to prevent compiler warnings */
+	V longestBranch = u;
 	if (dir == FORWARD) {
 		for (boost::tie(oei, oei_end) = out_edges(u, g);
-				oei != oei_end; ++oei) {
+			 oei != oei_end; ++oei) {
+			degree++;
 			const V& v = target(*oei, g);
-			if (lookAhead(v, dir, trimLen, g))
-				branchRoots.push_back(v);
+			size_t d = depth(v, dir, g);
+			if (d >= maxDepth) {
+				maxDepth = d;
+				longestBranch = v;
+			}
 		}
 	} else {
 		assert(dir == REVERSE);
 		for (boost::tie(iei, iei_end) = in_edges(u, g);
-			iei != iei_end; ++iei) {
+			 iei != iei_end; ++iei) {
+			degree++;
 			const V& v = source(*iei, g);
-			if (lookAhead(v, dir, trimLen, g)) {
-				branchRoots.push_back(v);
+			size_t d = depth(v, dir, g);
+			if (d >= maxDepth) {
+				maxDepth = d;
+				longestBranch = v;
 			}
 		}
 	}
-
-	return branchRoots;
+	assert(degree > 0);
+	return longestBranch;
 }
 
 /**
@@ -153,14 +404,13 @@ trueBranches(typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor
  * @param path the path to extend (a list of vertices)
  * @param dir direction of extension (FORWARD or REVERSE)
  * @param g the graph to use for traversal
- * @param trimLen ignore neighbour vertices with branches
- * shorter than this length [0]
+ * @param params parameters controlling extension (e.g. trimLen)
  * @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT, or EXTENDED
  */
 template <class BidirectionalGraph>
 static inline SingleExtensionResult extendPathBySingleVertex(
 	Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
-	Direction dir, const BidirectionalGraph& g, unsigned trimLen = 0)
+	Direction dir, const BidirectionalGraph& g, const ExtendPathParams& params)
 {
 	typedef BidirectionalGraph G;
 	typedef boost::graph_traits<G> graph_traits;
@@ -172,38 +422,76 @@ static inline SingleExtensionResult extendPathBySingleVertex(
 	assert(dir == FORWARD || dir == REVERSE);
 
 	V& u = (dir == FORWARD) ? path.back() : path.front();
-	unsigned degree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
 
-	if (degree == 0) {
+	unsigned outDegree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
+	if (outDegree == 0) {
 		return SE_DEAD_END;
-	} else if (degree == 1) {
-		const V& v = (dir == FORWARD) ?
-			target(*(out_edges(u, g).first), g) :
-			source(*(in_edges(u, g).first), g);
+	}
+
+	unsigned inDegree = 0;
+	if (params.lookBehind)
+		inDegree = (dir == FORWARD) ? in_degree(u, g) : out_degree(u, g);
+
+	if ((!params.lookBehind || inDegree <= 1) && outDegree == 1) {
 		if (dir == FORWARD) {
+			const V& v = target(*(out_edges(u, g).first), g);
 			path.push_back(v);
 		} else {
 			assert(dir == REVERSE);
+			const V& v = source(*(in_edges(u, g).first), g);
 			path.push_front(v);
 		}
 		return SE_EXTENDED;
-	} else {
-		std::vector<V> neighbours = trueBranches(u, dir, g, trimLen);
-		if (neighbours.empty()) {
-			return SE_DEAD_END;
-		} else if (neighbours.size() == 1) {
-			if (dir == FORWARD) {
-				path.push_back(neighbours.front());
-			} else {
-				assert(dir == REVERSE);
-				path.push_front(neighbours.front());
+	}
+
+	Direction otherDir = (dir == FORWARD) ? REVERSE : FORWARD;
+	std::vector<V> longBranchesOut = trueBranches(u, dir, g, params.trimLen);
+	std::vector<V> longBranchesIn;
+
+	if (params.lookBehind) {
+		longBranchesIn = trueBranches(u, otherDir, g, params.trimLen);
+		/*
+		 * Tricky: Make sure the path we are extending
+		 * is treated as a valid incoming branch, even if it is less
+		 * than trimLen. This can happen if we seeded the path on
+		 * an error branch or a branch that has a coverage gap.
+		 */
+		if (path.size() > 1) {
+			const V& predecessor = (dir == FORWARD) ?
+				*(path.rbegin() + 1) : *(path.begin() + 1);
+			if (std::find(longBranchesIn.begin(), longBranchesIn.end(),
+				predecessor) == longBranchesIn.end()) {
+				longBranchesIn.push_back(predecessor);
 			}
-			return SE_EXTENDED;
-		} else {
-			assert(neighbours.size() > 1);
-			return SE_BRANCHING_POINT;
 		}
 	}
+
+	if ((params.lookBehind && longBranchesIn.size() > 1) ||
+		longBranchesOut.size() > 1)
+		return SE_BRANCHING_POINT;
+
+	if (longBranchesOut.size() == 0) {
+		/*
+		 * If we have multiple branches that are shorter
+		 * than the trim length then choose the longest one.
+		 * (This type of situation usually occurs near
+		 * coverage gaps.)
+		 */
+		V v = longestBranch(u, dir, g);
+		if (dir == FORWARD)
+			path.push_back(v);
+		else
+			path.push_front(v);
+
+		return SE_EXTENDED;
+	}
+
+	if (dir == FORWARD)
+		path.push_back(longBranchesOut.front());
+	else
+		path.push_front(longBranchesOut.front());
+
+	return SE_EXTENDED;
 }
 
 /**
@@ -212,16 +500,18 @@ static inline SingleExtensionResult extendPathBySingleVertex(
  * @param path path to extend (modified by this function)
  * @param dir direction to extend path (FORWARD or REVERSE)
  * @param g graph in which to perform the extension
- * @param trimLen ignore branches less than this length when
- * detecting branch points [0]
+ * @param visited set of previously visited vertices (used
+ * to detect cycles in the de Bruijn graph)
+ * @param params parameters controlling extension (e.g. trimLen)
  * @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT,
  * or EXTENDED.
  */
 template <class BidirectionalGraph>
-PathExtensionResult extendPath(
+static inline PathExtensionResult extendPath(
 	Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
-	Direction dir, const BidirectionalGraph& g, unsigned trimLen = 0,
-	unsigned maxLen = NO_LIMIT)
+	Direction dir, const BidirectionalGraph& g,
+	unordered_set<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& visited,
+	const ExtendPathParams& params)
 {
 	typedef BidirectionalGraph G;
 	typedef boost::graph_traits<G> graph_traits;
@@ -229,20 +519,19 @@ PathExtensionResult extendPath(
 	typename graph_traits::out_edge_iterator oei, oei_end;
 	typename graph_traits::in_edge_iterator iei, iei_end;
 
-	assert(path.size() > 0 && path.size() <= maxLen);
+	assert(path.size() > 0);
 	size_t origPathLen = path.size();
 
-	/* track visited nodes to avoid infinite traversal of cycles */
-	unordered_set<V> visited;
-	visited.insert(path.begin(), path.end());
+	if (path.size() != NO_LIMIT && path.size() >= params.maxLen)
+		return LENGTH_LIMIT;
 
 	SingleExtensionResult result = SE_EXTENDED;
 	bool detectedCycle = false;
 
 	while (result == SE_EXTENDED && !detectedCycle &&
-		path.size() < maxLen)
+		path.size() < params.maxLen)
 	{
-		result = extendPathBySingleVertex(path, dir, g, trimLen);
+		result = extendPathBySingleVertex(path, dir, g, params);
 		if (result == SE_EXTENDED) {
 			std::pair<typename unordered_set<V>::iterator,bool> inserted;
 			if (dir == FORWARD) {
@@ -275,7 +564,7 @@ PathExtensionResult extendPath(
 			return EXTENDED_TO_BRANCHING_POINT;
 		} else {
 			assert(result == SE_EXTENDED &&
-				path.size() == maxLen);
+				path.size() == params.maxLen);
 			return EXTENDED_TO_LENGTH_LIMIT;
 		}
 	} else {
@@ -287,10 +576,53 @@ PathExtensionResult extendPath(
 		} else if (result == SE_BRANCHING_POINT) {
 			return BRANCHING_POINT;
 		} else {
-			assert(origPathLen >= maxLen);
+			assert(origPathLen >= params.maxLen);
 			return LENGTH_LIMIT;
 		}
 	}
 }
 
+/**
+ * Extend a path up to the next branching point in the graph.
+ *
+ * @param path path to extend (modified by this function)
+ * @param dir direction to extend path (FORWARD or REVERSE)
+ * @param g graph in which to perform the extension
+ * @param params parameters controlling extension (e.g. trimLen)
+ * @return a PathExtensionResult code indicating whether the path was
+ * extended and why extension stopped (e.g. EXTENDED_TO_BRANCHING_POINT)
+ */
+template <class BidirectionalGraph>
+PathExtensionResult extendPath(
+	Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
+	Direction dir, const BidirectionalGraph& g, const ExtendPathParams& params)
+{
+	typedef typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor V;
+
+	/* track visited nodes to avoid infinite traversal of cycles */
+	unordered_set<V> visited;
+	visited.insert(path.begin(), path.end());
+
+	return extendPath(path, dir, g, visited, params);
+}
+
+/**
+ * Extend a path up to the next branching point in the graph.
+ *
+ * @param path path to extend (modified by this function)
+ * @param dir direction to extend path (FORWARD or REVERSE)
+ * @param g graph in which to perform the extension
+ * @return a PathExtensionResult code indicating whether the path was
+ * extended and why extension stopped (e.g. EXTENDED_TO_BRANCHING_POINT)
+ */
+template <class BidirectionalGraph>
+PathExtensionResult extendPath(
+	Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
+	Direction dir, const BidirectionalGraph& g)
+{
+	/* default extension params */
+	ExtendPathParams params;
+	return extendPath(path, dir, g, params);
+}
+
 #endif
diff --git a/Graph/Path.h b/Graph/Path.h
index 33c893d..2a0abaf 100644
--- a/Graph/Path.h
+++ b/Graph/Path.h
@@ -5,6 +5,7 @@
 #include <sstream>
 #include <climits>
 #include <deque>
+#include <cassert>
 
 enum PathSearchResult {
 	FOUND_PATH = 0,
@@ -26,6 +27,18 @@ const char* PathSearchResultLabel[] = {
 
 enum Direction { FORWARD = 0, REVERSE };
 
+inline static const char* directionStr(Direction dir)
+{
+	switch(dir) {
+	case FORWARD:
+		return "FORWARD";
+	case REVERSE:
+		return "REVERSE";
+	default:
+		assert(false);
+	}
+}
+
 const unsigned NO_LIMIT = UINT_MAX;
 
 template <class Vertex> class Path : public std::deque<Vertex>
diff --git a/Unittest/Konnector/integration-tests.mk b/IntegrationTest/Konnector/integration-tests.mk
similarity index 94%
rename from Unittest/Konnector/integration-tests.mk
rename to IntegrationTest/Konnector/integration-tests.mk
index 18e9c07..758f445 100755
--- a/Unittest/Konnector/integration-tests.mk
+++ b/IntegrationTest/Konnector/integration-tests.mk
@@ -88,7 +88,7 @@ $(tmpdir)/test_reference.fa: | $(tmpdir)
 $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq: $(tmpdir)/test_reference.fa
 	wgsim -S 0 -e $* -N $N -r 0 -R 0 $< $(tmpdir)/e$*_1.fq $(tmpdir)/e$*_2.fq
 
-$(tmpdir)/e%_merged.fa $(tmpdir)/e%_reads_1.fq $(tmpdir)/e%_reads_2.fq: $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
+$(tmpdir)/e%_pseudoreads.fa $(tmpdir)/e%_reads_1.fq $(tmpdir)/e%_reads_2.fq: $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
 	/usr/bin/time -v $(konnector) $(k_opts) -b$b -o $(tmpdir)/e$* $(K_OPTS) $^
 
 $(tmpdir)/e%_l2.bloom: $(tmpdir) $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
@@ -113,7 +113,7 @@ $(tmpdir)/e%_reads_1of3.fq \
 # run_test
 #------------------------------------------------------------
 
-run_test: $(tmpdir) $(tmpdir)/e$e_merged.fa
+run_test: $(tmpdir) $(tmpdir)/e$e_pseudoreads.fa
 	@echo '------------------'
 	@echo '$@: PASSED'
 	@echo '------------------'
@@ -123,12 +123,12 @@ run_test: $(tmpdir) $(tmpdir)/e$e_merged.fa
 #------------------------------------------------------------
 
 save_and_load_test: $(tmpdir)/e$e_l2.bloom \
-	$(tmpdir)/e$e_merged.fa \
+	$(tmpdir)/e$e_pseudoreads.fa \
 	$(tmpdir)/e$e_reads_1.fq \
 	$(tmpdir)/e$e_reads_2.fq
 	/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_loaded \
 		-i $(tmpdir)/e$e_l2.bloom $(K_OPTS) $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
-	diff $(tmpdir)/e$e_merged.fa $(tmpdir)/e$e_loaded_merged.fa
+	diff $(tmpdir)/e$e_pseudoreads.fa $(tmpdir)/e$e_loaded_pseudoreads.fa
 	diff $(tmpdir)/e$e_reads_1.fq $(tmpdir)/e$e_loaded_reads_1.fq
 	diff $(tmpdir)/e$e_reads_2.fq $(tmpdir)/e$e_loaded_reads_2.fq
 	@echo '------------------'
@@ -142,7 +142,7 @@ save_and_load_test: $(tmpdir)/e$e_l2.bloom \
 HALF_FASTQ_LINES:=$(shell echo '$N * 2 * 4 / 2' | bc)
 
 interleaved_files_test: $(tmpdir)/e$e_l2.bloom \
-		$(tmpdir)/e$e_merged.fa \
+		$(tmpdir)/e$e_pseudoreads.fa \
 		$(tmpdir)/e$e_interleaved_a.fq \
 		$(tmpdir)/e$e_interleaved_b.fq
 	/usr/bin/time -v $(konnector) $(k_opts) -I -b$b \
@@ -150,7 +150,7 @@ interleaved_files_test: $(tmpdir)/e$e_l2.bloom \
 		$(K_OPTS) \
 		$(tmpdir)/e$e_interleaved_a.fq \
 		$(tmpdir)/e$e_interleaved_b.fq
-	diff $(tmpdir)/e$e_merged.fa $(tmpdir)/e$e_interleaved_merged.fa
+	diff $(tmpdir)/e$e_pseudoreads.fa $(tmpdir)/e$e_interleaved_pseudoreads.fa
 	diff $(tmpdir)/e$e_reads_1.fq $(tmpdir)/e$e_interleaved_reads_1.fq
 	diff $(tmpdir)/e$e_reads_2.fq $(tmpdir)/e$e_interleaved_reads_2.fq
 	@echo '------------------'
@@ -355,16 +355,16 @@ abyss_bloom_multithreaded_test: $(tmpdir) $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
 konnector_multithreaded_test: $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
 	/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_singlethreaded \
 		$(K_OPTS) -j1 $^
-	cat $(tmpdir)/e$e_singlethreaded_merged.fa | \
+	cat $(tmpdir)/e$e_singlethreaded_pseudoreads.fa | \
 		paste - - | sort | tr '\t' '\n' \
-		> $(tmpdir)/e$e_singlethreaded_merged.sorted.fa
+		> $(tmpdir)/e$e_singlethreaded_pseudoreads.sorted.fa
 	/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_multithreaded \
 		$(K_OPTS) -j10 $^
-	cat $(tmpdir)/e$e_multithreaded_merged.fa | \
+	cat $(tmpdir)/e$e_multithreaded_pseudoreads.fa | \
 		paste - - | sort | tr '\t' '\n' \
-		> $(tmpdir)/e$e_multithreaded_merged.sorted.fa
-	diff $(tmpdir)/e$e_singlethreaded_merged.sorted.fa \
-		$(tmpdir)/e$e_multithreaded_merged.sorted.fa
+		> $(tmpdir)/e$e_multithreaded_pseudoreads.sorted.fa
+	diff $(tmpdir)/e$e_singlethreaded_pseudoreads.sorted.fa \
+		$(tmpdir)/e$e_multithreaded_pseudoreads.sorted.fa
 	@echo '------------------'
 	@echo '$@: PASSED'
 	@echo '------------------'
diff --git a/Konnector/DBGBloomAlgorithms.h b/Konnector/DBGBloomAlgorithms.h
index c1e7aa6..57f0270 100644
--- a/Konnector/DBGBloomAlgorithms.h
+++ b/Konnector/DBGBloomAlgorithms.h
@@ -36,22 +36,21 @@ static inline Sequence pathToSeq(Path<Kmer> path)
  * is no sequence of matches of length numMatchesThreshold,
  * use the longest sequence of matching kmers instead.
  *
- * The default behaviour of this method is to choose
- * the last kmer in the sequence that is present in the
- * Bloom filter de Bruijn graph.
- *
  * @param seq sequence in which to find start kmer
  * @param k kmer size
  * @param g de Bruijn graph
  * @param numMatchesThreshold if we encounter a sequence
  * of numMatchesThreshold consecutive kmers in the Bloom filter,
  * choose the kmer at the beginning of that sequence
+ * @param anchorToEnd if true, all k-mers from end of sequence
+ * up to the chosen k-mer must be matches. (This option is used when
+ * we wish to preserve the original sequences of the reads.)
  * @return position of chosen start kmer
  */
 template<typename Graph>
 static inline unsigned getStartKmerPos(const Sequence& seq,
 	unsigned k, Direction dir, const Graph& g,
-	unsigned numMatchesThreshold=1)
+	unsigned numMatchesThreshold=1, bool anchorToEnd=false)
 {
 	assert(numMatchesThreshold > 0);
 
@@ -86,6 +85,8 @@ static inline unsigned getStartKmerPos(const Sequence& seq,
 				maxMatchPos = i - inc;
 				maxMatchLen = matchCount;
 			}
+			if (anchorToEnd)
+				break;
 			matchCount = 0;
 		} else {
 			matchCount++;
diff --git a/Konnector/README.md b/Konnector/README.md
new file mode 100644
index 0000000..e27ad72
--- /dev/null
+++ b/Konnector/README.md
@@ -0,0 +1,176 @@
+---
+title: konnector
+author: Ben Vandervalk, Shaun Jackman, Tony Raymond, Hamid Mohamadi, Justin Chu
+date: 2015-06-30
+header: ABySS
+footer: ABySS
+section: 1
+---
+
+NAME
+====
+
+konnector - merges paired-end sequences by finding connecting paths in the de Bruijn graph
+
+SYNOPSIS
+========
+
+`konnector -k <kmer_size> -o <output_prefix> [options]... <FASTQ> [FASTQ]...`
+
+DESCRIPTION
+===========
+
+Konnector generates long pseudo-reads by finding connecting paths between paired-end reads within the de Bruijn graph. This can be thought of as a targeted de novo assembly in the neighbourhood of the paired-end reads. An additional feature of Konnector is the ability to extend the pseudo-reads and unmerged reads outwards, until it encounters a branching point or dead end in the de Bruijn graph.
+
+Konnector uses a Bloom filter representation of the de Bruijn graph to minimize memory requirements, as described in: Chikhi, Rayan, and Guillaume Rizk. "Space-efficient and exact de Bruijn graph representation based on a Bloom filter." Algorithms for Molecular Biology 8.22 (2013):1.
+
+OPTIONS
+=======
+
+Required Options
+----------------
+```
+-k, --kmer=N               the size of a k-mer [required]
+-o, --output-prefix=FILE   prefix of output FASTA files [required]
+```
+
+Bloom Filter Options
+--------------------
+```
+-b, --bloom-size=N         size of bloom filter [500M]
+-c, --min-coverage=N       kmer coverage threshold for error correction [2].
+                           This option specifies the number of levels in the
+                           cascading Bloom filter; it has no effect if the Bloom
+                           filter is loaded from an external file.
+-i, --input-bloom=FILE     load bloom filter from FILE; Bloom filter files can
+                           be created separately with the 'abyss-bloom' program
+```
+
+Graph Search Limits
+-------------------
+```
+-B, --max-branches=N       max branches in de Bruijn graph traversal;
+                           use 'nolimit' for no limit [350]
+-f, --min-frag=N           min fragment size in base pairs [0]
+-F, --max-frag=N           max fragment size in base pairs [1000]
+-P, --max-paths=N          merge at most N alternate paths; use 'nolimit'
+                           for no limit [2]
+```
+
+Sequence Identity Limits
+------------------------
+```
+-m, --read-mismatches=N    max mismatches between paths and reads; use
+                           'nolimit' for no limit [nolimit]
+-M, --max-mismatches=N     max mismatches between all alternate paths;
+                           use 'nolimit' for no limit [2]
+-x, --read-identity=N      min percent seq identity between consensus seq
+                           and reads [0]
+-X, --path-identity=N      min percent seq identity across alternate
+                           connecting paths [0]
+```
+
+Input Options
+-------------
+```
+-q, --trim-quality=N       trim bases from the ends of reads whose
+                           quality is less than the threshold
+    --standard-quality     zero quality is `!' (33), typically
+                           for FASTQ and SAM files [default]
+    --illumina-quality     zero quality is `@' (64), typically
+                           for qseq and export files
+    --chastity             discard unchaste reads [default]
+    --no-chastity          do not discard unchaste reads
+    --trim-masked          trim masked bases from the ends of reads
+    --no-trim-masked       do not trim masked bases from the ends
+                           of reads [default]
+-I, --interleaved          input reads files are interleaved
+```
+
+Output Options
+--------------
+```
+    --fastq                output merged reads in FASTQ format
+                           (default is FASTA); bases that are corrected
+						   or inserted by konnector are assigned a
+						   fixed quality score determined by -Q
+-Q, --corrected-qual       quality score for bases corrected or inserted
+                           by konnector; only relevant when --fastq is
+                           in effect [40]
+    --mask                 mask new and changed bases as lower case
+    --no-mask              do not mask bases [default]
+-p, --alt-paths-mode       output a separate pseudoread for each alternate
+                           path connecting a read pair (default is to create
+                           a consensus sequence of all connecting paths).
+						   The limit on the number of alternate paths is
+						   specified by the '--max-paths' option.
+                           The sequence IDs for alternate paths are named:
+						   ${orig_read_id}_1, ${orig_read_id}_2, ...
+--preserve-reads           don't correct any bases within the reads [disabled]
+-v, --verbose              display verbose output
+```
+
+Debugging Options
+-----------------
+```
+-d, --dot-file=FILE        write graph traversals to a DOT file
+-r, --read-name=STR        only process reads with names that contain STR
+-t, --trace-file=FILE      write graph search stats to FILE
+```
+
+Sequence Extension Options
+--------------------------
+```
+-D, --dup-bloom-size=N     use an additional Bloom filter to avoid
+                           assembling the same region of the genome
+                           multiple times. This option is highly
+                           recommended whenever -E (--extend) is used
+                           and has no effect otherwise. As a rule of
+                           thumb, the Bloom filter size should be
+                           about twice the target genome size [disabled]
+-E, --extend               in addition to connecting read pairs,
+                           extend the merged reads outwards to the next
+                           dead end or branching point in the de Bruijn
+                           graph. For read pairs that were not successfully
+                           connected, trim the single-end reads at both ends
+						   and extend them independently.
+```
+
+Other Options
+-------------
+```
+-e, --fix-errors           find and fix single-base errors when reads
+                           have no kmers in bloom filter [disabled]
+-j, --threads=N            use N parallel threads [1]
+-n  --no-limits            disable all limits; equivalent to
+                           '-B nolimit -m nolimit -M nolimit -P nolimit'
+    --help                 display this help and exit
+    --version              output version information and exit
+```
+
+OUTPUT FILES
+============
+
+`$PREFIX` in the filenames below is determined by the `-o` option.
+
+Without `--extend`:
+
+  * `$PREFIX_pseudoreads.fa`: Pseudo-reads created by connecting paired-end reads.
+  * `$PREFIX_reads_1.fq`: Read 1 from read pairs that could not be connected.
+  * `$PREFIX_reads_2.fq`: Read 2 from read pairs that could not be connected.
+
+With `--extend`:
+
+  * `$PREFIX_pseudoreads.fa`: Pseudo-reads created by connecting paired-end reads, which may or may not be extended. Also contains single-end reads from read pairs that could not be connected, but which could be trimmed and/or extended.
+  * `$PREFIX_reads_1.fq`: Read 1 from read pairs that could not be connected and which could not be trimmed (because they contain no "good" k-mers).
+  * `$PREFIX_reads_2.fq`: Read 2 from read pairs that could not be connected and which could not be trimmed (because they contain no "good" k-mers).
+
+AUTHORS
+=======
+
+Ben Vandervalk, Shaun Jackman, Tony Raymond, Hamid Mohamadi, Justin Chu.
+
+REPORTING BUGS
+==============
+
+Report bugs to <abyss-users at bcgsc.ca>.
diff --git a/Konnector/konnector.cc b/Konnector/konnector.cc
index 1c8f80d..e63cff1 100644
--- a/Konnector/konnector.cc
+++ b/Konnector/konnector.cc
@@ -25,6 +25,7 @@
 #include <getopt.h>
 #include <iostream>
 #include <cstring>
+#include <algorithm>
 
 #if _OPENMP
 # include <omp.h>
@@ -73,6 +74,8 @@ static const char USAGE_MESSAGE[] =
 "                             dead end or branching point in the de Brujin\n"
 "                             graph. If the reads were not successfully\n"
 "                             connected, extend them inwards as well.\n"
+"  --fastq                    output merged reads in FASTQ format\n"
+"                             (default is FASTA)\n"
 "  -f, --min-frag=N           min fragment size in base pairs [0]\n"
 "  -F, --max-frag=N           max fragment size in base pairs [1000]\n"
 "  -i, --input-bloom=FILE     load bloom filter from FILE\n"
@@ -91,37 +94,48 @@ static const char USAGE_MESSAGE[] =
 "  -n  --no-limits            disable all limits; equivalent to\n"
 "                             '-B nolimit -m nolimit -M nolimit -P nolimit'\n"
 "  -o, --output-prefix=FILE   prefix of output FASTA files [required]\n"
+"  --preserve-reads           don't correct any bases within the reads [disabled]\n"
+"  -p, --alt-paths-mode       output a separate pseudoread for each alternate\n"
+"                             path connecting a read pair (default is to create\n"
+"                             a consensus sequence of all connecting paths)\n"
 "  -P, --max-paths=N          merge at most N alternate paths; use 'nolimit'\n"
 "                             for no limit [2]\n"
 "  -q, --trim-quality=N       trim bases from the ends of reads whose\n"
 "                             quality is less than the threshold\n"
-"      --standard-quality     zero quality is `!' (33)\n"
-"                             default for FASTQ and SAM files\n"
-"      --illumina-quality     zero quality is `@' (64)\n"
-"                             default for qseq and export files\n"
+"      --standard-quality     zero quality is `!' (33), typically\n"
+"                             for FASTQ and SAM files [default]\n"
+"      --illumina-quality     zero quality is `@' (64), typically\n"
+"                             for qseq and export files\n"
+"  -Q, --corrected-qual       quality score for bases corrected or inserted\n"
+"                             by konnector; only relevant when --fastq is\n"
+"                             in effect [40]\n"
 "  -r, --read-name=STR        only process reads with names that contain STR\n"
 "  -s, --search-mem=N         mem limit for graph searches; multiply by the\n"
 "                             number of threads (-j) to get the total mem used\n"
 "                             for graph traversal [500M]\n"
 "  -t, --trace-file=FILE      write graph search stats to FILE\n"
 "  -v, --verbose              display verbose output\n"
+"  -x, --read-identity=N      min percent seq identity between consensus seq\n"
+"                             and reads [0]\n"
+"  -X, --path-identity=N      min percent seq identity across alternate\n"
+"                             connecting paths [0]\n"
 "      --help                 display this help and exit\n"
 "      --version              output version information and exit\n"
 "\n"
 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
 
 const unsigned g_progressStep = 1000;
-/*
+/**
  * ignore branches less than this length
  *(false positive branches)
  */
 const unsigned g_trimLen = 3;
-
 /*
- * Bloom filter use to keep track of portions
+ * Bloom filter to keep track of portions
  * of genome that have already been assembled.
- * This Bloom filter is only used when the
- * -E (--extend) option is in effect.
+ * This Bloom filter is only used when both
+ * the --extend and --dup-bloom-size options
+ * are in effect.
  */
 BloomFilter g_dupBloom;
 
@@ -165,6 +179,11 @@ namespace opt {
 	 */
 	bool extend = false;
 
+	/**
+	 * Output pseudo-reads in FASTQ format.
+	 */
+	bool fastq = false;
+
 	/** The size of a k-mer. */
 	unsigned k;
 
@@ -177,9 +196,26 @@ namespace opt {
 	/** Bloom filter input file */
 	static string inputBloomPath;
 
+	/**
+	 * Do not correct bases in input read sequences.
+	 */
+	static bool preserveReads = false;
+
+	/**
+	 * Output separate sequence for each alternate path
+	 * between read pairs
+	 */
+	static bool altPathsMode = false;
+
 	/** Max paths between read 1 and read 2 */
 	unsigned maxPaths = 2;
 
+	/**
+	 * Quality score for bases that are corrected
+	 * or inserted by konnector.
+	 */
+	uint8_t correctedQual = 40;
+
 	/** Prefix for output files */
 	static string outputPrefix;
 
@@ -201,6 +237,17 @@ namespace opt {
 	/** Max mismatches between consensus and original reads */
 	static unsigned maxReadMismatches = NO_LIMIT;
 
+	/**
+	 * Min percent seq identity between consensus seq
+	 * and input reads
+	 */
+	static float minReadIdentity = 0.0f;
+
+	/**
+	 * Min percent seq identity between all alternate
+	 * paths
+	 */
+	static float minPathIdentity = 0.0f;
 }
 
 /** Counters */
@@ -220,14 +267,13 @@ static struct {
 	size_t readPairsMerged;
 	size_t skipped;
 	/* counts below are used only when -E is enabled */
-	size_t mergedAndExtended;
 	size_t mergedAndSkipped;
-	size_t singleEndCorrected;
+	size_t singleEndExtended;
 } g_count;
 
-static const char shortopts[] = "b:B:c:d:D:eEf:F:i:Ij:k:lm:M:no:P:q:r:s:t:v";
+static const char shortopts[] = "b:B:c:d:D:eEf:F:i:Ij:k:lm:M:no:p:P:q:Q:r:s:t:vx:X:";
 
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_FASTQ = 1, OPT_HELP, OPT_PRESERVE_READS, OPT_VERSION };
 
 static const struct option longopts[] = {
 	{ "bloom-size",       required_argument, NULL, 'b' },
@@ -253,37 +299,35 @@ static const struct option longopts[] = {
 	{ "output-prefix",    required_argument, NULL, 'o' },
 	{ "read-mismatches",  required_argument, NULL, 'm' },
 	{ "max-mismatches",   required_argument, NULL, 'M' },
+	{ "alt-paths-mode",   no_argument, NULL, 'p' },
 	{ "max-paths",        required_argument, NULL, 'P' },
 	{ "trim-quality",     required_argument, NULL, 'q' },
+	{ "corrected-qual",   required_argument, NULL, 'Q' },
 	{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
 	{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
 	{ "read-name",        required_argument, NULL, 'r' },
 	{ "search-mem",       required_argument, NULL, 's' },
 	{ "trace-file",       required_argument, NULL, 't' },
 	{ "verbose",          no_argument, NULL, 'v' },
+	{ "read-identity",    required_argument, NULL, 'x' },
+	{ "path-identity",    required_argument, NULL, 'X' },
+	{ "fastq",            no_argument, NULL, OPT_FASTQ },
 	{ "help",             no_argument, NULL, OPT_HELP },
+	{ "preserve-reads",   no_argument, NULL, OPT_PRESERVE_READS },
 	{ "version",          no_argument, NULL, OPT_VERSION },
 	{ NULL, 0, NULL, 0 }
 };
 
 /**
  * Return true if the Bloom filter contains all of the
- * kmers in the given sequence.
+ * "good" kmers in the given sequence.
  */
-static bool bloomContainsSeq(const BloomFilter& bloom, const Sequence& seq)
+static inline bool isSeqRedundant(const BloomFilter& assembledKmers,
+	const BloomFilter& goodKmers, Sequence seq)
 {
-	if (containsAmbiguityCodes(seq)) {
-		Sequence seqCopy = seq;
-		flattenAmbiguityCodes(seqCopy, false);
-		for (KmerIterator it(seqCopy, opt::k); it != KmerIterator::end();
-			++it) {
-			if (!bloom[*it])
-				return false;
-		}
-		return true;
-	}
+	flattenAmbiguityCodes(seq, false);
 	for (KmerIterator it(seq, opt::k); it != KmerIterator::end(); ++it) {
-		if (!bloom[*it])
+		if (goodKmers[*it] && !assembledKmers[*it])
 			return false;
 	}
 	return true;
@@ -292,18 +336,133 @@ static bool bloomContainsSeq(const BloomFilter& bloom, const Sequence& seq)
 /**
  * Load the kmers of a given sequence into a Bloom filter.
  */
-static inline void loadSeq(BloomFilter& bloom, unsigned k, const Sequence& seq)
+static inline void addKmers(BloomFilter& bloom,
+	const BloomFilter& goodKmers, unsigned k,
+	const Sequence& seq)
 {
 	if (containsAmbiguityCodes(seq)) {
-		Sequence seqCopy = seq;
-		Sequence rc = reverseComplement(seqCopy);
-		flattenAmbiguityCodes(seqCopy, false);
-		flattenAmbiguityCodes(rc, false);
-		Bloom::loadSeq(bloom, k, seqCopy);
-		Bloom::loadSeq(bloom, k, rc);
+		Sequence flattened = seq;
+		Sequence rcFlattened = reverseComplement(seq);
+		flattenAmbiguityCodes(flattened, false);
+		flattenAmbiguityCodes(rcFlattened, false);
+		for (KmerIterator it(flattened, k);
+			it != KmerIterator::end();++it) {
+			if (goodKmers[*it])
+				bloom.insert(*it);
+		}
+		for (KmerIterator it(rcFlattened, k);
+			it != KmerIterator::end(); ++it) {
+			if (goodKmers[*it])
+				bloom.insert(*it);
+		}
+		return;
 	} else {
-		Bloom::loadSeq(bloom, k, seq);
+		for (KmerIterator it(seq, k);
+			it != KmerIterator::end(); ++it) {
+			if (goodKmers[*it])
+				bloom.insert(*it);
+		}
+	}
+}
+
+enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
+
+/**
+ * Calculate quality string for a pseudo-read.  A base will
+ * have a score of CORRECTED_BASE_QUAL if it was corrected
+ * by konnector or added by konnector (in the gap between
+ * paired-end reads).  For bases that are unchanged from the
+ * input reads, the original quality score is used.  In the
+ * case that the two input read(s) overlap and both provide
+ * a correct base call, the maximum of the two quality scores
+ * is used.
+ */
+static inline std::string calcQual(const FastqRecord& read1,
+	const FastqRecord& read2, Sequence& merged)
+{
+	unsigned char correctedQual = opt::qualityOffset + opt::correctedQual;
+	std::string qual(merged.length(), correctedQual);
+
+	/*
+	 * In the case that the input files are FASTA,
+	 * the quality strings for read1 / read2 will be
+	 * empty, so just return a uniform quality string.
+	 */
+	if (read1.qual.empty() || read2.qual.empty())
+		return qual;
+
+	Sequence r1 = read1.seq, r2 = reverseComplement(read2.seq);
+	std::string r1qual = read1.qual, r2qual = read2.qual;
+	std::reverse(r2qual.begin(), r2qual.end());
+	assert(r1.length() <= merged.length());
+	assert(r2.length() <= merged.length());
+
+	/* region covered only by read 1 */
+	unsigned r2offset = merged.length() - r2.length();
+	for (unsigned r1pos = 0; r1pos < r1.length() && r1pos < r2offset;
+		++r1pos) {
+		if (r1.at(r1pos) == merged.at(r1pos)) {
+			qual.at(r1pos) = r1qual.at(r1pos);
+		} else {
+			//r1Corrected.at(i) = true;
+			qual.at(r1pos) = correctedQual;
+		}
+	}
+
+	/* region where read 1 and read 2 overlap */
+	for (unsigned r1pos = r2offset; r1pos < r1.length(); ++r1pos) {
+		unsigned r2pos = r1pos - r2offset;
+		if (r1.at(r1pos) != merged.at(r1pos) ||
+			r2.at(r2pos) != merged.at(r1pos)) {
+			qual.at(r1pos) = correctedQual;
+		} else {
+			assert(r1.at(r1pos) == r2.at(r2pos));
+			qual.at(r1pos) = max(r1qual.at(r1pos), r2qual.at(r2pos));
+		}
+	}
+
+	/* region covered only by read 2 */
+	for (unsigned r1pos = max(r2offset, (unsigned)r1.length());
+		r1pos < merged.length(); ++r1pos) {
+		unsigned r2pos = r1pos - r2offset;
+		if (r2.at(r2pos) == merged.at(r1pos)) {
+			qual.at(r1pos) = r2qual.at(r2pos);
+		} else {
+			qual.at(r1pos) = correctedQual;
+		}
+	}
+
+	return qual;
+}
+
+static inline string calcQual(const FastqRecord& orig,
+	const Sequence& extended, unsigned extendedLeft,
+	unsigned extendedRight)
+{
+	assert(extended.length() == orig.seq.length() +
+		extendedLeft + extendedRight);
+
+	unsigned char correctedQual = opt::qualityOffset + opt::correctedQual;
+	string qual(extended.length(), correctedQual);
+
+	/*
+	 * In the case that the input files are FASTA,
+	 * the quality strings for read1 / read2 will be
+	 * empty, so just return a uniform quality string.
+	 */
+	if (orig.qual.empty())
+		return qual;
+
+	unsigned offset = extendedLeft;
+	for (unsigned i = 0; i < orig.seq.length(); ++i) {
+		assert(offset + i < extended.length());
+		assert(i < orig.seq.length());
+		assert(i < orig.qual.length());
+		if (orig.seq.at(i) == extended.at(offset + i))
+			qual.at(offset + i) = orig.qual.at(i);
 	}
+
+	return qual;
 }
 
 /**
@@ -318,45 +477,46 @@ static inline void loadSeq(BloomFilter& bloom, unsigned k, const Sequence& seq)
  * (or both) directions, false otherwise
  */
 template <typename Graph>
-static bool extendRead(Sequence& seq, unsigned k, const Graph& g)
+static bool extendRead(FastqRecord& rec, unsigned k, const Graph& g)
 {
-	ExtendSeqResult result;
-	bool extended = false;
+	unsigned extendedLeft = 0, extendedRight = 0;
+	Sequence extendedSeq = rec.seq;
 
 	/*
 	 * offset start pos to reduce chance of hitting
 	 * a dead end on a false positive kmer
 	 */
 	const unsigned runLengthHint = 3;
-	unsigned startPos = getStartKmerPos(seq, k, FORWARD, g,
+	unsigned startPos = getStartKmerPos(extendedSeq, k, FORWARD, g,
 		runLengthHint);
 	if (startPos != NO_MATCH) {
-		assert(startPos <= seq.length() - k);
-		result = extendSeq(seq, FORWARD, startPos, k, g,
-				NO_LIMIT, g_trimLen, opt::mask);
-		if (result == ES_EXTENDED_TO_DEAD_END ||
-				result == ES_EXTENDED_TO_BRANCHING_POINT ||
-				result == ES_EXTENDED_TO_CYCLE) {
-			extended = true;
-		}
+		assert(startPos <= extendedSeq.length() - k);
+		unsigned lengthBefore = extendedSeq.length();
+		extendSeq(extendedSeq, FORWARD, startPos, k, g,
+			NO_LIMIT, g_trimLen, opt::mask,
+			!opt::altPathsMode, opt::preserveReads);
+		extendedRight = extendedSeq.length() - lengthBefore;
 	}
 
-	startPos = getStartKmerPos(seq, k, REVERSE, g, runLengthHint);
+	startPos = getStartKmerPos(extendedSeq, k, REVERSE, g, runLengthHint);
 	if (startPos != NO_MATCH) {
-		assert(startPos <= seq.length() - k);
-		result = extendSeq(seq, REVERSE, startPos, k, g,
-				NO_LIMIT, g_trimLen, opt::mask);
-		if (result == ES_EXTENDED_TO_DEAD_END ||
-				result == ES_EXTENDED_TO_BRANCHING_POINT ||
-				result == ES_EXTENDED_TO_CYCLE) {
-			extended = true;
-		}
+		assert(startPos <= extendedSeq.length() - k);
+		unsigned lengthBefore = extendedSeq.length();
+		extendSeq(extendedSeq, REVERSE, startPos, k, g,
+			NO_LIMIT, g_trimLen, opt::mask,
+			!opt::altPathsMode, opt::preserveReads);
+		extendedLeft = extendedSeq.length() - lengthBefore;
 	}
 
-	return extended;
-}
+	if (extendedLeft > 0 || extendedRight > 0) {
+		rec.qual = calcQual(rec, extendedSeq,
+			extendedLeft, extendedRight);
+		rec.seq = extendedSeq;
+		return true;
+	}
 
-enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
+	return false;
+}
 
 /**
  * Attempt to extend a merged read (a.k.a. pseudoread)
@@ -369,11 +529,14 @@ enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
  * @return ExtendResult (ER_NOT_EXTENDED, ER_EXTENDED,
  * ER_REDUNDANT)
  */
-template <typename Graph>
+template <typename Graph, typename BloomT1, typename BloomT2>
 static inline ExtendResult
-extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
+extendReadIfNonRedundant(FastqRecord& seq, BloomT1& assembledKmers,
+	const BloomT2& goodKmers, unsigned k, const Graph& g)
 {
+	bool extended = false;
 	bool redundant = false;
+
 	if (opt::dupBloomSize > 0) {
 		/*
 		 * Check to see if the current pseudoread
@@ -381,12 +544,12 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
 		 * that has already been assembled.
 		 */
 #pragma omp critical(dupBloom)
-		redundant = bloomContainsSeq(g_dupBloom, seq);
+		redundant = isSeqRedundant(assembledKmers, goodKmers, seq);
 		if (redundant)
 			return ER_REDUNDANT;
 	}
-	Sequence origSeq = seq;
-	bool extended = extendRead(seq, k, g);
+	Sequence origSeq = seq.seq;
+	extended = extendRead(seq, k, g);
 	if (opt::dupBloomSize > 0) {
 		/*
 		 * mark the extended read as an assembled
@@ -395,8 +558,8 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
 #pragma omp critical(dupBloom)
 		{
 			/* must check again to avoid race conditions */
-			if (!bloomContainsSeq(g_dupBloom, origSeq))
-				loadSeq(g_dupBloom, opt::k, seq);
+			if (!isSeqRedundant(assembledKmers, goodKmers, origSeq))
+				addKmers(assembledKmers, goodKmers, k, seq.seq);
 			else
 				redundant = true;
 		}
@@ -410,6 +573,22 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
 		return ER_NOT_EXTENDED;
 }
 
+static inline FastqRecord connectingSeq(const FastqRecord& mergedSeq,
+	unsigned startKmerPos, unsigned goalKmerPos)
+{
+	FastqRecord rec;
+
+	unsigned start = startKmerPos;
+	unsigned end = mergedSeq.seq.length() - 1 - goalKmerPos;
+	assert(start <= end);
+
+	rec.id = mergedSeq.id;
+	rec.seq = mergedSeq.seq.substr(start, end - start + 1);
+	rec.qual = mergedSeq.qual.substr(start, end - start + 1);
+
+	return rec;
+}
+
 /**
  * Print progress stats about reads merged/extended so far.
  */
@@ -419,7 +598,7 @@ static inline void printProgressMessage()
 		<< g_count.readPairsProcessed << " read pairs";
 
 	if (opt::extend) {
-		cerr << ", corrected/extended " << g_count.singleEndCorrected << " of "
+		cerr << ", corrected/extended " << g_count.singleEndExtended << " of "
 			<< (g_count.readPairsProcessed - g_count.uniquePath -
 				g_count.multiplePaths) * 2
 		<< " unmerged reads";
@@ -436,34 +615,100 @@ static inline void printProgressMessage()
 		<< ")\n";
 }
 
-
-/**
- * For a successfully merged read pair, get the sequence
- * representing the connecting path between the two reads.
- */
-template <typename Bloom>
-static inline string getConnectingSeq(ConnectPairsResult& result,
-	unsigned k, const Bloom& bloom)
+static inline void updateCounters(const ConnectPairsParams& params,
+	const ConnectPairsResult& result)
 {
-	assert(result.pathResult == FOUND_PATH);
-	(void)bloom;
+	switch (result.pathResult) {
+		case NO_PATH:
+			assert(result.mergedSeqs.empty());
+			if (result.foundStartKmer && result.foundGoalKmer)
+#pragma omp atomic
+				++g_count.noPath;
+			else
+#pragma omp atomic
+				++g_count.noStartOrGoalKmer;
+			break;
+
+		case FOUND_PATH:
+			assert(!result.mergedSeqs.empty());
+			if (result.pathMismatches > params.maxPathMismatches ||
+				result.pathIdentity < params.minPathIdentity) {
+#pragma omp atomic
+					++g_count.tooManyMismatches;
+			} else if (result.readMismatches > params.maxReadMismatches ||
+				result.readIdentity < params.minReadIdentity) {
+#pragma omp atomic
+					++g_count.tooManyReadMismatches;
+			} else {
+				if (result.mergedSeqs.size() == 1)
+#pragma omp atomic
+					++g_count.uniquePath;
+				else
+#pragma omp atomic
+					++g_count.multiplePaths;
+			}
+			break;
+
+		case TOO_MANY_PATHS:
+#pragma omp atomic
+			++g_count.tooManyPaths;
+			break;
 
-	vector<FastaRecord>& paths = result.mergedSeqs;
-	assert(paths.size() > 0);
+		case TOO_MANY_BRANCHES:
+#pragma omp atomic
+			++g_count.tooManyBranches;
+			break;
 
-	Sequence& seq = (paths.size() == 1) ?
-		paths.front().seq : result.consensusSeq.seq;
+		case PATH_CONTAINS_CYCLE:
+#pragma omp atomic
+			++g_count.containsCycle;
+			break;
 
-	/*
-	 * initialize sequence to the chars between the
-	 * start and goal kmers of the path search.
-	 */
-	int startPos = result.startKmerPos;
-	int endPos = seq.length() - result.goalKmerPos - k;
-	assert(startPos >= 0 && startPos <=
-		(int)(seq.length() - k + 1));
+		case EXCEEDED_MEM_LIMIT:
+#pragma omp atomic
+			++g_count.exceededMemLimit;
+			break;
+	}
+}
 
-	return seq.substr(startPos, endPos - startPos + k);
+static inline void outputRead(const FastqRecord& read, ostream& out,
+	bool fastq = true)
+{
+	if (fastq)
+		out << read;
+	else
+		out << (FastaRecord)read;
+}
+
+static inline bool exceedsMismatchThresholds(const ConnectPairsParams& params,
+	const ConnectPairsResult& result)
+{
+	return (result.pathMismatches > params.maxPathMismatches ||
+		result.pathIdentity < params.minPathIdentity ||
+		result.readMismatches > params.maxReadMismatches ||
+		result.readIdentity < params.minReadIdentity);
+}
+
+/**
+ * Correct and extend an unmerged single-end read.
+ * @return true if the read was modified, false otherwise
+ */
+template <typename Graph, typename BloomT1, typename BloomT2>
+static inline bool correctAndExtend(FastqRecord& read,
+	BloomT1& assembledKmers, const BloomT2& goodKmers,
+	unsigned k, const Graph& g, bool preserveRead=false)
+{
+	bool corrected = false;
+	if (!preserveRead)
+		corrected = trimRead(read, k, g);
+	if (preserveRead || corrected) {
+		ExtendResult extendResult =
+			extendReadIfNonRedundant(read, assembledKmers,
+				goodKmers, k, g);
+		if (extendResult == ER_EXTENDED)
+			return true;
+	}
+	return corrected;
 }
 
 /** Connect a read pair. */
@@ -489,15 +734,31 @@ static void connectPair(const Graph& g,
 		return;
 	}
 
+	/* Search for connecting paths between read pair */
+
 	ConnectPairsResult result =
 		connectPairs(opt::k, read1, read2, g, params);
 
-	vector<FastaRecord>& paths = result.mergedSeqs;
-	bool mergedSeqRedundant = false;
-	bool read1Corrected = false;
-	bool read1Redundant = false;
-	bool read2Corrected = false;
-	bool read2Redundant = false;
+	/* Calculate quality strings for merged reads */
+
+	vector<FastqRecord> paths;
+	FastqRecord consensus;
+	if (result.pathResult == FOUND_PATH) {
+		for (unsigned i = 0; i < result.mergedSeqs.size(); ++i) {
+			FastqRecord fastq;
+			fastq.id = result.mergedSeqs.at(i).id;
+			fastq.seq = result.mergedSeqs.at(i).seq;
+			fastq.qual = calcQual(read1, read2, result.mergedSeqs.at(i).seq);
+			paths.push_back(fastq);
+		}
+		consensus.id = result.consensusSeq.id;
+		consensus.seq = result.consensusSeq.seq;
+		consensus.qual = calcQual(read1, read2, result.consensusSeq.seq);
+	}
+
+	bool outputRead1 = false;
+	bool outputRead2 = false;
+	std::vector<bool> pathRedundant;
 
 	/*
 	 * extend reads inwards or outwards up to the
@@ -506,58 +767,66 @@ static void connectPair(const Graph& g,
 	 */
 	if (opt::extend) {
 		ExtendResult extendResult;
-		if (result.pathResult == FOUND_PATH
-			&& result.pathMismatches <= params.maxPathMismatches
-			&& result.readMismatches <= params.maxReadMismatches) {
+		if (result.pathResult == FOUND_PATH &&
+			!exceedsMismatchThresholds(params, result)) {
+			/* we found at least one connecting path */
 			assert(paths.size() > 0);
-			Sequence& seq = (paths.size() == 1) ?
-				paths.front().seq : result.consensusSeq.seq;
-			seq = getConnectingSeq(result, opt::k, bloom);
-			extendResult = extendReadIfNonRedundant(
-				seq, opt::k, g);
-			if (extendResult == ER_REDUNDANT) {
+			if (opt::altPathsMode) {
+				/* extend each alternate path independently */
+				for (unsigned i = 0; i < paths.size(); ++i) {
+					if (!opt::preserveReads)
+						paths.at(i) = connectingSeq(paths.at(i),
+							result.startKmerPos, result.goalKmerPos);
+					extendResult = extendReadIfNonRedundant(
+						paths.at(i), g_dupBloom, bloom, opt::k, g);
+					pathRedundant.push_back(extendResult == ER_REDUNDANT);
+				}
+			} else  {
+				/* extend consensus sequence for all paths */
+				if (!opt::preserveReads)
+					consensus = connectingSeq(consensus,
+						result.startKmerPos, result.goalKmerPos);
+				extendResult = extendReadIfNonRedundant(
+					consensus, g_dupBloom, bloom, opt::k, g);
+				pathRedundant.push_back(extendResult == ER_REDUNDANT);
+			}
+			if (std::find(pathRedundant.begin(), pathRedundant.end(),
+				false) == pathRedundant.end()) {
 #pragma omp atomic
 				g_count.mergedAndSkipped++;
-				mergedSeqRedundant = true;
-			} else if (extendResult == ER_EXTENDED) {
-#pragma omp atomic
-				g_count.mergedAndExtended++;
 			}
 		} else {
 
 			/*
 			 * read pair could not be merged, so try
-			 * to extend each read individually (in
+			 * to correct and extend each read individually (in
 			 * both directions).
 			 */
 
-//std::cerr << "correcting " << read1.id << " (read 1)" << std::endl;
-			read1Corrected = correctAndExtendSeq(read1.seq,
-				opt::k, g, read1.seq.length(), g_trimLen,
-				opt::mask);
-
-			if (read1Corrected) {
+			if (correctAndExtend(read1, g_dupBloom, bloom,
+				opt::k, g, opt::preserveReads)) {
+					/* avoid duplicate read IDs */
+					if (!endsWith(read1.id, "/1")) {
+						read1.id.append("/1");
+						read1.comment.clear();
+					}
+					outputRead1 = true;
 #pragma omp atomic
-				g_count.singleEndCorrected++;
-				extendResult = extendReadIfNonRedundant(read1.seq,
-					opt::k, g);
-				if (extendResult == ER_REDUNDANT)
-					read1Redundant = true;
+					g_count.singleEndExtended++;
 			}
 
-//std::cerr << "correcting " << read2.id << " (read 2)" << std::endl;
-			read2Corrected = correctAndExtendSeq(read2.seq,
-				opt::k, g, read2.seq.length(), g_trimLen,
-				opt::mask);
-
-			if (read2Corrected) {
+			if (correctAndExtend(read2, g_dupBloom, bloom,
+				opt::k, g, opt::preserveReads)) {
+					/* avoid duplicate read IDs */
+					if (!endsWith(read2.id, "/2")) {
+						read2.id.append("/2");
+						read2.comment.clear();
+					}
+					outputRead2 = true;
 #pragma omp atomic
-				g_count.singleEndCorrected++;
-				extendResult = extendReadIfNonRedundant(read2.seq,
-					opt::k, g);
-				if (extendResult == ER_REDUNDANT)
-					read2Redundant = true;
+					g_count.singleEndExtended++;
 			}
+
 		}
 	}
 
@@ -568,106 +837,42 @@ static void connectPair(const Graph& g,
 		assert_good(traceStream, opt::tracefilePath);
 	}
 
-	switch (result.pathResult) {
+	/* update stats regarding merge successes / failures */
 
-		case NO_PATH:
-			assert(paths.empty());
-			if (result.foundStartKmer && result.foundGoalKmer)
-#pragma omp atomic
-				++g_count.noPath;
-			else {
-#pragma omp atomic
-				++g_count.noStartOrGoalKmer;
-			}
-			break;
+	updateCounters(params, result);
 
-		case FOUND_PATH:
-			assert(!paths.empty());
-			if (result.pathMismatches > params.maxPathMismatches ||
-					result.readMismatches > params.maxReadMismatches) {
-				if (result.pathMismatches > params.maxPathMismatches)
-#pragma omp atomic
-					++g_count.tooManyMismatches;
-				else
-					++g_count.tooManyReadMismatches;
-				if (opt::extend) {
-					if (read1Corrected || read2Corrected)
-#pragma omp critical(mergedStream)
-					{
-						if (read1Corrected && !read1Redundant)
-							mergedStream << (FastaRecord)read1;
-						if (read2Corrected && !read2Redundant)
-							mergedStream << (FastaRecord)read2;
-					}
-					if (!read1Corrected || !read2Corrected)
-#pragma omp critical(readStream)
-					{
-						if (!read1Corrected)
-							read1Stream << (FastaRecord)read1;
-						if (!read2Corrected)
-							read1Stream << (FastaRecord)read2;
-					}
-				} else
-#pragma omp critical(readStream)
-				{
-					read1Stream << read1;
-					read2Stream << read2;
-				}
-			}
-			else if (paths.size() > 1) {
-#pragma omp atomic
-				++g_count.multiplePaths;
-				if (!mergedSeqRedundant)
+	/* output merged / unmerged reads */
+
+	if (result.pathResult == FOUND_PATH &&
+		!exceedsMismatchThresholds(params, result)) {
+		assert(!paths.empty());
+		if (opt::altPathsMode) {
 #pragma omp critical(mergedStream)
-					mergedStream << result.consensusSeq;
+			for (unsigned i = 0; i < paths.size(); ++i) {
+				if (opt::dupBloomSize == 0 || !pathRedundant.at(i))
+					outputRead(paths.at(i), mergedStream, opt::fastq);
 			}
-			else {
-#pragma omp atomic
-				++g_count.uniquePath;
-				if (!mergedSeqRedundant)
+		} else if (opt::dupBloomSize == 0 || !pathRedundant.front()) {
 #pragma omp critical(mergedStream)
-					mergedStream << paths.front();
-			}
-			break;
-
-		case TOO_MANY_PATHS:
-#pragma omp atomic
-			++g_count.tooManyPaths;
-			break;
-
-		case TOO_MANY_BRANCHES:
-#pragma omp atomic
-			++g_count.tooManyBranches;
-			break;
-
-		case PATH_CONTAINS_CYCLE:
-#pragma omp atomic
-			++g_count.containsCycle;
-			break;
-
-		case EXCEEDED_MEM_LIMIT:
-#pragma omp atomic
-			++g_count.exceededMemLimit;
-			break;
-	}
-
-	if (result.pathResult != FOUND_PATH) {
+			outputRead(consensus, mergedStream, opt::fastq);
+		}
+	} else {
 		if (opt::extend) {
-			if (read1Corrected || read2Corrected)
+			if (outputRead1 || outputRead2)
 #pragma omp critical(mergedStream)
 			{
-				if (read1Corrected && !read1Redundant)
-					mergedStream << (FastaRecord)read1;
-				if (read2Corrected && !read2Redundant)
-					mergedStream << (FastaRecord)read2;
+				if (outputRead1)
+					outputRead(read1, mergedStream, opt::fastq);
+				if (outputRead2)
+					outputRead(read2, mergedStream, opt::fastq);
 			}
-			if (!read1Corrected || !read2Corrected)
+			if (!outputRead1 || !outputRead2)
 #pragma omp critical(readStream)
 			{
-				if (!read1Corrected)
-					read1Stream << (FastaRecord)read1;
-				if (!read2Corrected)
-					read1Stream << (FastaRecord)read2;
+				if (!outputRead1)
+					read1Stream << read1;
+				if (!outputRead2)
+					read2Stream << read2;
 			}
 		} else
 #pragma omp critical(readStream)
@@ -784,21 +989,33 @@ int main(int argc, char** argv)
 			setMaxOption(opt::maxMismatches, arg); break;
 		  case 'o':
 			arg >> opt::outputPrefix; break;
+		  case 'p':
+			opt::altPathsMode = true; break;
 		  case 'P':
 			setMaxOption(opt::maxPaths, arg); break;
 		  case 'q':
 			arg >> opt::qualityThreshold; break;
+		  case 'Q':
+			arg >> opt::correctedQual; break;
 		  case 'r':
 			arg >> opt::readName; break;
 		  case 's':
 			opt::searchMem = SIToBytes(arg); break;
 		  case 't':
 			arg >> opt::tracefilePath; break;
+		  case 'x':
+			arg >> opt::minReadIdentity; break;
+		  case 'X':
+			arg >> opt::minPathIdentity; break;
 		  case 'v':
 			opt::verbose++; break;
+		  case OPT_FASTQ:
+			opt::fastq = true; break;
 		  case OPT_HELP:
 			cout << USAGE_MESSAGE;
 			exit(EXIT_SUCCESS);
+		  case OPT_PRESERVE_READS:
+			opt::preserveReads = true; break;
 		  case OPT_VERSION:
 			cout << VERSION_MESSAGE;
 			exit(EXIT_SUCCESS);
@@ -848,6 +1065,15 @@ int main(int argc, char** argv)
 	seqanTests();
 #endif
 
+	/*
+	 * We need to set a default quality score offset
+	 * in order to generate quality scores
+	 * for bases that are corrected/inserted by
+	 * konnector (--fastq option).
+	 */
+	if (opt::qualityOffset == 0)
+		opt::qualityOffset = 33;
+
 	assert(opt::bloomSize > 0);
 
 	if (opt::dupBloomSize > 0)
@@ -925,10 +1151,11 @@ int main(int argc, char** argv)
 	 */
 
 	string mergedOutputPath(opt::outputPrefix);
-	if (opt::extend)
-		mergedOutputPath.append("_pseudoreads.fa");
+	mergedOutputPath.append("_pseudoreads");
+	if (opt::fastq)
+		mergedOutputPath.append(".fq");
 	else
-		mergedOutputPath.append("_merged.fa");
+		mergedOutputPath.append(".fa");
 	ofstream mergedStream(mergedOutputPath.c_str());
 	assert_good(mergedStream, mergedOutputPath);
 
@@ -939,18 +1166,12 @@ int main(int argc, char** argv)
 	 */
 
 	string read1OutputPath(opt::outputPrefix);
-	if (opt::extend)
-		read1OutputPath.append("_reads_1.fa");
-	else
-		read1OutputPath.append("_reads_1.fq");
+	read1OutputPath.append("_reads_1.fq");
 	ofstream read1Stream(read1OutputPath.c_str());
 	assert_good(read1Stream, read1OutputPath);
 
 	string read2OutputPath(opt::outputPrefix);
-	if (opt::extend)
-		read2OutputPath.append("_reads_2.fa");
-	else
-		read2OutputPath.append("_reads_2.fq");
+	read2OutputPath.append("_reads_2.fq");
 	ofstream read2Stream(read2OutputPath.c_str());
 	assert_good(read2Stream, read2OutputPath);
 
@@ -964,10 +1185,13 @@ int main(int argc, char** argv)
 	params.maxPaths = opt::maxPaths;
 	params.maxBranches = opt::maxBranches;
 	params.maxPathMismatches = opt::maxMismatches;
+	params.minPathIdentity = opt::minPathIdentity;
 	params.maxReadMismatches = opt::maxReadMismatches;
+	params.minReadIdentity = opt::minReadIdentity;
 	params.kmerMatchesThreshold = 3;
 	params.fixErrors = opt::fixErrors;
 	params.maskBases = opt::mask;
+	params.preserveReads = opt::preserveReads;
 	params.memLimit = opt::searchMem;
 	params.dotPath = opt::dotPath;
 	params.dotStream = opt::dotPath.empty() ? NULL : &dotStream;
@@ -1037,9 +1261,9 @@ int main(int argc, char** argv)
 				<< "%)\n";
 			if (opt::extend) {
 				cerr << "Unmerged reads corrected/extended: "
-					<< g_count.singleEndCorrected
+					<< g_count.singleEndExtended
 					<< " (" << setprecision(3) <<  (float)100
-					* g_count.singleEndCorrected / ((g_count.readPairsProcessed -
+					* g_count.singleEndExtended / ((g_count.readPairsProcessed -
 					g_count.uniquePath - g_count.multiplePaths) * 2)
 					<< "%)\n";
 			}
diff --git a/Konnector/konnector.h b/Konnector/konnector.h
index e78fac6..2325af9 100644
--- a/Konnector/konnector.h
+++ b/Konnector/konnector.h
@@ -26,7 +26,16 @@ struct ConnectPairsResult
 	unsigned k;
 	std::string readNamePrefix;
 	PathSearchResult pathResult;
+	/** alternate connecting sequence(s) for read pair */
+	std::vector<Sequence> connectingSeqs;
+	/** read pairs joined with alternate connecting sequence(s) */
 	std::vector<FastaRecord> mergedSeqs;
+	/** consensus sequence for alternate connecting sequences */
+	Sequence consensusConnectingSeq;
+	/**
+	 * consensus sequence for read pairs joined by
+	 * alternate connecting sequences
+	 */
 	FastaRecord consensusSeq;
 	bool foundStartKmer;
 	bool foundGoalKmer;
@@ -37,7 +46,9 @@ struct ConnectPairsResult
 	unsigned maxDepthVisitedForward;
 	unsigned maxDepthVisitedReverse;
 	unsigned pathMismatches;
+	float pathIdentity;
 	unsigned readMismatches;
+	float readIdentity;
 	size_t memUsage;
 
 	ConnectPairsResult() :
@@ -52,7 +63,9 @@ struct ConnectPairsResult
 		maxDepthVisitedForward(0),
 		maxDepthVisitedReverse(0),
 		pathMismatches(0),
+		pathIdentity(0.0f),
 		readMismatches(0),
+		readIdentity(0.0f),
 		memUsage(0)
 	{}
 
@@ -70,7 +83,9 @@ struct ConnectPairsResult
 			<< "max_depth_forward" << "\t"
 			<< "max_depth_reverse" << "\t"
 			<< "path_mismatches" << "\t"
+			<< "path_identity" << "\t"
 			<< "read_mismatches" << "\t"
+			<< "read_identity" << "\t"
 			<< "mem_usage" << "\n";
 		return out;
 	}
@@ -105,7 +120,9 @@ struct ConnectPairsResult
 			<< o.maxDepthVisitedForward << "\t"
 			<< o.maxDepthVisitedReverse << "\t"
 			<< o.pathMismatches << "\t"
+			<< std::setprecision(3) << o.pathIdentity << "\t"
 			<< o.readMismatches << "\t"
+			<< std::setprecision(3) << o.readIdentity << "\t"
 			<< o.memUsage << "\n";
 
 		return out;
@@ -119,10 +136,13 @@ struct ConnectPairsParams {
 	unsigned maxPaths;
 	unsigned maxBranches;
 	unsigned maxPathMismatches;
+	float minPathIdentity;
 	unsigned maxReadMismatches;
+	float minReadIdentity;
 	unsigned kmerMatchesThreshold;
 	bool fixErrors;
 	bool maskBases;
+	bool preserveReads;
 	size_t memLimit;
 	std::string dotPath;
 	std::ofstream* dotStream;
@@ -133,10 +153,13 @@ struct ConnectPairsParams {
 		maxPaths(NO_LIMIT),
 		maxBranches(NO_LIMIT),
 		maxPathMismatches(NO_LIMIT),
+		minPathIdentity(0.0f),
 		maxReadMismatches(NO_LIMIT),
+		minReadIdentity(0.0f),
 		kmerMatchesThreshold(1),
 		fixErrors(false),
 		maskBases(false),
+		preserveReads(false),
 		memLimit(std::numeric_limits<std::size_t>::max()),
 		dotStream(NULL)
 	{}
@@ -230,10 +253,10 @@ static inline ConnectPairsResult connectPairs(
 	const unsigned numMatchesThreshold = 3;
 
 	unsigned startKmerPos = getStartKmerPos(read1, k, FORWARD, g,
-		numMatchesThreshold);
+		numMatchesThreshold, params.preserveReads);
 
 	unsigned goalKmerPos = getStartKmerPos(read2, k, FORWARD, g,
-		numMatchesThreshold);
+		numMatchesThreshold, params.preserveReads);
 
 	const FastaRecord* pRead1 = &read1;
 	const FastaRecord* pRead2 = &read2;
@@ -298,48 +321,101 @@ static inline ConnectPairsResult connectPairs(
 	result.maxDepthVisitedReverse = visitor.getMaxDepthVisited(REVERSE);
 	result.memUsage = visitor.approxMemUsage();
 
-	// write traversal graph to dot file (-d option)
-
 	if (result.pathResult == FOUND_PATH) {
 
-		// build sequences for connecting paths
-
-		std::string seqPrefix = pRead1->seq.substr(0, startKmerPos);
-		std::string seqSuffix = reverseComplement(pRead2->seq.substr(0, goalKmerPos));
-		for (unsigned i = 0; i < paths.size(); i++) {
-			FastaRecord mergedSeq;
-			std::stringstream index;
-			index << i;
-			assert(index);
-			mergedSeq.id = result.readNamePrefix + "_" + index.str();
-			mergedSeq.seq = seqPrefix + pathToSeq(paths[i]) + seqSuffix;
-			result.mergedSeqs.push_back(mergedSeq);
+		/* build sequences for connecting paths */
+
+		std::string seqPrefix, seqSuffix;
+
+		if (params.preserveReads) {
+			seqPrefix = pRead1->seq;
+			seqSuffix = reverseComplement(pRead2->seq);
+			unsigned trimLeft = pRead1->seq.length() - startKmerPos;
+			unsigned trimRight = pRead2->seq.length() - goalKmerPos;
+			for (unsigned i = 0; i < paths.size(); i++) {
+				Sequence connectingSeq = pathToSeq(paths[i]);
+				/*
+				 * If the input reads overlap, we must fail because
+				 * there's no way to preserve the original read
+				 * sequences in the merged read (the reads may disagree
+				 * in the region of overlap)
+				 */
+				if (trimLeft + trimRight > connectingSeq.length()) {
+					result.pathResult = NO_PATH;
+					return result;
+				}
+				connectingSeq = connectingSeq.substr(trimLeft,
+					connectingSeq.length() - trimLeft - trimRight);
+				result.connectingSeqs.push_back(connectingSeq);
+			}
+		} else {
+			seqPrefix = pRead1->seq.substr(0, startKmerPos);
+			seqSuffix = reverseComplement(pRead2->seq.substr(0, goalKmerPos));
+			for (unsigned i = 0; i < paths.size(); i++)
+				result.connectingSeqs.push_back(pathToSeq(paths[i]));
 		}
 
-		// calc consensus seq and mismatch stats
+		unsigned readPairLength = read1.seq.length() + read2.seq.length();
 
 		if (paths.size() == 1) {
 
+			/* found a unique path between the reads */
+
+			FastaRecord mergedSeq;
+			mergedSeq.id = result.readNamePrefix;
+			mergedSeq.seq = seqPrefix + result.connectingSeqs.front() + seqSuffix;
 			result.readMismatches =
-				maskNew(read1, read2, result.mergedSeqs.front(), params.maskBases);
+				maskNew(read1, read2, mergedSeq, params.maskBases);
+			result.pathIdentity = 100.0f;
+			result.readIdentity = 100.0f * (float)(readPairLength -
+				result.readMismatches) / readPairLength;
+
+			result.mergedSeqs.push_back(mergedSeq);
+			result.consensusSeq = mergedSeq;
+			result.consensusConnectingSeq = result.connectingSeqs.front();
 
 		} else {
 
+			/*
+			 * multiple paths were found, so build a consensus
+			 * sequence using multiple sequence alignment.
+			 */
+
 			NWAlignment aln;
 			unsigned matches, size;
-			boost::tie(matches, size) = align(result.mergedSeqs, aln);
+			boost::tie(matches, size) = align(result.connectingSeqs, aln);
 			assert(size >= matches);
 			result.pathMismatches = size - matches;
-
+			result.consensusConnectingSeq = aln.match_align;
+			result.pathIdentity = 100.0f *
+				(float)(result.consensusConnectingSeq.length()
+				- result.pathMismatches) / result.consensusConnectingSeq.length();
 			result.consensusSeq.id = result.readNamePrefix;
-			result.consensusSeq.seq = aln.match_align;
+			result.consensusSeq.seq = seqPrefix + result.consensusConnectingSeq +
+				seqSuffix;
 			result.readMismatches =
 				maskNew(read1, read2, result.consensusSeq, params.maskBases);
+			result.readIdentity = 100.0f * (float)(readPairLength -
+				result.readMismatches) / readPairLength;
+
+			unsigned i = 1;
+			for (std::vector<Sequence>::iterator it = result.connectingSeqs.begin();
+				it != result.connectingSeqs.end(); ++it) {
+				FastaRecord mergedSeq;
+				std::ostringstream id;
+				id << result.readNamePrefix << '_' << i++;
+				mergedSeq.id = id.str();
+				mergedSeq.seq = seqPrefix + *it + seqSuffix;
+				result.mergedSeqs.push_back(mergedSeq);
+			}
 
 		}
 
+		assert(result.connectingSeqs.size() == result.mergedSeqs.size());
 	}
 
+	/* write traversal graph to dot file (-d option) */
+
 	if (!params.dotPath.empty()) {
 		HashGraph<Kmer> traversalGraph;
 		visitor.getTraversalGraph(traversalGraph);
@@ -369,7 +445,8 @@ static inline Kmer getHeadKmer(const Sequence& seq, Direction dir,
 template <typename Graph>
 static inline bool extendSeqThroughBubble(Sequence& seq,
 	Direction dir, unsigned startKmerPos, unsigned k,
-	const Graph& g, unsigned trimLen=0, bool maskNew=false)
+	const Graph& g, unsigned trimLen=0, bool maskNew=false,
+	bool preserveSeq=false)
 {
 	assert(seq.length() >= k);
 	assert(dir == FORWARD || dir == REVERSE);
@@ -388,6 +465,11 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
 		return false;
 	}
 
+	std::string headKmer = seq.substr(startKmerPos, k);
+	if (headKmer.find_first_not_of("AGCTagct") !=
+		std::string::npos)
+		return false;
+
 	Kmer head(seq.substr(startKmerPos, k));
 	std::vector<Kmer> buds = trueBranches(head, dir, g, trimLen);
 
@@ -406,8 +488,14 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
 		path1.push_back(head);
 		path2.push_back(head);
 	}
-	extendPath(path1, dir, g, trimLen, k+2);
-	extendPath(path2, dir, g, trimLen, k+2);
+
+	ExtendPathParams params;
+	params.trimLen = trimLen;
+	params.maxLen = k + 2;
+	params.lookBehind = true;
+
+	extendPath(path1, dir, g, params);
+	extendPath(path2, dir, g, params);
 
 	/* paths lengths not k+1 -- not a simple bubble */
 	if (path1.size() != k+2 || path2.size() != k+2)
@@ -432,10 +520,33 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
 	Sequence& consensus = alignment.match_align;
 
 	if (dir == FORWARD) {
-		overlaySeq(consensus, seq, startKmerPos, maskNew);
+		if (preserveSeq) {
+			/*
+			 * make sure bubble extends beyond end of
+			 * original sequence
+			 */
+			assert(startKmerPos + consensus.length()
+				> seq.length());
+			overlaySeq(consensus.substr(seq.length() - startKmerPos),
+				seq, seq.length(), maskNew);
+		} else {
+			overlaySeq(consensus, seq, startKmerPos, maskNew);
+		}
 	} else {
-		overlaySeq(consensus, seq,
-			-consensus.length() + startKmerPos + k, maskNew);
+		if (preserveSeq) {
+			/*
+			 * make sure bubble extends beyond end of
+			 * original sequence
+			 */
+			assert(consensus.length() > startKmerPos + k);
+			consensus = consensus.substr(0,
+				consensus.length() - startKmerPos - k);
+			overlaySeq(consensus, seq, -consensus.length(),
+				maskNew);
+		} else {
+			overlaySeq(consensus, seq,
+				-consensus.length() + startKmerPos + k, maskNew);
+		}
 	}
 
 	return true;
@@ -530,7 +641,8 @@ template <typename Graph>
 static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
 	unsigned startKmerPos, unsigned k, const Graph& g,
 	unsigned maxLen=NO_LIMIT, unsigned trimLen=0,
-	bool maskNew=false)
+	bool maskNew=false, bool popBubbles=true,
+	bool preserveSeq=false)
 {
 	if (seq.length() < k)
 		return ES_NO_START_KMER;
@@ -587,8 +699,12 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
 				(int)(maxLen - startKmerPos - k + 1));
 		}
 
-		pathResult = extendPath(path, FORWARD, g, trimLen,
-			maxPathLen);
+		ExtendPathParams params;
+		params.trimLen = trimLen;
+		params.maxLen = maxPathLen;
+		params.lookBehind = false;
+
+		pathResult = extendPath(path, FORWARD, g, params);
 
 		/*
 		 * give up if we don't at extend beyond end
@@ -622,35 +738,46 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
 			pathResult == EXTENDED_TO_LENGTH_LIMIT))
 		{
 			std::string pathSeq = pathToSeq(path);
-			overlaySeq(pathSeq, seq, seq.length() - k + 1, maskNew);
+			if (preserveSeq)
+				overlaySeq(pathSeq.substr(k), seq,
+					seq.length(), maskNew);
+			else
+				overlaySeq(pathSeq, seq,
+					seq.length() - k + 1, maskNew);
 		}
 
 		/*
 		 * extend through simple bubbles
 		 */
 		done = true;
-		if (seq.length() < maxLen &&
+		if (popBubbles && seq.length() < maxLen &&
 			(pathResult == BRANCHING_POINT ||
 			pathResult == EXTENDED_TO_BRANCHING_POINT)) {
 			startKmerPos = startKmerPos + path.size() - 1;
 			assert(startKmerPos < seq.length() - k + 1);
 			if (extendSeqThroughBubble(seq, FORWARD, startKmerPos,
-				k, g, trimLen, maskNew)) {
+				k, g, trimLen, maskNew, preserveSeq)) {
 
 				/* make sure we don't exceed extension limit */
 				if (seq.length() > maxLen)
 					seq = seq.substr(0, maxLen);
 
 				/* check for cycle */
-				Path<Kmer> bubblePath = seqToPath(seq.substr(startKmerPos), k);
-				for (Path<Kmer>::iterator it = bubblePath.begin();
-					it != bubblePath.end(); ++it) {
-					if (visited.containsKmer(*it)) {
+				for (unsigned i = startKmerPos + 1;
+					i < seq.length() - k + 1; ++i) {
+					std::string kmerStr = seq.substr(i, k);
+					size_t pos = kmerStr.find_first_not_of("AGCTagct");
+					if (pos != std::string::npos) {
+						i += pos;
+						continue;
+					}
+					Kmer kmer(kmerStr);
+					if (visited.containsKmer(kmer)) {
 						pathResult = EXTENDED_TO_CYCLE;
-						bubblePath.erase(it, bubblePath.end());
+						seq.erase(i);
 						break;
 					}
-					visited.addKmer(*it);
+					visited.addKmer(kmer);
 				}
 
 				/* set up for another round of extension */
@@ -715,23 +842,15 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
 	return result;
 }
 
-/**
- * Correct the given sequence using the Bloom filter de Bruijn
- * graph.  The correction is performed by finding the longest
- * stretch of good kmers in the sequence and extending that
- * region both left and right.
- */
 template <typename Graph>
-static inline bool correctAndExtendSeq(Sequence& seq,
-	unsigned k, const Graph& g, unsigned maxLen=NO_LIMIT,
-	unsigned trimLen=0, bool maskNew=false)
+static inline bool trimRead(FastqRecord& read,
+	unsigned k, const Graph& g)
 {
+	Sequence& seq = read.seq;
+
 	if (seq.size() < k)
 		return false;
 
-	if (maxLen < seq.length())
-		maxLen = seq.length();
-
 	/*
 	 * find longest stretch of contiguous kmers
 	 * in de Bruijn graph
@@ -772,22 +891,8 @@ static inline bool correctAndExtendSeq(Sequence& seq,
 	assert(maxMatchStart != UNSET);
 	assert(maxMatchLen > 0);
 
-	unsigned maxMatchSeqLen = maxMatchLen+k-1;
-	unsigned seedSeqLen = std::min(2*k-1, maxMatchSeqLen);
-
-	Sequence correctedSeq = seq.substr(
-		maxMatchStart + maxMatchSeqLen - seedSeqLen,
-		std::string::npos);
-
-	extendSeq(correctedSeq, REVERSE, correctedSeq.length()-k, k, g, 2*k,
-		trimLen, maskNew);
-	if (correctedSeq.length() < 2*k)
-		return false;
-
-	correctedSeq = correctedSeq.substr(0, k);
-	extendSeq(correctedSeq, FORWARD, 0, k, g, 2*k+1, trimLen, maskNew);
-
-	seq = correctedSeq;
+	read.seq = read.seq.substr(maxMatchStart, maxMatchLen);
+	read.qual = read.qual.substr(maxMatchStart, maxMatchLen);
 	return true;
 }
 
diff --git a/LICENSE b/LICENSE
index f13b0d6..542147f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,24 +1,22 @@
 ABySS
-Copyright 2014 Canada's Michael Smith Genome Sciences Centre
+Copyright 2016 British Columbia Cancer Agency Branch
 
-You may use, redistribute and modify this software for non-commercial
-purposes under the terms of the GNU General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-
-To license ABySS for commercial purposes, please contact
-Patrick Rebstein <prebstein at bccancer.bc.ca>
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.
 
-The complete text of the GNU General Public License version 3 follows
-and is also available from the Free Software Foundation web site:
-http://www.gnu.org/licenses/
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+For commercial licensing options, please contact
+Patrick Rebstein <prebstein at bccancer.bc.ca>
 
-See the file `COPYRIGHT` for details of the copyright and license of
+See the file COPYRIGHT for details of the copyright and license of
 each individual file included with this software.
 
                     GNU GENERAL PUBLIC LICENSE
diff --git a/LogKmerCount/CountingBloomFilter.h b/LogKmerCount/CountingBloomFilter.h
index ee09ae0..3dbfea7 100644
--- a/LogKmerCount/CountingBloomFilter.h
+++ b/LogKmerCount/CountingBloomFilter.h
@@ -6,9 +6,9 @@
 #define COUNTINGBLOOMFILTER_H 1
 
 #include "Bloom/Bloom.h"
-#include <vector>
-#include <math.h>
 #include <cassert>
+#include <cmath>
+#include <vector>
 
 /** A counting Bloom filter. */
 template<typename NumericType>
diff --git a/LogKmerCount/plc.h b/LogKmerCount/plc.h
index 07fe2c1..b07a020 100644
--- a/LogKmerCount/plc.h
+++ b/LogKmerCount/plc.h
@@ -5,10 +5,10 @@
  * Exponent = 5 bits
  * Copyright 2014 bcgsc
  */
-#include <stdint.h>
-#include <stdlib.h>
-#include <math.h>
+#include <cmath>
+#include <cstdlib>
 #include <iostream>
+#include <stdint.h>
 
 using namespace std;
 
diff --git a/Makefile.am b/Makefile.am
index 6314e11..190a219 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -12,9 +12,13 @@ dist_doc_DATA = \
 	CITATION.bib CITATION.md \
 	COPYRIGHT \
 	LICENSE \
-	README.css README.html README.md
+	README.md
 
-EXTRA_DIST=doxygen.conf
+if HAVE_PANDOC
+dist_doc_DATA += README.html
+endif
+
+EXTRA_DIST=autogen.sh doxygen.conf
 
 SUBDIRS = \
 	bin \
@@ -29,6 +33,7 @@ SUBDIRS = \
 	Align \
 	ABYSS $(Parallel) \
 	Bloom \
+	BloomDBG \
 	Konnector \
 	Consensus \
 	DAssembler \
@@ -51,11 +56,13 @@ SUBDIRS = \
 	GapFiller \
 	Sealer \
 	AdjList \
+	lib/bloomfilter \
+	lib/rolling-hash \
 	$(GTest) \
 	$(UnitTest)
 
-%.html: %.md
-	-multimarkdown $< >$@
+%.html: $(srcdir)/%.md
+	-pandoc -s -o $@ $<
 
 clean-local:
 	rm -f README.html
diff --git a/ParseAligns/abyss-fixmate.cc b/ParseAligns/abyss-fixmate.cc
index 0c97f92..39eee77 100644
--- a/ParseAligns/abyss-fixmate.cc
+++ b/ParseAligns/abyss-fixmate.cc
@@ -17,7 +17,6 @@
 #include <boost/unordered_map.hpp>
 #include "DataBase/Options.h"
 #include "DataBase/DB.h"
-#include <math.h>
 
 using namespace std;
 
diff --git a/README.css b/README.css
deleted file mode 100644
index 2ed6ef2..0000000
--- a/README.css
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Font */
-body { font: 12pt Georgia, Palatino, Times, serif; }
-h1, h2, h3, h4 {
-	font-family: Verdana, Helvetica, Arial, sans-serif;
-	font-weight: normal;
-}
-h1 { font-size: 18pt; }
-h2, h3, h4 { font-size: 14pt; }
-a { text-decoration: none; }
-code { font: 12pt Courier, monospace; }
-pre { font: 12pt Courier, monospace; }
-
-/* Colour and border */
-a {
-	color: #222222;
-	border-bottom: 1pt dashed #888888;
-}
-a:hover {
-	color: #ffffff;
-	background: #222222;
-}
-pre {
-	background-color: #dddddd;
-	border: #777777 1pt solid;
-}
-
-/* Layout */
-p {
-	text-align: justify;
-	min-width: 18pc;
-	max-width: 42pc;
-}
-pre {
-	word-wrap: break-word;
-	max-width: 42pc;
-	margin: 1pc;
-	padding-left: 1pc;
-	padding-right: 1pc;
-}
diff --git a/README.md b/README.md
index ed06ef3..4765389 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,3 @@
-Title: ABySS README
-Author: Shaun Jackman, Anthony Raymond
-Affiliation: Canada's Michael Smith Genome Sciences Centre
-CSS: README.css
-
 ABySS
 =====
 
@@ -22,6 +17,7 @@ Contents
 * [Assembling multiple libraries](#assembling-multiple-libraries)
 * [Scaffolding](#scaffolding)
 * [Rescaffolding with long sequences](#rescaffolding-with-long-sequences)
+* [Assembling using a Bloom filter de Bruijn graph](#assembling-using-a-bloom-filter-de-bruijn-graph)
 * [Assembling using a paired de Bruijn graph](#assembling-using-a-paired-de-bruijn-graph)
 * [Assembling a strand-specific RNA-Seq library](#assembling-a-strand-specific-rna-seq-library)
 * [Optimizing the parameter k](#optimizing-the-parameter-k)
@@ -69,9 +65,10 @@ Dependencies
 
 ABySS requires the following libraries:
 
-* [Boost](http://www.boost.org)
-* [sparsehash](http://code.google.com/p/sparsehash)
+* [Boost](http://www.boost.org/)
 * [Open MPI](http://www.open-mpi.org)
+* [sparsehash](https://code.google.com/p/sparsehash/)
+* [SQLite](http://www.sqlite.org/)
 
 ABySS requires a C++ compiler that supports
 [OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
@@ -137,6 +134,11 @@ usage, although it will build without. sparsehash should be found in
 
 	./configure CPPFLAGS=-I/usr/local/include
 
+If SQLite is installed in non-default directories, its location can be
+specified to `configure`:
+
+	./configure --with-sqlite=/opt/sqlite3
+
 The default maximum k-mer size is 64 and may be decreased to reduce
 memory usage or increased at compile time. This value must be a
 multiple of 32 (i.e. 32, 64, 96, 128, etc):
@@ -187,22 +189,23 @@ single-end assembly must be well over the fragment-size to obtain an
 accurate empirical distribution.
 
 Here's an example scenario of assembling a data set with two different
-fragment libraries and single-end reads:
+fragment libraries and single-end reads. Note that the names of the libraries
+(`pea` and `peb`) are arbitrary.
 
- * Library `pe200` has reads in two files,
-   `pe200_1.fa` and `pe200_2.fa`.
- * Library `pe500` has reads in two files,
-   `pe500_1.fa` and `pe500_2.fa`.
+ * Library `pea` has reads in two files,
+   `pea_1.fa` and `pea_2.fa`.
+ * Library `peb` has reads in two files,
+   `peb_1.fa` and `peb_2.fa`.
  * Single-end reads are stored in two files, `se1.fa` and `se2.fa`.
 
 The command line to assemble this example data set is:
 
-	abyss-pe k=64 name=ecoli lib='pe200 pe500' \
-		pe200='pe200_1.fa pe200_2.fa' pe500='pe500_1.fa pe500_2.fa' \
+	abyss-pe k=64 name=ecoli lib='pea peb' \
+		pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
 		se='se1.fa se2.fa'
 
 The empirical distribution of fragment sizes will be stored in two
-files named `pe200-3.hist` and `pe500-3.hist`. These files may be
+files named `pea-3.hist` and `peb-3.hist`. These files may be
 plotted to check that the empirical distribution agrees with the
 expected distribution. The assembled contigs will be stored in
 `${name}-contigs.fa`.
@@ -214,11 +217,12 @@ Long-distance mate-pair libraries may be used to scaffold an assembly.
 Specify the names of the mate-pair libraries using the parameter `mp`.
 The scaffolds will be stored in the file `${name}-scaffolds.fa`.
 Here's an example of assembling a data set with two paired-end
-libraries and two mate-pair libraries:
+libraries and two mate-pair libraries. Note that the names of the libraries
+(`pea`, `peb`, `mpa`, `mpb`) are arbitrary.
 
-	abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' \
-		pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
-		mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa'
+	abyss-pe k=64 name=ecoli lib='pea peb' mp='mpc mpd' \
+		pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
+		mpc='mpc_1.fa mpc_2.fa' mpd='mpd_1.fa mpd_2.fa'
 
 The mate-pair libraries are used only for scaffolding and do not
 contribute towards the consensus sequence.
@@ -233,12 +237,32 @@ can be linked unambiguously when considering all BWA-MEM alignments.
 
 Similar to scaffolding, the names of the datasets can be specified with
 the `long` parameter. These scaffolds will be stored in the file
-`${name}-trans-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly:
+`${name}-trans-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly. Note that the names of the libraries are arbitrary.
 
-	abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long=long1 \
+	abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
 		pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
 		mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa' \
-		long1=long1.fa
+		longa='longa.fa'
+
+Assembling using a Bloom filter de Bruijn graph
+=========================================
+
+Assemblies may be performed using a _Bloom filter de Bruijn graph_, which
+typically reduces memory requirements by an order of magnitude. In order to
+assemble in Bloom filter mode, the user must specify 3 additional parameters:
+`B` (Bloom filter size), `H` (number of Bloom filter hash functions), and `kc`
+(minimum k-mer count threshold). Valid size units for the `B` parameter are 'k',
+'M', 'G'. If no unit is specified, bytes are assumed. For example, the following
+will run an E. coli assembly with a Bloom filter size of 100 MB, 3 hash
+functions, a minimum k-mer count threshold of 3, and verbose logging:
+
+	abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
+
+At the current time, the user must calculate suitable values for `B` and `H` on
+their own, and finding the best value for `kc` may require experimentation
+(optimal values are typically in the range of 2-4). Users are recommended to
+target a Bloom filter false positive rate (FPR) that is less than 5%, as
+reported by the assembly log when using the `v=-v` option (verbose level 1).
 
 Assembling using a paired de Bruijn graph
 =========================================
@@ -265,7 +289,7 @@ Assembling a strand-specific RNA-Seq library
 ============================================
 
 Strand-specific RNA-Seq libraries can be assembled such that the
-resulting unitigs, conitgs and scaffolds are oriented correctly with
+resulting unitigs, contigs and scaffolds are oriented correctly with
 respect to the original transcripts that were sequenced. In order to
 run ABySS in strand-specific mode, the `SS` parameter must be used as
 in the following example:
@@ -281,18 +305,17 @@ Optimizing the parameter k
 
 To find the optimal value of `k`, run multiple assemblies and inspect
 the assembly contiguity statistics. The following shell snippet will
-assemble for every value of `k` from 20 to 40.
+assemble for every eighth value of `k` from 50 to 90.
 
-	export k
-	for k in {20..40}; do
+	for k in `seq 50 8 90`; do
 		mkdir k$k
-		abyss-pe -C k$k name=ecoli in=../reads.fa
+		abyss-pe -C k$k name=ecoli k=$k in=../reads.fa
 	done
 	abyss-fac k*/ecoli-contigs.fa
 
-The default maximum value for `k` is 64. This limit may be changed at
+The default maximum value for `k` is 96. This limit may be changed at
 compile time using the `--enable-maxk` option of configure. It may be
-decreased to 32 to decrease memory usage or increased to 96.
+decreased to 32 to decrease memory usage or increased to larger values.
 
 Parallel processing
 ===================
@@ -321,12 +344,11 @@ ABySS integrates well with cluster job schedulers, such as:
  * Load Sharing Facility (LSF)
  * IBM LoadLeveler
 
-For example, to submit an array of jobs to assemble every odd value of
-`k` between 51 and 63 using 64 processes for each job:
+For example, to submit an array of jobs to assemble every eighth value of
+`k` between 50 and 90 using 64 processes for each job:
 
-	mkdir k{51..63}
-	qsub -N ecoli -pe openmpi 64 -t 51-63:2 \
-		<<<'abyss-pe -C k$SGE_TASK_ID in=/data/reads.fa'
+	qsub -N ecoli -pe openmpi 64 -t 50-90:8 \
+		<<<'mkdir k$SGE_TASK_ID && abyss-pe -C k$SGE_TASK_ID in=/data/reads.fa'
 
 Using the DIDA alignment framework
 =================================
@@ -346,34 +368,41 @@ Assembly Parameters
 Parameters of the driver script, `abyss-pe`
 
  * `a`: maximum number of branches of a bubble [`2`]
- * `b`: maximum length of a bubble (bp) [`10000`]
+ * `b`: maximum length of a bubble (bp) [`""`]
+ * `B`: Bloom filter size (e.g. "100M")
  * `c`: minimum mean k-mer coverage of a unitig [`sqrt(median)`]
  * `d`: allowable error of a distance estimate (bp) [`6`]
- * `e`: minimum erosion k-mer coverage [`sqrt(median)`]
- * `E`: minimum erosion k-mer coverage per strand [`1`]
+ * `e`: minimum erosion k-mer coverage [`round(sqrt(median))`]
+ * `E`: minimum erosion k-mer coverage per strand [1 if sqrt(median) > 2 else 0]
+ * `G`: genome size, used to calculate NG50 [disabled]
+ * `H`: number of Bloom filter hash functions [1]
  * `j`: number of threads [`2`]
  * `k`: size of k-mer (when `K` is not set) or the span of a k-mer pair (when `K` is set)
+ * `kc`: minimum k-mer count threshold for Bloom filter assembly [`2`]
  * `K`: the length of a single k-mer in a k-mer pair (bp)
- * `l`: minimum alignment length of a read (bp) [`k`]
+ * `l`: minimum alignment length of a read (bp) [`40`]
  * `m`: minimum overlap of two unitigs (bp) [`30`]
  * `n`: minimum number of pairs required for building contigs [`10`]
  * `N`: minimum number of pairs required for building scaffolds [`n`]
+ * `np`: number of MPI processes [`1`]
  * `p`: minimum sequence identity of a bubble [`0.9`]
  * `q`: minimum base quality [`3`]
- * `s`: minimum unitig size required for building contigs (bp) [`200`]
- * `S`: minimum contig size required for building scaffolds (bp) [`s`]
- * `t`: minimum tip size (bp) [`2k`]
+ * `s`: minimum unitig size required for building contigs (bp) [`1000`]
+ * `S`: minimum contig size required for building scaffolds (bp) [`1000-10000`]
+ * `t`: maximum length of blunt contigs to trim [`k`]
  * `v`: use `v=-v` for verbose logging, `v=-vv` for extra verbose [`disabled`]
+ * `x`: spaced seed (Bloom filter assembly only)
 
 Please see the
 [abyss-pe](http://manpages.ubuntu.com/abyss-pe.1.html)
 manual page for more information on assembly parameters.
 
-Possibly, `abyss-pe` parameters can have same names as existing environment variables'. The parameters then cannot be used until the environment variables are unset. To detect such occasions, run the command:
+Environment variables
+=====================
 
-	abyss-pe env [options]
+`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=20`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
 
-Above command will report all `abyss-pe` parameters that are set from various origins. However it will not operate ABySS programs.
+	abyss-pe env [options]
 
 ABySS programs
 ==============
@@ -408,23 +437,12 @@ ABySS programs
  * `abyss-scaffold`: scaffold contigs using distance estimates
  * `abyss-todot`: convert graph formats and merge graphs
 
-For a flowchart showing the relationship between these programs,
-see doc/flowchart.pdf.
+This [flowchart](https://github.com/bcgsc/abyss/blob/master/doc/flowchart.pdf) shows the ABySS assembly pipeline and its intermediate files.
 
 Export to SQLite Database
 =========================
 
-ABySS has a built-in support for SQLite database. With this option activated, it exports log values into a SQLite file and/or `.csv` files at runtime.
-
-## Activating the functionality
-
-Download SQLite [here](http://www.sqlite.org/download.html) and install. (See [Quick Start](#quick-start) for details)
-
-To compile ABySS with SQLite, add configure flag `--with-sqlite` to the steps in [Compiling ABySS from GiHub](#compiling-abyss-from-github) / [Compiling ABySS from source](#compiling-abyss-from-source).
-
-	./configure [other options] --with-sqlite=/path/to/sqlite3/
-	make
-	sudo make install
+ABySS has built-in support for a SQLite database to export log values into a SQLite file and/or `.csv` files at runtime.
 
 ## Database parameters
 Of `abyss-pe`:
@@ -433,9 +451,9 @@ Of `abyss-pe`:
  * `strain`: name of strain to archive [ ]
  * `library`: name of library to archive [ ]
 
-For example, to export data of species 'Ecoli', strain 'O121' and library 'pe200' into your SQLite database repository named '/abyss/test.sqlite':
+For example, to export data of species 'Ecoli', strain 'O121' and library 'pea' into your SQLite database repository named '/abyss/test.sqlite':
 
-	abyss-pe db=/abyss/test.sqlite species=Ecoli strain=O121 library=pe200 [other options]
+	abyss-pe db=/abyss/test.sqlite species=Ecoli strain=O121 library=pea [other options]
 
 ## Helper programs
 Found in your `path`:
@@ -461,7 +479,7 @@ Publications
 ## [ABySS](http://genome.cshlp.org/content/19/6/1117)
 
 Simpson, Jared T., Kim Wong, Shaun D. Jackman, Jacqueline E. Schein,
-Steven JM Jones, and İnanç Birol.
+Steven JM Jones, and Inanc Birol.
 **ABySS: a parallel assembler for short read sequence data**.
 *Genome research* 19, no. 6 (2009): 1117-1123.
 [doi:10.1101/gr.089532.108](http://dx.doi.org/10.1101/gr.089532.108)
@@ -488,6 +506,8 @@ Support
 [Ask a question](https://www.biostars.org/p/new/post/?tag_val=abyss,assembly)
 on [Biostars](https://www.biostars.org/t/abyss/).
 
+[Create a new issue](https://github.com/bcgsc/abyss/issues) on GitHub.
+
 Subscribe to the
 [ABySS mailing list]
 (http://groups.google.com/group/abyss-users),
@@ -501,15 +521,11 @@ For questions related to transcriptome assembly, contact the
 Authors
 =======
 
-- **[Shaun Jackman](http://sjackman.ca)**
-  — [GitHub/sjackman](https://github.com/sjackman)
-  — [@sjackman](https://twitter.com/sjackman)
-- **Tony Raymond** — [GitHub/traymond](https://github.com/traymond)
-- **Ben Vandervalk** — [GitHub/benvvalk ](https://github.com/benvvalk)
-- **Jared Simpson** — [GitHub/jts](https://github.com/jts)
-
-Supervised by [**Dr. İnanç Birol**](http://www.bcgsc.ca/faculty/inanc-birol).
++ **[Shaun Jackman](http://sjackman.ca)** - [GitHub/sjackman](https://github.com/sjackman) - [@sjackman](https://twitter.com/sjackman)
++ **Tony Raymond** - [GitHub/traymond](https://github.com/traymond)
++ **Ben Vandervalk** - [GitHub/benvvalk ](https://github.com/benvvalk)
++ **Jared Simpson** - [GitHub/jts](https://github.com/jts)
 
-Copyright 2014 Canada's Michael Smith Genome Sciences Centre
+Supervised by [**Dr. Inanc Birol**](http://www.bcgsc.ca/faculty/inanc-birol).
 
-[![githalytics.com](https://cruel-carlota.pagodabox.com/af4811df3b40b7d096f6085db2969f0e "githalytics.com")](http://githalytics.com/sjackman/abyss)
+Copyright 2016 Canada's Michael Smith Genome Sciences Centre
diff --git a/Scaffold/drawgraph.cc b/Scaffold/drawgraph.cc
index 6d39178..6b155e4 100644
--- a/Scaffold/drawgraph.cc
+++ b/Scaffold/drawgraph.cc
@@ -293,7 +293,7 @@ int main(int argc, char** argv)
 	for (tie(uit, ulast) = vertices(g); uit != ulast; ++uit) {
 		V u = *uit;
 		size_t ui = get(vertex_index, g, u);
-		double x1 = isnan(b[ui]) ? 0 : b[ui];
+		double x1 = std::isnan(b[ui]) ? 0 : b[ui];
 		sorted.push_back(std::make_pair(x1, u));
 	}
 	sort(sorted.begin(), sorted.end());
diff --git a/Scaffold/scaffold.cc b/Scaffold/scaffold.cc
index d1ef9f0..e021b0f 100644
--- a/Scaffold/scaffold.cc
+++ b/Scaffold/scaffold.cc
@@ -58,6 +58,8 @@ static const char USAGE_MESSAGE[] =
 "          or -s N0-N1   Find the value of s in [N0,N1]\n"
 "                        that maximizes the scaffold N50.\n"
 "  -k, --kmer=N          length of a k-mer\n"
+"  -G, --genome-size=N   expected genome size. Used to calculate NG50\n"
+"                        and associated stats [disabled]\n"
 "      --min-gap=N       minimum scaffold gap length to output [50]\n"
 "      --max-gap=N       maximum scaffold gap length to output [inf]\n"
 "      --complex         remove complex transitive edges\n"
@@ -89,6 +91,9 @@ namespace opt {
 	static unsigned minContigLength = 200;
 	static unsigned minContigLengthEnd;
 
+	/** Genome size. Used to calculate NG50. */
+	static long long unsigned genomeSize;
+
 	/** Minimum scaffold gap length to output. */
 	static int minGap = 50;
 
@@ -115,7 +120,7 @@ namespace opt {
 	static int comp_trans;
 }
 
-static const char shortopts[] = "g:k:n:o:s:v";
+static const char shortopts[] = "G:g:k:n:o:s:v";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
 	OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
@@ -124,6 +129,7 @@ enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
 static const struct option longopts[] = {
 	{ "graph",       no_argument,       NULL, 'g' },
 	{ "kmer",        required_argument, NULL, 'k' },
+	{ "genome-size", required_argument, NULL, 'G' },
 	{ "min-gap",     required_argument, NULL, OPT_MIN_GAP },
 	{ "max-gap",     required_argument, NULL, OPT_MAX_GAP },
 	{ "npairs",      required_argument, NULL, 'n' },
@@ -736,7 +742,7 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
 		static bool printHeader = true;
 		Histogram h = buildScaffoldLengthHistogram(g, paths);
 		printContiguityStats(cerr, h, STATS_MIN_LENGTH,
-				printHeader)
+				printHeader, "\t", opt::genomeSize)
 			<< "\ts=" << minContigLength << '\n';
 		if (opt::verbose == 0)
 			printHeader = false;
@@ -765,7 +771,9 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
 
 	// Print assembly contiguity statistics.
 	Histogram h = buildScaffoldLengthHistogram(g, paths);
-	printContiguityStats(cerr, h, STATS_MIN_LENGTH) << '\n';
+	printContiguityStats(cerr, h, STATS_MIN_LENGTH,
+			true, "\t", opt::genomeSize)
+		<< "\ts=" << minContigLength << '\n';
 	addCntgStatsToDb(h, STATS_MIN_LENGTH);
 	return h.trimLow(STATS_MIN_LENGTH).n50();
 }
@@ -787,6 +795,13 @@ int main(int argc, char** argv)
 		  case 'k':
 			arg >> opt::k;
 			break;
+		  case 'G':
+			{
+				double x;
+				arg >> x;
+				opt::genomeSize = x;
+				break;
+			}
 		  case 'g':
 			arg >> opt::graphPath;
 			break;
diff --git a/Sealer/Makefile.am b/Sealer/Makefile.am
index 30704e1..eca0b31 100644
--- a/Sealer/Makefile.am
+++ b/Sealer/Makefile.am
@@ -26,4 +26,4 @@ abyss_sealer_SOURCES = sealer.cc \
 
 # Convert the README.md to a man page using Pandoc
 abyss-sealer.1: README.md
-	-pandoc -s -o $@ $<
+	pandoc -s -o $@ $<
diff --git a/Sealer/README.md b/Sealer/README.md
index 190aa86..d1aab26 100644
--- a/Sealer/README.md
+++ b/Sealer/README.md
@@ -15,11 +15,11 @@ abyss-sealer - Close gaps within scaffolds
 Synopsis
 ================================================================================
 
-`abyss-sealer -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...`
+`abyss-sealer -b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...`
 
 For example:
 
-`abyss-sealer -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa`
+`abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa`
 
 Description
 ===========
@@ -34,7 +34,7 @@ See ABySS installation instructions.
 How to run as stand-alone application
 =====================================
 
-`abyss-sealer [-k values...] [-o outputprefix] [-S assembly file] [options...] [reads...]`
+`abyss-sealer [-b bloom filter size][-k values...] [-o outputprefix] [-S assembly file] [options...] [reads...]`
 
 Sealer requires the following information to run:
 - draft assembly
@@ -47,13 +47,17 @@ Sample commands
 
 Without pre-built bloom filters:
 
-`abyss-sealer -k90 -k80 -o run1 -S test.fa read1.fa.gz read2.fa.gz`
+`abyss-sealer -b20G -k90 -k80 -o run1 -S test.fa read1.fa.gz read2.fa.gz`
 
 With pre-built bloom filters:
 
 `abyss-sealer -k90 -k80 -o run1 -S test.fa -i k90.bloom -i k80.bloom read1.fa.gz read2.fa.gz`
 
-Note: when using pre-built bloom filters, Sealer must be compiled with the same `maxk` value that the bloom filter was built with. For example, if a bloom filter was built with a `maxk`of 64, Sealer must be compiled with a `maxk` of 64 as well. If different values are used between the pre-built bloom filter and Sealer, any sequences generated will be nonsensical and incorrect.
+Reusable Bloom filters can be pre-built with `abyss-bloom build`, e.g.:
+
+`abyss-bloom build -vv -k90 -j12 -b20G -l2 k90.bloom read1.fa.gz read2.fa.gz`
+
+Note: when using pre-built bloom filters generated by `abyss-bloom build`, Sealer must be compiled with the same `maxk` value that `abyss-bloom` was compiled with. For example, if a Bloom filter was built with a `maxk` of 64, Sealer must be compiled with a `maxk` of 64 as well. If different values are used between the pre-built bloom filter and Sealer, any sequences generated will be nonsensical and incorrect.
 
 Suggested parameters for first run
 ==================================
@@ -141,7 +145,7 @@ Parameters of `abyss-sealer`
 * `-D`,`--flank-distance=N`: distance of flank from gap [0]
 * `-j`,`--threads=N`: use N parallel threads [1]
 * `-k`,`--kmer=N`: the size of a k-mer
-* `-b`,`--bloom-size=N`: size of bloom filter [500M]
+* `-b`,`--bloom-size=N`: size of bloom filter. Required when not using pre-built Bloom filter(s).
 * `-B`,`--max-branches=N`: max branches in de Bruijn graph traversal; use 'nolimit' for no limit [1000]
 * `-d`,`--dot-file=FILE`: write graph traversals to a DOT file
 * `-e`,`--fix-errors`: find and fix single-base errors when reads have no kmers in bloom filter [disabled]
diff --git a/Sealer/sealer.cc b/Sealer/sealer.cc
index 420deeb..9c439a7 100644
--- a/Sealer/sealer.cc
+++ b/Sealer/sealer.cc
@@ -60,10 +60,10 @@ PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
 "Copyright 2014 Canada's Michael Smith Genome Science Centre\n";
 
 static const char USAGE_MESSAGE[] =
-"Usage: " PROGRAM " -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
-"i.e. abyss-sealer -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
+"Usage: " PROGRAM " -b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
+"i.e. abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
 "Close gaps by using left and right flanking sequences of gaps as 'reads' for Konnector\n"
-"and performing multiple runs with each of the supplied K values..\n"
+"and performing multiple runs with each of the supplied K values.\n"
 "\n"
 " Options:\n"
 "\n"
@@ -71,16 +71,18 @@ static const char USAGE_MESSAGE[] =
 "  -S, --input-scaffold=FILE    load scaffold from FILE\n"
 "  -L, --flank-length=N         length of flanks to be used as pseudoreads [100]\n"
 "  -D, --flank-distance=N       distance of flank from gap [0]\n"
+"  -G, --max-gap-length=N       max gap size to fill in bp [800]; runtime increases\n"
+"                               exponentially with respect to this parameter\n"
 "  -j, --threads=N              use N parallel threads [1]\n"
 "  -k, --kmer=N                 the size of a k-mer\n"
-"  -b, --bloom-size=N           size of bloom filter [500M]\n"
+"  -b, --bloom-size=N           size of Bloom filter (e.g. '40G'). Required\n"
+"                               when not using pre-built Bloom filter(s)\n"
+"                               (-i option)\n"
 "  -B, --max-branches=N         max branches in de Bruijn graph traversal;\n"
 "                               use 'nolimit' for no limit [1000]\n"
 "  -d, --dot-file=FILE          write graph traversals to a DOT file\n"
 "  -e, --fix-errors             find and fix single-base errors when reads\n"
 "                               have no kmers in bloom filter [disabled]\n"
-"  -f, --min-frag=N             min fragment size in base pairs [0]\n"
-"  -F, --max-frag=N             max fragment size in base pairs [1000]\n"
 "  -i, --input-bloom=FILE       load bloom filter from FILE\n"
 "      --mask                   mask new and changed bases as lower case\n"
 "      --no-mask                do not mask bases [default]\n"
@@ -113,6 +115,22 @@ static const char USAGE_MESSAGE[] =
 "      --help                   display this help and exit\n"
 "      --version                output version information and exit\n"
 "\n"
+" Deprecated Options:\n"
+"\n"
+"  -f, --min-frag=N             min fragment size in base pairs\n"
+"  -F, --max-frag=N             max fragment size in base pairs\n"
+"\n"
+"   Note: --max-frag was formerly used to determine the maximum gap\n"
+"   size that abyss-sealer would attempt to close, according to the formula\n"
+"   max_gap_size = max_frag - 2 * flank_length, where flank_length is\n"
+"   determined by the -L option.  --max-frag is kept only for backwards\n"
+"   compatibility and is superseded by the more intuitive -G (--max-gap-length)\n"
+"   option. Similarly, --min-frag determines the minimum gap size to close,\n"
+"   according to the formula min_gap_size = min_frag - 2 * flank_length, where\n"
+"   a negative gap size indicates an overlap between gap flanks.  Normally the\n"
+"   user would not want to specify a minimum gap size and so it is recommended to\n"
+"   leave --min-frag unset.\n"
+"\n"
 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
 
 namespace opt {
@@ -123,6 +141,9 @@ namespace opt {
 	/** Distance of flank from gap. */
 	unsigned flankDistance = 0;
 
+	/** Max gap size to fill */
+	unsigned maxGapLength = 800;
+
 	/** scaffold file input. */
 	static string inputScaffold;
 
@@ -130,7 +151,7 @@ namespace opt {
 	static unsigned threads = 1;
 
 	/** The size of the bloom filter in bytes. */
-	size_t bloomSize = 500 * 1024 * 1024;
+	size_t bloomSize = 0;
 
 	/** The maximum count value of the BLoom filter. */
 	unsigned max_count = 2;
@@ -163,7 +184,7 @@ namespace opt {
 	unsigned minFrag = 0;
 
 	/** The maximum fragment size */
-	unsigned maxFrag = 1000;
+	unsigned maxFrag = 0;
 
 	/** Bloom filter input file */
 	static string inputBloomPath;
@@ -218,7 +239,7 @@ struct Counters {
 	size_t skipped;
 };
 
-static const char shortopts[] = "S:L:D:b:B:d:ef:F:i:Ij:k:lm:M:no:P:q:r:s:t:v";
+static const char shortopts[] = "S:L:D:b:B:d:ef:F:G:i:Ij:k:lm:M:no:P:q:r:s:t:v";
 
 enum { OPT_HELP = 1, OPT_VERSION };
 
@@ -228,6 +249,7 @@ static const struct option longopts[] = {
 	{ "input-scaffold",   required_argument, NULL, 'S' },
 	{ "flank-length",     required_argument, NULL, 'L' },
 	{ "flank-distance",   required_argument, NULL, 'D' },
+	{ "max-gap-length",   required_argument, NULL, 'G' },
 	{ "bloom-size",       required_argument, NULL, 'b' },
 	{ "max-branches",     required_argument, NULL, 'B' },
 	{ "dot-file",         required_argument, NULL, 'd' },
@@ -266,7 +288,7 @@ struct Coord
 {
 	int start;
 	int end;
-	
+
 	Coord() { }
 	Coord(int start, int end) : start(start), end(end) { }
 
@@ -701,6 +723,8 @@ int main(int argc, char** argv)
 			arg >> opt::flankLength; break;
 		  case 'D':
 			arg >> opt::flankDistance; break;
+		  case 'G':
+			arg >> opt::maxGapLength; break;
 		  case 'b':
 			opt::bloomSize = SIToBytes(arg); break;
 		  case 'B':
@@ -769,6 +793,14 @@ int main(int argc, char** argv)
 		}
 	}
 
+	/* translate --max-frag to --max-gap-length for backwards compatibility */
+	if (opt::maxFrag > 0) {
+		if ((int)opt::maxFrag < 2 * opt::flankLength)
+			opt::maxGapLength = 0;
+		else
+			opt::maxGapLength = opt::maxFrag - 2 * opt::flankLength;
+	}
+
 	if (opt::inputScaffold.empty()) {
 		cerr << PROGRAM ": missing mandatory option `-S'\n";
 		die = true;
@@ -779,14 +811,36 @@ int main(int argc, char** argv)
 		die = true;
 	}
 
+	if (opt::bloomFilterPaths.size() < opt::kvector.size()
+		&& opt::bloomSize == 0)
+	{
+		cerr << PROGRAM ": missing mandatory option `-b' (Bloom filter size)\n"
+			<< "Here are some guidelines for sizing the Bloom filter:\n"
+			<< "  * E. coli (~5 Mbp genome), 615X coverage: -b500M\n"
+			<< "  * S. cerevisiae (~12 Mbp genome), 25X coverage: -b500M\n"
+			<< "  * C. elegans (~100 Mbp genome), 89X coverage: -b1200M\n"
+			<< "  * H. sapiens (~3 Gbp genome), 71X coverage: -b40G\n";
+		die = true;
+	}
+
 	if (opt::outputPrefix.empty()) {
 		cerr << PROGRAM ": missing mandatory option `-o'\n";
 		die = true;
 	}
 
-	if (argc - optind < 1) {
+	if (opt::bloomFilterPaths.size() > opt::kvector.size()) {
+		cerr << PROGRAM ": you must specify a k-mer size (-k) for each Bloom "
+			"filter file (-i)\n";
+		die = true;
+	} else if (opt::bloomFilterPaths.size() < opt::kvector.size()
+		&& argc - optind < 1) {
 		cerr << PROGRAM ": missing input file arguments\n";
 		die = true;
+	} else if (opt::bloomFilterPaths.size() == opt::kvector.size()
+		&& argc - optind > 0) {
+		cerr << PROGRAM ": input FASTA/FASTQ args should be omitted when using "
+			"pre-built Bloom filters (-i) for all k-mer sizes\n";
+		die = true;
 	}
 
 	if (die) {
@@ -806,8 +860,6 @@ int main(int argc, char** argv)
 	seqanTests();
 #endif
 
-	assert(opt::bloomSize > 0);
-
 	ofstream dotStream;
 	if (!opt::dotPath.empty()) {
 		if (opt::verbose)
@@ -848,7 +900,7 @@ int main(int argc, char** argv)
 	ConnectPairsParams params;
 
 	params.minMergedSeqLen = opt::minFrag;
-	params.maxMergedSeqLen = opt::maxFrag;
+	params.maxMergedSeqLen = opt::maxGapLength + 2 * opt::flankLength;
 	params.maxPaths = opt::maxPaths;
 	params.maxBranches = opt::maxBranches;
 	params.maxPathMismatches = opt::maxMismatches;
@@ -893,12 +945,12 @@ int main(int argc, char** argv)
 		map<FastaRecord, Gap>::iterator read2_it;
 
 		string read1OutputPath(opt::outputPrefix);
-		read1OutputPath.append("_flanks_1.fq");
+		read1OutputPath.append("_flanks_1.fa");
 		ofstream read1Stream(read1OutputPath.c_str());
 		assert_good(read1Stream, read1OutputPath);
 
 		string read2OutputPath(opt::outputPrefix);
-		read2OutputPath.append("_flanks_2.fq");
+		read2OutputPath.append("_flanks_2.fa");
 		ofstream read2Stream(read2OutputPath.c_str());
 		assert_good(read2Stream, read2OutputPath);
 
@@ -934,7 +986,7 @@ int main(int argc, char** argv)
 		BloomFilter* bloom;
 		CascadingBloomFilter* cascadingBloom = NULL;
 
-		if (!opt::bloomFilterPaths.empty() && i <= opt::bloomFilterPaths.size()) {
+		if (!opt::bloomFilterPaths.empty() && i < opt::bloomFilterPaths.size()) {
 
 			temp = "Loading bloom filter from `" + opt::bloomFilterPaths.at(i) + "'...\n";
 			printLog(logStream, temp);
@@ -963,6 +1015,12 @@ int main(int argc, char** argv)
 			bloom = &cascadingBloom->getBloomFilter(opt::max_count - 1);
 		}
 
+		assert(bloom != NULL);
+
+		if (opt::verbose)
+			cerr << "Bloom filter FPR: " << setprecision(3)
+				<< 100 * bloom->FPR() << "%\n";
+
 		DBGBloom<BloomFilter> g(*bloom);
 
 		temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
diff --git a/SimpleGraph/SimpleGraph.cpp b/SimpleGraph/SimpleGraph.cpp
index 373ec70..0a9ebd3 100644
--- a/SimpleGraph/SimpleGraph.cpp
+++ b/SimpleGraph/SimpleGraph.cpp
@@ -647,7 +647,7 @@ static void* worker(void* pArg)
 		static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
 		pthread_mutex_lock(&inMutex);
 		EstimateRecord er;
-		bool good = (*arg.in) >> er;
+		bool good = bool((*arg.in) >> er);
 		pthread_mutex_unlock(&inMutex);
 		if (!good)
 			break;
diff --git a/Unittest/BloomDBG/BloomDBGTest.cpp b/Unittest/BloomDBG/BloomDBGTest.cpp
new file mode 100644
index 0000000..155ae38
--- /dev/null
+++ b/Unittest/BloomDBG/BloomDBGTest.cpp
@@ -0,0 +1,155 @@
+#include "Common/Sequence.h"
+#include "BloomDBG/bloom-dbg.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/RollingBloomDBG.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+using namespace std;
+typedef RollingBloomDBG<BloomFilter> Graph;
+typedef graph_traits<Graph> GraphTraits;
+
+/* each vertex is represented by
+ * std::pair<MaskedKmer, vector<size_t>>, where 'MaskedKmer' is the
+ * k-mer and 'vector<size_t>' is the associated set of
+ * hash values */
+typedef graph_traits<Graph>::vertex_descriptor V;
+
+/** Convert a path in the de Bruijn graph to a sequence */
+TEST(BloomDBG, pathToSeq)
+{
+	const string inputSeq = "ACGTAC";
+	const string spacedSeed = "10001";
+	const unsigned k = 5;
+	const unsigned numHashes = 2;
+
+	MaskedKmer::setLength(k);
+	MaskedKmer::setMask(spacedSeed);
+
+	Path<BloomDBG::Vertex> path =
+		BloomDBG::seqToPath(inputSeq, k, numHashes);
+	ASSERT_EQ(2U, path.size());
+
+	string outputSeq = BloomDBG::pathToSeq(path, k);
+	ASSERT_EQ("ACNNAC", outputSeq);
+}
+
+/** Split a sequence at branching k-mers in the de Bruijn graph */
+TEST(BloomDBG, splitSeq)
+{
+	const size_t bloomSize = 100000;
+	const unsigned k = 5;
+	const unsigned numHashes = 2;
+	const unsigned minBranchLen = 1;
+	size_t hashes[MAX_HASHES];
+
+	/* it is important to reset these, since they persist between tests */
+	MaskedKmer::setLength(k);
+	MaskedKmer::mask().clear();
+
+	/*
+	 * Test graph (k=5):
+	 *
+	 *   GACTC-ACTCG-CTCGG
+	 *
+	 * Input sequence (horizontal path above):
+	 *
+	 *   GACTCGG
+	 */
+
+	BloomFilter bloom1(bloomSize, numHashes, k);
+
+	RollingHash("GACTC", numHashes, k).getHashes(hashes);
+	bloom1.insert(hashes);
+	RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+	bloom1.insert(hashes);
+	RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+	bloom1.insert(hashes);
+
+	Sequence seq1 = "GACTCGG";
+
+	Graph graph1(bloom1);
+	vector<Sequence> segments1 = BloomDBG::splitSeq(seq1, k,
+		numHashes, graph1, minBranchLen);
+
+	V GACTC(V("GACTC", RollingHash("GACTC", numHashes, k)));
+
+	ASSERT_EQ(1U, out_degree(GACTC, graph1));
+	ASSERT_EQ(1U, segments1.size());
+	ASSERT_EQ("GACTCGG", segments1.front());
+
+	/*
+	 * Test graph (k=5):
+	 *
+	 *         ACTCT
+	 *        /
+	 *   GACTC-ACTCG-CTCGG
+	 *              /
+	 *         TCTCG
+	 *
+	 * Input sequence (horizontal path above):
+	 *
+	 *   GACTCGG
+	 */
+
+	BloomFilter bloom2(bloomSize, numHashes, k);
+
+	RollingHash("GACTC", numHashes, k).getHashes(hashes);
+	bloom2.insert(hashes);
+	RollingHash("ACTCT", numHashes, k).getHashes(hashes);
+	bloom2.insert(hashes);
+	RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+	bloom2.insert(hashes);
+	RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+	bloom2.insert(hashes);
+	RollingHash("TCTCG", numHashes, k).getHashes(hashes);
+	bloom2.insert(hashes);
+
+	Sequence seq2 = "GACTCGG";
+
+	Graph graph2(bloom2);
+	vector<Sequence> segments2 = BloomDBG::splitSeq(seq2, k,
+		numHashes, graph2, minBranchLen);
+
+	ASSERT_EQ(3U, segments2.size());
+	ASSERT_EQ("GACTC", segments2.at(0));
+	ASSERT_EQ("GACTCGG", segments2.at(1));
+	ASSERT_EQ("CTCGG", segments2.at(2));
+
+	/*
+	 * Test graph (k=5):
+	 *
+	 *   TACTC       CTCGA
+	 *        \     /
+	 *   GACTC-ACTCG-CTCGG
+	 *
+	 * Input sequence (horizontal path above):
+	 *
+	 *   ACTCG
+	 */
+
+	BloomFilter bloom3(bloomSize, numHashes, k);
+
+	/* note: insert into bloom3 (the filter under test), not bloom2 */
+	RollingHash("TACTC", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("GACTC", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("CTCGA", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+
+	Sequence seq3 = "ACTCG";
+
+	Graph graph3(bloom3);
+	vector<Sequence> segments3 = BloomDBG::splitSeq(seq3, k,
+		numHashes, graph3, minBranchLen);
+
+	ASSERT_EQ(1U, segments3.size());
+	ASSERT_EQ("ACTCG", segments3.front());
+}
diff --git a/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp b/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp
new file mode 100644
index 0000000..5acfb53
--- /dev/null
+++ b/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp
@@ -0,0 +1,46 @@
+#include "BloomDBG/RollingHashIterator.h"
+#include "BloomDBG/HashAgnosticCascadingBloom.h"
+
+#include <gtest/gtest.h>
+
+using namespace std;
+typedef uint64_t hash_t;
+
+TEST(HashAgnosticCascadingBloom, base)
+{
+	const unsigned bloomSize = 1000;
+	const unsigned numHashes = 1;
+	const unsigned numLevels = 2;
+	const unsigned k = 16;
+
+	HashAgnosticCascadingBloom x(bloomSize, numHashes, numLevels, k);
+	EXPECT_EQ(x.size(), bloomSize);
+
+	const char* a = "AGATGTGCTGCCGCCT";
+	const char* b = "TGGACAGCGTTACCTC";
+	const char* c = "TAATAACAGTCCCTAT";
+	const char* d = "GATCGTGGCGGGCGAT";
+
+	RollingHashIterator itA(a, numHashes, k);
+	RollingHashIterator itB(b, numHashes, k);
+	RollingHashIterator itC(c, numHashes, k);
+	RollingHashIterator itD(d, numHashes, k);
+	size_t hash;
+
+	x.insert(*itA);
+	EXPECT_EQ(x.popcount(), 0U);
+	EXPECT_FALSE(x.contains(&hash));
+	x.insert(*itA);
+	EXPECT_EQ(x.popcount(), 1U);
+	EXPECT_TRUE(x.contains(*itA));
+	x.insert(*itB);
+	EXPECT_EQ(x.popcount(), 1U);
+	EXPECT_FALSE(x.contains(*itB));
+	x.insert(*itC);
+	EXPECT_EQ(x.popcount(), 1U);
+	EXPECT_FALSE(x.contains(*itC));
+	x.insert(*itB);
+	EXPECT_EQ(x.popcount(), 2U);
+	EXPECT_TRUE(x.contains(*itB));
+	EXPECT_FALSE(x.contains(*itD));
+}
diff --git a/Unittest/BloomDBG/MaskedKmerTest.cpp b/Unittest/BloomDBG/MaskedKmerTest.cpp
new file mode 100644
index 0000000..441954e
--- /dev/null
+++ b/Unittest/BloomDBG/MaskedKmerTest.cpp
@@ -0,0 +1,26 @@
+#include "BloomDBG/MaskedKmer.h"
+
+#include <gtest/gtest.h>
+
+using namespace std;
+
+TEST(MaskedKmerTest, trivialMask)
+{
+	MaskedKmer::setLength(4);
+
+	MaskedKmer kmer1("ACGT");
+	MaskedKmer kmer2("ACGT");
+
+	ASSERT_EQ(kmer1, kmer2);
+}
+
+TEST(MaskedKmerTest, nonTrivialMask)
+{
+	MaskedKmer::setLength(4);
+	MaskedKmer::setMask("1001");
+
+	MaskedKmer kmer1("ACGT");
+	MaskedKmer kmer2("ATTT");
+
+	ASSERT_EQ(kmer1, kmer2);
+}
diff --git a/Unittest/BloomDBG/RollingBloomDBGTest.cpp b/Unittest/BloomDBG/RollingBloomDBGTest.cpp
new file mode 100644
index 0000000..39bf88d
--- /dev/null
+++ b/Unittest/BloomDBG/RollingBloomDBGTest.cpp
@@ -0,0 +1,275 @@
+#include "BloomDBG/RollingBloomDBG.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+#include "Common/UnorderedSet.h"
+
+#include <gtest/gtest.h>
+#include <string>
+
+using namespace std;
+using namespace boost;
+
+typedef RollingBloomDBG<BloomFilter> Graph;
+typedef graph_traits<Graph> GraphTraits;
+typedef graph_traits<Graph>::vertex_descriptor V;
+
+/** Test fixture for RollingBloomDBG tests. */
+class RollingBloomDBGTest : public ::testing::Test
+{
+protected:
+
+	const unsigned m_k;
+	const unsigned m_bloomSize;
+	const unsigned m_numHashes;
+	BloomFilter m_bloom;
+	Graph m_graph;
+
+	RollingBloomDBGTest() : m_k(5), m_bloomSize(100000), m_numHashes(2),
+		m_bloom(m_bloomSize, m_numHashes, m_k), m_graph(m_bloom)
+	{
+		MaskedKmer::setLength(m_k);
+
+		/*
+		 * Test de Bruijn graph:
+		 *
+		 *  CGACT       ACTCT
+		 *       \     /
+		 *        GACTC
+		 *       /     \
+		 *  TGACT       ACTCG
+		 *
+		 * Note: No unexpected edges
+		 * are created by the reverse
+		 * complements of these k-mers.
+		 */
+
+		size_t hashes[MAX_HASHES];
+		RollingHash("CGACT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("TGACT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("GACTC", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("ACTCT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("ACTCG", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+	}
+
+};
+
+TEST_F(RollingBloomDBGTest, out_edge_iterator)
+{
+	/* TEST: check that "GACTC" has the expected outgoing edges */
+
+	const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+	const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+	const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+	unordered_set<V> expectedNeighbours;
+	expectedNeighbours.insert(ACTCT);
+	expectedNeighbours.insert(ACTCG);
+
+	ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+	GraphTraits::out_edge_iterator ei, ei_end;
+	boost::tie(ei, ei_end) = out_edges(GACTC, m_graph);
+	ASSERT_NE(ei_end, ei);
+	unordered_set<V>::iterator neighbour =
+		expectedNeighbours.find(target(*ei, m_graph));
+	EXPECT_NE(expectedNeighbours.end(), neighbour);
+	expectedNeighbours.erase(neighbour);
+	ei++;
+	ASSERT_NE(ei_end, ei);
+	neighbour = expectedNeighbours.find(target(*ei, m_graph));
+	ASSERT_NE(expectedNeighbours.end(), neighbour);
+	ei++;
+	ASSERT_EQ(ei_end, ei);
+}
+
+TEST_F(RollingBloomDBGTest, adjacency_iterator)
+{
+	/* TEST: check that "GACTC" has the expected outgoing edges */
+
+	const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+	const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+	const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+	unordered_set<V> expectedNeighbours;
+	expectedNeighbours.insert(ACTCT);
+	expectedNeighbours.insert(ACTCG);
+
+	ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+	GraphTraits::adjacency_iterator ai, ai_end;
+	boost::tie(ai, ai_end) = adjacent_vertices(GACTC, m_graph);
+	ASSERT_NE(ai_end, ai);
+	unordered_set<V>::iterator neighbour =
+		expectedNeighbours.find(*ai);
+	EXPECT_NE(expectedNeighbours.end(), neighbour);
+	expectedNeighbours.erase(neighbour);
+	ai++;
+	ASSERT_NE(ai_end, ai);
+	neighbour = expectedNeighbours.find(*ai);
+	ASSERT_NE(expectedNeighbours.end(), neighbour);
+	ai++;
+	ASSERT_EQ(ai_end, ai);
+}
+
+TEST_F(RollingBloomDBGTest, in_edges)
+{
+	/* TEST: check that "GACTC" has the expected ingoing edges */
+
+	const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+	const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));
+	const V TGACT("TGACT", RollingHash("TGACT", m_numHashes, m_k));
+
+	unordered_set<V> expectedNeighbours;
+	expectedNeighbours.insert(CGACT);
+	expectedNeighbours.insert(TGACT);
+
+	ASSERT_EQ(2u, in_degree(GACTC, m_graph));
+	GraphTraits::in_edge_iterator ei, ei_end;
+	boost::tie(ei, ei_end) = in_edges(GACTC, m_graph);
+	ASSERT_NE(ei_end, ei);
+	unordered_set<V>::iterator neighbour =
+		expectedNeighbours.find(source(*ei, m_graph));
+	EXPECT_NE(expectedNeighbours.end(), neighbour);
+	expectedNeighbours.erase(neighbour);
+	ei++;
+	ASSERT_NE(ei_end, ei);
+	neighbour = expectedNeighbours.find(source(*ei, m_graph));
+	ASSERT_NE(expectedNeighbours.end(), neighbour);
+	ei++;
+	ASSERT_EQ(ei_end, ei);
+}
+
+TEST_F(RollingBloomDBGTest, pathTraversal)
+{
+	/*
+	 * Walk a simple path:
+	 *
+	 * CGACT-GACTC-ACTCG
+	 */
+
+	BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
+	Graph graph(bloom);
+
+	const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));
+	const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+	const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+	size_t hashes[MAX_HASHES];
+	CGACT.rollingHash().getHashes(hashes);
+	bloom.insert(hashes);
+	GACTC.rollingHash().getHashes(hashes);
+	bloom.insert(hashes);
+	ACTCG.rollingHash().getHashes(hashes);
+	bloom.insert(hashes);
+
+	/* step one */
+
+	V v = CGACT;
+    ASSERT_EQ(1u, out_degree(v, graph));
+	GraphTraits::out_edge_iterator ei, ei_end;
+	boost::tie(ei, ei_end) = out_edges(v, graph);
+	ASSERT_NE(ei_end, ei);
+	ASSERT_EQ(CGACT, source(*ei, graph));
+	ASSERT_EQ(GACTC, target(*ei, graph));
+	v = target(*ei, graph);
+	++ei;
+	ASSERT_EQ(ei_end, ei);
+
+	/* step two */
+
+    ASSERT_EQ(1u, out_degree(v, graph));
+	boost::tie(ei, ei_end) = out_edges(v, graph);
+	ASSERT_NE(ei_end, ei);
+	ASSERT_EQ(GACTC, source(*ei, graph));
+	ASSERT_EQ(ACTCG, target(*ei, graph));
+	v = target(*ei, graph);
+	++ei;
+	ASSERT_EQ(ei_end, ei);
+}
+
+/** Test fixture for RollingBloomDBG with spaced seed k-mers. */
+class RollingBloomDBGSpacedSeedTest : public ::testing::Test
+{
+protected:
+
+	const unsigned m_k;
+	const unsigned m_bloomSize;
+	const unsigned m_numHashes;
+	BloomFilter m_bloom;
+	Graph m_graph;
+	const std::string m_spacedSeed;
+
+	RollingBloomDBGSpacedSeedTest() : m_k(5), m_bloomSize(100000), m_numHashes(1),
+		m_bloom(m_bloomSize, m_numHashes, m_k), m_graph(m_bloom),
+		m_spacedSeed("11011")
+	{
+		MaskedKmer::setLength(m_k);
+		MaskedKmer::setMask(m_spacedSeed);
+
+		/*
+		 * Test de Bruijn graph:
+		 *
+		 *  CGACT       ACTCT
+		 *       \     /
+		 *        GACTC
+		 *       /     \
+		 *  TGACT       ACTCG
+		 *
+		 * Masked version:
+		 *
+		 *  CG_CT       AC_CT
+		 *       \     /
+		 *        GA_TC
+		 *       /     \
+		 *  TG_CT       AC_CG
+		 *
+		 * Note: With respect to the spaced seed "11011",
+		 * GACTC is equivalent to its own reverse complement
+		 * GAGTC.  However, this does not result in
+		 * any additional edges in the graph.
+		 */
+
+		size_t hashes[MAX_HASHES];
+		RollingHash("CGACT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("TGACT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("GACTC", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("ACTCT", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+		RollingHash("ACTCG", m_numHashes, m_k).getHashes(hashes);
+		m_bloom.insert(hashes);
+	}
+
+};
+
+TEST_F(RollingBloomDBGSpacedSeedTest, out_edge_iterator)
+{
+	/* TEST: check that "GACTC" has the expected outgoing edges */
+
+	const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+	const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+	const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+	unordered_set<V> expectedNeighbours;
+	expectedNeighbours.insert(ACTCT);
+	expectedNeighbours.insert(ACTCG);
+
+	ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+	GraphTraits::out_edge_iterator ei, ei_end;
+	boost::tie(ei, ei_end) = out_edges(GACTC, m_graph);
+	ASSERT_NE(ei_end, ei);
+	unordered_set<V>::iterator neighbour =
+		expectedNeighbours.find(target(*ei, m_graph));
+	EXPECT_NE(expectedNeighbours.end(), neighbour);
+	expectedNeighbours.erase(neighbour);
+	ei++;
+	ASSERT_NE(ei_end, ei);
+	neighbour = expectedNeighbours.find(target(*ei, m_graph));
+	ASSERT_NE(expectedNeighbours.end(), neighbour);
+	ei++;
+	ASSERT_EQ(ei_end, ei);
+}
diff --git a/Unittest/BloomDBG/RollingHashIteratorTest.cpp b/Unittest/BloomDBG/RollingHashIteratorTest.cpp
new file mode 100644
index 0000000..f2272be
--- /dev/null
+++ b/Unittest/BloomDBG/RollingHashIteratorTest.cpp
@@ -0,0 +1,116 @@
+#include "BloomDBG/RollingHashIterator.h"
+
+#include <gtest/gtest.h>
+#include <string>
+
+using namespace std;
+
+TEST(RollingHashIterator, reverseComplement)
+{
+	const unsigned k = 6;
+	const unsigned numHashes = 1;
+	const char* seq = "GCAATGT";
+	const char* rcSeq = "ACATTGC";
+
+	/** hash forward sequence */
+
+	RollingHashIterator it(seq, numHashes, k);
+	size_t kmer1Hash, kmer2Hash;
+	kmer1Hash = (*it)[0];
+	++it;
+	kmer2Hash = (*it)[0];
+	++it;
+	ASSERT_EQ(RollingHashIterator::end(), it);
+
+	/** hash reverse complement sequence */
+
+	RollingHashIterator rcIt(rcSeq, numHashes, k);
+	size_t rcKmer1Hash, rcKmer2Hash;
+	rcKmer2Hash = (*rcIt)[0];
+	++rcIt;
+	rcKmer1Hash = (*rcIt)[0];
+	++rcIt;
+	ASSERT_EQ(RollingHashIterator::end(), rcIt);
+
+	/** check hash values are the same for forward and reverse complement */
+
+	ASSERT_EQ(kmer1Hash, rcKmer1Hash);
+	ASSERT_EQ(kmer2Hash, rcKmer2Hash);
+}
+
+TEST(RollingHashIterator, badKmers)
+{
+	const unsigned k = 3;
+	const unsigned numHashes = 1;
+
+	/* skip bad k-mers in middle of sequence */
+
+	const char* seq = "AAANAAA";
+	RollingHashIterator it(seq, numHashes, k);
+	ASSERT_EQ(0u, it.pos());
+	++it;
+	ASSERT_EQ(4u, it.pos());
+	++it;
+	ASSERT_EQ(RollingHashIterator::end(), it);
+
+	/* all bad k-mers */
+
+	const char* seq2 = "NNNNNNN";
+	RollingHashIterator it2(seq2, numHashes, k);
+	ASSERT_EQ(RollingHashIterator::end(), it2);
+}
+
+TEST(RollingHashIterator, seqShorterThanK)
+{
+	const unsigned k = 5;
+	const unsigned numHashes = 1;
+	const char* seq = "ACGT";
+
+	RollingHashIterator it(seq, numHashes, k);
+	ASSERT_EQ(RollingHashIterator::end(), it);
+}
+
+TEST(RollingHashIterator, emptySeq)
+{
+	const unsigned k = 3;
+	const unsigned numHashes = 1;
+	const char* seq = "";
+
+	RollingHashIterator it(seq, numHashes, k);
+	ASSERT_EQ(RollingHashIterator::end(), it);
+}
+
+TEST(RollingHashIterator, spacedSeed)
+{
+	const unsigned k = 5;
+	const unsigned numHashes = 1;
+	const char* seq = "AGNNGC";
+	const char* rcSeq = "GCNNCT";
+	Kmer::setLength(k);
+	MaskedKmer::setMask("10001");
+
+	/** hash forward sequence */
+
+	RollingHashIterator it(seq, numHashes, k);
+	size_t kmer1Hash, kmer2Hash;
+	kmer1Hash = (*it)[0];
+	++it;
+	kmer2Hash = (*it)[0];
+	++it;
+	ASSERT_EQ(RollingHashIterator::end(), it);
+
+	/** hash reverse complement sequence */
+
+	RollingHashIterator rcIt(rcSeq, numHashes, k);
+	size_t rcKmer1Hash, rcKmer2Hash;
+	rcKmer2Hash = (*rcIt)[0];
+	++rcIt;
+	rcKmer1Hash = (*rcIt)[0];
+	++rcIt;
+	ASSERT_EQ(RollingHashIterator::end(), rcIt);
+
+	/** check hash values are the same for forward and reverse complement */
+
+	ASSERT_EQ(kmer1Hash, rcKmer1Hash);
+	ASSERT_EQ(kmer2Hash, rcKmer2Hash);
+}
diff --git a/Unittest/BloomDBG/RollingHashTest.cpp b/Unittest/BloomDBG/RollingHashTest.cpp
new file mode 100644
index 0000000..4895213
--- /dev/null
+++ b/Unittest/BloomDBG/RollingHashTest.cpp
@@ -0,0 +1,195 @@
+#include "BloomDBG/RollingHash.h"
+
+#include <gtest/gtest.h>
+#include <string>
+#include <algorithm>
+#include "boost/dynamic_bitset.hpp"
+
+using namespace std;
+using namespace boost;
+
+/** test fixture for RollingHash tests */
+class RollingHashTest : public ::testing::Test
+{
+protected:
+
+	const unsigned m_numHashes;
+	const unsigned m_k;
+	const string m_kmerMask;
+
+	RollingHashTest() : m_numHashes(2), m_k(4)
+	{
+		Kmer::setLength(m_k);
+	}
+};
+
+TEST_F(RollingHashTest, kmerMask)
+{
+	MaskedKmer::setMask("1001");
+	RollingHash kmer1Hash("GCCG", m_numHashes, m_k);
+	RollingHash kmer2Hash("GTTG", m_numHashes, m_k);
+	ASSERT_EQ(kmer1Hash, kmer2Hash);
+}
+
+TEST_F(RollingHashTest, rollRight)
+{
+	MaskedKmer::mask().clear();
+	RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	leftKmerHash.rollRight("GACG", 'T');
+	ASSERT_EQ(middleKmerHash, leftKmerHash);
+	leftKmerHash.rollRight("ACGT", 'C');
+	ASSERT_EQ(rightKmerHash, leftKmerHash);
+}
+
+TEST_F(RollingHashTest, rollRightMasked)
+{
+	MaskedKmer::setMask("1001");
+	RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	leftKmerHash.rollRight("GACG", 'T');
+	ASSERT_EQ(middleKmerHash, leftKmerHash);
+	leftKmerHash.rollRight("ACGT", 'C');
+	ASSERT_EQ(rightKmerHash, leftKmerHash);
+}
+
+TEST_F(RollingHashTest, rollRightMaskedMismatch)
+{
+	MaskedKmer::setMask("1001");
+
+	const char* origSeq    = "GACGTC";
+	const char* mutatedSeq = "GACTTC";
+
+	RollingHash left(origSeq, m_numHashes, m_k);
+	RollingHash middle(origSeq + 1, m_numHashes, m_k);
+	RollingHash right(origSeq + 2, m_numHashes, m_k);
+
+	RollingHash mutated(mutatedSeq, m_numHashes, m_k);
+
+	ASSERT_NE(left, mutated);
+	mutated.rollRight(mutatedSeq, 'T');
+	ASSERT_EQ(middle, mutated);
+	mutated.rollRight(mutatedSeq + 1, 'C');
+	ASSERT_EQ(right, mutated);
+}
+
+TEST_F(RollingHashTest, rollLeft)
+{
+	MaskedKmer::mask().clear();
+
+	RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	rightKmerHash.rollLeft('A', "CGTC");
+	ASSERT_EQ(middleKmerHash, rightKmerHash);
+	rightKmerHash.rollLeft('G', "ACGT");
+	ASSERT_EQ(leftKmerHash, rightKmerHash);
+}
+
+TEST_F(RollingHashTest, rollLeftMasked)
+{
+	MaskedKmer::setMask("1001");
+
+	RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	rightKmerHash.rollLeft('A', "CGTC");
+	ASSERT_EQ(middleKmerHash, rightKmerHash);
+	rightKmerHash.rollLeft('G', "ACGT");
+	ASSERT_EQ(leftKmerHash, rightKmerHash);
+}
+
+TEST_F(RollingHashTest, rollLeftMaskedMismatch)
+{
+	MaskedKmer::setMask("1001");
+
+	const char* origSeq    = "GACGTC";
+	const char* mutatedSeq = "GAGGTC";
+
+	RollingHash left(origSeq, m_numHashes, m_k);
+	RollingHash middle(origSeq + 1, m_numHashes, m_k);
+	RollingHash right(origSeq + 2, m_numHashes, m_k);
+
+	RollingHash mutated(mutatedSeq + 2, m_numHashes, m_k);
+
+	ASSERT_NE(right, mutated);
+	mutated.rollLeft('A', mutatedSeq + 2);
+	ASSERT_EQ(middle, mutated);
+	mutated.rollLeft('G', mutatedSeq + 1);
+	ASSERT_EQ(left, mutated);
+}
+
+TEST_F(RollingHashTest, reset)
+{
+	MaskedKmer::mask().clear();
+
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	middleKmerHash.reset("CGTC");
+	ASSERT_EQ(rightKmerHash, middleKmerHash);
+}
+
+TEST_F(RollingHashTest, resetMasked)
+{
+	MaskedKmer::setMask("1001");
+
+	RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+	RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+	/*
+	 * Note: third base of middleKmerHash is intentionally set to 'G'
+	 * instead of 'T'. However, the hash values should
+	 * still match the rightKmerHash due to the effect of
+	 * the k-mer mask.
+	 */
+	middleKmerHash.reset("CGGC");
+	ASSERT_EQ(rightKmerHash, middleKmerHash);
+}
+
+TEST_F(RollingHashTest, setBase)
+{
+	MaskedKmer::mask().clear();
+
+	char kmer1[] = "ACGT";
+	char kmer2[] = "ACCT";
+
+	RollingHash hash1(kmer1, m_numHashes, m_k);
+	RollingHash hash2(kmer2, m_numHashes, m_k);
+
+	ASSERT_NE(hash2, hash1);
+	hash1.setBase(kmer1, 2, 'C');
+	ASSERT_EQ(0, strcmp(kmer1, kmer2));
+	ASSERT_EQ(hash2, hash1);
+}
+
+TEST_F(RollingHashTest, setBaseMasked)
+{
+	MaskedKmer::setMask("1101");
+
+	char kmer1[] = "ACGT";
+	char kmer2[] = "ACCT";
+
+	RollingHash hash1(kmer1, m_numHashes, m_k);
+	RollingHash hash2(kmer2, m_numHashes, m_k);
+
+	/* hashes should agree since mismatch is in masked position */
+	ASSERT_EQ(hash2, hash1);
+	ASSERT_NE(0, strcmp(kmer1, kmer2));
+
+	/* fix mismatch in masked position (hash values shouldn't change) */
+	hash1.setBase(kmer1, 2, 'C');
+	ASSERT_EQ(hash2, hash1);
+	ASSERT_EQ(0, strcmp(kmer1, kmer2));
+
+	/* create mismatch in unmasked position (hash value should now differ) */
+	hash1.setBase(kmer1, 1, 'G');
+	ASSERT_NE(hash2, hash1);
+	ASSERT_NE(0, strcmp(kmer1, kmer2));
+}
diff --git a/Unittest/BloomDBG/SpacedSeedTest.cpp b/Unittest/BloomDBG/SpacedSeedTest.cpp
new file mode 100644
index 0000000..ecd222a
--- /dev/null
+++ b/Unittest/BloomDBG/SpacedSeedTest.cpp
@@ -0,0 +1,26 @@
+#include "BloomDBG/SpacedSeed.h"
+#include <gtest/gtest.h>
+
+using namespace std;
+
+TEST(SpacedSeedTest, qrSeed)
+{
+	/*
+	* Generate a Quadratic Residue (QR) seed. The background theory
+	* for QR seeds is described in:
+	*
+	* Egidi, Lavinia, and Giovanni Manzini. "Multiple seeds
+	* sensitivity using a single seed with threshold." Journal of
+	* bioinformatics and computational biology 13.04 (2015): 1550011.
+	*/
+	ASSERT_EQ("10100011101", SpacedSeed::qrSeed(11));
+}
+
+TEST(SpacedSeedTest, qrSeedPair)
+{
+	/*
+	 *  Generate spaced seed pattern for two mirrored QR seeds with
+	 * a gap in between.
+	 */
+	ASSERT_EQ("101000111010000000000010111000101", SpacedSeed::qrSeedPair(33,11));
+}
diff --git a/Unittest/Graph/ExtendPathTest.cpp b/Unittest/Graph/ExtendPathTest.cpp
index 63ad7d9..47072d3 100644
--- a/Unittest/Graph/ExtendPathTest.cpp
+++ b/Unittest/Graph/ExtendPathTest.cpp
@@ -62,6 +62,54 @@ TEST(extendPath, lookAhead)
 	ASSERT_FALSE(lookAhead(0, FORWARD, depth, g2));
 }
 
+TEST(extendPath, depth)
+{
+	/*
+	 *      2
+	 *     /
+	 * 0--1
+	 *     \
+	 *      3--4
+	 */
+
+	Graph g;
+	add_edge(0, 1, g);
+	add_edge(1, 2, g);
+	add_edge(1, 3, g);
+	add_edge(3, 4, g);
+
+	/* note: depth of starting node is 0 */
+	ASSERT_EQ(3u, depth(0, FORWARD, g));
+	ASSERT_EQ(2u, depth(1, FORWARD, g));
+	ASSERT_EQ(3u, depth(4, REVERSE, g));
+	ASSERT_EQ(1u, depth(1, REVERSE, g));
+}
+
+TEST(extendPath, longestBranch)
+{
+	/*
+	 *      2
+	 *     /
+	 * 0--1
+	 *     \
+	 *      3--4
+	 *     /
+	 *    5
+	 */
+
+	Graph g;
+	add_edge(0, 1, g);
+	add_edge(1, 2, g);
+	add_edge(1, 3, g);
+	add_edge(3, 4, g);
+	add_edge(5, 3, g);
+
+	ASSERT_EQ(1u, longestBranch(0, FORWARD, g));
+	ASSERT_EQ(3u, longestBranch(1, FORWARD, g));
+	ASSERT_EQ(1u, longestBranch(3, REVERSE, g));
+	ASSERT_EQ(3u, longestBranch(4, REVERSE, g));
+}
+
 TEST(extendPath, noExtension)
 {
 	// Graph containing a single edge.
@@ -170,7 +218,8 @@ TEST(extendPath, bidirectional)
 
 TEST(extendPath, withTrimming)
 {
-	const unsigned trimLen = 1;
+	ExtendPathParams params;
+	params.trimLen = 1;
 
 	/*
 	 *       2
@@ -193,8 +242,7 @@ TEST(extendPath, withTrimming)
 	Path<Vertex> path;
 	path.push_back(0);
 
-	extendPath(path, FORWARD, g, trimLen);
-	ASSERT_EQ(4u, path.size());
+	extendPath(path, FORWARD, g, params);
 	ASSERT_EQ(expectedPath, path);
 
 	/*
@@ -212,17 +260,29 @@ TEST(extendPath, withTrimming)
 	add_edge(3, 4, g2);
 	add_edge(3, 5, g2);
 
-	Path<Vertex> expectedPath2;
-	expectedPath2.push_back(0);
-	expectedPath2.push_back(1);
-	expectedPath2.push_back(3);
-
 	Path<Vertex> path2;
 	path2.push_back(0);
 
-	extendPath(path2, FORWARD, g2, trimLen);
-	EXPECT_EQ(3u, path2.size());
-	ASSERT_EQ(expectedPath2, path2);
+	extendPath(path2, FORWARD, g2, params);
+
+	/**
+	 * Note: In situations where there are
+	 * multiple branches shorter than the trim
+	 * length, we chose the longest one.  (And
+	 * if the branches are of equal length we
+	 * choose one arbitrarily.)
+	 *
+	 * This is the desired behaviour to deal
+	 * with coverage gaps in the de Bruijn
+	 * graph, which can make a legitimate branch
+	 * indistinguishable from short branches
+	 * due to read errors / Bloom filter false
+	 * positives.
+	 */
+	ASSERT_EQ(4u, path2.size());
+	ASSERT_EQ(0u, path2.at(0));
+	ASSERT_EQ(1u, path2.at(1));
+	ASSERT_EQ(3u, path2.at(2));
 }
 
 TEST(extendPath, cycles)
@@ -282,11 +342,13 @@ TEST(extendPath, cycles)
 	Path<Vertex> expectedPath2;
 	expectedPath2.push_back(0);
 	expectedPath2.push_back(1);
-	expectedPath2.push_back(2);
-	expectedPath2.push_back(3);
 
 	result = extendPath(path2, FORWARD, g2);
-	EXPECT_EQ(EXTENDED_TO_CYCLE, result);
+	/*
+	 * note: expected result is EXTENDED_TO_BRANCHING_POINT
+	 * because vertex 1 has 2 incoming edges
+	 */
+	EXPECT_EQ(EXTENDED_TO_BRANCHING_POINT, result);
 	EXPECT_EQ(expectedPath2, path2);
 
 	/*
@@ -305,13 +367,15 @@ TEST(extendPath, cycles)
 	path3.push_back(0);
 
 	Path<Vertex> expectedPath3;
-	expectedPath3.push_back(3);
-	expectedPath3.push_back(2);
 	expectedPath3.push_back(1);
 	expectedPath3.push_back(0);
 
 	result = extendPath(path3, REVERSE, g3);
-	EXPECT_EQ(EXTENDED_TO_CYCLE, result);
+	/*
+	 * note: expected result is EXTENDED_TO_BRANCHING_POINT
+	 * because vertex 1 has 2 incoming edges
+	 */
+	EXPECT_EQ(EXTENDED_TO_BRANCHING_POINT, result);
 	EXPECT_EQ(expectedPath3, path3);
 }
 
diff --git a/Unittest/Makefile.am b/Unittest/Makefile.am
index d0e41d4..da0980f 100644
--- a/Unittest/Makefile.am
+++ b/Unittest/Makefile.am
@@ -1,150 +1,111 @@
-GTEST_LIBS_ = $(top_builddir)/lib/gtest-1.7.0/libgtest_main.a
 # -Wno-error is used here because there is no portable way
 # to suppress warning: "argument unused during compilation: '-pthread'"
 # for clang on OSX.
 # See: http://stackoverflow.com/questions/17841140/os-x-clang-pthread
-GTEST_CXXFLAGS_ = $(AM_CXXFLAGS) $(PTHREAD_CFLAGS) -Wno-error
-GTEST_INCLUDES_ = -I$(top_srcdir) -I$(top_srcdir)/lib/gtest-1.7.0/include
-GTEST_LDFLAGS_ = $(PTHREAD_LIBS)
+AM_CXXFLAGS += $(PTHREAD_CFLAGS) -Wno-error
+AM_LDFLAGS = $(PTHREAD_LIBS)
+AM_CPPFLAGS = \
+	-I$(top_srcdir) \
+	-I$(top_srcdir)/lib/gtest-1.7.0/include
+LDADD = $(top_builddir)/lib/gtest-1.7.0/libgtest_main.a
 
 check_PROGRAMS = common_stringutil
 common_stringutil_SOURCES = Common/StringUtilTest.cpp
-common_stringutil_CPPFLAGS = $(GTEST_INCLUDES_)
-common_stringutil_LDADD = $(GTEST_LIBS_)
-common_stringutil_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_stringutil_LDFLAGS = $(GTEST_LDFLAGS_)
 
 check_PROGRAMS += common_histogram
 common_histogram_SOURCES = Common/HistogramTest.cpp
-common_histogram_CPPFLAGS = $(GTEST_INCLUDES_)
-common_histogram_LDADD = $(GTEST_LIBS_)
-common_histogram_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_histogram_LDFLAGS = $(GTEST_LDFLAGS_)
 
 check_PROGRAMS += common_bitutil
 common_bitutil_SOURCES = Common/BitUtilTest.cpp
-common_bitutil_CPPFLAGS = $(GTEST_INCLUDES_)
-common_bitutil_LDADD = $(GTEST_LIBS_)
-common_bitutil_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_bitutil_LDFLAGS = $(GTEST_LDFLAGS_)
 
 check_PROGRAMS += common_kmer
 common_kmer_SOURCES = Common/KmerTest.cpp
-common_kmer_CPPFLAGS = $(GTEST_INCLUDES_)
-common_kmer_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_kmer_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_kmer_LDFLAGS = $(GTEST_LDFLAGS_)
+common_kmer_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += common_sequence
 common_sequence_SOURCES = Common/Sequence.cc
-common_sequence_CPPFLAGS = $(GTEST_INCLUDES_)
-common_sequence_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_sequence_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_sequence_LDFLAGS = $(GTEST_LDFLAGS_)
+common_sequence_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += common_KmerIterator
 common_KmerIterator_SOURCES = Common/KmerIteratorTest.cpp
-common_KmerIterator_CPPFLAGS = $(GTEST_INCLUDES_)
-common_KmerIterator_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_KmerIterator_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_KmerIterator_LDFLAGS = $(GTEST_LDFLAGS_)
+common_KmerIterator_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += common_sam
 common_sam_SOURCES = Common/SAM.cc
-common_sam_CPPFLAGS = $(GTEST_INCLUDES_)
-common_sam_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_sam_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_sam_LDFLAGS = $(GTEST_LDFLAGS_)
+common_sam_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += BloomFilter
 BloomFilter_SOURCES = Konnector/BloomFilter.cc
-BloomFilter_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-BloomFilter_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-BloomFilter_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-BloomFilter_LDFLAGS = $(GTEST_LDFLAGS_)
+BloomFilter_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+BloomFilter_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
+BloomFilter_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 check_PROGRAMS += Konnector_DBGBloom
 Konnector_DBGBloom_SOURCES = Konnector/DBGBloomTest.cpp
-Konnector_DBGBloom_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-Konnector_DBGBloom_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-Konnector_DBGBloom_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_DBGBloom_LDFLAGS = $(GTEST_LDFLAGS_)
+Konnector_DBGBloom_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+Konnector_DBGBloom_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
+Konnector_DBGBloom_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 check_PROGRAMS += Konnector_DBGBloomAlgorithms
 Konnector_DBGBloomAlgorithms_SOURCES = Konnector/DBGBloomAlgorithmsTest.cpp
-Konnector_DBGBloomAlgorithms_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-Konnector_DBGBloomAlgorithms_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_DBGBloomAlgorithms_LDADD = \
-	$(top_builddir)/Common/libcommon.a \
-	$(GTEST_LIBS_)
-Konnector_DBGBloomAlgorithms_LDFLAGS = $(GTEST_LDFLAGS_)
+Konnector_DBGBloomAlgorithms_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+Konnector_DBGBloomAlgorithms_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+Konnector_DBGBloomAlgorithms_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_ConstrainedBFSVisitor
 graph_ConstrainedBFSVisitor_SOURCES = Graph/ConstrainedBFSVisitorTest.cpp
-graph_ConstrainedBFSVisitor_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ConstrainedBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ConstrainedBFSVisitor_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ConstrainedBFSVisitor_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ConstrainedBFSVisitor_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ConstrainedBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_BidirectionalBFS
 graph_BidirectionalBFS_SOURCES = Graph/BidirectionalBFSTest.cpp
-graph_BidirectionalBFS_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_BidirectionalBFS_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_BidirectionalBFS_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_BidirectionalBFS_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_BidirectionalBFS_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_BidirectionalBFS_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_AllPathsSearch
 graph_AllPathsSearch_SOURCES = Graph/AllPathsSearchTest.cpp
-graph_AllPathsSearch_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_AllPathsSearch_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_AllPathsSearch_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_AllPathsSearch_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_AllPathsSearch_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_AllPathsSearch_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_HashGraph
 graph_HashGraph_SOURCES = Graph/HashGraphTest.cpp
-graph_HashGraph_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_HashGraph_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_HashGraph_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_HashGraph_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_HashGraph_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_HashGraph_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_ConstrainedBidiBFSVisitor
 graph_ConstrainedBidiBFSVisitor_SOURCES = \
 	Graph/ConstrainedBidiBFSVisitorTest.cpp
-graph_ConstrainedBidiBFSVisitor_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ConstrainedBidiBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ConstrainedBidiBFSVisitor_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ConstrainedBidiBFSVisitor_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ConstrainedBidiBFSVisitor_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ConstrainedBidiBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += graph_ExtendPath
 graph_ExtendPath_SOURCES = Graph/ExtendPathTest.cpp
-graph_ExtendPath_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ExtendPath_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ExtendPath_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ExtendPath_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ExtendPath_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ExtendPath_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
 
 check_PROGRAMS += Konnector_konnector
 Konnector_konnector_SOURCES = \
 	Konnector/konnectorTest.cpp
-Konnector_konnector_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
+Konnector_konnector_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
 Konnector_konnector_LDADD = \
 	$(top_builddir)/Align/libalign.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-Konnector_konnector_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_konnector_LDFLAGS = $(GTEST_LDFLAGS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+Konnector_konnector_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 check_PROGRAMS += DBG_LoadAlgorithm
 DBG_LoadAlgorithm_SOURCES = \
 	DBG/LoadAlgorithmTest.cpp
 DBG_LoadAlgorithm_CPPFLAGS = \
-	$(GTEST_INCLUDES_) \
+	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/DataLayer \
 	-I$(top_srcdir)/Common
 DBG_LoadAlgorithm_LDADD = \
 	$(top_builddir)/Assembly/libassembly.a \
 	$(top_builddir)/DataLayer/libdatalayer.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-DBG_LoadAlgorithm_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-DBG_LoadAlgorithm_LDFLAGS = $(GTEST_LDFLAGS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+DBG_LoadAlgorithm_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 if PAIRED_DBG
 
@@ -152,59 +113,103 @@ check_PROGRAMS += PairedDBG_LoadAlgorithm
 PairedDBG_LoadAlgorithm_SOURCES = \
 	PairedDBG/LoadAlgorithmTest.cpp
 PairedDBG_LoadAlgorithm_CPPFLAGS = \
-	$(GTEST_INCLUDES_) \
+	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/DataLayer \
 	-I$(top_srcdir)/Common
 PairedDBG_LoadAlgorithm_LDADD = \
 	$(top_builddir)/PairedDBG/libpaireddbg.a \
 	$(top_builddir)/Assembly/libassembly.a \
 	$(top_builddir)/DataLayer/libdatalayer.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_LoadAlgorithm_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_LoadAlgorithm_LDFLAGS = $(GTEST_LDFLAGS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+PairedDBG_LoadAlgorithm_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 check_PROGRAMS += PairedDBG_KmerPair
 PairedDBG_KmerPair_SOURCES = \
 	PairedDBG/KmerPairTest.cc
 PairedDBG_KmerPair_CPPFLAGS = \
-	$(GTEST_INCLUDES_) \
+	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/DataLayer \
 	-I$(top_srcdir)/Common
 PairedDBG_KmerPair_LDADD = \
 	$(top_builddir)/PairedDBG/libpaireddbg.a \
 	$(top_builddir)/DataLayer/libdatalayer.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_KmerPair_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_KmerPair_LDFLAGS = $(GTEST_LDFLAGS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+PairedDBG_KmerPair_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 check_PROGRAMS += PairedDBG_Dinuc
 PairedDBG_Dinuc_SOURCES = \
 	PairedDBG/DinucTest.cc
 PairedDBG_Dinuc_CPPFLAGS = \
-	$(GTEST_INCLUDES_) \
+	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/DataLayer \
 	-I$(top_srcdir)/Common
 PairedDBG_Dinuc_LDADD = \
 	$(top_builddir)/PairedDBG/libpaireddbg.a \
 	$(top_builddir)/DataLayer/libdatalayer.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
 PairedDBG_Dinuc_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
-PairedDBG_Dinuc_LDFLAGS = $(GTEST_LDFLAGS_)
 
 check_PROGRAMS += PairedDBG_BranchRecord
 PairedDBG_BranchRecord_SOURCES = \
 	PairedDBG/BranchRecordTest.cpp
 PairedDBG_BranchRecord_CPPFLAGS = \
-	$(GTEST_INCLUDES_) \
+	$(AM_CPPFLAGS) \
 	-I$(top_srcdir)/DataLayer \
 	-I$(top_srcdir)/Common
 PairedDBG_BranchRecord_LDADD = \
 	$(top_builddir)/PairedDBG/libpaireddbg.a \
 	$(top_builddir)/DataLayer/libdatalayer.a \
-	$(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_BranchRecord_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_BranchRecord_LDFLAGS = $(GTEST_LDFLAGS_)
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+PairedDBG_BranchRecord_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
 
 endif # PAIRED_DBG
 
+check_PROGRAMS += BloomDBG_BloomDBG
+BloomDBG_BloomDBG_SOURCES = BloomDBG/BloomDBGTest.cpp
+BloomDBG_BloomDBG_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+BloomDBG_BloomDBG_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+BloomDBG_BloomDBG_LDADD = \
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+
+check_PROGRAMS += BloomDBG_RollingHash
+BloomDBG_RollingHash_SOURCES = BloomDBG/RollingHashTest.cpp
+BloomDBG_RollingHash_LDADD = \
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+
+check_PROGRAMS += BloomDBG_RollingHashIterator
+BloomDBG_RollingHashIterator_SOURCES = BloomDBG/RollingHashIteratorTest.cpp
+BloomDBG_RollingHashIterator_LDADD = \
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+
+check_PROGRAMS += BloomDBG_HashAgnosticCascadingBloom
+BloomDBG_HashAgnosticCascadingBloom_SOURCES = \
+	BloomDBG/HashAgnosticCascadingBloomTest.cpp
+BloomDBG_HashAgnosticCascadingBloom_CXXFLAGS = $(AM_CXXFLAGS) \
+	$(OPENMP_CXXFLAGS)
+
+check_PROGRAMS += BloomDBG_RollingBloomDBG
+BloomDBG_RollingBloomDBG_SOURCES = BloomDBG/RollingBloomDBGTest.cpp
+BloomDBG_RollingBloomDBG_CXXFLAGS = $(AM_CXXFLAGS) \
+	$(OPENMP_CXXFLAGS)
+BloomDBG_RollingBloomDBG_LDADD = \
+	$(top_builddir)/DataLayer/libdatalayer.a \
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+
+check_PROGRAMS += BloomDBG_MaskedKmer
+BloomDBG_MaskedKmer_SOURCES = BloomDBG/MaskedKmerTest.cpp
+BloomDBG_MaskedKmer_LDADD = \
+	$(top_builddir)/Common/libcommon.a \
+	$(LDADD)
+
+check_PROGRAMS += BloomDBG_SpacedSeed
+BloomDBG_SpacedSeed_SOURCES = BloomDBG/SpacedSeedTest.cpp
+
 TESTS = $(check_PROGRAMS)
diff --git a/bin/abyss-adjtodot.pl b/bin/abyss-adjtodot.pl
index 416eafe..2736fa6 100755
--- a/bin/abyss-adjtodot.pl
+++ b/bin/abyss-adjtodot.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # Convert an ABySS adjacency file to GraphViz dot format.
 # Written by Shaun Jackman <sjackman at bcgsc.ca>.
 use strict;
diff --git a/bin/abyss-cstont b/bin/abyss-cstont
index 270e02a..33137f8 100755
--- a/bin/abyss-cstont
+++ b/bin/abyss-cstont
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # Convert colour-space FASTA sequences to nucleotide FASTA sequences.
 # Written by Shaun Jackman <sjackman at bcgsc.ca>.
 # Usage: cstofasta data.csfa >data.fa
diff --git a/bin/abyss-dida b/bin/abyss-dida
index 593cfec..31cbc7d 100755
--- a/bin/abyss-dida
+++ b/bin/abyss-dida
@@ -71,7 +71,7 @@ fi
 # Add file arguments to dida command.  Convert all input file paths
 # to absolute, since we change to a temp dir below
 
-query=($(readlink -f "$@"))
+query=($(echo "$@" | xargs -n1 readlink -f))
 target=${query[${#query[@]}-1]}
 unset query[${#query[@]}-1]
 
diff --git a/bin/abyss-fac.pl b/bin/abyss-fac.pl
index 828caa1..4728760 100755
--- a/bin/abyss-fac.pl
+++ b/bin/abyss-fac.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # abyss-fac (FASTA count)
 # Calculate assembly contiguity statistics, such as N50.
 # Written by Shaun Jackman <sjackman at bcgsc.ca>.
diff --git a/bin/abyss-fatoagp b/bin/abyss-fatoagp
index a5297b9..65353b2 100755
--- a/bin/abyss-fatoagp
+++ b/bin/abyss-fatoagp
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # Convert a FASTA file of scaffolds to a FASTA file of contigs and an
 # AGP file.
 # Written by Shaun Jackman <sjackman at bcgsc.ca>.
@@ -7,9 +7,12 @@ use strict;
 use Getopt::Std qw'getopts';
 
 my %opt;
-getopts 'f:s:', \%opt;
+getopts 'f:s:S:', \%opt;
 my $opt_fasta = $opt{'f'};
-my $opt_min_len = defined $opt{'s'} ? $opt{'s'} : 200;
+# scaffolds shorter than this length will be excluded
+my $opt_min_scaf_len = defined $opt{'s'} ? $opt{'s'} : 200;
+# scaftigs shorter than this length will be masked with "N"s
+my $opt_min_ctg_len = defined $opt{'S'} ? $opt{'S'} : 50;
 
 open FASTA, ">$opt_fasta"
 	or die "error: `$opt_fasta': $!\n"
@@ -24,9 +27,21 @@ while (<>) {
 	my $scafseq = <>;
 	chomp $scafseq;
 	my $scaflen = $scafseq =~ tr/ACGTacgt//;
-	next if $scaflen < $opt_min_len;
+	next if $scaflen < $opt_min_scaf_len;
 
+	# mask scaftigs shorter than length threshold with "N"s
 	my @ctgseqs = split /([Nn]+)/, $scafseq;
+	foreach my $ctgseq (@ctgseqs) {
+		next if /^[nN]/;
+		if (length($ctgseq) < $opt_min_ctg_len) {
+			$ctgseq = "N" x length($ctgseq);
+		}
+	}
+	# rejoin and split to merge adjacent stretches of "N"s
+	$scafseq = join '', @ctgseqs;
+	next unless $scafseq =~ /[^nN]/;
+	@ctgseqs = split /([Nn]+)/, $scafseq;
+
 	my $i = 0;
 	my $x = 0;
 	for my $ctgseq (@ctgseqs) {
diff --git a/bin/abyss-joindist b/bin/abyss-joindist
index ddb03be..860b55d 100755
--- a/bin/abyss-joindist
+++ b/bin/abyss-joindist
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # Join multiple ABySS distance estimate files.
 # Written by Shaun Jackman <sjackman at bcgsc.ca>.
 use strict;
diff --git a/bin/abyss-pe b/bin/abyss-pe
index 60436c6..48cdd2f 100755
--- a/bin/abyss-pe
+++ b/bin/abyss-pe
@@ -101,7 +101,7 @@ species ?= ""
 endif
 
 # Programs
-MARKDOWN=multimarkdown
+MARKDOWN=pandoc
 
 map=$(foreach a,$(2),$(call $(1),$(a)))
 deref=$($1)
@@ -129,6 +129,18 @@ graph?=dot
 # g is private. Use graph instead.
 override g:=$(graph)
 
+# Number of threads
+ifdef PE_HOSTFILE
+hostname?=$(shell hostname -f)
+j?=$(shell awk '$$1 == "$(hostname)" {print $$2}' $(PE_HOSTFILE))
+endif
+ifeq ($j,)
+j:=$(np)
+endif
+ifeq ($j,)
+j:=2
+endif
+
 # ABYSS parameters
 q ?= 3
 abyssopt += -k$k -q$q
@@ -157,31 +169,42 @@ endif
 ifdef ss
 SS=--SS
 endif
-abyssopt += $v $(dbopt) $(SS) --coverage-hist=coverage.hist -s $*-bubbles.fa
+abyssopt += $v
 
-# Number of threads
-ifdef PE_HOSTFILE
-hostname?=$(shell hostname -f)
-j?=$(shell awk '$$1 == "$(hostname)" {print $$2}' $(PE_HOSTFILE))
+# additional params for Bloom filter assembly (`abyss-bloom-dbg`)
+ifdef B
+abyssopt += -b$B
+ifdef H
+abyssopt += -H$H
 endif
-ifeq ($j,)
-j:=$(np)
+ifdef j
+abyssopt += -j$j
 endif
-ifeq ($j,)
-j:=2
+ifdef kc
+abyssopt += --kc=$(kc)
+endif
+ifdef x
+abyssopt += -s$x
+endif
+else
+abyssopt += $(dbopt) $(SS) --coverage-hist=coverage.hist -s $*-bubbles.fa
 endif
 
 # AdjList parameters
 m?=50
 alopt += $v $(dbopt) $(SS) -k$k -m$m
+ifndef B
 ifdef K
 alopt += -K$K
 endif
+endif
 
 # filtergraph parameters
+ifndef B
 ifdef K
 fgopt += --assemble --shim-max-degree=2
 endif
+endif
 ifdef xtip
 fgopt += -t$(shell echo $k*2 |bc)
 endif
@@ -237,8 +260,8 @@ fmopt=$v $(dbopt) -l$($*_l) $(FIXMATE_OPTIONS)
 
 # DistanceEst parameters
 DistanceEst?=DistanceEst$(ssq_t)
-l?=$k
-s?=200
+l?=40
+s?=1000
 n?=10
 libs=$(pe) $(mp)
 $(foreach i,$(libs),$(eval $i_l?=$l))
@@ -246,6 +269,7 @@ $(foreach i,$(libs),$(eval $i_s?=$s))
 $(foreach i,$(libs),$(eval $i_n?=$n))
 deopt=$v $(dbopt) -j$j -k$k -l$($*_l) -s$($*_s) -n$($*_n) $($*_de) \
 	$(DISTANCEEST_OPTIONS)
+scaffold_deopt=--dot --mean $(deopt)
 
 # SimpleGraph parameters
 sgopt += $(dbopt)
@@ -270,9 +294,17 @@ pcopt += -p$p
 mcopt += $v $(dbopt) -k$k
 
 # Scaffold parameters
-S?=$s
+S?=1000-10000
 N?=$n
 scopt += $v $(dbopt) $(SS) -k$k
+ifdef G
+scopt += -G$G
+endif
+
+# abyss-fac parameters
+ifdef G
+override facopt = -G$G
+endif
 
 # BWA-SW parameters
 bwaswopt=-t$j
@@ -305,23 +337,21 @@ error::
 # Help and version messages
 
 help:
-	@printf '\
-Usage: abyss-pe [OPTION]... [PARAMETER=VALUE]... [COMMAND]...\n\
-Assemble reads into contigs and scaffolds. ABySS is a de novo\n\
-sequence assembler intended for short paired-end reads and large\n\
-genomes. See the abyss-pe man page for documentation of assembly\n\
-parameters and commands. abyss-pe is a Makefile script, and so\n\
-options of `make` may also be used with abyss-pe. See the `make`\n\
-man page for documentation.\n\
-\n\
-Report bugs to <abyss-users at bcgsc.ca>.\n'
+	@echo 'Usage: abyss-pe [OPTION]... [PARAMETER=VALUE]... [COMMAND]...'
+	@echo 'Assemble reads into contigs and scaffolds. ABySS is a de novo'
+	@echo 'sequence assembler intended for short paired-end reads and large'
+	@echo 'genomes. See the abyss-pe man page for documentation of assembly'
+	@echo 'parameters and commands. abyss-pe is a Makefile script, and so'
+	@echo 'options of `make` may also be used with abyss-pe. See the `make`'
+	@echo 'man page for documentation.'
+	@echo
+	@echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.'
 
 version:
-	@printf '\
-abyss-pe (ABySS) 1.9.0\n\
-Written by Shaun Jackman and Anthony Raymond.\n\
-\n\
-Copyright 2012 Canada'\''s Michael Smith Genome Science Centre\n'
+	@echo "abyss-pe (ABySS) 2.0.1"
+	@echo "Written by Shaun Jackman and Anthony Raymond."
+	@echo
+	@echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
 
 versions: version
 	@echo PATH=$(PATH)
@@ -453,7 +483,11 @@ startDb:
 	> db.txt
 endif
 
-ifdef K
+ifdef B
+%-1.fa:
+	abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
+else ifdef K
+
 ifdef np
 %-1.fa:
 	$(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
@@ -462,15 +496,13 @@ else
 	abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se)
 endif
 
-else
-ifdef np
+else ifdef np
 %-1.fa:
 	$(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
 else
 %-1.fa:
 	ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
 endif
-endif
 
 # Find overlapping contigs
 
@@ -560,7 +592,7 @@ ifndef cs
 
 %-5.path %-5.fa %-5.$g: %-3.fa %-4.fa %-4.$g %-4.path3
 	cat $(wordlist 1, 2, $^) \
-		|PathConsensus $v --$g -k$k $(pcopt) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
+		|PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
 
 %-6.fa: %-3.fa %-4.fa %-5.fa %-5.$g %-5.path
 	cat $(wordlist 1, 3, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
@@ -607,17 +639,17 @@ endif
 
 %-6.dist.dot: %-6.sam.gz %-6.hist
 	gunzip -c $< \
-	|$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+	|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
 
 %-6.dist.dot: %-6.bam %-6.hist
 	samtools view -h $< \
-	|$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+	|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
 
 %-6.dist.dot: $(name)-6.fa
 	$(align) $(mapopt) $(strip $($*)) $< \
 		|$(fixmate) $(fmopt) -h $*-6.hist \
 		|sort -snk3 -k4 \
-		|$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+		|$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
 
 # Scaffold
 
@@ -625,7 +657,7 @@ endif
 	abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
 
 %-7.path %-7.$g %-7.fa: %-6.fa %-6.$g %-6.path
-	PathConsensus $v --$g -k$k $(pcopt) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
+	PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
 
 %-8.fa: %-6.fa %-7.fa %-7.$g %-7.path
 	cat $(wordlist 1, 2, $^) \
@@ -674,7 +706,7 @@ sealer_ks?=-k90 -k80 -k70 -k60 -k50 -k40 -k30
 	abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
 
 %-9.path %-9.$g %-9.fa: %-8.fa %-8.$g %-8.path
-	PathConsensus $v --$g -k$k $(pcopt) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
+	PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
 
 %-10.fa: %-8.fa %-9.fa %-9.$g %-9.path
 	cat $(wordlist 1, 2, $^) \
@@ -750,7 +782,7 @@ ifneq ($(long),)
 $(name)-stats.tab: %-stats.tab: %-long-scaffs.fa
 endif
 $(name)-stats.tab:
-	abyss-fac $(FAC_OPTIONS) $^ |tee $@
+	abyss-fac $(facopt) $(FAC_OPTIONS) $^ |tee $@
 
 %.csv: %.tab
 	tr '\t' , <$< >$@
@@ -778,7 +810,7 @@ $(name)-stats.tab:
 
 # Report ABySS configuration variable(s) and value(s) currently set.
 
-override varList := a b c d e E j k l m n N p q s S t v cs pi \
+override varList := a b c d e E G j k l m n N p q s S t v cs pi \
 	np pe lib mp se SS hostname xtip \
 	ssq ssq_ti libs path name in mpirun \
 	aligner long ref fixmate DistanceEst \
@@ -792,12 +824,11 @@ override varList := a b c d e E j k l m n N p q s S t v cs pi \
 	MARKDOWN
 
 env:
-	@echo -e "\
-	List of ABySS configuration variables currently set:\n\n\
-	[environment], if variable was inherited from the environment.\n\
-	[command line], if variable was defined on the command line.\n\
-	[file], if variable was defined in (this) makefile.\n\
-	[override], if variable was defined with an override directive in (this) makefile.\n"
+	@echo 'List of ABySS configuration variables currently set:'
+	@echo '[environment], if variable was inherited from the environment.'
+	@echo '[command line], if variable was defined on the command line.'
+	@echo '[file], if variable was defined in (this) makefile.'
+	@echo '[override], if variable was defined with an override directive in (this) makefile.'
 
 	@$(foreach var,$(varList),\
 		echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]" | grep -v "undefined";)
diff --git a/bin/abyss-samtoafg b/bin/abyss-samtoafg
index 48f972f..893dfe8 100755
--- a/bin/abyss-samtoafg
+++ b/bin/abyss-samtoafg
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 use strict;
 use Getopt::Long;
 use Pod::Usage;
diff --git a/configure.ac b/configure.ac
index a3af488..884852b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
 AC_PREREQ(2.62)
-AC_INIT(ABySS, 1.9.0, abyss-users at bcgsc.ca, abyss,
+AC_INIT(ABySS, 2.0.1, abyss-users@bcgsc.ca, abyss,
 		http://www.bcgsc.ca/platform/bioinfo/software/abyss)
 m4_include(m4/m4_ax_pthread.m4)
 AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)
@@ -104,12 +104,16 @@ AC_ARG_WITH(sqlite, AS_HELP_STRING([--with-sqlite=PATH],
 	[specify prefix directory for the installed sqlite library]))
 if test "$with_sqlite" -a "$with_sqlite" != "no" -a -d "$with_sqlite"; then
 	sqlite_cppflags="-I$with_sqlite/include"
-	sqlite_ldflags="-L$with_sqlite/lib -lsqlite3"
+	if test -d "$with_sqlite/lib64"; then
+		sqlite_ldflags="-L$with_sqlite/lib64 -lsqlite3"
+	else
+		sqlite_ldflags="-L$with_sqlite/lib -lsqlite3"
+	fi
 fi
 
 # SparseHash
 AC_ARG_WITH(sparsehash, AS_HELP_STRING([--with-sparsehash=PATH],
-	[specify prefix directory for the installed spasehash library]))
+	[specify prefix directory for the installed sparsehash library]))
 if test "$with_sparsehash" -a "$with_sparsehash" != "no" -a -d "$with_sparsehash" ; then
 	sparsehash_cppflags="-isystem$with_sparsehash/include"
 	sparsehash_ldflags="-L$with_sparsehash/lib"
@@ -122,10 +126,15 @@ AC_DEFINE_UNQUOTED(FMBITS, $enable_fm,
 				   [Width of bits of the FM-index in bits])
 
 AC_ARG_ENABLE(maxk, AS_HELP_STRING([--enable-maxk=N],
-	[set the maximum k-mer length (default is 96)]),
-	[], [enable_maxk=96])
+	[set the maximum k-mer length (default is 128)]),
+	[], [enable_maxk=128])
 AC_DEFINE_UNQUOTED(MAX_KMER, [$enable_maxk], [maximum k-mer length])
 
+AC_ARG_ENABLE(max-hashes, AS_HELP_STRING([--enable-max-hashes],
+	[set the maximum number of Bloom filter hash functions (default is 32)]),
+	[], [enable_max_hashes=32])
+AC_DEFINE_UNQUOTED(MAX_HASHES, [$enable_max_hashes], [maximum Bloom filter hash functions])
+
 # Find the absolute path to the source.
 my_abs_srcdir=$(cd $srcdir; pwd)
 
@@ -220,6 +229,9 @@ fi
 if (test "$ac_cv_header_sqlite3_h" = "yes" -a "$ac_cv_lib_sqlite3_main" = "yes"); then
 	AC_DEFINE(_SQL, 1, [Define to 1 if you have sqlite lib/header])
 fi
+AM_CONDITIONAL(HAVE_SQLITE3,
+	[test "$ac_cv_header_sqlite3_h" = "yes" -a "$ac_cv_lib_sqlite3_main" = "yes"],
+	[Define to 1 if you have sqlite lib/header])
 AC_SUBST(SQLITE_LIBS, "$LIBS")
 LIBS=$libs
 
@@ -284,18 +296,21 @@ AC_CONFIG_FILES([
 	Unittest/Makefile
 	LogKmerCount/Makefile
 	Bloom/Makefile
+	BloomDBG/Makefile
 	DataBase/Makefile
+	lib/bloomfilter/Makefile
+	lib/rolling-hash/Makefile
 ])
 
 if test "$with_sparsehash" != "no" -a "$ac_cv_header_google_sparse_hash_map" != "yes"; then
 	AC_MSG_ERROR([ABySS should be compiled with Google sparsehash to
 	reduce memory usage. It may be downloaded here:
-	http://code.google.com/p/google-sparsehash
+	https://code.google.com/p/sparsehash/
 
 	If you do not wish to use sparsehash, specify --without-sparsehash.])
 fi
 
-if test $ac_cv_header_pthread_h != yes -o $ac_cv_lib_pthread_pthread_create != yes; then
+if test x"$have_pthread" != x"yes"; then
 	AC_MSG_WARN([Warning: Running the unit tests with 'make check' has been disabled
 	because pthread.h and/or libpthread could not be found.])
 fi
diff --git a/doc/ABYSS.1 b/doc/ABYSS.1
index 6b90240..14a9f69 100644
--- a/doc/ABYSS.1
+++ b/doc/ABYSS.1
@@ -1,4 +1,4 @@
-.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 1.9.0" "User Commands"
+.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.0.1" "User Commands"
 .SH NAME
 ABYSS \- assemble short reads into contigs
 .SH SYNOPSIS
diff --git a/doc/abyss-pe.1 b/doc/abyss-pe.1
index 349ee05..fb94fe7 100644
--- a/doc/abyss-pe.1
+++ b/doc/abyss-pe.1
@@ -1,4 +1,4 @@
-.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 1.9.0" "User Commands"
+.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.0.1" "User Commands"
 .SH NAME
 abyss-pe - assemble reads into contigs
 .SH SYNOPSIS
@@ -49,7 +49,10 @@ files containing single-end reads
 maximum number of branches of a bubble [2]
 .TP
 .B b
-maximum length of a bubble (bp) [10000]
+maximum length of a bubble (bp) [""]
+.br
+abyss-pe has two bubble popping stages. The default limits are 3*k bp
+for ABYSS and 10000 bp for PopBubbles.
 .TP
 .B c
 minimum mean k-mer coverage of a unitig [sqrt(median)]
@@ -58,10 +61,10 @@ minimum mean k-mer coverage of a unitig [sqrt(median)]
 allowable error of a distance estimate (bp) [6]
 .TP
 .B e
-minimum erosion k-mer coverage [sqrt(median)]
+minimum erosion k-mer coverage [round(sqrt(median))]
 .TP
 .B E
-minimum erosion k-mer coverage per strand [1]
+minimum erosion k-mer coverage per strand [1 if sqrt(median) > 2 else 0]
 .TP
 .B j
 number of threads [2]
@@ -73,7 +76,7 @@ size of a k-mer (when K is not set) or the span of a k-mer pair (when K is set)
 size of a single k-mer in a k-mer pair (bp)
 .TP
 .B l
-minimum alignment length of a read (bp) [k]
+minimum alignment length of a read (bp) [40]
 .TP
 .B m
 minimum overlap of two unitigs (bp) [30]
@@ -98,13 +101,13 @@ minimum base quality [0]
 Mask all bases of reads whose quality is less than Q as `N'.
 .TP
 .B s
-minimum unitig size required for building contigs (bp) [200]
+minimum unitig size required for building contigs (bp) [1000]
 .br
 The seed length should be at least twice the value of k. If more
 sequence is assembled than the expected genome size, try increasing s.
 .TP
 .B S
-minimum contig size required for building scaffolds (bp) [s]
+minimum contig size required for building scaffolds (bp) [1000-10000]
 .TP
 .B SS
 SS=--SS to assemble in strand-specific mode
 Assumes that the first read in a read pair is reversed WRT the
 transcripts sequenced.
 .TP
 .B t
-minimum tip size (bp) [2k]
+maximum length of blunt contigs to trim [k]
 .TP
 .B v
 v=-v to enable verbose logging
diff --git a/doc/abyss-tofastq.1 b/doc/abyss-tofastq.1
index e997080..4250191 100644
--- a/doc/abyss-tofastq.1
+++ b/doc/abyss-tofastq.1
@@ -1,4 +1,4 @@
-.TH abyss-tofastq "1" "2015-May" "ABySS 1.9.0" "User Commands"
+.TH abyss-tofastq "1" "2015-May" "ABySS 2.0.1" "User Commands"
 .SH NAME
 abyss-tofastq \- convert various file formats to FASTQ format
 .br
diff --git a/doc/flowchart.graffle b/doc/flowchart.graffle
index c00b837..a695117 100644
--- a/doc/flowchart.graffle
+++ b/doc/flowchart.graffle
@@ -5198,7 +5198,7 @@
 {\colortbl;\red255\green255\blue255;}
 \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
 
-\f0\b\fs28 \cf0 ABySS paired-end pipeline version 1.9.0}</string>
+\f0\b\fs28 \cf0 ABySS paired-end pipeline version 2.0.1}</string>
 				<key>VerticalPad</key>
 				<integer>0</integer>
 			</dict>
diff --git a/lib/bloomfilter/BloomFilter.hpp b/lib/bloomfilter/BloomFilter.hpp
new file mode 100644
index 0000000..4b60eb7
--- /dev/null
+++ b/lib/bloomfilter/BloomFilter.hpp
@@ -0,0 +1,446 @@
+/*
+ *
+ * BloomFilter.hpp
+ *
+ *  Created on: Aug 10, 2012
+ *      Author: cjustin
+ */
+
+#ifndef BLOOMFILTER_H_
+#define BLOOMFILTER_H_
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include <math.h>
+#include <fstream>
+#include <iostream>
+#include <sys/stat.h>
+#include <cstring>
+#include <cassert>
+#include <cstdlib>
+#include <stdio.h>
+#include <cstring>
+#include "lib/rolling-hash/rolling.h"
+
+using namespace std;
+
+static const uint8_t bitsPerChar = 0x08;
+static const unsigned char bitMask[0x08] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
+		0x40, 0x80 };
+
+inline unsigned popCnt(unsigned char x) {
+	return ((0x876543210
+			>> (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2))
+			>> ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & 0xf;
+}
+
+class BloomFilter {
+public:
+
+	struct FileHeader {
+		char magic[8];
+		uint32_t hlen;
+		uint64_t size;
+		uint32_t nhash;
+		uint32_t kmer;
+		double dFPR;
+		uint64_t nEntry;
+		uint64_t tEntry;
+	};
+
+	/*
+	 * Default constructor.
+	 */
+	BloomFilter() :
+			m_filter(0), m_size(0), m_sizeInBytes(0), m_hashNum(0), m_kmerSize(
+					0), m_dFPR(0), m_nEntry(0), m_tEntry(0) {
+	}
+
+	/* De novo filter constructor.
+	 *
+	 * preconditions:
+	 * filterSize must be a multiple of 64
+	 *
+	 * kmerSize refers to the number of bases the kmer has
+	 */
+	BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize) :
+			m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize), m_dFPR(
+					0), m_nEntry(0), m_tEntry(0) {
+		initSize(m_size);
+		memset(m_filter, 0, m_sizeInBytes);
+	}
+
+	/* De novo filter constructor.
+	 * Allocates a filter size based on the number of expected elements and FPR
+	 *
+	 * If hashNum is set to 0, an optimal value is computed based on the FPR
+	 */
+	BloomFilter(size_t expectedElemNum, double fpr, unsigned hashNum,
+			unsigned kmerSize) :
+			m_size(0), m_hashNum(hashNum), m_kmerSize(kmerSize), m_dFPR(fpr), m_nEntry(
+					0), m_tEntry(0) {
+		if (m_hashNum == 0) {
+			m_hashNum = calcOptiHashNum(m_dFPR);
+		}
+		if (m_size == 0) {
+			m_size = calcOptimalSize(expectedElemNum, m_dFPR);
+		}
+		initSize(m_size);
+		memset(m_filter, 0, m_sizeInBytes);
+	}
+
+	BloomFilter(const string &filterFilePath) {
+		FILE *file = fopen(filterFilePath.c_str(), "rb");
+		if (file == NULL) {
+			cerr << "file \"" << filterFilePath << "\" could not be read."
+					<< endl;
+			exit(1);
+		}
+
+		loadHeader(file);
+
+		long int lCurPos = ftell(file);
+		fseek(file, 0, 2);
+		size_t fileSize = ftell(file) - sizeof(struct FileHeader);
+		fseek(file, lCurPos, 0);
+		if (fileSize != m_sizeInBytes) {
+			cerr << "Error: " << filterFilePath
+					<< " does not match size given by its information file. Size: "
+					<< fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
+			exit(1);
+		}
+
+		size_t countRead = fread(m_filter, fileSize, 1, file);
+		if (countRead != 1 && fclose(file) != 0) {
+			cerr << "file \"" << filterFilePath << "\" could not be read."
+					<< endl;
+			exit(1);
+		}
+	}
+
+	void loadHeader(FILE *file) {
+
+		FileHeader header;
+		if (fread(&header, sizeof(struct FileHeader), 1, file) == 1) {
+			cerr << "Loading header..." << endl;
+		} else {
+			cerr << "Failed to load header" << endl;
+		}
+		char magic[9];
+		strncpy(magic, header.magic, 8);
+		magic[8] = '\0';
+
+//        cerr << "Loading header... magic: " <<
+//            magic << " hlen: " <<
+//            header.hlen << " size: " <<
+//            header.size << " nhash: " <<
+//            header.nhash << " kmer: " <<
+//            header.kmer << " dFPR: " <<
+//            header.dFPR << " aFPR: " <<
+//            header.aFPR << " rFPR: " <<
+//            header.rFPR << " nEntry: " <<
+//            header.nEntry << " tEntry: " <<
+//            header.tEntry << endl;
+
+		m_size = header.size;
+		initSize(m_size);
+		m_hashNum = header.nhash;
+		m_kmerSize = header.kmer;
+	}
+
+	/*
+	 * Accepts a list of precomputed hash values. Faster than rehashing each time.
+	 */
+	void insert(vector<size_t> const &precomputed) {
+
+		//iterates through hashed values adding it to the filter
+		for (size_t i = 0; i < m_hashNum; ++i) {
+			size_t normalizedValue = precomputed.at(i) % m_size;
+			__sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+					bitMask[normalizedValue % bitsPerChar]);
+		}
+	}
+
+	/*
+	 * Accepts a list of precomputed hash values. Faster than rehashing each time.
+	 */
+	void insert(const size_t precomputed[]) {
+
+		//iterates through hashed values adding it to the filter
+		for (size_t i = 0; i < m_hashNum; ++i) {
+			size_t normalizedValue = precomputed[i] % m_size;
+			__sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+				bitMask[normalizedValue % bitsPerChar]);
+		}
+	}
+
+	void insert(const char* kmer) {
+		uint64_t hVal = getChval(kmer, m_kmerSize);
+		for (unsigned i = 0; i < m_hashNum; i++) {
+			size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+			__sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+					bitMask[normalizedValue % bitsPerChar]);
+		}
+	}
+
+	/*
+	 * Returns if already inserted
+	 */
+	bool insertAndCheck(const char* kmer) {
+		uint64_t hVal = getChval(kmer, m_kmerSize);
+		bool found = true;
+		for (unsigned i = 0; i < m_hashNum; i++) {
+			size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+			found &= __sync_or_and_fetch(
+					&m_filter[normalizedValue / bitsPerChar],
+					bitMask[normalizedValue % bitsPerChar]);
+		}
+		return found;
+	}
+
+	/*
+	 * Accepts a list of precomputed hash values. Faster than rehashing each time.
+	 * Returns if already inserted
+	 */
+	bool insertAndCheck(vector<size_t> const &precomputed) {
+		//iterates through hashed values adding it to the filter
+		bool found = true;
+		for (size_t i = 0; i < m_hashNum; ++i) {
+			size_t normalizedValue = precomputed.at(i) % m_size;
+			found &= __sync_or_and_fetch(
+					&m_filter[normalizedValue / bitsPerChar],
+					bitMask[normalizedValue % bitsPerChar]);
+		}
+		return found;
+	}
+
+	/*
+	 * Accepts a list of precomputed hash values. Faster than rehashing each time.
+	 */
+	bool contains(vector<size_t> const &precomputed) const {
+		for (size_t i = 0; i < m_hashNum; ++i) {
+			size_t normalizedValue = precomputed.at(i) % m_size;
+			unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+			if ((m_filter[normalizedValue / bitsPerChar] & bit) != bit) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/*
+	 * Accepts a list of precomputed hash values. Faster than rehashing each time.
+	 */
+	bool contains(const size_t precomputed[]) const {
+		for (size_t i = 0; i < m_hashNum; ++i) {
+			size_t normalizedValue = precomputed[i] % m_size;
+			unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+			if ((m_filter[normalizedValue / bitsPerChar] & bit) != bit) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/*
+	 * Single pass filtering, computes hash values on the fly
+	 */
+	bool contains(const char* kmer) const {
+		uint64_t hVal = getChval(kmer, m_kmerSize);
+		for (unsigned i = 0; i < m_hashNum; i++) {
+			size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+			unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+			if ((m_filter[normalizedValue / bitsPerChar] & bit) == 0)
+				return false;
+		}
+		return true;
+	}
+
+	void writeHeader(ofstream &out) const {
+		FileHeader header;
+		strncpy(header.magic, "BlOOMFXX", 8);
+		char magic[9];
+		strncpy(magic, header.magic, 8);
+		magic[8] = '\0';
+
+		header.hlen = sizeof(struct FileHeader);
+		header.size = m_size;
+		header.nhash = m_hashNum;
+		header.kmer = m_kmerSize;
+		header.dFPR = m_dFPR;
+		header.nEntry = m_nEntry;
+		header.tEntry = m_tEntry;
+
+//        cerr << "Writing header... magic: "
+//            << magic << " hlen: "
+//            << header.hlen << " size: "
+//            << header.size << " nhash: "
+//            << header.nhash << " kmer: "
+//            << header.kmer << " dFPR: "
+//            << header.dFPR << " aFPR: "
+//            << header.aFPR << " rFPR: "
+//            << header.rFPR << " nEntry: "
+//            << header.nEntry << " tEntry: "
+//            << header.tEntry << endl;
+
+		out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader));
+	}
+
+	/*
+	 * Stores the filter as a binary file to the path specified
+	 * Stores uncompressed because the random data tends to
+	 * compress poorly anyway
+	 */
+	void storeFilter(string const &filterFilePath) const {
+		ofstream myFile(filterFilePath.c_str(), ios::out | ios::binary);
+
+		cerr << "Storing filter. Filter is " << m_sizeInBytes << "bytes."
+				<< endl;
+
+		assert(myFile);
+		writeHeader(myFile);
+
+		//write out each block
+		myFile.write(reinterpret_cast<char*>(m_filter), m_sizeInBytes);
+
+		myFile.close();
+		assert(myFile);
+	}
+
+	size_t getPop() const {
+		size_t i, popBF = 0;
+#pragma omp parallel for reduction(+:popBF)
+		for (i = 0; i < (m_size + 7) / 8; i++)
+			popBF = popBF + popCnt(m_filter[i]);
+		return popBF;
+	}
+
+	unsigned getHashNum() const {
+		return m_hashNum;
+	}
+
+	unsigned getKmerSize() const {
+		return m_kmerSize;
+	}
+
+//    void setdFPR(double value) {
+//        m_dFPR = value;
+//    }
+
+	/*
+	 * Calculates the false positive rate that a redundant entry is actually
+	 * a unique entry
+	 */
+	double getRedudancyFPR() {
+		assert(m_nEntry > 0);
+		double total = log(calcFPR_numInserted(1));
+		for (size_t i = 2; i < m_nEntry; ++i) {
+			total = log(exp(total) + calcFPR_numInserted(i));
+		}
+		return exp(total) / m_nEntry;
+	}
+
+	/*
+	 * Return FPR based on popcount
+	 */
+	double getFPR() const {
+		return pow(double(getPop())/double(m_size), m_hashNum);
+	}
+
+	/*
+	 * Return FPR based on number of inserted elements
+	 */
+	double getFPR_numEle() const {
+		assert(m_nEntry > 0);
+		return calcFPR_numInserted(m_nEntry);
+	}
+
+	uint64_t getnEntry() {
+		return m_nEntry;
+	}
+
+	uint64_t gettEntry() {
+		return m_tEntry;
+	}
+
+	void setnEntry(uint64_t value) {
+		m_nEntry = value;
+	}
+
+	void settEntry(uint64_t value) {
+		m_tEntry = value;
+	}
+
+	size_t getFilterSize() const {
+		return m_size;
+	}
+
+	~BloomFilter() {
+		delete[] m_filter;
+	}
+private:
+	BloomFilter(const BloomFilter& that); //to prevent copy construction
+
+	/*
+	 * Checks filter size and initializes filter
+	 */
+	void initSize(size_t size) {
+		if (size % 8 != 0) {
+			cerr << "ERROR: Filter Size \"" << size
+					<< "\" is not a multiple of 8." << endl;
+			exit(1);
+		}
+		m_sizeInBytes = size / bitsPerChar;
+		m_filter = new unsigned char[m_sizeInBytes];
+	}
+
+	/*
+	 * Only returns multiples of 64 for filter building purposes
+	 * Is an estimated size using approximations of FPR formula
+	 * given the number of hash functions
+	 */
+	size_t calcOptimalSize(size_t entries, double fpr) const {
+		size_t non64ApproxVal = size_t(
+				-double(entries) * double(m_hashNum)
+						/ log(1.0 - pow(fpr, double(1 / double(m_hashNum)))));
+
+		return non64ApproxVal + (64 - non64ApproxVal % 64);
+	}
+
+	/*
+	 * Calculates the optimal number of hash function to use
+	 * Calculation assumes optimal ratio of bytes per entry given a fpr
+	 */
+	static unsigned calcOptiHashNum(double fpr) {
+		return unsigned(-log(fpr) / log(2));
+	}
+
+	/*
+	 * Calculate FPR based on hash functions, size and number of entries
+	 * see http://en.wikipedia.org/wiki/Bloom_filter
+	 */
+	double calcFPR_numInserted(size_t numEntr) const {
+		return pow(
+				1.0
+						- pow(1.0 - 1.0 / double(m_size),
+								double(numEntr) * m_hashNum), double(m_hashNum));
+	}
+
+	/*
+	 * Calculates the optimal FPR to use based on hash functions
+	 */
+	double calcFPR_hashNum(unsigned hashFunctNum) const {
+		return pow(2, -hashFunctNum);
+	}
+
+	uint8_t* m_filter;
+	size_t m_size;
+	size_t m_sizeInBytes;
+	unsigned m_hashNum;
+	unsigned m_kmerSize;
+	double m_dFPR;
+	uint64_t m_nEntry;
+	uint64_t m_tEntry;
+};
+
+#endif /* BLOOMFILTER_H_ */
diff --git a/lib/bloomfilter/Makefile.am b/lib/bloomfilter/Makefile.am
new file mode 100644
index 0000000..fb10c08
--- /dev/null
+++ b/lib/bloomfilter/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST = README.md
diff --git a/lib/bloomfilter/README.md b/lib/bloomfilter/README.md
new file mode 100644
index 0000000..651a381
--- /dev/null
+++ b/lib/bloomfilter/README.md
@@ -0,0 +1,4 @@
+These files come from:
+
+* https://github.com/bcgsc/bloomfilter
+* commit f1232c2
diff --git a/lib/rolling-hash/Makefile.am b/lib/rolling-hash/Makefile.am
new file mode 100644
index 0000000..fb10c08
--- /dev/null
+++ b/lib/rolling-hash/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST = README.md
diff --git a/lib/rolling-hash/README.md b/lib/rolling-hash/README.md
new file mode 100644
index 0000000..b76df57
--- /dev/null
+++ b/lib/rolling-hash/README.md
@@ -0,0 +1,2 @@
+* source repo: https://github.com/bcgsc/ntHash
+* git commit: 9f107de
diff --git a/lib/rolling-hash/rolling.h b/lib/rolling-hash/rolling.h
new file mode 100644
index 0000000..1e27d6a
--- /dev/null
+++ b/lib/rolling-hash/rolling.h
@@ -0,0 +1,316 @@
+#ifndef ROLLING_HASH_H
+#define ROLLING_HASH_H
+
+#include <stdint.h>
+
+// offset for the complement base in the random seeds table
+const int cpOff = -20;
+
+// shift for generating multiple hash values
+const int varShift = 27;
+
+// seed for generating multiple hash values
+const uint64_t varSeed = 10427061540882326010ul;
+
+// 64-bit random seed table corresponding to bases and their complements
+static const uint64_t seedTab[256] = {
+    0, 0, 0, 0, 0, 0, 0, 0, // 0..7
+    0, 0, 0, 0, 0, 0, 0, 0, // 8..15
+    0, 0, 0, 0, 0, 0, 0, 0, // 16..23
+    0, 0, 0, 0, 0, 0, 0, 0, // 24..31
+    0, 0, 0, 0, 0, 0, 0, 0, // 32..39
+    0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 40..47
+    0, 0, 0, 3572411708064410444ul, 0, 0, 0, 0, // 48..55
+    0, 0, 0, 0, 0, 0, 0, 0, // 56..63
+    4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 64..71
+    0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 72..79
+    0, 0, 0, 3572411708064410444ul, 2978368046464386134ul, 0, 0, 0, // 80..87
+    0, 0, 0, 0, 0, 0, 0, 0, // 88..95
+    4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 96..103
+    0, 0, 0, 0, 0, 0, 0, 0, // 104..111
+    0, 0, 0, 0, 2978368046464386134ul, 0, 0, 0, // 112..119
+    0, 0, 0, 0, 0, 0, 0, 0, // 120..127
+    0, 0, 0, 0, 0, 0, 0, 0, // 128..135
+    0, 0, 0, 0, 0, 0, 0, 0, // 136..143
+    0, 0, 0, 0, 0, 0, 0, 0, // 144..151
+    0, 0, 0, 0, 0, 0, 0, 0, // 152..159
+    0, 0, 0, 0, 0, 0, 0, 0, // 160..167
+    0, 0, 0, 0, 0, 0, 0, 0, // 168..175
+    0, 0, 0, 0, 0, 0, 0, 0, // 176..183
+    0, 0, 0, 0, 0, 0, 0, 0, // 184..191
+    0, 0, 0, 0, 0, 0, 0, 0, // 192..199
+    0, 0, 0, 0, 0, 0, 0, 0, // 200..207
+    0, 0, 0, 0, 0, 0, 0, 0, // 208..215
+    0, 0, 0, 0, 0, 0, 0, 0, // 216..223
+    0, 0, 0, 0, 0, 0, 0, 0, // 224..231
+    0, 0, 0, 0, 0, 0, 0, 0, // 232..239
+    0, 0, 0, 0, 0, 0, 0, 0, // 240..247
+    0, 0, 0, 0, 0, 0, 0, 0  // 248..255
+};
+
+// rotate "v" to the left by "s" positions
+inline uint64_t rol(const uint64_t v, const int s) {
+    return (v << s) | (v >> (64 - s));
+}
+
+// rotate "v" to the right by "s" positions
+inline uint64_t ror(const uint64_t v, const int s) {
+    return (v >> s) | (v << (64 - s));
+}
+
+// forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
+inline uint64_t getFhval(const char * kmerSeq, const unsigned k) {
+    uint64_t hVal=0;
+    for(unsigned i=0; i<k; i++)
+        hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+    return hVal;
+}
+
+// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
+inline uint64_t getRhval(const char * kmerSeq, const unsigned k) {
+    uint64_t hVal=0;
+    for(unsigned i=0; i<k; i++)
+        hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+    return hVal;
+}
+
+// canonical hash value of the base kmer, i.e. chval(kmer_0)
+inline uint64_t getChval(const char * kmerSeq, const unsigned k) {
+    uint64_t fhVal = getFhval(kmerSeq, k);
+    uint64_t rhVal = getRhval(kmerSeq, k);
+    return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// initialize forward-strand hash value of the first kmer, i.e. fhval(kmer_0)
+inline uint64_t initHashes(const char * kmerSeq, const unsigned k) {
+    return getFhval(kmerSeq, k);
+}
+
+// initialize canonical hash value of the first kmer, i.e. chval(kmer_0)
+inline uint64_t initHashes(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
+    fhVal = getFhval(kmerSeq, k);
+    rhVal = getRhval(kmerSeq, k);
+    return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// recursive forward-strand hash value for next k-mer
+inline uint64_t rollHashesRight(const uint64_t fhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
+    return(rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]);
+}
+
+// recursive cannonical hash value for next k-mer
+inline uint64_t rollHashesRight(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
+    fhVal = rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
+    rhVal = ror(rhVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
+    return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// recursive forward-strand hash value for prev k-mer
+inline uint64_t rollHashesLeft(const uint64_t fhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
+    return(ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1));
+}
+
+// recursive canonical hash value for prev k-mer
+inline uint64_t rollHashesLeft(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
+    fhVal = ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
+    rhVal = rol(rhVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
+    return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// change a single base and update forward-strand hash value accordingly
+inline uint64_t setBase(uint64_t fhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
+{
+    fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+    kmerSeq[pos] = base;
+    fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+    return fhVal;
+}
+
+// change a single base and update hash values accordingly
+inline uint64_t setBase(uint64_t& fhVal, uint64_t& rhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
+{
+    fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+    rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
+    kmerSeq[pos] = base;
+    fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+    rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
+    return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+/**
+ * Compute multiple pseudo-independent hash values from a seed hash value.
+ *
+ * @param hashes array for storing computed hash values
+ * @param seedVal seed value for multi-hash calculation
+ * @param numHashes number of hash values to compute
+ * @param k k-mer size
+ */
+inline void multiHash(uint64_t hashes[], uint64_t seedVal, unsigned numHashes, unsigned k)
+{
+    for (unsigned i = 0; i < numHashes; i++) {
+        hashes[i] = seedVal * (i ^ k * varSeed);
+        hashes[i] ^= hashes[i] >> varShift;
+    }
+}
+
+// spaced-seed hash values
+
+/**
+ * Calculate forward-strand spaced seed hash value of the base kmer, i.e. fhval(kmer_0)
+ *
+ * @param kVal set to forward-strand hash value for unmasked k-mer
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq k-mer to be hashed
+ * @param k k-mer size
+ * @return hash value for masked forward-strand k-mer
+ */
+inline uint64_t getFhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
+    kVal=0;
+    uint64_t sVal=0;
+    for(unsigned i=0; i<k; i++) {
+        kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+        if(seedSeq[i]=='1')
+            sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+    }
+    return sVal;
+}
+
+/**
+ * Calculate reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
+ *
+ * @param kVal set to reverse-strand hash value for unmasked k-mer
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq k-mer to be hashed
+ * @param k k-mer size
+ * @return hash for masked reverse-strand k-mer
+ */
+// reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
+inline uint64_t getRhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
+    kVal=0;
+    uint64_t sVal=0;
+    for(unsigned i=0; i<k; i++) {
+        kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+        if(seedSeq[i]=='1')
+            sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+    }
+    return sVal;
+}
+
+/**
+ * Recursive forward-strand spaced seed hash value for next k-mer
+ *
+ * @param kVal hash value for current k-mer unmasked and in forward orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for *current* k-mer (not the k-mer we are rolling into)
+ * @param charIn new base we are rolling in from the right
+ * @param k k-mer size
+ * @return hash for masked k-mer in forward orientation
+ */
+inline uint64_t rollHashesRight(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+    const unsigned charOut = kmerSeq[0];
+    kVal = rol(kVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
+    uint64_t sVal=kVal;
+    for(unsigned i=1; i<k-1; i++) {
+        if(seedSeq[i]!='1')
+            sVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
+    }
+    return sVal;
+}
+
+/**
+ * Recursive forward-strand spaced seed hash value for prev k-mer
+ *
+ * @param kVal hash value for current k-mer unmasked and in forward orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
+ * @param charIn new base we are rolling in from the left
+ * @param k k-mer size
+ * @return hash for masked k-mer in forward orientation
+ */
+inline uint64_t rollHashesLeft(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+    const unsigned charOut = kmerSeq[k-1];
+    kVal = ror(kVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
+    uint64_t sVal=kVal;
+    for(unsigned i=1; i<k-1; i++) {
+        if(seedSeq[i]!='1')
+            sVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
+    }
+    return sVal;
+}
+
+/**
+ * Recursive canonical spaced seed hash value for next k-mer
+ *
+ * @param fkVal hash value for current k-mer unmasked and in forward orientation
+ * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
+ * @param charIn new base we are rolling in from the right
+ * @param k k-mer size
+ * @return canonical hash value for masked k-mer
+ */
+inline uint64_t rollHashesRight(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+    const unsigned charOut = kmerSeq[0];
+    fkVal = rol(fkVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
+    rkVal = ror(rkVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
+    uint64_t fsVal=fkVal, rsVal=rkVal;
+    for(unsigned i=1; i<k-1; i++) {
+        if(seedSeq[i]!='1') {
+            fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
+            rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]+cpOff], i);
+        }
+    }
+    return (rsVal<fsVal)? rsVal : fsVal;
+}
+
+/**
+ * Recursive canonical spaced seed hash value for prev k-mer
+ *
+ * @param fkVal hash value for current k-mer unmasked and in forward orientation
+ * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for current k-mer (not the k-mer we are rolling into)
+ * @param charIn new base we are rolling in from the left
+ * @param k k-mer size
+ * @return canonical hash value for masked k-mer
+ */
+inline uint64_t rollHashesLeft(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+    const unsigned charOut = kmerSeq[k-1];
+    fkVal = ror(fkVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
+    rkVal = rol(rkVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
+    uint64_t fsVal=fkVal, rsVal=rkVal;
+    for(unsigned i=1; i<k-1; i++) {
+        if(seedSeq[i]!='1') {
+            fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]], k-1-i);
+            rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i-1]+cpOff], i);
+        }
+    }
+    return (rsVal<fsVal)? rsVal : fsVal;
+}
+
+/**
+ * Change a single base and recompute spaced seed hash values
+ *
+ * @param fkVal hash value for current k-mer unmasked and in forward orientation
+ * @param rkVal hash value for current k-mer unmasked and in reverse complement orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for current k-mer
+ * @param pos position of base to change
+ * @param base new base value
+ * @param k k-mer size
+ * @return updated canonical hash value for masked k-mer
+ */
+inline uint64_t setBase(uint64_t& fkVal, uint64_t& rkVal, const char * seedSeq, char * kmerSeq, unsigned pos, char base, unsigned k)
+{
+    setBase(fkVal, rkVal, kmerSeq, pos, base, k);
+    uint64_t fsVal=fkVal, rsVal=rkVal;
+    for(unsigned i=0; i<k; i++) {
+        if(seedSeq[i]!='1') {
+            fsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+            rsVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+        }
+    }
+    return (rsVal<fsVal)? rsVal : fsVal;
+}
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/abyss.git



More information about the debian-med-commit mailing list