[med-svn] [mash] 01/05: New upstream version 2.0

Sascha Steinbiss satta at debian.org
Sun Oct 1 10:19:11 UTC 2017


This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository mash.

commit 4ad406c58b9b0fc5e0c3fa4f6f2a2a5330a0b7e1
Author: Sascha Steinbiss <satta at debian.org>
Date:   Sun Oct 1 10:25:44 2017 +0100

    New upstream version 2.0
---
 .gitignore                        |   9 +
 INSTALL.txt                       |   2 +-
 Makefile.in                       |   3 +-
 doc/sphinx/tutorials.rst          |  14 +-
 src/mash/Command.cpp              |  24 +-
 src/mash/Command.h                |   4 +
 src/mash/CommandBounds.cpp        |  97 +++--
 src/mash/CommandBounds.h          |   4 +
 src/mash/CommandContain.cpp       |   5 +
 src/mash/CommandContain.h         |   4 +
 src/mash/CommandDistance.cpp      |   7 +-
 src/mash/CommandDistance.h        |   4 +
 src/mash/CommandFind.cpp          |  14 +-
 src/mash/CommandFind.h            |   4 +
 src/mash/CommandInfo.cpp          |  61 ++-
 src/mash/CommandInfo.h            |   4 +
 src/mash/CommandList.cpp          |  13 +-
 src/mash/CommandList.h            |   4 +
 src/mash/CommandPaste.cpp         |  12 +-
 src/mash/CommandPaste.h           |   4 +
 src/mash/CommandScreen.cpp        | 844 ++++++++++++++++++++++++++++++++++++++
 src/mash/CommandScreen.h          | 163 ++++++++
 src/mash/CommandSketch.cpp        |  17 +-
 src/mash/CommandSketch.h          |   4 +
 src/mash/MinHashHeap.h            |   4 +-
 src/mash/Sketch.cpp               |  46 ++-
 src/mash/Sketch.h                 |  10 +-
 src/mash/capnp/MinHash.capnp      |   4 +-
 src/mash/hash.cpp                 |   2 +-
 src/mash/hash.h                   |   4 +-
 src/mash/mash.cpp                 |  20 +-
 src/mash/schema-1.0.0.json        | 150 +++++++
 src/mash/sketchParameterSetup.cpp |   7 +-
 src/mash/sketchParameterSetup.h   |   4 +
 src/mash/version.h                |   2 +-
 35 files changed, 1467 insertions(+), 107 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..07d55fa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/src/mash/*.o
+/src/mash/capnp/MinHash.capnp.*
+/Makefile
+/autom4te.cache/
+/config.log
+/config.status
+/configure
+/libmash.a
+/mash
diff --git a/INSTALL.txt b/INSTALL.txt
index 3eee3f6..842b179 100644
--- a/INSTALL.txt
+++ b/INSTALL.txt
@@ -54,7 +54,7 @@ Configure options:
   Installs binaries to bin/ and libraries to lib/ within this path. The default
   is /usr/local/, which will typically require sudo for make install. Must be
   absolute path.
---with-capnp=</path/to/capnp/> or  the
+--with-capnp=</path/to/capnp/>
   Where to find the Cap'n Proto install if not in the default location
   (/usr/local). Must be absolute path and should not include bin/ or lib/
   paths and should not include "bin" or "lib".
diff --git a/Makefile.in b/Makefile.in
index 7b44de6..146a9fc 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,4 +1,4 @@
-CXXFLAGS += -O3 -std=c++11 -Isrc -I at capnp@/include -I @mathinc@
+CXXFLAGS += -O3 -std=c++11 -Isrc -I at capnp@/include -I at mathinc@
 CPPFLAGS += @amcppflags@
 
 UNAME_S=$(shell uname -s)
@@ -15,6 +15,7 @@ SOURCES=\
 	src/mash/CommandBounds.cpp \
 	src/mash/CommandContain.cpp \
 	src/mash/CommandDistance.cpp \
+	src/mash/CommandScreen.cpp \
 	src/mash/CommandFind.cpp \
 	src/mash/CommandInfo.cpp \
 	src/mash/CommandPaste.cpp \
diff --git a/doc/sphinx/tutorials.rst b/doc/sphinx/tutorials.rst
index f188019..9df67e5 100644
--- a/doc/sphinx/tutorials.rst
+++ b/doc/sphinx/tutorials.rst
@@ -79,7 +79,7 @@ by ignoring single-copy k-mers, which are more likely to be erroneous:
 
 .. code::
 
-  mash sketch -m 2 reads.fastq
+  mash sketch -m 2 -k 16 -s 400 reads.fastq
 
 Run :code:`mash dist` with the RefSeq archive as the reference and the read
 sketch as the query:
@@ -93,3 +93,15 @@ Sort the results to see the top hits and their p-values:
 .. code ::
 
   sort -gk3 distances.tab | head
+
+Building a custom RefSeq database
+---------------------------------
+
+To create the RefSeq Mash database, genomes were downloaded from NCBI
+(:code:`ftp.ncbi.nlm.nih.gov/refseq/release/complete`, fasta sequence and
+GenBank annotations for :code:`genomic`), and the
+`refseqCollate <https://github.com/ondovb/refseqCollate/releases>`_ utility was
+used to collate contigs/chromosomes into individual fasta files per genome.
+Groups of these files were sketched in parallel and then pasted together with
+:code:`mash paste`. This process could be repeated for more current or custom
+databases.
diff --git a/src/mash/Command.cpp b/src/mash/Command.cpp
index f4f7504..1dbab31 100644
--- a/src/mash/Command.cpp
+++ b/src/mash/Command.cpp
@@ -12,7 +12,15 @@
 #include "Command.h"
 #include "version.h"
 
-using namespace::std;
+using std::cout;
+using std::cerr;
+using std::endl;
+using std::string;
+using std::exception;
+using std::vector;
+using std::pair;
+
+namespace mash {
 
 Command::Option::Option
 (
@@ -167,6 +175,7 @@ Command::Command()
     addAvailableOption("individual", Option(Option::Boolean, "i", "Sketch", "Sketch individual sequences, rather than whole files, e.g. for multi-fastas of single-chromosome genomes or pair-wise gene comparisons.", ""));
     addAvailableOption("warning", Option(Option::Number, "w", "Sketch", "Probability threshold for warning about low k-mer size.", "0.01", 0, 1));
     addAvailableOption("reads", Option(Option::Boolean, "r", "Sketch", "Input is a read set. See Reads options below. Incompatible with -i.", ""));
+    addAvailableOption("seed", Option(Option::Integer, "S", "Sketch", "Seed to provide to the hash function.", "42", 0, 0xFFFFFFFF));
     addAvailableOption("memory", Option(Option::Size, "b", "Reads", "Use a Bloom filter of this size (raw bytes or with K/M/G/T) to filter out unique k-mers. This is useful if exact filtering with -m uses too much memory. However, some unique k-mers may pass erroneously, and copies cannot be counted beyond 2. Implies -r."));
     addAvailableOption("minCov", Option(Option::Integer, "m", "Reads", "Minimum copies of each k-mer required to pass noise filter for reads. Implies -r.", "1"));
     addAvailableOption("targetCov", Option(Option::Number, "c", "Reads", "Target coverage. Sketching will conclude if this coverage is reached before the end of the input file (estimated by average k-mer multiplicity). Implies -r."));
@@ -270,13 +279,13 @@ void Command::print() const
 		
 			if ( option.argumentMin != option.argumentMax )
 			{
-				stringstream stringMin;
-				stringstream stringMax;
+				std::stringstream stringMin;
+				std::stringstream stringMax;
 			
 				if ( option.type == Option::Integer )
 				{
-					stringMin << int(option.argumentMin);
-					stringMax << int(option.argumentMax);
+					stringMin << long(option.argumentMin);
+					stringMax << long(option.argumentMax);
 				}
 				else
 				{
@@ -357,6 +366,7 @@ void Command::useSketchOptions()
 #endif
     useOption("sketchSize");
     useOption("individual");
+    useOption("seed");
     useOption("warning");
     useOption("reads");
     useOption("memory");
@@ -389,7 +399,7 @@ void splitFile(const string & file, vector<string> & lines)
 {
     string line;
     
-    ifstream in(file);
+    std::ifstream in(file);
     
     if ( in.fail() )
     {
@@ -527,3 +537,5 @@ void printColumns(const vector<vector<string>> & columns, const vector<pair<int,
         cout << endl << endl;
     }
 }
+
+} // namespace mash
diff --git a/src/mash/Command.h b/src/mash/Command.h
index 817d8dc..7eb0331 100644
--- a/src/mash/Command.h
+++ b/src/mash/Command.h
@@ -12,6 +12,8 @@
 #include <vector>
 #include <set>
 
+namespace mash {
+
 class Command
 {
 
@@ -100,4 +102,6 @@ void splitFile(const std::string & file, std::vector<std::string> & lines);
 void printColumns(const std::vector<std::vector<std::string>> & columns, int indent = 2, int spacing = 2, const char * missing = "-", int max = 80);
 void printColumns(const std::vector<std::vector<std::string>> & columns, const std::vector<std::pair<int, std::string>> & dividers, int indent = 2, int spacing = 2, const char * missing = "-", int max = 80);
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandBounds.cpp b/src/mash/CommandBounds.cpp
index 59ad0a3..e47c6f2 100644
--- a/src/mash/CommandBounds.cpp
+++ b/src/mash/CommandBounds.cpp
@@ -15,14 +15,17 @@
     #include <gsl/gsl_cdf.h>
 #endif
 
-using namespace::std;
+using std::cout;
+using std::endl;
+
+namespace mash {
 
 CommandBounds::CommandBounds()
 : Command()
 {
     name = "bounds";
     summary = "Print a table of Mash error bounds.";
-    description = "Print a table of Mash error bounds for various sketch sizes and Mash distances based on a given k-mer size and desired confidence.";
+    description = "Print a table of Mash error bounds for various sketch sizes and Mash distances based on a given k-mer size and desired confidence. Note that these calculations assume sequences are much larger than the sketch size, and will overestimate error bounds if this is not the case.";
     argumentString = "";
     
     useOption("help");
@@ -51,51 +54,83 @@ int CommandBounds::run() const
 	cout << "   k:   " << k << endl;
 	cout << "   p:   " << getOption("prob").getArgumentAsNumber() << endl << endl;
 	
-	cout << "\tMash distance" << endl;
-	cout << "Sketch";
-	
-	for ( int i = 0; i < distCount; i++ )
+	for ( int cont = 0; cont < 2; cont++ )
 	{
-		cout << '\t' << dists[i];
-	}
+		if ( cont )
+		{
+			cout << "\tScreen distance" << endl;
+		}
+		else
+		{
+			cout << "\tMash distance" << endl;
+		}
+		
+		cout << "Sketch";
+	
+		for ( int i = 0; i < distCount; i++ )
+		{
+			cout << '\t' << dists[i];
+		}
 	
-	cout << endl;
+		cout << endl;
 	
-	for ( int i = 0; i < sketchSizeCount; i++ )
-	{
-		int s = sketchSizes[i];
-		cout << s;
-		
-		for ( int j = 0; j < distCount; j++ )
+		for ( int i = 0; i < sketchSizeCount; i++ )
 		{
-			double m2j = 1.0 / (2.0 * exp(k * dists[j]) - 1.0);
+			int s = sketchSizes[i];
+			cout << s;
+		
+			for ( int j = 0; j < distCount; j++ )
+			{
+				double m2j;
+				
+				if ( cont )
+				{
+					m2j = exp(-k * dists[j]);
+				}
+				else
+				{
+					m2j = 1.0 / (2.0 * exp(k * dists[j]) - 1.0);
+				}
 			
-			int x = 0;
+				int x = 0;
 			
-			while ( x < s )
-			{
+				while ( x < s )
+				{
 #ifdef USE_BOOST
-			    double cdfx = cdf(binomial(s, m2j), x);
+					double cdfx = cdf(binomial(s, m2j), x);
 #else
-			    double cdfx = gsl_cdf_binomial_P(x, m2j, s);
+					double cdfx = gsl_cdf_binomial_P(x, m2j, s);
 #endif
-				if ( cdfx > q2 )
+					if ( cdfx > q2 )
+					{
+						break;
+					}
+				
+					x++;
+				}
+			
+				double je = double(x) / s;
+				double j2m;
+				
+				if ( cont )
 				{
-					break;
+					j2m = -1.0 / k * log(je);
+				}
+				else
+				{
+					j2m = -1.0 / k * log(2.0 * je / (1.0 + je));
 				}
 				
-				x++;
+				cout << '\t' << j2m - dists[j];
 			}
-			
-			double je = double(x) / s;
-			double j2m = -1.0 / k * log(2.0 * je / (1.0 + je));
-			cout << '\t' << j2m - dists[j];
-		}
 		
+			cout << endl;
+		}
+	
 		cout << endl;
 	}
 	
-	cout << endl;
-	
 	return 0;
 }
+
+} // namespace mash
diff --git a/src/mash/CommandBounds.h b/src/mash/CommandBounds.h
index 0cfe430..ce62ffb 100644
--- a/src/mash/CommandBounds.h
+++ b/src/mash/CommandBounds.h
@@ -9,6 +9,8 @@
 
 #include "Command.h"
 
+namespace mash {
+
 class CommandBounds : public Command
 {
 public:
@@ -17,4 +19,6 @@ public:
     int run() const; // override
 };
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandContain.cpp b/src/mash/CommandContain.cpp
index c7d1854..c95324e 100644
--- a/src/mash/CommandContain.cpp
+++ b/src/mash/CommandContain.cpp
@@ -14,6 +14,8 @@
 
 using namespace::std;
 
+namespace mash {
+
 CommandContain::CommandContain()
 : Command()
 {
@@ -96,6 +98,7 @@ int CommandContain::run() const
         parameters.kmerSize = sketchRef.getKmerSize();
         parameters.noncanonical = sketchRef.getNoncanonical();
         parameters.preserveCase = sketchRef.getPreserveCase();
+        parameters.seed = sketchRef.getHashSeed();
         
         string alphabet;
         sketchRef.getAlphabetAsString(alphabet);
@@ -258,3 +261,5 @@ double containSketches(const HashList & hashesSortedRef, const HashList & hashes
     
     return double(common) / j;
 }
+
+} // namespace mash
diff --git a/src/mash/CommandContain.h b/src/mash/CommandContain.h
index 05ffc27..c840109 100644
--- a/src/mash/CommandContain.h
+++ b/src/mash/CommandContain.h
@@ -10,6 +10,8 @@
 #include "Command.h"
 #include "Sketch.h"
 
+namespace mash {
+
 class CommandContain : public Command
 {
 public:
@@ -83,4 +85,6 @@ private:
 CommandContain::ContainOutput * contain(CommandContain::ContainInput * data);
 double containSketches(const HashList & hashesSortedRef, const HashList & hashesSortedQuery, double & errorToSet);
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandDistance.cpp b/src/mash/CommandDistance.cpp
index 9632324..ce8e52a 100644
--- a/src/mash/CommandDistance.cpp
+++ b/src/mash/CommandDistance.cpp
@@ -21,6 +21,8 @@
 
 using namespace::std;
 
+namespace mash {
+
 CommandDistance::CommandDistance()
 : Command()
 {
@@ -30,7 +32,7 @@ CommandDistance::CommandDistance()
     argumentString = "<reference> <query> [<query>] ...";
     
     useOption("help");
-    addOption("list", Option(Option::Boolean, "l", "Input", "List input. Each query file contains a list of sequence files, one per line. The reference file is not affected.", ""));
+    addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <query> specify paths to sequence files, one per line. The reference file is not affected.", ""));
     addOption("table", Option(Option::Boolean, "t", "Output", "Table output (will not report p-values, but fields will be blank if they do not meet the p-value threshold).", ""));
     //addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
     addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report.", "1.0", 0., 1.));
@@ -127,6 +129,7 @@ int CommandDistance::run() const
         parameters.kmerSize = sketchRef.getKmerSize();
         parameters.noncanonical = sketchRef.getNoncanonical();
         parameters.preserveCase = sketchRef.getPreserveCase();
+        parameters.seed = sketchRef.getHashSeed();
         
         string alphabet;
         sketchRef.getAlphabetAsString(alphabet);
@@ -422,3 +425,5 @@ double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerS
     return gsl_cdf_binomial_Q(x - 1, r, sketchSize);
 #endif
 }
+
+} // namespace mash
diff --git a/src/mash/CommandDistance.h b/src/mash/CommandDistance.h
index c1c931f..b209423 100644
--- a/src/mash/CommandDistance.h
+++ b/src/mash/CommandDistance.h
@@ -10,6 +10,8 @@
 #include "Command.h"
 #include "Sketch.h"
 
+namespace mash {
+
 class CommandDistance : public Command
 {
 public:
@@ -90,4 +92,6 @@ CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * input);
 void compareSketches(CommandDistance::CompareOutput::PairOutput * output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, uint64_t sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue);
 double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint64_t sketchSize);
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandFind.cpp b/src/mash/CommandFind.cpp
index 4004c8d..4d5e81c 100644
--- a/src/mash/CommandFind.cpp
+++ b/src/mash/CommandFind.cpp
@@ -14,7 +14,13 @@
 #include "ThreadPool.h"
 #include "sketchParameterSetup.h"
 
-using namespace::std;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+using std::vector;
+
+namespace mash {
 
 KSEQ_INIT(gzFile, gzread)
 
@@ -223,7 +229,7 @@ CommandFind::FindOutput * find(CommandFind::FindInput * data)
 
 void findPerStrand(const CommandFind::FindInput * input, CommandFind::FindOutput * output, bool minusStrand)
 {
-    typedef unordered_map < uint32_t, set<uint32_t> > PositionsBySequence_umap;
+    typedef std::unordered_map < uint32_t, std::set<uint32_t> > PositionsBySequence_umap;
     
     bool verbose = false;
     
@@ -322,6 +328,8 @@ void findPerStrand(const CommandFind::FindInput * input, CommandFind::FindOutput
     
     for ( PositionsBySequence_umap::iterator i = hits.begin(); i != hits.end(); i++ )
     {
+    	using std::set;
+    	
         // pointer to the position at the beginning of the window; to be updated
         // as the end of the window is incremented
         //
@@ -415,3 +423,5 @@ bool operator<(const CommandFind::FindOutput::Hit & a, const CommandFind::FindOu
     
     return a.score > b.score;
 }
+
+} // namespace mash
diff --git a/src/mash/CommandFind.h b/src/mash/CommandFind.h
index a83a35b..7f1de1c 100644
--- a/src/mash/CommandFind.h
+++ b/src/mash/CommandFind.h
@@ -12,6 +12,8 @@
 #include <string.h>
 #include <queue>
 
+namespace mash {
+
 class CommandFind : public Command
 {
 public:
@@ -90,4 +92,6 @@ CommandFind::FindOutput * find(CommandFind::FindInput * data);
 void findPerStrand(const CommandFind::FindInput * input, CommandFind::FindOutput * output, bool minusStrand);
 bool operator<(const CommandFind::FindOutput::Hit & a, const CommandFind::FindOutput::Hit & b);
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandInfo.cpp b/src/mash/CommandInfo.cpp
index 1e5fc2c..f9b65da 100644
--- a/src/mash/CommandInfo.cpp
+++ b/src/mash/CommandInfo.cpp
@@ -8,7 +8,19 @@
 #include "Sketch.h"
 #include <iostream>
 
-using namespace::std;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+using std::vector;
+
+namespace mash {
+
+#ifdef ARCH_32
+	#define HASH "MurmurHash3_x86_32"
+#else
+	#define HASH "MurmurHash3_x64_128"
+#endif
 
 CommandInfo::CommandInfo()
 : Command()
@@ -85,11 +97,21 @@ int CommandInfo::run() const
         return 1;
     }
     
-    Sketch sketch;
-    Sketch::Parameters params;
-    params.parallelism = 1;
-    
-    sketch.initFromFiles(arguments, params);
+	Sketch sketch;
+	Sketch::Parameters params;
+	params.parallelism = 1;
+	
+	uint64_t referenceCount;
+	
+	if ( header )
+	{
+		referenceCount = sketch.initParametersFromCapnp(arguments[0].c_str());
+	}
+	else
+	{
+	    sketch.initFromFiles(arguments, params);
+	    referenceCount = sketch.getReferenceCount();
+	}
     
     if ( counts )
     {
@@ -110,10 +132,11 @@ int CommandInfo::run() const
     	sketch.getAlphabetAsString(alphabet);
     	
 		cout << "Header:" << endl;
+		cout << "  Hash function (seed):          " << HASH << " (" << sketch.getHashSeed() << ")" << endl;
 		cout << "  K-mer size:                    " << sketch.getKmerSize() << " (" << (sketch.getUse64() ? "64" : "32") << "-bit hashes)" << endl;
 		cout << "  Alphabet:                      " << alphabet << (sketch.getNoncanonical() ? "" : " (canonical)") << (sketch.getPreserveCase() ? " (case-sensitive)" : "") << endl;
 		cout << "  Target min-hashes per sketch:  " << sketch.getMinHashesPerWindow() << endl;
-		cout << "  Sketches:                      " << sketch.getReferenceCount() << endl;
+		cout << "  Sketches:                      " << referenceCount << endl;
 	}
 	
     if ( ! header )
@@ -125,10 +148,10 @@ int CommandInfo::run() const
 			cout << endl;
 			cout << "Sketches:" << endl;
 		
-			columns[0].push_back("Hashes");
-			columns[1].push_back("Length");
-			columns[2].push_back("ID");
-			columns[3].push_back("Comment");
+			columns[0].push_back("[Hashes]");
+			columns[1].push_back("[Length]");
+			columns[2].push_back("[ID]");
+			columns[3].push_back("[Comment]");
 		}
         
         for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ )
@@ -145,8 +168,8 @@ int CommandInfo::run() const
             }
             else
             {
-				columns[0].push_back(to_string(ref.hashesSorted.size()));
-				columns[1].push_back(to_string(ref.length));
+				columns[0].push_back(std::to_string(ref.hashesSorted.size()));
+				columns[1].push_back(std::to_string(ref.length));
 				columns[2].push_back(ref.name);
 				columns[3].push_back(ref.comment);
 			}
@@ -163,6 +186,8 @@ int CommandInfo::run() const
 
 int CommandInfo::printCounts(const Sketch & sketch) const
 {
+	using std::map;
+	
 	if ( sketch.getReferenceCount() == 0 )
 	{
 		cerr << "ERROR: Sketch file contains no sketches" << endl;
@@ -200,12 +225,6 @@ int CommandInfo::writeJson(const Sketch & sketch) const
 	sketch.getAlphabetAsString(alphabet);
 	bool use64 = sketch.getUse64();
 	
-#ifdef ARCH_32
-	#define HASH "MurmurHash3_x86_32"
-#else
-	#define HASH "MurmurHash3_x64_128"
-#endif
-	
 	cout << "{" << endl;
 	cout << "	\"kmer\" : " << sketch.getKmerSize() << ',' << endl;
 	cout << "	\"alphabet\" : \"" << alphabet << "\"," << endl;
@@ -214,7 +233,7 @@ int CommandInfo::writeJson(const Sketch & sketch) const
 	cout << "	\"sketchSize\" : " << sketch.getMinHashesPerWindow() << ',' << endl;
 	cout << "	\"hashType\" : \"" << HASH << "\"," << endl;
 	cout << "	\"hashBits\" : " << (use64 ? 64 : 32) << ',' << endl;
-	cout << "	\"hashSeed\" : " << seed << ',' << endl;
+	cout << "	\"hashSeed\" : " << sketch.getHashSeed() << ',' << endl;
 	cout << " 	\"sketches\" :" << endl;
 	cout << "	[" << endl;
 	
@@ -258,3 +277,5 @@ int CommandInfo::writeJson(const Sketch & sketch) const
 	
 	return 0;
 }
+
+} // namespace mash
\ No newline at end of file
diff --git a/src/mash/CommandInfo.h b/src/mash/CommandInfo.h
index 742d95a..5cc2afd 100644
--- a/src/mash/CommandInfo.h
+++ b/src/mash/CommandInfo.h
@@ -10,6 +10,8 @@
 #include "Command.h"
 #include "Sketch.h"
 
+namespace mash {
+
 class CommandInfo : public Command
 {
 public:
@@ -24,4 +26,6 @@ private:
 	int writeJson(const Sketch & sketch) const;
 };
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandList.cpp b/src/mash/CommandList.cpp
index 93c559e..bf1bf77 100644
--- a/src/mash/CommandList.cpp
+++ b/src/mash/CommandList.cpp
@@ -10,7 +10,11 @@
 #include "version.h"
 #include <string.h>
 
-using namespace::std;
+using std::cout;
+using std::endl;
+using std::string;
+
+namespace mash {
 
 CommandList::CommandList(string nameNew)
 {
@@ -19,7 +23,7 @@ CommandList::CommandList(string nameNew)
 
 CommandList::~CommandList()
 {
-    for ( map<string, Command *>::iterator i = commands.begin(); i != commands.end(); i++ )
+    for ( std::map<string, Command *>::iterator i = commands.begin(); i != commands.end(); i++ )
     {
         delete i->second;
     }
@@ -32,6 +36,9 @@ void CommandList::addCommand(Command * command)
 
 void CommandList::print()
 {
+	using std::map;
+	using std::vector;
+	
     vector<vector<string>> columns(1);
     
     cout << endl << "Mash version " << version << endl << endl;
@@ -210,3 +217,5 @@ THE SOFTWARE.\n\
 \n";
 #endif
 }
+
+} // namespace mash
diff --git a/src/mash/CommandList.h b/src/mash/CommandList.h
index df1c877..498638b 100644
--- a/src/mash/CommandList.h
+++ b/src/mash/CommandList.h
@@ -11,6 +11,8 @@
 
 #include "Command.h"
 
+namespace mash {
+
 class CommandList
 {
     std::map<std::string, Command *> commands;
@@ -31,4 +33,6 @@ private:
     std::string name;
 };
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandPaste.cpp b/src/mash/CommandPaste.cpp
index 19b10b2..520a830 100644
--- a/src/mash/CommandPaste.cpp
+++ b/src/mash/CommandPaste.cpp
@@ -9,7 +9,11 @@
 #include <iostream>
 #include "unistd.h"
 
-using namespace::std;
+using std::string;
+using std::cerr;
+using std::endl;
+
+namespace mash {
 
 CommandPaste::CommandPaste()
 : Command()
@@ -32,7 +36,7 @@ int CommandPaste::run() const
     }
     
     bool list = options.at("list").active;
-    vector<string> files;
+    std::vector<string> files;
     
     for ( int i = 1; i < arguments.size(); i++ )
     {
@@ -47,7 +51,7 @@ int CommandPaste::run() const
     }
     
     Sketch sketch;
-    vector<string> filesGood;
+    std::vector<string> filesGood;
     Sketch::Parameters parameters;
     parameters.parallelism = 1;
     
@@ -83,3 +87,5 @@ int CommandPaste::run() const
     
     return 0;
 }
+
+} // namespace mash
diff --git a/src/mash/CommandPaste.h b/src/mash/CommandPaste.h
index 31cbf46..59a7d61 100644
--- a/src/mash/CommandPaste.h
+++ b/src/mash/CommandPaste.h
@@ -10,6 +10,8 @@
 #include "Command.h"
 #include "Sketch.h"
 
+namespace mash {
+
 class CommandPaste : public Command
 {
 public:
@@ -19,4 +21,6 @@ public:
     int run() const; // override
 };
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/CommandScreen.cpp b/src/mash/CommandScreen.cpp
new file mode 100644
index 0000000..adeea6f
--- /dev/null
+++ b/src/mash/CommandScreen.cpp
@@ -0,0 +1,844 @@
+// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
+// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
+// Sergey Koren, and Adam Phillippy
+//
+// See the LICENSE.txt file included with this software for license information.
+
+#include "CommandScreen.h"
+#include "CommandDistance.h" // for pvalue
+#include "Sketch.h"
+#include "kseq.h"
+#include <iostream>
+#include <zlib.h>
+#include "ThreadPool.h"
+#include <math.h>
+#include <set>
+
+#ifdef USE_BOOST
+	#include <boost/math/distributions/binomial.hpp>
+	using namespace::boost::math;
+#else
+	#include <gsl/gsl_cdf.h>
+#endif
+
+#define SET_BINARY_MODE(file)
+KSEQ_INIT(gzFile, gzread)
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::list;
+using std::string;
+using std::unordered_map;
+using std::unordered_set;
+using std::vector;
+
+namespace mash {
+
+CommandScreen::CommandScreen()
+: Command()
+{
+	name = "screen";
+	summary = "Determine whether query sequences are within a larger pool of sequences.";
+	description = "Determine how well query sequences are contained within a pool of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <pool> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <pool> to read from standard input. The <pool> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, sh [...]
+    argumentString = "<queries>.msh <pool> [<pool>] ...";
+	
+	useOption("help");
+	useOption("threads");
+//	useOption("minCov");
+//    addOption("saturation", Option(Option::Boolean, "s", "", "Include saturation curve in output. Each line will have an additional field representing the absolute number of k-mers seen at each Jaccard increase, formatted as a comma-separated list.", ""));
+    addOption("winning!", Option(Option::Boolean, "w", "", "Winner-takes-all strategy for identity estimates. After counting hashes for each query, hashes that appear in multiple queries will be removed from all except the one with the best identity (ties broken by larger query), and other identities will be reduced. This removes output redundancy, providing a rough compositional outline.", ""));
+	//useSketchOptions();
+    addOption("identity", Option(Option::Number, "i", "Output", "Minimum identity to report. Inclusive unless set to zero, in which case only identities greater than zero (i.e. with at least one shared hash) will be reported. Set to -1 to output everything.", "0", -1., 1.));
+    addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report.", "1.0", 0., 1.));
+}
+
+int CommandScreen::run() const
+{
+	if ( arguments.size() < 2 || options.at("help").active )
+	{
+		print();
+		return 0;
+	}
+	
+	if ( ! hasSuffix(arguments[0], suffixSketch) )
+	{
+		cerr << "ERROR: " << arguments[0] << " does not look like a sketch (.msh)" << endl;
+		exit(1);
+	}
+	
+	bool sat = false;//options.at("saturation").active;
+	
+    double pValueMax = options.at("pvalue").getArgumentAsNumber();
+    double identityMin = options.at("identity").getArgumentAsNumber();
+    
+    vector<string> refArgVector;
+    refArgVector.push_back(arguments[0]);
+	
+	Sketch sketch;
+    Sketch::Parameters parameters;
+	
+    sketch.initFromFiles(refArgVector, parameters);
+    
+    string alphabet;
+    sketch.getAlphabetAsString(alphabet);
+    setAlphabetFromString(parameters, alphabet.c_str());
+	
+	parameters.parallelism = options.at("threads").getArgumentAsNumber();
+	parameters.kmerSize = sketch.getKmerSize();
+	parameters.noncanonical = sketch.getNoncanonical();
+	parameters.use64 = sketch.getUse64();
+	parameters.preserveCase = sketch.getPreserveCase();
+	parameters.seed = sketch.getHashSeed();
+	parameters.minHashesPerWindow = sketch.getMinHashesPerWindow();
+	
+	HashTable hashTable;
+	unordered_map<uint64_t, std::atomic<uint32_t>> hashCounts;
+	unordered_map<uint64_t, list<uint32_t> > saturationByIndex;
+	
+	cerr << "Loading " << arguments[0] << "..." << endl;
+	
+	for ( int i = 0; i < sketch.getReferenceCount(); i++ )
+	{
+		const HashList & hashes = sketch.getReference(i).hashesSorted;
+		
+		for ( int j = 0; j < hashes.size(); j++ )
+		{
+			uint64_t hash = hashes.get64() ? hashes.at(j).hash64 : hashes.at(j).hash32;
+			
+			if ( hashTable.count(hash) == 0 )
+			{
+				hashCounts[hash] = 0;
+			}
+			
+			hashTable[hash].insert(i);
+		}
+	}
+	
+	cerr << "   " << hashTable.size() << " distinct hashes." << endl;
+	
+	unordered_set<MinHashHeap *> minHashHeaps;
+	
+	bool trans = (alphabet == alphabetProtein);
+	
+/*	if ( ! trans )
+	{
+		if ( alphabet != alphabetNucleotide )
+		{
+			cerr << "ERROR: <query> sketch must have nucleotide or amino acid alphabet" << endl;
+			exit(1);
+		}
+		
+		if ( sketch.getNoncanonical() )
+		{
+			cerr << "ERROR: nucleotide <query> sketch must be canonical" << endl;
+			exit(1);
+		}
+	}
+*/	
+	int queryCount = arguments.size() - 1;
+	cerr << (trans ? "Translating from " : "Streaming from ");
+	
+	if ( queryCount == 1 )
+	{
+		cerr << arguments[1];
+	}
+	else
+	{
+		cerr << queryCount << " inputs";
+	}
+	
+	cerr << "..." << endl;
+	
+	int kmerSize = parameters.kmerSize;
+	int minCov = 1;//options.at("minCov").getArgumentAsNumber();
+	
+	ThreadPool<CommandScreen::HashInput, CommandScreen::HashOutput> threadPool(hashSequence, parameters.parallelism);
+	
+	// open all query files for round robin
+	//
+	gzFile fps[queryCount];
+	list<kseq_t *> kseqs;
+	//
+	for ( int f = 1; f < arguments.size(); f++ )
+	{
+		if ( arguments[f] == "-" )
+		{
+			if ( f > 1 )
+			{
+				cerr << "ERROR: '-' for stdin must be first query" << endl;
+				exit(1);
+			}
+			
+			fps[f - 1] = gzdopen(fileno(stdin), "r");
+		}
+		else
+		{
+			fps[f - 1] = gzopen(arguments[f].c_str(), "r");
+		}
+		
+		kseqs.push_back(kseq_init(fps[f - 1]));
+	}
+	
+	// perform round-robin, closing files as they end
+	//
+	int l;
+	uint64_t count = 0;
+	//uint64_t kmersTotal = 0;
+	uint64_t chunkSize = 1 << 20;
+	string input;
+	input.reserve(chunkSize);
+	list<kseq_t *>::iterator it = kseqs.begin();
+	//
+	while ( true )
+	{
+		if ( kseqs.begin() == kseqs.end() )
+		{
+			l = 0;
+		}
+		else
+		{
+			l = kseq_read(*it);
+		
+			if ( l < -1 ) // error
+			{
+				break;
+			}
+		
+			if ( l == -1 ) // eof
+			{
+				kseq_destroy(*it);
+				it = kseqs.erase(it);
+				if ( it == kseqs.end() )
+				{
+					it = kseqs.begin();
+				}
+				//continue;
+			}
+		}
+		
+		if ( input.length() + (l >= kmerSize ? l + 1 : 0) > chunkSize || kseqs.begin() == kseqs.end() )
+		{
+			// chunk big enough or at the end; time to flush
+			
+			// buffer this out since kseq will overwrite (deleted by HashInput destructor)
+			//
+			char * seqCopy = new char[input.length()];
+			//
+			memcpy(seqCopy, input.c_str(), input.length());
+			
+			if ( minHashHeaps.begin() == minHashHeaps.end() )
+			{
+				minHashHeaps.emplace(new MinHashHeap(sketch.getUse64(), sketch.getMinHashesPerWindow()));
+			}
+			
+			threadPool.runWhenThreadAvailable(new HashInput(hashCounts, *minHashHeaps.begin(), seqCopy, input.length(), parameters, trans));
+		
+			input = "";
+		
+			minHashHeaps.erase(minHashHeaps.begin());
+		
+			while ( threadPool.outputAvailable() )
+			{
+				useThreadOutput(threadPool.popOutputWhenAvailable(), minHashHeaps);
+			}
+		}
+		
+		if ( kseqs.begin() == kseqs.end() )
+		{
+			break;
+		}
+		
+		count++;
+		
+		if ( l >= kmerSize )
+		{
+			input.append(1, '*');
+			input.append((*it)->seq.s, l);
+		}
+		
+		it++;
+		
+		if ( it == kseqs.end() )
+		{
+			it = kseqs.begin();
+		}
+	}
+	
+	if (  l != -1 )
+	{
+		cerr << "\nERROR: reading inputs" << endl;
+		exit(1);
+	}
+    
+	while ( threadPool.running() )
+	{
+		useThreadOutput(threadPool.popOutputWhenAvailable(), minHashHeaps);
+	}
+	
+	for ( int i = 0; i < queryCount; i++ )
+	{
+		gzclose(fps[i]);
+	}
+	
+	MinHashHeap minHashHeap(sketch.getUse64(), sketch.getMinHashesPerWindow());
+	
+	for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ )
+	{
+		HashList hashList(parameters.kmerSize);
+		
+		(*i)->toHashList(hashList);
+		
+		for ( int i = 0; i < hashList.size(); i++ )
+		{
+			minHashHeap.tryInsert(hashList.at(i));
+		}
+		
+		delete *i;
+	}
+	
+	if ( count == 0 )
+	{
+		cerr << "\nERROR: Did not find sequence records in inputs" << endl;
+		
+		exit(1);
+	}
+	
+	/*
+	if ( parameters.targetCov != 0 )
+	{
+		cerr << "Reads required for " << parameters.targetCov << "x coverage: " << count << endl;
+	}
+	else
+	{
+		cerr << "Estimated coverage: " << minHashHeap.estimateMultiplicity() << "x" << endl;
+	}
+	*/
+	
+	uint64_t setSize = minHashHeap.estimateSetSize();
+	cerr << "   Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in pool: " << setSize << endl;
+	
+	if ( setSize == 0 )
+	{
+		cerr << "WARNING: no valid k-mers in input." << endl;
+		//exit(0);
+	}
+	
+	cerr << "Summing shared..." << endl;
+	
+	uint64_t * shared = new uint64_t[sketch.getReferenceCount()];
+	vector<uint64_t> * depths = new vector<uint64_t>[sketch.getReferenceCount()];
+	
+	memset(shared, 0, sizeof(uint64_t) * sketch.getReferenceCount());
+	
+	for ( unordered_map<uint64_t, std::atomic<uint32_t> >::const_iterator i = hashCounts.begin(); i != hashCounts.end(); i++ )
+	{
+		if ( i->second >= minCov )
+		{
+			const unordered_set<uint64_t> & indeces = hashTable.at(i->first);
+
+			for ( unordered_set<uint64_t>::const_iterator k = indeces.begin(); k != indeces.end(); k++ )
+			{
+				shared[*k]++;
+				depths[*k].push_back(i->second);
+			
+				if ( sat )
+				{
+					saturationByIndex[*k].push_back(0);// TODO kmersTotal);
+				}
+			}
+		}
+	}
+	
+	if ( options.at("winning!").active )
+	{
+		cerr << "Reallocating to winners..." << endl;
+		
+		double * scores = new double[sketch.getReferenceCount()];
+		
+		for ( int i = 0; i < sketch.getReferenceCount(); i ++ )
+		{
+			scores[i] = estimateIdentity(shared[i], sketch.getReference(i).hashesSorted.size(), kmerSize, sketch.getKmerSpace());
+		}
+		
+		memset(shared, 0, sizeof(uint64_t) * sketch.getReferenceCount());
+		
+		for ( int i = 0; i < sketch.getReferenceCount(); i++ )
+		{
+			depths[i].clear();
+		}
+		
+		for ( HashTable::const_iterator i = hashTable.begin(); i != hashTable.end(); i++ )
+		{
+			if ( hashCounts.count(i->first) == 0 || hashCounts.at(i->first) < minCov )
+			{
+				continue;
+			}
+			
+			const unordered_set<uint64_t> & indeces = i->second;
+			double maxScore = 0;
+			uint64_t maxLength = 0;
+			uint64_t maxIndex;
+			
+			for ( unordered_set<uint64_t>::const_iterator k = indeces.begin(); k != indeces.end(); k++ )
+			{
+				if ( scores[*k] > maxScore )
+				{
+					maxScore = scores[*k];
+					maxIndex = *k;
+					maxLength = sketch.getReference(*k).length;
+				}
+				else if ( scores[*k] == maxScore && sketch.getReference(*k).length > maxLength )
+				{
+					maxIndex = *k;
+					maxLength = sketch.getReference(*k).length;
+				}
+			}
+			
+			shared[maxIndex]++;
+			depths[maxIndex].push_back(hashCounts.at(i->first));
+		}
+		
+		delete [] scores;
+	}
+	
+	cerr << "Computing coverage medians..." << endl;
+	
+	for ( int i = 0; i < sketch.getReferenceCount(); i++ )
+	{
+		sort(depths[i].begin(), depths[i].end());
+	}
+	
+	cerr << "Writing output..." << endl;
+	
+	for ( int i = 0; i < sketch.getReferenceCount(); i++ )
+	{
+		if ( shared[i] != 0 || identityMin < 0.0)
+		{
+			double identity = estimateIdentity(shared[i], sketch.getReference(i).hashesSorted.size(), kmerSize, sketch.getKmerSpace());
+			
+			if ( identity < identityMin )
+			{
+				continue;
+			}
+			
+			double pValue = pValueWithin(shared[i], setSize, sketch.getKmerSpace(), sketch.getReference(i).hashesSorted.size());
+			
+			if ( pValue > pValueMax )
+			{
+				continue;
+			}
+			
+			cout << identity << '\t' << shared[i] << '/' << sketch.getReference(i).hashesSorted.size() << '\t' << (shared[i] > 0 ? depths[i].at(shared[i] / 2) : 0) << '\t' << pValue << '\t' << sketch.getReference(i).name << '\t' << sketch.getReference(i).comment;
+			
+			if ( sat )
+			{
+				cout << '\t';
+				
+				for ( list<uint32_t>::const_iterator j = saturationByIndex.at(i).begin(); j != saturationByIndex.at(i).end(); j++ )
+				{
+					if ( j != saturationByIndex.at(i).begin() )
+					{
+						cout << ',';
+					}
+					
+					cout << *j;
+				}
+			}
+			
+			cout << endl;
+		}
+	}
+	
+	delete [] shared;
+	
+	return 0;
+}
+
+double estimateIdentity(uint64_t common, uint64_t denom, int kmerSize, double kmerSpace)
+{
+	double identity;
+	double jaccard = double(common) / denom;
+	
+	if ( common == denom ) // avoid -0
+	{
+		identity = 1.;
+	}
+	else if ( common == 0 ) // avoid inf
+	{
+		identity = 0.;
+	}
+	else
+	{
+		//distance = log(double(common + 1) / (denom + 1)) / log(1. / (denom + 1));
+		identity = pow(jaccard, 1. / kmerSize);
+	}
+	
+	return identity;
+}
+
+CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
+{
+	CommandScreen::HashOutput * output = new CommandScreen::HashOutput(input->minHashHeap);
+	
+	int l = input->length;
+	bool trans = input->trans;
+	
+	bool use64 = input->parameters.use64;
+	uint32_t seed = input->parameters.seed;
+	int kmerSize = input->parameters.kmerSize;
+	bool noncanonical = input->parameters.noncanonical;
+	
+	char * seq = input->seq;
+	
+	// uppercase
+	//
+	for ( uint64_t i = 0; i < l; i++ )
+	{
+		if ( ! input->parameters.preserveCase && seq[i] > 96 && seq[i] < 123 )
+		{
+			seq[i] -= 32;
+		}
+	}
+	
+	char * seqRev;
+	
+	if ( ! noncanonical || trans )
+	{
+		seqRev = new char[l];
+		reverseComplement(seq, seqRev, l);
+	}
+	
+	for ( int i = 0; i < (trans ? 6 : 1); i++ )
+	{
+		bool useRevComp = false;
+		int frame = i % 3;
+		bool rev = i > 2;
+		
+		int lenTrans = (l - frame) / 3;
+		
+		char * seqTrans;
+		
+		if ( trans )
+		{
+			seqTrans = new char[lenTrans];
+			translate((rev ? seqRev : seq) + frame, seqTrans, lenTrans);
+		}
+		
+		int64_t lastGood = -1;
+		int length = trans ? lenTrans : l;
+		
+		for ( int j = 0; j < length - kmerSize + 1; j++ )
+		{
+			while ( lastGood < j + kmerSize - 1 && lastGood < length )
+			{
+				lastGood++;
+				
+				if ( trans ? (seqTrans[lastGood] == '*') : (!input->parameters.alphabet[seq[lastGood]]) )
+				{
+					j = lastGood + 1;
+				}
+			}
+			
+			if ( j > length - kmerSize )
+			{
+				break;
+			}
+			
+			//kmersTotal++; TODO
+			
+			if ( ! noncanonical )
+			{
+				bool debug = false;
+				useRevComp = true;
+				bool prefixEqual = true;
+	
+				if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;}
+				
+				for ( uint64_t k = 0; k < kmerSize; k++ )
+				{
+					char base = seq[j + k];
+					char baseMinus = seqRev[l - j - kmerSize + k];
+		
+					if ( debug ) cout << baseMinus;
+		
+					if ( prefixEqual && baseMinus > base )
+					{
+						useRevComp = false;
+						break;
+					}
+		
+					if ( prefixEqual && baseMinus < base )
+					{
+						prefixEqual = false;
+					}
+				}
+	
+				if ( debug ) cout << endl;
+			}
+	
+			const char * kmer;
+			
+			if ( trans )
+			{
+				kmer = seqTrans + j;
+			}
+			else
+			{
+				kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j;
+			}
+			
+			//cout << kmer << '\t' << kmerSize << endl;
+			hash_u hash = getHash(kmer, kmerSize, seed, use64);
+			//cout << kmer << '\t' << hash.hash64 << endl;
+			input->minHashHeap->tryInsert(hash);
+			
+			uint64_t key = use64 ? hash.hash64 : hash.hash32;
+			
+			if ( input->hashCounts.count(key) == 1 )
+			{
+				//cout << "Incrementing " << key << endl;
+				input->hashCounts[key]++;
+			}
+		}
+		
+		if ( trans )
+		{
+			delete [] seqTrans;
+		}
+	}
+	
+	if ( ! noncanonical || trans )
+	{
+		delete [] seqRev;
+	}
+	/*
+	addMinHashes(minHashHeap, seq, l, parameters);
+	
+	if ( parameters.targetCov > 0 && minHashHeap.estimateMultiplicity() >= parameters.targetCov )
+	{
+		l = -1; // success code
+		break;
+	}
+	*/
+	
+	return output;
+}
+
+double pValueWithin(uint64_t x, uint64_t setSize, double kmerSpace, uint64_t sketchSize)
+{
+    if ( x == 0 )
+    {
+        return 1.;
+    }
+    
+    double r = 1. / (1. + kmerSpace / setSize);
+    
+#ifdef USE_BOOST
+    return cdf(complement(binomial(sketchSize, r), x - 1));
+#else
+    return gsl_cdf_binomial_Q(x - 1, r, sketchSize);
+#endif
+}
+
+void translate(const char * src, char * dst, uint64_t len)
+{
+	for ( uint64_t n = 0, a = 0; a < len; a++, n+= 3 )
+	{
+		dst[a] = aaFromCodon(src + n);
+	}
+}
+
+char aaFromCodon(const char * codon)
+{
+	string str(codon, 3);
+	/*
+	if ( codons.count(str) == 1 )
+	{
+		return codons.at(str);
+	}
+	else
+	{
+		return 0;
+	}
+	*/
+	char aa = '*';//0;
+	
+	switch (codon[0])
+	{
+	case 'A':
+		switch (codon[1])
+		{
+		case 'A':
+			switch (codon[2])
+			{
+				case 'A': aa = 'K'; break;
+				case 'C': aa = 'N'; break;
+				case 'G': aa = 'K'; break;
+				case 'T': aa = 'N'; break;
+			}
+			break;
+		case 'C':
+			switch (codon[2])
+			{
+				case 'A': aa = 'T'; break;
+				case 'C': aa = 'T'; break;
+				case 'G': aa = 'T'; break;
+				case 'T': aa = 'T'; break;
+			}
+			break;
+		case 'G':
+			switch (codon[2])
+			{
+				case 'A': aa = 'R'; break;
+				case 'C': aa = 'S'; break;
+				case 'G': aa = 'R'; break;
+				case 'T': aa = 'S'; break;
+			}
+			break;
+		case 'T':
+			switch (codon[2])
+			{
+				case 'A': aa = 'I'; break;
+				case 'C': aa = 'I'; break;
+				case 'G': aa = 'M'; break;
+				case 'T': aa = 'I'; break;
+			}
+			break;
+		}
+		break;
+	case 'C':
+		switch (codon[1])
+		{
+		case 'A':
+			switch (codon[2])
+			{
+				case 'A': aa = 'Q'; break;
+				case 'C': aa = 'H'; break;
+				case 'G': aa = 'Q'; break;
+				case 'T': aa = 'H'; break;
+			}
+			break;
+		case 'C':
+			switch (codon[2])
+			{
+				case 'A': aa = 'P'; break;
+				case 'C': aa = 'P'; break;
+				case 'G': aa = 'P'; break;
+				case 'T': aa = 'P'; break;
+			}
+			break;
+		case 'G':
+			switch (codon[2])
+			{
+				case 'A': aa = 'R'; break;
+				case 'C': aa = 'R'; break;
+				case 'G': aa = 'R'; break;
+				case 'T': aa = 'R'; break;
+			}
+			break;
+		case 'T':
+			switch (codon[2])
+			{
+				case 'A': aa = 'L'; break;
+				case 'C': aa = 'L'; break;
+				case 'G': aa = 'L'; break;
+				case 'T': aa = 'L'; break;
+			}
+			break;
+		}
+		break;
+	case 'G':
+		switch (codon[1])
+		{
+		case 'A':
+			switch (codon[2])
+			{
+				case 'A': aa = 'E'; break;
+				case 'C': aa = 'D'; break;
+				case 'G': aa = 'E'; break;
+				case 'T': aa = 'D'; break;
+			}
+			break;
+		case 'C':
+			switch (codon[2])
+			{
+				case 'A': aa = 'A'; break;
+				case 'C': aa = 'A'; break;
+				case 'G': aa = 'A'; break;
+				case 'T': aa = 'A'; break;
+			}
+			break;
+		case 'G':
+			switch (codon[2])
+			{
+				case 'A': aa = 'G'; break;
+				case 'C': aa = 'G'; break;
+				case 'G': aa = 'G'; break;
+				case 'T': aa = 'G'; break;
+			}
+			break;
+		case 'T':
+			switch (codon[2])
+			{
+				case 'A': aa = 'V'; break;
+				case 'C': aa = 'V'; break;
+				case 'G': aa = 'V'; break;
+				case 'T': aa = 'V'; break;
+			}
+			break;
+		}
+		break;
+	case 'T':
+		switch (codon[1])
+		{
+		case 'A':
+			switch (codon[2])
+			{
+				case 'A': aa = '*'; break;
+				case 'C': aa = 'Y'; break;
+				case 'G': aa = '*'; break;
+				case 'T': aa = 'Y'; break;
+			}
+			break;
+		case 'C':
+			switch (codon[2])
+			{
+				case 'A': aa = 'S'; break;
+				case 'C': aa = 'S'; break;
+				case 'G': aa = 'S'; break;
+				case 'T': aa = 'S'; break;
+			}
+			break;
+		case 'G':
+			switch (codon[2])
+			{
+				case 'A': aa = '*'; break;
+				case 'C': aa = 'C'; break;
+				case 'G': aa = 'W'; break;
+				case 'T': aa = 'C'; break;
+			}
+			break;
+		case 'T':
+			switch (codon[2])
+			{
+				case 'A': aa = 'L'; break;
+				case 'C': aa = 'F'; break;
+				case 'G': aa = 'L'; break;
+				case 'T': aa = 'F'; break;
+			}
+			break;
+		}
+		break;
+	}
+	
+	return aa;//(aa == '*') ? 0 : aa;
+}
+
+void useThreadOutput(CommandScreen::HashOutput * output, unordered_set<MinHashHeap *> & minHashHeaps)
+{
+	minHashHeaps.emplace(output->minHashHeap);
+	delete output;
+}
+
+} // namespace mash
diff --git a/src/mash/CommandScreen.h b/src/mash/CommandScreen.h
new file mode 100644
index 0000000..ff8121f
--- /dev/null
+++ b/src/mash/CommandScreen.h
@@ -0,0 +1,163 @@
+// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
+// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
+// Sergey Koren, and Adam Phillippy
+//
+// See the LICENSE.txt file included with this software for license information.
+
+#ifndef INCLUDED_CommandScreen
+#define INCLUDED_CommandScreen
+
+#include "Command.h"
+#include "Sketch.h"
+#include <list>
+#include <string>
+#include <vector>
+#include <atomic>
+#include <unordered_set>
+#include <unordered_map>
+#include "MinHashHeap.h"
+
+namespace mash {
+
+typedef std::unordered_map< uint64_t, std::unordered_set<uint64_t> > HashTable;
+
+static const std::unordered_map< std::string, char > codons =
+{
+	{"AAA",	'K'},
+	{"AAC",	'N'},
+	{"AAG",	'K'},
+	{"AAT",	'N'},
+	{"ACA",	'T'},
+	{"ACC",	'T'},
+	{"ACG",	'T'},
+	{"ACT",	'T'},
+	{"AGA",	'R'},
+	{"AGC",	'S'},
+	{"AGG",	'R'},
+	{"AGT",	'S'},
+	{"ATA",	'I'},
+	{"ATC",	'I'},
+	{"ATG",	'M'},
+	{"ATT",	'I'},
+	{"CAA",	'Q'},
+	{"CAC",	'H'},
+	{"CAG",	'Q'},
+	{"CAT",	'H'},
+	{"CCA",	'P'},
+	{"CCC",	'P'},
+	{"CCG",	'P'},
+	{"CCT",	'P'},
+	{"CGA",	'R'},
+	{"CGC",	'R'},
+	{"CGG",	'R'},
+	{"CGT",	'R'},
+	{"CTA",	'L'},
+	{"CTC",	'L'},
+	{"CTG",	'L'},
+	{"CTT",	'L'},
+	{"GAA",	'E'},
+	{"GAC",	'D'},
+	{"GAG",	'E'},
+	{"GAT",	'D'},
+	{"GCA",	'A'},
+	{"GCC",	'A'},
+	{"GCG",	'A'},
+	{"GCT",	'A'},
+	{"GGA",	'G'},
+	{"GGC",	'G'},
+	{"GGG",	'G'},
+	{"GGT",	'G'},
+	{"GTA",	'V'},
+	{"GTC",	'V'},
+	{"GTG",	'V'},
+	{"GTT",	'V'},
+	{"TAA",	'*'},
+	{"TAC",	'Y'},
+	{"TAG",	'*'},
+	{"TAT",	'Y'},
+	{"TCA",	'S'},
+	{"TCC",	'S'},
+	{"TCG",	'S'},
+	{"TCT",	'S'},
+	{"TGA",	'*'},
+	{"TGC",	'C'},
+	{"TGG",	'W'},
+	{"TGT",	'C'},
+	{"TTA",	'L'},
+	{"TTC",	'F'},
+	{"TTG",	'L'},
+	{"TTT",	'F'}
+};
+
+class CommandScreen : public Command
+{
+public:
+    
+    struct HashInput
+    {
+    	HashInput(std::unordered_map<uint64_t, std::atomic<uint32_t> > & hashCountsNew, MinHashHeap * minHashHeapNew, char * seqNew, uint64_t lengthNew, const Sketch::Parameters & parametersNew, bool transNew)
+    	:
+    	hashCounts(hashCountsNew),
+    	minHashHeap(minHashHeapNew),
+    	seq(seqNew),
+    	length(lengthNew),
+    	parameters(parametersNew),
+    	trans(transNew)
+    	{}
+    	
+    	~HashInput()
+    	{
+    		if ( seq != 0 )
+    		{
+	    		delete [] seq;
+	    	}
+    	}
+    	
+    	std::string fileName;
+    	
+    	char * seq;
+    	uint64_t length;
+    	bool trans;
+    	
+    	Sketch::Parameters parameters;
+		std::unordered_map<uint64_t, std::atomic<uint32_t> > & hashCounts;
+		MinHashHeap * minHashHeap;
+    };
+    
+    struct HashOutput
+    {
+    	HashOutput(MinHashHeap * minHashHeapNew)
+    	:
+    	minHashHeap(minHashHeapNew)
+    	{}
+    	
+		MinHashHeap * minHashHeap;
+    };
+    
+    CommandScreen();
+    
+    int run() const; // override
+
+private:
+	
+	struct Reference
+	{
+		Reference(uint64_t amerCountNew, std::string nameNew, std::string commentNew)
+		: amerCount(amerCountNew), name(nameNew), comment(commentNew) {}
+		
+		uint64_t amerCount;
+		std::string name;
+		std::string comment;
+	};
+};
+
+char aaFromCodon(const char * codon);
+double estimateIdentity(uint64_t common, uint64_t denom, int kmerSize, double kmerSpace);
+CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input);
+double pValueWithin(uint64_t x, uint64_t setSize, double kmerSpace, uint64_t sketchSize);
+void translate(const char * src, char * dst, uint64_t len);
+void useThreadOutput(CommandScreen::HashOutput * output, std::unordered_set<MinHashHeap *> & minHashHeaps);
+
+} // namespace mash
+
+#endif
diff --git a/src/mash/CommandSketch.cpp b/src/mash/CommandSketch.cpp
index ac6af01..5221459 100644
--- a/src/mash/CommandSketch.cpp
+++ b/src/mash/CommandSketch.cpp
@@ -9,18 +9,23 @@
 #include "sketchParameterSetup.h"
 #include <iostream>
 
-using namespace::std;
+using std::cerr;
+using std::endl;
+using std::string;
+using std::vector;
+
+namespace mash {
 
 CommandSketch::CommandSketch()
 : Command()
 {
     name = "sketch";
     summary = "Create sketches (reduced representations for fast operations).";
-    description = "Create a sketch file, which is a reduced representation of a sequence or set of sequences (based on min-hashes) that can be used for fast distance estimations. Input can be fasta or fastq files (gzipped or not), and \"-\" can be given to read from standard input. Input files can also be files of file names (see -l). For output, one sketch file will be generated, but it can have multiple sketches within it, divided by sequences or files (see -i). By default, the output  [...]
-    argumentString = "fast(a|q)[.gz] ...";
+    description = "Create a sketch file, which is a reduced representation of a sequence or set of sequences (based on min-hashes) that can be used for fast distance estimations. Inputs can be fasta or fastq files (gzipped or not), and \"-\" can be given to read from standard input. Input files can also be files of file names (see -l). For output, one sketch file will be generated, but it can have multiple sketches within it, divided by sequences or files (see -i). By default, the output [...]
+    argumentString = "<input> [<input>] ...";
     
     useOption("help");
-    addOption("list", Option(Option::Boolean, "l", "Input", "List input. Each file contains a list of sequence files, one per line.", ""));
+    addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
     addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
     useSketchOptions();
 }
@@ -47,7 +52,7 @@ int CommandSketch::run() const
     {
         if ( false && hasSuffix(arguments[i], suffixSketch) )
         {
-            cerr << "ERROR: " << arguments[i] << " looks like it is already sketched.\n";
+            cerr << "ERROR: " << arguments[i] << " looks like it is already sketched." << endl;
             exit(1);
         }
     }
@@ -132,3 +137,5 @@ int CommandSketch::run() const
     
     return 0;
 }
+
+} // namespace mash
diff --git a/src/mash/CommandSketch.h b/src/mash/CommandSketch.h
index 4f7d547..30b6d56 100644
--- a/src/mash/CommandSketch.h
+++ b/src/mash/CommandSketch.h
@@ -9,6 +9,8 @@
 
 #include "Command.h"
 
+namespace mash {
+
 class CommandSketch : public Command
 {
 public:
@@ -18,4 +20,6 @@ public:
     int run() const; // override
 };
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/MinHashHeap.h b/src/mash/MinHashHeap.h
index ab011fe..b7d2955 100644
--- a/src/mash/MinHashHeap.h
+++ b/src/mash/MinHashHeap.h
@@ -42,8 +42,8 @@ private:
     uint64_t kmersUsed;
 };
 
-inline double MinHashHeap::estimateMultiplicity() const {return (double)multiplicitySum / hashes.size();}
-inline double MinHashHeap::estimateSetSize() const {return pow(2.0, use64 ? 64.0 : 32.0) * (double)hashes.size() / (use64 ? (double)hashesQueue.top().hash64 : (double)hashesQueue.top().hash32);}
+inline double MinHashHeap::estimateMultiplicity() const {return hashes.size() ? (double)multiplicitySum / hashes.size() : 0;}
+inline double MinHashHeap::estimateSetSize() const {return hashes.size() ? pow(2.0, use64 ? 64.0 : 32.0) * (double)hashes.size() / (use64 ? (double)hashesQueue.top().hash64 : (double)hashesQueue.top().hash32) : 0;}
 inline void MinHashHeap::toHashList(HashList & hashList) const {hashes.toHashList(hashList);}
 inline void MinHashHeap::toCounts(std::vector<uint32_t> & counts) const {hashes.toCounts(counts);}
 
diff --git a/src/mash/Sketch.cpp b/src/mash/Sketch.cpp
index ef433ee..9f81590 100644
--- a/src/mash/Sketch.cpp
+++ b/src/mash/Sketch.cpp
@@ -125,6 +125,11 @@ int Sketch::initFromFiles(const vector<string> & files, const Parameters & param
             	continue;
             }
 			
+            if ( sketchTest.getHashSeed() != parameters.seed )
+            {
+				cerr << "\nWARNING: The sketch " << files[i] << " has a seed size (" << sketchTest.getHashSeed() << ") that does not match the current seed (" << parameters.seed << "). This file will be skipped." << endl << endl;
+				continue;
+            }
 			if ( sketchTest.getKmerSize() != parameters.kmerSize )
 			{
 				cerr << "\nWARNING: The sketch " << files[i] << " has a kmer size (" << sketchTest.getKmerSize() << ") that does not match the current kmer size (" << parameters.kmerSize << "). This file will be skipped." << endl << endl;
@@ -232,7 +237,7 @@ int Sketch::initFromFiles(const vector<string> & files, const Parameters & param
     return 0;
 }
 
-void Sketch::initParametersFromCapnp(const char * file)
+uint64_t Sketch::initParametersFromCapnp(const char * file)
 {
     int fd = open(file, O_RDONLY);
     
@@ -242,13 +247,23 @@ void Sketch::initParametersFromCapnp(const char * file)
         exit(1);
     }
     
+    struct stat fileInfo;
+    
+    if ( stat(file, &fileInfo) == -1 )
+    {
+        cerr << "ERROR: could not get file stats for \"" << file << "\"." << endl;
+        exit(1);
+    }
+    
+    void * data = mmap(NULL, fileInfo.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+    
     capnp::ReaderOptions readerOptions;
     
     readerOptions.traversalLimitInWords = 1000000000000;
     readerOptions.nestingLimit = 1000000;
     
-    capnp::StreamFdMessageReader * message = new capnp::StreamFdMessageReader(fd, readerOptions);
-    //capnp::FlatArrayMessageReader * message = new capnp::FlatArrayMessageReader(kj::ArrayPtr<const capnp::word>(reinterpret_cast<const capnp::word *>(data), fileInfo.st_size / sizeof(capnp::word)), readerOptions);
+    //capnp::StreamFdMessageReader * message = new capnp::StreamFdMessageReader(fd, readerOptions);
+    capnp::FlatArrayMessageReader * message = new capnp::FlatArrayMessageReader(kj::ArrayPtr<const capnp::word>(reinterpret_cast<const capnp::word *>(data), fileInfo.st_size / sizeof(capnp::word)), readerOptions);
     capnp::MinHash::Reader reader = message->getRoot<capnp::MinHash>();
     
     parameters.kmerSize = reader.getKmerSize();
@@ -258,6 +273,12 @@ void Sketch::initParametersFromCapnp(const char * file)
     parameters.concatenated = reader.getConcatenated();
     parameters.noncanonical = reader.getNoncanonical();
    	parameters.preserveCase = reader.getPreserveCase();
+
+    capnp::MinHash::ReferenceList::Reader referenceListReader = reader.getReferenceList().getReferences().size() ? reader.getReferenceList() : reader.getReferenceListOld();
+    capnp::List<capnp::MinHash::ReferenceList::Reference>::Reader referencesReader = referenceListReader.getReferences();
+    uint64_t referenceCount = referencesReader.size();
+    
+   	parameters.seed = reader.getHashSeed();
     
     if ( reader.hasAlphabet() )
     {
@@ -274,6 +295,8 @@ void Sketch::initParametersFromCapnp(const char * file)
 		delete message;
 	}
 	catch (exception e) {}
+	
+	return referenceCount;
 }
 
 bool Sketch::sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool)
@@ -347,7 +370,7 @@ int Sketch::writeToCapnp(const char * file) const
     capnp::MallocMessageBuilder message;
     capnp::MinHash::Builder builder = message.initRoot<capnp::MinHash>();
     
-    capnp::MinHash::ReferenceList::Builder referenceListBuilder = builder.initReferenceList();
+    capnp::MinHash::ReferenceList::Builder referenceListBuilder = (parameters.seed == 42 ? builder.initReferenceListOld() : builder.initReferenceList());
     
     capnp::List<capnp::MinHash::ReferenceList::Reference>::Builder referencesBuilder = referenceListBuilder.initReferences(references.size());
     
@@ -422,6 +445,7 @@ int Sketch::writeToCapnp(const char * file) const
     }
     
     builder.setKmerSize(parameters.kmerSize);
+    builder.setHashSeed(parameters.seed);
     builder.setError(parameters.error);
     builder.setMinHashesPerWindow(parameters.minHashesPerWindow);
     builder.setWindowSize(parameters.windowSize);
@@ -549,7 +573,7 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
         const char * kmer = useRevComp ? seqRev + length - i - kmerSize : seq + i;
         bool filter = false;
         
-        hash_u hash = getHash(useRevComp ? seqRev + length - i - kmerSize : seq + i, kmerSize, parameters.use64);
+        hash_u hash = getHash(useRevComp ? seqRev + length - i - kmerSize : seq + i, kmerSize, parameters.seed, parameters.use64);
         
         if ( debug ) cout << endl;
         
@@ -660,7 +684,7 @@ void getMinHashPositions(vector<Sketch::PositionHash> & positionHashes, char * s
         
         if ( i >= nextValidKmer )
         {
-            Sketch::hash_t hash = getHash(seq + i, kmerSize, parameters.use64).hash64; // TODO: dynamic
+            Sketch::hash_t hash = getHash(seq + i, kmerSize, parameters.seed, parameters.use64).hash64; // TODO: dynamic
             
             if ( verbosity > 1 )
             {
@@ -909,7 +933,7 @@ Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input)
     capnp::FlatArrayMessageReader * message = new capnp::FlatArrayMessageReader(kj::ArrayPtr<const capnp::word>(reinterpret_cast<const capnp::word *>(data), fileInfo.st_size / sizeof(capnp::word)), readerOptions);
     capnp::MinHash::Reader reader = message->getRoot<capnp::MinHash>();
     
-    capnp::MinHash::ReferenceList::Reader referenceListReader = reader.getReferenceList();
+    capnp::MinHash::ReferenceList::Reader referenceListReader = reader.getReferenceList().getReferences().size() ? reader.getReferenceList() : reader.getReferenceListOld();
     
     capnp::List<capnp::MinHash::ReferenceList::Reference>::Reader referencesReader = referenceListReader.getReferences();
     
@@ -1105,18 +1129,17 @@ void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap &
 
 Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
 {
-	FILE * inStream = 0;
+	gzFile fp;
 	
 	if ( input->fileName == "-" )
 	{
-		inStream = stdin;
+		fp = gzdopen(fileno(stdin), "r");
 	}
 	else
 	{
-		inStream = fopen(input->fileName.c_str(), "r");
+		fp = gzopen(input->fileName.c_str(), "r");
 	}
 	
-	gzFile fp = gzdopen(fileno(inStream), "r");
 	kseq_t *seq = kseq_init(fp);
 	
 	const Sketch::Parameters & parameters = input->parameters;
@@ -1224,7 +1247,6 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
 	
 	kseq_destroy(seq);
 	gzclose(fp);
-	fclose(inStream);
 	
 	return output;
 }
diff --git a/src/mash/Sketch.h b/src/mash/Sketch.h
index a756169..8bffe4e 100644
--- a/src/mash/Sketch.h
+++ b/src/mash/Sketch.h
@@ -36,11 +36,12 @@ public:
     {
         Parameters()
             :
-            parallelism(0),
+            parallelism(1),
             kmerSize(0),
             alphabetSize(0),
             preserveCase(false),
             use64(false),
+            seed(0),
             error(0),
             warning(0),
             minHashesPerWindow(0),
@@ -50,7 +51,7 @@ public:
             noncanonical(false),
             reads(false),
             memoryBound(0),
-            minCov(0),
+            minCov(1),
             targetCov(0),
             genomeSize(0)
         {
@@ -64,6 +65,7 @@ public:
             alphabetSize(other.alphabetSize),
             preserveCase(other.preserveCase),
             use64(other.use64),
+            seed(other.seed),
             error(other.error),
             warning(other.warning),
             minHashesPerWindow(other.minHashesPerWindow),
@@ -86,6 +88,7 @@ public:
         uint32_t alphabetSize;
         bool preserveCase;
         bool use64;
+        uint32_t seed;
         double error;
         double warning;
         uint64_t minHashesPerWindow;
@@ -179,6 +182,7 @@ public:
     bool getConcatenated() const {return parameters.concatenated;}
     float getError() const {return parameters.error;}
     int getHashCount() const {return lociByHash.size();}
+    uint32_t getHashSeed() const {return parameters.seed;}
     const std::vector<Locus> & getLociByHash(hash_t hash) const;
     float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
 	int getMinKmerSize(uint64_t reference) const;
@@ -196,7 +200,7 @@ public:
     bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
     bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
     int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
-    void initParametersFromCapnp(const char * file);
+    uint64_t initParametersFromCapnp(const char * file);
 	bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
 	void useThreadOutput(SketchOutput * output);
     void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
diff --git a/src/mash/capnp/MinHash.capnp b/src/mash/capnp/MinHash.capnp
index 5fff82c..ad9cfbe 100644
--- a/src/mash/capnp/MinHash.capnp
+++ b/src/mash/capnp/MinHash.capnp
@@ -50,7 +50,9 @@ struct MinHash
 	noncanonical @7 : Bool;
 	alphabet @8 : Text;
 	preserveCase @9 : Bool;
+	hashSeed @10 : UInt32 = 42;
 	
-	referenceList @4 : ReferenceList;
+	referenceListOld @4 : ReferenceList;
+	referenceList @11 : ReferenceList;
 	locusList @5 : LocusList;
 }
diff --git a/src/mash/hash.cpp b/src/mash/hash.cpp
index 36aaa1f..cc8da58 100644
--- a/src/mash/hash.cpp
+++ b/src/mash/hash.cpp
@@ -7,7 +7,7 @@
 #include "hash.h"
 #include "MurmurHash3.h"
 
-hash_u getHash(const char * seq, int length, bool use64)
+hash_u getHash(const char * seq, int length, uint32_t seed, bool use64)
 {
     //for ( int i = 0; i < length; i++ ) { cout << *(seq + i); } cout << endl;
     
diff --git a/src/mash/hash.h b/src/mash/hash.h
index 591515e..65734ec 100644
--- a/src/mash/hash.h
+++ b/src/mash/hash.h
@@ -9,8 +9,6 @@
 
 #include <inttypes.h>
 
-static const int seed = 42; // TODO: better seed???
-
 typedef uint32_t hash32_t;
 typedef uint64_t hash64_t;
 
@@ -20,7 +18,7 @@ union hash_u
     hash64_t hash64;
 };
 
-hash_u getHash(const char * seq, int length, bool use64);
+hash_u getHash(const char * seq, int length, uint32_t seed, bool use64);
 bool hashLessThan(hash_u hash1, hash_u hash2, bool use64);
 
 #endif
diff --git a/src/mash/mash.cpp b/src/mash/mash.cpp
index 564e474..1662a14 100644
--- a/src/mash/mash.cpp
+++ b/src/mash/mash.cpp
@@ -9,28 +9,28 @@
 #include "CommandSketch.h"
 #include "CommandFind.h"
 #include "CommandDistance.h"
+#include "CommandScreen.h"
 #include "CommandContain.h"
 #include "CommandInfo.h"
 #include "CommandPaste.h"
 
-using namespace::std;
-
 int main(int argc, const char ** argv)
 {
-    CommandList commandList("mash");
+    mash::CommandList commandList("mash");
     
-    commandList.addCommand(new CommandSketch());
+    commandList.addCommand(new mash::CommandSketch());
     //commandList.addCommand(new CommandFind());
-    commandList.addCommand(new CommandDistance());
+    commandList.addCommand(new mash::CommandDistance());
+    commandList.addCommand(new mash::CommandScreen());
 #ifdef COMMAND_WITHIN
-    commandList.addCommand(new CommandContain());
+    commandList.addCommand(new mash::CommandContain());
 #endif
 #ifdef COMMAND_FIND
-	commandList.addCommand(new CommandFind());
+	commandList.addCommand(new mash::CommandFind());
 #endif
-    commandList.addCommand(new CommandInfo());
-    commandList.addCommand(new CommandPaste());
-    commandList.addCommand(new CommandBounds());
+    commandList.addCommand(new mash::CommandInfo());
+    commandList.addCommand(new mash::CommandPaste());
+    commandList.addCommand(new mash::CommandBounds());
     
     return commandList.run(argc, argv);
 }
diff --git a/src/mash/schema-1.0.0.json b/src/mash/schema-1.0.0.json
new file mode 100644
index 0000000..e9c54f3
--- /dev/null
+++ b/src/mash/schema-1.0.0.json
@@ -0,0 +1,150 @@
+{
+	// 'meta-schema' describing this JSON as a schema
+	//
+	"$schema" : "http://json-schema.org/schema#",
+	
+	"type" : "object",
+	"required" :
+	[
+		"schema",
+		"kmerSize",
+		"alphabet",
+		"preserveCase",
+		"canonical",
+		"sketchSize",
+		"hashType",
+		"hashBits",
+		"hashSeed"
+	],
+	"properties" :
+	{
+		// URI of the appropriate version of this schema
+		//
+		"schema" : {"type" : "string"},
+		
+		// the number of characters in each overlapping 'shingle'
+		//
+		"kmerSize" : {"type" : "number"},
+		
+		// all letters a k-mer can have (others will be skipped) (see also
+		// 'preserveCase')
+		//
+		"alphabet" : {"type" : "string"},
+		
+		// if true, letters in a k-mer must match the case of letters in
+		// 'alphabet'
+		//
+		"preserveCase" : {"type" : "boolean"},
+		
+		// if true, use alphabetical minima of k-mers and their reverse
+		// complements (only makes sense for nucleotide alphabet)
+		//
+		"canonical" : {"type" : "boolean"},
+		
+		// the (maximum) number of min-hashes each sketch can have (there can be
+		// if a sequence does not have enough valid k-mers)
+		//
+		"sketchSize" : {"type" : "number"},
+		
+		// the hashing function used to hash each k-mer
+		//
+		"hashType" : {"type" : "string"},
+		
+		// the number of (least significant) bits taken from the result of the
+		// hash function
+		//
+		"hashBits" : {"type" : "number"},
+		
+		// the seed of the hash function, if applicable
+		//
+		"hashSeed" : {"type" : ["number", "null"]},
+		
+		// a collection of sketches that share the above sketching parameters
+		//
+		"sketches" :
+		{
+			"type" : "array",
+			"items" :
+			{
+				"type" : "object",
+				"required" :
+				[
+					"name",
+					"hashes"
+				],
+				"properties" :
+				{
+					// a sequence identifier, expected to be unique in the
+					// collection
+					//
+					"name" : {"type" : "string"},
+					
+					// the length of the source sequence that was sketched,
+					// which could be a sum of for concatenated sequences or an
+					// estimate from k-mer content of reads
+					//
+					"seqLength" : {"type" : "number"},
+					
+					// an additional description of the source sequence not
+					// captured by 'name' (e.g. after whitesapce in a fasta tag)
+					//
+					"comment" : {"type" : "string"},
+					
+					// the number of k-mers from the source sequence that
+					// conformed to the letters in 'alphabet' and was thus
+					// considered for min-hashing (including repeated k-mers)
+					//
+					"numValidKmers" : {"type" : "number"},
+					
+					"filters" :
+					{
+						"type" : "object",
+						"properties" :
+						{
+							// the minimum number of times a k-mer must appear
+							// in the source sequence to be considered for min-
+							// hashing (for filtering out erroneous k-mers in
+							// read sets)
+							//
+							"minCopies" : {"type" : "number"},
+						}
+					},
+					
+					// the min-hashes of this sketch, represented as unsigned
+					// integers but quoted as strings to avoid overflow
+					//
+					"hashes" :
+					{
+						"type" : "array",
+						"uniqueItems" : true,
+						"items" :
+						{
+							"type" : "string",
+							"pattern" : "^[0-9]+$"
+						}
+					},
+					
+					// the k-mers that correspond to the hashes in 'hashes' (in
+					// the same order), used mainly for confirming the hash
+					// function and not necessarily valid for Jaccard estimates
+					// due to potential hash collisions
+					//
+					"kmers" :
+					{
+						"type" : "array",
+						"items" : {"type" : "string"}
+					},
+					
+					// the number of times each hash in 'hash' was derived from
+					// k-mers in the source sequence (in the same order)
+					//
+					"counts" :
+					{
+						"type" : "array",
+						"items" : {"type" : "number"}
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/src/mash/sketchParameterSetup.cpp b/src/mash/sketchParameterSetup.cpp
index e161fd1..74ebef4 100644
--- a/src/mash/sketchParameterSetup.cpp
+++ b/src/mash/sketchParameterSetup.cpp
@@ -7,7 +7,10 @@
 #include "sketchParameterSetup.h"
 #include <iostream>
 
-using namespace::std;
+using std::cerr;
+using std::endl;
+
+namespace mash {
 
 int sketchParameterSetup(Sketch::Parameters & parameters, const Command & command)
 {
@@ -15,6 +18,7 @@ int sketchParameterSetup(Sketch::Parameters & parameters, const Command & comman
     parameters.minHashesPerWindow = command.getOption("sketchSize").getArgumentAsNumber();
     parameters.concatenated = ! command.getOption("individual").active;
     parameters.noncanonical = command.getOption("noncanonical").active;
+    parameters.seed = command.getOption("seed").getArgumentAsNumber();
     parameters.reads = command.getOption("reads").active;
     parameters.minCov = command.getOption("minCov").getArgumentAsNumber();
     parameters.targetCov = command.getOption("targetCov").getArgumentAsNumber();
@@ -110,3 +114,4 @@ void warnKmerSize(const Sketch::Parameters & parameters, const Command & command
 		<< " is required. See: -" << command.getOption("kmer").identifier << ", -" << command.getOption("warning").identifier << "." << endl << endl;
 }
 
+} // namespace mash
\ No newline at end of file
diff --git a/src/mash/sketchParameterSetup.h b/src/mash/sketchParameterSetup.h
index 939f71c..cf20e2a 100644
--- a/src/mash/sketchParameterSetup.h
+++ b/src/mash/sketchParameterSetup.h
@@ -10,7 +10,11 @@
 #include "Command.h"
 #include "Sketch.h"
 
+namespace mash {
+
 int sketchParameterSetup(Sketch::Parameters & parameters, const Command & command);
 void warnKmerSize(const Sketch::Parameters & parameters, const Command & command, uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount);
 
+} // namespace mash
+
 #endif
diff --git a/src/mash/version.h b/src/mash/version.h
index 7bd7fdf..2deff88 100644
--- a/src/mash/version.h
+++ b/src/mash/version.h
@@ -4,4 +4,4 @@
 //
 // See the LICENSE.txt file included with this software for license information.
 
-static const char * version = "1.1.1";
+static const char * version = "2.0";

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/mash.git



More information about the debian-med-commit mailing list