[med-svn] [rambo-k] 01/02: Imported Upstream version 1.21

Andreas Tille tille at debian.org
Thu Nov 5 11:08:47 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository rambo-k.

commit 06f5ac67c0446b4781e92e378dca5df0967bee60
Author: Andreas Tille <tille at debian.org>
Date:   Thu Nov 5 12:00:57 2015 +0100

    Imported Upstream version 1.21
---
 License.txt                                        | 165 +++++++++++
 RAMBO-K-plugin_eclipse/.classpath                  |   9 +
 RAMBO-K-plugin_eclipse/.project                    |  17 ++
 .../RAMBO-K Indexer (32 bit).launch                |  27 ++
 .../RAMBO-K indexer (64 bit).launch                |  27 ++
 RAMBO-K-plugin_eclipse/RAMBO-K-Indexer.iml         |  13 +
 RAMBO-K-plugin_eclipse/plugin.properties           |  11 +
 .../org/rki/rambok/indexer/GeneiousDataReader.java |  56 ++++
 .../org/rki/rambok/indexer/GeneiousDataWriter.java |  73 +++++
 .../src/org/rki/rambok/indexer/GeneiousReader.java |  82 ++++++
 .../src/org/rki/rambok/indexer/GeneiousWriter.java |  53 ++++
 .../src/org/rki/rambok/indexer/Indexer.java        | 189 ++++++++++++
 .../src/org/rki/rambok/indexer/IndexerPlugin.java  |  92 ++++++
 .../rki/rambok/indexer/RAMBOKProgressListener.java |  43 +++
 .../src/org/rki/rambok/indexer/RAMBOKresult.java   | 206 +++++++++++++
 .../rambok/indexer/RAMBOKresultDocumentViewer.java |  79 +++++
 RAMBOK.py                                          | 219 ++++++++++++++
 ReadClassifier_eclipse/.classpath                  |   9 +
 ReadClassifier_eclipse/.project                    |  17 ++
 ReadClassifier_eclipse/classifier.log              |   1 +
 .../src/org/rki/readclassifier/AbstractWriter.java |   7 +
 .../src/org/rki/readclassifier/Classifier.java     | 233 +++++++++++++++
 .../readclassifier/ClassifierProgressListener.java |   5 +
 .../org/rki/readclassifier/ClassifierThread.java   |  45 +++
 .../src/org/rki/readclassifier/DistClassifier.java |  85 ++++++
 .../org/rki/readclassifier/KmerDistribution.java   |  20 ++
 .../src/org/rki/readclassifier/MMClassifier.java   | 212 ++++++++++++++
 .../src/org/rki/readclassifier/Read.java           |  87 ++++++
 .../src/org/rki/readclassifier/ReadClassifier.java |  10 +
 .../src/org/rki/readclassifier/ReadPair.java       |  11 +
 .../src/org/rki/readclassifier/Reader.java         | 135 +++++++++
 .../src/org/rki/readclassifier/Writer.java         |  71 +++++
 ReadTrainer_eclipse/.classpath                     |   9 +
 ReadTrainer_eclipse/.project                       |  17 ++
 .../src/org/rki/readtrainer/AbstractReader.java    |   9 +
 .../src/org/rki/readtrainer/Trainer.java           | 241 +++++++++++++++
 .../rki/readtrainer/TrainerProgressListener.java   |   5 +
 .../src/org/rki/readtrainer/TrainerThread.java     |  82 ++++++
 Readme.pdf                                         | Bin 0 -> 1321321 bytes
 Readme.txt                                         | 105 +++++++
 plot.py                                            | 325 +++++++++++++++++++++
 simulate_reads.py                                  | 299 +++++++++++++++++++
 42 files changed, 3401 insertions(+)

diff --git a/License.txt b/License.txt
new file mode 100644
index 0000000..ddedb0f
--- /dev/null
+++ b/License.txt
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
\ No newline at end of file
diff --git a/RAMBO-K-plugin_eclipse/.classpath b/RAMBO-K-plugin_eclipse/.classpath
new file mode 100644
index 0000000..20c0f69
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/.classpath
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/GeneiousFiles"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/ReadTrainer"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/ReadClassifier"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/RAMBO-K-plugin_eclipse/.project b/RAMBO-K-plugin_eclipse/.project
new file mode 100644
index 0000000..629e25b
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>RAMBO-K-plugin_eclipse</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/RAMBO-K-plugin_eclipse/RAMBO-K Indexer (32 bit).launch b/RAMBO-K-plugin_eclipse/RAMBO-K Indexer (32 bit).launch
new file mode 100644
index 0000000..36e92e1
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/RAMBO-K Indexer (32 bit).launch	
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/RAMBO-K-Indexer"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="4"/>
+</listAttribute>
+<mapAttribute key="org.eclipse.debug.core.environmentVariables">
+<mapEntry key="DYLD_LIBRARY_PATH" value="./activation/macos:./native_libs/macos"/>
+<mapEntry key="LD_LIBRARY_PATH" value="./activation/linux32:./native_libs/linux32"/>
+</mapAttribute>
+<listAttribute key="org.eclipse.debug.ui.favoriteGroups">
+<listEntry value="org.eclipse.debug.ui.launchGroup.debug"/>
+</listAttribute>
+<listAttribute key="org.eclipse.jdt.launching.CLASSPATH">
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry containerPath="org.eclipse.jdt.launching.JRE_CONTAINER" javaProject="RAMBO-K-Indexer" path="1" type="4"/>
"/>
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry id="org.eclipse.jdt.launching.classpathentry.defaultClasspath">
<memento exportedEntriesOnly="false" project="RAMBO-K-Indexer"/>
</runtimeClasspathEntry>
"/>
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry internalArchive="/GeneiousFiles/resources" path="3" type="2"/>
"/>
+</listAttribute>
+<booleanAttribute key="org.eclipse.jdt.launching.DEFAULT_CLASSPATH" value="false"/>
+<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="com.biomatters.iseek.application.ISeekMain"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="extraPlugins="org.rki.rambok.indexer.IndexerPlugin""/>
+<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="RAMBO-K-Indexer"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea -Xss512K -Xms256M -Xmx768M -XX:MaxPermSize=96M -Djava.util.logging.config.file=no_logging.properties"/>
+<stringAttribute key="org.eclipse.jdt.launching.WORKING_DIRECTORY" value="${workspace_loc:GeneiousFiles}"/>
+</launchConfiguration>
diff --git a/RAMBO-K-plugin_eclipse/RAMBO-K indexer (64 bit).launch b/RAMBO-K-plugin_eclipse/RAMBO-K indexer (64 bit).launch
new file mode 100644
index 0000000..87532e4
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/RAMBO-K indexer (64 bit).launch	
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/RAMBO-K-Indexer"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="4"/>
+</listAttribute>
+<mapAttribute key="org.eclipse.debug.core.environmentVariables">
+<mapEntry key="DYLD_LIBRARY_PATH" value="./activation/macos:./native_libs/macos"/>
+<mapEntry key="LD_LIBRARY_PATH" value="./activation/linux64:./native_libs/linux64"/>
+</mapAttribute>
+<listAttribute key="org.eclipse.debug.ui.favoriteGroups">
+<listEntry value="org.eclipse.debug.ui.launchGroup.debug"/>
+</listAttribute>
+<listAttribute key="org.eclipse.jdt.launching.CLASSPATH">
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry containerPath="org.eclipse.jdt.launching.JRE_CONTAINER" javaProject="RAMBO-K-Indexer" path="1" type="4"/>
"/>
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry id="org.eclipse.jdt.launching.classpathentry.defaultClasspath">
<memento exportedEntriesOnly="false" project="RAMBO-K-Indexer"/>
</runtimeClasspathEntry>
"/>
+<listEntry value="<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<runtimeClasspathEntry internalArchive="/GeneiousFiles/resources" path="3" type="2"/>
"/>
+</listAttribute>
+<booleanAttribute key="org.eclipse.jdt.launching.DEFAULT_CLASSPATH" value="false"/>
+<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="com.biomatters.iseek.application.ISeekMain"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="extraPlugins="org.rki.rambok.indexer.IndexerPlugin""/>
+<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="RAMBO-K-Indexer"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-XX:+UseConcMarkSweepGC -ea -Xss512K -Xms256M -Xmx768M -XX:MaxPermSize=256M -Djava.util.logging.config.file=no_logging.properties"/>
+<stringAttribute key="org.eclipse.jdt.launching.WORKING_DIRECTORY" value="${workspace_loc:GeneiousFiles}"/>
+</launchConfiguration>
diff --git a/RAMBO-K-plugin_eclipse/RAMBO-K-Indexer.iml b/RAMBO-K-plugin_eclipse/RAMBO-K-Indexer.iml
new file mode 100644
index 0000000..a8a2107
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/RAMBO-K-Indexer.iml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="module" module-name="GeneiousFiles" />
+  </component>
+</module>
+
diff --git a/RAMBO-K-plugin_eclipse/plugin.properties b/RAMBO-K-plugin_eclipse/plugin.properties
new file mode 100644
index 0000000..a69ba44
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/plugin.properties
@@ -0,0 +1,11 @@
+# This file contains important information about the plugin
+# and must be included in the plugin jar file.
+
+# This is the fully qualified name of the main class
+# of the plugin you are developing. This is required by
+# Geneious to install a plugin.
+plugin-name=org.rki.rambok.indexer.IndexerPlugin
+
+# This is the short name for your plugin. It is used
+# to name the gplugin file for distribution.
+short-plugin-name=IndexerPlugin
\ No newline at end of file
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataReader.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataReader.java
new file mode 100644
index 0000000..8af469c
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataReader.java
@@ -0,0 +1,56 @@
+package org.rki.rambok.indexer;
+
+import java.util.List;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import org.rki.readclassifier.Read;
+import org.rki.readclassifier.ReadPair;
+import org.rki.readtrainer.AbstractReader;
+
+import com.biomatters.geneious.publicapi.documents.sequence.NucleotideGraphSequenceDocument;
+import com.biomatters.geneious.publicapi.documents.sequence.NucleotideSequenceDocument;
+import com.biomatters.geneious.publicapi.documents.sequence.SequenceListDocument;
+
+public class GeneiousDataReader extends AbstractReader {
+	private List<NucleotideSequenceDocument> _seqs;
+	private int _numThreads;
+	
+	public GeneiousDataReader(SequenceListDocument doc, int numThreads) {
+		_seqs = doc.getNucleotideSequences();
+		_numThreads = numThreads;
+		try {
+			reads = new LinkedBlockingQueue<ReadPair>(50000);
+		} catch(Exception e) {
+			System.out.println("--------------\n");
+			e.printStackTrace();
+			System.out.println("--------------\n");
+		}
+	}
+	
+	public void run() {
+		readReads();
+        try {
+        	for(int i=0; i<_numThreads; i++) {
+                reads.put(new ReadPair(null, null));
+        	}
+        } catch(Exception e) {
+         	e.printStackTrace();
+        }
+	}
+	
+	private void readReads() {
+		for(NucleotideSequenceDocument seq : _seqs) {
+			NucleotideGraphSequenceDocument gseq = (NucleotideGraphSequenceDocument)seq;
+			byte[] intQualities = new byte[gseq.getSequenceLength()];
+			for(int i=0; i<gseq.getSequenceLength(); i++) {
+				intQualities[i] = (byte)gseq.getSequenceQuality(i);
+			}
+			Read read = new Read(seq.getName(), gseq.getSequenceString(), intQualities, true);
+			try {
+				reads.put(new ReadPair(read, null));
+			} catch(InterruptedException e) {
+				e.printStackTrace();
+			}
+		} 	
+	}
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataWriter.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataWriter.java
new file mode 100644
index 0000000..4f52b0d
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousDataWriter.java
@@ -0,0 +1,73 @@
+package org.rki.rambok.indexer;
+
+import java.util.Date;
+import java.util.LinkedList;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import org.rki.readclassifier.AbstractWriter;
+import org.rki.readclassifier.Read;
+import org.rki.readclassifier.ReadPair;
+
+import com.biomatters.geneious.publicapi.documents.sequence.CompactQualityOnlyGraph;
+import com.biomatters.geneious.publicapi.documents.sequence.DefaultSequenceListDocument;
+import com.biomatters.geneious.publicapi.implementations.sequence.DefaultNucleotideGraphSequence;
+
+public class GeneiousDataWriter extends AbstractWriter {
+	private int _numThreads;
+	private DefaultSequenceListDocument _out1; 
+	private int _scoreCutoff;
+	private int _numToTrain;
+	private int _numTrained;
+	private RAMBOKresult _resdoc;
+	
+	public GeneiousDataWriter(int numThreads, DefaultSequenceListDocument out1, DefaultSequenceListDocument out2, int scoreCutoff, int numToTrain, RAMBOKresult resdoc) {
+		outReads = new LinkedBlockingQueue<ReadPair>(50000);
+		_numThreads = numThreads;
+		_out1 = out1;
+		_scoreCutoff = scoreCutoff;
+		_numToTrain = numToTrain;
+		_numTrained = 0;
+		_resdoc = resdoc;
+	}
+
+    public void run() {
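+    	// Drain the classified reads handed over by the worker threads: reads scoring above
+    	// the cutoff go into the foreground output document, and the scores of the first
+    	// reads (up to _numToTrain) are recorded for the result document's histogram.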
+    	LinkedList<Double> scores = new LinkedList<Double>();
+    	int threadsRunning = _numThreads;
+        while(true) {
+            try {
+                ReadPair reads = outReads.take();
+                if(reads.read1 == null) {
+                	threadsRunning --;
+                	if(threadsRunning == 0)
+                		break;
+                	continue;
+                }
+                if(reads.read1.invalid && (reads.read2 == null || reads.read2.invalid))
+                	continue;
+                if(reads.read1.score > _scoreCutoff) {
+                	_out1.addNucleotideSequence(makeSeqGraph(reads.read1));
+                }
+                _numTrained += 1;
+                if(_numTrained < _numToTrain) {
+                	scores.add(reads.read1.score);
+                }
+                else if(_numTrained == _numToTrain) {
+                	_resdoc.setRealScores(scores);
+                }
+//                else {
+//                	_out2.addNucleotideSequence(makeSeqGraph(reads.read1));                	
+//                }
+            } catch(Exception e) {
+            	e.printStackTrace();
+            }
+        }
+        if(_numTrained < _numToTrain) {
+        	_resdoc.setRealScores(scores);
+        }
+    }
+    
+    private DefaultNucleotideGraphSequence makeSeqGraph(Read read) {
+    	DefaultNucleotideGraphSequence gdoc = new DefaultNucleotideGraphSequence(read.name, "", read.bases, new Date(), new CompactQualityOnlyGraph(read.intQualities));
+    	return gdoc;
+    }
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousReader.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousReader.java
new file mode 100644
index 0000000..3d40ee1
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousReader.java
@@ -0,0 +1,82 @@
+package org.rki.rambok.indexer;
+
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import org.rki.readclassifier.Read;
+import org.rki.readclassifier.ReadPair;
+import org.rki.readtrainer.AbstractReader;
+
+import com.biomatters.geneious.publicapi.documents.sequence.DefaultSequenceListDocument;
+import com.biomatters.geneious.publicapi.documents.sequence.NucleotideSequenceDocument;
+import com.biomatters.geneious.publicapi.documents.sequence.SequenceCharSequence;
+
+public class GeneiousReader extends AbstractReader {
+	private List<NucleotideSequenceDocument> _seqs;
+	private int _numReads;
+	private int _numThreads;
+	private int _readsRead = 0;
+	
+	public GeneiousReader(DefaultSequenceListDocument doc, int numReads, int numThreads) {
+		_seqs = doc.getNucleotideSequences();
+		_numReads = numReads;
+		_numThreads = numThreads;
+		try {
+			reads = new LinkedBlockingQueue<ReadPair>(numReads);
+		} catch(Exception e) {
+			System.out.println("--------------\n");
+			e.printStackTrace();
+			System.out.println("--------------\n");
+		}
+	}
+	
+	public void run() {
+		readReads();
+        try {
+        	for(int i=0; i<_numThreads; i++) {
+                reads.put(new ReadPair(null, null));
+        	}
+        } catch(Exception e) {
+         	e.printStackTrace();
+        }
+	}
+	
+	private void readReads() {
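+		// Sample roughly _numReads random windows of length readLength from the reference
+		// sequences, proportional to each sequence's length, keeping only windows made up
+		// entirely of unambiguous bases (A, C, G, T).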
+		int readLength = 100;
+		Random rand = new Random();
+		int numpos = 0;
+		for(NucleotideSequenceDocument seq : _seqs) {
+			numpos += seq.getSequenceLength();
+		}
+		double posOdds = (double)_numReads / (double)numpos;
+        readloop: while(_readsRead < _numReads) {
+    		for(NucleotideSequenceDocument seq : _seqs) {
+    			int numToDraw = (int)(posOdds*rand.nextDouble()*seq.getSequenceLength());
+    			int numDrawn = 0;
+    			SequenceCharSequence seqSeq = seq.getCharSequence();
+    			while(numDrawn < numToDraw) {
+    				int seqPos = (int)(rand.nextDouble()*(seq.getSequenceLength()-readLength));
+    				SequenceCharSequence readSeq = seqSeq.subSequence(seqPos, seqPos+readLength);
+    				boolean canKeep = true;
+    				for(int i=0; i<readSeq.length(); i++) {
+    					if(readSeq.charAt(i) != 'A' && 
+    							readSeq.charAt(i) != 'G' &&
+    							readSeq.charAt(i) != 'C' &&
+    							readSeq.charAt(i) != 'T') {
+    						canKeep = false;
+    						break;
+    					}
+    				}
+    				if(canKeep) {
+    					Read read = new Read("1", readSeq.toString(), null);
+    					reads.add(new ReadPair(read, null));
+    					_readsRead += 1;
+    					if(_readsRead >= _numReads-1)
+    						break readloop;
+    				}
+    			}
+    		} 	
+        }
+	}
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousWriter.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousWriter.java
new file mode 100644
index 0000000..3638d92
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/GeneiousWriter.java
@@ -0,0 +1,53 @@
+package org.rki.rambok.indexer;
+
+import java.util.LinkedList;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import org.rki.readclassifier.AbstractWriter;
+import org.rki.readclassifier.Classifier;
+import org.rki.readclassifier.ReadPair;
+
+public class GeneiousWriter extends AbstractWriter {
+
+	private int _numThreads;
+	private LinkedList<Double> _scores;
+	private int _currentOrg = 0;
+	
+	public GeneiousWriter(int numReads, int numThreads) {
+		outReads = new LinkedBlockingQueue<ReadPair>(numReads);
+		_numThreads = numThreads;
+		_scores = new LinkedList<Double>();
+	}
+
+	public void setCurrentOrg(int currentOrg) {
+		_currentOrg = currentOrg;
+	}
+	
+    public void run() {
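+    	// Collect the scores of the classified simulated reads until every worker thread
+    	// has signalled completion with a null read.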
+    	int threadsRunning = _numThreads;
+        while(true) {
+            try {
+                ReadPair reads = outReads.take();
+                if(reads.read1 == null) {
+                	threadsRunning --;
+                	if(threadsRunning == 0)
+                		break;
+                	continue;
+                }
+                if(reads.read1.invalid && (reads.read2 == null || reads.read2.invalid))
+                	continue;
+                _scores.add(reads.read1.score);
+//                if(Classifier.useCutoff == Classifier.Cutoff.HIGHER && reads.read1.score < Classifier.cutoffHigher)
+//                	continue;
+//                else if(Classifier.useCutoff == Classifier.Cutoff.LOWER && reads.read1.score > Classifier.cutoffLower)
+//                	continue;
+//                out1w.write(reads.read1.toString(true));
+            } catch(Exception e) {
+            }
+        }
+    }
+    
+    public LinkedList<Double> getScores() {
+    	return _scores;
+    }
+
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/Indexer.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/Indexer.java
new file mode 100644
index 0000000..d705bbd
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/Indexer.java
@@ -0,0 +1,189 @@
+package org.rki.rambok.indexer;
+
+import java.util.Date;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.rki.readclassifier.ClassifierThread;
+import org.rki.readclassifier.MMClassifier;
+import org.rki.readtrainer.Trainer;
+import org.rki.readtrainer.TrainerThread;
+
+import com.biomatters.geneious.publicapi.documents.AnnotatedPluginDocument;
+import com.biomatters.geneious.publicapi.documents.DocumentUtilities;
+import com.biomatters.geneious.publicapi.documents.sequence.DefaultSequenceListDocument;
+import com.biomatters.geneious.publicapi.implementations.sequence.DefaultNucleotideSequence;
+import com.biomatters.geneious.publicapi.plugin.DocumentOperation;
+import com.biomatters.geneious.publicapi.plugin.DocumentSelectionOption;
+import com.biomatters.geneious.publicapi.plugin.DocumentSelectionSignature;
+import com.biomatters.geneious.publicapi.plugin.DocumentType;
+import com.biomatters.geneious.publicapi.plugin.GeneiousActionOptions;
+import com.biomatters.geneious.publicapi.plugin.Options;
+import com.biomatters.geneious.publicapi.plugin.Options.IntegerOption;
+
+import jebl.util.ProgressListener;
+
+/**
+ * A DocumentOperation that separates foreground reads from background reads using
+ * RAMBO-K: it trains k-mer transition matrices on the selected reference sequences
+ * and classifies the input reads with the resulting Markov-model classifier.
+ * 
+ * @author Matt Kearse
+ * @version $Id$
+ */
+
+public class Indexer extends DocumentOperation {
+    public String getHelp() {
+        return "This plugin separates foreground reads from background reads using RAMBO-K.";
+    }
+
+    public String getName() {
+        return "RAMBO-K";
+    }
+
+    public double getTabPosition() {
+        return 0.5;
+    }
+
+    public DocumentSelectionSignature[] getSelectionSignatures() {
+    	return new DocumentSelectionSignature[] { DocumentSelectionSignature.forNucleotideSequences(1, Integer.MAX_VALUE) };
+    }
+    
+    public boolean isProOnly() {
+        return false;
+    }
+
+    @Override
+    public Options getOptions(AnnotatedPluginDocument[] docs) {
+    	Options opt= new Options(this.getClass());
+    	opt.addIntegerOption("kMerSize", "Kmer size to use", 8, 2, 10);
+    	opt.addIntegerOption("scoreCutoff", "Score cutoff (everything with a lower score will be considered background)", 0, -1000, 1000);
+    	opt.addIntegerOption("numThreads", "Number of threads to use", 4, 1, 64);
+    	HashSet<DocumentType> allowedDocs = new HashSet<DocumentType>();
+    	allowedDocs.add(DocumentType.NUCLEOTIDE_SEQUENCE_LIST_TYPE);
+    	allowedDocs.add(DocumentType.NUCLEOTIDE_SEQUENCE_TYPE);
+    	opt.addDocumentSelectionOption("foregroundReference", "Foreground reference(s)", DocumentSelectionOption.FolderOrDocuments.EMPTY, allowedDocs, null, "Reference sequence(s)", null, false, new LinkedList<AnnotatedPluginDocument>());
+    	opt.addDocumentSelectionOption("backgroundReference", "Background reference(s)", DocumentSelectionOption.FolderOrDocuments.EMPTY, allowedDocs, null, "Background sequence(s)", null, false, new LinkedList<AnnotatedPluginDocument>());
+    	return opt;
+    }
+
+	@Override
+	public GeneiousActionOptions getActionOptions() {
+/*		GeneiousActionOptions parent = new GeneiousActionOptions("RAMBO-K").setMainMenuLocation(GeneiousActionOptions.MainMenu.Tools).setInMainToolbar(true);
+		GeneiousActionOptions index = new GeneiousActionOptions("Build index", "Builds a RAMBO-K index from reference sequence(s)", null);
+		GeneiousActionOptions res = GeneiousActionOptions.createSubmenuActionOptions(index, parent);
+		return res;*/
+		return new GeneiousActionOptions("Separate reads through RAMBO-K").setMainMenuLocation(GeneiousActionOptions.MainMenu.Tools).setInMainToolbar(true);
+	}
+	
+	public List performOperation(AnnotatedPluginDocument[] docs, ProgressListener progress, Options options) {
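+		// Pipeline: train a k-mer transition matrix for each reference, score simulated
+		// reads from both references to obtain their expected score distributions, then
+		// classify the real reads and keep those scoring above the cutoff as foreground.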
+		int numToTrain = 50000;
+		int numThreads = ((IntegerOption)(options.getOption("numThreads"))).getValue();
+		int kmersize = ((IntegerOption)(options.getOption("kMerSize"))).getValue();
+		int scoreCutoff = ((IntegerOption)(options.getOption("scoreCutoff"))).getValue();
+		Trainer.kmersize = kmersize;
+		int numReal = 0;
+		DefaultSequenceListDocument realReads = null;
+		try {
+			realReads = (DefaultSequenceListDocument)(docs[0].getDocument());
+			numReal = realReads.getNucleotideSequences().size();
+		} catch(Exception e) {
+			e.printStackTrace();
+		}
+		RAMBOKProgressListener prog = new RAMBOKProgressListener(progress, 2*numToTrain, 2*numToTrain+numReal);
+		AtomicInteger[][] matrix1 = new AtomicInteger[(int)Math.pow(4, kmersize)][4];
+		AtomicInteger[][] matrix2 = new AtomicInteger[(int)Math.pow(4, kmersize)][4];
+		List<AnnotatedPluginDocument> res = new LinkedList<AnnotatedPluginDocument>();
+		LinkedList<Double>[] scores = new LinkedList[] {new LinkedList<Double>(), new LinkedList<Double>()};
+		try {
+			prog.setTitle("Training model...");
+			DefaultSequenceListDocument seqDoc1 = (DefaultSequenceListDocument)(((DocumentSelectionOption)(options.getOption("foregroundReference"))).getDocuments().get(0).getDocument());
+			trainMatrix(seqDoc1, matrix1, numToTrain, numThreads, prog);
+			DefaultSequenceListDocument seqDoc2 = (DefaultSequenceListDocument)(((DocumentSelectionOption)(options.getOption("backgroundReference"))).getDocuments().get(0).getDocument());
+			trainMatrix(seqDoc2, matrix2, numToTrain, numThreads, prog);
+			
+			prog.setTitle("Classifying simulated reads...");
+			MMClassifier classifier = new MMClassifier();
+			classifier.setMatrices(matrix1, matrix2, seqDoc1.getName(), seqDoc2.getName(), kmersize);
+			scores[0] = testReads(seqDoc1, classifier, numToTrain, numThreads, kmersize, prog);
+			scores[1] = testReads(seqDoc2, classifier, numToTrain, numThreads, kmersize, prog);
+			RAMBOKresult resdoc = new RAMBOKresult("RAMBOK summary: foreground " + seqDoc1.getName() + " vs. background " + seqDoc2.getName(), seqDoc1.getName(), seqDoc2.getName(), scores[0], scores[1]);
+			prog.setTitle("Classifying real reads...");
+			res = classifyReads(realReads, numThreads, classifier, prog, scoreCutoff, numToTrain, resdoc);
+			res.get(0).setName("RAMBO-K foreground reads: " + seqDoc1.getName());
+//			res.add(resdoc);
+			res.add(DocumentUtilities.createAnnotatedPluginDocument(resdoc));
+			res.get(1).setName(resdoc.getName());
+			//			res.get(1).setName("RAMBO-K reads: " + seqDoc2.getName());
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		return res;
+	}
+
+	private LinkedList<AnnotatedPluginDocument> classifyReads(DefaultSequenceListDocument doc, int numThreads, MMClassifier classifier, RAMBOKProgressListener prog, int scoreCutoff, int numToTrain, RAMBOKresult resdoc) throws InterruptedException {
+		GeneiousDataReader reader = new GeneiousDataReader(doc, numThreads);
+		DefaultSequenceListDocument out1 = new DefaultSequenceListDocument();
+		DefaultSequenceListDocument out2 = new DefaultSequenceListDocument();
+		GeneiousDataWriter writer = new GeneiousDataWriter(numThreads, out1, out2, scoreCutoff, numToTrain, resdoc);
+		System.out.println("\n====" + scoreCutoff + "====");
+		writer.start();
+		reader.start();
+		LinkedList<ClassifierThread> threadList = new LinkedList<ClassifierThread>();
+		while(threadList.size() < numThreads) {
+			threadList.add(new ClassifierThread(reader, writer, classifier, prog));
+		}
+		for(ClassifierThread c : threadList) {
+			c.start();
+		}
+		for(ClassifierThread c : threadList) {
+			c.join();
+		}
+        reader.join();
+		writer.join();
+    	out1.addNucleotideSequence(new DefaultNucleotideSequence("test", "AG"));
+//    	out2.addNucleotideSequence(new DefaultNucleotideSequence("test", "AG"));
+		LinkedList<AnnotatedPluginDocument> res = new LinkedList<AnnotatedPluginDocument>();
+		res.add(DocumentUtilities.createAnnotatedPluginDocument(out1));
+//		res.add(DocumentUtilities.createAnnotatedPluginDocument(out2));
+		return res;
+	}
+	
+	private LinkedList<Double> testReads(DefaultSequenceListDocument doc, MMClassifier classifier, int numToTrain, int numThreads, int kmersize, RAMBOKProgressListener prog) throws InterruptedException {
+		GeneiousWriter writer = new GeneiousWriter(numToTrain, numThreads);
+		writer.start();
+		GeneiousReader reader = new GeneiousReader(doc, numToTrain, numThreads);
+		reader.start();
+		LinkedList<ClassifierThread> threadList = new LinkedList<ClassifierThread>();
+		while(threadList.size() < numThreads) {
+			threadList.add(new ClassifierThread(reader, writer, classifier, prog));
+		}
+		for(ClassifierThread c : threadList) {
+			c.start();
+		}
+		for(ClassifierThread c : threadList) {
+			c.join();
+		}
+        reader.join();
+		writer.join();
+		return writer.getScores();
+	}
+	
+	private void trainMatrix(DefaultSequenceListDocument doc, AtomicInteger[][] matrix, int numToTrain, int numThreads, RAMBOKProgressListener prog) throws InterruptedException {
+		Trainer.clearMatrix(matrix);
+		GeneiousReader reader = new GeneiousReader(doc, numToTrain, numThreads);
+		reader.start();
+		LinkedList<TrainerThread> trainers = new LinkedList<TrainerThread>();
+		for(int i=0; i<numThreads; i++) {
+			trainers.add(new TrainerThread(reader, matrix, prog));
+		}
+		for(TrainerThread trainer : trainers) {
+			trainer.start();
+		}
+		for(TrainerThread trainer : trainers) {
+			trainer.join();
+		}
+		reader.join();
+	}
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/IndexerPlugin.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/IndexerPlugin.java
new file mode 100644
index 0000000..dc3c3ca
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/IndexerPlugin.java
@@ -0,0 +1,92 @@
+package org.rki.rambok.indexer;
+
+import com.biomatters.geneious.publicapi.documents.AnnotatedPluginDocument;
+import com.biomatters.geneious.publicapi.plugin.DocumentOperation;
+import com.biomatters.geneious.publicapi.plugin.DocumentSelectionSignature;
+import com.biomatters.geneious.publicapi.plugin.DocumentViewer;
+import com.biomatters.geneious.publicapi.plugin.DocumentViewerFactory;
+import com.biomatters.geneious.publicapi.plugin.GeneiousPlugin;
+
+/**
+ * Entry point of the RAMBO-K Geneious plugin.
+ * <p/>
+ * It registers the {@link Indexer} document operation, which separates foreground
+ * reads from background reads, and a document viewer for the {@link RAMBOKresult}
+ * summary documents it produces.
+ * <p/>
+ * This class only provides the framework to hook the operation into Geneious;
+ * all of the real work happens in {@link Indexer}.
+ */
+public class IndexerPlugin extends GeneiousPlugin {
+    @Override
+    public DocumentOperation[] getDocumentOperations() {
+        return new DocumentOperation[] {new Indexer() };
+    }
+    
+    public DocumentViewerFactory[] getDocumentViewerFactories() {
+    	return new DocumentViewerFactory[] {
+    			new DocumentViewerFactory() {
+					
+					@Override
+					public DocumentSelectionSignature[] getSelectionSignatures() {
+						return new DocumentSelectionSignature[] { new DocumentSelectionSignature(RAMBOKresult.class, 1, 1)};
+					}
+					
+					@Override
+					public String getName() {
+						return "RAMBO-K results viewer";
+					}
+					
+					@Override
+					public String getHelp() {
+						return "Shows the metrics associated with a RAMBO-K result.";
+					}
+					
+					@Override
+					public String getDescription() {
+						return "Shows the metrics associated with a RAMBO-K result.";
+					}
+					
+					@Override
+					public DocumentViewer createViewer(AnnotatedPluginDocument[] docs) {
+						return new RAMBOKresultDocumentViewer((RAMBOKresult)docs[0].getDocumentOrCrash());
+					}
+				}
+    	};
+    }
+
+    public String getName() {
+        return "RAMBO-K";
+    }
+
+    public String getHelp() {
+        return "RAMBO-K";
+    }
+
+    public String getDescription() {
+        return "RAMBO-K";
+    }
+
+    public String getAuthors() {
+        return "Piotr Wojciech Dabrowski & Simon H. Tausch, Robert Koch Institute";
+    }
+
+    public String getVersion() {
+        return "0.1";
+    }
+
+    public String getMinimumApiVersion() {
+        return "4.11";
+    }
+
+    public int getMaximumApiVersion() {
+        return 4;
+    }
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKProgressListener.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKProgressListener.java
new file mode 100644
index 0000000..6a90f91
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKProgressListener.java
@@ -0,0 +1,43 @@
+package org.rki.rambok.indexer;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.rki.readclassifier.ClassifierProgressListener;
+import org.rki.readtrainer.TrainerProgressListener;
+
+import jebl.util.ProgressListener;
+
+public class RAMBOKProgressListener implements TrainerProgressListener, ClassifierProgressListener {
+	private AtomicInteger _numReadsProcessed = new AtomicInteger(0);
+	private int _numReadsToTrain;
+	private int _numReadsToClassify;
+	private int _numSteps;
+	private ProgressListener _progress;
+	
+	public RAMBOKProgressListener(ProgressListener progress, int readsToTrain, int readsToClassify) {
+		_numReadsProcessed = new AtomicInteger(0);
+		_numReadsToTrain = readsToTrain;
+		_numReadsToClassify = readsToClassify;
+		_progress = progress;
+		_numSteps = _numReadsToTrain + _numReadsToClassify;
+		_progress.setProgress(0, _numReadsToTrain);
+	}
+	
+	public void readTrained() {
+		int numt = _numReadsProcessed.incrementAndGet();
+		if(numt % 1000 == 0) {
+			_progress.setProgress(numt, _numSteps);
+		}
+	}
+
+	public void readClassified() {
+		int numt = _numReadsProcessed.incrementAndGet();
+		if(numt % 1000 == 0) {
+			_progress.setProgress(numt, _numSteps);
+		}
+	}
+	
+	public void setTitle(String title) {
+		_progress.setTitle(title);
+	}
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresult.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresult.java
new file mode 100644
index 0000000..1c859c4
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresult.java
@@ -0,0 +1,206 @@
+package org.rki.rambok.indexer;
+
+import java.util.Collections;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.jdom.Element;
+
+import com.biomatters.geneious.publicapi.documents.DocumentField;
+import com.biomatters.geneious.publicapi.documents.PluginDocument;
+import com.biomatters.geneious.publicapi.documents.URN;
+import com.biomatters.geneious.publicapi.documents.XMLSerializationException;
+
+public class RAMBOKresult implements PluginDocument {
+	private int NUMBINS = 100;
+	
+	private static int HIST_ORG1 = 0;
+	private static int HIST_ORG2 = 1;
+	private static int HIST_REAL = 2;
+	
+	private String _name;
+	private String _oname1;
+	private String _oname2;
+	private LinkedList<Double> _scores1;
+	private LinkedList<Double> _scores2;
+	private LinkedList<Double> _realScores;
+	private Date _creationDate;
+	private int[][] _scoreHist;
+	private double _histMin;
+	private double _histMax;
+	private double _histStep;
+	private int _highestNum;
+
+	public RAMBOKresult(String docname, String oname1, String oname2, LinkedList<Double> scores1, LinkedList<Double> scores2) {
+		_name = docname;
+		_oname1 = oname1;
+		_oname2 = oname2;
+		_scores1 = scores1;
+		_scores2 = scores2;
+		_scoreHist = new int[NUMBINS][3];
+		for(int i=0; i<NUMBINS; i++) {
+			for(int j=0; j<3; j++) {
+				_scoreHist[i][j] = 0;
+			}
+		}
+	}
+
+	public RAMBOKresult() {}
+
+	public void setRealScores(LinkedList<Double> scores) {
+		_realScores = scores;
+		calculateHistograms();
+	}
+	
+	private void calculateHistograms() {
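+		// Bin the foreground, background and real read scores into NUMBINS histogram bins
+		// spanning the 5th to 95th percentile of all scores combined.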
+		Collections.sort(_scores1);
+		Collections.sort(_scores2);
+		Collections.sort(_realScores);
+		LinkedList<Double> allscores = new LinkedList<Double>();
+		allscores.addAll(_scores1);
+		allscores.addAll(_scores2);
+		allscores.addAll(_realScores);
+		Collections.sort(allscores);
+		_histMin = allscores.get(5*allscores.size()/100);
+		_histMax = allscores.get(allscores.size()-(5*allscores.size()/100));
+		_histStep = (_histMax - _histMin)/NUMBINS;
+		
+		double currVal = _histMin;
+		int currpos = 0;
+		Iterator<Double> s1it = _scores1.iterator();
+		Iterator<Double> s2it = _scores2.iterator();
+		Iterator<Double> srit = _realScores.iterator();
+		double n1 = s1it.next();
+		double n2 = s2it.next();
+		double nr = srit.next();
+		while(currVal <= _histMax && currpos < _scoreHist.length) {
+			while(n1 <= currVal) {
+				_scoreHist[currpos][0] += 1;
+				if(s1it.hasNext())
+					n1 = s1it.next();
+				else
+					n1 = Double.MAX_VALUE;
+			}
+			while(n2 <= currVal) {
+				_scoreHist[currpos][1] += 1;
+				if(s2it.hasNext())
+					n2 = s2it.next();
+				else
+					n2 = Double.MAX_VALUE;
+			}
+			while(nr <= currVal) {
+				_scoreHist[currpos][2] += 1;
+				if(srit.hasNext())
+					nr = srit.next();
+				else
+					nr = Double.MAX_VALUE;
+			}
+			currVal += _histStep;
+			currpos += 1;
+		}
+		_highestNum = 0;
+		for(int i=0; i<NUMBINS; i++) {
+			_highestNum = Math.max(_highestNum, getLargest(_scoreHist[i]));
+		}
+	}
+
+	private int getLargest(int[] vals) {
+		int max = 0;
+		for(int i=0; i<vals.length; i++) {
+			if(vals[i] > max)
+				max = vals[i];
+		}
+		return max;
+	}
+	
+	public int[][] getHistogram() {
+		return _scoreHist;
+	}
+	
+	public int getHighestNum() {
+		return _highestNum;
+	}
+	
+	public void fromXML(Element doc) throws XMLSerializationException {
+		_name = doc.getChildText("name");
+		_oname1 = doc.getChildText("oname1");
+		_oname2 = doc.getChildText("oname2");
+		_histMin = Double.parseDouble(doc.getChildText("histMin"));
+		_histMax = Double.parseDouble(doc.getChildText("histMax"));
+		_histStep = Double.parseDouble(doc.getChildText("histStep"));
+		NUMBINS = Integer.parseInt(doc.getChildText("histBins"));
+		_highestNum = Integer.parseInt(doc.getChildText("highestNum"));
+		histFromString(doc.getChildText("scoreHist"));
+	}
+
+	public Element toXML() {
+		Element root = new Element("TextDocument");
+		root.addContent(new Element("name").setText(_name));
+		root.addContent(new Element("oname1").setText(_oname1));
+		root.addContent(new Element("oname2").setText(_oname2));
+		root.addContent(new Element("histMin").setText(Double.toString(_histMin)));
+		root.addContent(new Element("histMax").setText(Double.toString(_histMax)));
+		root.addContent(new Element("histStep").setText(Double.toString(_histStep)));
+		root.addContent(new Element("histBins").setText(Integer.toString(NUMBINS)));
+		root.addContent(new Element("highestNum").setText(Integer.toString(_highestNum)));
+		root.addContent(new Element("scoreHist").setText(histToString()));
+		return root;
+	}
+
+	private String histToString() {
+		StringBuilder res = new StringBuilder();
+		for(int i=0; i<NUMBINS; i++) {
+			for(int j=0; j<3; j++) {
+				res.append(Integer.toString(_scoreHist[i][j]));
+				res.append(",");
+			}
+		}
+		return res.toString();
+	}
+	
+	private void histFromString(String hist) {
+		_scoreHist = new int[NUMBINS][3];
+		String[] vals = hist.split(",");
+		for(int i=0; i<NUMBINS; i++) {
+			for(int j=0; j<3; j++) {
+				_scoreHist[i][j] = Integer.parseInt(vals[3*i+j]);
+			}
+		}
+	}
+	
+	public Date getCreationDate() {
+		return _creationDate;
+	}
+
+	public String getDescription() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+	public List<DocumentField> getDisplayableFields() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+	public Object getFieldValue(String arg0) {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+	public String getName() {
+		return _name;
+	}
+
+	public URN getURN() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+	public String toHTML() {
+		// TODO Auto-generated method stub
+		return null;
+	}
+
+}
diff --git a/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresultDocumentViewer.java b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresultDocumentViewer.java
new file mode 100644
index 0000000..3b0fc2b
--- /dev/null
+++ b/RAMBO-K-plugin_eclipse/src/org/rki/rambok/indexer/RAMBOKresultDocumentViewer.java
@@ -0,0 +1,79 @@
+package org.rki.rambok.indexer;
+
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Graphics;
+
+import javax.swing.JComponent;
+import javax.swing.JPanel;
+
+import com.biomatters.geneious.publicapi.plugin.DocumentViewer;
+
+public class RAMBOKresultDocumentViewer extends DocumentViewer {
+
+	private int _histHeight=300;
+	private int _histWidth = 500;
+	private int _offsetX = 10;
+	private int _offsetY = 10;
+	private int _topSpace = 50;
+	private int _fillAlpha = 80;
+	private RAMBOKresult _doc;
+	
+	public RAMBOKresultDocumentViewer(RAMBOKresult doc) {
+		_doc = doc;
+	}
+	
+	@Override
+	public JComponent getComponent() {
+		JPanel res = new JPanel() {
+			public void paintComponent(Graphics g) {
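+				// Draw three overlaid, semi-transparent score histograms scaled to the highest
+				// bin count: foreground (red), background (blue) and real reads (green).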
+				int maxHeight = _histHeight - _topSpace;
+				int[][] hist = _doc.getHistogram();
+				int stepX = _histWidth / hist.length;
+				int highestNum = _doc.getHighestNum();
+				int[] scaled = new int[hist.length+2];
+				int[] xpoints = new int[hist.length+2];
+				xpoints[0] = _offsetX;
+				scaled[0] = _offsetY+_histHeight;
+				xpoints[xpoints.length-1] = _offsetX+_histWidth;
+				scaled[scaled.length - 1] = _offsetY+_histHeight;
+				for(int i=0; i<hist.length; i++) {
+					scaled[i+1] = maxHeight*hist[i][0];
+					scaled[i+1] /= highestNum;
+					scaled[i+1] = _offsetY+_topSpace+maxHeight - scaled[i+1];
+					xpoints[i+1] = i*stepX+_offsetX;
+				}
+				g.setColor(new Color(255, 0, 0, _fillAlpha));
+				g.fillPolygon(xpoints, scaled, scaled.length);
+				g.setColor(new Color(255, 0, 0, 255));
+				g.drawPolyline(xpoints, scaled, scaled.length);
+				for(int i=0; i<hist.length; i++) {
+					scaled[i+1] = maxHeight*hist[i][1];
+					scaled[i+1] /= highestNum;
+					scaled[i+1] = _offsetY+_topSpace+maxHeight - scaled[i+1];
+				}
+				g.setColor(new Color(0, 0, 255, _fillAlpha));
+				g.fillPolygon(xpoints, scaled, scaled.length);
+				g.setColor(new Color(0, 0, 255, 255));
+				g.drawPolyline(xpoints, scaled, scaled.length);
+				for(int i=0; i<hist.length; i++) {
+					scaled[i+1] = maxHeight*hist[i][2];
+					scaled[i+1] /= highestNum;
+					scaled[i+1] = _offsetY+_topSpace+maxHeight - scaled[i+1];
+				}
+				g.setColor(new Color(0, 150, 0, _fillAlpha));
+				g.fillPolygon(xpoints, scaled, scaled.length);
+				g.setColor(new Color(0, 150, 0, 255));
+				g.drawPolyline(xpoints, scaled, scaled.length);
+				g.setColor(Color.BLACK);
+				g.drawRect(_offsetX, _offsetY, _histWidth, _histHeight);
+			}
+			
+			public Dimension getPreferredSize() {
+				return new Dimension(800, 800);
+			}
+		};
+		return res;
+	}
+
+}
diff --git a/RAMBOK.py b/RAMBOK.py
new file mode 100644
index 0000000..70d5eda
--- /dev/null
+++ b/RAMBOK.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+import os
+import argparse
+import simulate_reads
+import plot
+import shutil
+import sys
+
+
+def copyhead(unassigned, amount, outfile):
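+    # Copy the first `amount` FASTQ records (4 lines per record) from the unassigned
+    # read file into a subsample file.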
+    infile = open(unassigned)
+    outfile = open(outfile, 'w')
+    i = 0
+    while i < amount * 4:
+        outfile.write(infile.readline())
+        i += 1
+    outfile.close()
+
+
+def simulation(reffile1, reffile2, name1, name2, temppath, real_readfiles, gapsize, amount, chunksize, se):
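+    # Step 1: simulate reads from both references (skipped if the files already exist)
+    # and write a subsample of the first real reads for later classification.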
+    print('Step 1/5')
+    if not se:
+        if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta')) and not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_2.fasta')):
+            simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
+        else:
+            print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
+    else:
+        if not os.path.isfile(os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta')):
+            simulate_reads.main(reffile1, reffile2, name1, name2, temppath, real_readfiles, se, amount, gapsize, chunksize)
+        else:
+            print('Simulated reads of %s and %s exist. Skipping.' % (name1, name2))
+    if not se:
+        if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq')):
+            copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq'))
+            copyhead(real_readfiles[1], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq'))
+    else:
+        if not os.path.isfile(os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq')):
+            copyhead(real_readfiles[0], amount, os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq'))
+
+
+def training(kmers, name1, name2, temppath, threads, amount, se, skriptpath):
+    print('Step 2/5')
+    print('Calculating transition matrix')
+    if not se:
+        simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_1.fasta')
+        simulated12 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '_2.fasta')
+        simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_1.fasta')
+        simulated22 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '_2.fasta')
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -a2 ' + simulated12 + ' -b1 ' + simulated21 + ' -b2 ' + simulated22 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
+            else:
+                print('Transition matrix for k = %i exists. Skipping.' % k)
+    else:
+        simulated11 = os.path.join(temppath, 'simulated_reads_' + name1 + '_' + str(amount) + '.fasta')
+        simulated21 = os.path.join(temppath, 'simulated_reads_' + name2 + '_' + str(amount) + '.fasta')
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'trainer.jar') + ' -a1 ' + simulated11 + ' -b1 ' + simulated21 + ' -na ' + name1 + ' -nb ' + name2 + ' -o ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -T -t ' + str(threads) + ' -k ' + str(k))
+            else:
+                print('Transition matrix for k = %i exists. Skipping.' % k)
+
+
+def assign_simulated(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
+    print('Step 3/5')
+    print('Assigning subset of simulated reads')
+
+    if not se:
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_1.fasta') + ' -2 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '_2.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath [...]
+            else:
+                print('Assigned simulated reads for k = %i exist. Skipping' % k)
+    else:
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'simulated_reads_' + name1 + '_and_' + name2 + '_' + str(amount) + '.fasta') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta') + ' -t 1')
+            else:
+                print('Assigned simulated reads for k = %i exist. Skipping' % k)
+
+
+def assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath):
+    print('Step 4/5')
+    print('Assigning subset of real reads')
+    if not se:
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_1.fastq') + ' -2 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '_2.fastq') + ' -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_PE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amoun [...]
+            else:
+                print('Assigned reads for k = %i exist. Skipping.' % k)
+    else:
+        for k in kmers:
+            if not os.path.isfile(os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq')):
+                os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + os.path.join(temppath, 'unassigned_subsample_' + str(amount) + '.fastq') + ' -d ' + os.path.join(
+                    temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(k) + '_' + str(amount) + '_SE.txt') + ' -c MMClassifier -n 100 -o1 ' + os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq') + ' -t ' + str(threads))
+            else:
+                print('Assigned reads for k = %i exist. Skipping.' % k)
+
+
+def do_plots(kmers, outpath, temppath, name1, name2, amount, se, filetype):
+    if not se:
+        for k in kmers:
+            if not os.path.isfile(
+                    os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
+                    os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_PE.png')) or not os.path.isfile(
+                    os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_PE.png')):
+                plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '_1.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '_1.fastq'), amount, se, filetype)
+            else:
+                print('Plots for k = %i exist. Skipping' % k)
+
+    else:
+        for k in kmers:
+            if not os.path.isfile(os.path.join(outpath, 'ROCplot_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'score_histogram_k' + str(k) + '_' + str(amount) + '_SE.png')) or not os.path.isfile(os.path.join(outpath, 'fitted_histograms_k' + str(k) + '_' + str(amount) + '_SE.png')):
+                plot.main(k, outpath, name1, name2, os.path.join(temppath, 'classified_simulated_reads_k' + str(k) + '_' + str(amount) + '.fasta'), os.path.join(temppath, 'classified_real_reads_k' + str(k) + '_' + str(amount) + '.fastq'), amount, se, filetype)
+            else:
+                print('Plots for k = %i exist. Skipping.' % k)
+
+
+def assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1, name2, filetype, skriptpath):
+    if cutoff_higher:
+        if not se:
+            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str [...]
+        else:
+            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name1 + '_cutoff_' + str(cutoff_higher) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -ch ' + str(cutoff_higher))
+    elif cutoff_lower:
+        if not se:
+            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -2 ' + unassigned2 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_PE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '_1.' + str(filetype)) + ' -o2 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(k [...]
+        else:
+            os.system('java -jar ' + os.path.join(skriptpath, 'classifier.jar') + ' -1 ' + unassigned1 + ' -c MMClassifier -n 100 -d ' + os.path.join(temppath, 'transition_matrix_' + name1 + '_' + name2 + '_k' + str(kmers[0]) + '_' + str(amount) + '_SE.txt') + ' -o1 ' + os.path.join(outpath, name2 + '_cutoff_' + str(cutoff_lower) + '_k_' + str(kmers[0]) + '.' + str(filetype)) + ' -cl ' + str(cutoff_lower))
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--reffile1', help='Reference file of species 1 in fasta-format. Should pair with name1', required=True)
+    parser.add_argument('-R', '--reffile2', help='Reference file of species 2 in fasta-format. Should pair with name2', required=True)
+    parser.add_argument('-n', '--name1', help='Name of species 1', default='species1', required=False)
+    parser.add_argument('-N', '--name2', help='Name of species 2', default='species2', required=False)
+    parser.add_argument('-1', '--unassigned1', help='Fasta- or fastq-file containing mixed reads.', required=True)
+    parser.add_argument('-2', '--unassigned2', help='Fasta- or fastq-file containing mixed reads, only required in paired end mode.', default=None, required=False)
+    parser.add_argument('-k', '--kmersizes', help='Order of Markov-Chain/kmer length. Enter as range (e.g. 4:8) or list (e.g. 4,6,8) or integer (e.g. 8). Default = 8', default='8', required=False)
+    parser.add_argument('-o', '--outpath', help='Folder to write results to. Default = $name1_$name2/ in your working directory', required=False)
+    parser.add_argument('-a', '--amount', help='Number of reads to be simulated, default = 50000', default=50000, required=False)
+    parser.add_argument('-t', '--threads', help='Number of threads to use', default=1, required=False)
+    parser.add_argument('-x', '--chunksize', help='Size of chunks created at a time for simulation, default = 100000. Only change if you know what you are doing!', default=100000, required=False)
+    parser.add_argument('-g', '--gapsize', help='Estimated gap size in case of paired end reads, default = 1', default=1, required=False)
+    parser.add_argument('-c', '--cutoff_lower', help='Lower cutoff: Output only reads with a score lower than or equal to this value, use m1 for -1')
+    parser.add_argument('-C', '--cutoff_higher', help='Higher cutoff: Output only reads with a score higher than or equal to this value, use m1 for -1')
+    parser.add_argument('-d', '--delete_temp', help='Delete temporary files. Calculations will start from beginning next time.', action='store_true', default=False)
+    parser.add_argument('-f', '--filetype', help='Type of your input reads. fasta or fastq, default = fastq', default='fastq', required=False)
+    args = parser.parse_args()
+    name1 = args.name1
+    name2 = args.name2
+    if not args.outpath:
+        outpath = os.path.join(os.getcwd(), str(name1) + '_' + str(name2))
+    else:
+        outpath = args.outpath
+        outpath = os.path.join(os.getcwd(), outpath)
+    temppath = os.path.join(outpath, 'temp')
+    graphicspath = os.path.join(outpath, 'graphics')
+    skriptpath = os.path.dirname(os.path.realpath(sys.argv[0]))
+    if not os.path.exists(temppath):
+        os.makedirs(temppath)
+    if not os.path.exists(graphicspath):
+        os.makedirs(graphicspath)
+    amount = int(args.amount)
+    kmersarg = args.kmersizes
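+    # Accepted forms for --kmersizes: '4:8' -> range(4, 9), '4,6,8' -> [4, 6, 8], '8' -> [8]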
+    if ':' in kmersarg:
+        kmers = range(int(kmersarg.split(':')[0]), int(kmersarg.split(':')[1]) + 1)
+    elif ',' in kmersarg:
+        kmers = []
+        for each in kmersarg.split(','):
+            kmers.append(int(each))
+    else:
+        kmers = [int(kmersarg)]
+    threads = int(args.threads)
+    chunksize = int(args.chunksize)
+    gapsize = int(args.gapsize)
+    reffile1 = args.reffile1
+    reffile2 = args.reffile2
+    unassigned1 = args.unassigned1
+    unassigned2 = args.unassigned2
+    cutoff_lower = args.cutoff_lower
+    cutoff_higher = args.cutoff_higher
+    filetype = args.filetype
+    if not unassigned2:
+        se = True
+    else:
+        se = False
+
+    ###Step1: Read Simulation
+    simulation(reffile1, reffile2, name1, name2, temppath, [unassigned1, unassigned2], gapsize, amount, chunksize, se)
+
+    ###Step2: Training
+    training(kmers, name1, name2, temppath, threads, amount, se, skriptpath)
+
+    ###Step3: Assign simulated reads
+    assign_simulated(kmers, temppath, name1, name2, 1, amount, se, skriptpath)
+
+    ###Step4: Assign real reads
+    assign_real(kmers, temppath, name1, name2, threads, amount, se, skriptpath)
+
+    ###Step5: Generate Graphics:
+    do_plots(kmers, graphicspath, temppath, name1, name2, amount, se, filetype)
+
+    ###Final Step: Assign all reads
+
+    if ((cutoff_higher and not cutoff_lower) or (not cutoff_higher and cutoff_lower)) and len(kmers) == 1:
+        print('Assigning reads. This may take a while.')
+        assign_all(cutoff_higher, cutoff_lower, kmers, amount, unassigned1, unassigned2, se, temppath, outpath, name1,
+                   name2, filetype, skriptpath)
+    else:
+        print('To assign your reads, please choose a cutoff as well as one kmer-size.')
+
+    if args.delete_temp:
+        shutil.rmtree(os.path.join(temppath))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ReadClassifier_eclipse/.classpath b/ReadClassifier_eclipse/.classpath
new file mode 100644
index 0000000..93d6cb1
--- /dev/null
+++ b/ReadClassifier_eclipse/.classpath
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="lib/commons-cli-1.2.jar"/>
+	<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/ReadTrainer"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/ReadClassifier_eclipse/.project b/ReadClassifier_eclipse/.project
new file mode 100644
index 0000000..d88232e
--- /dev/null
+++ b/ReadClassifier_eclipse/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ReadClassifier</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/ReadClassifier_eclipse/classifier.log b/ReadClassifier_eclipse/classifier.log
new file mode 100644
index 0000000..d42d177
--- /dev/null
+++ b/ReadClassifier_eclipse/classifier.log
@@ -0,0 +1 @@
+INFO - Found kmer size: 4
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/AbstractWriter.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/AbstractWriter.java
new file mode 100644
index 0000000..470dd32
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/AbstractWriter.java
@@ -0,0 +1,7 @@
+package org.rki.readclassifier;
+
+import java.util.concurrent.BlockingQueue;
+
+public abstract class AbstractWriter extends Thread {
+	public BlockingQueue<ReadPair> outReads;
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/Classifier.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/Classifier.java
new file mode 100644
index 0000000..6b68a1a
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/Classifier.java
@@ -0,0 +1,233 @@
+package org.rki.readclassifier;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.util.LinkedList;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+
+
+public class Classifier {
+	
+	public enum Cutoff {NONE, LOWER, HIGHER}
+	
+//	public static Logger logger = Logger.getLogger(Classifier.class.getName());
+	public static KmerDistribution[] kmersOrg1;
+	public static KmerDistribution[] kmersOrg2;
+	public static int kmersize;
+	public static long startTimeMillis;
+	
+	public static boolean showTime = false;
+	public static int threads = 4;
+	public static int inChunksize = 500;
+	
+	public static int normalizeTo = -1;
+	
+	public static double cutoffLower = 0.0;
+	public static double cutoffHigher = 0.0;
+	
+	public static Cutoff useCutoff = Cutoff.NONE; 
+	
+	public static FileInputStream in1 = null;
+	public static FileInputStream in2 = null;
+
+	public static FileInputStream trainin = null;
+
+	public static FileOutputStream out1 = null;
+	public static FileOutputStream out2 = null;
+	
+	public static ReadClassifier[] classifiers = {new DistClassifier(), new MMClassifier()};
+	public static ReadClassifier selectedClassifier = null;
+	
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		try {
+			init();
+			parseOptions(args);
+//			parseKmers();
+			process();
+			finish();
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+
+	public static void finish() {
+		if(showTime) {
+			long currentTime = System.currentTimeMillis();
+			System.out.println("Time taken: " + (currentTime - startTimeMillis) + " ms");
+		}
+	}
+	
+	public static void process() throws Exception {
+		selectedClassifier.readData(trainin);
+		Reader reader = new Reader(in1, in2, inChunksize, 10000, threads);
+		Writer writer = new Writer(out1, out2, 100000);
+		LinkedList<ClassifierThread> threadList = new LinkedList<ClassifierThread>();
+		reader.start();
+		writer.start();
+		while(threadList.size() < threads) {
+			threadList.add(new ClassifierThread(reader, writer, selectedClassifier, null));
+		}
+		for(ClassifierThread c : threadList) {
+			c.start();
+		}
+		reader.join();
+		for(ClassifierThread c : threadList) {
+			c.join();
+		}
+		writer.join();
+	}
+		
+	public static void init() throws Exception {
+		startTimeMillis = System.currentTimeMillis();
+//		FileAppender appender = new FileAppender(new SimpleLayout(), "classifier.log", false);
+//		logger.addAppender(appender);
+//		logger.setLevel(Level.ALL);		
+	}
+	
+	public static void printClassifiers() {
+		for(ReadClassifier classifier : classifiers) {
+			System.out.println("  " + classifier.getName() + ": " + classifier.getDescription());
+		}
+	}
+	
+	public static void parseOptions(String[] args) throws ParseException, FileNotFoundException {
+		Options options = new Options();
+		options.addOption("1", "in_1", true, "Input fastq/fasta file containing the first read");
+		options.addOption("2", "in_2", true, "Input fastq/fasta file containing the second read");
+		options.addOption("c", "classifier", true, "Name of classifier to use");
+		options.addOption("o1", "out_1", true, "Output fastq file for the first read");
+		options.addOption("o2", "out_2", true, "Output fastq file for the second read");
+		options.addOption("cl", "cutoff_lower", true, "Output only reads with a score lower than or equal to this value, use m1 for -1");
+		options.addOption("ch", "cutoff_higher", true, "Output only reads with a score higher than or equal to this value, use m1 for -1");
+		options.addOption("n", "normalize_scores", true, "Normalize scores to this length (default: Do not normalize)");
+		options.addOption("d", "datafile", true, "File containing data from training");
+		options.addOption("t", "threads", true, "Number of threads");
+		options.addOption("C", "inchunksize", true, "Input chunk size (don't touch this unless you know what you are doing)");
+		options.addOption("T", "show_time", false, "Show processing time");
+		options.addOption("l", "list_classifiers", false, "List available classifiers");
+		options.addOption("h", "help", false, "Print usage instructions");
+		
+		CommandLineParser parser = new GnuParser();
+		CommandLine cmd = parser.parse(options, args);
+		
+		if(cmd.hasOption("h")) {
+			HelpFormatter formatter = new HelpFormatter();
+			formatter.printHelp("java -jar classifier.jar", options);
+			System.exit(0);
+		}
+		if(cmd.hasOption("l")) {
+			printClassifiers();
+			System.exit(0);
+		}
+		
+		if(!cmd.hasOption("c")) {
+			System.out.println("Please provide a classifier to use. Available classifiers:");
+			printClassifiers();
+			System.exit(1);
+		}
+		else {
+			for(ReadClassifier classifier : classifiers) {
+				if(classifier.getName().equals(cmd.getOptionValue("c"))) {
+					selectedClassifier = classifier;
+					break;
+				}
+			}
+			if(selectedClassifier == null) {
+				System.out.println("Invalid classifier selected. Available classifiers:");
+				printClassifiers();
+			}
+		}
+		if(!cmd.hasOption("d")) {
+			System.out.println("Data from training has to be provided.");
+			System.exit(1);
+		}
+		if(!cmd.hasOption("1")) {
+			System.out.println("Input fastq for the first read must be provided.");
+			System.exit(1);
+		}
+		if(!cmd.hasOption("o1")) {
+			System.out.println("Output fastq for the first read must be provided.");
+			System.exit(1);
+		}
+		if((cmd.hasOption("o2") && !cmd.hasOption("2"))) {
+			System.out.println("If output fastq for the second read is provided, input fastq for the second read must also be provided.");
+			System.exit(1);
+		}
+		if((!cmd.hasOption("o2") && cmd.hasOption("2"))) {
+			System.out.println("If input fastq for the second read is provided, output fastq for the second read must also be provided.");
+			System.exit(1);
+		}
+		trainin = new FileInputStream(new File(cmd.getOptionValue("d")));
+		in1 = new FileInputStream(new File(cmd.getOptionValue("1")));
+		out1 = new FileOutputStream(new File(cmd.getOptionValue("o1")));
+		if(cmd.hasOption("2")) {
+			in2 = new FileInputStream(new File(cmd.getOptionValue("2")));
+			out2 = new FileOutputStream(new File(cmd.getOptionValue("o2")));			
+		}
+		if(cmd.hasOption("t")) {
+			threads = Integer.parseInt(cmd.getOptionValue("t"));
+		}
+		if(cmd.hasOption("C")) {
+			inChunksize = Integer.parseInt(cmd.getOptionValue("C"));
+		}
+		if(cmd.hasOption("cl")) {
+			String val = cmd.getOptionValue("cl");
+			if(val.charAt(0) == 'm') 
+				cutoffLower = -1*Double.parseDouble(val.substring(1));				
+			else
+				cutoffLower = Double.parseDouble(val);				
+			useCutoff = Cutoff.LOWER;
+		}
+		if(cmd.hasOption("ch")) {
+			String val = cmd.getOptionValue("ch");
+			if(val.charAt(0) == 'm') 
+				cutoffHigher = -1*Double.parseDouble(val.substring(1));				
+			else
+				cutoffHigher = Double.parseDouble(val);				
+			useCutoff = Cutoff.HIGHER;
+		}
+		if(cmd.hasOption("n")) {
+			normalizeTo = Integer.parseInt(cmd.getOptionValue("n"));
+			if(normalizeTo <= 0) {
+				System.out.println("Length for normalization must be an integer > 0.");
+				System.exit(1);
+			}
+		}
+		showTime = cmd.hasOption("T");
+	}
+		
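+	// Encodes a k-mer as an integer using two bits per base (A=00, C=01, G=10, T=11); unknown bases are treated as A.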
+	public static int calculateVal(char[] kmer) {
+		int res = 0;
+		for(char base : kmer) {
+			res = res << 2;
+			switch (base) {
+			case 'A':
+				break;
+			case 'C':
+				res |= 1;
+				break;
+			case 'G':
+				res |= 2;
+				break;
+			case 'T':
+				res |= 3;
+				break;
+			default:
+				break;
+			}
+		}
+		return res;
+	}
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierProgressListener.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierProgressListener.java
new file mode 100644
index 0000000..8fb5c62
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierProgressListener.java
@@ -0,0 +1,5 @@
+package org.rki.readclassifier;
+
+public interface ClassifierProgressListener {
+	public void readClassified();
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierThread.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierThread.java
new file mode 100644
index 0000000..ed0a44d
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/ClassifierThread.java
@@ -0,0 +1,45 @@
+package org.rki.readclassifier;
+
+import org.rki.readtrainer.AbstractReader;
+
+public class ClassifierThread extends Thread {
+
+	private AbstractReader reader;
+	private AbstractWriter writer;
+	private ReadClassifier readClassifier;
+	private ClassifierProgressListener prog;
+
+	public ClassifierThread(AbstractReader reader, AbstractWriter writer, ReadClassifier readClassifier,
+			ClassifierProgressListener prog) {
+		super();
+		this.reader = reader;
+		this.writer = writer;
+		this.readClassifier = readClassifier;
+		this.prog = prog;
+	}
+
+	public void run() {
+		try {
+			while (true) {
+				ReadPair reads = reader.reads.take();
+				if (reads.read1 == null) {
+					writer.outReads.put(new ReadPair(null, null));
+					return;
+				}
+				reads.read1.bases = reads.read1.bases.toUpperCase();
+				if (reads.read2 != null)
+					reads.read2.bases = reads.read2.bases.toUpperCase();
+				readClassifier.getScore(reads);
+				reads.read1.toString();
+				if (reads.read2 != null)
+					reads.read2.toString();
+				writer.outReads.put(reads);
+				if (prog != null) {
+					prog.readClassified();
+				}
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+}
\ No newline at end of file
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/DistClassifier.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/DistClassifier.java
new file mode 100644
index 0000000..bfae508
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/DistClassifier.java
@@ -0,0 +1,85 @@
+package org.rki.readclassifier;
+
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+public class DistClassifier implements ReadClassifier {
+	private KmerDistribution[] kmers1;
+	private KmerDistribution[] kmers2;
+	public String org1name = "";
+	public String org2name = "";
+	private int kmersize;
+	
+	public String getName() {
+		return "DistClassifier";
+	}
+	
+	public String getDescription() {
+		return "Classifies reads based on distance between Kmer frequency distributions.";
+	}
+	
+	public void readData(InputStream in) throws Exception {
+		BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+		String line = reader.readLine();
+		String[] dat = line.split("\\s+");
+		if(!dat[0].equals("gaussian_dists")) {
+			throw new Exception("Error: Wrong data file type. Found " + dat[0] + ", expected gaussian_dists");
+		}
+		kmersize = Integer.parseInt(dat[1].split("=")[1]);
+		kmers1 = new KmerDistribution[(int)Math.pow(4, kmersize)];
+		kmers2 = new KmerDistribution[(int)Math.pow(4, kmersize)];
+//		Classifier.logger.log(Level.INFO, "Found kmer size: " + kmersize);
+		org1name = dat[2].split("=")[1].trim();
+		int currDist = 0;
+		while(reader.ready()) {
+			dat = reader.readLine().split("\\s+");
+			if(dat[0].equals("gaussian_dists")) {
+				currDist += 1;
+				org2name = dat[2].split("=")[1].trim();
+				continue;
+			}
+			double mean = Double.parseDouble(dat[1]);
+			double sd = Double.parseDouble(dat[2]);
+			int val = Classifier.calculateVal(dat[0].toCharArray());
+			if(currDist == 0)
+				kmers1[val] = new KmerDistribution(mean, sd);
+			else
+				kmers2[val] = new KmerDistribution(mean, sd);
+		}
+		reader.close();
+	}
+	
+	public void getScore(ReadPair reads) {
+		getReadScore(reads.read1);
+		if(reads.read2 != null)
+			getReadScore(reads.read2);
+	}
+	
+	private void getReadScore(Read read) {
+		char[] bases = read.bases.toCharArray();
+		read.score = 0;
+		read.score1 = 0;
+		read.score2 = 0;
+		double len = bases.length;
+		int[] kmervals = new int[kmers1.length]; 
+		char[] kmer = new char[kmersize];
+		for(int i=0; i<bases.length-kmersize; i++) {
+			for(int j=0; j<kmersize; j++) {
+				kmer[j] = bases[i+j];
+			}
+			kmervals[Classifier.calculateVal(kmer)] += 1;
+		}
+		for(int i=0; i<kmervals.length; i++) {
+			double val = kmervals[i]/(double)(len-kmersize+1);
+			double score1 = kmers1[i].valueAt(val);
+			double score2 = kmers2[i].valueAt(val);
+			if(score1 + score2 != 0) {
+				double fact = 100.0/(score1+score2);
+				read.score += fact*(score1-score2);				
+			}
+			read.score1 += score1;
+			read.score2 += score2;
+		}				
+	}
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/KmerDistribution.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/KmerDistribution.java
new file mode 100644
index 0000000..5d71a56
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/KmerDistribution.java
@@ -0,0 +1,20 @@
+package org.rki.readclassifier;
+
+public class KmerDistribution {
+	public double mean;
+	public double sd;
+	public double zeroval;
+	
+	private double factor;
+	
+	public KmerDistribution(double mean, double sd) {
+		this.mean = mean;
+		this.sd = sd;
+		factor = 1.0/(sd*Math.sqrt(2*Math.PI));
+		zeroval = valueAt(0);
+	}
+
+	public double valueAt(double x) {
+		return x == 0?zeroval:factor*Math.exp(-1*(x-mean)*(x-mean)/(2*sd*sd));
+	}
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/MMClassifier.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/MMClassifier.java
new file mode 100644
index 0000000..024dd6b
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/MMClassifier.java
@@ -0,0 +1,212 @@
+package org.rki.readclassifier;
+
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.rki.readtrainer.Trainer;
+
+public class MMClassifier implements ReadClassifier {
+	private double tmorg1[][]; 
+	private double tmorg2[][];
+	private int kmersize;
+	private int kmermask = 0;
+	
+	public String org1name = "";
+	public String org2name = "";
+	
+	@Override
+	public String getName() {
+		return "MMClassifier";
+	}
+
+	@Override
+	public String getDescription() {
+		return("Classifies reads based on Markov Model of Kmer transitions.");
+	}
+
+	public void setMatrices(AtomicInteger[][] matrix1, AtomicInteger[][] matrix2, String o1name, String o2name, int kmersize) {
+		this.kmersize = kmersize;
+		kmermask = (int)Math.pow(4, kmersize) - 1;
+		tmorg1 = new double[(int)Math.pow(4, kmersize)][4];
+		tmorg2 = new double[(int)Math.pow(4, kmersize)][4];
+		setMatrix(matrix1, tmorg1);
+		setMatrix(matrix2, tmorg2);
+		org1name = o1name;
+		org2name = o2name;
+	}
+
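+	// Converts raw transition counts into log(4 * count / rowsum), substituting Trainer.absentval for zero counts; rows with no observations become NaN.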
+	private void setMatrix(AtomicInteger[][] matrix, double[][] tm) {
+		for(int i=0; i<matrix.length; i++) {
+			int rowsum = 0;
+			for(int j=0; j<4; j++) {
+				rowsum += matrix[i][j].get();
+			}
+			if(rowsum == 0) {
+				for(int j=0; j<4; j++) {
+					tm[i][j] = Double.NaN;
+				}
+				continue;
+			}
+			for(int j=0; j<4; j++) {
+				double val = (double)matrix[i][j].get();
+				if(val == 0)
+					val = Trainer.absentval;
+				val = val/(double)rowsum*4.0d;
+				val = Math.log(val);
+				tm[i][j] = val;
+			}
+		}		
+
+	}
+	
+	@Override
+	public void readData(InputStream in) throws Exception {
+		BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+		String line = reader.readLine();
+		String[] dat = line.split("\\s+");
+		if(!dat[0].equals("transition_matrix")) {
+			throw new Exception("Error: Wrong data file type. Found " + dat[0] + ", expected transition_matrix");
+		}
+		kmersize = Integer.parseInt(dat[1].split("=")[1]);
+		kmermask = (int)Math.pow(4, kmersize) - 1;
+		tmorg1 = new double[(int)Math.pow(4, kmersize)][4];
+		tmorg2 = new double[(int)Math.pow(4, kmersize)][4];
+//		Classifier.logger.log(Level.INFO, "Found kmer size: " + kmersize);
+		org1name = dat[2].split("=")[1].trim();
+		int currDist = 0;
+		int numline = 0;
+		while(reader.ready()) {
+			dat = reader.readLine().split("\\s+");
+			if(dat[0].equals("transition_matrix")) {
+				currDist += 1;
+				numline = 0;
+				org2name = dat[2].split("=")[1].trim();
+				continue;
+			}
+			int pos=0;
+			for(String val : dat) {
+				double dval;
+				val = val.trim();
+				if(val.equals("None")) {
+					dval = Double.NaN;
+				}
+				else {
+					dval = Double.parseDouble(val);
+				}
+				if(currDist == 0)
+					tmorg1[numline][pos] = dval;
+				else
+					tmorg2[numline][pos] = dval;
+				pos += 1;
+			}
+			numline += 1;
+		}
+		reader.close();	
+	}
+
+	@Override
+	public void getScore(ReadPair reads) {
+		if(reads.read1.bases.length() < kmersize) {
+			System.out.println("Error: Read with length < kmer size found: ");
+			System.out.println(">" + reads.read1.name);
+			System.out.println(">" + reads.read1.bases);
+			System.exit(1);
+		}
+		getReadScore(reads.read1);
+		if(reads.read2 != null) {
+			if(reads.read2.bases.length() < kmersize) {
+				System.out.println("Error: Read with length < kmer size found: ");
+				System.out.println(">" + reads.read2.name);
+				System.out.println(">" + reads.read2.bases);
+				System.exit(1);
+			}
+			getReadScore(reads.read2);
+			reads.read2.score1 += reads.read1.score1;
+			reads.read2.score2 += reads.read1.score2;
+			reads.read1.score1 = reads.read2.score1;
+			reads.read1.score2 = reads.read2.score2;
+			reads.read1.score += reads.read2.score;
+			reads.read2.score = reads.read1.score;
+		}
+	}
+	
+	private int[] getKmerval(char[] bases, int start) {
+		int[] res = {0, 0};
+		int i=0;
+		int kmerval = 0;
+		while(start+i<bases.length && i<kmersize) {
+			int nextbase = getBaseVal(bases[start+i]);
+			if(nextbase == -1) {
+				start += i + 1;
+				i = 0;
+				kmerval = 0;
+				continue;
+			}
+			kmerval = kmerval << 2;
+			kmerval |= nextbase;
+			i++;
+		}
+		if(i < kmersize) {
+			return null;
+		}
+		res[0] = kmerval;
+		res[1] = start + i;
+		return res;
+	}
+	
+	private void getReadScore(Read read) {
+		char[] bases = read.bases.toCharArray();
+		read.score = 0;
+		read.score1 = 0;
+		read.score2 = 0;
+		double len = bases.length;
+		int[] initialKmer = getKmerval(bases, 0);
+		if(initialKmer == null) {
+			read.invalid = true;
+			return;
+		}
+		int kmerval = initialKmer[0];
+		int i = initialKmer[1];
+		for(; i<len; i++) {
+			int nextbase = getBaseVal(bases[i]);
+			if(nextbase == -1) {
+				initialKmer = getKmerval(bases, i+1);
+				if(initialKmer == null)
+					break;
+				kmerval = initialKmer[0];
+				i = initialKmer[1];
+				continue;
+			}
+			double ds1 = tmorg1[kmerval][nextbase];
+			double ds2 = tmorg2[kmerval][nextbase];
+			if(!Double.isNaN(ds1) && !Double.isNaN(ds2)) {
+				read.score1 += ds1;
+				read.score2 += ds2;
+//				read.score1 *= ds1;
+//				read.score2 *= ds2;	
+			}
+			kmerval = kmerval << 2;
+			kmerval |= nextbase;
+			kmerval &= kmermask;
+		}
+		read.score = read.score1 - read.score2;
+		read.normalizeTo(Classifier.normalizeTo);
+	}
+
+	private int getBaseVal(char base) {
+		switch (base) {
+		case 'A':
+			return 0;
+		case 'C':
+			return 1;
+		case 'G':
+			return 2;
+		case 'T':
+			return 3;
+		default:
+			return -1;
+		}		
+	}	
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/Read.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/Read.java
new file mode 100644
index 0000000..9ce25b7
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/Read.java
@@ -0,0 +1,87 @@
+package org.rki.readclassifier;
+
+import java.util.LinkedList;
+
+public class Read {
+	public boolean invalid = false;
+	public String name;
+	public String bases;
+	public String qualities;
+	public byte[] intQualities = null;
+	public double score = 0;
+	public double score1 = 0;
+	public double score2 = 0;
+        public String fastq = null;
+        public String fasta = null;
+	
+	public Read(String name, String bases, String qualities) {
+		this.name = name;
+		this.bases = bases;
+		this.qualities = qualities;
+		this.score1 = 0;
+		this.score2 = 0;
+	}
+	
+	public Read(String name, String bases, byte[] qualities, boolean intQuals) {
+		this.name = name;
+		this.bases = bases;
+		this.intQualities = qualities;
+		this.score1 = 0;
+		this.score2 = 0;
+	}	
+	
+	public String toString(boolean showQuals) {
+		if(intQualities == null)
+			return "";
+		if(qualities != null)
+			return toFastq(showQuals);
+		return toFasta(showQuals);
+	}
+     
+	public void normalizeTo(int length) {
+		if(length == -1)
+			return;
+		float fact = (float)length/(float)bases.length(); 
+		score *= fact;
+		score1 *= fact;
+		score2 *= fact;
+	}
+	
+	public String toFasta(boolean showQuals) {
+        if(fasta == null) {
+			StringBuilder res = new StringBuilder();
+			res.append(name);
+			if(showQuals) {
+				res.append("_");
+				res.append(score1);
+				res.append("_");
+				res.append(score2);
+            }
+			res.append("\n");
+			res.append(bases);
+			res.append("\n");
+            fasta = res.toString();
+        }
+        return fasta;
+	}
+	
+	public String toFastq(boolean showQuals) {
+        if(fastq == null) {
+			StringBuilder res = new StringBuilder();
+			res.append(name);
+			if(showQuals) {
+				res.append("_");
+				res.append(score1);
+				res.append("_");
+				res.append(score2);
+			}
+			res.append("\n");
+			res.append(bases);
+			res.append("\n+\n");
+			res.append(qualities);
+			res.append("\n");
+			fastq = res.toString();
+        }
+        return fastq;
+	}
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadClassifier.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadClassifier.java
new file mode 100644
index 0000000..827c629
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadClassifier.java
@@ -0,0 +1,10 @@
+package org.rki.readclassifier;
+
+import java.io.InputStream;
+
+public interface ReadClassifier {
+	public String getName();
+	public String getDescription();
+	public void readData(InputStream in) throws Exception;
+	public void getScore(ReadPair reads);
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadPair.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadPair.java
new file mode 100644
index 0000000..489ce4d
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/ReadPair.java
@@ -0,0 +1,11 @@
+package org.rki.readclassifier;
+
+public class ReadPair {
+	public Read read1;
+	public Read read2;
+	
+	public ReadPair(Read read1, Read read2) {
+		this.read1 = read1;
+		this.read2 = read2;
+	}
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/Reader.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/Reader.java
new file mode 100644
index 0000000..7e4c489
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/Reader.java
@@ -0,0 +1,135 @@
+package org.rki.readclassifier;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.concurrent.LinkedBlockingQueue;
+
+public class Reader extends AbstractReader {
+	private InputStream in1;
+	private InputStream in2;
+	private int numReads;
+	private String lastLine1 = "";
+	private String lastLine2 = "";
+	private BufferedReader r1;
+	private BufferedReader r2;
+	private boolean isFastq = true;
+	private boolean firstRead1 = true;
+	private boolean firstRead2 = true;
+	private int numThreads;
+	
+	public Boolean fileDone = false;
+    public boolean stop = false;
+	
+	public Reader(InputStream infile1, InputStream infile2, int numReads, int bufferSize, int numThreads) {
+		super();
+		this.in1 = infile1;
+		this.in2 = infile2;
+		this.numReads = numReads;
+		this.numThreads = numThreads;
+		reads = new LinkedBlockingQueue<ReadPair>(numReads);
+		r1 = new BufferedReader(new InputStreamReader(in1), bufferSize);
+		r2 = in2==null?null:new BufferedReader(new InputStreamReader(in2));
+	}
+	
+        public void run() {
+            while(!fileDone) {
+                readMore();
+            }
+            try {
+            	for(int i=0; i<numThreads; i++) {
+                    reads.put(new ReadPair(null, null));
+            	}
+                } catch(Exception e) {
+                	e.printStackTrace();
+                }
+        }
+        
+	public void readMore() {
+		if(fileDone)
+			return;
+		try {		
+			int readsRead = 0;
+			while(r1.ready()) {
+				Read read1;
+				read1 = readRead(r1, 1);
+				Read read2 = readRead(r2, 2);
+				reads.put(new ReadPair(read1, read2));
+				readsRead += 1;
+				if(readsRead >= numReads) {
+					return;
+				}
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+		fileDone = true;
+	}
+	
+	private Read readRead(BufferedReader reader, int numRead) throws IOException {
+		if(reader == null)
+			return null;
+		String name = "";
+		StringBuilder bases = new StringBuilder();
+		StringBuilder qualities = new StringBuilder();
+		String line = "";
+		int numLines = 0;
+		if((firstRead1 && numRead == 1) || (firstRead2 && numRead == 2)) {
+			line = reader.readLine();
+			if(line.charAt(0) == '>') {
+				isFastq = false;
+				if(numRead == 1) {
+					lastLine1 = line;
+					firstRead1 = false;
+				}
+				else if(numRead == 2) {
+					lastLine2 = line;
+					firstRead2 = false;
+				}
+			}
+			else if(line.charAt(0) == '@') {
+				isFastq = true;
+				name = line.trim();
+			}
+			else
+				throw new IOException("Read file has to be in FASTA or FASTQ format.");
+		}
+		if(isFastq) {
+			if((numRead == 1 && !firstRead1) || (numRead == 2 && !firstRead2))
+				name = reader.readLine().trim();
+			while(true) {
+				line = reader.readLine();
+				if(line.charAt(0) == '+')
+					break;
+				numLines += 1;
+				bases.append(line.trim());
+			}
+			while(numLines > 0) {
+				line = reader.readLine();
+				numLines -= 1;
+				qualities.append(line.trim());
+			}
+			return new Read(name, bases.toString(), qualities.toString());
+		}
+		else {
+			if(numRead == 1)
+				name = lastLine1.trim();
+			else if(numRead == 2)
+				name = lastLine2.trim();
+			while(reader.ready()) {
+				line = reader.readLine();
+				if(line.charAt(0) == '>') {
+					if(numRead == 1)
+						lastLine1 = line;
+					else if(numRead == 2)
+						lastLine2 = line;
+					return new Read(name, bases.toString(), null);
+				}
+				bases.append(line.trim());
+			}
+			return new Read(name, bases.toString(), null);
+		}
+	}
+	
+}
diff --git a/ReadClassifier_eclipse/src/org/rki/readclassifier/Writer.java b/ReadClassifier_eclipse/src/org/rki/readclassifier/Writer.java
new file mode 100644
index 0000000..794c70b
--- /dev/null
+++ b/ReadClassifier_eclipse/src/org/rki/readclassifier/Writer.java
@@ -0,0 +1,71 @@
+package org.rki.readclassifier;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+
+public class Writer extends AbstractWriter {
+	private BufferedWriter out1w;
+	private BufferedWriter out2w;
+        public BlockingQueue<ReadPair> outReads;
+//	public LinkedList<ReadPair> outReads = new LinkedList<ReadPair>();
+	
+	public Writer(OutputStream out1, OutputStream out2, int numReads) {
+                outReads = new LinkedBlockingQueue<ReadPair>(numReads);
+		out1w = new BufferedWriter(new OutputStreamWriter(out1));
+		out2w = out2==null?null:new BufferedWriter(new OutputStreamWriter(out2));
+	}
+	
+        public void run() {
+            while(true) {
+                try {
+                    ReadPair reads = outReads.take();
+                    if(reads.read1 == null)
+                        break;
+                    if(reads.read1.invalid && reads.read2.invalid)
+                    	continue;
+                    if(Classifier.useCutoff == Classifier.Cutoff.HIGHER && reads.read1.score < Classifier.cutoffHigher)
+                    	continue;
+                    else if(Classifier.useCutoff == Classifier.Cutoff.LOWER && reads.read1.score > Classifier.cutoffLower)
+                    	continue;
+                    out1w.write(reads.read1.toString(true));
+                    if(out2w != null && reads.read2 != null)
+                        out2w.write(reads.read2.toString(true));
+                } catch(Exception e) {
+                }
+            }
+            try {
+                out1w.close();
+                if(out2w != null)
+                    out2w.close();
+            } catch(Exception e) {}
+        }
+        
+	public void write() {
+		try {
+			for(ReadPair reads : outReads) {
+				out1w.write(reads.read1.toString(true));
+				if(out2w != null && reads.read2 != null) {
+					out2w.write(reads.read2.toString(true));
+				}
+			}
+		} catch(Exception e) {
+			e.printStackTrace();
+		}
+		outReads.clear();
+	}
+	
+	public void close() {
+		write();
+		try {
+			out1w.close();
+			if(out2w != null)
+				out2w.close();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+}
diff --git a/ReadTrainer_eclipse/.classpath b/ReadTrainer_eclipse/.classpath
new file mode 100644
index 0000000..9f0c200
--- /dev/null
+++ b/ReadTrainer_eclipse/.classpath
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/ReadClassifier"/>
+	<classpathentry kind="lib" path="/ReadClassifier/lib/commons-cli-1.2.jar"/>
+	<classpathentry kind="lib" path="/ReadClassifier/lib/log4j-1.2.15.jar"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/ReadTrainer_eclipse/.project b/ReadTrainer_eclipse/.project
new file mode 100644
index 0000000..adc16b6
--- /dev/null
+++ b/ReadTrainer_eclipse/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ReadTrainer</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/ReadTrainer_eclipse/src/org/rki/readtrainer/AbstractReader.java b/ReadTrainer_eclipse/src/org/rki/readtrainer/AbstractReader.java
new file mode 100644
index 0000000..7c98462
--- /dev/null
+++ b/ReadTrainer_eclipse/src/org/rki/readtrainer/AbstractReader.java
@@ -0,0 +1,9 @@
+package org.rki.readtrainer;
+
+import java.util.concurrent.BlockingQueue;
+
+import org.rki.readclassifier.ReadPair;
+
+public abstract class AbstractReader extends Thread {
+	public BlockingQueue<ReadPair> reads;
+}
diff --git a/ReadTrainer_eclipse/src/org/rki/readtrainer/Trainer.java b/ReadTrainer_eclipse/src/org/rki/readtrainer/Trainer.java
new file mode 100644
index 0000000..175768c
--- /dev/null
+++ b/ReadTrainer_eclipse/src/org/rki/readtrainer/Trainer.java
@@ -0,0 +1,241 @@
+package org.rki.readtrainer;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.LinkedList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.rki.readclassifier.Reader;
+
+public class Trainer {
+	public static boolean showTime;
+	public static long startTimeMillis;
+	public static int kmersize = 4;
+	public static AtomicInteger[][] matrix1;
+	public static AtomicInteger[][] matrix2;
+	
+	public static FileInputStream a1 = null;
+	public static FileInputStream a2 = null;
+	public static FileInputStream b1 = null;
+	public static FileInputStream b2 = null;
+	
+	public static String namea = "";
+	public static String nameb = "";
+	
+	public static FileOutputStream out = null;
+	public static int threads = 4;
+	
+	public static double absentval = 0.0001;
+	
+	public static void main(String[] args) {
+		try {
+			init();
+			parseOptions(args);
+			process();
+			finish();
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+
+	public static void finish() {
+		if(showTime) {
+			long currentTime = System.currentTimeMillis();
+			System.out.println("Time taken: " + (currentTime - startTimeMillis) + " ms");
+		}
+	}
+	
+	public static void process() throws Exception {
+		BufferedWriter outWriter = new BufferedWriter(new OutputStreamWriter(out));
+		matrix1 = new AtomicInteger[(int)Math.pow(4, kmersize)][4];
+		clearMatrix(matrix1);
+		trainSet(a1, a2, matrix1);
+		writeMatrix(outWriter, namea, matrix1);
+		matrix2 = new AtomicInteger[(int)Math.pow(4, kmersize)][4];
+		clearMatrix(matrix2);
+		trainSet(b1, b2, matrix2);
+		writeMatrix(outWriter, nameb, matrix2);
+		outWriter.close();
+		showScore();
+	}
+
+	public static void showScore() {
+		double score = 0;
+		for(int i=0; i<matrix1.length; i++) {
+			int rowsum1 = 0;
+			for(int j=0; j<4; j++) {
+				rowsum1 += matrix1[i][j].get();
+			}
+			int rowsum2 = 0;
+			for(int j=0; j<4; j++) {
+				rowsum2 += matrix2[i][j].get();
+			}
+			for(int j=0; j<4; j++) {
+				score += Math.abs((matrix1[i][j].get()*4.0/(double)rowsum1)-(matrix2[i][j].get()*4.0/(double)rowsum2));
+			}
+		}
+		score = score / Math.pow(4.0d, kmersize+1.0d);
+		System.out.println("Matrix difference (expected quality) score: " + score);
+	}
+	
+	public static void writeMatrix(BufferedWriter writer, String name, AtomicInteger[][] matrix) throws IOException {
+		writer.write("transition_matrix\tk=");
+		writer.write(Integer.toString(kmersize));
+		writer.write("\tspecies_name=");
+		writer.write(name);
+		writer.write("\n");
+		for(int i=0; i<matrix.length; i++) {
+			int rowsum = 0;
+			for(int j=0; j<4; j++) {
+				rowsum += matrix[i][j].get();
+			}
+			if(rowsum == 0) {
+				writer.write("None\tNone\tNone\tNone\n");
+				continue;
+			}
+			for(int j=0; j<4; j++) {
+				double val = (double)matrix[i][j].get();
+				if(val == 0)
+					val = absentval;
+				val = val/(double)rowsum*4.0d;
+				val = Math.log(val);
+				writer.write(Double.toString(val));
+				if(j != 3) {
+					writer.write("\t");
+				}
+			}
+			writer.write("\n");
+		}
+	}
+	
+	public static void clearMatrix(AtomicInteger[][] matrix) {
+		for(int i=0; i<(int)Math.pow(4, kmersize); i++) {
+			for(int j=0; j<4; j++) {
+				matrix[i][j] = new AtomicInteger(0);
+			}
+		}
+	}
+	
+	public static void trainSet(FileInputStream f1, FileInputStream f2, AtomicInteger[][] matrix) throws InterruptedException {
+		Reader reader = new Reader(f1, f2, 10000, 50000, threads);
+		reader.start();
+		LinkedList<TrainerThread> trainers = new LinkedList<TrainerThread>();
+		for(int i=0; i<threads; i++) {
+			trainers.add(new TrainerThread(reader, matrix, null));
+		}
+		for(TrainerThread trainer : trainers) {
+			trainer.start();
+		}
+		for(TrainerThread trainer : trainers) {
+			trainer.join();
+		}
+		reader.join();
+	}
+		
+	public static void init() throws Exception {
+		startTimeMillis = System.currentTimeMillis();
+	}
+		
+	public static void parseOptions(String[] args) throws ParseException, FileNotFoundException {
+		Options options = new Options();
+		options.addOption("a1", "a1", true, "Input fastq/fasta file containing first reads from organism a");
+		options.addOption("a2", "a2", true, "Input fastq/fasta file containing second reads from organism a");
+		options.addOption("b1", "b1", true, "Input fastq/fasta file containing first reads from organism b");
+		options.addOption("b2", "b2", true, "Input fastq/fasta file containing second reads from organism b");		
+		options.addOption("na", "name_a", true, "Name for organism a (default: inferred from first read file name)");
+		options.addOption("nb", "name_b", true, "Name for organism b (default: inferred from first read file name)");
+		options.addOption("o", "datafile", true, "output file");
+		options.addOption("k", "kmer", true, "K-mer size (default: " + kmersize + ")");
+		options.addOption("t", "threads", true, "Number of threads (default: " + threads + ")");
+		options.addOption("T", "show_time", false, "Show processing time");
+		options.addOption("h", "help", false, "Print usage instructions");
+		
+		CommandLineParser parser = new GnuParser();
+		CommandLine cmd = parser.parse(options, args);
+		
+		if(cmd.hasOption("h")) {
+			HelpFormatter formatter = new HelpFormatter();
+			formatter.printHelp("java -jar trainer.jar", options);
+			System.exit(0);
+		}
+		if(!cmd.hasOption("a1")) {
+			System.out.println("Input file for the first read for organism a must be provided.");
+			System.exit(1);
+		}
+		if(!cmd.hasOption("b1")) {
+			System.out.println("Input file for the first read for organism b must be provided.");
+			System.exit(1);
+		}
+		if((cmd.hasOption("a2") && !cmd.hasOption("b2"))) {
+			System.out.println("If the second read file is provided for one organism, it has to also be provided for the other.");
+			System.exit(1);
+		}
+		if((cmd.hasOption("b2") && !cmd.hasOption("a2"))) {
+			System.out.println("If the second read file is provided for one organism, it has to also be provided for the other.");
+			System.exit(1);
+		}
+		
+		if(cmd.hasOption("k")) {
+			kmersize = Integer.parseInt(cmd.getOptionValue("k"));
+		}
+		
+		a1 = new FileInputStream(new File(cmd.getOptionValue("a1")));
+		if(cmd.hasOption("a2"))
+			a2 = new FileInputStream(new File(cmd.getOptionValue("a2")));
+		b1 = new FileInputStream(new File(cmd.getOptionValue("b1")));
+		if(cmd.hasOption("b2"))
+			b2 = new FileInputStream(new File(cmd.getOptionValue("b2")));
+
+		out = new FileOutputStream(new File(cmd.getOptionValue("o")));
+		
+		if(cmd.hasOption("na")) {
+			namea = cmd.getOptionValue("na");
+		} else {
+			namea = new File(cmd.getOptionValue("a1")).getName().split("\\.")[0];
+		}
+		if(cmd.hasOption("nb")) {
+			nameb = cmd.getOptionValue("nb");
+		} else {
+			nameb = new File(cmd.getOptionValue("b1")).getName().split("\\.")[0];
+		}
+		
+		if(cmd.hasOption("t")) {
+			threads = Integer.parseInt(cmd.getOptionValue("t"));
+		}
+		showTime = cmd.hasOption("T");
+	}
+		
+	public static int calculateVal(char[] kmer) {
+		int res = 0;
+		for(char base : kmer) {
+			res = res << 2;
+			switch (base) {
+			case 'A':
+				break;
+			case 'C':
+				res |= 1;
+				break;
+			case 'G':
+				res |= 2;
+				break;
+			case 'T':
+				res |= 3;
+				break;
+			default:
+				break;
+			}
+		}
+		return res;
+	}
+}
diff --git a/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerProgressListener.java b/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerProgressListener.java
new file mode 100644
index 0000000..6598d3a
--- /dev/null
+++ b/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerProgressListener.java
@@ -0,0 +1,5 @@
+package org.rki.readtrainer;
+
+public interface TrainerProgressListener {
+	public void readTrained();
+}
diff --git a/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerThread.java b/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerThread.java
new file mode 100644
index 0000000..82368e0
--- /dev/null
+++ b/ReadTrainer_eclipse/src/org/rki/readtrainer/TrainerThread.java
@@ -0,0 +1,82 @@
+package org.rki.readtrainer;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.rki.readclassifier.Classifier;
+import org.rki.readclassifier.Read;
+import org.rki.readclassifier.ReadPair;
+import org.rki.readclassifier.Reader;
+
+public class TrainerThread extends Thread {
+	private AbstractReader reader;
+	private int kmermask;
+	private AtomicInteger[][] matrix;
+	private TrainerProgressListener listener;
+	
+	public TrainerThread(AbstractReader reader, AtomicInteger[][] matrix, TrainerProgressListener listener) {
+		super();
+		this.reader = reader;
+		this.matrix = matrix;
+		this.listener = listener;
+	}
+	
+	public void run() {
+		kmermask = (int)Math.pow(4, Trainer.kmersize) - 1;
+		try {
+			while(true) {
+				ReadPair pair = reader.reads.take();
+				if(pair.read1 == null)
+					break;
+				pair.read1.bases = pair.read1.bases.toUpperCase();
+				trainRead(pair.read1);
+				if(pair.read2 != null) {
+					pair.read2.bases = pair.read2.bases.toUpperCase();
+					trainRead(pair.read2);
+				}
+				if(listener != null) {
+					listener.readTrained();
+				}
+			}
+		} catch(Exception e) {
+			e.printStackTrace();
+		}
+	}
+	
+	private void trainRead(Read read) {
+		char[] bases = read.bases.toCharArray();
+		read.score = 0;
+		read.score1 = 1;
+		read.score2 = 1;
+		double len = bases.length;
+		char[] kmer = new char[Trainer.kmersize];
+		int i=0;
+		int kmerval = 0;
+		while(i<len && i<Trainer.kmersize) {
+			kmer[i] = bases[i];
+			i++;
+		}
+		kmerval = Classifier.calculateVal(kmer);
+		for(; i<len; i++) {
+			int nextbase = getBaseVal(bases[i]);
+			matrix[kmerval][nextbase].addAndGet(1);
+			kmerval = kmerval << 2;
+			kmerval |= nextbase;
+			kmerval &= kmermask;
+		}
+	}
+
+	private int getBaseVal(char base) {
+		switch (base) {
+		case 'A':
+			return 0;
+		case 'C':
+			return 1;
+		case 'G':
+			return 2;
+		case 'T':
+			return 3;
+		default:
+			return 0;
+		}		
+	}
+}
diff --git a/Readme.pdf b/Readme.pdf
new file mode 100644
index 0000000..523034b
Binary files /dev/null and b/Readme.pdf differ
diff --git a/Readme.txt b/Readme.txt
new file mode 100644
index 0000000..32cfab8
--- /dev/null
+++ b/Readme.txt
@@ -0,0 +1,105 @@
+INTRODUCTION
+------------
+
+RAMBO-K is a reference-based tool for rapid and sensitive extraction of one organism’s reads from a mixed dataset. It is based on a Markov chain implementation, which uses genomic characteristics of each reference to assign reads to the associated set.
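+
+A minimal sketch of the underlying scoring idea (illustrative Python with hypothetical names; the actual implementation is MMClassifier.java): each read is scored by summing log transition probabilities under the k-mer model of each reference and taking the difference.
+
+    def score_read(bases, tm1, tm2, k):
+        """Score one read against two k-mer transition models (log-probability tables)."""
+        score1 = score2 = 0.0
+        for i in range(len(bases) - k):
+            key = (bases[i:i + k], bases[i + k])
+            if key in tm1 and key in tm2:  # skip transitions unseen in either training set
+                score1 += tm1[key]
+                score2 += tm2[key]
+        return score1 - score2  # > 0: read resembles reference 1, < 0: reference 2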
+
+
+SYSTEM REQUIREMENTS
+-------------------
+
+RAMBO-K is implemented in Python and Java. It therefore requires Java SE 7 as well as Python 2.7 or higher (including the modules numpy and matplotlib). 
+
+
+LICENSE
+-------
+
+Copyright (C) 2015  Simon H. Tausch
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License, version 3
+as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this program.  If not, see
+<http://www.gnu.org/licenses/>. 
+
+
+RUNNING RAMBO-K
+---------------
+
+	References and names
+	To use RAMBO-K you need to specify two reference sets (using parameters -R and -r). References must be provided as fasta files and may contain one or more sequences corresponding to the desired species. If no exact reference is available, a multi-FASTA file containing sequences from the most closely related species available will also work.
+	You can enter two names for the reference sets (using -N corresponding to -R and -n corresponding to -r), which will be used to name the result files and label the plots.
+
+	Unassigned reads
+	The unassigned reads from the original dataset can be provided as fastq or fasta files (if fasta, the filetype option must be set with -f fasta, as in the examples below), either as single-end or paired-end sets, using parameter -1 (and -2 for paired reads). If paired-end information is available, using it is strongly recommended.
+
+	File structure
+	RAMBO-K will create a folder for every new set of species (named $name1_$name2/). In that folder, the assigned reads will be saved as $name*.fastq (or .fasta if input format is specified as fasta). All files are named according to the entered options.
+	There will also be a folder $name1_$name2/temp/ containing temporary files, which allow RAMBO-K to be rerun without repeating all calculations. The temp folder can be deleted after a run using the -d parameter, but since the temporary data are rather small and save a lot of time when re-running RAMBO-K, this is not recommended.
+	In the $name1_$name2/graphics/ folder you will find some plots which are helpful for determining your final parameters (see Cutoffs).
+	If you want to write your results to a folder other than your working directory, use the parameter -o.
+	
+	K-mer sizes
+	The size of k has a crucial influence on assignment quality. It is recommended to run RAMBO-K over a range of k values first to find a suitable value. Generally, a low k (<5) works best if the reference is rather distant, while higher values (>8) can be used if a close reference is known. Best results are usually achieved with k between 4 and 12. 
+	You can enter k as a range (e.g. -k 4:8), a list (e.g. -k 4,8,12) or a single integer (e.g. -k 8). For a more detailed discussion on the choice of k see the provided file Readme.pdf
+
+	Cutoffs
+	After the first run of RAMBO-K, you will find a series of plots in the graphics folder. All plots are named according to the entered options. The score_histogram_*.png plot shows the theoretical distribution of scores for each species. Ideally, the curves of the two species to be separated should overlap as little as possible. Theoretical specificity and sensitivity are shown in ROCplot_*.png. To check how well these theoretical distributions match your real data, see the fitted_histograms_*.png plot, which overlays the score distribution of your real reads on the fitted simulated distributions.
+	Cutoffs are provided either using parameter -c to assign all reads with scores lower than the given number, or parameter -C to assign all reads scoring higher than the given number.
+	For negative cutoffs, use m instead of - (e.g. m1 = -1).
+
+	Number of reads used for simulation
+	The -a parameter allows you to vary the number of reads used to simulate data and draw the plots. Especially for very large reference genomes, a higher number is recommended to yield more precise results. Also, the plots will look smoother using more data. However, using higher numbers of reads for this step may slow down the precalculation phase.
+	
+	Threading
+	RAMBO-K is implemented to use multiple threads with near-linear speedup until hard drive access speed becomes the limiting factor. The number of threads is specified by parameter -t. By default, multithreading is disabled.
+
+	Hint
+	In case of low-quality data, quality trimming will significantly improve the results of RAMBO-K.
+	For a short help text and an explanation of all parameters, run ./RAMBOK -h.
+	
+	
+EXAMPLE
+-------
+
+Divide a set of viral and human reads from a paired end data set in fastq format:
+
+1) Run RAMBO-K to evaluate optimal parameters (optional):
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fastq -2 unassigned_reads.2.fastq -t 4 -k 4:8 
+
+2) Now take a look at the plots to choose ideal values of k and c. Run RAMBO-K again specifying k and c. Do not change any of the other options except for the number of threads. In this example, the best discrimination would be obtained using k = 6 and setting the cutoff to 0. To assign viral reads while dismissing the human background, run: 
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fastq -2 unassigned_reads.2.fastq -t 4 -k 6 -c 0
+
+Results will be saved in virus_cutoff=0_k=6_1.fastq and virus_cutoff=0_k=6_2.fastq in your working directory or, if specified, the output folder. 	
+Or, to assign the human reads while dismissing viral reads, run:
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fastq -2 unassigned_reads.2.fastq -t 4 -k 6 -C 0
+	
+Results will be saved in H.sapiens_cutoff=0_k=6_1.fastq and H.sapiens_cutoff=0_k=6_2.fastq in your working directory or, if specified, the output folder. 
+	
+	
+Or, to divide a set of viral and human reads from a single end data set in fasta format:
+
+1) Run RAMBO-K to evaluate optimal parameters (optional):
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fasta -f fasta -t 4 -k 4:8 
+
+2) Now take a look at the plots to choose ideal values of k and c. Run RAMBO-K again specifying k and c. Do not change any of the other options except for the number of threads. In this example, the best discrimination would be obtained using k = 10 and setting the cutoff to -10. To assign viral reads while dismissing the human background, run: 
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fasta -f fasta -t 4 -k 10 -c m10
+
+	
+Results will be saved in virus_cutoff=m10_k=10.fasta in your working directory or, if specified, the output folder. 	
+Or, to assign the human reads while dismissing viral reads, run:
+
+	./RAMBOK -r human_reference_sequences.fasta -n H.sapiens -R viral_reference_sequences.fasta -N virus -1 unassigned_reads.1.fasta -f fasta -t 4 -k 10 -C m10
+	
+Results will be saved in H.sapiens_cutoff=m10_k=10.fasta in your working directory or, if specified, the output folder. 	
diff --git a/plot.py b/plot.py
new file mode 100644
index 0000000..1e52c6b
--- /dev/null
+++ b/plot.py
@@ -0,0 +1,325 @@
+import numpy as np
+import matplotlib
+
+matplotlib.use('Agg')
+import pylab as pl
+import argparse
+import os
+from sklearn.metrics import auc
+from matplotlib.font_manager import FontProperties
+
+
+# find bounds for plotting
+# threshold gives the fraction of reads to be ignored on both sides (relative to the length of the list)
+def find_bounds(scorelist, threshold=.0005):
+    all_length = len(scorelist)
+    i = 0
+    j = all_length
+    scorelist.sort()
+    while i < threshold * all_length:
+        i += 1
+    while all_length - j < threshold * all_length:
+        j -= 1
+    return scorelist[i][0], scorelist[j - 1][0]
+
+
+def find_bounds_unassigned(scorelist, threshold=.0005):
+    all_length = len(scorelist)
+    i = 0
+    j = all_length
+    scorelist.sort()
+    while i < threshold * all_length:
+        i += 1
+    while all_length - j < threshold * all_length:
+        j -= 1
+    return float(scorelist[i]), float(scorelist[j - 1])
+
+
+def hist_of_scores(species_score, path, name1, name2, k, amount, SE_String):
+    species1 = []
+    species2 = []
+    bins2 = [0] * 101
+    bins3 = [0] * 101
+    bounds = find_bounds(species_score)
+    lowerbound = bounds[0]
+    upperbound = bounds[1]
+    stepsize = (bounds[1] - bounds[0]) / 101.
+    for key in species_score:
+        logkey = (key[0])
+        if key[1] == '1':
+            species1.append(logkey)
+        elif key[1] == '2':
+            species2.append(logkey)
+    for i in range(len(species1)):
+        if lowerbound < species1[i] < upperbound:
+            bins2[int((species1[i] - lowerbound) / stepsize)] += 1
+    for i in range(len(species2)):
+        if lowerbound < species2[i] < upperbound:
+            bins3[int((species2[i] - lowerbound) / stepsize)] += 1
+    ticks = np.arange(bounds[0], bounds[1] + bounds[1] / 1000., (bounds[1] - bounds[0]) / 100.)
+    pl.plot(ticks, bins2, label=name1, color='blue')
+    pl.plot(ticks, bins3, label=name2, color='red')
+    pl.legend(loc='upper right', numpoints=1)
+    pl.savefig(os.path.join(path, 'score_histogram_k' + str(k) + '_' + str(amount) + SE_String + '.png'))
+    return species1, species2, bounds
+
+
+def fasta_generator(infile):
+    block = []
+    state = 'name'
+    for line in infile:
+        if state == 'name':
+            block.append(line.rstrip())
+            state = 'read'
+        elif state == 'read':
+            block.append(line.rstrip())
+            state = 'name'
+            yield block
+            block = []
+
+
+def fastq_generator(infile):
+    block = []
+    qual = ''
+    read = ''
+    state = 'name'  # name read qual
+    counter = 0
+
+    for line in infile:
+        if state == 'name':
+            block.append(line.rstrip())
+            state = 'read'
+        elif state == 'read':
+            if line[0] == '+':
+                block.append(read)
+                block.append('+')
+                read = ''
+                state = 'qual'
+            else:
+                counter += 1
+                read += line.rstrip()
+        elif state == 'qual':
+            if counter > 1:
+                qual += line.rstrip()
+                counter -= 1
+            else:
+                qual += line.rstrip()
+                counter = 0
+                block.append(qual)
+                qual = ''
+                state = 'name'
+                yield block
+                block = []
+
+
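+# Overlays the score histogram of the real (unassigned) reads on the simulated
+# histograms of both species. The relative contribution of each species is estimated
+# by least-squares fitting the two simulated histograms to the real one (np.linalg.lstsq);
+# the fitted fractions are shown as percentages in the plot title.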
+def hist_distribution_plot(infile, path, species_hist_scores, name1, name2, k, amount, SE_String, filetype):
+    if filetype == 'fastq':
+        gen = fastq_generator(open(infile))
+    else:
+        gen = fasta_generator(open(infile))
+    block = next(gen, False)
+    species_by_cutoff = []
+    while block:
+        species_by_cutoff.append((float(block[0].split('_')[-2]) - float(block[0].split('_')[-1])))
+        block = next(gen, False)
+    bounds = find_bounds_unassigned(species_by_cutoff)
+    lowerbound = min(species_hist_scores[2][0], bounds[0])
+    upperbound = max(species_hist_scores[2][1], bounds[1])
+    species1_scores = species_hist_scores[0]
+    species2_scores = species_hist_scores[1]
+    bins = [0] * 101
+    bins2 = [0] * 101
+    bins3 = [0] * 101
+
+    stepsize = (upperbound - lowerbound) / 100.
+    for logkey in species_by_cutoff:
+        if logkey > lowerbound and logkey < upperbound:
+            bins[int((logkey - lowerbound) / stepsize)] += 1
+    for i in range(min(len(species1_scores), len(species2_scores))):
+        if species1_scores[i] > lowerbound and species1_scores[i] < upperbound:
+            bins2[int((species1_scores[i] - lowerbound) / stepsize)] += 1
+        if species2_scores[i] > lowerbound and species2_scores[i] < upperbound:
+            bins3[int((species2_scores[i] - lowerbound) / stepsize)] += 1
+    sum_bins = float(sum(bins))
+    for each in range(len(bins)):
+        bins[each] = bins[each] / sum_bins
+
+    a = []
+    b = bins
+    sum_bins2 = float(sum(bins2))
+    sum_bins3 = float(sum(bins3))
+    for each in range(len(bins)):
+        bins2[each] /= sum_bins2
+        bins3[each] /= sum_bins3
+    for each in range(len(bins2)):
+        a.append([bins2[each], bins3[each]])
+    x, y = np.linalg.lstsq(a, b)[0]
+    xplusy = x + y
+    x = x / xplusy
+    y = y / xplusy
+    sum_of_hist_scores = []
+    for each in range(len(bins2)):
+        bins[each] += 1
+        bins2[each] = (max(0, bins2[each] * x)) + 1
+        bins3[each] = (max(0, bins3[each] * y)) + 1  # +1 to get positive logarithms
+        sum_of_hist_scores.append(bins2[each] * bins3[each])
+    ticks = np.arange(lowerbound, upperbound + upperbound / 1000., (upperbound - lowerbound) / 100.)
+    fig = pl.figure()
+    fig.set_size_inches(12, 9.5)
+    ax = fig.add_subplot(1, 1, 1)
+    ax.plot(ticks, bins, label='real data', color='purple', linewidth=3, alpha=1)
+    ax.plot(ticks, bins2, label='simulated data of ' + name1, color='blue', linewidth=3, alpha=.25)
+    ax.fill_between(ticks, bins2, 1, color='blue', alpha=0.25, offset_position='screen')
+    ax.plot(ticks, bins3, label='simulated data of ' + name2, color='red', linewidth=3, alpha=.25)
+    ax.fill_between(ticks, bins3, 1, color='red', alpha=0.25, offset_position='screen')
+    ax.plot(ticks, sum_of_hist_scores, label='sum of simulated data', color='green', linewidth=3, alpha=1)
+    ax.set_yscale('log')
+    ax.axes.get_yaxis().set_visible(False)
+    fontp = FontProperties()
+    fontp.set_size(27)
+    pl.title(name1 + ' (' + str(round(x * 100, 2)) + '%) and ' + name2 + ' (' + str(round(y * 100, 2)) + '%) data',
+             fontsize=40.)
+    pl.ylabel('Frequency', size=40.)
+    ax.set_yticks([0, 0.5, 1])
+    pl.xlabel('Read score', size=40.)
+    pl.ylim(0, 1.02 * (
+        max((max(bins), max(bins2), max(bins3))) + (max((max(bins), max(bins2),
+                                                         max(bins3)))) / 500.))  # normalize height
+    leg = pl.legend(loc='upper right', numpoints=1, prop=fontp)
+    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]):
+        item.set_fontsize(27)
+    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
+        item.set_fontsize(20)
+    ax.text(-.01, .005, '0', horizontalalignment='right', verticalalignment='center', rotation='horizontal',
+            transform=ax.transAxes, size=20)
+    ax.text(-.01, .475, '.5', horizontalalignment='right', verticalalignment='center', rotation='horizontal',
+            transform=ax.transAxes, size=20)
+    ax.text(-.01, .95, '1', horizontalalignment='right', verticalalignment='center', rotation='horizontal',
+            transform=ax.transAxes, size=20)
+    ax.text(-.04, .5, 'Frequency [rel. units]', horizontalalignment='right', verticalalignment='center',
+            rotation='vertical', transform=ax.transAxes, size=27)
+
+    leg.get_frame().set_alpha(0.5)
+    pl.savefig(os.path.join(path, 'fitted_histograms_k' + str(k) + '_' + str(amount) + SE_String + '.png'))
+
+
+def parse_fasta(infile, name1, name2):
+    infile = open(infile)
+    species_by_cutoff = []
+    name = infile.readline()
+    infile.readline()
+    while name[0] == '>':
+        name = name[::-1].split('_', 2)[::-1]
+        name = [x[::-1].rstrip() for x in name]
+        if name[0][1:len(name1) + 1] == name1:
+            species_by_cutoff.append([float(name[-2]) - float(name[-1]), '1'])
+        elif name[0][1:len(name2) + 1] == name2:
+            species_by_cutoff.append([float(name[-2]) - float(name[-1]), '2'])
+        else:
+            raise Exception(
+                'There seems to be a problem with the species names. Try to avoid special characters or use the default values')
+        name = infile.readline()
+        infile.readline()
+        if name == '':
+            name = 'ending now'
+    return species_by_cutoff
+
+
+def parse_fastq(infile, name1, name2):
+    infile = open(infile)
+    species_by_cutoff = []
+
+    name = '@'
+    while name[0][0] == '@':
+        name = infile.readline()
+        infile.readline()
+        infile.readline()
+        infile.readline()
+        if len(name) <= 1:
+            name = 'ending now'
+        name = name.split('_')
+        if name[0][1] == name1[0]:
+            species_by_cutoff.append([float(name[-2]) - float(name[-1]), '1'])
+        elif name[0][1] == name2[0]:
+            species_by_cutoff.append([float(name[-2]) - float(name[-1]), '2'])
+    return species_by_cutoff
+
+
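+# Builds an ROC curve by sweeping the assignment cutoff over the sorted scores of the
+# simulated reads. Note: the rates assume both species contribute the same number of
+# reads (see the inline comment below).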
+def roc_plot(species_by_cutoff, k, path, amount, SE_String):
+    tpr = [0]
+    fpr = [0]
+    species_by_cutoff = sorted(species_by_cutoff)
+    tpl = []
+    fontp = FontProperties()
+    fontp.set_size(27)
+    for each in species_by_cutoff:
+        tpl.append(each[1])
+    species1count = 0
+    species2count = 0
+    for i in range(len(tpl)):
+        if tpl[i] == '2':
+            species1count += 1
+        elif tpl[i] == '1':
+            species2count += 1
+        tpr.append(species1count / float(
+            len(species_by_cutoff) / 2.))  # this will not work if both species appear differently often!
+        fpr.append(species2count / float(len(species_by_cutoff) / 2.))
+    roc_auc = auc(fpr, tpr)
+    # Plot ROC curve
+    pl.clf()
+    fig = pl.figure()
+    fig.set_size_inches(12, 9.5)
+    ax = fig.add_subplot(1, 1, 1)
+    ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, color='green', zorder=10, lw=4, alpha=.6)
+    ax.plot([0.0, 1], [0, 1], 'k--')
+    ax.fill_between(fpr, tpr, color=(44 / 255., 160 / 255., 44 / 255.), alpha=.6)
+    pl.xlabel('False Positive Rate', fontsize=27)
+    pl.ylabel('True Positive Rate', fontsize=27)
+    pl.title('ROC-Curve of ' + str(k) + '-mer based read distinction', fontsize=27)
+    pl.legend(loc='lower right', prop=fontp)
+    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
+        item.set_fontsize(20)
+    pl.savefig(os.path.join(path, 'ROCplot_k' + str(k) + '_' + str(amount) + SE_String + '.png'))
+
+
+def main(k, path, name1, name2, infile_simulated, infile_real, amount, SE, filetype='fastq'):
+    all_return_vals = parse_fasta(infile_simulated, name1, name2)
+    species_by_cutoff = all_return_vals
+    if SE:
+        se_string = '_SE'
+    else:
+        se_string = '_PE'
+    print('Plotting histogram of scores')
+    species_scores = hist_of_scores(species_by_cutoff, path, name1, name2, k, amount, se_string)
+    pl.close('all')
+    print('Plotting ROC-plot')
+    roc_plot(species_by_cutoff, k, path, amount, se_string)
+    pl.close('all')
+    print('Plotting distribution heights')
+    hist_distribution_plot(infile_real, path, species_scores, name1, name2, k, amount, se_string, filetype)
+    pl.close('all')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-k', '--kmer', required=True)
+    parser.add_argument('-p', '--path', required=True)
+    parser.add_argument('-a', '--amount', required=False, default=50000)
+    parser.add_argument('-n', '--names', required=True, action='append',
+                        help='give 2 species names (-n SPECIES1 -n SPECIES2)')
+    parser.add_argument('-t', '--filetype', default='fastq',
+                        help='specify whether your input is fastq or fasta. Default = fastq. Use -t fasta to switch to fasta',
+                        required=False)
+    parser.add_argument('-s', '--simulated', required=True)
+    parser.add_argument('-r', '--real', required=True)
+    args = parser.parse_args()
+    path = args.path
+    amount = int(args.amount)
+    filetype = args.filetype
+    name1 = args.names[0]
+    name2 = args.names[1]
+    k = args.kmer
+    infile_simulated = str(args.simulated)
+    infile_real = str(args.real)
+    main(k, path, name1, name2, infile_simulated, infile_real, amount, False, filetype)  # SE flag is not exposed on this CLI; paired-end input is assumed
+
diff --git a/simulate_reads.py b/simulate_reads.py
new file mode 100644
index 0000000..9d46179
--- /dev/null
+++ b/simulate_reads.py
@@ -0,0 +1,299 @@
+import random
+import argparse
+import os
+import sys
+import time
+
+
+def find_out_length(infiles,amount,file_type):
+	infile1 = open(infiles[0])
+	lengths1 = []
+	lengths2 = []
+	i = 0
+	if file_type == "fastq":
+		name = infile1.readline()
+	
+		while i <= amount+1 and name != "":
+			lengths1.append(len(infile1.readline())-1)
+			infile1.readline()
+			infile1.readline()
+			name = infile1.readline()
+			i += 1
+	if file_type == "fasta":
+		name = infile1.readline()
+		while i <= amount+1 and name != "":
+			lengths1.append(len(infile1.readline())-1)
+			name = infile1.readline()
+			i += 1
+	if not SingleEnd:
+		infile2 = open(infiles[1])
+		i = 0
+		name = infile2.readline()
+		if file_type == "fastq":
+			while i <= amount+1 and name != "":
+				lengths2.append(len(infile2.readline())-1)
+				infile2.readline()
+				infile2.readline()
+				name = infile2.readline()
+				i += 1
+		if file_type == "fasta":
+			while i <= amount+1 and name != "":
+				lengths2.append(len(infile2.readline())-1)
+				name = infile2.readline()
+				i += 1
+		infile2.close()
+	else:
+		lengths2 = [0]*len(lengths1)
+	infile1.close()
+	j = 0
+	while len(lengths1) <= amount:
+		lengths1.append(lengths1[j])
+		j += 1
+	j = 0 
+	while len(lengths2) <= amount:
+		lengths2.append(lengths2[j])
+		j += 1
+	return lengths1,lengths2
+	
+
+def checkread(read,verbose,allowed):
+	#allowed = set("ACGTacgt")
+	if set(read) <= allowed:
+		return True
+	else:
+		if verbose is True:
+			print("invalid read:")
+			print(read)
+		return False
+
+
+def complement(s):
+	basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 'T', 'c': 'G', 'g': 'C', 't': 'A'}
+	letters = list(s)
+	letters = [basecomplement[base] for base in letters]
+	return ''.join(letters)
+
+
+def revcom(s):
+	return complement(s[::-1])
+
+
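+# Samples one read pair from a random byte offset in the reference file: a window of
+# readlength1 + gapsize + readlength2 characters is read (newlines stripped), read1 is
+# taken from its start and read2 from its end. Returns None if the window contains
+# anything other than A/C/G/T (e.g. FASTA headers or ambiguous bases).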
+def create_random_readpairs(infile, readlength1, readlength2, gapsize, pos, verbose, allowed):
+	#try:
+	infile.seek(pos)
+	full_seq = infile.read(readlength1+gapsize+readlength2).replace('\n','').replace('\r','')
+	read1 = full_seq[0:readlength1]		#todo: fit readlengths, they are 1 too long?
+	read2 = full_seq[-readlength2:]
+	#except:
+	#	return False
+	if checkread(full_seq,verbose,allowed):
+		return read1, read2
+
+
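+# Simulates "unassigned" reads from one reference for the evaluation step: read lengths
+# are mimicked from the real read files, sampling positions are drawn uniformly over the
+# file, and every second accepted read (pair) is written as its reverse complement.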
+def createrandomunassignedreads(infile, outfile1, outfile2, name, gapsize, verbose, chunk, amount, real_readfiles):
+	if chunk > amount:
+		chunk = amount
+	rest = chunk
+	revcomp = 0
+	filesize = os.path.getsize(infile)
+	infile = open(infile)
+	list1 = []
+	list2 = []
+	i = 0
+	sys.stdout.write("Simulating reads of "+name+" for evaluation:\n")
+	lengths = find_out_length(real_readfiles,amount,"fastq")
+	lengths1 = lengths[0]
+	lengths2 = lengths[1]
+	while rest > 0:
+		poslist = []
+		for j in range(rest+1):
+			poslist.append(random.randint(0, filesize-(lengths1[j]+lengths2[j]+gapsize)))
+		poslist = sorted(poslist)
+		for k in range(len(poslist)-1):
+			readlength1 = lengths1[i]
+			if SingleEnd:
+				readlength2 = 0
+			else:	
+				readlength2 = lengths2[i]
+			reads = create_random_readpairs(infile,readlength1,readlength2,gapsize,poslist[k],verbose,allowed=set("ACGTacgt"))
+			###for fasta-reads
+			if reads:
+				i += 1
+				revcomp += 1
+				if revcomp % 2 == 0:
+					list1.append("".join([">",name , str(i), "\n" , reads[0], "\n"]))
+					list2.append("".join([">",name , str(i), "\n" , reads[1], "\n"]))
+				elif revcomp % 2 == 1:
+					list1.append("".join([">",name , str(i), "\n" , revcom(reads[0]), "\n"]))
+					list2.append("".join([">",name , str(i), "\n" , revcom(reads[1]), "\n"]))
+				if i%1000 == 0:
+					sys.stdout.write(str(float(i)/amount*100)+"%\r")
+					sys.stdout.flush()
+			else:
+				continue
+			###for fastq-reads
+			"""if reads:
+				i += 1
+				revcomp += 1
+				if revcomp % 2 == 0:
+					list1.append("".join(["@",name , str(i) , "\n" , reads[0] , "\n" , "+\n" , "#"*len(reads[0]) , "\n"]))
+					list2.append("".join(["@",name , str(i) , "\n" , reads[1] , "\n" , "+\n" , "#"*len(reads[1]) , "\n"]))
+
+				elif revcomp % 2 == 1:
+					list1.append("".join(["@",name , str(i) , "\n" , revcom(reads[0]) , "\n" , "+\n" , "#"*len(reads[0]) , "\n"]))
+					list2.append("".join(["@",name , str(i) , "\n" , revcom(reads[1]) , "\n" , "+\n" , "#"*len(reads[1]) , "\n"]))
+				if i%1000 == 0:
+					sys.stdout.write(str(float(i)/amount*100)+"%\r")
+					sys.stdout.flush()
+			else:
+				continue"""
+		for elem in range(len(list1)):
+			outfile1.write(list1[elem])
+			if not SingleEnd:
+				outfile2.write(list2[elem])
+		list1 = []
+		list2 = []
+		if rest < amount - i:
+			continue
+		else:
+			rest = amount - i
+	print("\n")
+	infile.close()
+
+
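+# Same sampling scheme as createrandomunassignedreads, but the reads are written to the
+# per-species training files used to build the Markov models.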
+def createrandomassignedreads(infile, outfile1, outfile2, name, gapsize, verbose, chunk, amount, real_readfiles):
+	if chunk > amount:
+		chunk = amount
+	rest = chunk
+	revcomp = 0
+	filesize = os.path.getsize(infile)
+	infile = open(infile)
+	list1 = []
+	list2 = []
+	i = 0
+	sys.stdout.write("Simulating reads of "+name+" for training:\n")
+	lengths = find_out_length(real_readfiles,amount,"fastq")
+	while rest > 0:
+		poslist = []
+		for j in range(rest+1):
+			poslist.append(random.randint(0, filesize-(lengths[0][j]+lengths[1][j]+gapsize)))
+		poslist = sorted(poslist)
+		for k in range(len(poslist)-1):
+			readlength1 = lengths[0][i]
+			if SingleEnd:
+				readlength2 = 0
+			else:	
+				readlength2 = lengths[1][i]
+			reads = create_random_readpairs(infile,readlength1,readlength2,gapsize,poslist[k],verbose,allowed=set("ACGTacgt"))
+			if reads:
+				i += 1
+				revcomp += 1
+				if revcomp % 2 == 0:
+					list1.append("".join([">",name , str(i) , "\n" , reads[0], "\n"]))
+					list2.append("".join([">",name , str(i) , "\n" , reads[1], "\n"]))
+				elif revcomp % 2 == 1:
+					list1.append("".join([">",name , str(i) , "\n" , revcom(reads[0]), "\n"]))
+					list2.append("".join([">",name , str(i) , "\n" , revcom(reads[1]), "\n"]))
+				if i%1000 == 0:
+					sys.stdout.write(str(float(i)/amount*100)+"%\r")
+					sys.stdout.flush()
+			else:
+				continue
+
+		for elem in range(len(list1)):
+			outfile1.write(list1[elem])
+			if not SingleEnd:
+				outfile2.write(list2[elem])
+		list1 = []
+		list2 = []
+		if rest < amount - i:
+			continue
+		else:
+			rest = amount - i
+	print("\n")
+	infile.close()
+
+
+def main(infile1,infile2,name1,name2,temppath,real_readfiles,SE,amount,gapsize=1,verbose=False,chunk=100000):
+	global SingleEnd
+	SingleEnd=SE
+
+	if not SingleEnd:
+		outfile1 = os.path.join(temppath,"simulated_reads_"+name1+"_"+str(amount)+"_1.fasta")
+		outfile2 = os.path.join(temppath,"simulated_reads_"+name1+"_"+str(amount)+"_2.fasta")
+		outfile3 = os.path.join(temppath,"simulated_reads_"+name2+"_"+str(amount)+"_1.fasta")
+		outfile4 = os.path.join(temppath,"simulated_reads_"+name2+"_"+str(amount)+"_2.fasta")
+		outfile5 = os.path.join(temppath,"simulated_reads_"+name1+"_and_"+name2+"_"+str(amount)+"_1.fasta")
+		outfile6 = os.path.join(temppath,"simulated_reads_"+name1+"_and_"+name2+"_"+str(amount)+"_2.fasta")
+		outf1 = open(outfile1,"w")
+		outf2 = open(outfile2,"w")
+		outf3 = open(outfile3,"w")
+		outf4 = open(outfile4,"w")
+	else:
+		outfile1 = os.path.join(temppath,"simulated_reads_"+name1+"_"+str(amount)+".fasta")
+		outfile3 = os.path.join(temppath,"simulated_reads_"+name2+"_"+str(amount)+".fasta")
+		outfile5 = os.path.join(temppath,"simulated_reads_"+name1+"_and_"+name2+"_"+str(amount)+".fasta")
+		outf1 = open(outfile1,"w")
+		outf2 = None
+		outf3 = open(outfile3,"w")
+		outf4 = None
+
+	before = time.time()
+	createrandomassignedreads(infile1, outf1, outf2, name1, gapsize, verbose, chunk, amount, real_readfiles)
+	print(time.time()-before)
+	before = time.time()
+	createrandomassignedreads(infile2, outf3, outf4, name2, gapsize, verbose, chunk, amount, real_readfiles)
+	print(time.time()-before)
+	if not SingleEnd:
+		outf1.close()
+		outf2.close()
+		outf3.close()
+		outf4.close()
+		outf5, outf6 = open(outfile5,"w"), open(outfile6,"w")
+
+	else:
+		outf1.close()
+		outf3.close()
+		outf5 = open(outfile5,"w")
+		outf6 = None
+	#outf5, outf6 = open(outfile5,"w"), open(outfile6,"w")
+	before = time.time()
+	createrandomunassignedreads(infile1, outf5, outf6, name1, gapsize, verbose, chunk, amount, real_readfiles)
+	print(time.time()-before)
+	before = time.time()
+
+	createrandomunassignedreads(infile2, outf5, outf6, name2, gapsize, verbose, chunk,  amount, real_readfiles)
+	print(time.time()-before)
+	if not SingleEnd:
+		outf5.close()
+		outf6.close()
+	else:
+		outf5.close()
+
+if __name__ == "__main__":
+	parser = argparse.ArgumentParser()
+	parser.add_argument('-i', '--infile', help='Input file of species 1 and species 2, given in fasta-format', required=True, action='append')
+	parser.add_argument('-r', '--real_readfiles', help='One File containing a subset of your reads in fastq-format', required=True, action='append')
+	parser.add_argument('-n', '--name', help='Name of species 1 and species 2', required=True, action='append')
+	parser.add_argument('-o', '--outpath', help='Path to write resulting files', required=True)
+	parser.add_argument('-g', '--gapsize', help='Approximate length of gap between reads', required=False, default=0)
+	parser.add_argument('-v', '--verbose', help='print invalid reads and additional information',required=False,default=False,action='store_true')
+	parser.add_argument('-a', '--amount', help='number of reads to be simulated for all kinds', required=False,default=500000)
+	parser.add_argument('-c', '--chunk', help='number of reads to be generated before writing. Choose according to your RAM.', required=False,default=100000)
+	parser.add_argument('-s', '--SE', help='Single End mode', action='store_true', default=False)
+	args = parser.parse_args()
+	verbose = args.verbose
+	outpath = str(args.outpath)
+	amount = int(args.amount)
+	chunk = int(args.chunk)
+	SE = args.SE
+	if SE:
+		gapsize = 0
+	else:
+		gapsize = int(args.gapsize)
+
+	infile1 = args.infile[0]
+	infile2 = args.infile[1]
+	name1 = str(args.name[0])
+	name2 = str(args.name[1])
+	real_readfiles = args.real_readfiles
+	main(infile1,infile2,name1,name2,outpath,real_readfiles,SE,amount,gapsize,verbose,chunk)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rambo-k.git



More information about the debian-med-commit mailing list