[med-svn] [mapping-pipeline] 04/10: New upstream version 2.0

Andreas Tille tille at debian.org
Tue Feb 21 12:56:44 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository mapping-pipeline.

commit ccdaa057c87e03dfdcb5026d278d97924e5059c2
Author: Andreas Tille <tille at debian.org>
Date:   Tue Feb 21 09:06:29 2017 +0100

    New upstream version 2.0
---
 extract_genbank.py |   94 ++---
 mapper.py          |    8 +-
 mapping.py         |   83 ++++-
 metrics.py         |   27 +-
 reffinder.py       | 1056 ++++++++++++++++++++++++++++++++++++----------------
 report.tex         |   13 +-
 run.config         |    5 +
 7 files changed, 899 insertions(+), 387 deletions(-)

diff --git a/extract_genbank.py b/extract_genbank.py
index 0cccc4e..741f02b 100755
--- a/extract_genbank.py
+++ b/extract_genbank.py
@@ -1,52 +1,58 @@
 #!/usr/bin/python3
+
 def extract_name(infile):
-  definition="UNKNOWN"
-  accession="UNKNOWN"
-  lines = 0
-  for line in infile:
-    if line.startswith("DEFINITION"):
-      definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
-    if line.startswith("ACCESSION"):
-      accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
-    lines += 1
-    if accession != "UNKNOWN" and definition != "UNKNOWN":
-      break
-    if lines > 4:
-      break
-  return accession + " " + definition
+    definition="UNKNOWN"
+    accession="UNKNOWN"
+    lines = 0
+    for line in infile:
+        if line.startswith("DEFINITION"):
+            definition = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+        if line.startswith("ACCESSION"):
+            accession = " ".join(line.split(" ")[1:]).strip().replace(" ", "_")
+        lines += 1
+        if accession != "UNKNOWN" and definition != "UNKNOWN":
+            break
+        if lines > 4:
+            break
+    return accession + " " + definition
 
 def extract_sequence(infile):
-  res = []
-  numline = 0
-  for line in infile:
-      if line.startswith("ORIGIN"):
-          break
-  for line in infile:
-    if line[0] == "/":
-      seq = "".join(res)
-      return "".join(seq)
-    res += line.strip().split(" ")[1:]
-    numline += 1
+    res = []
+    numline = 0
+    for line in infile:
+        if line.startswith("ORIGIN"):
+            break
+    for line in infile:
+        if line[0] == "/":
+            seq = "".join(res)
+            return "".join(seq)
+        res += line.strip().split(" ")[1:]
+        numline += 1
 
 def write_sequence(outfile, seq, name, splitlen):
-  f = open(outfile, "w")
-  f.write(">" + name + "\n")
-  pos = 0
-  numline = 0
-  while pos < len(seq):
-    f.write(seq[pos:pos+splitlen])
-    pos += splitlen
-    f.write("\n")
-    numline += 1
+    f = open(outfile, "w")
+    f.write(">" + name + "\n")
+    pos = 0
+    numline = 0
+    while pos < len(seq):
+        f.write(seq[pos:pos+splitlen])
+        pos += splitlen
+        f.write("\n")
+        numline += 1
 
 def main(infile, fastafile):
-  f = open(infile, "r")
-  linelength = 80
-  name = extract_name(f)
-  seq = extract_sequence(f)
-  f.close()
-  write_sequence(fastafile, seq, name, linelength)
-  return fastafile
-
-
-
+    #f = open(infile, "r")
+    #linelength = 80
+    #name = extract_name(f)
+    #seq = extract_sequence(f)
+    #f.close()
+    #write_sequence(fastafile, seq, name, linelength)
+    #return fastafile
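+    # Biopython now does the parsing: read each GenBank record and write it out in FASTA format.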
+    from Bio import SeqIO
+    with open(infile, 'r') as f, open(fastafile, 'w') as g:
+        sequences = SeqIO.parse(f, "genbank")
+        for record in sequences:
+            #x = record.id
+            #record.id = x.split('.')[0]
+            SeqIO.write(record, g, "fasta")
+    return fastafile
diff --git a/mapper.py b/mapper.py
index a34d68c..d7f6920 100755
--- a/mapper.py
+++ b/mapper.py
@@ -221,7 +221,9 @@ class Bowtie2(Mapper):
                     x_pos = match.group(6)
                     y_pos = match.group(7)
                 except:
-                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set. Using default flowcell and lane.')
+                    flowcell="unset"
+                    lane="1000"
 
                 if self.query_paired:
                     try:
@@ -415,7 +417,9 @@ class BWAMem:
                     x_pos = match.group(6)
                     y_pos = match.group(7)
                 except:
-                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set.')
+                    log.print_write(self.logger, 'warning', 'ERROR in setting readgroups. readgroups not set. Using default flowcell and lane.')
+                    flowcell="unset"
+                    lane="1000"
 
                 if self.query_paired:
                     try:
diff --git a/mapping.py b/mapping.py
index ef0448a..c470760 100755
--- a/mapping.py
+++ b/mapping.py
@@ -28,7 +28,7 @@ import metrics
 __author__ = "Enrico Seiler"
 __credits__ = ["Wojtek Dabrowski"]
 __license__ = "GPL"
-__version__ = "1.0.0"
+__version__ = "1.2.1"
 __maintainer__ = "Enrico Seiler"
 __email__ = "seilere at rki.de"
 __status__ = "Development"
@@ -39,6 +39,7 @@ class MAPPING:
         self.version = __version__
         self.logger = logger
         self.python_version = sys.version.split(' ')[0]
+        self.reffinder_version = reffinder.__version__
         self._argument_parser()
         self.args.output = os.path.realpath(self.args.output)
         self.user = pwd.getpwuid(os.getuid()).pw_name
@@ -65,6 +66,8 @@ class MAPPING:
         self.mapper = None
         self.sam_files = list()
         self.delete = list()
+        self.reads = list() # save read names for report
+        self.read_dict = dict() # save reads per sample for report
         self.tmp = os.path.join(os.path.expanduser('~/tmp/'), datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f'))
         try:
             os.mkdir(os.path.expanduser('~/tmp/'))
@@ -74,19 +77,49 @@ class MAPPING:
             os.mkdir(self.tmp)
         except OSError:
             pass
+        self.samtools_version = self._samtools_version()
+        self.picard_version = self._picard_version()
         self.run()
         if self.args.join:
+            self._join_sample()
+            self.run_metrics()
             self._join()
-        if self.args.join_sample:
+        elif self.args.join_sample:
             self._join_sample()
-        self.run_metrics()
+            self.run_metrics()
+        else:
+            self.run_metrics()
+        self.convert_to_bam()
         self.cleanup()
 
+    def _samtools_version(self):
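+        # Scrape the version number from the 'samtools --help' output.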
+        cmd = '{samtools}/samtools --help'.format(samtools=self.args.samtools)
+        out = str(sp.check_output(cmd, shell=True), 'utf-8')
+        pattern = re.compile(r'.*Version: ([\d\.]+) .*', re.DOTALL)
+        return re.match(pattern, out).group(1)
+
+    def _picard_version(self):
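+        # picard may report its version via stderr and a non-zero exit code, so the
+        # output is also read from the CalledProcessError; return 'Not used' on failure.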
+        try:
+            cmd = '{java} -jar {picard}/picard.jar ViewSam --version'.format(java=self.args.java, picard=self.args.picard)
+            out = ''
+            try:
+                out = str(sp.check_output(cmd, shell=True, stderr=sp.STDOUT), 'utf-8')
+            except sp.CalledProcessError as e:
+                out = str(e.output, 'utf-8')
+            pattern = re.compile(r'([\d\.]+)\(.*', re.DOTALL)
+            return re.match(pattern, out).group(1)
+        except:
+            return 'Not used'
+
     def run_metrics(self):
         if not self.args.noreport:
             for sam in self.sam_files:
+                out = os.path.join(self.args.output, 'data', os.path.basename(sam))
+                os.makedirs(out, exist_ok=True)
+                if self.args.join_sample or self.args.join:
+                    self.reads = self.read_dict[os.path.basename(sam)+'.sam']
                 self.sam = sam
-                self.metric = metrics.METRICS(sam, output=self.tmp, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
+                self.metric = metrics.METRICS(sam + '.sam', output=out, logger=self.logger, samtools=os.path.realpath(os.path.join(self.args.samtools, 'samtools')))
                 self.current_project = os.path.basename(self._sam_name(os.path.basename(sam))) #read1
                 self.metrics_results[self.current_project] = dict()
                 self.metric.read_stats()
@@ -108,7 +141,13 @@ class MAPPING:
         loader = jinja2.FileSystemLoader(template_dir)
         env = jinja2.Environment(loader=loader)
         template = env.get_template(template_file)
-        self.args.reference = self.args.reference[0]
+        try:
+            if type(self.args.reference) == tuple or type(self.args.reference) == list:
+                self.args.reference = self.args.reference[0]
+            elif self.args.reference is None or len(self.args.reference) == 0:
+                self.args.reference = 'User defined index'
+        except:
+            self.args.reference = 'User defined index'
         pdf_latex = template.render(pipeline=self, metric=self.metric)
 
         with open(self.report_file, 'w') as self.latex:
@@ -143,7 +182,14 @@ class MAPPING:
         except sp.CalledProcessError:
             log.print_write(self.logger, 'exception', 'ERROR: CPE in MAPPING.cleanup()')
             pass
-
+    def convert_to_bam(self):
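+        # Convert each resulting SAM file to BAM with samtools view, then remove the SAM.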
+        log.print_write(self.logger, 'info', '=== Converting to BAM file ===')
+        for sam in self.sam_files:
+            sam = sam + '.sam'
+            self.command = '{} view -hb {} -o {}'.format(os.path.realpath(os.path.join(self.args.samtools, 'samtools')), sam, sam[:-3] + 'bam')
+            log.call(self.command, self.logger, shell=True)
+            self.command = 'rm {}'.format(sam)
+            log.call(self.command, self.logger, shell=True)
     def run_reffinder(self, mode, walk=None):
         log.print_write(self.logger, 'info', '=== Running Reffinder ===')
         if self.args.reffinder_options is None:
@@ -164,6 +210,8 @@ class MAPPING:
                 self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} {} --ref {} {} --tmp {}".format(os.path.join(walk, self.r), os.path.join(walk, self.r2), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
             else:
                 self.args.reference = reffinder.find_reference(reffinder.parse_arguments("--fastq {} --ref {} --mode single {} --tmp {}".format(os.path.join(walk, self.r), ' '.join(self.args.reference), self.args.reffinder_options, self.tmp), logger=self.logger))
+        if type(self.args.reference) == tuple:
+            self.args.reference = self.args.reference[0]
     def convert_to_accession(self):
         pass
         #pattern = re.compile('>gi\|\d*\|[^|]*\|([^|]*)\|(.*)')
@@ -238,6 +286,9 @@ class MAPPING:
                         self.run_reffinder(2, walk[0])
                     else:
                         self.args.reference = self.args.reference[0]
+                self.reads.append(self.r)
+                self.reads.append(self.r2)
+                print('RUNNING {} and {}'.format(self.r, self.r2))
                 self._run_once(os.path.join(walk[0], self.r), read2=os.path.join(walk[0], self.r2))
                 if self.args.reference is not None:
                     self.args.reference = self.cache_ref
@@ -251,6 +302,8 @@ class MAPPING:
                         self.run_reffinder(3, walk[0])
                     else:
                         self.args.reference = self.args.reference[0]
+                self.reads.append(self.r)
+                print('RUNNING {}'.format(self.r))
                 self._run_once(os.path.join(walk[0], self.r))
                 if self.args.reference is not None:
                     self.args.reference = self.cache_ref
@@ -265,7 +318,10 @@ class MAPPING:
         try:
             self.sam_name = re.match('(.*)(\.fastq\.gz|\.fastq)$', st).group(1)
         except:
+            st = os.path.basename(st)
             self.sam_name =  '.'.join(st.split('.')[:-1])
+        if self.sam_name == '':
+            self.sam_name = os.path.basename(s)
         return os.path.join(self.args.output, self.sam_name)
     def _join(self):
         log.print_write(self.logger, 'info', '=== Joining SAM files ===')
@@ -274,8 +330,9 @@ class MAPPING:
             command += 'I={}.sam '.format(sam)
             self.delete.append(sam + '.sam')
         log.call(command, self.logger, shell=True)
-        self.sam_files = [os.path.join(self.args.output, 'joined.sam')]
+        self.sam_files = [os.path.join(self.args.output, 'joined')]
     def _join_sample(self):
+        log.print_write(self.logger, 'info', '=== Joining SAM files per sample ===')
         groups = defaultdict(set)
         pattern = re.compile('^U{0,1}P_(.*)_L\d\d\d_R[1,2]_\d\d\d')
         try:
@@ -296,7 +353,8 @@ class MAPPING:
                 command += 'I={}.sam '.format(os.path.join(self.args.output, sam))
                 self.delete.append(os.path.join(self.args.output, sam + '.sam'))
             log.call(command, self.logger, shell=True)
-            self.sam_files.append(os.path.join(self.args.output, key+'.sam'))
+            self.sam_files.append(os.path.join(self.args.output, key))
+            self.read_dict[key+'.sam'] = [i for i in self.reads if key in i]
     def _run_once(self, read1, read2=None):
         log.print_write(self.logger, 'info', '=== Processing read #{} ==='.format(self.read_counter))
         log.write(self.logger, 'info', 'Read1 = {}'.format(read1))
@@ -314,7 +372,6 @@ class MAPPING:
         self.mapper_version = self.args.mapper + ' ' + self.mapper.get_version()
         self.sam_files.append(self._sam_name(self.sam_file))
     def split_multifasta(self):
-        i = 0
         f = None
         files = list()
         for ref in self.args.reference:
@@ -322,9 +379,9 @@ class MAPPING:
                 if '>' in line:
                     if f is not None:
                         f.close()
-                    f = open(os.path.join(self.tmp, '{}.fasta'.format(i)), 'w')
-                    files.append(os.path.join(self.tmp, '{}.fasta'.format(i)))
-                    i += 1
+                    stem = line.split(' ')[0][1:].strip() if len(line.split(' ')) >= 2 else line[1:].strip()
+                    name = os.path.join(self.tmp, '{}.fasta'.format(stem))
+                    f = open(name, 'w')
+                    files.append(name)
                     f.write('{}'.format(line))
                 else:
                     f.write('{}'.format(line))
@@ -427,6 +484,8 @@ class MAPPING:
         self.optional.add_argument('--picard', dest='picard', nargs='?', default='/usr/bin/', type=str, help='Path to folder containing picard.jar')
 self.optional.add_argument('--config', help='Define config file to read parameters from. If run.config exists in the source directory, it is used if --config is not defined.')
         self.args = self.parser.parse_args(remaining_argv)
+        if self.args.join_sample and self.args.join:
+            raise argparse.ArgumentTypeError('Only one of --join or --join_sample is allowed')
         if self.args.join_sample or self.args.join:
             self.args.picard = self._valid_picard(os.path.expanduser(self.args.picard))
         if self.args.options is not None:
diff --git a/metrics.py b/metrics.py
index a9a9c8a..5fe66e9 100755
--- a/metrics.py
+++ b/metrics.py
@@ -10,6 +10,7 @@ import os
 import pysam
 import pylab
 import itertools
+import json
 import string
 from scipy import stats
 from collections import Counter
@@ -340,10 +341,27 @@ class METRICS:
             conf_int = mean_coverage + 2 * sd_coverage
             lower_conf_int = (reference_coverage < conf_int).sum()
             greater_conf_int = (reference_coverage > conf_int).sum()
-            cov_stats = {'mean': mean_coverage, 'max': maximum_coverage, 'min': minimum_coverage, 'sd': sd_coverage, 'zero': bases_zero_coverage, 'lower': lower_conf_int, 'greater': greater_conf_int}
+            cov_stats = dict()
+            cov_stats['mean'] = int(mean_coverage)
+            cov_stats['max'] = int(maximum_coverage)
+            cov_stats['min'] = int(minimum_coverage)
+            cov_stats['sd'] = int(sd_coverage)
+            cov_stats['zero'] = int(bases_zero_coverage)
+            cov_stats['lower'] = int(lower_conf_int)
+            cov_stats['greater'] = int(greater_conf_int)
             self.cov_stats_result.append(cov_stats)
         if len(self.cov_stats_result) == 0:
-            self.cov_stats_result.append({'mean': -1, 'max': -1, 'min': -1, 'sd': -1, 'zero': -1, 'lower': -1, 'greater': -1})
+            cov_stats = dict()
+            cov_stats['mean'] = -1
+            cov_stats['max'] = -1
+            cov_stats['min'] = -1
+            cov_stats['sd'] = -1
+            cov_stats['zero'] = -1
+            cov_stats['lower'] = -1
+            cov_stats['greater'] = -1
+            self.cov_stats_result.append(cov_stats)
+        with open(os.path.join(self.output, 'coverage_stats_{}.json'.format(reference)), 'w') as f:
+            json.dump(cov_stats, f)
         #return self.cov_stats
         
     def read_stats(self):
@@ -374,13 +392,16 @@ class METRICS:
         self.read_stats['mapped'] = mapped
         self.read_stats['unmapped'] = unmapped
         self.read_stats['total'] = total
+        with open(os.path.join(self.output, 'read_stats.json'), 'w') as f:
+            json.dump(self.read_stats, f)
         #self.read_stats_result.append(self.stats)
         #return self.stats
 
     def cleanup(self):
         log.write(self.logger, 'info', '=== Cleaning up metrics ===')
         try:
-            for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+            #for mfile in self.reads_per_quality_result+self.bases_per_coverage_result+self.mapq_graph_result+self.coverage_graph_result+[(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
+            for mfile in [(self.sam.sam_file_path, 'bam'), (self.sam.sam_file_path[:-1] + 'i', 'bai')]:
                 if mfile is not None:
                     self.command = 'rm {}'.format(mfile[0])
                     log.call(self.command, self.logger, shell=True)
diff --git a/reffinder.py b/reffinder.py
index 4d02b01..32053f6 100755
--- a/reffinder.py
+++ b/reffinder.py
@@ -1,42 +1,44 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #created by Stephan Fuchs (FG13, RKI), 2015
-version = '0.0.9'
+__version__ = '2.1.2'
+version = '2.1.2'
 
 import argparse
 import time
-import re
 import sys
-if sys.version_info < (3, 2):
-    import subprocess32 as sp
-else:
-    import subprocess as sp
 import os
 import random
 import math
-import custom_logger as log
+import subprocess
+import tempfile
+import shutil
+import mimetypes
+import gzip
+import re
 
-#sys.path.insert(0, "/home/RKID1/fuchss/data/scripts/libs")
+#sys.path.insert(0, "/opt/common/pipelines/tools/refFinder/libs")
 
 import fux_filing as filing
 
-# options is a string containing parameters, e.g. "--fastq sth. --ref sth. -p 0.2"
 def parse_arguments(options=None, logger=None):
     #processing command line arguments
     parser = argparse.ArgumentParser(prog="REFFINDER", description='')
-    parser.add_argument('--fastq', metavar="FILE(S)", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
-    parser.add_argument('--ref', metavar="FILE(S)", help="input reference FASTA file(s)", type=str, required=True, nargs='+')
+    parser.add_argument('--fastq', metavar="FILE", help="input FASTQ file(s)", type=argparse.FileType('r'), required=True, nargs='+')
+    parser.add_argument('--ref', metavar="FILE", help="input reference FASTA file(s)", type=argparse.FileType('r'), required=True, nargs='+')
     parser.add_argument('--tmp', type=str, help='Temporary directory to use', default=os.path.expanduser('~/tmp/'))
     parser.add_argument('-p', metavar="DECIMAL", help="proportion of reads used for mapping (default: 0.1)",  type=float, action="store", default=0.1)
     parser.add_argument('-t', metavar='INT', help="number of threads/CPUs used for FastQC (default: 20)", type=int, action="store", default=20)
-    parser.add_argument('-s', help="if set, subset fastq files are stored in a sub folder. Please consider, that high amount of data may be generated.",  action="store_true")
-    parser.add_argument('-b', help="if set, subset BAM files are stored in a sub folder. Please consider, that high amount of data may be generated.",  action="store_true")
-    parser.add_argument('-i', help="if set, reference index files are not deleted.",  action="store_true")
+    #parser.add_argument('-s', help="if set, subset fastq files are stored in a sub folder. Please consider, that high amount of data may be generated.",  action="store_true")
+    parser.add_argument('-b', help="if set, BAM files are kept. Please consider that this option can accumulate a large amount of data.",  action="store_true")
+    parser.add_argument('-q', help="base quality threshold (default: 20)",  type=int, default=20)
+    parser.add_argument('-Q', help="mapping quality threshold (default: 20)",  type=int, default=20)
     parser.add_argument('--mode', help="select paired (default) or unpaired mode",  choices=['paired', 'single'], action="store", default='paired')
-    parser.add_argument('--mapping', help="select mapping algorithm (default: bwasw).",  choices=['bwasw', 'bwamem'], action="store", default='bwasw')
+    parser.add_argument('--mapping', help="select mapping algorithm (default: bwamem).",  choices=['bwasw', 'bwamem'], action="store", default='bwamem')
     parser.add_argument('--version', action='version', version='%(prog)s ' + version)
     if options is None:
-        args = parser.parse_args() 
+        args = parser.parse_args()
     else:
         args = parser.parse_args(options.split(' '))
     args.logger = logger
@@ -48,340 +50,746 @@ def parse_arguments(options=None, logger=None):
 
 
 #FUNCTIONS
-def indexRef(ref, args):
-	if args.mapping in ['bwasw', 'bwamem']:
-		cmd = ['bwa', 'index',  '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a',  'is', ref]
-	elif args.mapping == smalt:
-		cmd = ['smalt', 'index',  '-s',  '6', ref, ref]
-	log.call(cmd, args.logger)
-	cmd = ['samtools', 'faidx', ref]
-	log.call(cmd, args.logger)	
-
-def mapping(ref, fastq, revfastq, samfile, args):
-	#for single mapping set revfastq to False
-	if args.mapping == 'bwasw':
-		cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq, '-f', samfile]
-		if revfastq == False:
-			del(cmd[6])
-		log.call(cmd, args.logger)		
-	elif args.mapping == 'bwamem':
-		with open(samfile,"wb") as outfile:
-			cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq]
-			if revfastq == False:
-				del(cmd[6])
-			sp.check_call(cmd, stdout=outfile)		
-	elif args.mapping == 'smalt':
-		cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref,  fastq, revfastq]	
-		if revfastq == False:
-			del(cmd[-1])		
-		log.call(cmd, args.logger)
-	
-def sam2bam(samfile, bamfile, delSam = True):
-	with open(bamfile, 'w') as bf:
-		cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
-		sp.check_call(cmd, stdout=bf)
-	if delSam:
-		os.remove(samfile)	
-		
-def getMappedReads(bamfile):		
-	cmd = ['samtools', 'flagstat', bamfile]
-	out = sp.check_output(cmd).decode("utf-8")
-	mapped_reads = re.compile('.*?(\d*) \+ \d* mapped.*', re.DOTALL)
-	m = mapped_reads.match(out)
-	#return int(out[2].split('+')[0])
-	return int(m.group(1))
-
-def getBwaVers():		
-	cmd = ['bwa']
-	process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
-	lines = process.communicate()[1].decode("utf-8").split('\n')
-	return lines[2].split(': ')[1]
-	
-def getSamtoolsVers():		
-	cmd = ['samtools']
-	process = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
-	lines = process.communicate()[1].decode("utf-8").split('\n')
-	return lines[2].split(': ')[1]
-	
-def clean(args):
-	exts = ['amb', 'ann', 'bwt', 'fai', 'pac', 'sa']
-	for ref in args.ref:
-		for ext in exts:
-			if os.path.isfile(ref + '.' + ext):
-				os.remove(ref + '.' + ext)
+def indexRef(ref, logfile, args):
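+    #build the mapper index for a reference plus a samtools faidx index; tool output goes to logfile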
+    with open(logfile, 'a') as lf:
+        if args.mapping in ['bwasw', 'bwamem']:
+            cmd = ['bwa', 'index', '-p', os.path.join(args.tmp, os.path.basename(ref).split('.')[0]), '-a',  'is', ref]
+        elif args.mapping == 'smalt':
+            cmd = ['smalt', 'index',  '-s',  '6', ref, ref]
+        subprocess.check_call(cmd, shell=False, stdout=lf, stderr=lf)
+
+        cmd = ['samtools', 'faidx', ref]
+        subprocess.check_call(cmd, shell=False, stdout=lf, stderr=lf)
+
+def mapping(ref, fastq, revfastq, samfile, mapper, logfile, args):
+    #for single mapping set revfastq to False
+    with open(logfile, 'a') as lf:
+        if mapper == 'bwasw':
+            cmd = ['bwa', 'bwasw', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq, '-f', samfile]
+            if revfastq == False:
+                del(cmd[6])
+            subprocess.check_call(cmd, stderr=lf)
+
+        elif mapper == 'bwamem':
+            with open(samfile,"wb") as outfile:
+                cmd = ['bwa', 'mem', '-t', str(args.t), os.path.join(args.tmp, os.path.basename(ref).split('.')[0]),  fastq, revfastq]
+                if revfastq == False:
+                    del(cmd[6])
+                subprocess.check_call(cmd, stdout=outfile, stderr=lf)
+
+        elif mapper == 'smalt':
+            cmd = ['smalt', 'map', '-n', str(args.t), '-f', 'sam', '-o', samfile, ref,  fastq, revfastq]
+            if revfastq == False:
+                del(cmd[-1])
+            subprocess.check_call(cmd, stderr=lf)
+
+def sam2sbam(samfile, bamfile, logfile, delSam = True):
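+    #convert SAM to BAM (dropping unmapped reads via -F 4), then sort the result with samtools sort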
+    bamfile = re.sub(r'\.bam$', '', bamfile) #.bam automatically attached by samtools sort
+
+    #convert
+    with open(bamfile + ".tmp", 'w') as bf, open(logfile, 'a') as lf:
+        cmd = ['samtools', 'view', '-F', '4', '-Sb', samfile]
+        subprocess.check_call(cmd, stdout=bf, stderr=lf)
+
+    #sort
+    with open(bamfile+'.bam', 'w') as bf, open(logfile, 'a') as lf:
+        cmd = ['samtools', 'sort', bamfile + ".tmp"]
+        subprocess.check_call(cmd, stdout=bf, stderr=lf)
+
+    os.remove(bamfile + ".tmp")
+    if delSam:
+        os.remove(samfile)
+
+def getMappedReads(bamfile):
+    cmd = ['samtools', 'flagstat', bamfile]
+    out = subprocess.check_output(cmd).decode("utf-8").split('\n')
+    return int(out[2].split('+')[0])
+
+
+def getCoverage(bamfile, logfile, q, Q, reflen = False):
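+    #sum the per-base depths reported by samtools depth (base quality >= q, mapping quality >= Q); if reflen is given, return the average coverage per base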
+    with open(logfile, 'a') as lf:
+        cmd = ['samtools', 'depth', '-q', str(q), '-Q', str(Q), bamfile]
+        out = subprocess.check_output(cmd, stderr=lf).decode("utf-8").split('\n')
+    cov = 0
+    for line in out:
+        line = line.strip()
+        if line != "":
+            cov += int(line.split("\t")[-1])
+    if reflen:
+        cov = cov/reflen
+    return cov
+
+def getBwaVers():
+    cmd = ['bwa']
+    process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    lines = process.communicate()[1].decode("utf-8").split('\n')
+    return lines[2].split(': ')[1]
+
+def getSamtoolsVers():
+    cmd = ['samtools']
+    process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    lines = process.communicate()[1].decode("utf-8").split('\n')
+    return lines[2].split(': ')[1]
+
+def readFasta(filename):
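+    #minimal FASTA reader: returns a list of [header, sequence] pairs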
+    entries = [] # [[header, lin_seq]]
+    with open(filename, 'r') as handle:
+        for line in handle:
+            line = line.strip()
+            if line == "":
+                continue
+            elif line[0] == ">":
+                entries.append([line, []])
+            elif len(entries) > 0:
+                entries[-1][1].append(line)
+    out = []
+    for entry in entries:
+        out.append([entry[0], ''.join(entry[1])])
+    return out
+
+def mkOutDir(i = 0):
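+    #create a timestamped output directory, retrying (at most 30 times) when the name already exists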
+    creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
+    dir = 'REFFINDER_' + creationtime + '/'
+    if os.path.isdir(dir):
+        i += 1
+        if i >= 30:
+            print("ERROR: Cannot generate a ouput dir")
+            exit(1)
+        time.sleep(1)
+        return mkOutDir(i)
+    else:
+        os.mkdir(dir)
+
+    return dir
+
+
+def getReadNumber(fastqfilename):
+    '''returns number of reads stored in a fastq file'''
+    mime = mimetypes.guess_type(fastqfilename)[1]
+    if mime == 'gzip':
+        handle = gzip.GzipFile(fastqfilename, 'rb')
+    else:
+        handle = open(fastqfilename, 'r')
+
+    err = []
+    readcount = 0
+    e = 0
+    l = 0
+    for line in handle: #don't use readline() since in case of gzip it is very slow
+        l += 1 #counts line number
+        e += 1 #counts line until 4 then starts at 1 to check entry lines
+
+        if mime == 'gzip':
+            line = line.decode('utf-8')
+
+        #identifier
+        if e == 1:
+            if line.strip() == "":
+                e -= 1
+                continue
+            elif line[0] != "@":
+                err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): identifier line expected")
+                break
+
+        #sequence
+        elif e == 2:
+            if line.strip() == "":
+                err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): missing sequence")
+                break
+
+        #description
+        elif e == 3:
+            if line[0] != "+":
+                err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): description expected")
+                break
+
+        #base qualities
+        else:
+            if line.strip() == "":
+                err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): missing base quality scores")
+                break
+            else:
+                e = 0
+                readcount += 1
+
+    handle.close()
+
+    if readcount == 0:
+        err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): no reads")
+    if l < 4 and l > 0:
+        err.append("FORMAT EXCEPTION in " + fastqfilename + " (line " + str(l) + "): incomplete read entry")
+
+    if len(err) == 0:
+        err = False
+
+    return readcount, err
+
+def createFastqSubset(srcfile, dstfile, readKeys2select):
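+    #stream srcfile (plain or gzipped) and copy the reads whose 1-based index is in readKeys2select to dstfile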
+    readKeys = set(readKeys2select)
+    total = len(readKeys)
+    stored = 0
+    readcount = 0
+    mime = mimetypes.guess_type(srcfile)[1]
+    if mime == 'gzip':
+        handle = gzip.GzipFile(srcfile, 'rb')
+    else:
+        handle = open(srcfile, 'r')
+
+    with open(dstfile, 'w') as dsthandle:
+        readcount = 0
+        err = []
+        e = 0
+        l = 0
+        entry = []
+        for line in handle: #don't use readline() since in case of gzip it is very slow
+            l += 1 #counts line number
+            e += 1 #counts line until 4 then starts at 1 to check entry lines
+
+            if mime == 'gzip':
+                line = line.decode('utf-8')
+
+            #identifier
+            if e == 1:
+                if line.strip() == "":
+                    e = 0
+                    continue
+                elif line[0] != "@":
+                    err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): identifier line expected")
+                    break
+                else:
+                    entry.append(line)
+
+            #sequence
+            elif e == 2:
+                if line.strip() == "":
+                    err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): missing sequence")
+                    break
+                else:
+                    entry.append(line)
+
+            #description
+            elif e == 3:
+                if line[0] != "+":
+                    err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): description expected")
+                    break
+                else:
+                    entry.append(line)
+
+            #base qualities
+            else:
+                if line.strip() == "":
+                    err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): missing base quality scores")
+                    break
+                else:
+                    e = 0
+                    entry.append(line)
+                    readcount += 1
+
+                    #write selected reads to dstfile
+                    if readcount in readKeys:
+                        dsthandle.write(''.join(entry))
+                        stored += 1
+                        if stored == total:
+                            break
+
+                    entry = []
+
+    handle.close()
+
+    if stored != total:
+        err.append("ERROR: subset generation failed for " + srcfile + "(" + str(stored) + "instead of " +str(total) + "reads collected")
+
+    if readcount == 0:
+        err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): no reads")
+    if l < 4 and l > 0:
+        err.append("FORMAT EXCEPTION in " + srcfile + " (line " + str(l) + "): incomplete read entry")
+
+    if len(err) == 0:
+        err = False
+
+    return err
+
+
+def writeAsCols(handle, input, sep=' ' * 5):
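+    #write rows as aligned columns: numeric fields are right-aligned with uniform precision, all other fields left-aligned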
+    #allowed signs in numbers
+    pattern = re.compile(r'([+-])?([0-9]+)([.,]?)([0-9]+)(%?)')
+
+    #str conversion
+    lines = [[str(x) for x in line] for line in input]
+
+    #fill up rows to same number of columns (important for transposing)
+    maxlen = max([len(x) for x in lines])
+    [x.extend(['']*(maxlen-len(x))) for x in lines]
+
+    #find format parameters
+    width = [] #colum width or length
+    align = [] #alignment type (number = r; strings = l)
+    prec = [] #number precision
+    for column in zip(*lines):
+        width.append(len(column[0]))
+        prec.append(0)
+        align.append('r')
+        for field in column[1:]:
+            if align[-1] ==  'l':
+                width[-1] = max(width[-1], len(field))
+            else:
+                match = pattern.match(field)
+                if match and match.group(0) == field:
+                    if match.group(3) and match.group(4):
+                        prec[-1] = max(prec[-1], len(match.group(4)))
+
+                    if prec[-1] > 0:
+                        k = 1
+                    else:
+                        k = 0
+                    width[-1] = max(width[-1], len(match.group(2)) + prec[-1] + k + len(match.group(4)))
+                else:
+                    align[-1] = 'l'
+                    prec[-1] = False
+                    width[-1] = max(width[-1], len(field))
+
+    #formatting
+    output = []
+    for line in lines:
+        f = 0
+        output.append([])
+        for field in line:
+            if align[f] == 'l':
+                output[-1].append(field + ' ' * (width[f] - len(field)))
+            elif align[f] == 'r':
+                match = pattern.match(field)
+                if not match or match.group(0) != field:
+                    output[-1].append(' ' * (width[f] - len(field)) + field)
+                else:
+                    if match.group(5):
+                        percent = match.group(5)
+                    else:
+                        percent = ''
+                    length = len(percent)
+
+                    intpart = match.group(2)
+                    if match.group(1):
+                        intpart = match.group(1) + intpart
+                    if match.group(3):
+                        decpart = match.group(4)
+                    else:
+                        intpart += match.group(4)
+                        decpart = ''
+                    length += len(intpart)
+
+                    #print('match:', match.group(0))
+                    #print('intpart:', intpart)
+                    #print('decpart:', decpart)
+                    #print('prec:', prec[f])
+
+                    if prec[f] > 0:
+                        decpart += '0' * (prec[f] - len(match.group(4)))
+                        length += len(decpart) + 1
+                        formatted_number = ' ' * (width[f] - length) + intpart + '.' + decpart + percent
+                    else:
+                        formatted_number = ' ' * (width[f] - length) + intpart + percent
+                    output[-1].append(formatted_number)
+            f += 1
+    lb = os.linesep
+    handle.write(lb.join([sep.join(x) for x in output]))
+
+def logging(line, logfilename):
+    with open(logfilename, 'a') as handle:
+        handle.write(line + os.linesep)
+
+def print(*arg, end=None):
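+    #no-op override of the built-in print: silences this module's status output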
+    pass
 
 def find_reference(args):
+
     #START
     creationtime = time.strftime("%Y-%m-%d_%H-%M-%S")
     starttime = time.time()
 
-    #consistency checks
+    #parameter consistency checks
     if args.mode == 'paired' and len(args.fastq) % 2 != 0:
-            log.write(args.logger, 'debug', "EXCEPTION: in paired mode the number of fastq files must be even")
-            exit(1)
-
+        print("EXCEPTION: in paired mode the number of fastq files must be even")
+        exit(1)
     if args.p <= 0 or args.p > 1:
-            log.write(args.logger, 'debug', "EXCEPTION: read proportion must between 0 (exclusive) and 1 (inclusive).")
-            exit(1)	
-            
-    #generate dir
-    if args.s or args.b:
-            storedir = 'REFFINDER_' + creationtime + '/'
-            os.mkdir(storedir)
-    else:
-            storedir = ''
-
-    #files to fastq file objects
-    fastqFileObjs = []
-    for fastqfile in args.fastq:
-            fastqfile.close()
-            fastqFileObjs.append(filing.fastqfile(fastqfile.name))
-            
-    #generating reference index files 
-    log.write(args.logger, 'debug', '')
-    log.write(args.logger, 'debug', "generate reference index files ...")
+        print("EXCEPTION: read proportion must be a float between 0 (exclusive) and 1 (inclusive).")
+        exit(1)
+
+    #generate tmpdir
+    tmpdir = args.tmp
+
+    #generate storedir and logfile
+    storedir = args.tmp
+
+    #log file definition
+    process_logfile = os.path.join(storedir, 'process.log')
+    logging('# START LOG #', process_logfile)
+
+    #processing reference index files
+    print ("processing reference files...")
+    reflengths = {} #dict key: origname; value: length of reference
+    refheader = {} #dict key: origname; value: FASTA header
+    err = []
     for ref in args.ref:
-            indexRef(ref, args)
+        entries = readFasta(ref.name)
+        if len(entries) == 0:
+            err.append("ERROR in " + ref.name + ": No valid FASTA format")
+        elif len(entries) > 1:
+            err.append("ERROR in " + ref.name + ": multiple sequences not supported")
+        else:
+            reflengths[ref.name] = len(entries[0][1])
+            refheader[ref.name] = entries[0][0]
 
-    #generating subset fastq files and mapping 
-    fastq_mapped_reads =[] #minimal quality mapped reads based in fastq files order
-    ref_mapped_reads =[] #minimal quality mapped reads based in ref files order
-    total_reads = 0 #total reads
+    if len(err) > 0:
+        for e in err:
+            print(e)
+        exit(1)
 
-    if args.mode == 'paired':
-            datasets = [['forward read file', 'reverse read file', 'reads', 'subset reads']]
-    elif args.mode == 'single':	
-            datasets = [['read file', 'reads', 'subset reads']]
 
-    current_task = 0
+    #generating reference index files
+    logging('# REFERENCE FILE INDEXING #', process_logfile)
+    print ("indexing reference files...", end = " ")
+    reffiles = [] #list of sets (tmpname, origname)
+    i = 0
+    for ref in args.ref:
+        logging('## INDEXING ' + ref.name + ' ##', process_logfile)
+        i += 1
+        print ("\rindexing reference files...  [" + str(i) + "/" + str(len(args.ref))+ "]        ", end = " ")
+        newname = os.path.join(tmpdir, os.path.basename(ref.name))
+        try:
+            shutil.copyfile(ref.name, newname)
+        except shutil.SameFileError:
+            pass
+        ref.close()
+        reffiles.append((newname, ref.name))
+        indexRef(newname, process_logfile, args)
+    print()
+
+    #processing fastq files
+    print ('processing fastq files...', end = " ")
+    readcount = {}
+    err = []
+    i = 0
+    for fastq in args.fastq:
+        i += 1
+        print ("\rprocessing fastq files...  [" + str(i) + "/" + str(len(args.fastq))+ "]        ", end = " ")
+        count, e = getReadNumber(fastq.name)
+        if e:
+            err.extend(e)
+        else:
+            readcount[fastq.name] = count
+
+
+    print()
+
+    if len(err) > 0:
+        for e in err:
+            print(e)
+        exit(1)
+
+
+    #task actions
+    print("working...")
+    coverages = {} # 1.dim_key: fastq_origname; 2.dim_key: ref_origname; value: mapped reads
 
-    #processing paired files
     if args.mode == 'paired':
-            total_tasks = int((len(args.fastq)/2) * len(args.ref))
-            i = 0
-            while i < len(args.fastq):
-                    log.write(args.logger, 'debug', '')
-                    log.write(args.logger, 'debug', 'processing paired files ...')
-                    log.write(args.logger, 'debug', 'forward read file: {}'.format(args.fastq[i].name))
-                    log.write(args.logger, 'debug', 'reverse read file: {}'.format(args.fastq[i+1].name))
-                    pairedfastqFileObj = filing.pairedfastqfiles(fname = args.fastq[i].name, rname=args.fastq[i+1].name)
-                    log.write(args.logger, 'debug', 'reads in each file: {}'.format(pairedfastqFileObj.getReadNumber()))	
-                    datasets.append([args.fastq[i].name, args.fastq[i+1].name, pairedfastqFileObj.getReadNumber()])
-                                            
-                    #generating subsets
-                    log.write(args.logger, 'debug', 'generating read subsets ...')
-                    mapped_reads = []
-                    with pairedfastqFileObj.randomSubset(prop=args.p) as subsetObj: 
-                            log.write(args.logger, 'debug', 'reads in each subset file: {}'.format(subsetObj.getReadNumber()))
-                            datasets[-1].append(subsetObj.getReadNumber())
-                            total_reads += subsetObj.getReadNumber()
-                            j = 0
-                            for ref in args.ref:
-                                    current_task += 1
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
-                                    
-                                    #mapping
-                                    log.write(args.logger, 'debug', 'mapping to {} using {} {}'.format(ref, args.mapping, '...'))
-                                    samfile = subsetObj.forward.uniqueFilename(ext='sam')
-                                    mapping(ref, subsetObj.forward.fullname, subsetObj.reverse.fullname, samfile, args)
-                                    
-                                    #convert to bam
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'converting sam to bam ...')
-                                    bamfile = subsetObj.forward.uniqueFilename(ext='bam')
-                                    sam2bam(samfile, bamfile)
-                                            
-                                    #get reads aligend with minimal mapping quality
-                                    mapped_reads.append(getMappedReads(bamfile))
-                                    if len(ref_mapped_reads) < j + 1:
-                                            ref_mapped_reads.append(0)					
-                                    ref_mapped_reads[j] += mapped_reads[-1]			
-                                    
-                                    if args.b:
-                                            fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
-                                            reffilepath, reffilename = os.path.split(ref)
-                                            os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam') 
-                                    else:
-                                            os.remove(bamfile)								
-                                    
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
-                                    j += 1
-                                    
-                            if args.s:
-                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
-                                    subsetObj.forward.renameFile('subset_' + fastqfilename, path=storedir)
-                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i+1].name)
-                                    subsetObj.reverse.renameFile('subset_' + fastqfilename, path=storedir)
-                            else:
-                                    subsetObj.setTmp()
-                    
-                    #make fastq_mapped_reads relative
-                    k = 0
-                    for reads in mapped_reads:
-                            mapped_reads[k] = str(round(mapped_reads[k]/subsetObj.getReadNumber() * 100, 2)) + '%'
-                            k += 1
-                    fastq_mapped_reads.append(mapped_reads)
-                    i += 2
-                    
-                    
-    #processing single files	
-    if args.mode == 'single':
-            total_tasks = int(len(args.fastq) * len(args.ref))
-            i = 0
-            while i < len(args.fastq):
-                    log.write(args.logger, 'debug', '')
-                    log.write(args.logger, 'debug', 'processing fastq file {}'.format(args.fastq[i].name))
-                    fastqFileObj = filing.fastqfile(name = args.fastq[i].name)
-                    log.write(args.logger, 'debug', 'number of reads: {}'.format(fastqFileObj.getReadNumber())	)
-                    datasets.append([args.fastq[i].name, fastqFileObj.getReadNumber()])
-                            
-                    log.write(args.logger, 'debug', 'generating read subset ...')
-                    mapped_reads = []
-                    with fastqFileObj.randomSubset(prop=args.p) as subsetObj: 
-                            log.write(args.logger, 'debug', 'reads in subset file: {}'.format(subsetObj.getReadNumber()))
-                            datasets[-1].append(subsetObj.getReadNumber())
-                            j = 0
-                            for ref in args.ref:
-                                    current_task += 1
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'START MAPPING TASK {} OF {}'.format(current_task, total_tasks))
-                                    
-                                    #mapping
-                                    log.write(args.logger, 'debug', 'mapping to {} using {}'.format(ref, args.mapping))
-                                    samfile = subsetObj.uniqueFilename(ext='sam')
-                                    mapping(ref, subsetObj.fullname, False, samfile, args)
-                                    
-                                    #convert to bam
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'converting sam to bam ...')
-                                    bamfile = subsetObj.uniqueFilename(ext='bam')
-                                    sam2bam(samfile, bamfile)
-                                            
-                                    #get reads aligend with minimal mapping quality
-                                    mapped_reads.append(getMappedReads(bamfile))
-                                    if len(ref_mapped_reads) < j + 1:
-                                            ref_mapped_reads.append(0)					
-                                    ref_mapped_reads[j] += mapped_reads[-1]			
-                                    
-                                    if args.b:
-                                            fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
-                                            reffilepath, reffilename = os.path.split(ref)
-                                            os.rename(bamfile, storedir + 'subset_' + fastqfilename + '__to__' + reffilename + '.bam') 
-                                    else:
-                                            os.remove(bamfile)								
-                                    
-                                    log.write(args.logger, 'debug', '')
-                                    log.write(args.logger, 'debug', 'mapped reads: {}'.format(mapped_reads[-1]))
-                                    j += 1
-                                    
-                            if args.s:
-                                    fastqfilepath, fastqfilename = os.path.split(args.fastq[i].name)
-                                    subsetObj.renameFile('subset_' + fastqfilename, path=storedir)
-                            else:
-                                    subsetObj.setTmp()
-                                    
-                    #make fastq_mapped_reads relative
-                    k = 0
-                    for reads in mapped_reads:
-                            mapped_reads[k] = str(round(mapped_reads[k]/subsetObj.getReadNumber() * 100, 2)) + '%'
-                            k += 1
-                    fastq_mapped_reads.append(mapped_reads)
-                    i += 1
-                    
-    #make ref_mapped_reads relative
+        total_tasks = int(len(args.fastq)/2) * len(args.ref)
+    else:
+        total_tasks = len(args.fastq) * len(args.ref)
+
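+    #with -b, the per-task BAM files are kept and recorded in bamfiles.log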
+    if args.b:
+        bamloghandle = open(os.path.join(storedir, "bamfiles.log"), "w")
+        bamloghandle.write("task\tBAM file\treference file\tread file(s)" + os.linesep)
+
+    f = 0
+    current_task = 0
     i = 0
-    for reads in ref_mapped_reads:
-            if total_reads == 0:
-                    ref_mapped_reads[i] = '0%'
-            else:
-                    ref_mapped_reads[i] = str(round(reads/total_reads * 100, 2)) + '%'
+    while current_task < total_tasks:
+        print ("  preparing new dataset...")
+
+        #generate fastq subsets in paired mode
+        if args.mode == 'paired':
+            file1 = args.fastq[i].name
+            file2 = args.fastq[i+1].name
+            subset_file1 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file1)))
+            subset_file2 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file2)))
+            i += 2
+            print ("    paired data")
+            print ("      " + file1 + ":", readcount[file1], "reads")
+            print ("      " + file2 + ":", readcount[file2], "reads")
+            if readcount[file1] != readcount[file2]:
+                print("ERROR: inconsistent data set.")
+                exit(1)
+            subamount = int(readcount[file1] * args.p)
+            print ("      subset:", subamount, "reads each")
+            print ("    generating subset files...")
+            selected = random.sample(range(1, readcount[file1]+1), subamount)
+            createFastqSubset(file1, subset_file1, selected)
+            createFastqSubset(file2, subset_file2, selected)
+
+        #single mode
+        else:
+            file1 = args.fastq[i].name
+            subset_file1 = os.path.join(tmpdir, re.sub(r'\.gz$', '', "subset_" + os.path.basename(file1)))
+            subset_file2 = False
             i += 1
-            
-                    
+            print ("    unpaired data")
+            print ("    " + file1 + ":", readcount[file1], "reads")
+            subamount = int(readcount[file1] * args.p)
+            print ("      subset:", subamount, "reads")
+            print ("    generating subset file...")
+            selected = random.sample(range(1, readcount[file1]+1), subamount)
+            createFastqSubset(file1, subset_file1, selected)
 
-    #get reference charts
-    log.write(args.logger, 'debug', '')
-    log.write(args.logger, 'debug', 'prepare output ...')
+        if not file1 in coverages:
+            coverages[file1] = {}
 
-    refcharts = [['rank', 'reference file(s)', 'avg of mapped reads']]
+        for reffile in reffiles:
 
-    total_datasets = len(datasets) -1 #minus 1^since it contains a headline
-    order = sorted(set(ref_mapped_reads), reverse=True)
-    n = 0
-            
-    for value in order:
-            n += 1
-            refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == value]
-            refs = []
-            for key in refkeys:
-                    refs.append(args.ref[key])
-            refcharts.append([str(n), ', '.join(refs), value])	
+            ref_origname = reffile[1]
 
+            #prepare
+            current_task += 1
+            print ('  TASK', current_task, 'OF', total_tasks)
 
-    #get rankmatrix
-    if args.mode == 'paired':
-            rankmatrix = [['forward read file', 'reverse read file']]
-    elif args.mode == 'single':
-            rankmatrix = [['read file']]
+            ref = reffile[0]
+            samfile = os.path.join(tmpdir, str(current_task) + ".sam")
+            bamfile = os.path.join(tmpdir, str(current_task) + ".bam")
+
+            #process logging
+            if subset_file2:
+                logging('# TASK ' + str(current_task) + ': subsets of ' + file1 + " / " + file2 + " TO " + ref_origname + ' #', process_logfile)
+            else:
+                logging('# TASK ' + str(current_task) + ': subset of ' + file1 + " TO " + ref_origname + ' #', process_logfile)
+
+
+            #mapping
+            print("    mapping...")
+            logging('## MAPPING ##', process_logfile)
+            mapping(ref, subset_file1, subset_file2, samfile, args.mapping, process_logfile, args)
+
+            print("    analyzing...")
+
+            #sam to bam
+            logging('## SAM2BAM CONVERSION AND SORTING ##', process_logfile)
+            sam2sbam(samfile, bamfile, process_logfile)
+
+            #get reads aligned with minimal mapping quality
+            #mapped_reads = getMappedReads(bamfile)
+
+            #get coverages
+            coverages[file1][ref_origname] = getCoverage(bamfile, process_logfile, args.q, args.Q, reflengths[ref_origname])
+            print("    avg_base_cov:", coverages[file1][ref_origname])
+
+            #save bam
+            if args.b:
+                dstbamname = str(current_task) + '.bam'
+                dstbam = os.path.join(os.getcwd(), storedir, dstbamname)
+                shutil.move(bamfile, dstbam)
+                bamloghandle.write(str(current_task) + "\t" + dstbamname + "\t" + ref_origname + "\t" + file1)
+                if args.mode == 'paired':
+                    bamloghandle.write(" / " + file2)
+                bamloghandle.write(os.linesep)
+            else:
+                os.remove(bamfile)
+
+        #cleaning subset_files
+        os.remove(subset_file1)
+        if subset_file2:
+            os.remove(subset_file2)
+
+
+
+    if args.b:
+        bamloghandle.close()
+
+    #cleaning tmp dir
+    #shutil.rmtree(tmpdir)
+
+    #process logging end
+    logging('# END LOG #', process_logfile)
+
+    #output
+    print('preparing output...')
 
-    for i in range(1, len(args.ref) + 1):
-            rankmatrix[0].append('rank ' + str(i) + ' (mapped reads)')
+    ##reference charts
+    outfile = os.path.join(storedir, "ref_charts.txt")
+    #refcharts = [['rank', 'reference file(s)', 'avg cov per base']]
+    chart = {} #key = %coverage, value: list of references
+    refcov = {} #key: refname; value: list of %coverage
+    bestfits = {} # number of best fits
+
+
+    ###get coverages
+    for fastq in coverages:
+        for ref in coverages[fastq]:
+            cov = coverages[fastq][ref]
+            if ref not in refcov:
+                refcov[ref] = [cov]
+            else:
+                refcov[ref].append(cov)
+
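+    #for every subset i, maxcovs[i] is the highest coverage any reference reached
+    #on that subset; bestfits[ref] then counts the subsets on which ref matched
+    #that maximum (ties are credited to every tying reference)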
+    ###count best fits
+    maxcovs = []
+    r = next(iter(refcov)) #any reference; every coverage list has one entry per subset
+    for i in range(len(refcov[r])):
+        maxcovs.append(0)
+        for ref in refcov:
+            maxcovs[i] = max(refcov[ref][i], maxcovs[i])
+
+    for ref in refcov:
+        bestfits[ref] = 0
+        for i in range(len(refcov[ref])):
+            if refcov[ref][i] == maxcovs[i]:
+                bestfits[ref] += 1
+
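+    #references with an identical mean coverage share one chart entry and
+    #therefore one rank in ref_charts.txt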
+    ###calculate mean
+    for ref in refcov:
+        meancov = sum(refcov[ref]) / len(refcov[ref])
+        if meancov not in chart:
+            chart[meancov] = [ref]
+        else:
+            chart[meancov].append(ref)
+
+    order = sorted(set(chart.keys()), reverse=True)
+
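+    #within a row, the ';'-joined best-fit counts, headers and file names are
+    #emitted in the same order, so position k of one field matches position k
+    #of the others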
+    out = [["rank", "avg cov per base", "#best fittings", "reference header", "reference file"]]
 
     i = 0
-    for mapped_reads in fastq_mapped_reads:
-            if args.mode == 'paired':
-                    rankmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
-            elif args.mode == 'single':
-                    rankmatrix.append([args.fastq[i].name])
-            
-            order = sorted(set(mapped_reads), reverse=True)
-            for value in order:
-                    refkeys = [i for i, j in enumerate(mapped_reads) if j == value]
-                    refs = []
-                    for key in refkeys:
-                            refs.append(args.ref[key])
-                    
-                    rankmatrix[-1].append(', '.join(refs) + ' (' + str(value) + ')')			
-            
-            if args.mode == 'paired':
-                    i += 2
-            elif args.mode == 'single':
-                    i += 1
-            
-    #get refmatrix
-    if args.mode == 'paired':
-            refmatrix = [['forward read file', 'reverse read file']]
-    elif args.mode == 'single':	
-            refmatrix = [['read file']]
-            
-    for ref in args.ref:
-            refmatrix[0].append(ref)
+    for meancov in order:
+        i += 1
+        fits = []
+        headers = []
+        for ref in chart[meancov]:
+            fits.append(str(bestfits[ref]))
+            header = refheader[ref][1:]
+            headers.append(header.replace(';', ','))
+        out.append([i, meancov, ";".join(fits), ";".join(headers), ";".join(chart[meancov])])
+
+    with open(outfile, "w") as handle:
+        writeAsCols(handle, out)
+
+
+    ##reference ranking
+    outfile = os.path.join(storedir, "ref_ranking.txt")
+    out = [['sample']]
+    #the best match is the reference with the highest mean coverage (the first
+    #entry of the top chart row), not simply the first input file
+    best_reference = None
+    if order:
+        for reffile in reffiles:
+            if reffile[1] in chart[order[0]]:
+                best_reference = reffile
+                break
+    for i in range(1, len(reffiles) + 1):
+        out[0].append("rank " + str(i))
+
+
+    i = 0
+    while i < len(args.fastq):
+        fastq = args.fastq[i].name
+        out.append([fastq])
+        i += 1
+        if args.mode == 'paired':
+            out[-1][0] += " / " + args.fastq[i].name
+            i += 1
+
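+        #rank the references for this sample: group them by coverage so that
+        #references with the same value share a rank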
+        ranking = {}
+        for reffile in reffiles:
+            ref = reffile[1]
+            cov = coverages[fastq][ref]
+            if cov not in ranking:
+                ranking[cov] = []
+            ranking[cov].append(ref)
+
+        order = sorted(set(ranking.keys()), reverse=True)
+        for cov in order:
+            out[-1].append(",".join(ranking[cov]) + " (" + str(cov) + " avg cov per base)")
+
+        #pad so each row has the 'sample' column plus one entry per possible rank
+        while len(out[-1]) < len(reffiles) + 1:
+            out[-1].append("-")
+
+    with open(outfile, "w") as handle:
+        writeAsCols(handle, out)
+
+    ##ref matrix
+    outfile = os.path.join(storedir, "ref_matrix.txt")
+    out = [['sample']]
+    for ref in reffiles:
+        out[0].append(ref[1] + " [avg cov per base]")
 
     i = 0
-    for mapped_reads in fastq_mapped_reads:
-            if args.mode == 'paired':
-                    refmatrix.append([args.fastq[i].name, args.fastq[i+1].name])
-                    i += 2
-            elif args.mode == 'single':
-                    refmatrix.append([args.fastq[i].name])
-                    i += 1
-            refmatrix[-1].extend(mapped_reads)		
-
-    #cleaning
-    if args.i == False:
-            clean(args)
-
-    #print best reference
-    maxvalue = max(set(ref_mapped_reads))
-    refkeys = [i for i, j in enumerate(ref_mapped_reads) if j == maxvalue]
-    log.write(args.logger, 'debug', 'best matching reference:')
-    #log.write(args.logger, 'debug', args.ref[key].name)
-    log.write(args.logger, 'debug', args.ref[refkeys[0]])
-    #return(args.ref[key].name)
-    return(args.ref[refkeys[0]])
+    while i < len(args.fastq):
+        fastq = args.fastq[i].name
+        out.append([fastq])
+        i += 1
+        if args.mode == 'paired':
+            out[-1][0] += " / " + args.fastq[i].name
+            i += 1
+
+        for reffile in reffiles:
+            ref = reffile[1]
+            out[-1].append(coverages[fastq][ref])
+
+    with open(outfile, "w") as handle:
+        writeAsCols(handle, out)
+
+
+    ##logging
+    logname = os.path.join(storedir, 'refFinder.log')
+    with open(logname, 'w') as handle:
+        handle.write('REFFINDER' + os.linesep)
+        handle.write('=========================' + os.linesep)
+        content = []
+        content.append(['reffinder version:', version])
+        content.append(['bwa version:', getBwaVers()])
+        content.append(['samtools version:', getSamtoolsVers()])
+        content.append(['job time:', creationtime])
+        content.append(['mapping algorithm:', args.mapping])
+        content.append(['mode:', args.mode])
+        if args.mode == 'paired':
+            content.append(['number of datasets:', len(args.fastq)//2])
+        content.append(['number of fastq files:', len(args.fastq)])
+        content.append(['number of references:', len(args.ref)])
+        content.append(['proportion of reads selected for mapping:', args.p])
+        writeAsCols(handle, content)
+
+        handle.write(os.linesep+os.linesep)
+        handle.write('FASTQ FILES' + os.linesep)
+        handle.write('=========================' + os.linesep)
+        i = 0
+        content = [['#', 'file', 'number of reads', 'number of reads in subset', 'mate file']]
+        files = sorted(readcount.keys())
+        for fastq in files:
+            i += 1
+            count = int(readcount[fastq] * args.p)
+            content.append([i, fastq, readcount[fastq], count])
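+            #mate assignment assumes sorted file names keep read pairs adjacent
+            #(entry 2k-1 pairs with entry 2k in the sorted list)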
+            if args.mode == "single":
+                content[-1].append("-")
+            elif i % 2 == 0:
+                content[-1].append(files[i-2])
+            else:
+                content[-1].append(files[i])
+        writeAsCols(handle, content)
+
+        handle.write(os.linesep+os.linesep)
+        handle.write('REFERENCE FILES' + os.linesep)
+        handle.write('=========================' + os.linesep)
+        i = 0
+        content = [['#', 'file', 'header', 'sequence length']]
+        files = sorted(reflengths.keys())
+        for ref in files:
+            i += 1
+            content.append([i, ref, refheader[ref], reflengths[ref]])
+        writeAsCols(handle, content)
+
+        handle.write(os.linesep+os.linesep)
+        handle.write('--' + os.linesep)
+        handle.write('execution time (hh:mm:ss): ')
+        s = time.time() - starttime
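+        #s//3600 = whole hours, s%3600//60 = remaining minutes, s%60 = leftover seconds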
+        handle.write('{:02.0f}:{:02.0f}:{:02.0f}'.format(s//3600, s%3600//60, s%60))
+
+    print('results saved to:', storedir)
+    return best_reference
+
 
 if __name__ == '__main__':
     find_reference(parse_arguments())
-
diff --git a/report.tex b/report.tex
index a641807..e336de6 100755
--- a/report.tex
+++ b/report.tex
@@ -50,13 +50,22 @@ Machine: & {{pipeline.machine}}\\
  Tool versions & \\
 Python: & {{pipeline.python_version}}\\
 Mapper: & {{pipeline.mapper_version}}\\
+Samtools: & {{pipeline.samtools_version}}\\
+Picard: & {{pipeline.picard_version}}\\
+Reffinder: & {{pipeline.reffinder_version}}\\
 \end{tabular}\\
 
 %----------------- Workflow -------------------%
 \line(1,0){ \textwidth } \\
 \begin{tabular}{lp{0.85\textwidth}}
-Processed read(s): & \path{{"{"+pipeline.current_read1+"}"}}\\
-{%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+Processed read(s):
+{% if pipeline.args.join or pipeline.args.join_sample %}
+	{% for i in pipeline.reads %}
+		& \path{{"{"+i+"}"}}\\
+	{% endfor %}
+{% else %}
+	& \path{{"{"+pipeline.current_read1+"}"}}\\ {%if pipeline.current_read2 is not none%} & \path{{"{"+pipeline.current_read2+"}"}}\\{%endif%}
+{% endif %}
 {%if pipeline.args.reference is defined and pipeline.args.reference is not none%} Reference file: & \path{{"{"+pipeline.args.reference+"}"}}\\{%endif%}
 \end{tabular}\\
 \line(1,0){ \textwidth } \\
diff --git a/run.config b/run.config
new file mode 100644
index 0000000..389b501
--- /dev/null
+++ b/run.config
@@ -0,0 +1,5 @@
+--samtools=/opt/common/pipelines/tools/samtools-1.3.1/
+--bwa=/opt/common/pipelines/tools/bwa-0.7.15/
+--bowtie2=/usr/bin/
+--picard=/opt/common/pipelines/tools/picard-2.7.1/
+--java=java-1.8
