[med-svn] [trimmomatic] 06/10: Inject a small wrapper script that simplifies calling Trimmomatic for a heap of sequences

Andreas Tille tille at debian.org
Thu Nov 26 10:08:45 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository trimmomatic.

commit eccfef656089004ed408f92b3df4ede03a347cc0
Author: Andreas Tille <tille at debian.org>
Date:   Thu Nov 26 10:44:21 2015 +0100

    Inject a small wrapper script that simplifies calling Trimmomatic for a heap of sequences
---
 debian/bin/batchTrim | 295 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)

diff --git a/debian/bin/batchTrim b/debian/bin/batchTrim
new file mode 100755
index 0000000..319a697
--- /dev/null
+++ b/debian/bin/batchTrim
@@ -0,0 +1,295 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#created by Stephan Fuchs (FG13, RKI), 2015
+#
+# Wrapper to simplify calls to trimmomatic
+
+#config
+version = '0.0.9'
+logsep = '=====================================\n'
+cmdTrimPE = "TrimmomaticPE"
+
+
+
+
+#dependencies
+try:
+    import argparse
+except ImportError:
+    print('The "argparse" module is not available. Use Python 3.2 or better.')
+    sys.exit(1)
+
+import os
+import re
+import sys
+import time
+import shutil
+import subprocess
+
+
+#processing command line arguments
+parser = argparse.ArgumentParser(prog="BATCH_TRIM", description='uses trimmomatic on multiple paired-end read file pairs')
+parser.add_argument('file', metavar="FILE(S)", help="input BAM file", type=argparse.FileType('r'), nargs='+')
+parser.add_argument('-t', metavar='INT', help="Number of threads/CPUs that should be used. Default is 4." , type=int, action="store", default=4)
+arg0group = parser.add_mutually_exclusive_group(required=False)
+arg0group.add_argument('--fastqc', help="If set combineFastQC is performed on all trimmed read files.", action="store_true", default=False)
+arg0group.add_argument('--fastqcpaired', help="If set combineFastQC is performed only on trimmed paired read files." , action="store_true", default=False)
+parser.add_argument('--illuminaclip', metavar='STR', help="Cut adapter and other illumina-specific sequences. Pattern: <fasta containing adapters and sequences>:<maximum of allowed mismatches>:<accuracy for palindromic matches>:<accuracy for simple matches> (Example: NexteraPE-PE.fa:2:30:10). By default ILLUMINACLIP is deactivated." , type=str, action="store", default=False)
+arg1group = parser.add_mutually_exclusive_group(required=False)
+arg1group.add_argument('--slidingwindow', metavar='STR', help="Performs a sliding window trimming approach. Pattern: <window size>:<required quality> (Example: 4:15). By default slidingwindow 4.15 is used." , type=str, action="store")
+arg1group.add_argument('--maxinfo', metavar='STR', help="Performs an adaptive quality trimming approach. Pattern: <target length>:<strictness> (Example: 15:0.5). By default maxinfo is not used." , type=str, action="store")
+parser.add_argument('--leading', metavar='INT', help="Cut bases of the start of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
+parser.add_argument('--trailing', metavar='INT', help="Cut bases of the end of a read if below a given threshold quality. By default quality threshold is set to 3. Set 0 to turn deactivate." , type=int, action="store", default=3)
+parser.add_argument('--crop', metavar='INT', help="Cut read to a specific length by removing bases from the end. By default this is deactivated." , type=int, action="store", default=False)
+parser.add_argument('--headcrop', metavar='INT', help="Cut read to a specific length by removing bases from the start. By default this is deactivated." , type=int, action="store", default=False)
+parser.add_argument('--minlen', metavar='INT', help="Drop the read if it is below a defined length. By default minimal length is set to 36. Set 0 to deactivate." , type=int, action="store", default=36)
+parser.add_argument('--avgqual', metavar='INT', help="Drop the read if average quality is below a speciefied level. By default this is deactivated." , type=int, action="store", default=0)
+arg2group = parser.add_mutually_exclusive_group(required=False)
+arg2group.add_argument('--tophred33', help="Convert quality scores to Phred-33. By default this is deactivated." , action="store_true", default=False)
+arg2group.add_argument('--tophred64', help="Convert quality scores to Phred-64. By default this is deactivated." , action="store_true", default=False)
+
+args = parser.parse_args()
+
+
+if len(args.file) % float(2) != 0:
+    exit("ERROR: It is not possible to process odd number of files in PE mode")
+
+if args.slidingwindow == None and args.maxinfo == None:
+    args.slidingwindow = "4:15"
+
+#FUNCTIONS
+def die (msg=False, path = False, status=1):
+    '''Displays message (optional) and exits using the defined status code. If a path is given, this path is deleted (recursively)'''
+    if msg:
+        print(msg)
+    if path:
+        shutil.rmtree(path)
+    sys.exit(status)
+
+
+
+def formatColumns(rows, empty='', space=5):
+    '''formats nested list (1. level = rows; 2. level = columns) in equal-sized columns. fields containing integers or floats are aligned to the right'''
+
+    #allowed signs in numbers
+    allowed = ['+', '-', '.', ',', '%']
+
+    #str conversion
+    rows = [[str(x) for x in row] for row in rows]
+
+    #fill up rows to same number of columns (important for transposing)
+    maxlen = max([len(x) for x in rows])
+    [x.extend([empty]*(maxlen-len(x))) for x in rows]
+    align = []
+    output = [''] * len(rows)
+    for column in zip(*rows):
+      #detect numbers to align right
+      string = ''.join(column)
+      for i in allowed:
+        string = string.replace(i, '')
+      if string.strip('+-.,').isdigit():
+        align = 'r'
+      else:
+        align = 'l'
+
+      maxlen = max([len(x) for x in column])
+      for i in range(len(column)):
+          if align == 'r':
+              output[i] += ' ' * (maxlen - len(column[i])) + column[i] + ' ' * space
+          else:
+              output[i] += column[i] + ' ' * (maxlen - len(column[i])) + ' ' * space
+
+    return '\n'.join(output) + '\n'
+
+#START
+
+#create dir
+err = 1
+while err > 0:
+    timecode = time.strftime("%Y-%m-%d_%H-%M-%S")
+    targetpath = "BATCH_TRIM_" + timecode
+    if os.path.isdir(targetpath):
+        err += 1
+        time.sleep(1)
+    else:
+        os.mkdir(targetpath)
+        err = 0
+    if err == 30:
+        print("ERROR: giving up to find available output file name")
+        sys.exit(1)
+
+
+#working
+print("target path created:  " + targetpath)
+print(str(len(args.file)) + " files to process ...")
+
+#filing
+files = []
+for f in args.file:
+    files.append(f.name)
+    f.close()
+files.sort()
+
+#get version of trimmomatic (unfortunately there is no --version option available)
+try:
+    proc=subprocess.Popen(['apt-cache', 'policy', 'trimmomatic'], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+except subprocess.CalledProcessError:
+    print(subprocess.CalledProcessError)
+except OSError:
+    print(OSError)
+
+if proc.returncode > 0:
+  print(proc.communicate()[1])
+  die(targetdir)
+else:
+  out=proc.communicate()[0]
+  match = re.search('Installed: ([\d\.]*)', out)
+  if match == None:
+      print("trimmomatic version not recognized")
+      tversion='unknown'
+  else:
+      tversion= match.group(1)
+      print("trimmomatic version:  " + tversion)
+
+
+#start logging
+loghandle = open(targetpath + "/batch_trim.log", 'w')
+loghandle.write("BATCH TRIM\n")
+loghandle.write(logsep)
+log=[["version:", version], ['time:', timecode], ['result folder:', targetpath]]
+loghandle.write(formatColumns(log))
+
+loghandle.write("\nTRIMMOMATIC\n")
+loghandle.write(logsep)
+log=[["version:", tversion]]
+
+
+#create trimmomatic command line
+cmd = []
+
+if args.illuminaclip != False:
+    log.append(["ILLUMINACLIP:", args.illuminaclip])
+    cmd.append("ILLUMINACLIP:" + args.illuminaclip)
+else:
+    log.append(["ILLUMINACLIP:", 'off'])
+
+if args.slidingwindow != None:
+    log.append(["SLIDINGWINDOW:", args.slidingwindow])
+    cmd.append("SLIDINGWINDOW:" + args.slidingwindow)
+elif args.maxinfo != None:
+    log.append(["MAXINFO:", args.maxinfo])
+    cmd.append("MAXINFO:" + args.maxinfo)
+
+if args.leading > 0:
+    log.append(["LEADING:", args.leading])
+    cmd.append("LEADING:" + str(args.leading))
+else:
+    log.append(["LEADING:", 'off'])
+
+if args.trailing > 0:
+    log.append(["TRAILING:", args.trailing])
+    cmd.append("TRAILING:" + str(args.trailing))
+else:
+    log.append(["TRAILING:", 'off'])
+
+if args.crop != False:
+    log.append(["CROP:", args.crop])
+    loghandle.write(str(args.crop) + "\n")
+else:
+    log.append(["CROP:", 'off'])
+
+if args.headcrop != False:
+    log.append(["HEADCROP:", args.crop])
+    cmd.append("HEADCROP:" + str(args.headcrop))
+else:
+    log.append(["HEADCROP:", 'off'])
+
+if args.minlen > 0:
+    log.append(["MINLEN:", args.minlen])
+    cmd.append("MINLEN:" + str(args.minlen))
+else:
+    log.append(["MINLEN:", 'off'])
+
+if args.avgqual > 0:
+    log.append(["AVGQUAL:", args.avgqual])
+    cmd.append("AVGQUAL:" + str(args.avgqual))
+else:
+    log.append(["AVGQUAL:", 'off'])
+
+if args.tophred33 != False:
+    log.append(["to PHRED33 conversion:", 'on'])
+    cmd.append("TOPHRED33")
+else:
+    log.append(["to PHRED33 conversion:", 'off'])
+
+if args.tophred64 != False:
+    log.append(["to PHRED64 conversion:", 'on'])
+    cmd.append("TOPHRED64")
+else:
+    log.append(["to PHRED64 conversion:", 'off'])
+
+loghandle.write(formatColumns(log))
+
+#performing trimmomatic
+loghandle.write("\nTRIM RESULTS\n")
+loghandle.write(logsep)
+log = [["forward file", "reverse file", "detected enoding", "input read pairs", "both surviving", "forward only surviving", "reverse only surviving", "dropped", "forward paired read file", "forward unpaired read file", "reverse paired read file", "reverse unpaired read file"]]
+i = 0
+while i*2 < len(files):
+    print("PROCESSING FILE PAIR " + str(i+1) + " OF " + str(len(files)/2))
+    print("  forward file: " + files[i*2])
+    print("  reverse file: " + files[i*2+1])
+
+    ext=str(os.path.splitext(files[i*2]))
+    name=str(os.path.basename(files[i*2]))
+    name=name[:-(len(ext[1])+2)]
+    paired1=targetpath + '/' + name + '_paired.fq.gz'
+    unpaired1=targetpath + '/' + name + '_unpaired.fq.gz'
+
+    ext=str(os.path.splitext(files[i*2+1]))
+    name=str(os.path.basename(files[i*2+1]))
+    name=name[:-(len(ext[1])+2)]
+    paired2=targetpath + '/' + name + '_paired.fq.gz'
+    unpaired2=targetpath + '/' + name + '_unpaired.fq.gz'
+
+    cli = [cmdTrimPE, "-threads", str(args.t), files[i*2], files[i*2+1], paired1, unpaired1, paired2, unpaired2]
+    cli.extend(cmd)
+
+    print("  trimming...")
+    try:
+        proc=subprocess.Popen(cli, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except subprocess.CalledProcessError:
+        print(subprocess.CalledProcessError)
+    except OSError:
+        print(OSError)	
+
+    out=proc.communicate()[1] #unfortunatley trimmomatic prints all in stderr and is not using stdout
+
+    if proc.returncode > 0:
+        exit(1)
+    else:
+        out = out.split("\n")
+        phred = re.search('phred\d*', out[1])
+        matches = re.findall('\d+ *[\d\(\)%\.,]*', out[-3])
+        log.append([files[i*2], files[i*2+1], phred.group(0), matches[0], matches[1], matches[2], matches[3], matches[4], os.path.basename(paired1), os.path.basename(unpaired1), os.path.basename(paired2), os.path.basename(unpaired2)])
+    i += 1
+
+loghandle.write(formatColumns(log))
+
+#starting combineFastQC
+cli = False
+if args.fastqc:
+    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*.gz'
+elif args.fastqcpaired:	
+    cli = 'batchFastQC.py -t ' + str(args.t) + ' ' + targetpath + '/*_paired.fq.gz'
+
+if cli:
+    print(cli)
+    print("starting batchFastQC...\n\n")
+    try:
+        proc=subprocess.Popen(cli, shell=True)
+    except subprocess.CalledProcessError:
+        print(subprocess.CalledProcessError)
+    except OSError:
+        print(OSError)
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/trimmomatic.git



More information about the debian-med-commit mailing list