[med-svn] [rsem] 01/01: Imported Upstream version 1.2.31+dfsg

Sat Oct 15 09:17:53 UTC 2016

This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to annotated tag upstream/1.2.31+dfsg
in repository rsem.

commit fdf75d6c94a51ed3871944c89bf09bd029beb542
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date:   Sat Jun 4 04:33:45 2016 -0700

    Imported Upstream version 1.2.31+dfsg
---
 WHAT_IS_NEW               |   7 +
 rsem-calculate-expression |   6 +-
 rsem-gff3-to-gtf          | 344 ++++++++++++++++++++++++++++++++++------------
 rsem-prepare-reference    |   4 +
 rsem_perl_utils.pm        |  14 +-
 5 files changed, 284 insertions(+), 91 deletions(-)

diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW
index 3f0c1c6..f897484 100644
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,10 @@
+RSEM v1.2.31
+
+- Rewrote `rsem-gff3-to-gtf` to handle a more general set of GFF3 files 
+- Added safety checks to make sure poly(A) tails are not added to the reference when `--star` is set
+
+--------------------------------------------------------------------------------------------
+
 RSEM v1.2.30
 
 - Fixed a bug that can cause SAMtools sort to fail
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 808d839..d120bfa 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -5,7 +5,7 @@ use Pod::Usage;
 use File::Basename;
 use FindBin;
 use lib $FindBin::RealBin;
-use rsem_perl_utils qw(runCommand collectResults showVersionInfo getSAMTOOLS);
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
 
 use Env qw(@PATH);
 
@@ -167,7 +167,7 @@ pod2usage(-verbose => 2) if ($help == 1);
 
 if ($is_alignment) {
     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
-    pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k and --bowtie2-sensitivity-level cannot be set if input is SAM/BAM/CRAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 20 [...]
+    pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k, --bowtie2-sensitivity-level, --star, --star-path, and --star-output-genome-bam cannot be set if input is SAM/BAM/CRAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $ [...]
 }
 else {
     pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (!$paired_end && scalar(@ARGV) != 3 || $paired_end && scalar(@ARGV) != 4);    
@@ -226,6 +226,8 @@ if (((-e "$refName.ta") && !(-e "$refName.gt")) || (!(-e "$refName.ta") && (-e "
 
 $alleleS = (-e "$refName.ta") && (-e "$refName.gt");
 
+pod2usage(-msg => "RSEM reference cannot contain poly(A) tails if you want to use STAR aligner!", -exitval => 2, -verbose => 2) if ($star && (&hasPolyA("$refName.seq")));
+
 if ($genGenomeBamF) {
     open(INPUT, "$refName.ti");
     my $line = <INPUT>; chomp($line);
diff --git a/rsem-gff3-to-gtf b/rsem-gff3-to-gtf
index 13c64be..344d869 100755
--- a/rsem-gff3-to-gtf
+++ b/rsem-gff3-to-gtf
@@ -3,103 +3,275 @@
 import os
 import sys
 import argparse
-import re
+from operator import itemgetter
+
+type_gene = ["gene", "snRNA_gene", "transposable_element_gene", "ncRNA_gene", "telomerase_RNA_gene", 
+	"rRNA_gene", "tRNA_gene", "snoRNA_gene", "mt_gene", "miRNA_gene", "lincRNA_gene", "RNA", "VD_gene_segment"]
+type_transcript = ["transcript", "primary_transcript", "mRNA", "ncRNA", "tRNA", "rRNA", "snRNA", "snoRNA", "miRNA",
+	"pseudogenic_transcript", "lincRNA", "NMD_transcript_variant", "aberrant_processed_transcript",
+	"nc_primary_transcript", "processed_pseudogene", "mRNA_TE_gene"]
+type_exon = ["exon", "CDS", "five_prime_UTR", "three_prime_UTR", "UTR", "noncoding_exon", "pseudogenic_exon"]
+
+# can be either gene or transcript, need special treatment
+type_gene_or_transcript = ["pseudogene", "V_gene_segment", "C_gene_segment", "J_gene_segment", "processed_transcript"] 
+
 
 class HelpOnErrorParser(argparse.ArgumentParser):
-    def error(self, msg):
-        sys.stderr.write("{0}: error: {1}\n\n".format(os.path.basename(sys.argv[0]), msg))
-        self.print_help()
-        sys.exit(-1)
+	def error(self, msg):
+		sys.stderr.write("{0}: error: {1}\n\n".format(os.path.basename(sys.argv[0]), msg))
+		self.print_help()
+		sys.exit(-1)
+
 
 def my_assert(bool, msg):
-    if not bool:
-        sys.stderr.write(msg + "\n")
-        sys.exit(-1)
-
-def findTag(tag, attributes):
-    pos = attributes.find(tag + "=")
-    if pos < 0:
-        return None
-    pos += len(tag) + 1
-    rpos = attributes.find(";", pos)
-    if rpos < 0:
-        rpos = len(attributes)
-    return attributes[pos:rpos]
+	if not bool:
+		sys.stderr.write(msg + "\n")
+		try:
+			os.remove(args.output_GTF_file)
+		except OSError:
+			pass
+		sys.exit(-1)
+
+
+class Feature:
+	# def gen_type_dict():
+	def gen_type_dict(self):
+		my_dict = {}
+		for my_type in type_gene:
+			my_dict[my_type] = "gene"
+		for my_type in type_transcript:
+			my_dict[my_type] = "transcript"
+		for my_type in type_exon:
+			my_dict[my_type] = "exon"
+
+		for my_type in type_gene_or_transcript:
+			my_dict[my_type] = "gene_or_transcript"
+
+		return my_dict
+
+	# type_dict = gen_type_dict()
+
+	def __init__(self):
+		self.type_dict = self.gen_type_dict()
+
+	def parse(self, line, line_no):
+		""" line should be free of leading and trailing spaces """
+
+		self.line = line
+		self.line_no = line_no
+
+		fields = line.split('\t')
+		my_assert(len(fields) == 9, "Line {0} does not have 9 fields:\n{1}".format(self.line_no, self.line))		
+
+		self.seqid = fields[0]
+		self.source = fields[1]
+		self.original_type = fields[2]
+		self.feature_type = self.type_dict.get(fields[2], None)
+		self.start = int(fields[3])
+		self.end = int(fields[4])
+		self.strand = fields[6]
+		self.attributes = fields[8][:-1] if len(fields[8]) > 0 and fields[8][-1] == ';' else fields[8]
+
+	def parseAttributes(self):
+		self.attribute_dict = {}
+		for attribute in self.attributes.split(';'):
+			fields = attribute.split('=')
+			my_assert(len(fields) == 2, "Fail to parse attribute {0} of line {1}:\n{2}".format(attribute, self.line_no, self.line))
+			tag, value = fields
+			if tag == "Parent":
+				self.attribute_dict[tag] = value.split(',')
+			else:
+				self.attribute_dict[tag] = value
+
+	def getAttribute(self, tag, required = False):
+		value = self.attribute_dict.get(tag, None)
+		my_assert(not required or value != None, "Line {0} does not have attribute {1}:\n{2}".format(self.line_no, tag, self.line))
+		return value
+
+
+class Transcript:
+	def __init__(self, tid, feature):
+		self.tid = tid
+		self.tname = self.ttype = None
+		self.gid = self.gname = None
+		self.setT = False # if a transcript feature has been set
+
+		self.seqid = feature.seqid
+		# self.source = feature.source
+		self.source = None
+		self.strand = feature.strand
+
+		self.intervals = []
+
+	def setTranscript(self, feature):
+		my_assert(not self.setT, 
+			"Transcript {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(self.tid, feature.line_no, feature.line))
+		self.setT = True
+		parents = feature.getAttribute("Parent", True)
+		my_assert(len(parents) == 1, "Transcript {0} at line {1} has more than one parents:\n{2}".format(self.tid, feature.line_no, feature.line))
+		self.gid = parents[0]
+		self.tname = feature.getAttribute("Name")
+		self.ttype = feature.original_type
+		self.source = feature.source
+
+	def addExon(self, feature):
+		self.intervals.append((feature.start, feature.end))
+
+	def merge(self):
+		self.intervals.sort(key = itemgetter(0))
+		self.results = []
+		cstart, cend = self.intervals[0]
+		for start, end in self.intervals[1:]:
+			if cend + 1 >= start:
+				cend = max(cend, end)
+			else:
+				self.results.append((cstart, cend))
+				cstart = start
+				cend = end
+		self.results.append((cstart, cend))
+
+	def __iter__(self):
+		self.index = 0
+		return self
+
+	def next(self):
+		if self.index == len(self.results):
+			raise StopIteration
+		interval = self.results[self.index]
+		self.index += 1
+		return interval
+
+
+def getTranscript(tid, feature):
+	assert tid != None
+
+	pos = tid2pos.get(tid, None)
+	if pos == None:
+		transcript = Transcript(tid, feature)
+		tid2pos[tid] = len(transcripts)
+		transcripts.append(transcript)
+	else:
+		my_assert(pos >= 0, 
+			"Line {0} describes an already processed Transcript {1}:\n{2}".format(feature.line_no, tid, feature.line))
+		transcript = transcripts[pos]
+		my_assert(transcript.seqid == feature.seqid and transcript.strand == feature.strand, 
+				"Line {0}'s seqid/strand is not consistent with other records of transcript {1}:\n{2}".format(
+					feature.line_no, tid, feature.line))
+
+	return transcript
+
+def flush_out(fout):
+	global transcripts
+	global tid2pos
+	global num_trans
+	global patterns
+
+	for transcript in transcripts:
+		tid2pos[transcript.tid] = -1
+		if not transcript.setT or len(transcript.intervals) == 0 or (len(patterns) > 0 and transcript.ttype not in patterns):
+			continue
+
+		my_assert(transcript.gid in gid2gname, 
+			"Cannot recognize transcript {0}'s parent {1}, a gene feature might be missing.".format(transcript.tid, transcript.gid))
+
+		transcript.gname = gid2gname[transcript.gid]
+
+		transcript.merge()
+
+		output_string = "{0}\t{1}\texon\t{{0}}\t{{1}}\t.\t{2}\t.\tgene_id \"{3}\"; transcript_id \"{4}\";".format(
+			transcript.seqid, transcript.source, transcript.strand, transcript.gid, transcript.tid)
+		if transcript.gname != None:
+			output_string += " gene_name \"{0}\";".format(transcript.gname)
+		if transcript.tname != None:
+			output_string += " transcript_name \"{0}\";".format(transcript.tname)
+		output_string += "\n"
+
+		for start, end in transcript:
+			fout.write(output_string.format(start, end))
+
+		num_trans += 1
+
+	transcripts = []
+
+
 
 parser = HelpOnErrorParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description = "Convert GFF3 files to GTF files.")
 parser.add_argument("input_GFF3_file", help = "Input GFF3 file.")
 parser.add_argument("output_GTF_file", help = "Output GTF file.")
-parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted.", default = "mRNA")
-
+parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted, e.g. mRNA,rRNA", metavar = "<patterns>")
+parser.add_argument("--extract-sequences", help = "If GFF3 file contains reference sequences, extract them to the specified file", metavar = "<output.fa>")
 args = parser.parse_args()
 
-trans2gene = {} # tid -> gid
-gid2name = {} # gid -> gene name
-tid2name = {} # tid -> transcript name
+patterns = set()
+if args.RNA_patterns != None:
+	patterns = set(args.RNA_patterns.split(','))
 
-rgx = re.compile("[\t]+")
-rgx2 = re.compile("^(" + "|".join(args.RNA_patterns.split(",")) + ")$")
+line_no = 0
+feature = Feature()
 
-fin = open(args.input_GFF3_file, "r")
-fout = open(args.output_GTF_file, "w")
+gid2gname = {}
 
-line_no = 0
+tid2pos = {}
+transcripts = []
+
+num_trans = 0
+
+reachFASTA = False
+
+with open(args.input_GFF3_file) as fin:
+	fout = open(args.output_GTF_file, "w")
+
+	for line in fin:
+		line = line.strip()
+		line_no += 1
+		if line_no % 100000 == 0:
+			print("Loaded {} lines".format(line_no))
+
+		if line.startswith("##FASTA"):
+			reachFASTA = True
+			break
+
+		if line.startswith("###"):
+			flush_out(fout)
+			continue
+
+		if line.startswith("#"):
+			continue
+
+		feature.parse(line, line_no)
+		if feature.feature_type == None:
+			continue
+		feature.parseAttributes()
+
+		if feature.feature_type == "gene_or_transcript":
+			parent = feature.getAttribute("Parent")
+			if parent == None:
+				feature.feature_type = "gene"
+			else:
+				feature.feature_type = "transcript"
+
+		if feature.feature_type == "gene":
+			gid = feature.getAttribute("ID", True)
+			my_assert(gid not in gid2gname, 
+				"Gene {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(gid, feature.line_no, feature.line))
+			gid2gname[gid] = feature.getAttribute("Name")
+		elif feature.feature_type == "transcript":
+			transcript = getTranscript(feature.getAttribute("ID", True), feature)
+			transcript.setTranscript(feature)
+		else:
+			assert feature.feature_type == "exon"
+			for parent in feature.getAttribute("Parent", True):
+				transcript = getTranscript(parent, feature)
+				transcript.addExon(feature)
+
+	flush_out(fout)
+	fout.close()
+	
+	print("GTF file is successully generated.")
+	print("There are {0} transcripts contained in the generated GTF file.".format(num_trans))	
 
-for line in fin:
-    line = line.rstrip("\r\n") # remove return characters
-    line_no += 1
-    
-    if line.startswith("##FASTA"):
-        break
-    if line.startswith("#"):
-        if line.startswith("###"):
-            # clear all dictionaries
-            trans2gene = {}
-            gid2name = {}
-            tid2name = {}
-        continue
-
-    arr = rgx.split(line)
-    my_assert(len(arr) == 9, "Line {0} does not have 9 fields:\n{1}".format(line_no, line))
-
-    if arr[2] == "exon":
-        parent = findTag("Parent", arr[8])
-        my_assert(parent != None, "Line {0} does not have a Parent tag:\n{1}".format(line_no, line))
-
-        tids = parent.split(",")
-        for tid in tids:
-            if tid not in trans2gene:
-                continue
-            gid = trans2gene[tid]
-            fout.write("{0}\tgene_id \"{1}\"; transcript_id \"{2}\";".format("\t".join(arr[:-1]), gid, tid))
-            name = gid2name.get(gid, None)
-            if name != None:
-                fout.write(" gene_name \"{0}\";".format(name))
-            name = tid2name.get(tid, None)
-            if name != None:
-                fout.write(" transcript_name \"{0}\";".format(name))
-            fout.write("\n")
-        
-    elif arr[2] == "gene":
-        gid = findTag("ID", arr[8])
-        name = findTag("Name", arr[8])
-
-        my_assert(gid != None, "Line {0} does not have a ID tag:\n{1}".format(line_no, line))
-        
-        if name != None:
-            gid2name[gid] = name
-                
-    elif rgx2.search(arr[2]) != None:
-        tid = findTag("ID", arr[8])
-        gid = findTag("Parent", arr[8])
-        name = findTag("Name", arr[8])
-
-        my_assert(tid != None, "Line {0} does not have a ID tag:\n{1}".format(line_no, line))
-        my_assert(gid != None, "Line {0} does not have a Parent tag:\n{1}".format(line_no, line))
-                    
-        trans2gene[tid] = gid
-        if name != None:
-            tid2name[tid] = name
-        
-fin.close()
-fout.close()
+	if reachFASTA and args.extract_sequences != None:
+		with open(args.extract_sequences, "w") as fout:
+			for line in fin:
+				fout.write(line)
+		print("FASTA file is successfully generated.")
\ No newline at end of file
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
index 66e0e81..ed21552 100755
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -63,9 +63,11 @@ pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutuall
 pod2usage(-msg => "--gtf and --gff3 are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($gff3F ne ""));
 pod2usage(-msg => "--gtf/--gff3 and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if ((($gtfF ne "") || ($gff3F ne "")) && ($alleleMappingF ne ""));
 pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+pod2usage(-msg => "No poly(A) tail should be added if --star is set!", -exitval => 2, -verbose => 2) if ($star && $polyA);
 
 if (!$bowtie && ($bowtie_path ne "")) { print "Warning: If Bowtie is not used, no need to set --bowtie-path option!\n"; }
 if (!$bowtie2 && ($bowtie2_path ne "")) { print "Warning: If Bowtie 2 is not used, no need to set --bowtie2-path option!\n"; }
+if (!$star && ($star_path ne "")) { print "Warning: If STAR is not used, no need to set --star-path option!\n"; }
 
 my @list = split(/,/, $ARGV[0]);
 my $size = scalar(@list);
@@ -142,6 +144,8 @@ if ($bowtie2) {
 }
 
 if ($star) {
+    pod2usage(-msg => "Sorry, if you want RSEM run STAR for you, you must provide the genome sequence and associated GTF annotation.", -exitval => 2, -verbose => 2) if ($gtfF eq "");
+    
     my $out_star_genome_path = dirname($ARGV[1]);
     $command = $star_path . "STAR " .
                         " --runThreadN $star_nthreads " .
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm
index c6a5d94..783ee26 100644
--- a/rsem_perl_utils.pm
+++ b/rsem_perl_utils.pm
@@ -7,9 +7,9 @@ use strict;
 require Exporter;
 our @ISA = qw(Exporter);
 our @EXPORT = qw(runCommand);
-our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS);
+our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
 
-my $version = "RSEM v1.2.30"; # Update version info here
+my $version = "RSEM v1.2.31"; # Update version info here
 my $samtools = "samtools-1.3"; # If update to another version of SAMtools, need to change this
 
 # command, {err_msg}
@@ -95,7 +95,15 @@ sub showVersionInfo {
 }
 
 sub getSAMTOOLS {
-    return $samtools
+    return $samtools;
+}
+
+sub hasPolyA {
+    open(INPUT, $_[0]);
+    my $line = <INPUT>; chomp($line);
+    close(INPUT);
+    my ($fullLen, $totLen) = split(/ /, $line);
+    return $fullLen < $totLen;
 }
 
 1;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/rsem.git