[med-svn] [metabit] 02/04: New upstream version 0.0+20160923

Tue Feb 14 10:52:55 UTC 2017

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository metabit.

commit 028b2292853c3d69629a5966d59266ab5e4a3666
Author: Andreas Tille <tille at debian.org>
Date:   Tue Feb 14 09:27:20 2017 +0100

    New upstream version 0.0+20160923
---
 nodes/samtools.py                      | 16 ----------
 nodes/tools/get_stats.py               | 14 +++++++--
 nodes/tools/statax_Rmodule/doBarplot.R | 29 +++++++++++++++---
 nodes/tools/statax_Rmodule/doClust.R   | 54 +++++++++++++++++++++-------------
 nodes/tools/statax_Rmodule/doDiv.R     | 44 ++++++++++++++++++++++-----
 nodes/tools/statax_Rmodule/doHeatmap.R | 23 +++++++++++----
 nodes/tools/statax_Rmodule/doPcoa.R    | 33 +++++++++++++--------
 7 files changed, 144 insertions(+), 69 deletions(-)

diff --git a/nodes/samtools.py b/nodes/samtools.py
index df7c1ef..3e54c22 100644
--- a/nodes/samtools.py
+++ b/nodes/samtools.py
@@ -13,15 +13,6 @@ from pypeline.atomiccmd.builder import \
 import pypeline.common.versions as versions
 
 
-# _VERSION_REGEX = r"Version: (\d+)\.(\d+)(?:\.(\d+))?"
-# SAMTOOLS_VERSION = versions.Requirement(call=("samtools",),
-#                                         search=_VERSION_REGEX,
-#                                         checks=(0, 1, 18))
-
-## v0.2.0 was the pre-release version of v1.0, and lacks required features
-#_COMMON_CHECK = versions.And(versions.GE(0, 1, 18),
-#                             versions.LT(0, 2, 0))
-
 _COMMON_CHECK = versions.GE(1, 1, 0)
 
 SAMTOOLS_VERSION = versions.Requirement(
@@ -39,13 +30,6 @@ class SortSamNode(CommandNode):
                                 IN_SAM = input_file,
                                 OUT_STDOUT = AtomicCmd.PIPE,
                                 CHECK_VERSION = SAMTOOLS_VERSION)
-        # old samtoolsx0.1
-        # cmd_sortbam = AtomicCmd(["samtools", "sort", "-f", "-", "%(OUT_BAM)s"],
-        #                         IN_STDIN = cmd_sam2bam,
-        #                         OUT_BAM = output_file)
-        # new samtoolsx1
-        ## samtools view -buS reads.singles.bowtie2out.sam | samtools sort -T temp_root -o here.bam -O bam -
-
         cmd_sortbam = AtomicCmd(["samtools", "sort",
                                  "-T", "%(TEMP_OUT_PREFIX)s",
                                  "-o", "%(OUT_BAM)s",
diff --git a/nodes/tools/get_stats.py b/nodes/tools/get_stats.py
index bfe6934..52f7125 100755
--- a/nodes/tools/get_stats.py
+++ b/nodes/tools/get_stats.py
@@ -13,7 +13,8 @@ import re
 import sys
 import pysam
 import os.path
-
+import shlex
+from subprocess import check_output
 from glob import glob
 
 
@@ -96,7 +97,16 @@ def get_bowtie2stats(file_):
     return tot, mapped
 
 
-def get_readcount(bamfile):
+
+def get_readcount(bamfile, cmd="samtools view -c {}".format):
+    if bamfile:
+        readcount = check_output(shlex.split(cmd(bamfile)))
+        return(int(readcount.rstrip("\n")), )
+    else:
+        return (None, )
+    
+
+def _get_readcount(bamfile):
     if bamfile:
         readcount = pysam.view("-c", bamfile)[0]
         return (int(readcount),)
diff --git a/nodes/tools/statax_Rmodule/doBarplot.R b/nodes/tools/statax_Rmodule/doBarplot.R
index ffbab72..32f5441 100755
--- a/nodes/tools/statax_Rmodule/doBarplot.R
+++ b/nodes/tools/statax_Rmodule/doBarplot.R
@@ -20,18 +20,28 @@ doBarplot <- function(d, out.pdf=NULL,
 
   if ( sample.order=="none" || ncol(d)<2 ) {
     sample.order <- seq_len(ncol(d))
+    in.bp <- d
   } else if (sample.order=="alpha") {
     sample.order <- order(colnames(d))
+    in.bp <- d[,sample.order]
   } else {
     sample.order <- hclust(dist(t(d), method=sample.order))$order
+    in.bp <- d[,sample.order]
   }
-  in.bp <- d[,sample.order]
+
   color.gradient <- gglike_colors(nrow(d))
   taxa.names <- rownames(in.bp)
   legend.order <- seq_along(taxa.names)
 
   if (!is.null(uncl_col)) {
-    in.bp <- rbind(in.bp, unclassified=100-colSums(in.bp))
+    if(ncol(in.bp) < 2){
+      in.bp <- rbind(in.bp, "unclassified"=100-sum(in.bp[,1]))
+    } else {
+      in.bp <- rbind(in.bp, "unclassified"=100-colSums(in.bp))
+    }
+
+    # #in.bp <- rbind(in.bp, unclassified=100-colSums(in.bp))
+
     color.gradient <- c(color.gradient, uncl_col)
     taxa.names <- c(sort(taxa.names), "unclassified")
     legend.order <- match(taxa.names, sort(taxa.names))
@@ -114,10 +124,21 @@ if( !interactive() ) {
   d <- read.table(cl$args[1], row.names=1, sep='\t', as.is=TRUE)
   if( row.names(d)[1] == "ID" ) {
     names(d) <- d[1,]
-    d <- d[-1,]
+    if(ncol(d) < 2){
+      d <- subset(d, row(d)>=2)
+    } else {
+      d <- d[-1,]
+    }
   }
   taxa.names <- rownames(d)
-  d <- apply(d, 2, as.numeric)
+
+  if(ncol(d) < 2){
+    d[,1] <- as.numeric(d[,1])
+  } else {
+    d <- apply(d, 2, as.numeric)
+  }
+  
+
   rownames(d) <- taxa.names
 
   doBarplot(d, out.pdf=cl$args[2],
diff --git a/nodes/tools/statax_Rmodule/doClust.R b/nodes/tools/statax_Rmodule/doClust.R
index f88cd81..ff77485 100755
--- a/nodes/tools/statax_Rmodule/doClust.R
+++ b/nodes/tools/statax_Rmodule/doClust.R
@@ -8,13 +8,24 @@ source(paste0(statax_dir, "/pvegclust-internal.R"))
 source(paste0(statax_dir, "/pvegclust.R"))
 
 read.abundance.table <- function(filename) {
-  d <- read.table(cl$args[1], row.names=1, sep='\t', as.is=TRUE)
+  d <- read.table(filename, row.names=1, sep='\t', as.is=TRUE)
   if( row.names(d)[1] == "ID" ) {
     names(d) <- d[1,]
-    d <- d[-1,]
+    if(ncol(d) < 2){
+      d <- subset(d, row(d)>=2)
+    } else {
+      d <- d[-1,]
+    }
   }
   taxa.names <- rownames(d)
-  d <- apply(d, 2, as.numeric)
+
+  if(ncol(d) < 2){
+    d[,1] <- as.numeric(d[,1])
+  } else {
+    d <- apply(d, 2, as.numeric)
+  }
+  
+
   rownames(d) <- taxa.names
   return(d)
 }
@@ -33,24 +44,6 @@ doClust <- function(d, out.clust=NULL, out.clust.pdf=NULL, out.clust.tsv=NULL,
                             paste0(out.clust,".RData"),
                             out.clust.RData)
   }
-  if (ncol(d) < 3)
-    msg <- c("Error: hierarchical clustering cannot be done with less than 3 samples",
-            paste0("(Input file: ", cl$args[1], ")"))
-  if (nrow(d) < 3)
-    msg <- c("Error: hierarchical clustering cannot be done with less than 3 objects (taxa)",
-            paste0("(Input file: ", cl$args[1], ")"))
-  if (exists("msg")) {
-    write(msg, file=stderr())
-    if( !interactive() ) {
-      pdf(out.clust.pdf)
-      plot.new()
-      text(0, c(1, .95), msg, adj=0, col='red')
-      invisible(dev.off())
-      write(msg, file=out.clust.tsv)
-      save(msg, file=out.clust.RData)
-      quit(save="no", status=0)
-    }
-  }
 
   if (ncores>1) {
     pvclust.func <- function(...) {
@@ -125,6 +118,25 @@ if( !interactive() ) {
 
   d <- read.abundance.table(cl$args[1])
 
+  if (ncol(d) < 3)
+    msg <- c("Error: hierarchical clustering cannot be done with less than 3 samples",
+            paste0("(Input file: ", cl$args[1], ")"))
+  if (nrow(d) < 3)
+    msg <- c("Error: hierarchical clustering cannot be done with less than 3 objects (taxa)",
+            paste0("(Input file: ", cl$args[1], ")"))
+  if (exists("msg")) {
+    write(msg, file=stderr())
+    if( !interactive() ) {
+      pdf(cl$options$pdf)
+      plot.new()
+      text(0, c(1, .95), msg, adj=0, col='red')
+      invisible(dev.off())
+      write(msg, file=cl$options$tsv)
+      save(msg, file=cl$options$RData)
+      quit(save="no", status=0)
+    }
+  }
+  
   doClust(d, out.clust=cl$options$basename,
           out.clust.pdf=cl$options$pdf,
           out.clust.tsv=cl$options$tsv,
diff --git a/nodes/tools/statax_Rmodule/doDiv.R b/nodes/tools/statax_Rmodule/doDiv.R
index 4d25e77..3ad347f 100755
--- a/nodes/tools/statax_Rmodule/doDiv.R
+++ b/nodes/tools/statax_Rmodule/doDiv.R
@@ -4,10 +4,19 @@ read.abundance.table <- function(filename) {
   d <- read.table(filename, row.names=1, sep='\t', as.is=TRUE)
   if( row.names(d)[1] == "ID" ) {
     names(d) <- d[1,]
-    d <- d[-1,]
+    if(ncol(d) < 2){
+      d <- subset(d, row(d)>=2)
+    } else {
+      d <- d[-1,]
+    }
   }
   taxa.names <- rownames(d)
-  d <- apply(d, 2, as.numeric)
+
+  if(ncol(d) < 2){
+    d[,1] <- as.numeric(d[,1])
+  } else {
+    d <- apply(d, 2, as.numeric)
+  }
   rownames(d) <- taxa.names
   return(d)
 }
@@ -20,15 +29,21 @@ my.write.table <- function(Table, file, sep="\t", quote=F) {
 
 split.table <- function(Table, rm.strains=TRUE,
                         default.taxlevels=c("k", "p", "c", "o", "f", "g", "s")) {
-    if( rm.strains ) Table <- Table[!grepl('\\|t__', row.names(Table)),]
 
+    if( rm.strains ) {
+      Table <- subset(Table, !grepl('\\|t__', row.names(Table)))
+      # Table <- Table[!grepl('\\|t__', row.names(Table)),]
+    }
     sample_names <- names(Table)
     taxa_names   <- strsplit(row.names(Table), '|', fixed=T)
     taxlevel   <- sapply(taxa_names, function(e) default.taxlevels[length(e)])
     lasttaxon  <- sapply(taxa_names, tail, 1) 
+      ## Data1 <- sapply(default.taxlevels, function(x) Table[taxlevel==x,],
+      ##              simplify=F, USE.NAMES=T)
 
-    Data <- sapply(default.taxlevels, function(x) Table[taxlevel==x,],
+    Data <- sapply(default.taxlevels, function(x) subset(Table, taxlevel==x),
                    simplify=F, USE.NAMES=T)
+    Data
     #warning: you should check that the row.names do not contain duplicated taxa. 
 }
 
@@ -40,6 +55,7 @@ filter.table <- function(Data) {
             colnames(Data$k)[unclassified]),
           file=stderr())
     Data <- sapply(Data, `[`, TRUE, !unclassified)
+    Data
   } else{
     write("No samples are 100% unclassified", file=stderr())
     Data
@@ -48,7 +64,7 @@ filter.table <- function(Data) {
 
 doDiv <- function(Data, index="shannon") {
   suppressPackageStartupMessages(require(vegan, quietly=T))
-  diversities <- sapply(Data, diversity, index=cl$options$index, MARGIN=2,
+  diversities <- sapply(Data, diversity, index=index, MARGIN=2,
                         USE.NAMES=T)
   return(diversities)
 }
@@ -79,10 +95,24 @@ if( !interactive() ) {
   default.taxlevels <- unlist(strsplit(cl$options$taxlevels, ""))
   Data <- split.table(Table, rm.strains=cl$options$rmstrains,
                       default.taxlevels=default.taxlevels)
-  Data <- filter.table(Data)
 
+  Data <- filter.table(Data)
+  # diversity cannot calculate the distance when only one row, so setting to zero and add a dummyrow
+  for (d in names(Data)){
+    if(dim(Data[[d]])[1]<2){
+      Data[[d]] = Data[[d]] * 0
+      Data[[d]] <-rbind(Data[[d]], 0)
+    }
+  }
   diversities <- doDiv(Data, index=cl$options$index)
-  
+
+  if(is.null(dim(diversities))){
+    diversities = t(as.data.frame(diversities))
+    rownames(diversities) = colnames(Data[[1]])[1] # Get sample name
+  } else{
+    
+  }
+
   out.div <- stdout()
   if( length(cl$args) == 2 ) out.div <- cl$args[2]
   my.write.table(diversities, file=out.div)
diff --git a/nodes/tools/statax_Rmodule/doHeatmap.R b/nodes/tools/statax_Rmodule/doHeatmap.R
index 43e5dec..317f4c5 100755
--- a/nodes/tools/statax_Rmodule/doHeatmap.R
+++ b/nodes/tools/statax_Rmodule/doHeatmap.R
@@ -1,13 +1,23 @@
 #!/usr/bin/env Rscript
 
 read.abundance.table <- function(filename) {
-  d <- read.table(cl$args[1], row.names=1, sep='\t', as.is=TRUE)
+  d <- read.table(filename, row.names=1, sep='\t', as.is=TRUE)
   if( row.names(d)[1] == "ID" ) {
     names(d) <- d[1,]
-    d <- d[-1,]
+    if(ncol(d) < 2){
+      d <- subset(d, row(d)>=2)
+    } else {
+      d <- d[-1,]
+    }
   }
   taxa.names <- rownames(d)
-  d <- apply(d, 2, as.numeric)
+
+  if(ncol(d) < 2){
+    d[,1] <- as.numeric(d[,1])
+  } else {
+    d <- apply(d, 2, as.numeric)
+  }
+
   rownames(d) <- taxa.names
   return(d)
 }
@@ -22,11 +32,12 @@ doHeatmap <- function(d, out.pdf=NULL, graph.title=NULL, taxon.title="Taxon",
   taxa.order    <- hclust(dist(d))$order
   if( ncol(d)>2 ) {
     samples.order <- hclust(dist(t(d)))$order
+    heatmap.data  <- d[taxa.order, samples.order]
   } else { 
-    samples.order <- seq_len(ncol(d))
+    # samples.order <- seq_len(ncol(d))
+    heatmap.data  <- d[taxa.order,,drop=FALSE]   # drop=FALSE to keep data.frame
   }
-    
-  heatmap.data  <- d[taxa.order, samples.order]
+
 
   heatmap.data <- melt(t(heatmap.data), varnames=c("sample", taxon.title),
                        value.name="abundance")
diff --git a/nodes/tools/statax_Rmodule/doPcoa.R b/nodes/tools/statax_Rmodule/doPcoa.R
index 4d8b7d0..cb493b2 100755
--- a/nodes/tools/statax_Rmodule/doPcoa.R
+++ b/nodes/tools/statax_Rmodule/doPcoa.R
@@ -40,19 +40,6 @@ doPCoA <- function(d,  # column names will be used for the distance
                          paste0(out.pcoa,".tsv"),
                          out.pcoa.tsv)
 
-  if (nrow(d) <= 2) {
-    msg <- c("Error: PCoA cannot be done with less than 3 columns",
-             paste0("(Input file: ", cl$args[1], ")"))
-    write(msg, file=stderr())
-    if( !interactive() ) {
-      pdf(out.pcoa.pdf)
-      plot.new()
-      text(0, c(1, .95), msg, adj=0, col='red')
-      invisible(dev.off())
-      write(msg, file=out.pcoa.tsv)
-      quit(save="no", status=0)
-    }
-  }
 
   require(ape, quietly=T)
   suppressPackageStartupMessages(require(vegan, quietly=T))
@@ -121,6 +108,10 @@ doPCoA <- function(d,  # column names will be used for the distance
   legend(x = par("usr")[2], y = par("usr")[4], legend = legend.text,
          pch=legend.pch, col=legend.out, pt.bg=legend.in, text.col=text.col,
          bty='n', xpd=TRUE, xjust=0, cex=.8, pt.cex=1.5, ncol=legend.cols)
+
+#  number_of_samples = 37  ## uncomment and replace XX with number of the samples or libs analyzed.
+#  point.labels <- row.names(pcoa.result$vectors)[1:number_of_samples]
+#  text(x=x.pcoa.r[1:number_of_samples], y=y.pcoa.r[1:number_of_samples], labels=point.labels, font=1, cex=0.2, pos=3, col="black")
   
   if (!is.interactive) invisible(dev.off())
 }
@@ -170,8 +161,24 @@ if( !interactive() ) {
                          option_list=option_list)
   cl <- parse_args(Parser, positional_arguments=1)
 
+ 
   if (cl$args == "-") cl$args <- "stdin"
   d <- read.table(cl$args, row.names=1, sep='\t', as.is=TRUE)
+
+  if (ncol(d) <= 2 ) {
+    msg <- c("Error: PCoA cannot be done with less than 3 samples",
+             paste0("(Input file: ", cl$args[1], ")"))
+    write(msg, file=stderr())
+    if( !interactive() ) {
+      pdf(cl$options$pdf)
+      plot.new()
+      text(0, c(1, .95), msg, adj=0, col='red')
+      invisible(dev.off())
+      write(msg, file=cl$options$tsv)
+      quit(save="no", status=0)
+    }
+  }
+
   if( row.names(d)[1] == "ID" ) {
     names(d) <- d[1,]
     d <- d[-1,]

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/metabit.git