[med-svn] [r-bioc-deseq2] 03/05: Imported Upstream version 1.12.3

Michael Crusoe misterc-guest@moszumanska.debian.org
Tue Jun 28 16:22:41 UTC 2016


This is an automated email from the git hooks/post-receive script.

misterc-guest pushed a commit to branch master
in repository r-bioc-deseq2.

commit 16b20a8e636e3f576f54f0f42a3079010065ac5e
Author: Michael R. Crusoe <crusoe@ucdavis.edu>
Date:   Mon Jun 27 13:42:25 2016 -0700

    Imported Upstream version 1.12.3
---
 DESCRIPTION                               |  15 +-
 NAMESPACE                                 |  37 ++-
 NEWS                                      |  51 ++-
 R/AllClasses.R                            |  89 ++++--
 R/core.R                                  |  40 ++-
 R/helper.R                                | 169 ++--------
 R/methods.R                               |  56 ++--
 R/plots.R                                 |  11 +-
 R/results.R                               | 217 +++++++------
 R/vst.R                                   |  89 +++++-
 build/vignette.rds                        | Bin 228 -> 229 bytes
 inst/doc/DESeq2.R                         | 496 ++++++++++++++++++++++++++++++
 inst/doc/DESeq2.Rnw                       | 372 ++++++++++++++++++++--
 inst/doc/DESeq2.pdf                       | Bin 642417 -> 700965 bytes
 inst/script/deseq2.R                      | 322 -------------------
 inst/script/runScripts.R                  |  37 ++-
 inst/script/simulateCluster.R             |   5 +-
 inst/script/simulateDE.R                  |  18 +-
 inst/script/simulateLFCAccuracy.R         |   5 +-
 inst/script/simulateOutliers.R            |   5 +-
 inst/script/simulation.Rmd                |  11 +-
 inst/script/simulation.pdf                | Bin 377985 -> 394022 bytes
 {vignettes => inst/script}/vst.nb         |   0
 {vignettes => inst/script}/vst.pdf        | Bin
 man/DESeq.Rd                              |   2 +-
 man/DESeq2-package.Rd                     |   2 +-
 man/DESeqDataSet.Rd                       |  17 +-
 man/estimateSizeFactors.Rd                |  19 +-
 man/fpkm.Rd                               |   9 +-
 man/fpm.Rd                                |   3 +-
 man/nbinomWaldTest.Rd                     |   3 +-
 man/normalizeGeneLength.Rd                |  96 +-----
 man/plotMA.Rd                             |  17 +-
 man/results.Rd                            |  67 ++--
 man/vst.Rd                                |  43 +++
 src/DESeq2.cpp                            |  16 +-
 tests/testthat/test_construction_errors.R |   1 +
 tests/testthat/test_counts_input.R        |   3 +-
 tests/testthat/test_custom_filt.R         |  22 +-
 tests/testthat/test_edge_case.R           |   2 +-
 tests/testthat/test_gene_length.R         |  29 --
 tests/testthat/test_model_matrix.R        |  10 +
 tests/testthat/test_nbinomWald.R          |   7 +
 tests/testthat/test_optim.R               |   6 +-
 tests/testthat/test_outlier.R             |   6 +
 tests/testthat/test_results.R             |   2 +-
 tests/testthat/test_rlog.R                |   2 +
 tests/testthat/test_size_factor.R         |   1 +
 tests/testthat/test_tximport.R            |  22 ++
 tests/testthat/test_vst.R                 |  15 +-
 vignettes/DESeq2.Rnw                      | 372 ++++++++++++++++++++--
 vignettes/library.bib                     |  73 +++++
 52 files changed, 2010 insertions(+), 902 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index dd067d4..7d7c879 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: DESeq2
 Type: Package
 Title: Differential gene expression analysis based on the negative
         binomial distribution
-Version: 1.10.1
+Version: 1.12.3
 Author: Michael Love (HSPH Boston), Simon Anders, Wolfgang Huber (EMBL
     Heidelberg)
 Maintainer: Michael Love <michaelisaiahlove@gmail.com>
@@ -13,14 +13,15 @@ Description: Estimate variance-mean dependence in count data from
 License: LGPL (>= 3)
 VignetteBuilder: knitr
 Imports: BiocGenerics (>= 0.7.5), Biobase, BiocParallel, genefilter,
-        methods, locfit, geneplotter, ggplot2, Hmisc
-Depends: S4Vectors, IRanges, GenomicRanges, SummarizedExperiment (>=
-        0.2.0), Rcpp (>= 0.10.1), RcppArmadillo (>= 0.3.4.4)
+        methods, locfit, geneplotter, ggplot2, Hmisc, Rcpp (>= 0.11.0)
+Depends: S4Vectors (>= 0.9.25), IRanges, GenomicRanges,
+        SummarizedExperiment (>= 1.1.6)
 Suggests: testthat, knitr, BiocStyle, vsn, pheatmap, RColorBrewer,
-        airway, pasilla (>= 0.2.10), DESeq
+        airway, IHW, tximport, tximportData, readr, pasilla (>=
+        0.2.10), DESeq
 LinkingTo: Rcpp, RcppArmadillo
 biocViews: Sequencing, ChIPSeq, RNASeq, SAGE, DifferentialExpression,
-        GeneExpression
+        GeneExpression, Transcription
 RoxygenNote: 5.0.1
 NeedsCompilation: yes
-Packaged: 2015-12-21 03:33:03 UTC; biocbuild
+Packaged: 2016-05-26 03:48:34 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
index 1824d77..81f2bcf 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ export(DESeq)
 export(DESeqDataSet)
 export(DESeqDataSetFromHTSeqCount)
 export(DESeqDataSetFromMatrix)
+export(DESeqDataSetFromTximport)
 export(DESeqResults)
 export(DESeqTransform)
 export(collapseReplicates)
@@ -40,6 +41,7 @@ export(rlog)
 export(rlogTransformation)
 export(summary.DESeqResults)
 export(varianceStabilizingTransformation)
+export(vst)
 exportClasses(DESeqDataSet)
 exportClasses(DESeqResults)
 exportClasses(DESeqTransform)
@@ -67,7 +69,6 @@ import(BiocParallel)
 import(GenomicRanges)
 import(IRanges)
 import(Rcpp)
-import(RcppArmadillo)
 import(S4Vectors)
 import(SummarizedExperiment)
 import(methods)
@@ -81,6 +82,40 @@ importFrom(ggplot2,geom_point)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,xlab)
 importFrom(ggplot2,ylab)
+importFrom(graphics,axis)
+importFrom(graphics,hist)
+importFrom(graphics,plot)
+importFrom(graphics,points)
 importFrom(locfit,locfit)
+importFrom(stats,Gamma)
+importFrom(stats,as.formula)
 importFrom(stats,coef)
+importFrom(stats,coefficients)
+importFrom(stats,df)
+importFrom(stats,dnbinom)
+importFrom(stats,dnorm)
+importFrom(stats,formula)
+importFrom(stats,glm)
+importFrom(stats,loess)
+importFrom(stats,lowess)
+importFrom(stats,model.matrix)
+importFrom(stats,optim)
+importFrom(stats,p.adjust)
+importFrom(stats,pchisq)
+importFrom(stats,pnorm)
+importFrom(stats,prcomp)
+importFrom(stats,predict)
+importFrom(stats,pt)
+importFrom(stats,qf)
+importFrom(stats,qnorm)
+importFrom(stats,rchisq)
+importFrom(stats,relevel)
+importFrom(stats,rnbinom)
+importFrom(stats,rnorm)
+importFrom(stats,runif)
+importFrom(stats,splinefun)
+importFrom(stats,terms)
+importFrom(stats,terms.formula)
+importFrom(utils,packageVersion)
+importFrom(utils,read.table)
 useDynLib(DESeq2)
diff --git a/NEWS b/NEWS
index 05c62b1..ae01c26 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,53 @@
-CHANGES IN VERSION 1.10.1
+CHANGES IN VERSION 1.12.3
 -------------------------
 
-    o Change import of ggplot2 to selective import, to avoid collisions
-      with BiocGenerics.
+    o Fixed bug: fpm() and fpkm() for tximport.
+    o Fixed bug: normalization factors and VST.
+    o Added an error if tximport lengths contain zeros.
+    o Added an error if user-supplied model matrices are not full rank.
+    o More helpful error message for a constant factor in the design.
+
+CHANGES IN VERSION 1.12.0
+-------------------------
+
+    o Added DESeqDataSetFromTximport() to import
+      counts using tximport.
+    o Added vst(), a fast wrapper for the VST.
+    o Added support for IHW p-value adjustment.
+
+CHANGES IN VERSION 1.11.42
+--------------------------
+
+    o Update summary() to be IHW-results-aware.
+
+    o Small change to fitted mu values to improve fit stability
+      when counts are very low. Inference for high count genes
+      is not affected.
+
+    o Galaxy script inst/script/deseq2.R moves to Galaxy repo.
+
+CHANGES IN VERSION 1.11.33
+--------------------------
+
+    o Changed the 'filterFun' API to accommodate IHW:
+      independent hypothesis weighting in results();
+      see the vignette for example code.
+      Thanks to Nikolaos Ignatiadis, maintainer of the IHW package.
+
+CHANGES IN VERSION 1.11.18
+--------------------------
+
+    o Added a function vst(), which is a fast wrapper for
+      varianceStabilizingTransformation(). The speed-up
+      is accomplished by subsetting to a smaller number
+      of genes for the estimation of the dispersion trend.
+
+CHANGES IN VERSION 1.11.5
+-------------------------
+
+    o Added functionality to import estimated counts and
+      average transcript length offsets from tximport,
+      using DESeqDataSetFromTximport().
 
 CHANGES IN VERSION 1.10.0
 -------------------------
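
For readers scanning the NEWS above, the three headline 1.12 features combine
roughly as follows. This is a minimal sketch, not upstream code: it assumes a
tximport output list 'txi' and a sample data.frame 'samples' with a
'condition' column.

    library(DESeq2)
    dds <- DESeqDataSetFromTximport(txi, colData=samples, design=~condition)
    dds <- DESeq(dds)
    vsd <- vst(dds)                     # fast wrapper for the VST
    library(IHW)
    res <- results(dds, filterFun=ihw)  # IHW p-value adjustment
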
diff --git a/R/AllClasses.R b/R/AllClasses.R
index ad61ab8..6e27976 100644
--- a/R/AllClasses.R
+++ b/R/AllClasses.R
@@ -51,9 +51,14 @@ please only use letters and numbers for levels of factors in the design")
 #' In addition, a formula which specifies the design of the experiment must be provided.
 #' The constructor functions create a DESeqDataSet object
 #' from various types of input:
-#' a RangedSummarizedExperiment, a matrix, or count files generated by
-#' the python package HTSeq.  See the vignette for examples of construction
-#' from all three input types.
+#' a RangedSummarizedExperiment, a matrix, count files generated by
+#' the python package HTSeq, or a list from the tximport function in the
+#' tximport package.
+#' See the vignette for examples of construction from different types.
+#'
+#' Note on the error message "assay colnames() must be NULL or equal colData rownames()":
+#' this means that the colnames of countData are different from the rownames of colData.
+#' Fix this with: \code{colnames(countData) <- NULL}
 #'
 #' @param se a \code{RangedSummarizedExperiment} with columns of variables
 #' indicating sample information in \code{colData},
@@ -78,6 +83,7 @@ please only use letters and numbers for levels of factors in the design")
 #' describes one sample. The first column is the sample name, the second column
 #' the file name of the count file generated by htseq-count, and the remaining
 #' columns are sample metadata which will be stored in \code{colData}
+#' @param txi for tximport: the simple list output of the \code{tximport} function
 #' @param directory for htseq-count: the directory relative to which the filenames are specified. defaults to current directory
 #' @param ignoreRank use of this argument is reserved for DEXSeq developers only.
 #' Users will immediately encounter an error upon trying to estimate dispersion
@@ -103,14 +109,11 @@ please only use letters and numbers for levels of factors in the design")
 #' dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), ~ condition)
 #'
 #' @rdname DESeqDataSet
+#' @importFrom utils packageVersion
 #' @export
 DESeqDataSet <- function(se, design, ignoreRank=FALSE) {
   if (!is(se, "RangedSummarizedExperiment")) {
-    if (is(se, "SummarizedExperiment0")) {
-      se <- as(se, "RangedSummarizedExperiment")
-    } else if (is(se, "SummarizedExperiment")) {
-      # only to help transition from SummarizedExperiment to new
-      # RangedSummarizedExperiment objects, remove once transition is complete
+    if (is(se, "SummarizedExperiment")) {
       se <- as(se, "RangedSummarizedExperiment")
     } else {
       stop("'se' must be a RangedSummarizedExperiment object")
@@ -183,21 +186,28 @@ DESeqDataSet <- function(se, design, ignoreRank=FALSE) {
       }
     }
     if (warnIntVars) {
-      message(paste0("the design formula contains a numeric variable with integer values,
+      message("the design formula contains a numeric variable with integer values,
   specifying a model with increasing fold change for higher values.
   did you mean for this to be a factor? if so, first convert
-  this variable to a factor using the factor() function"))
+  this variable to a factor using the factor() function")
     }
   }
 
   designFactors <- designVars[designVarsClass == "factor"]
-  missingLevels <- sapply(designFactors,function(v) any(table(colData(se)[[v]]) == 0))
+  missingLevels <- sapply(designFactors, function(v) any(table(colData(se)[[v]]) == 0))
   if (any(missingLevels)) {
     message("factor levels were dropped which had no samples")
     for (v in designFactors[missingLevels]) {
       colData(se)[[v]] <- droplevels(colData(se)[[v]])
     }
   }
+
+  singleLevel <- sapply(designFactors, function(v) all(colData(se)[[v]] == colData(se)[[v]][1]))
+  if (any(singleLevel)) {
+    stop("design contains one or more variables with all samples having the same value,
+  remove these variables from the design")
+  }
+
   
   modelMatrix <- model.matrix(design, data=as.data.frame(colData(se)))
   if (!ignoreRank) {
@@ -235,24 +245,27 @@ DESeqDataSet <- function(se, design, ignoreRank=FALSE) {
     cbind(mcols(colData(se)), mcolsCols)
   }
   
-  dds <- new("DESeqDataSet", se, design = design)
+  object <- new("DESeqDataSet", se, design = design)
                                  
   # now we know we have at least an empty GRanges or GRangesList for rowRanges
   # so we can create a metadata column 'type' for the mcols
   # and we label any incoming columns as 'input'
 
   # this is metadata columns on the rows
-  mcolsRows <- DataFrame(type=rep("input",ncol(mcols(dds))),
-                         description=rep("",ncol(mcols(dds))))
-  mcols(mcols(dds)) <- if (is.null(mcols(mcols(dds)))) {
+  mcolsRows <- DataFrame(type=rep("input",ncol(mcols(object))),
+                         description=rep("",ncol(mcols(object))))
+  mcols(mcols(object)) <- if (is.null(mcols(mcols(object)))) {
     mcolsRows
-  } else if (all(names(mcols(mcols(dds))) == c("type","description"))) {
+  } else if (all(names(mcols(mcols(object))) == c("type","description"))) {
     mcolsRows
   } else {
-    cbind(mcols(mcols(dds)), mcolsRows)
+    cbind(mcols(mcols(object)), mcolsRows)
   }
+
+  # stash the package version
+  metadata(object)[["version"]] <- packageVersion("DESeq2")
   
-  return(dds)
+  return(object)
 }
 
 #' @rdname DESeqDataSet
@@ -269,9 +282,10 @@ DESeqDataSetFromMatrix <- function( countData, colData, design, tidy=FALSE, igno
 
   # we expect a matrix of counts, which are non-negative integers
   countData <- as.matrix( countData )
-    
-  if (is(colData,"data.frame")) colData <- DataFrame(colData, row.names=rownames(colData))
 
+  if (is(colData,"data.frame"))
+    colData <- as(colData, "DataFrame")
+  
   # check if the rownames of colData are simply in different order
   # than the colnames of the countData, if so throw an error
   # as the user probably should investigate what's wrong
@@ -290,9 +304,9 @@ DESeqDataSetFromMatrix <- function( countData, colData, design, tidy=FALSE, igno
   }
   
   se <- SummarizedExperiment(assays = SimpleList(counts=countData), colData = colData, ...)
-  dds <- DESeqDataSet(se, design = design, ignoreRank)
+  object <- DESeqDataSet(se, design = design, ignoreRank)
 
-  return(dds)
+  return(object)
 }
 
 #' @rdname DESeqDataSet
@@ -312,10 +326,31 @@ DESeqDataSetFromHTSeqCount <- function( sampleTable, directory=".", design, igno
  # or is one of the old special names (htseq-count backward compatibility)
   specialRows <- (substr(rownames(tbl),1,1) == "_") | rownames(tbl) %in% oldSpecialNames
   tbl <- tbl[ !specialRows, , drop=FALSE ]
-  dds <- DESeqDataSetFromMatrix(countData=tbl,colData=sampleTable[,-(1:2),drop=FALSE],design=design,ignoreRank, ...)
-  return(dds)
+  object <- DESeqDataSetFromMatrix(countData=tbl,colData=sampleTable[,-(1:2),drop=FALSE],design=design,ignoreRank, ...)
+  return(object)
+}   
+
+#' @rdname DESeqDataSet
+#' @export
+DESeqDataSetFromTximport <- function(txi, colData, design, ...) 
+{
+  counts <- round(txi$counts)
+  mode(counts) <- "integer"
+  object <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=design, ...)
+  stopifnot(txi$countsFromAbundance %in% c("no","scaledTPM","lengthScaledTPM"))
+  if (txi$countsFromAbundance %in% c("scaledTPM","lengthScaledTPM")) {
+    message("using just counts from tximport")
+  } else {
+    message("using counts and average transcript lengths from tximport")
+    lengths <- txi$length
+    stopifnot(all(lengths > 0))
+    dimnames(lengths) <- dimnames(object)
+    assays(object)[["avgTxLength"]] <- lengths
+  }
+  return(object)
 }   
 
+
 #' @rdname DESeqResults
 #' @export
 setClass("DESeqResults", contains="DataFrame")
@@ -364,11 +399,7 @@ setClass("DESeqTransform", contains="RangedSummarizedExperiment")
 DESeqTransform <- function(SummarizedExperiment) {
   se <- SummarizedExperiment
   if (!is(se, "RangedSummarizedExperiment")) {
-    if (is(se, "SummarizedExperiment0")) {
-      se <- as(se, "RangedSummarizedExperiment")
-    } else if (is(se, "SummarizedExperiment")) {
-      # only to help transition from SummarizedExperiment to new
-      # RangedSummarizedExperiment objects, remove once transition is complete
+    if (is(se, "SummarizedExperiment")) {
       se <- as(se, "RangedSummarizedExperiment")
     } else {
       stop("'SummarizedExperiment' must be a RangedSummarizedExperiment object")
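
The constructor note added above documents a common pitfall; a sketch of the
suggested fix, assuming hypothetical 'countData' and 'colData' objects whose
names disagree:

    # either drop the colnames, as the new error note suggests...
    colnames(countData) <- NULL
    dds <- DESeqDataSetFromMatrix(countData, colData, design=~condition)
    # ...or reorder the count columns to match colData beforehand:
    # countData <- countData[, rownames(colData)]
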
diff --git a/R/core.R b/R/core.R
index c4c9283..831d658 100644
--- a/R/core.R
+++ b/R/core.R
@@ -49,7 +49,7 @@
 #' For more detailed information on usage, see the package vignette, by typing
 #' \code{vignette("DESeq2")}, or the workflow linked to on the first page
 #' of the vignette. All support questions should be posted to the Bioconductor
-#' support site: \url{support.bioconductor.org}.
+#' support site: \url{http://support.bioconductor.org}.
 #'
 #' @references
 #'
@@ -131,7 +131,7 @@ NULL
 #' Unlike the behavior of \code{\link{replaceOutliers}}, here original counts are
 #' kept in the matrix returned by \code{\link{counts}}, original Cook's
 #' distances are kept in \code{assays(dds)[["cooks"]]}, and the replacement
-#' counts used for fitting are kept in \code{assays(object)[["replaceCounts"]]}.
+#' counts used for fitting are kept in \code{assays(dds)[["replaceCounts"]]}.
 #'
 #' Note that if a log2 fold change prior is used (betaPrior=TRUE)
 #' then expanded model matrices will be used in fitting. These are
@@ -200,12 +200,16 @@ NULL
 #' @references
 #'
 #' Michael I Love, Wolfgang Huber, Simon Anders: Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 2014, 15:550. \url{http://dx.doi.org/10.1186/s13059-014-0550-8}
-#' @import BiocGenerics BiocParallel S4Vectors IRanges GenomicRanges SummarizedExperiment Biobase Rcpp RcppArmadillo methods
+#' @import BiocGenerics BiocParallel S4Vectors IRanges GenomicRanges SummarizedExperiment Biobase Rcpp methods
 #'
 #' @importFrom locfit locfit
 #' @importFrom genefilter rowVars filtered_p
 #' @importFrom Hmisc wtd.quantile
 #' 
+#' @importFrom graphics axis hist plot points
+#' @importFrom stats Gamma as.formula coefficients df dnbinom dnorm formula glm loess lowess model.matrix optim p.adjust pchisq pnorm prcomp predict pt qf qnorm rchisq relevel rnbinom rnorm runif splinefun terms terms.formula
+#' @importFrom utils read.table
+#' 
 #' @useDynLib DESeq2
 #'
 #' @seealso \code{\link{nbinomWaldTest}}, \code{\link{nbinomLRT}}
@@ -273,6 +277,8 @@ DESeq <- function(object, test=c("Wald","LRT"),
     if (modelAsFormula) {
       checkLRT(full, reduced)
     } else {
+      checkFullRank(full)
+      checkFullRank(reduced)
       if (ncol(full) <= ncol(reduced)) {
         stop("the number of columns of 'full' should be more than the number of columns of 'reduced'")
       }
@@ -295,6 +301,7 @@ DESeq <- function(object, test=c("Wald","LRT"),
     if (betaPrior == TRUE) {
       stop("betaPrior=TRUE is not supported for user-provided model matrices")
     }
+    checkFullRank(full)
     # this will be used for dispersion estimation and testing
     modelMatrix <- full
   }
@@ -344,6 +351,9 @@ DESeq <- function(object, test=c("Wald","LRT"),
                                    modelMatrix=modelMatrix,
                                    modelMatrixType=modelMatrixType)
   }
+
+  # stash the package version (again, also in construction)
+  metadata(object)[["version"]] <- packageVersion("DESeq2")
   
   object
 }
@@ -602,10 +612,10 @@ estimateDispersionsGeneEst <- function(object, minDisp=1e-8, kappa_0=1,
   fitidx <- rep(TRUE,nrow(objectNZ))
   mu <- matrix(0, nrow=nrow(objectNZ), ncol=ncol(objectNZ))
   dispIter <- numeric(nrow(objectNZ))
-  # bound the estimated count at 0.1.
+  # bound the estimated count
   # this helps make the fitting more robust,
   # because 1/mu occurs in the weights for the NB GLM
-  minmu <- 0.1
+  minmu <- 0.5
   for (iter in seq_len(niter)) {
     fit <- fitNbinomGLMs(objectNZ[fitidx,,drop=FALSE],
                          alpha_hat=alpha_hat[fitidx],
@@ -967,8 +977,9 @@ estimateDispersionsPriorVar <- function(object, minDisp=1e-8, modelMatrix=NULL)
 #' normalized counts and the trended dispersion fit. The weighting ensures
 #' that noisy estimates of log fold changes from small count genes do not
 #' overly influence the calculation of the prior variance.
+#' See \code{\link{estimateBetaPriorVar}}.
 #' The final prior variance for a factor level is the average of the
-#' estimated prior variance over all contrasts of all levels of the factor.
+#' estimated prior variance over all contrasts of all levels of the factor. 
 #' Another change since the 2014 paper: when interaction terms are present
 #' in the design, the prior on log fold changes is turned off
 #' (for more details, see the vignette section, "Methods changes since
@@ -1252,7 +1263,7 @@ nbinomWaldTest <- function(object, betaPrior, betaPriorVar,
 #' @export
 estimateBetaPriorVar <- function(object, 
                                  betaPriorMethod=c("weighted","quantile"),
-                                 upperQuantile=.05) {
+                                 upperQuantile=0.05) {
   objectNZ <- object[!mcols(object)$allZero,,drop=FALSE]
 
   betaMatrix <- as.matrix(mcols(objectNZ)[,grep("MLE_", names(mcols(object))),drop=FALSE])
@@ -1288,7 +1299,12 @@ estimateBetaPriorVar <- function(object,
   # weighting by 1/Var(log(K))
   # Var(log(K)) ~ Var(K)/mu^2 = 1/mu + alpha
   # and using the fitted alpha
-  varlogk <- 1/mcols(objectNZ)$baseMean + mcols(objectNZ)$dispFit
+  dispFit <- mcols(objectNZ)$dispFit
+  if (is.null(dispFit)) {
+    # betaPrior routine could have been called w/o the dispersion fitted trend
+    dispFit <- mean(dispersions(objectNZ))
+  }
+  varlogk <- 1/mcols(objectNZ)$baseMean + dispFit
   weights <- 1/varlogk
   
   betaPriorVar <- if (nrow(betaMatrix) > 1) {
@@ -2268,7 +2284,7 @@ covarianceMatrix <- function(object, rowNumber) {
   sf <- sizeFactors(object)
   alpha <- dispersions(object)[rowNumber]
   mu.hat <- as.vector(sf * exp(x %*% beta))
-  minmu <- 0.1
+  minmu <- 0.5
   mu.hat[mu.hat < minmu] <- minmu
   w <- diag(1/(1/mu.hat^2 * ( mu.hat + alpha * mu.hat^2 )))
   betaPriorVar <- attr(object,"betaPriorVar")
@@ -2428,7 +2444,7 @@ fitNbinomGLMsOptim <- function(object,modelMatrix,lambda,
     betaMatrix[row,] <- o$par / scaleCols
     # calculate the standard errors
     mu_row <- as.numeric(nf * 2^(x %*% o$par))
-    minmu <- 0.1
+    minmu <- 0.5
     mu_row[mu_row < minmu] <- minmu
     w <- diag((mu_row^-1 + alpha)^-1)
     xtwx <- t(x) %*% w %*% x
@@ -2607,7 +2623,9 @@ refitWithoutOutliers <- function(object, test, betaPrior, full, reduced,
     if (test == "Wald") {
       betaPriorVar <- attr(object, "betaPriorVar")
       objectSub <- nbinomWaldTest(objectSub, betaPrior=betaPrior,
-                                  betaPriorVar=betaPriorVar, quiet=quiet, modelMatrix=modelMatrix)
+                                  betaPriorVar=betaPriorVar, quiet=quiet,
+                                  modelMatrix=modelMatrix,
+                                  modelMatrixType=modelMatrixType)
     } else if (test == "LRT") {
       if (!betaPrior) {
         objectSub <- nbinomLRT(objectSub, full=full, reduced=reduced, quiet=quiet)
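
The new checkFullRank() guards apply to user-supplied model matrices; a sketch
of the guarded call pattern, assuming a 'dds' with a 'condition' factor (note
that betaPrior=TRUE is rejected for user matrices, per the change above):

    mm  <- model.matrix(~ condition, data=as.data.frame(colData(dds)))
    mm0 <- model.matrix(~ 1, data=as.data.frame(colData(dds)))
    dds <- DESeq(dds, test="LRT", full=mm, reduced=mm0, betaPrior=FALSE)
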
diff --git a/R/helper.R b/R/helper.R
index af0be0b..b3df9a7 100644
--- a/R/helper.R
+++ b/R/helper.R
@@ -72,14 +72,19 @@ collapseReplicates <- function(object, groupby, run, renameCols=TRUE) {
 #' as in \code{\link{estimateSizeFactors}}).
 #'
 #' The length of the features (e.g. genes) is calculated one of two ways:
-#' if there is a matrix named "avgTxLength" in \code{assays(dds)}, this will take precedence in the
-#' length normalization. Otherwise, feature length is calculated 
+#' (1) If there is a matrix named "avgTxLength" in \code{assays(dds)},
+#' this will take precedence in the length normalization.
+#' This occurs when using the tximport-DESeq2 pipeline.
+#' (2) Otherwise, feature length is calculated 
 #' from the \code{rowRanges} of the dds object,
 #' if a column \code{basepairs} is not present in \code{mcols(dds)}.
 #' The calculated length is the number of basepairs in the union of all \code{GRanges}
 #' assigned to a given row of \code{object}, e.g., 
 #' the union of all basepairs of exons of a given gene.
-#'
+#' Note that the second approach overestimates the gene length
+#' (average transcript length, weighted by abundance, is a more appropriate
+#' normalization for gene counts), so the FPKM will be an underestimate of the true value.
+#' 
 #' Note that, when the read/fragment counting has inter-feature dependencies, a strict
 #' normalization would not incorporate the basepairs of a feature which
 #' overlap another feature. This inter-feature dependence is not taken into
@@ -135,7 +140,14 @@ collapseReplicates <- function(object, groupby, run, renameCols=TRUE) {
 fpkm <- function(object, robust=TRUE) {
   fpm <- fpm(object, robust=robust)
   if ("avgTxLength" %in% assayNames(object)) {
-    return(1e3 * fpm / assays(object)[["avgTxLength"]])
+    exprs <- 1e3 * fpm / assays(object)[["avgTxLength"]]
+    if (robust) {
+      sf <- estimateSizeFactorsForMatrix(exprs)
+      exprs <- t(t(exprs)/sf)
+      return(exprs)
+    } else {
+      return(exprs)
+    }
   }
   if (is.null(mcols(object)$basepairs)) {
     if (class(rowRanges(object)) == "GRangesList") {
@@ -172,6 +184,7 @@ which will be used to produce FPKM values")
 #' rather than taking the column sums of the raw counts.
 #' If TRUE, the size factors and the geometric mean of
 #' column sums are multiplied to create a robust library size estimate.
+#' Robust normalization is not used if average transcript lengths are present.
 #' 
 #' @return a matrix which is normalized per million of mapped fragments,
 #' either using the robust median ratio method (robust=TRUE, default)
@@ -212,11 +225,13 @@ which will be used to produce FPKM values")
 #' 
 #' @export
 fpm <- function(object, robust=TRUE) {
-  if (robust & is.null(sizeFactors(object))) {
+  # we do something different if average tx lengths are present
+  noAvgTxLen <- !("avgTxLength" %in% assayNames(object))
+  if (robust & is.null(sizeFactors(object)) & noAvgTxLen) {
     object <- estimateSizeFactors(object)
   }
   k <- counts(object)
-  library.sizes <- if (robust) {
+  library.sizes <- if (robust & noAvgTxLen) {
     sizeFactors(object) * exp(mean(log(colSums(k))))
   } else {
     colSums(k)
@@ -230,137 +245,14 @@ fpm <- function(object, robust=TRUE) {
 #'
 #' Normalize for gene length using the output of transcript abundance estimators
 #'
-#' This is a prototype function for importing information about changes in
-#' the average transcript length for each gene. The use of this function
-#' is only for testing purposes. 
-#'
-#' The function simply imports or calculates average
-#' transcript length for each gene and each sample from external files,
-#' and provides this matrix to the \code{normMatrix} argument of
-#' \code{\link{estimateSizeFactors}}. 
-#' By average transcript length, the average refers to a weighted average with respect
-#' to the transcript abundances. The RSEM method includes such a column in their
-#' \code{gene.results} files, but an estimate of average transcript length can
-#' be obtained from any software which outputs a file with a row for each
-#' transcript, specifying: transcript length, estimate of transcript abundance,
-#' and the gene which the transcript belongs to.
-#'
-#' Normalization factors accounting for both average transcript length and
-#' library size of each sample are generated and then stored within the data object.
-#' The analysis can then continue with \code{\link{DESeq}};
-#' the stored normalization factors will be used instead of size factors in the analysis.
+#' This function is deprecated and moved to a new general purpose package,
+#' tximport, which will be added to Bioconductor.
 #'
-#' For RSEM \code{genes.results} files,
-#' specify \code{level="gene"}, \code{geneIdCol="gene_id"},
-#' and \code{lengthCol="effective_length"}
+#' @param ... ...
 #' 
-#' For Cufflinks \code{isoforms.fpkm_tracking} files,
-#' specify \code{level="tx"}, \code{geneIdCol="gene_id"},
-#' \code{lengthCol="length"}, and \code{abundanceCol="FPKM"}.
-#'
-#' For Sailfish output files, one can write an \code{importer}
-#' function which attaches a column \code{gene_id} based on Transcript ID,
-#' and then specify \code{level="tx"}, \code{geneIdCol="gene_id"},
-#' \code{lengthCol="Length"} and \code{abundanceCol="RPKM"}.
-#' 
-#' Along with the normalization matrix which is stored in \code{normalizationFactors(object)},
-#' the resulting gene length matrix is stored in \code{assays(dds)[["avgTxLength"]]},
-#' and will take precedence in calls to \code{\link{fpkm}}.
-#'
-#' @param object the DESeqDataSet, before calling \code{DESeq}
-#' @param files a character vector specifying the filenames of output files
-#' containing either transcript abundance estimates with transcript length, 
-#' or average transcript length information per gene.
-#' @param level either "tx" or "gene"
-#' @param geneIdCol the name of the column of the files specifying the gene id. This
-#' should line up with the \code{rownames(object)}. The information in the files
-#' will be re-ordered to line up with the rownames of the object.
-#' See \code{dropGenes} for more details.
-#' @param lengthCol the name of the column of files specifying the length of the
-#' feature, either transcript for \code{level="tx"} or the gene for
-#' \code{level="gene"}.
-#' @param abundanceCol only needed if \code{level="tx"}, the name of the
-#' column specifying the abundance estimate of the transcript.
-#' @param dropGenes whether to drop genes from the object,
-#' as labelled by \code{rownames(object)}, which are not
-#' present in the \code{geneIdCol} of the files. Defaults to FALSE
-#' and gives an error upon finding \code{rownames} of the object
-#' not present in the \code{geneIdCol} of the files.
-#' The function will reorder the matching rows of the files to match
-#' the rownames of the object.
-#' @param importer a function to read the \code{files}.
-#' \code{fread} from the data.table package is quite fast,
-#' but other options include \code{read.table}, \code{read.csv}.
-#' One can pre-test with \code{importer(files[1])}.
-#' @param ... further arguments passed to \code{importer}
-#'
-#' @return a DESeqDataSet with \code{\link{normalizationFactors}}
-#' accounting for average transcript length and library size
-#' 
-#' @examples
-#'
-#' n <- 10
-#' files <- c("sample1","sample2")
-#' gene_id <- rep(paste0("gene",seq_len(n)),each=3)
-#' set.seed(1)
-#' sample1 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-#' sample2 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-#' importer <- get
-#' dds <- makeExampleDESeqDataSet(n=n, m=2)
-#' dds <- normalizeGeneLength(dds, files=files, level="tx",
-#'   geneIdCol="gene_id", lengthCol="length", abundanceCol="FPKM",
-#'   dropGenes=TRUE, importer=importer)
-#'
 #' @export
-normalizeGeneLength <- function(object, files, level=c("tx","gene"),
-                                geneIdCol="gene_id", lengthCol="length", abundanceCol="FPKM",
-                                dropGenes=FALSE, importer, ...) {
-  stopifnot(length(files) == ncol(object))
-  message("assuming: 'files' and colnames of the dds in same order:
-  files:    ",paste(head(files,4),collapse=", "),"...
-  colnames: ",paste(head(colnames(object),4),collapse=", "),"...")
-  dataList <- list()
-  level <- match.arg(level, c("tx","gene"))
-  for (i in seq_along(files)) {
-    raw <- as.data.frame(importer(files[i], ...))
-    if (level == "tx") {
-      stopifnot(all(c(geneIdCol, lengthCol, abundanceCol) %in% names(raw)))
-      raw[[geneIdCol]] <- factor(raw[[geneIdCol]], unique(raw[[geneIdCol]]))
-      res <- do.call(c, as.list(by(raw, raw[[geneIdCol]],
-                                   function(x) sum(x[[lengthCol]] * x[[abundanceCol]])/sum(x[[abundanceCol]]),
-                                   simplify=FALSE)))      
-      dataList[[i]] <- data.frame(avgLength=res, row.names=names(res))
-    } else if (level == "gene") {
-      stopifnot(all(c(geneIdCol, lengthCol) %in% names(raw)))
-      dataList[[i]] <- data.frame(avgLength=raw[[lengthCol]], row.names=raw[[geneIdCol]])
-    }
-  }
-  for (i in seq_along(files)[-1]) {
-    stopifnot(all(rownames(dataList[[i]]) == rownames(dataList[[1]])))
-  }
-  data <- do.call(cbind, dataList)
-  data <- as.matrix(data)
-
-  common <- rownames(object)[rownames(object) %in% rownames(data)]
-  if (length(common) < nrow(object)) {
-    message(length(common)," genes out of ",nrow(object)," of the dds object are present in the gene length data")
-    if (dropGenes) {
-      message("dropping genes from the dds")
-      object <- object[common,]
-    } else {
-      stop("specify 'dropGenes=TRUE' in order to drop genes from the dds")
-    }
-  }
-  if (!all(rownames(data) == rownames(object))) {
-    data <- data[match(common, rownames(data)),]
-  }
-  stopifnot(rownames(data) == rownames(object))
-  message("adding normalization factors for gene length")
-  object <- estimateSizeFactors(object, normMatrix=data)
-  message("users can continue with DESeq()")
-  dimnames(data) <- dimnames(object)
-  assays(object)[["avgTxLength"]] <- data
-  object
+normalizeGeneLength <- function(...) {
+  .Deprecated("tximport, a separate package on Bioconductor")
 }
 
 #' Normalized counts transformation
@@ -450,10 +342,13 @@ DESeqParallel <- function(object, test, fitType, betaPrior, full, reduced, quiet
       objectNZSub <- estimateDispersionsMAP(objectNZ[idx == l,,drop=FALSE],
                                             dispPriorVar=dispPriorVar, quiet=TRUE)
       # replace design
-      if (noReps) design(objectNZ) <- designIn
+      if (noReps) design(objectNZSub) <- designIn
       estimateMLEForBetaPriorVar(objectNZSub)
     }, BPPARAM=BPPARAM))
 
+    # replace design
+    if (noReps) design(objectNZ) <- designIn
+    
     # need to set standard model matrix for LRT with beta prior
     if (test == "LRT") {
       attr(object, "modelMatrixType") <- "standard"
@@ -490,7 +385,7 @@ DESeqParallel <- function(object, test, fitType, betaPrior, full, reduced, quiet
         objectNZSub <- estimateDispersionsMAP(objectNZ[idx == l,,drop=FALSE],
                                               dispPriorVar=dispPriorVar, quiet=TRUE, modelMatrix=modelMatrix)
         # replace design
-        if (noReps) design(objectNZ) <- designIn
+        if (noReps) design(objectNZSub) <- designIn
         nbinomWaldTest(objectNZSub, betaPrior=betaPrior,
                        quiet=TRUE, modelMatrix=modelMatrix, modelMatrixType="standard")
       }, BPPARAM=BPPARAM))
@@ -499,7 +394,7 @@ DESeqParallel <- function(object, test, fitType, betaPrior, full, reduced, quiet
         objectNZSub <- estimateDispersionsMAP(objectNZ[idx == l,,drop=FALSE],
                                               dispPriorVar=dispPriorVar, quiet=TRUE, modelMatrix=modelMatrix)
         # replace design
-        if (noReps) design(objectNZ) <- designIn
+        if (noReps) design(objectNZSub) <- designIn
         nbinomLRT(objectNZSub, full=full, reduced=reduced, quiet=TRUE)
       }, BPPARAM=BPPARAM))
     } 
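
To see the revised fpm()/fpkm() branch in action, a sketch assuming a 'dds'
built via DESeqDataSetFromTximport (so assays(dds) carries "avgTxLength"):

    fpms  <- fpm(dds)    # library sizes from column sums, not robust size factors
    fpkms <- fpkm(dds)   # divides by avgTxLength, then (robust=TRUE default)
                         # re-normalizes with median-ratio size factors
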
diff --git a/R/methods.R b/R/methods.R
index 06f85de..1cf1bac 100644
--- a/R/methods.R
+++ b/R/methods.R
@@ -271,9 +271,10 @@ setMethod("sizeFactors", signature(object="DESeqDataSet"),
 #' @exportMethod "sizeFactors<-"
 setReplaceMethod("sizeFactors", signature(object="DESeqDataSet", value="numeric"),
                  function( object, value ) {
-                   if (any(value <= 0)) {
-                     stop("size factors must be positive")
-                   }
+                   stopifnot(all(!is.na(value)))
+                   stopifnot(all(is.finite(value)))
+                   stopifnot(all(value > 0))
+
                    # Temporary hack for backward compatibility with "old"
                    # DESeqDataSet objects. Remove once all serialized
                    # DESeqDataSet objects around have been updated.
@@ -352,9 +353,10 @@ setMethod("normalizationFactors", signature(object="DESeqDataSet"),
 #' @exportMethod "normalizationFactors<-"
 setReplaceMethod("normalizationFactors", signature(object="DESeqDataSet", value="matrix"),
                  function(object, value) {
-                   if (any(value <= 0)) {
-                     stop("normalization factors must be positive")
-                   }
+                   stopifnot(all(!is.na(value)))
+                   stopifnot(all(is.finite(value)))
+                   stopifnot(all(value > 0))
+
                    # Temporary hack for backward compatibility with "old"
                    # DESeqDataSet objects. Remove once all serialized
                    # DESeqDataSet objects around have been updated.
@@ -379,28 +381,38 @@ estimateSizeFactors.DESeqDataSet <- function(object, type=c("ratio","iterate"),
   if (type == "iterate") {
     sizeFactors(object) <- estimateSizeFactorsIterate(object)
   } else {
-    if (missing(normMatrix)) {
+    if ("avgTxLength" %in% assayNames(object)) {
+      nm <- assays(object)[["avgTxLength"]]
+      nm <- nm / exp(rowMeans(log(nm))) # divide out the geometric mean
+      normalizationFactors(object) <- estimateNormFactors(counts(object),
+                                                          normMatrix=nm,
+                                                          locfunc=locfunc,
+                                                          geoMeans=geoMeans,
+                                                          controlGenes=controlGenes)
+      message("using 'avgTxLength' from assays(dds), correcting for library size")      
+    } else if (missing(normMatrix)) {
       sizeFactors(object) <- estimateSizeFactorsForMatrix(counts(object), locfunc=locfunc,
                                                           geoMeans=geoMeans,
                                                           controlGenes=controlGenes)
     } else {
-      normalizationFactors(object) <- estimateNormFactors(counts(object), normMatrix=normMatrix,
+      normalizationFactors(object) <- estimateNormFactors(counts(object),
+                                                          normMatrix=normMatrix,
                                                           locfunc=locfunc,
                                                           geoMeans=geoMeans,
                                                           controlGenes=controlGenes)
-      message("adding normalization factors which account for library size")
+      message("using 'normMatrix', adding normalization factors which correct for library size")
     }
   }
   object
 }
 
-#' Estimate the size factors for a DESeqDataSet
+#' Estimate the size factors for a \code{\link{DESeqDataSet}}
 #' 
 #' This function estimates the size factors using the
 #' "median ratio method" described by Equation 5 in Anders and Huber (2010).
-#' The estimated size factors can be accessed using \code{\link{sizeFactors}}.
+#' The estimated size factors can be accessed using the accessor function \code{\link{sizeFactors}}.
 #' Alternative library size estimators can also be supplied
-#' using \code{\link{sizeFactors}}.
+#' using the assignment function \code{\link{sizeFactors<-}}.
 #' 
 #' Typically, the function is called with the idiom:
 #'
@@ -439,11 +451,16 @@ estimateSizeFactors.DESeqDataSet <- function(object, type=c("ratio","iterate"),
 #' for a "frozen" size factor calculation
 #' @param controlGenes optional, numeric or logical index vector specifying those genes to
 #' use for size factor estimation (e.g. housekeeping or spike-in genes)
-#' @param normMatrix optional, a matrix of normalization factors which do not
-#' control for library size (e.g. average transcript length of genes for each
-#' sample). Providing \code{normMatrix} will estimate size factors on the
+#' @param normMatrix optional, a matrix of normalization factors which do not yet
+#' control for library size. Note that this argument should not be used (and
+#' will be ignored) if the \code{dds} object was created using \code{tximport}.
+#' In this case, the information in \code{assays(dds)[["avgTxLength"]]}
+#' is automatically used to create appropriate normalization factors.
+#' Providing \code{normMatrix} will estimate size factors on the
 #' count matrix divided by \code{normMatrix} and store the product of the
 #' size factors and \code{normMatrix} as \code{\link{normalizationFactors}}.
+#' It is recommended to divide out the row-wise geometric mean of
+#' \code{normMatrix} so that the rows are roughly centered on 1.
 #' 
 #' @return The DESeqDataSet passed as parameters, with the size factors filled
 #' in.
@@ -762,15 +779,18 @@ summary.DESeqResults <- function(object, alpha, ...) {
   } else {
     round(metadata(object)$filterThreshold)
   }
+  ihw <- "ihwResult" %in% names(metadata(object))
+
   printsig <- function(x) format(x, digits=2) 
   cat("out of",notallzero,"with nonzero total read count\n")
   cat(paste0("adjusted p-value < ",alpha,"\n"))
   cat(paste0("LFC > 0 (up)     : ",up,", ",printsig(up/notallzero*100),"% \n"))
   cat(paste0("LFC < 0 (down)   : ",down,", ",printsig(down/notallzero*100),"% \n"))
   cat(paste0("outliers [1]     : ",outlier,", ",printsig(outlier/notallzero*100),"% \n"))
-  cat(paste0("low counts [2]   : ",filt,", ",printsig(filt/notallzero*100),"% \n"))
-  cat(paste0("(mean count < ",ft,")\n"))
+  if (!ihw) cat(paste0("low counts [2]   : ",filt,", ",printsig(filt/notallzero*100),"% \n"))
+  if (!ihw) cat(paste0("(mean count < ",ft,")\n"))
   cat("[1] see 'cooksCutoff' argument of ?results\n")
-  cat("[2] see 'independentFiltering' argument of ?results\n")
+  if (!ihw) cat("[2] see 'independentFiltering' argument of ?results\n")
+  if (ihw) cat("[2] see metadata(res)$ihwResult on hypothesis weighting\n")
   cat("\n")
 }
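
The new normMatrix documentation above recommends centering rows on 1; a
sketch mirroring what estimateSizeFactors() now does internally for
avgTxLength, assuming a hypothetical gene-by-sample length matrix 'nm':

    nm  <- nm / exp(rowMeans(log(nm)))   # divide out the row-wise geometric mean
    dds <- estimateSizeFactors(dds, normMatrix=nm)
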
diff --git a/R/plots.R b/R/plots.R
index 0701c2d..fbf7b52 100644
--- a/R/plots.R
+++ b/R/plots.R
@@ -70,12 +70,12 @@ plotDispEsts.DESeqDataSet <- function( object, ymin,
 #' @export
 setMethod("plotDispEsts", signature(object="DESeqDataSet"), plotDispEsts.DESeqDataSet)
 
-plotMA.DESeqDataSet <- function(object, alpha=.1, main="", ylim, ...) {
+plotMA.DESeqDataSet <- function(object, alpha=.1, main="", xlab="mean of normalized counts", ylim, MLE=FALSE, ...) {
     res <- results(object, ...)
-    plotMA.DESeqResults(res, alpha=alpha, main=main, ylim=ylim)
+    plotMA.DESeqResults(res, alpha=alpha, main=main, xlab=xlab, ylim=ylim, MLE=MLE)
 }
 
-plotMA.DESeqResults <- function(object, alpha, main="", ylim, MLE=FALSE, ...) {
+plotMA.DESeqResults <- function(object, alpha, main="", xlab="mean of normalized counts", ylim, MLE=FALSE, ...) {
   if (missing(alpha)) {
     alpha <- if (is.null(metadata(object)$alpha)) {
       0.1
@@ -97,9 +97,9 @@ plotMA.DESeqResults <- function(object, alpha, main="", ylim, MLE=FALSE, ...) {
                isDE = ifelse(is.na(object$padj), FALSE, object$padj < alpha))
   }
     if (missing(ylim)) {
-      plotMA(df, main=main, ...)
+      plotMA(df, main=main, xlab=xlab, ...)
     } else {
-       plotMA(df, main=main, ylim=ylim, ...)
+       plotMA(df, main=main, xlab=xlab, ylim=ylim, ...)
     }  
 }
 
@@ -132,6 +132,7 @@ plotMA.DESeqResults <- function(object, alpha, main="", ylim, MLE=FALSE, ...) {
 #' Note that the MLE will be plotted regardless of this argument, if DESeq() was run
 #' with \code{betaPrior=FALSE}.
 #' @param main optional title for the plot
+#' @param xlab optional, defaults to "mean of normalized counts"
 #' @param ylim optional y limits
 #' @param ... further arguments passed to \code{plotMA} if object
 #' is \code{DESeqResults} or to \code{\link{results}} if object is
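
A sketch of the new 'xlab' argument threaded through plotMA(), assuming a
DESeqResults object 'res'; the value shown is just the documented default
made explicit:

    plotMA(res, alpha=0.1, ylim=c(-3,3), xlab="mean of normalized counts")
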
diff --git a/R/results.R b/R/results.R
index bdd4dc4..2cfde89 100644
--- a/R/results.R
+++ b/R/results.R
@@ -45,20 +45,20 @@
 #' By default, independent filtering is performed to select a set of genes
 #' for multiple test correction which maximizes the number of adjusted
 #' p-values less than a given critical value \code{alpha} (by default 0.1).
+#' See the reference in this man page for details on independent filtering.
 #' The filter used for maximizing the number of rejections is the mean
 #' of normalized counts for all samples in the dataset.
-#' In version >= 1.10, the threshold chosen is
+#' Several arguments from the \code{\link[genefilter]{filtered_p}} function of
+#' the genefilter package (used within the \code{results} function)
+#' are provided here to control the independent filtering behavior.
+#' In DESeq2 version >= 1.10, the threshold that is chosen is
 #' the lowest quantile of the filter for which the
 #' number of rejections is close to the peak of a curve fit
 #' to the number of rejections over the filter quantiles.
 #' 'Close to' is defined as within 1 residual standard deviation.
-#' 
 #' The adjusted p-values for the genes which do not pass the filter threshold
-#' are set to \code{NA}. By default, the mean of normalized counts
-#' is used to perform this filtering, though other statistics can be provided.
-#' Several arguments from the \code{filtered_p} function of genefilter
-#' are provided here to control or turn off the independent filtering behavior.
-#'
+#' are set to \code{NA}. 
+#' 
 #' By default, \code{results} assigns a p-value of \code{NA}
 #' to genes containing count outliers, as identified using Cook's distance.
 #' See the \code{cooksCutoff} argument for control of this behavior.
@@ -68,9 +68,13 @@
 #'
 #' For analyses using the likelihood ratio test (using \code{\link{nbinomLRT}}),
 #' the p-values are determined solely by the difference in deviance between
-#' the full and reduced model formula. A log2 fold change is included,
-#' which can be controlled using the \code{name} argument, or by default this will
-#' be the estimated coefficient for the last element of \code{resultsNames(object)}.
+#' the full and reduced model formula. A single log2 fold change is printed
+#' in the results table for consistency with other results table outputs;
+#' however, the test statistic and p-values may involve
+#' the testing of one or more log2 fold changes.
+#' Which log2 fold change is printed in the results table can be controlled
+#' using the \code{name} argument, or by default this will be the estimated
+#' coefficient for the last element of \code{resultsNames(object)}.
 #'
 #' @references Richard Bourgon, Robert Gentleman, Wolfgang Huber: Independent
 #' filtering increases detection power for high-throughput experiments.
@@ -102,12 +106,14 @@
 #' building a results table. Use this argument rather than \code{contrast}
 #' for continuous variables, individual effects or for individual interaction terms.
 #' The value provided to \code{name} must be an element of \code{resultsNames(object)}.
-#' @param lfcThreshold a non-negative value, which specifies the test which should
-#' be applied to the log2 fold changes. The standard is a test that the log2 fold
-#' changes are not equal to zero. However, log2 fold changes greater or less than
-#' \code{lfcThreshold} can also be tested. Specify the alternative hypothesis
-#' using the \code{altHypothesis} argument. If \code{lfcThreshold} is specified,
-#' the results are Wald tests, and LRT p-values will be overwritten.
+#' @param lfcThreshold a non-negative value which specifies a log2 fold change
+#' threshold. The default value is 0, corresponding to a test that
+#' the log2 fold changes are equal to zero. The user can
+#' specify the alternative hypothesis using the \code{altHypothesis} argument,
+#' which defaults to testing
+#' for log2 fold changes greater in absolute value than a given threshold.
+#' If \code{lfcThreshold} is specified,
+#' the results are for Wald tests, and LRT p-values will be overwritten.
 #' @param altHypothesis character which specifies the alternative hypothesis,
 #' i.e. those values of log2 fold change which the user is interested in
 #' finding. The complement of this set of values is the null hypothesis which
@@ -130,10 +136,9 @@
 #' by default this is \code{c(1,-1)}
#' @param cooksCutoff threshold on Cook's distance, such that if one or more
 #' samples for a row have a distance higher, the p-value for the row is
-#' set to NA.
-#' The default cutoff is the .99 quantile of the F(p, m-p) distribution,
+#' set to NA. The default cutoff is the .99 quantile of the F(p, m-p) distribution,
 #' where p is the number of coefficients being fitted and m is the number of samples.
-#' Set to Inf or FALSE to disable the resetting of p-values to NA.
+#' Set to \code{Inf} or \code{FALSE} to disable the resetting of p-values to NA.
 #' Note: this test excludes the Cook's distance of samples belonging to experimental
 #' groups with only 2 samples.
 #' @param independentFiltering logical, whether independent filtering should be
@@ -146,16 +151,19 @@
 #' @param theta the quantiles at which to assess the number of rejections
 #' from independent filtering
 #' @param pAdjustMethod the method to use for adjusting p-values, see \code{?p.adjust}
-#' @param filterFun an optional custom function for independent filtering,
-#' with arguments \code{alpha}, \code{filter}, \code{test}, \code{theta}, and \code{method}
-#' similar to \code{genefilter::filtered_R}, and which returns \code{padj}
+#' @param filterFun an optional custom function for performing independent filtering
+#' and p-value adjustment, with arguments \code{res} (a DESeqResults object),
+#' \code{filter} (the quantity used for filtering tests),
+#' \code{alpha} (the target FDR), and
+#' \code{pAdjustMethod}. This function should return a DESeqResults object
+#' with a \code{padj} column.
 #' @param format character, either \code{"DataFrame"}, \code{"GRanges"}, or \code{"GRangesList"},
 #' whether the results should be printed as a \code{\link{DESeqResults}} DataFrame,
 #' or if the results DataFrame should be attached as metadata columns to
 #' the \code{GRanges} or \code{GRangesList} \code{rowRanges} of the \code{DESeqDataSet}.
 #' If the \code{rowRanges} is a \code{GRangesList}, and \code{GRanges} is requested, 
 #' the range of each gene will be returned
-#' @param test this is typically automatically detected internally.
+#' @param test this is automatically detected internally if not provided.
 #' the one exception is after \code{nbinomLRT} has been run, \code{test="Wald"}
 #' will generate Wald statistics and Wald test p-values.
 #' @param addMLE whether the "unshrunken" maximum likelihood estimates (MLE)
@@ -170,6 +178,7 @@
 #' to \code{\link{bplapply}} when \code{parallel=TRUE}.
 #' If not specified, the parameters last registered with
 #' \code{\link{register}} will be used.
+#' @param ... optional arguments passed to \code{filterFun}
 #' 
 #' @return For \code{results}: a \code{\link{DESeqResults}} object, which is
 #' a simple subclass of DataFrame. This object contains the results columns:
@@ -188,17 +197,20 @@
 #'
 #' For \code{removeResults}: the original \code{DESeqDataSet} with results metadata columns removed
 #'
-#' @seealso \code{\link{DESeq}}
+#' @seealso \code{\link{DESeq}}, \code{\link[genefilter]{filtered_R}}
 #'
 #' @examples
 #'
-#' ## Example 1: simple two-group comparison
+#' ## Example 1: two-group comparison
 #' 
 #' dds <- makeExampleDESeqDataSet(m=4)
 #' 
 #' dds <- DESeq(dds)
-#' res <- results(dds)
-#' res[ order(res$padj), ]
+#' res <- results(dds, contrast=c("condition","B","A"))
+#'
+#' # with more than two groups, the call would look similar, e.g.:
+#' # results(dds, contrast=c("condition","C","A"))
+#' # etc.
 #' 
 #' ## Example 2: two conditions, two genotypes, with an interaction term
 #' 
@@ -280,7 +292,8 @@ results <- function(object, contrast, name,
                     test, 
                     addMLE=FALSE,
                     tidy=FALSE,
-                    parallel=FALSE, BPPARAM=bpparam()) {
+                    parallel=FALSE, BPPARAM=bpparam(),
+                    ...) {
   # match args
   format <- match.arg(format, choices=c("DataFrame", "GRanges","GRangesList"))
   altHypothesis <- match.arg(altHypothesis, choices=c("greaterAbs","lessAbs","greater","less"))
@@ -293,6 +306,7 @@ results <- function(object, contrast, name,
   stopifnot(lfcThreshold >= 0)
   stopifnot(length(lfcThreshold)==1)
   stopifnot(length(alpha)==1)
+  stopifnot(alpha > 0 & alpha < 1)
   stopifnot(length(pAdjustMethod)==1)
   stopifnot(length(listValues)==2 & is.numeric(listValues))
   stopifnot(listValues[1] > 0 & listValues[2] < 0)
@@ -323,7 +337,7 @@ of length 3 to 'contrast' instead of using 'name'")
   }
   
   if (format == "GRanges" & is(rowRanges(object),"GRangesList")) {
-    if (any(elementLengths(rowRanges(object)) == 0)) {
+    if (any(elementNROWS(rowRanges(object)) == 0)) {
       stop("rowRanges is GRangesList and one or more GRanges have length 0. Use format='DataFrame' or 'GRangesList'")
     }
   }
@@ -475,24 +489,15 @@ of length 3 to 'contrast' instead of using 'name'")
     res$pvalue[nowZero] <- 1
   }
 
-  # p-value adjustment
-  paRes <- pvalueAdjustment(res, independentFiltering, filter, theta, alpha, pAdjustMethod, filterFun)
-  res$padj <- paRes$padj
-
-  # adding metadata columns for padj
-  mcols(res)$type[names(res)=="padj"] <- "results"
-  mcols(res)$description[names(res)=="padj"] <- paste(pAdjustMethod,"adjusted p-values")
-
   # make results object
   deseqRes <- DESeqResults(res)
-
-  # finalize object / add attributes / make GRanges
-  if (independentFiltering) {
-    metadata(deseqRes) <- list(filterThreshold=paRes$filterThreshold,
-                               filterTheta=paRes$filterTheta,
-                               filterNumRej=paRes$filterNumRej,
-                               lo.fit=paRes$lo.fit,
-                               alpha=alpha)
+  
+  # p-value adjustment
+  if (missing(filterFun)) {
+    deseqRes <- pvalueAdjustment(deseqRes, independentFiltering, filter,
+                                 theta, alpha, pAdjustMethod)
+  } else {
+    deseqRes <- filterFun(deseqRes, filter, alpha, pAdjustMethod)
   }
 
   # remove rownames and attach as a new column, 'row'
@@ -515,7 +520,7 @@ of length 3 to 'contrast' instead of using 'name'")
     return(out)
   } else if (format == "GRanges") {
     if (class(rowRanges(object)) == "GRangesList") {
-      warning("rowRanges is GRangesList, performing unlist(range(x)) on the rowRanges")
+      message("rowRanges is GRangesList, performing unlist(range(x)) on the rowRanges")
       out <- unlist(range(rowRanges(object)))
       mcols(out) <- deseqRes
       return(out)
@@ -549,8 +554,7 @@ removeResults <- function(object) {
 ###########################################################
 
 pvalueAdjustment <- function(res, independentFiltering, filter,
-                             theta, alpha, pAdjustMethod,
-                             filterFun) {
+                             theta, alpha, pAdjustMethod) {
   # perform independent filtering
   if (independentFiltering) {
     if (missing(filter)) {
@@ -561,63 +565,58 @@ pvalueAdjustment <- function(res, independentFiltering, filter,
       if (lowerQuantile < .95) upperQuantile <- .95 else upperQuantile <- 1
       theta <- seq(lowerQuantile, upperQuantile, length=50)
     }
-    if (missing(filterFun)) {
-      # do filtering using genefilter
-      stopifnot(length(theta) > 1)
-      stopifnot(length(filter) == nrow(res))
-      filtPadj <- filtered_p(filter=filter, test=res$pvalue,
-                             theta=theta, method=pAdjustMethod) 
-      numRej  <- colSums(filtPadj < alpha, na.rm = TRUE)
-      # prevent over-aggressive filtering when all genes are null,
-      # by requiring the max number of rejections is above a fitted curve.
-      # If the max number of rejection is not greater than 10, then don't
-      # perform independent filtering at all.
-      lo.fit <- lowess(numRej ~ theta, f=1/5)
-      if (max(numRej) <= 10) {
-        j <- 1
-      } else { 
-          residual <- if (all(numRej==0)) {
-            0
-          } else {
-              numRej[numRej > 0] - lo.fit$y[numRej > 0]
-            }
-          thresh <- max(lo.fit$y) - sqrt(mean(residual^2))
-          j <- if (any(numRej > thresh)) {
-            which(numRej > thresh)[1]
-          } else {
-              1  
-            }
-        }
-      # j <- which.max(numRej) # old method
-      padj <- filtPadj[, j, drop=TRUE]
-      cutoffs <- quantile(filter, theta)
-      filterThreshold <- cutoffs[j]
-      filterNumRej <- data.frame(theta=theta, numRej=numRej)
-      filterTheta <- theta[j]
-    } else {
-      # use a custom filter function
-      padj <- filterFun(alpha=alpha,
-                        filter=filter,
-                        test=res$pvalue,
-                        theta=theta,
-                        method=pAdjustMethod)
-      filterThreshold <- NULL
-      filterTheta <- NULL
-      filterNumRej <- NULL
-      lo.fit <- NULL
+
+    # do filtering using genefilter
+    stopifnot(length(theta) > 1)
+    stopifnot(length(filter) == nrow(res))
+    filtPadj <- filtered_p(filter=filter, test=res$pvalue,
+                           theta=theta, method=pAdjustMethod) 
+    numRej  <- colSums(filtPadj < alpha, na.rm = TRUE)
+    # prevent over-aggressive filtering when all genes are null,
+    # by requiring that the max number of rejections be above a fitted curve.
+    # If the max number of rejections is not greater than 10, then don't
+    # perform independent filtering at all.
+    lo.fit <- lowess(numRej ~ theta, f=1/5)
+    if (max(numRej) <= 10) {
+      j <- 1
+    } else { 
+      residual <- if (all(numRej==0)) {
+        0
+      } else {
+        numRej[numRej > 0] - lo.fit$y[numRej > 0]
+      }
+      thresh <- max(lo.fit$y) - sqrt(mean(residual^2))
+      j <- if (any(numRej > thresh)) {
+        which(numRej > thresh)[1]
+      } else {
+        1  
+      }
     }
-    return(list(padj=padj,
-                filterThreshold=filterThreshold,
-                filterTheta=filterTheta,
-                filterNumRej=filterNumRej,
-                lo.fit=lo.fit))
+    # j <- which.max(numRej) # old method
+    padj <- filtPadj[, j, drop=TRUE]
+    cutoffs <- quantile(filter, theta)
+    filterThreshold <- cutoffs[j]
+    filterNumRej <- data.frame(theta=theta, numRej=numRej)
+    filterTheta <- theta[j]
+
+    metadata(res)[["filterThreshold"]] <- filterThreshold
+    metadata(res)[["filterTheta"]] <- filterTheta
+    metadata(res)[["filterNumRej"]] <- filterNumRej
+    metadata(res)[["lo.fit"]] <- lo.fit
+    metadata(res)[["alpha"]] <- alpha
   } else {
     # regular p-value adjustment
     # does not include those rows which were removed
     # by maximum Cook's distance
-    padj <- p.adjust(res$pvalue,method=pAdjustMethod)
-    return(list(padj=padj))
+    padj <- p.adjust(res$pvalue,method=pAdjustMethod)  
   }
+  res$padj <- padj
+  
+  # add metadata to padj column
+  mcols(res)$type[names(res)=="padj"] <- "results"
+  mcols(res)$description[names(res)=="padj"] <- paste(pAdjustMethod,"adjusted p-values")
+  
+  res
 }
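
As a usage sketch of the two adjustment paths above (assuming a
DESeqDataSet `dds` already processed by DESeq(); `ihw` is from the
Bioconductor IHW package, demonstrated in the vignette changes below):

  # default path: pvalueAdjustment() performs independent filtering
  res <- results(dds)
  # custom path: the filterFun is handed the DESeqResults object
  # and returns it with the 'padj' column filled in
  library("IHW")
  resIHW <- results(dds, filterFun=ihw)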
 
 
@@ -720,9 +719,6 @@ cleanContrast <- function(object, contrast, expanded=FALSE, listValues, test) {
     contrastNumLevel <- contrast[2]
     contrastDenomLevel <- contrast[3]
     contrastBaseLevel <- levels(colData(object)[,contrastFactor])[1]
-
-    # check if both levels have all zero counts
-    contrastAllZero <- contrastAllZeroCharacter(object, contrastFactor, contrastNumLevel, contrastDenomLevel)
     
     # check for intercept
     hasIntercept <- attr(terms(design(object)),"intercept") == 1
@@ -825,6 +821,12 @@ cleanContrast <- function(object, contrast, expanded=FALSE, listValues, test) {
                    "are expected to be in resultsNames(object)"))
       }
     }
+
+    # check if both levels have all zero counts
+    # (this must happen after the error checking above, so that invalid
+    # contrasts error out first)
+    contrastAllZero <- contrastAllZeroCharacter(object, contrastFactor,
+                         contrastNumLevel, contrastDenomLevel)
+    
   }
 
   # if the result table not already built in the above code...
@@ -1079,6 +1081,17 @@ contrastAllZeroNumeric <- function(object, contrast) {
     stop("was expecting a model matrix stored as an attribute of the DESeqDataSet")
   }
   modelMatrix <- attr(object, "modelMatrix")
+
+  # note: the extra leg-work of zeroing out the LFC and lfcSE, and setting
+  # the p-value to 1, for contrasts comparing two groups which both have
+  # all-zero counts, is only implemented for the case in which the relevant
+  # samples can be identified by multiplying the model matrix with a vector
+  # in which the non-zero elements of the numeric contrast are replaced with 1.
+
+  # so this code will not zero out results when, for example, the user
+  # supplies a numeric vector that pulls out a single column of a standard
+  # model matrix.
+  
   if (all(contrast >= 0) | all(contrast <= 0)) {
     return( rep(FALSE, nrow(object)) )
   }
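
A minimal sketch of the detection idea described in the comment above
(hypothetical objects: a model matrix `mm`, a numeric contrast `contrast`
with elements of both signs, and a count matrix `cnts`):

  # flag the samples involved in the contrast by replacing the
  # non-zero contrast elements with 1 before multiplying
  involved <- (mm %*% ifelse(contrast != 0, 1, 0)) != 0
  # genes where all involved samples have zero counts
  allZero <- rowSums(cnts[, involved, drop=FALSE]) == 0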
diff --git a/R/vst.R b/R/vst.R
index 706bf25..39a1021 100644
--- a/R/vst.R
+++ b/R/vst.R
@@ -129,7 +129,7 @@ varianceStabilizingTransformation <- function (object, blind=TRUE, fitType="para
   }
   if (is.matrix(object)) {
     matrixIn <- TRUE
-    object <- DESeqDataSetFromMatrix(object, DataFrame(row.names=colnames(object)), ~ 1)
+    object <- DESeqDataSetFromMatrix(object, DataFrame(row.names=colnames(object)), ~1)
   } else {
     matrixIn <- FALSE
   }
@@ -171,10 +171,14 @@ getVarianceStabilizedData <- function(object) {
   } else if ( attr( dispersionFunction(object), "fitType" ) == "local" ) {
     # non-parametric fit -> numerical integration
     if (is.null(sizeFactors(object))) {
-      stop("call estimateSizeFactors before calling getVarianceStabilizedData if using local dispersion fit")
+      stopifnot(!is.null(normalizationFactors(object)))
+      # approximate size factors from columns of NF
+      sf <- exp(colMeans(log(normalizationFactors(object))))
+    } else {
+      sf <- sizeFactors(object)
     }
     xg <- sinh( seq( asinh(0), asinh(max(ncounts)), length.out=1000 ) )[-1]
-    xim <- mean( 1/sizeFactors(object) )
+    xim <- mean( 1/sf )
     baseVarsAtGrid <- dispersionFunction(object)( xg ) * xg^2 + xim * xg
     integrand <- 1 / sqrt( baseVarsAtGrid )
     splf <- splinefun(
@@ -201,3 +205,82 @@ getVarianceStabilizedData <- function(object) {
     stop( "fitType is not parametric, local or mean" )
   }
 }
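
The fallback above is the column-wise geometric mean of the normalization
factors; a quick self-contained check of that identity (toy values):

  nf <- matrix(runif(12, 0.5, 2), nrow=4)    # toy normalization factors
  sf <- exp(colMeans(log(nf)))               # geometric mean per column
  all.equal(sf, apply(nf, 2, function(x) prod(x)^(1/length(x))))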
+
+#' Quickly estimate dispersion trend and apply a variance stabilizing transformation
+#'
+#' This is a wrapper for the \code{\link{varianceStabilizingTransformation}} (VST)
+#' that provides much faster estimation of the dispersion trend used to determine
+#' the formula for the VST. The speed-up is accomplished by
+#' subsetting to a smaller number of genes in order to estimate this dispersion trend.
+#' The subset of genes is chosen deterministically, to span the range
+#' of genes' mean normalized count.
+#' This wrapper for the VST is not blind to the experimental design:
+#' the sample covariate information is used to estimate the global trend
+#' of genes' dispersion values over the genes' mean normalized count.
+#' It can be made strictly blind to experimental design by first
+#' assigning a \code{\link{design}} of \code{~1} before running this function,
+#' or by avoiding subsetting and using \code{\link{varianceStabilizingTransformation}}.
+#' 
+#' @param object a DESeqDataSet or a matrix of counts
+#' @param blind logical, whether to blind the transformation to the experimental
+#' design (see \code{\link{varianceStabilizingTransformation}})
+#' @param nsub the number of genes to subset to (default 1000)
+#' @param fitType for estimation of dispersions: this parameter
+#' is passed on to \code{\link{estimateDispersions}} (options described there)
+#'
+#' @return a DESeqTransform object or a matrix of transformed, normalized counts
+#'
+#' @examples
+#'
+#' dds <- makeExampleDESeqDataSet(n=20000, m=20)
+#' vsd <- vst(dds)
+#'
+#' @export
+vst <- function(object, blind=TRUE, nsub=1000, fitType="parametric") {
+  if (nrow(object) < nsub) {
+    stop("less than 'nsub' rows,
+  it is recommended to use varianceStabilizingTransformation directly")
+  }
+  if (is.null(colnames(object))) {
+    colnames(object) <- seq_len(ncol(object))
+  }
+  if (is.matrix(object)) {
+    matrixIn <- TRUE
+    object <- DESeqDataSetFromMatrix(object, DataFrame(row.names=colnames(object)), ~ 1)
+  } else {
+    if (blind) {
+      design(object) <- ~ 1
+    }
+    matrixIn <- FALSE
+  }
+  if (is.null(sizeFactors(object)) & is.null(normalizationFactors(object))) {
+    object <- estimateSizeFactors(object)
+  }
+  baseMean <- rowMeans(counts(object, normalized=TRUE))
+  if (sum(baseMean > 5) < nsub) {
+    stop("less than 'nsub' rows with mean normalized count > 5, 
+  it is recommended to use varianceStabilizingTransformation directly")
+  }
+
+  # subset to a specified number of genes with mean normalized count > 5
+  object.sub <- object[baseMean > 5,]
+  baseMean <- baseMean[baseMean > 5]
+  o <- order(baseMean)
+  idx <- o[round(seq(from=1, to=length(o), length=nsub))]
+  object.sub <- object.sub[idx,]
+
+  # estimate dispersion trend
+  object.sub <- estimateDispersionsGeneEst(object.sub, quiet=TRUE)
+  object.sub <- estimateDispersionsFit(object.sub, fitType=fitType, quiet=TRUE)
+
+  # assign to the full object
+  suppressMessages({dispersionFunction(object) <- dispersionFunction(object.sub)})
+
+  # calculate and apply the VST
+  vsd <- varianceStabilizingTransformation(object, blind=FALSE)
+  if (matrixIn) {
+    return(assay(vsd))
+  } else {
+    return(vsd)
+  }
+}
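
As a usage sketch, vst() also accepts a raw count matrix directly, in which
case a matrix of transformed values is returned (here with simulated data,
choosing nsub small enough for the number of simulated genes):

  dds <- makeExampleDESeqDataSet(n=2000, m=6)
  mat <- counts(dds)
  vmat <- vst(mat, nsub=500)    # matrix in, matrix out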
diff --git a/build/vignette.rds b/build/vignette.rds
index c6d055f..05d2369 100644
Binary files a/build/vignette.rds and b/build/vignette.rds differ
diff --git a/inst/doc/DESeq2.R b/inst/doc/DESeq2.R
new file mode 100644
index 0000000..268cdd7
--- /dev/null
+++ b/inst/doc/DESeq2.R
@@ -0,0 +1,496 @@
+## ----knitr, echo=FALSE, results="hide"-----------------------------------
+library("knitr")
+opts_chunk$set(
+  tidy=FALSE,
+  dev="png",
+  fig.show="hide",
+  fig.width=4, fig.height=4.5,
+  cache=TRUE,
+  message=FALSE)
+
+## ----style, eval=TRUE, echo=FALSE, results="asis"---------------------------------------
+BiocStyle::latex()
+
+## ----loadDESeq2, echo=FALSE-------------------------------------------------------------
+library("DESeq2")
+
+## ----options, results="hide", echo=FALSE--------------------------------------
+options(digits=3, width=80, prompt=" ", continue=" ")
+
+## ----quick, eval=FALSE--------------------------------------------------------
+#  dds <- DESeqDataSet(se, design = ~ batch + condition)
+#  dds <- DESeq(dds)
+#  res <- results(dds, contrast=c("condition","trt","con"))
+
+## ----loadSumExp---------------------------------------------------------------
+library("airway")
+data("airway")
+se <- airway
+
+## ----sumExpInput--------------------------------------------------------------
+library("DESeq2")
+ddsSE <- DESeqDataSet(se, design = ~ cell + dex)
+ddsSE
+
+## ----loadPasilla--------------------------------------------------------------
+library("pasilla")
+library("Biobase")
+data("pasillaGenes")
+countData <- counts(pasillaGenes)
+colData <- pData(pasillaGenes)[,c("condition","type")]
+
+## ----showPasilla--------------------------------------------------------------
+head(countData)
+head(colData)
+
+## ----matrixInput--------------------------------------------------------------
+dds <- DESeqDataSetFromMatrix(countData = countData,
+                              colData = colData,
+                              design = ~ condition)
+dds
+
+## ----addFeatureData-----------------------------------------------------------
+featureData <- data.frame(gene=rownames(pasillaGenes))
+(mcols(dds) <- DataFrame(mcols(dds), featureData))
+
+## ----tximport-----------------------------------------------------------------
+library("tximport")
+library("readr")
+library("tximportData")
+dir <- system.file("extdata", package="tximportData")
+samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
+files <- file.path(dir,"salmon", samples$run, "quant.sf")
+names(files) <- paste0("sample",1:6)
+tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv)
+
+## ----txi2dds------------------------------------------------------------------
+coldata <- data.frame(condition=factor(rep(c("A","B"),each=3)))
+rownames(coldata) <- colnames(txi$counts)
+ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition)
+
+## ----htseqDirI, eval=FALSE----------------------------------------------------
+#  directory <- "/path/to/your/files/"
+
+## ----htseqDirII---------------------------------------------------------------
+directory <- system.file("extdata", package="pasilla", mustWork=TRUE)
+
+## ----htseqInput---------------------------------------------------------------
+sampleFiles <- grep("treated",list.files(directory),value=TRUE)
+sampleCondition <- sub("(.*treated).*","\\1",sampleFiles)
+sampleTable <- data.frame(sampleName = sampleFiles,
+                          fileName = sampleFiles,
+                          condition = sampleCondition)
+ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
+                                       directory = directory,
+                                       design= ~ condition)
+ddsHTSeq
+
+## ----prefilter----------------------------------------------------------------
+dds <- dds[ rowSums(counts(dds)) > 1, ]
+
+## ----factorlvl----------------------------------------------------------------
+dds$condition <- factor(dds$condition, levels=c("untreated","treated"))
+
+## ----relevel------------------------------------------------------------------
+dds$condition <- relevel(dds$condition, ref="untreated")
+
+## ----droplevels---------------------------------------------------------------
+dds$condition <- droplevels(dds$condition)
+
+## ----deseq--------------------------------------------------------------------
+dds <- DESeq(dds)
+res <- results(dds)
+res
+
+## ----parallel, eval=FALSE-----------------------------------------------------
+#  library("BiocParallel")
+#  register(MulticoreParam(4))
+
+## ----resOrder-----------------------------------------------------------------
+resOrdered <- res[order(res$padj),]
+
+## ----sumRes-------------------------------------------------------------------
+summary(res)
+
+## ----sumRes01-----------------------------------------------------------------
+sum(res$padj < 0.1, na.rm=TRUE)
+
+## ----resAlpha05---------------------------------------------------------------
+res05 <- results(dds, alpha=0.05)
+summary(res05)
+sum(res05$padj < 0.05, na.rm=TRUE)
+
+## ----IHW----------------------------------------------------------------------
+library("IHW")
+resIHW <- results(dds, filterFun=ihw)
+summary(resIHW)
+sum(resIHW$padj < 0.1, na.rm=TRUE)
+metadata(resIHW)$ihwResult
+
+## ----MA, fig.width=4.5, fig.height=4.5----------------------------------------
+plotMA(res, main="DESeq2", ylim=c(-2,2))
+
+## ----MAidentify, eval=FALSE---------------------------------------------------
+#  idx <- identify(res$baseMean, res$log2FoldChange)
+#  rownames(res)[idx]
+
+## ----resMLE-------------------------------------------------------------------
+resMLE <- results(dds, addMLE=TRUE)
+head(resMLE, 4)
+
+## ----MANoPrior, fig.width=4.5, fig.height=4.5---------------------------------
+plotMA(resMLE, MLE=TRUE, main="unshrunken LFC", ylim=c(-2,2))
+
+## ----plotCounts, dev="pdf", fig.width=4.5, fig.height=5-----------------------
+plotCounts(dds, gene=which.min(res$padj), intgroup="condition")
+
+## ----plotCountsAdv, dev="pdf", fig.width=3.5, fig.height=3.5------------------
+d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition", 
+                returnData=TRUE)
+library("ggplot2")
+ggplot(d, aes(x=condition, y=count)) + 
+  geom_point(position=position_jitter(w=0.1,h=0)) + 
+  scale_y_log10(breaks=c(25,100,400))
+
+## ----metadata-----------------------------------------------------------------
+mcols(res)$description
+
+## ----export, eval=FALSE-------------------------------------------------------
+#  write.csv(as.data.frame(resOrdered),
+#            file="condition_treated_results.csv")
+
+## ----subset-------------------------------------------------------------------
+resSig <- subset(resOrdered, padj < 0.1)
+resSig
+
+## ----multifactor--------------------------------------------------------------
+colData(dds)
+
+## ----copyMultifactor----------------------------------------------------------
+ddsMF <- dds
+
+## ----replaceDesign------------------------------------------------------------
+design(ddsMF) <- formula(~ type + condition)
+ddsMF <- DESeq(ddsMF)
+
+## ----multiResults-------------------------------------------------------------
+resMF <- results(ddsMF)
+head(resMF)
+
+## ----multiTypeResults---------------------------------------------------------
+resMFType <- results(ddsMF, contrast=c("type","single-read","paired-end"))
+head(resMFType)
+
+## ----rlogAndVST---------------------------------------------------------------
+rld <- rlog(dds, blind=FALSE)
+vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
+vsd.fast <- vst(dds, blind=FALSE)
+head(assay(rld), 3)
+
+## ----vsd1, echo=FALSE, fig.width=4.5, fig.height=4.5--------------------------
+px     <- counts(dds)[,1] / sizeFactors(dds)[1]
+ord    <- order(px)
+ord    <- ord[px[ord] < 150]
+ord    <- ord[seq(1, length(ord), length=50)]
+last   <- ord[length(ord)]
+vstcol <- c("blue", "black")
+matplot(px[ord],
+        cbind(assay(vsd)[, 1], log2(px))[ord, ],
+        type="l", lty=1, col=vstcol, xlab="n", ylab="f(n)")
+legend("bottomright",
+       legend = c(
+        expression("variance stabilizing transformation"),
+        expression(log[2](n/s[1]))),
+       fill=vstcol)
+
+## ----log2meansd, fig.width=4, fig.height=3------------------------------------
+library("vsn")
+notAllZero <- (rowSums(counts(dds))>0)
+meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1))
+
+## ----rlogmeansd, fig.width=4, fig.height=3------------------------------------
+meanSdPlot(assay(rld[notAllZero,]))
+
+## ----vstmeansd, fig.width=4, fig.height=3-------------------------------------
+meanSdPlot(assay(vsd[notAllZero,]))
+
+## ----heatmap------------------------------------------------------------------
+library("pheatmap")
+select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20]
+
+## ----figHeatmap2a, dev="pdf", fig.width=5, fig.height=7-----------------------
+nt <- normTransform(dds) # defaults to log2(x+1)
+log2.norm.counts <- assay(nt)[select,]
+df <- as.data.frame(colData(dds)[,c("condition","type")])
+pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=FALSE,
+         cluster_cols=FALSE, annotation_col=df)
+
+## ----figHeatmap2b, dev="pdf", fig.width=5, fig.height=7-----------------------
+pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE,
+         cluster_cols=FALSE, annotation_col=df)
+
+## ----figHeatmap2c, dev="pdf", fig.width=5, fig.height=7-----------------------
+pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
+         cluster_cols=FALSE, annotation_col=df)
+
+## ----sampleClust--------------------------------------------------------------
+sampleDists <- dist(t(assay(rld)))
+
+## ----figHeatmapSamples, dev="pdf", fig.width=7, fig.height=7------------------
+library("RColorBrewer")
+sampleDistMatrix <- as.matrix(sampleDists)
+rownames(sampleDistMatrix) <- paste(rld$condition, rld$type, sep="-")
+colnames(sampleDistMatrix) <- NULL
+colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
+pheatmap(sampleDistMatrix,
+         clustering_distance_rows=sampleDists,
+         clustering_distance_cols=sampleDists,
+         col=colors)
+
+## ----figPCA, dev="pdf", fig.width=5, fig.height=3-----------------------------
+plotPCA(rld, intgroup=c("condition", "type"))
+
+## ----figPCA2, dev="pdf", fig.width=5, fig.height=3----------------------------
+data <- plotPCA(rld, intgroup=c("condition", "type"), returnData=TRUE)
+percentVar <- round(100 * attr(data, "percentVar"))
+ggplot(data, aes(PC1, PC2, color=condition, shape=type)) +
+  geom_point(size=3) +
+  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
+  ylab(paste0("PC2: ",percentVar[2],"% variance"))
+
+## ----WaldTest, eval=FALSE-----------------------------------------------------
+#  dds <- estimateSizeFactors(dds)
+#  dds <- estimateDispersions(dds)
+#  dds <- nbinomWaldTest(dds)
+
+## ----simpleContrast, eval=FALSE-----------------------------------------------
+#  results(dds, contrast=c("condition","C","B"))
+
+## ----combineFactors, eval=FALSE-----------------------------------------------
+#  dds$group <- factor(paste0(dds$genotype, dds$condition))
+#  design(dds) <- ~ group
+#  dds <- DESeq(dds)
+#  resultsNames(dds)
+#  results(dds, contrast=c("group", "IB", "IA"))
+
+## ----interFig, dev="pdf", fig.width=4, fig.height=3, echo=FALSE, results="hide"----
+npg <- 20
+mu <- 2^c(8,10,9,11,10,12)
+cond <- rep(rep(c("A","B"),each=npg),3)
+geno <- rep(c("I","II","III"),each=2*npg)
+table(cond, geno)
+counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01)
+d <- data.frame(log2c=log2(counts+1), cond, geno)
+library(ggplot2)
+plotit <- function(d, title) {
+  ggplot(d, aes(x=cond, y=log2c, group=geno)) + 
+    geom_jitter(size=1.5, position = position_jitter(width=.15)) +
+    facet_wrap(~ geno) + 
+    stat_summary(fun.y=mean, geom="line", colour="red", size=0.8) + 
+    xlab("condition") + ylab("log2(counts+1)") + ggtitle(title)
+}
+plotit(d, "Gene 1") + ylim(7,13)
+lm(log2c ~ cond + geno + geno:cond, data=d)
+
+## ----interFig2, dev="pdf", fig.width=4, fig.height=3,  echo=FALSE, results="hide"----
+mu[4] <- 2^12
+mu[6] <- 2^8
+counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01)
+d2 <- data.frame(log2c=log2(counts + 1), cond, geno)
+plotit(d2, "Gene 2") + ylim(7,13)
+lm(log2c ~ cond + geno + geno:cond, data=d2)
+
+## ----boxplotCooks-------------------------------------------------------------
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+## ----dispFit------------------------------------------------------------------
+plotDispEsts(dds)
+
+## ----dispFitCustom------------------------------------------------------------
+ddsCustom <- dds
+useForMedian <- mcols(ddsCustom)$dispGeneEst > 1e-7
+medianDisp <- median(mcols(ddsCustom)$dispGeneEst[useForMedian],na.rm=TRUE)
+dispersionFunction(ddsCustom) <- function(mu) medianDisp
+ddsCustom <- estimateDispersionsMAP(ddsCustom)
+
+## ----filtByMean, dev="pdf"----------------------------------------------------
+metadata(res)$alpha
+metadata(res)$filterThreshold
+plot(metadata(res)$filterNumRej, 
+     type="b", ylab="number of rejections",
+     xlab="quantiles of filter")
+lines(metadata(res)$lo.fit, col="red")
+abline(v=metadata(res)$filterTheta)
+
+## ----noFilt-------------------------------------------------------------------
+resNoFilt <- results(dds, independentFiltering=FALSE)
+addmargins(table(filtering=(res$padj < .1), noFiltering=(resNoFilt$padj < .1)))
+
+## ----ddsNoPrior---------------------------------------------------------------
+ddsNoPrior <- DESeq(dds, betaPrior=FALSE)
+
+## ----lfcThresh----------------------------------------------------------------
+par(mfrow=c(2,2),mar=c(2,2,1,1))
+yl <- c(-2.5,2.5)
+
+resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs")
+resLA <- results(ddsNoPrior, lfcThreshold=.5, altHypothesis="lessAbs")
+resG <- results(dds, lfcThreshold=.5, altHypothesis="greater")
+resL <- results(dds, lfcThreshold=.5, altHypothesis="less")
+
+plotMA(resGA, ylim=yl)
+abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
+plotMA(resLA, ylim=yl)
+abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
+plotMA(resG, ylim=yl)
+abline(h=.5,col="dodgerblue",lwd=2)
+plotMA(resL, ylim=yl)
+abline(h=-.5,col="dodgerblue",lwd=2)
+
+## ----mcols--------------------------------------------------------------------
+mcols(dds,use.names=TRUE)[1:4,1:4]
+# here using substr() only for display purposes
+substr(names(mcols(dds)),1,10) 
+mcols(mcols(dds), use.names=TRUE)[1:4,]
+
+## ----muAndCooks---------------------------------------------------------------
+head(assays(dds)[["mu"]])
+head(assays(dds)[["cooks"]])
+
+## ----dispersions--------------------------------------------------------------
+head(dispersions(dds))
+# which is the same as 
+head(mcols(dds)$dispersion)
+
+## ----sizefactors--------------------------------------------------------------
+sizeFactors(dds)
+
+## ----coef---------------------------------------------------------------------
+head(coef(dds))
+
+## ----betaPriorVar-------------------------------------------------------------
+attr(dds, "betaPriorVar")
+
+## ----dispPriorVar-------------------------------------------------------------
+dispersionFunction(dds)
+attr(dispersionFunction(dds), "dispPriorVar")
+
+## ----versionNum---------------------------------------------------------------
+metadata(dds)[["version"]]
+
+## ----normFactors, eval=FALSE--------------------------------------------------
+#  normFactors <- normFactors / exp(rowMeans(log(normFactors)))
+#  normalizationFactors(dds) <- normFactors
+
+## ----offsetTransform, eval=FALSE----------------------------------------------
+#  cqnOffset <- cqnObject$glm.offset
+#  cqnNormFactors <- exp(cqnOffset)
+#  EDASeqNormFactors <- exp(-1 * EDASeqOffset)
+
+## ----lineardep, echo=FALSE----------------------------------------------------
+data.frame(batch=factor(c(1,1,2,2)), condition=factor(c("A","A","B","B")))
+
+## ----lineardep2, echo=FALSE---------------------------------------------------
+data.frame(batch=factor(c(1,1,1,1,2,2)), condition=factor(c("A","A","B","B","C","C")))
+
+## ----lineardep3, echo=FALSE---------------------------------------------------
+data.frame(batch=factor(c(1,1,1,2,2,2)), condition=factor(c("A","B","C","A","B","C")))
+
+## ----groupeffect--------------------------------------------------------------
+(coldata <- data.frame(grp=factor(rep(c("X","Y"),each=4)),
+                       ind=factor(rep(1:4,each=2)),
+                       cnd=factor(rep(c("A","B"),4))))
+
+## ----groupeffect2-------------------------------------------------------------
+coldata$ind.n <- factor(rep(rep(1:2,each=2),2))
+coldata
+
+## ----groupeffect3-------------------------------------------------------------
+model.matrix(~ grp + grp:ind.n + grp:cnd, coldata)
+
+## ----groupeffect4, eval=FALSE-------------------------------------------------
+#  results(dds, contrast=list("grpY.cndB","grpX.cndB"))
+
+## ----missingcombo-------------------------------------------------------------
+group <- factor(rep(1:3,each=6))
+condition <- factor(rep(rep(c("A","B","C"),each=2),3))
+(d <- data.frame(group, condition)[-c(17,18),])
+
+## ----missingcombo2------------------------------------------------------------
+m1 <- model.matrix(~ condition*group, d)
+colnames(m1)
+unname(m1)
+
+## ----missingcombo3------------------------------------------------------------
+m1 <- m1[,-9]
+unname(m1)
+
+## ----cooksPlot----------------------------------------------------------------
+W <- res$stat
+maxCooks <- apply(assays(dds)[["cooks"]],1,max)
+idx <- !is.na(W)
+plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", 
+     ylab="maximum Cook's distance per gene",
+     ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3))
+m <- ncol(dds)
+p <- 3
+abline(h=qf(.99, p, m - p))
+
+## ----indFilt------------------------------------------------------------------
+plot(res$baseMean+1, -log10(res$pvalue),
+     log="x", xlab="mean of normalized counts",
+     ylab=expression(-log[10](pvalue)),
+     ylim=c(0,30),
+     cex=.4, col=rgb(0,0,0,.3))
+
+## ----histindepfilt, dev="pdf", fig.width=7, fig.height=5----------------------
+use <- res$baseMean > metadata(res)$filterThreshold
+h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
+h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
+colori <- c(`do not pass`="khaki", `pass`="powderblue")
+
+## ----fighistindepfilt---------------------------------------------------------
+barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
+        col = colori, space = 0, main = "", ylab="frequency")
+text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
+     adj = c(0.5,1.7), xpd=NA)
+legend("topright", fill=rev(colori), legend=rev(names(colori)))
+
+## ----vanillaDESeq, eval=FALSE-------------------------------------------------
+#  dds <- DESeq(dds, minReplicatesForReplace=Inf)
+#  res <- results(dds, cooksCutoff=FALSE, independentFiltering=FALSE)
+
+## ----varGroup, echo=FALSE, fig.width=5, fig.height=5--------------------------
+set.seed(3)
+dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01)
+dds2 <- makeExampleDESeqDataSet(n=1000,m=12,
+                                betaSD=.3,
+                                interceptMean=mcols(dds1)$trueIntercept,
+                                interceptSD=0,
+                                dispMeanRel=function(x) 0.2)
+dds2 <- dds2[,7:12]
+dds2$condition <- rep("C",6)
+mcols(dds2) <- NULL
+dds <- cbind(dds1, dds2)
+rld <- rlog(dds, blind=FALSE, fitType="mean")
+plotPCA(rld)
+
+## ----overShrink, echo=FALSE, fig.width=5, fig.height=5------------------------
+plot(c(10^rnorm(1000, 3, 2),300,2000,5000), 
+     c(rnorm(1000, 0, .15), -5.5, -8.5, 7.5),
+     ylim=c(-10,10), log="x", cex=.4,
+     xlab="mean of normalized counts", 
+     ylab="log2 fold change")
+abline(h=0, col=rgb(1,0,0,.7))
+
+## ----convertNA, eval=FALSE----------------------------------------------------
+#  res$padj <- ifelse(is.na(res$padj), 1, res$padj)
+
+## ----sessInfo, results="asis", echo=FALSE-------------------------------------
+toLatex(sessionInfo())
+
+## ----resetOptions, results="hide", echo=FALSE---------------------------------
+options(prompt="> ", continue="+ ")
+
diff --git a/inst/doc/DESeq2.Rnw b/inst/doc/DESeq2.Rnw
index 78543b6..3aeb276 100644
--- a/inst/doc/DESeq2.Rnw
+++ b/inst/doc/DESeq2.Rnw
@@ -124,26 +124,27 @@ for more information about how to construct an informative post.
 
 \subsection{Input data} \label{sec:prep}
 
-\subsubsection{Why raw counts?}
+\subsubsection{Why un-normalized counts?}
 
 As input, the \deseqtwo{} package expects count data as obtained, e.\,g.,
 from RNA-seq or another high-throughput sequencing experiment, in the form of a
 matrix of integer values. The value in the $i$-th row and the $j$-th column of
-the matrix tells how many reads have been mapped to gene $i$ in sample $j$.
+the matrix tells how many reads can be assigned to gene $i$ in sample $j$.
 Analogously, for other types of assays, the rows of the matrix might correspond
 e.\,g.\ to binding regions (with ChIP-Seq) or peptide sequences (with
-quantitative mass spectrometry). We will list method for obtaining count tables
-in a section below.
+quantitative mass spectrometry). We will list methods for obtaining count matrices
+in sections below.
 
-The count values must be raw counts of sequencing reads (for
+The values in the matrix should be un-normalized counts of sequencing reads (for
 single-end RNA-seq) or fragments (for paired-end RNA-seq). 
 The \href{http://www.bioconductor.org/help/workflows/rnaseqGene/}{RNA-seq workflow}
 describes multiple techniques for preparing such count matrices.
 It is important to provide count matrices as input for \deseqtwo{}'s
 statistical model \cite{Love2014} to hold, as only the  
 count values allow assessing the measurement precision correctly. The \deseqtwo{}
-model internally corrects for library size, so transformed values such as counts
-scaled by library size should never be used as input.
+model internally corrects for library size, so transformed or
+normalized values such as counts scaled by library size should not
+be used as input. 
 
 \subsubsection{\Rclass{SummarizedExperiment} input} \label{sec:sumExpInput}
 
@@ -252,6 +253,83 @@ featureData <- data.frame(gene=rownames(pasillaGenes))
 (mcols(dds) <- DataFrame(mcols(dds), featureData))
 @ 
 
+\subsubsection{tximport: transcript abundance summarized to gene-level}
+
+Users can create gene-level count matrices for use with \deseqtwo{}
+by importing information using the \Biocpkg{tximport} package.
+This workflow allows users to import transcript abundance estimates
+from a variety of external software, including the following methods:
+
+\begin{itemize}
+\item \href{http://www.cs.cmu.edu/~ckingsf/software/sailfish/}{Sailfish} 
+  \cite{Patro2014Sailfish}
+\item \href{http://combine-lab.github.io/salmon/}{Salmon} 
+  \cite{Patro2015Salmon}
+\item
+  \href{https://pachterlab.github.io/kallisto/about.html}{kallisto} 
+  \cite{Bray2015Near}
+\item \href{http://deweylab.github.io/RSEM/}{RSEM} 
+  \cite{Li2011RSEM}
+\end{itemize}
+
+Some advantages of using the above methods for transcript abundance
+estimation are: (i) this approach corrects for potential changes
+in gene length across samples 
+(e.g. from differential isoform usage) \cite{Trapnell2013Differential},
+(ii) some of these methods (\textit{Sailfish, Salmon, kallisto}) 
+are substantially faster and require less memory
+and disk usage compared to alignment-based methods that require
+creation and storage of BAM files, and
+(iii) it is possible to avoid discarding those fragments that can
+align to multiple genes with homologous sequence, thus increasing
+sensitivity \cite{Robert2015Errors}.
+
+Full details on the motivation and methods for importing transcript
+level abundance and count estimates, summarizing to gene-level count matrices 
+and producing an offset which corrects for potential changes in average
+transcript length across samples are described in \cite{Soneson2015}.
+The \textit{tximport}$\rightarrow$\deseqtwo{} approach uses estimated
+gene counts (rounded, but not normalized) instead of the raw count of fragments
+which can be unambiguously assigned to a gene.
+
+Here, we demonstrate how to import transcript abundances
+and construct a gene-level \Rclass{DESeqDataSet} object
+from \textit{Sailfish} \texttt{quant.sf} files, which are
+stored in the \Biocexptpkg{tximportData} package.
+Note that, instead of locating \Robject{dir} using \Rfunction{system.file},
+a user would typically just provide a path, e.g. \texttt{/path/to/quant/files}.
+For further details on use of \Rfunction{tximport}, 
+including the construction of the \Robject{tx2gene} table for linking
+transcripts to genes, please refer to the \Biocpkg{tximport} package vignette. 
+
+<<tximport>>=
+library("tximport")
+library("readr")
+library("tximportData")
+dir <- system.file("extdata", package="tximportData")
+samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
+files <- file.path(dir,"salmon", samples$run, "quant.sf")
+names(files) <- paste0("sample",1:6)
+tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv)
+@ 
+
+Next we create a condition vector to demonstrate building a
+\Robject{DESeqDataSet}. In typical use, this information would already
+be present as a column of the \Robject{samples} table.
+The best practice is to read \Robject{colData} from a CSV or TSV file, 
+and to construct \Robject{files} 
+from a column of \Robject{colData}, as shown in the code chunk above.
+
+<<txi2dds>>=
+coldata <- data.frame(condition=factor(rep(c("A","B"),each=3)))
+rownames(coldata) <- colnames(txi$counts)
+ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition)
+@
+
+The \Robject{ddsTxi} object can then be used as \Robject{dds} in the
+following analysis steps.
+
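+As a sketch of this best practice (the file name \texttt{samples.csv} and
+its columns \texttt{run} and \texttt{condition} are hypothetical; the
+\Robject{tx2gene} table is reused from the chunk above):
+
+<<txiColdataSketch, eval=FALSE>>=
+coldata <- read.csv("/path/to/samples.csv", row.names=1)
+files <- file.path("/path/to/quants", coldata$run, "quant.sf")
+names(files) <- rownames(coldata)
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv)
+ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition)
+@
+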
 \subsubsection{\textit{HTSeq} input}
 
 You can use the function \Rfunction{DESeqDataSetFromHTSeqCount} if you
@@ -435,6 +513,20 @@ summary(res05)
 sum(res05$padj < 0.05, na.rm=TRUE)
 @ 
 
+A generalization of the idea of $p$ value filtering is to \textit{weight} hypotheses
+to optimize power. A new Bioconductor package, \Biocpkg{IHW}, is now available
+that implements the method of \textit{Independent Hypothesis Weighting} \cite{Ignatiadis2015}.
+Here we show the use of \textit{IHW} for $p$ value adjustment of \deseqtwo{} results.
+For more details, please see the vignette of the \Biocpkg{IHW} package.
+Note that the \textit{IHW} result object is stored in the metadata.
+
+<<IHW>>=
+library("IHW")
+resIHW <- results(dds, filterFun=ihw)
+summary(resIHW)
+sum(resIHW$padj < 0.1, na.rm=TRUE)
+metadata(resIHW)$ihwResult
+@ 
 
 If a multi-factor design is used, or if the variable in the design
 formula has more than two levels, the \Robject{contrast} argument of
@@ -586,14 +678,33 @@ can be set to \Robject{NA} for one of the following reasons:
     described in Section~\ref{sec:autoFilt}.
 \end{enumerate}
 
-\subsubsection{Exporting results to HTML or CSV files}
-An HTML report of the results with plots and sortable/filterable columns
-can be exported using the \Biocpkg{ReportingTools} package
+\subsubsection{Rich visualization and reporting of results}
+
+\textbf{ReportingTools.} An HTML report of the results with plots and sortable/filterable columns
+can be generated using the \Biocpkg{ReportingTools} package
 on a \Rclass{DESeqDataSet} that has been processed by the \Rfunction{DESeq} function.
 For a code example, see the ``RNA-seq differential expression'' vignette at
 the \Biocpkg{ReportingTools} page, or the manual page for the 
 \Rfunction{publish} method for the \Rclass{DESeqDataSet} class.
 
+\textbf{regionReport.} An HTML and PDF summary of the results with plots
+can also be generated using the \Biocpkg{regionReport} package.
+The \Rfunction{DESeq2Report} function should be run on a 
+\Rclass{DESeqDataSet} that has been processed by the \Rfunction{DESeq} function.
+For more details see the manual page for \Rfunction{DESeq2Report} 
+and an example vignette in the \Biocpkg{regionReport} package.
+
+\textbf{Glimma.} Interactive visualization of \deseqtwo{} output, 
+including MA-plots (also called MD-plots), can be generated using the
+\Biocpkg{Glimma} package. See the manual page for \Rfunction{glMDPlot.DESeqResults}.
+
+\textbf{pcaExplorer.} Interactive visualization of \deseqtwo{} output,
+including PCA plots, boxplots of counts and other useful summaries can be
+generated using the \Biocpkg{pcaExplorer} package.
+See the ``Launching the application'' section of the package vignette.
+
+\subsubsection{Exporting results to CSV files}
+
 A plain-text file of the results can be exported using the 
 base \R{} functions \Rfunction{write.csv} or \Rfunction{write.delim}. 
 We suggest using a descriptive file name indicating the variable
@@ -726,12 +837,15 @@ variance above the trend which will allow us to cluster samples into
 interesting groups.
 
 \textbf{Note on running time:} if you have many samples (e.g. 100s),
-the \Rfunction{rlog} function might take too long, and the variance
-stabilizing transformation might be a better choice.  The rlog and VST
-have similar properties, but the rlog requires fitting a shrinkage
+the \Rfunction{rlog} function might take too long, and the 
+\Rfunction{varianceStabilizingTransformation} is a faster choice.  
+The rlog and VST have similar properties, but the rlog requires fitting a shrinkage
 term for each sample and each gene which takes time.  See the
 \deseqtwo{} paper for more discussion on the differences
-\cite{Love2014}.
+\cite{Love2014}. In addition, a new function \Rfunction{vst} provides
+an even faster version of the \Rfunction{varianceStabilizingTransformation},
+by calculating the global dispersion trend on a subset of the genes
+(default 1000). \Rfunction{vst} may be attractive for interactive EDA.
 
 \subsubsection{Blind dispersion estimation}
 
@@ -749,7 +863,7 @@ assurance) as demonstrated below.
 However, blind dispersion estimation is not the appropriate choice if
 one expects that many or the majority of genes (rows) will have large
 differences in counts which are explainable by the experimental design,
-and one wishes to tranform the data for downstream analysis. In this
+and one wishes to transform the data for downstream analysis. In this
 case, using blind dispersion estimation will lead to large estimates
 of dispersion, as it attributes differences due to experimental design
 as unwanted ``noise'', and will result in overly shrinking the transformed
@@ -761,18 +875,27 @@ that only the fitted dispersion estimates from mean-dispersion trend
 line are used in the transformation (the global dependence of
 dispersion on mean for the entire experiment).
 So setting \Robject{blind} to \Robject{FALSE} is still for the most
-part unbiased by the information about which samples were in which
-experimental group. 
+part not using the information about which samples are in which
+experimental group when applying the transformation.
 
 \subsubsection{Extracting transformed values}
 
-The two functions return an object of class \Rclass{DESeqTransform}
+These functions return an object of class \Rclass{DESeqTransform}
 which is a subclass of \Rclass{RangedSummarizedExperiment}. 
-The \Rfunction{assay} function is used to extract the matrix of normalized values:
+For $\sim 20$ samples, running on a newly created \Robject{DESeqDataSet},
+\Rfunction{rlog} may take 30 seconds, 
+\Rfunction{varianceStabilizingTransformation} may take 5 seconds, and
+\Rfunction{vst} less than 1 second (by subsetting to 1000 genes for
+calculating the global dispersion trend).
+However, the running times are shorter and more similar with \Rcode{blind=FALSE} and
+if the function \Rfunction{DESeq} has already been run, because then
+it is not necessary to re-estimate the dispersion values.
+The \Rfunction{assay} function is used to extract the matrix of normalized values.
 
 <<rlogAndVST>>=
-rld <- rlog(dds)
-vsd <- varianceStabilizingTransformation(dds)
+rld <- rlog(dds, blind=FALSE)
+vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
+vsd.fast <- vst(dds, blind=FALSE)
 head(assay(rld), 3)
 @
 
@@ -1320,7 +1443,7 @@ Plotting the dispersion estimates is a useful diagnostic. The dispersion
 plot in Figure \ref{figure/dispFit-1} is typical, with the final estimates shrunk
 from the gene-wise estimates towards the fitted estimates. Some gene-wise
 estimates are flagged as outliers and not shrunk towards the fitted value,
-(this outlier detection is described in the man page for \Rfunction{estimateDispersionsMAP}).
+(this outlier detection is described in the manual page for \Rfunction{estimateDispersionsMAP}).
 The amount of shrinkage can be more or less than seen here, depending 
 on the sample size, the number of coefficients, the row mean
 and the variability of the gene-wise estimates.
@@ -1554,6 +1677,14 @@ dispersionFunction(dds)
 attr(dispersionFunction(dds), "dispPriorVar")
 @ 
 
+The version of \deseqtwo{} which was used to construct the
+\Rclass{DESeqDataSet} object, or the version used when
+\Rfunction{DESeq} was run, is stored here:
+
+<<versionNum>>=
+metadata(dds)[["version"]]
+@ 
+
 \subsection{Sample-/gene-dependent normalization factors} \label{sec:normfactors}
 
 In some experiments, there might be gene-dependent dependencies
@@ -1770,7 +1901,16 @@ proportional to the expected true concentration of fragments for sample $j$.
 The coefficients $\beta_i$ give the log2 fold changes for gene $i$ for each 
 column of the model matrix $X$. 
 
-By default these log2 fold changes are the maximum \emph{a posteriori} estimates after incorporating a
+The dispersion parameter $\alpha_i$ defines the relationship between
+the variance of the observed count and its mean value. In other
+words, it determines how far we expect the observed count to be from the
+mean value, which depends both on the size factor $s_j$ and the
+covariate-dependent part $q_{ij}$ as defined above.
+
+$$ \textrm{Var}(K_{ij}) = E[ (K_{ij} - \mu_{ij})^2 ] = \mu_{ij} + \alpha_i \mu_{ij}^2 $$
+
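+A quick numerical check of this mean-variance relationship, simulating
+negative binomial counts at an arbitrary mean and dispersion:
+
+<<nbVarianceSketch, eval=FALSE>>=
+mu <- 100; alpha <- 0.05
+k <- rnbinom(1e5, mu=mu, size=1/alpha)
+c(empirical=var(k), theoretical=mu + alpha * mu^2)
+@
+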
+The log2 fold changes in $\beta_i$ are the maximum \emph{a posteriori}
+estimates after incorporating a 
 zero-centered Normal prior -- in the software referred to as a $\beta$-prior -- hence \deseqtwo{}
 provides ``moderated'' log2 fold change estimates.  Dispersions are estimated using expected mean
 values from the maximum likelihood estimate of log2 fold changes, and optimizing the Cox-Reid
@@ -1823,7 +1963,7 @@ version \Biocpkg{DESeq}, are as follows:
     matching the empirical quantile to the quantile of a Normal
     distribution, \deseqtwo() now uses the weighted quantile function
     of the \CRANpkg{Hmisc} package. The weighting is described in the
-    man page for \Rfunction{nbinomWaldTest}.  The weights are the
+    manual page for \Rfunction{nbinomWaldTest}.  The weights are the
     inverse of the expected variance of log counts (as used in the
     diagonals of the matrix $W$ in the GLM). The effect of the change
     is that the estimated prior variance is robust against noisy
@@ -2065,6 +2205,7 @@ genes with all counts equal to zero.
   testing we recommend the \Rfunction{DESeq} function applied to raw
   counts as outlined in Section~\ref{sec:de}.
       
+  
 \subsection{Can I use DESeq2 to analyze paired samples?}
 
 Yes, you should use a multi-factor design which includes the sample
@@ -2073,6 +2214,61 @@ differences between the samples while estimating the effect due to
 the condition. The condition of interest should go at the end of the 
 design formula. See Section~\ref{sec:multifactor}.
 
+\subsection{If I have multiple groups, should I run all together or split into pairs of groups?}
+
+Typically, we recommend running samples from all groups together, and then
+use the \Rcode{contrast} argument of the \Rfunction{results} function
+to extract comparisons of interest after fitting the model using \Rfunction{DESeq}.
+
+The model fit by \Rfunction{DESeq} estimates a single dispersion
+parameter for each gene, which defines how far we expect the observed
+count for a sample to be from the mean value fitted by the model,
+given its size factor and its condition group. See Section~\ref{sec:glm} 
+and the \deseqtwo{} paper for full details.
+Having a single dispersion parameter for each gene is usually
+sufficient for analyzing multi-group data, as the final dispersion value will
+incorporate the within-group variability across all groups. 
+
+However, for some datasets, exploratory data analysis (EDA) plots as outlined
+in Section~\ref{sec:pca} could reveal that one or more groups has much
+higher within-group variability than the others. A simulated example
+of such a set of samples is shown in Figure~\ref{figure/varGroup-1}.
+This is a case where comparing groups A and B separately --
+subsetting a \Rclass{DESeqDataSet} to only samples from those two
+groups and then running \Rfunction{DESeq} on this subset -- will be
+more sensitive than a model including all samples together.
+It should be noted that such an extreme range of within-group
+variability is not common, although it could arise if certain
+treatments produce an extreme reaction (e.g. cell death).
+Again, this can be easily detected from the EDA plots such as PCA
+described in this vignette.
+
+<<varGroup, echo=FALSE, fig.width=5, fig.height=5>>=
+set.seed(3)
+dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01)
+dds2 <- makeExampleDESeqDataSet(n=1000,m=12,
+                                betaSD=.3,
+                                interceptMean=mcols(dds1)$trueIntercept,
+                                interceptSD=0,
+                                dispMeanRel=function(x) 0.2)
+dds2 <- dds2[,7:12]
+dds2$condition <- rep("C",6)
+mcols(dds2) <- NULL
+dds <- cbind(dds1, dds2)
+rld <- rlog(dds, blind=FALSE, fitType="mean")
+plotPCA(rld)
+@ 
+
+\incfig{figure/varGroup-1}{.5\textwidth}{Extreme range of within-group variability.}{
+  Typically, it is recommended to run \Rfunction{DESeq} across samples
+  from all groups, for datasets with multiple groups. 
+  However, this simulated dataset shows a case where
+  it would be preferable to compare groups A and B by creating a
+  smaller dataset without the C samples. Group C has much higher
+  within-group variability, which would inflate the per-gene dispersion
+  estimate for groups A and B as well. 
+}
+
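+A minimal sketch of such a subset analysis, assuming a \Robject{dds}
+whose \Robject{condition} factor has levels A, B and C:
+
+<<subsetGroupsSketch, eval=FALSE>>=
+dds.sub <- dds[, dds$condition %in% c("A","B")]
+dds.sub$condition <- droplevels(dds.sub$condition)
+dds.sub <- DESeq(dds.sub)
+res.sub <- results(dds.sub)
+@
+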
 \subsection{Can I run DESeq2 to contrast the levels of 100 groups?}
 
 \deseqtwo{} will work with any kind of design specified using the R
@@ -2091,9 +2287,13 @@ fitting procedure.
 \subsection{Can I use DESeq2 to analyze a dataset without replicates?}
 
 If a \Rclass{DESeqDataSet} is provided with an experimental design without replicates,
-a message is printed, that the samples are treated as replicates
-for estimation of dispersion. More details can be found in the 
-manual page for \Rfunction{?DESeq}.
+a warning is printed that the samples will be treated as replicates
+for estimation of dispersion. This kind of analysis is
+only useful for exploring the data, but cannot provide
+proper statistical inference on differences between groups.
+Without biological replicates, it is not possible to estimate the biological
+variability of each gene. 
+More details can be found in the manual page for \Rfunction{?DESeq}.
 
 \subsection{How can I include a continuous covariate in the design formula?}
 
@@ -2108,6 +2308,98 @@ the trend over the continuous covariates.  In R, \Rclass{numeric}
 vectors can be converted into \Rclass{factors} using the function
 \Rfunction{cut}.
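+
+For example, a hypothetical continuous covariate \Robject{age} could be
+discretized into three bins and placed in the design:
+
+<<cutCovariateSketch, eval=FALSE>>=
+dds$ageBin <- cut(dds$age, breaks=3)
+design(dds) <- ~ ageBin + condition
+@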
 
+\subsection{Will the log fold change shrinkage ``overshrink'' large differences?}
+
+For most datasets, the application of a prior to the log fold changes
+is a good choice, providing log fold change estimates that are
+more stable across the entire range of mean counts than the maximum
+likelihood estimates (see Figure~\ref{fig:MA} and the \deseqtwo{} paper).
+One situation in which the prior on log fold changes might
+``overshrink'' the estimates is 
+if nearly all genes show no difference across condition, a very
+small set of genes have extremely large differences, and no genes in between.
+A simulated example of such a dataset is shown in Figure~\ref{figure/overShrink-1}.
+This is not likely to be the case for most experiments, where typically
+there is a range of differences by size: some genes with medium-to-large
+differences across treatment, and some with small differences.
+
+<<overShrink, echo=FALSE, fig.width=5, fig.height=5>>=
+plot(c(10^rnorm(1000, 3, 2),300,2000,5000), 
+     c(rnorm(1000, 0, .15), -5.5, -8.5, 7.5),
+     ylim=c(-10,10), log="x", cex=.4,
+     xlab="mean of normalized counts", 
+     ylab="log2 fold change")
+abline(h=0, col=rgb(1,0,0,.7))
+@ 
+
+\incfig{figure/overShrink-1}{.5\textwidth}{Example of a dataset
+  where the log fold change prior should be turned off.}{
+  Here we show a simulated MA-plot, where nearly all of the log fold changes
+  are falling near the x-axis, with three genes that have very large log
+  fold changes (note the y-axis is from -10 to 10 on the log2 scale).
+  This would indicate a dataset where the log fold change prior would
+  ``overshrink'' the large fold changes, and so \Robject{betaPrior=FALSE}
+  should be used.
+}
+
+There could be experiments in which only a few genes have
+very large log fold changes, and the rest of the genes are
+nearly constant across treatment.
+Or, there could be artificially constructed libraries fitting this description,
+e.g. technical replicates where the only difference across libraries 
+is the concentration of a few spiked-in genes.
+``Overshrinking'' of a few large log fold changes
+can be assessed by running \Rfunction{results} with \Rcode{addMLE=TRUE},
+which will print a results table with columns for the shrunken and
+unshrunken (MLE) log fold changes.
+The two estimates can be visually compared by running \Rfunction{plotMA} with
+\Rcode{MLE=TRUE} and \Rcode{MLE=FALSE}. 
+If ``overshrinking'' very large log fold changes is a concern,
+it is better to turn off the log fold change prior by
+running \Rfunction{DESeq} with \Rcode{betaPrior=FALSE}.
+
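+A sketch of this assessment, using the functions demonstrated earlier
+in this vignette:
+
+<<assessShrinkSketch, eval=FALSE>>=
+resMLE <- results(dds, addMLE=TRUE)
+plotMA(resMLE, MLE=TRUE)           # unshrunken (MLE) estimates
+plotMA(resMLE, MLE=FALSE)          # shrunken estimates
+ddsNoPrior <- DESeq(dds, betaPrior=FALSE)  # if overshrinkage is a concern
+@
+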
+Even more detail: how do we avoid overshrinking on typical datasets?
+The answer is that we estimate the width of the log fold change prior in a
+robust way to accommodate the very largest log fold changes, and thereby
+avoid overshrinking. 
+The details of the prior estimation are described in the manual page for
+\Rfunction{nbinomWaldTest}. Briefly, a weighted upper quantile
+is used to match the width of the log fold change prior to the upper
+5\% of the MLE log fold changes, weighting by the expected sampling
+variability of the estimated log fold changes given the mean count for
+each gene. Note that this does not equal an assumption that 5\% of genes are
+differentially expressed, but that a reasonable width of a log fold
+change distribution can be obtained from the upper 5\% of MLE log fold
+changes. 
+
+\subsection{I ran a likelihood ratio test, but \texttt{results()} only gives me one comparison.}
+
+``\dots How do I get the $p$ values for all of the variables/levels 
+that were removed in the reduced design?''
+
+This is explained in the help page for \texttt{?results} in the
+section about likelihood ratio test p-values, but we will restate the
+answer here. When one performs a likelihood ratio test, the $p$ values and
+the test statistic (the \Robject{stat} column) are values for the test
+that removes all of the variables which are present in the full
+design and not in the reduced design. This tests the null hypothesis
+that all the coefficients from these variables and levels of these factors
+are equal to zero.
+
+The likelihood ratio test $p$ values therefore
+represent a test of \textit{all the variables and all the levels of factors}
+which are among these variables. However, the results table only has space for
+one column of log fold change, so a single variable and a single
+comparison is shown (among the potentially multiple log fold changes
+which were tested in the likelihood ratio test). 
+This is indicated at the top of the results table
+with the text, e.g.: ``log2 fold change (MLE): condition C vs A'' followed
+by ``LRT p-value: '\lowtilde{} batch + condition' vs '\lowtilde{} batch' ''.
+This indicates that the $p$ value is for the likelihood ratio test of
+\textit{all the variables and all the levels}, while the log fold change is a single
+comparison from among those variables and levels.
+See the help page for \Rfunction{results} for more details.
+
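+For example, a likelihood ratio test removing \Robject{condition} from a
+design containing \Robject{batch} and \Robject{condition} (variable names
+assumed for illustration):
+
+<<lrtSketch, eval=FALSE>>=
+dds <- DESeq(dds, test="LRT", full=~ batch + condition, reduced=~ batch)
+res <- results(dds)  # p-values test all levels of 'condition' at once
+@
+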
 \subsection{What are the exact steps performed by \Rfunction{DESeq()}?}
 
 See the manual page for \Rfunction{DESeq}, which links to the 
@@ -2120,13 +2412,34 @@ Yes. The repository for the \deseqtwo{} tool is
 and a link to its location in the Tool Shed is 
 \url{https://toolshed.g2.bx.psu.edu/view/iuc/deseq2/d983d19fbbab}.
 
+\subsection{I want to benchmark DESeq2 comparing to other DE tools.}
+
+One aspect which can cause problems for comparison is that, by default,
+\deseqtwo{} outputs \Rcode{NA} values for adjusted $p$ values based on 
+independent filtering of genes which have low counts.
+This is a way for \deseqtwo{} to give extra
+information on why the adjusted $p$ value for a gene is not small.
+Additionally, $p$ values can be set to \Rcode{NA} based on extreme 
+count outlier detection (see Section~\ref{sec:moreInfo} for full details). 
+These \Rcode{NA} values should be considered
+negatives for purposes of estimating sensitivity and specificity. The
+easiest way to work with the adjusted $p$ values in a benchmarking
+context is probably to convert these \Rcode{NA} values to 1:
+
+<<convertNA, eval=FALSE>>=
+res$padj <- ifelse(is.na(res$padj), 1, res$padj)
+@ 
+
 \section{Acknowledgments}
 
 We have benefited in the development of \deseqtwo{} from the help and
 feedback of many individuals, including but not limited to: 
 The Bioconductor Core Team,
 Alejandro Reyes, Andrzej Ole\'s, Aleksandra Pekowska, Felix Klein,
+Nikolaos Ignatiadis,
 Vince Carey,
+Owen Solberg,
+Ruping Sun,
 Devon Ryan, 
 Steve Lianoglou, Jessica Larson, Christina Chaivorapol, Pan Du, Richard Bourgon,
 Willem Talloen, 
@@ -2145,7 +2458,10 @@ Sonali Arora,
 Jordan Ramilowski,
 Ian Dworkin,
 Bj\"orn Gr\"uning,
-Ryan McMinds.
+Ryan McMinds,
+Paul Gordon,
+Leonardo Collado Torres,
+Enrico Ferrero.
 \section{Session Info}
 
 <<sessInfo, results="asis", echo=FALSE>>=
diff --git a/inst/doc/DESeq2.pdf b/inst/doc/DESeq2.pdf
index f33a0aa..3f0bb3c 100644
Binary files a/inst/doc/DESeq2.pdf and b/inst/doc/DESeq2.pdf differ
diff --git a/inst/script/deseq2.R b/inst/script/deseq2.R
deleted file mode 100644
index d92ead4..0000000
--- a/inst/script/deseq2.R
+++ /dev/null
@@ -1,322 +0,0 @@
-# A command-line interface to DESeq2 for use with Galaxy
-# written by Bjoern Gruening and modified by Michael Love 2015.01.11
-#
-# one of these arguments is required:
-#
-#   'factors' a JSON list object from Galaxy
-#
-#   'sample_table' is a sample table as described in ?DESeqDataSetFromHTSeq
-#   with columns: sample name, filename, then factors (variables)
-#
-# the output file has columns:
-# 
-#   baseMean (mean normalized count)
-#   log2FoldChange (by default a moderated LFC estimate)
-#   lfcSE (the standard error)
-#   stat (the Wald statistic)
-#   pvalue (p-value from comparison of Wald statistic to a standard Normal)
-#   padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter)
-# 
-# the first variable in 'factors' and first column in 'sample_table' will be the primary factor.
-# the levels of the primary factor are used in the order of appearance in factors or in sample_table.
-#
-# by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile'
-#
-# for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B,
-# to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc.
-# all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages.
-#
-# fit_type is an integer-valued argument, with the options from ?estimateDispersions
-#   1 "parametric"
-#   2 "local"
-#   3 "mean"
-
-# setup R error handling to go to stderr
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
-
-# we need that to not crash Galaxy with a UTF-8 error on German LC settings.
-loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-library("getopt")
-options(stringsAsFactors = FALSE, useFancyQuotes = FALSE)
-args <- commandArgs(trailingOnly = TRUE)
-
-# get options, using the spec as defined by the enclosed list.
-# we read the options from the default: commandArgs(TRUE).
-spec <- matrix(c(
-  "quiet", "q", 0, "logical",
-  "help", "h", 0, "logical",
-  "outfile", "o", 1, "character",
-  "factors", "f", 1, "character",
-  "plots" , "p", 1, "character",
-  "sample_table", "s", 1, "character",
-  "fit_type", "t", 1, "integer",
-  "many_contrasts", "m", 0, "logical",
-  "outlier_replace_off" , "a", 0, "logical",
-  "outlier_filter_off" , "b", 0, "logical",
-  "auto_mean_filter_off", "c", 0, "logical",
-  "beta_prior_off", "d", 0, "logical"),
-  byrow=TRUE, ncol=4)
-opt <- getopt(spec)
-
-# if help was asked for print a friendly message
-# and exit with a non-zero error code
-if (!is.null(opt$help)) {
-  cat(getopt(spec, usage=TRUE))
-  q(status=1)
-}
-
-# enforce the following required arguments
-if (is.null(opt$outfile)) {
-  cat("'outfile' is required\n")
-  q(status=1)
-}
-if (is.null(opt$sample_table) & is.null(opt$factors)) {
-  cat("'factors' or 'sample_table' is required\n")
-  q(status=1)
-}
-
-verbose <- if (is.null(opt$quiet)) {
-  TRUE
-} else {
-  FALSE
-}
-
-suppressPackageStartupMessages({
-  library("DESeq2")
-  library("RColorBrewer")
-  library("gplots")
-})
-
-# build or read sample table
-
-trim <- function (x) gsub("^\\s+|\\s+$", "", x)
-
-# switch on if 'factors' was provided:
-if (!is.null(opt$factors)) {
-  library("rjson")
-  parser <- newJSONParser()
-  parser$addData(opt$factors)
-  factorList <- parser$getObject()
-  factors <- sapply(factorList, function(x) x[[1]])
-  primaryFactor <- factors[1]
-  filenamesIn <- unname(unlist(factorList[[1]][[2]]))
-  sampleTable <- data.frame(sample=basename(filenamesIn), filename=filenamesIn, row.names=filenamesIn)
-  for (factor in factorList) {
-    factorName <- trim(factor[[1]])
-    sampleTable[[factorName]] <- character(nrow(sampleTable))
-    lvls <- sapply(factor[[2]], function(x) names(x))
-    for (i in seq_along(factor[[2]])) {
-      files <- factor[[2]][[i]][[1]]
-      sampleTable[files,factorName] <- trim(lvls[i])
-    }
-    sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
-  }
-  rownames(sampleTable) <- sampleTable$sample
-} else {
-  # read the sample_table argument
-  # this table is described in ?DESeqDataSet
-  # one column for the sample name, one for the filename, and
-  # the remaining columns for factors in the analysis
-  sampleTable <- read.delim(opt$sample_table)
-  factors <- colnames(sampleTable)[-c(1:2)]
-  for (factor in factors) {
-    lvls <- unique(as.character(sampleTable[[factor]]))
-    sampleTable[[factor]] <- factor(sampleTable[[factor]], levels=lvls)
-  }
-}
-
-primaryFactor <- factors[1]
-designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + ")))
-
-if (verbose) {
-  cat("DESeq2 run information\n\n")
-  cat("sample table:\n")
-  print(sampleTable[,-c(1:2),drop=FALSE])
-  cat("\ndesign formula:\n")
-  print(designFormula)
-  cat("\n\n")
-}
-
-# these are plots which are made once for each analysis
-generateGenericPlots <- function(dds, factors) {
-  rld <- rlog(dds)
-  print(plotPCA(rld, intgroup=rev(factors)))
-  # need meaningful labels, because from Galaxy, sample names are random
-  labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors])))
-  dat <- assay(rld)
-  colnames(dat) <- labs
-  distsRL <- dist(t(dat))
-  mat <- as.matrix(distsRL)
-  hc <- hclust(distsRL)
-  hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-  heatmap.2(mat, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col = rev(hmcol),
-            main="Sample-to-sample distances", margin=c(13,13))
-  plotDispEsts(dds, main="Dispersion estimates")
-}
-
-# these are plots which can be made for each comparison, e.g.
-# once for C vs A and once for B vs A
-generateSpecificPlots <- function(res, threshold, title_suffix) {
-  use <- res$baseMean > threshold
-  if (sum(!use) == 0) {
-    h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE)
-    barplot(height = h$counts,
-            col = "powderblue", space = 0, xlab="p-values", ylab="frequency",
-            main=paste("Histogram of p-values for",title_suffix))
-    text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
-  } else {
-    h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
-    h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
-    colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue")
-    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
-            col = colori, space = 0, xlab="p-values", ylab="frequency",
-            main=paste("Histogram of p-values for",title_suffix))
-    text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
-    legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white")
-  }
-    plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE))
-}
-
-if (verbose) {
-  cat(paste("primary factor:",primaryFactor,"\n"))
-  if (length(factors) > 1) {
-    cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n"))
-  }
-  cat("\n---------------------\n")
-}
-
-# if JSON input from Galaxy, path is absolute
-# otherwise, from sample_table, assume it is relative
-dir <- if (is.null(opt$factors)) {
-  "."
-} else {
-  ""
-}
-
-# construct the object
-dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
-                                  directory = dir,
-                                  design =  designFormula)
-
-if (verbose) cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
-
-# optional outlier behavior
-if (is.null(opt$outlier_replace_off)) {
-  minRep <- 7
-} else {
-  minRep <- Inf
-  if (verbose) cat("outlier replacement off\n")
-}
-if (is.null(opt$outlier_filter_off)) {
-  cooksCutoff <- TRUE
-} else {  
-  cooksCutoff <- FALSE
-  if (verbose) cat("outlier filtering off\n")
-}
-
-# optional automatic mean filtering
-if (is.null(opt$auto_mean_filter_off)) {
-  independentFiltering <- TRUE
-} else {
-  independentFiltering <- FALSE
-  if (verbose) cat("automatic filtering on the mean off\n")
-}
-
-# shrinkage of LFCs
-if (is.null(opt$beta_prior_off)) {
-  betaPrior <- TRUE
-} else {
-  betaPrior <- FALSE
-  if (verbose) cat("beta prior off\n")
-}
-
-# dispersion fit type
-if (is.null(opt$fit_type)) {
-  fitType <- "parametric"
-} else {
-  fitType <- c("parametric","local","mean")[opt$fit_type]
-}
-
-if (verbose) cat(paste("using dispersion fit type:",fitType,"\n"))
-
-# run the analysis
-dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep)
-
-# create the generic plots and leave the device open
-if (!is.null(opt$plots)) {
-  if (verbose) cat("creating plots\n")
-  pdf(opt$plots)
-  generateGenericPlots(dds, factors)
-}
-
-n <- nlevels(colData(dds)[[primaryFactor]])
-allLevels <- levels(colData(dds)[[primaryFactor]])
-
-if (is.null(opt$many_contrasts)) {
-  # only contrast the first and second level of the primary factor
-  ref <- allLevels[1]
-  lvl <- allLevels[2]
-  res <- results(dds, contrast=c(primaryFactor, lvl, ref),
-                 cooksCutoff=cooksCutoff,
-                 independentFiltering=independentFiltering)
-  if (verbose) {
-    cat("summary of results\n")
-    cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n"))
-    print(summary(res))
-  }
-  resSorted <- res[order(res$padj),]
-  outDF <- as.data.frame(resSorted)
-  outDF$geneID <- rownames(outDF)
-  outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
-  filename <- opt$outfile
-  write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
-  if (independentFiltering) {
-    threshold <- unname(attr(res, "filterThreshold"))
-  } else {
-    threshold <- 0
-  }
-  title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
-  if (!is.null(opt$plots)) {
-    generateSpecificPlots(res, threshold, title_suffix)
-  }
-} else {
-  # rotate through the possible contrasts of the primary factor
-  # write out a sorted table of results with the contrast as a suffix
-  # add contrast specific plots to the device
-  for (i in seq_len(n-1)) {
-    ref <- allLevels[i]
-    contrastLevels <- allLevels[(i+1):n]
-    for (lvl in contrastLevels) {
-      res <- results(dds, contrast=c(primaryFactor, lvl, ref),
-                     cooksCutoff=cooksCutoff,
-                     independentFiltering=independentFiltering)
-      resSorted <- res[order(res$padj),]
-      outDF <- as.data.frame(resSorted)
-      outDF$geneID <- rownames(outDF)
-      outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
-      filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref)
-      write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
-      if (independentFiltering) {
-        threshold <- unname(attr(res, "filterThreshold"))
-      } else {
-        threshold <- 0
-      }
-      title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
-      if (!is.null(opt$plots)) {
-        generateSpecificPlots(res, threshold, title_suffix)
-      }
-    }
-  }
-}
-
-# close the plot device
-if (!is.null(opt$plots)) {
-  cat("closing plot device\n")
-  dev.off()
-}
-
-cat("Session information:\n\n")
-
-sessionInfo()
-
diff --git a/inst/script/runScripts.R b/inst/script/runScripts.R
index 643157a..9a1f1c1 100644
--- a/inst/script/runScripts.R
+++ b/inst/script/runScripts.R
@@ -13,6 +13,36 @@ runDESeq2 <- function(e, retDDS=FALSE) {
   return(list(pvals=pvals, padj=padj, beta=beta))
 }
 
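+# like runDESeq2 above, but using the likelihood ratio test (LRT) with an intercept-only reduced model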
+runDESeq2LRT <- function(e, retDDS=FALSE) {
+  counts <- exprs(e)
+  mode(counts) <- "integer"
+  dds <- DESeqDataSetFromMatrix(counts, DataFrame(pData(e)), ~ condition)
+  dds <- DESeq(dds,test="LRT",reduced=~1,quiet=TRUE)
+  res <- results(dds)
+  beta <- res$log2FoldChange
+  pvals <- res$pvalue
+  padj <- res$padj
+  pvals[is.na(pvals)] <- 1
+  pvals[rowSums(exprs(e)) == 0] <- NA
+  padj[is.na(padj)] <- 1
+  return(list(pvals=pvals, padj=padj, beta=beta))
+}
+
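+# like runDESeq2 above, but with independent filtering disabled in results()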
+runDESeq2NoIF <- function(e, retDDS=FALSE) {
+  counts <- exprs(e)
+  mode(counts) <- "integer"
+  dds <- DESeqDataSetFromMatrix(counts, DataFrame(pData(e)), ~ condition)
+  dds <- DESeq(dds,quiet=TRUE)
+  res <- results(dds, independentFiltering=FALSE)
+  beta <- res$log2FoldChange
+  pvals <- res$pvalue
+  padj <- res$padj
+  pvals[is.na(pvals)] <- 1
+  pvals[rowSums(exprs(e)) == 0] <- NA
+  padj[is.na(padj)] <- 1
+  return(list(pvals=pvals, padj=padj, beta=beta))
+}
+
 runDESeq2Outliers <- function(e, retDDS=FALSE) {
   counts <- exprs(e)
   mode(counts) <- "integer"
@@ -44,9 +74,10 @@ runEdgeR <- function(e) {
   design <- model.matrix(~ pData(e)$condition)
   dgel <- DGEList(exprs(e))
   dgel <- calcNormFactors(dgel)
-  dgel <- estimateGLMCommonDisp(dgel, design)
-  dgel <- estimateGLMTrendedDisp(dgel, design)
-  dgel <- estimateGLMTagwiseDisp(dgel, design)
+  ## dgel <- estimateGLMCommonDisp(dgel, design)
+  ## dgel <- estimateGLMTrendedDisp(dgel, design)
+  ## dgel <- estimateGLMTagwiseDisp(dgel, design)
+  dgel <- estimateDisp(dgel, design)
   edger.fit <- glmFit(dgel, design)
   edger.lrt <- glmLRT(edger.fit)
   predbeta <- predFC(exprs(e), design, offset=getOffset(dgel), dispersion=dgel$tagwise.dispersion)
diff --git a/inst/script/simulateCluster.R b/inst/script/simulateCluster.R
index 281ff21..482bbac 100644
--- a/inst/script/simulateCluster.R
+++ b/inst/script/simulateCluster.R
@@ -4,6 +4,9 @@ library("DESeq2")
 library("PoiClaClu")
 library("mclust")
 
+library("parallel")
+options(mc.cores=20)
+
 set.seed(1)
 n <- 2000
 # create 20 samples, then remove first group, leaving 16
@@ -23,7 +26,7 @@ nreps <- 20
 res <- do.call(rbind, lapply(seq_along(dispScales), function(idx) {
   dispScale <- dispScales[idx]
   do.call(rbind, lapply(rnormsds[[idx]], function(rnormsd) {
-    do.call(rbind, lapply(seq_along(sfs), function(sf.idx) {
+    do.call(rbind, mclapply(seq_along(sfs), function(sf.idx) {
       sf <- sfs[[sf.idx]]
       do.call(rbind, lapply(seq_len(nreps), function(i) {
         beta <- replicate(k, c(rep(0,8/10 * n), rnorm(2/10 * n, 0, rnormsd)))
diff --git a/inst/script/simulateDE.R b/inst/script/simulateDE.R
index bc1975a..7d26b54 100644
--- a/inst/script/simulateDE.R
+++ b/inst/script/simulateDE.R
@@ -8,7 +8,7 @@ library("samr")
 library("DSS")
 library("EBSeq")
 source("runScripts.R")
-algos <- list("DESeq2"=runDESeq2,
+algos <- list("DESeq2"=runDESeq2,"DESeq2-LRT"=runDESeq2LRT,"DESeq2-NoIF"=runDESeq2NoIF,
               "edgeR"=runEdgeR,"edgeR-robust"=runEdgeRRobust,
               "DSS"=runDSS,"DSS-FDR"=runDSSFDR,
               "voom"=runVoom,
@@ -24,11 +24,10 @@ nreps <- 6
 effSizes <- rep(rep(effSizeLevels, each=nreps), times=length(mLevels))
 ms <- rep(mLevels, each=nreps * length(effSizeLevels))
 
-library("BiocParallel")
-register(SerialParam())
-#register(MulticoreParam(workers=8,verbose=TRUE))
+library("parallel")
+options(mc.cores=20)
 
-resList <- bplapply(seq_along(ms), function(i) {
+resList <- mclapply(seq_along(ms), function(i) {
   set.seed(i)
   m <- ms[i]
   es <- effSizes[i]
@@ -43,9 +42,12 @@ resList <- bplapply(seq_along(ms), function(i) {
   sens <- sapply(resTest, function(z) mean((z$padj < .1)[sensidx]))
   rmf <- cut(rowMeans(mat), c(0, 20, 100, 300, Inf), include.lowest=TRUE)
   levels(rmf) <- paste0("sens",c("0to20","20to100","100to300","more300"))
-  sensStratified <- t(sapply(resTest, function(z) tapply((z$padj < .1)[sensidx], rmf[sensidx], mean)))
-  oneminusspecpvals <- sapply(resTest, function(z) mean((z$pvals < .01)[beta == 0 & nonzero], na.rm=TRUE))
-  oneminusspecpadj <- sapply(resTest, function(z) mean((z$padj < .1)[beta == 0 & nonzero], na.rm=TRUE))
+  sensStratified <- t(sapply(resTest, function(z)
+                    tapply((z$padj < .1)[sensidx], rmf[sensidx], mean)))
+  oneminusspecpvals <- sapply(resTest,
+                      function(z) mean((z$pvals < .01)[beta == 0 & nonzero], na.rm=TRUE))
+  oneminusspecpadj <- sapply(resTest,
+                      function(z) mean((z$padj < .1)[beta == 0 & nonzero], na.rm=TRUE))
   oneminusprec <- sapply(resTest, function(z) {
       idx <- which(z$padj < .1)
       ifelse(sum(idx) == 0, 0, mean((beta == 0)[idx]))
diff --git a/inst/script/simulateLFCAccuracy.R b/inst/script/simulateLFCAccuracy.R
index d5a8b01..c46cecf 100644
--- a/inst/script/simulateLFCAccuracy.R
+++ b/inst/script/simulateLFCAccuracy.R
@@ -7,6 +7,9 @@ library("Biobase")
 algos <- list("DESeq2"=runDESeq2,"edgeR"=runEdgeR)
 namesAlgos <- names(algos)
 
+library("parallel")
+options(mc.cores=10)
+
 set.seed(1)
 nreps <- 10
 n <- 1000
@@ -15,7 +18,7 @@ types <- c("bell","slab bell","slab spike","spike spike")
 methods <- c("DESeq2","edgeR predFC","edgeR predFC10")
 res <- do.call(rbind, lapply(ms, function(m) {
   do.call(rbind, lapply(types, function(type) {
-    do.call(rbind, lapply(seq_len(nreps), function(i) {
+    do.call(rbind, mclapply(seq_len(nreps), function(i) {
       beta <- if (type == "bell") {
         rnorm(n)
       } else if (type == "slab bell") {
diff --git a/inst/script/simulateOutliers.R b/inst/script/simulateOutliers.R
index 63f1088..a491561 100644
--- a/inst/script/simulateOutliers.R
+++ b/inst/script/simulateOutliers.R
@@ -10,6 +10,9 @@ algos <- list("DESeq2"=runDESeq2Outliers,
 namesAlgos <- names(algos)
 methods <- c("DESeq2", "DESeq2-noFilt", "DESeq2-noRepl", "edgeR", "edgeR-robust")
 
+library("parallel")
+options(mc.cores=10)
+
 set.seed(1)
 padjVector <- seq(from=0, to=1, length=201)
 pvalsVector <- seq(from=0, to=.4, length=201)
@@ -20,7 +23,7 @@ nreps <- 10
 
 res <- do.call(rbind, lapply(ms, function(m) {
   do.call(rbind, lapply(percentOutliers, function(pOut) {
-    resList <- lapply(seq_len(nreps), function(i) {
+    resList <- mclapply(seq_len(nreps), function(i) {
       condition <- factor(rep(c("A","B"), each = m/2))
       x <- model.matrix(~ condition)
       beta <- c(rep(0, n * 8/10), sample(c(-1,1), n * 2/10, TRUE))
diff --git a/inst/script/simulation.Rmd b/inst/script/simulation.Rmd
index 1bb7e1d..afae4db 100644
--- a/inst/script/simulation.Rmd
+++ b/inst/script/simulation.Rmd
@@ -83,7 +83,7 @@ library("ggplot2")
 p <- ggplot(res, aes(y=sensitivity, x=oneminusprec,
                      color=algorithm, shape=algorithm))
 p + geom_point() + theme_bw() + facet_grid(effSize ~ m) +
-  scale_shape_manual(values=1:9) +
+  scale_shape_manual(values=1:11) +
   xlab("1 - precision (false discovery rate)") + 
   coord_cartesian(xlim=c(-.03, .3)) + 
   geom_vline(xintercept=.1)
@@ -109,7 +109,7 @@ p <- ggplot(melted, aes(y=sensitivity, x=aveexp, group=algorithm,
 p + stat_summary(fun.y="mean", geom="line") +
   stat_summary(fun.y="mean", geom="point") +
   theme_bw() + facet_grid(effSize ~ m) +
-  scale_shape_manual(values=1:9) +
+  scale_shape_manual(values=1:11) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   xlab("mean counts")
 ```
@@ -191,7 +191,7 @@ p + scale_x_continuous(breaks=c(0,.1,.2)) +
   geom_line() + theme_bw() + 
   facet_grid(m ~ percentOutlier) + 
   geom_abline(intercept=0,slope=1) +
-  xlab("adjusted *p*-value") + ylab("1 - precision (FDR)") + 
+  xlab("adjusted p-value") + ylab("1 - precision (FDR)") + 
   coord_cartesian(xlim=c(-.03, .25), ylim=c(-.05, .25)) + 
   geom_point(aes(x=precpadj, y=oneminusprec, shape=algorithm),
              data=res[res$precpadj == .1,])
@@ -372,8 +372,9 @@ $-\log_{10}(p)$ in a comparison of two conditions in which one group
 has all zero counts, e.g.: $\{0,0,0\}$ vs $\{10,10,10\}$.
 
 ```{r onlyonecondition, fig.width=4, fig.height=4, fig.cap="Simulated gene expressed in only one condition"}
+library("DESeq2")
 m <- 6
-disp <- 0.5
+disp <- 0.1
 ii <- seq(from=1,to=4,length=7)
 coldata <- DataFrame(x=factor(rep(1:2,each=m/2)))
 pvals <- sapply(ii, function(i) {
@@ -384,7 +385,7 @@ pvals <- sapply(ii, function(i) {
   results(nbinomWaldTest(dds))$pvalue
 })
 plot(10^ii, -log10(pvals), log="x", type="b", xaxt="n",
-     xlab="group 2 counts", ylab=expression(-log[10](p~value)))
+     xlab="group 2 counts", ylab="-log10 pvalue")
 axis(1,10^(1:4),10^(1:4))
 ```
 
diff --git a/inst/script/simulation.pdf b/inst/script/simulation.pdf
index 164a24a..b433395 100644
Binary files a/inst/script/simulation.pdf and b/inst/script/simulation.pdf differ
diff --git a/vignettes/vst.nb b/inst/script/vst.nb
similarity index 100%
rename from vignettes/vst.nb
rename to inst/script/vst.nb
diff --git a/vignettes/vst.pdf b/inst/script/vst.pdf
similarity index 100%
rename from vignettes/vst.pdf
rename to inst/script/vst.pdf
diff --git a/man/DESeq.Rd b/man/DESeq.Rd
index e5ae5ba..16674f1 100644
--- a/man/DESeq.Rd
+++ b/man/DESeq.Rd
@@ -139,7 +139,7 @@ Cook's distance, and the 'Dealing with outliers' section of the vignette.
 Unlike the behavior of \code{\link{replaceOutliers}}, here original counts are
 kept in the matrix returned by \code{\link{counts}}, original Cook's
 distances are kept in \code{assays(dds)[["cooks"]]}, and the replacement
-counts used for fitting are kept in \code{assays(object)[["replaceCounts"]]}.
+counts used for fitting are kept in \code{assays(dds)[["replaceCounts"]]}.
 
 Note that if a log2 fold change prior is used (betaPrior=TRUE)
 then expanded model matrices will be used in fitting. These are
diff --git a/man/DESeq2-package.Rd b/man/DESeq2-package.Rd
index e97c67f..15be6f3 100644
--- a/man/DESeq2-package.Rd
+++ b/man/DESeq2-package.Rd
@@ -13,7 +13,7 @@ and \code{\link{varianceStabilizingTransformation}}.
 For more detailed information on usage, see the package vignette, by typing
 \code{vignette("DESeq2")}, or the workflow linked to on the first page
 of the vignette. All support questions should be posted to the Bioconductor
-support site: \url{support.bioconductor.org}.
+support site: \url{http://support.bioconductor.org}.
 }
 \author{
 Michael Love, Wolfgang Huber, Simon Anders
diff --git a/man/DESeqDataSet.Rd b/man/DESeqDataSet.Rd
index a3d92cd..ce02af8 100644
--- a/man/DESeqDataSet.Rd
+++ b/man/DESeqDataSet.Rd
@@ -6,6 +6,7 @@
 \alias{DESeqDataSet-class}
 \alias{DESeqDataSetFromHTSeqCount}
 \alias{DESeqDataSetFromMatrix}
+\alias{DESeqDataSetFromTximport}
 \title{DESeqDataSet object and constructors}
 \usage{
 DESeqDataSet(se, design, ignoreRank = FALSE)
@@ -15,6 +16,8 @@ DESeqDataSetFromMatrix(countData, colData, design, tidy = FALSE,
 
 DESeqDataSetFromHTSeqCount(sampleTable, directory = ".", design,
   ignoreRank = FALSE, ...)
+
+DESeqDataSetFromTximport(txi, colData, design, ...)
 }
 \arguments{
 \item{se}{a \code{RangedSummarizedExperiment} with columns of variables
@@ -57,6 +60,8 @@ the file name of the count file generated by htseq-count, and the remaining
 columns are sample metadata which will be stored in \code{colData}}
 
 \item{directory}{for htseq-count: the directory relative to which the filenames are specified. Defaults to the current directory}
+
+\item{txi}{for tximport: the simple list output of the \code{tximport} function}
 }
 \value{
 A DESeqDataSet object.
@@ -70,9 +75,15 @@ the first element in the assay list.
 In addition, a formula which specifies the design of the experiment must be provided.
 The constructor functions create a DESeqDataSet object
 from various types of input:
-a RangedSummarizedExperiment, a matrix, or count files generated by
-the python package HTSeq.  See the vignette for examples of construction
-from all three input types.
+a RangedSummarizedExperiment, a matrix, count files generated by
+the Python package HTSeq, or a list from the tximport function in the
+tximport package.
+See the vignette for examples of construction from different types.
+}
+\details{
+Note on the error message "assay colnames() must be NULL or equal colData rownames()":
+this means that the colnames of countData are different from the rownames of colData.
+Fix this with: \code{colnames(countData) <- NULL}
 }
 \examples{
 
diff --git a/man/estimateSizeFactors.Rd b/man/estimateSizeFactors.Rd
index 24f1074..660aa4d 100644
--- a/man/estimateSizeFactors.Rd
+++ b/man/estimateSizeFactors.Rd
@@ -4,7 +4,7 @@
 \name{estimateSizeFactors}
 \alias{estimateSizeFactors}
 \alias{estimateSizeFactors,DESeqDataSet-method}
-\title{Estimate the size factors for a DESeqDataSet}
+\title{Estimate the size factors for a \code{\link{DESeqDataSet}}}
 \usage{
 \S4method{estimateSizeFactors}{DESeqDataSet}(object, type = c("ratio",
   "iterate"), locfunc = stats::median, geoMeans, controlGenes, normMatrix)
@@ -33,11 +33,16 @@ for a "frozen" size factor calculation}
 \item{controlGenes}{optional, numeric or logical index vector specifying those genes to
 use for size factor estimation (e.g. housekeeping or spike-in genes)}
 
-\item{normMatrix}{optional, a matrix of normalization factors which do not
-control for library size (e.g. average transcript length of genes for each
-sample). Providing \code{normMatrix} will estimate size factors on the
+\item{normMatrix}{optional, a matrix of normalization factors which do not yet
+control for library size. Note that this argument should not be used (and
+will be ignored) if the \code{dds} object was created using \code{tximport}.
+In this case, the information in \code{assays(dds)[["avgTxLength"]]}
+is automatically used to create appropriate normalization factors.
+Providing \code{normMatrix} will estimate size factors on the
 count matrix divided by \code{normMatrix} and store the product of the
-size factors and \code{normMatrix} as \code{\link{normalizationFactors}}.}
+size factors and \code{normMatrix} as \code{\link{normalizationFactors}}.
+It is recommended to divide out the row-wise geometric mean of
+\code{normMatrix} so that the rows are roughly centered on 1.}
 }
 \value{
 The DESeqDataSet passed as parameters, with the size factors filled
@@ -46,9 +51,9 @@ in.
 \description{
 This function estimates the size factors using the
 "median ratio method" described by Equation 5 in Anders and Huber (2010).
-The estimated size factors can be accessed using \code{\link{sizeFactors}}.
+The estimated size factors can be accessed using the accessor function \code{\link{sizeFactors}}.
 Alternative library size estimators can also be supplied
-using \code{\link{sizeFactors}}.
+using the assignment function \code{\link{sizeFactors<-}}.
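+
+As a rough sketch (not the exact internal code, and where \code{cts}
+is a matrix of raw counts), the median ratio method computes, for each
+sample, the median across genes of the ratio of that sample's count to
+the gene's geometric mean across all samples:
+\preformatted{
+loggeomeans <- rowMeans(log(cts))
+sf <- apply(cts, 2, function(s)
+  exp(median((log(s) - loggeomeans)[is.finite(loggeomeans) & s > 0])))
+}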
 }
 \details{
 Typically, the function is called with the idiom:
diff --git a/man/fpkm.Rd b/man/fpkm.Rd
index 455b9af..b03ec96 100644
--- a/man/fpkm.Rd
+++ b/man/fpkm.Rd
@@ -31,13 +31,18 @@ as in \code{\link{estimateSizeFactors}}).
 }
 \details{
 The length of the features (e.g. genes) is calculated one of two ways:
-if there is a matrix named "avgTxLength" in \code{assays(dds)}, this will take precedence in the
-length normalization. Otherwise, feature length is calculated 
+(1) If there is a matrix named "avgTxLength" in \code{assays(dds)},
+this will take precedence in the length normalization.
+This occurs when using the tximport-DESeq2 pipeline.
+(2) Otherwise, feature length is calculated 
 from the \code{rowRanges} of the dds object,
 if a column \code{basepairs} is not present in \code{mcols(dds)}.
 The calculated length is the number of basepairs in the union of all \code{GRanges}
 assigned to a given row of \code{object}, e.g., 
 the union of all basepairs of exons of a given gene.
+Note that the second approach over-estimates the gene length
+(average transcript length, weighted by abundance, is a more appropriate
+normalization for gene counts), and so the FPKM will be an underestimate of the true value.
 
 Note that, when the read/fragment counting has inter-feature dependencies, a strict
 normalization would not incorporate the basepairs of a feature which
diff --git a/man/fpm.Rd b/man/fpm.Rd
index 00d618a..f1f73bb 100644
--- a/man/fpm.Rd
+++ b/man/fpm.Rd
@@ -13,7 +13,8 @@ fpm(object, robust = TRUE)
 \item{robust}{whether to use size factors to normalize
 rather than taking the column sums of the raw counts.
 If TRUE, the size factors and the geometric mean of
-column sums are multiplied to create a robust library size estimate.}
+column sums are multiplied to create a robust library size estimate.
+Robust normalization is not used if average transcript lengths are present.}
 }
 \value{
 a matrix which is normalized per million of mapped fragments,
diff --git a/man/nbinomWaldTest.Rd b/man/nbinomWaldTest.Rd
index bec8aaa..4829d18 100644
--- a/man/nbinomWaldTest.Rd
+++ b/man/nbinomWaldTest.Rd
@@ -84,8 +84,9 @@ the inverse of the expected variance of log counts, so the inverse of
 normalized counts and the trended dispersion fit. The weighting ensures
 that noisy estimates of log fold changes from small count genes do not
 overly influence the calculation of the prior variance.
+See \code{\link{estimateBetaPriorVar}}.
 The final prior variance for a factor level is the average of the
-estimated prior variance over all contrasts of all levels of the factor.
+estimated prior variance over all contrasts of all levels of the factor. 
 Another change since the 2014 paper: when interaction terms are present
 in the design, the prior on log fold changes is turned off
 (for more details, see the vignette section, "Methods changes since
diff --git a/man/normalizeGeneLength.Rd b/man/normalizeGeneLength.Rd
index 916bfc4..45bea6f 100644
--- a/man/normalizeGeneLength.Rd
+++ b/man/normalizeGeneLength.Rd
@@ -4,104 +4,16 @@
 \alias{normalizeGeneLength}
 \title{Normalize for gene length}
 \usage{
-normalizeGeneLength(object, files, level = c("tx", "gene"),
-  geneIdCol = "gene_id", lengthCol = "length", abundanceCol = "FPKM",
-  dropGenes = FALSE, importer, ...)
+normalizeGeneLength(...)
 }
 \arguments{
-\item{object}{the DESeqDataSet, before calling \code{DESeq}}
-
-\item{files}{a character vector specifying the filenames of output files
-containing either transcript abundance estimates with transcript length, 
-or average transcript length information per gene.}
-
-\item{level}{either "tx" or "gene"}
-
-\item{geneIdCol}{the name of the column of the files specifying the gene id. This
-should line up with the \code{rownames(object)}. The information in the files
-will be re-ordered to line up with the rownames of the object.
-See \code{dropGenes} for more details.}
-
-\item{lengthCol}{the name of the column of files specifying the length of the
-feature, either transcript for \code{level="tx"} or the gene for
-\code{level="gene"}.}
-
-\item{abundanceCol}{only needed if \code{level="tx"}, the name of the
-column specifying the abundance estimate of the transcript.}
-
-\item{dropGenes}{whether to drop genes from the object,
-as labelled by \code{rownames(object)}, which are not
-present in the \code{geneIdCol} of the files. Defaults to FALSE
-and gives an error upon finding \code{rownames} of the object
-not present in the \code{geneIdCol} of the files.
-The function will reorder the matching rows of the files to match
-the rownames of the object.}
-
-\item{importer}{a function to read the \code{files}.
-\code{fread} from the data.table package is quite fast,
-but other options include \code{read.table}, \code{read.csv}.
-One can pre-test with \code{importer(files[1])}.}
-
-\item{...}{further arguments passed to \code{importer}}
-}
-\value{
-a DESeqDataSet with \code{\link{normalizationFactors}}
-accounting for average transcript length and library size
+\item{...}{...}
 }
 \description{
 Normalize for gene length using the output of transcript abundance estimators
 }
 \details{
-This is a prototype function for importing information about changes in
-the average transcript length for each gene. The use of this function
-is only for testing purposes.
-
-The function simply imports or calculates average
-transcript length for each gene and each sample from external files,
-and provides this matrix to the \code{normMatrix} argument of
-\code{\link{estimateSizeFactors}}. 
-By average transcript length, the average refers to a weighted average with respect
-to the transcript abundances. The RSEM method includes such a column in their
-\code{gene.results} files, but an estimate of average transcript length can
-be obtained from any software which outputs a file with a row for each
-transcript, specifying: transcript length, estimate of transcript abundance,
-and the gene which the transcript belongs to.
-
-Normalization factors accounting for both average transcript length and
-library size of each sample are generated and then stored within the data object.
-The analysis can then continue with \code{\link{DESeq}};
-the stored normalization factors will be used instead of size factors in the analysis.
-
-For RSEM \code{genes.results} files,
-specify \code{level="gene"}, \code{geneIdCol="gene_id"},
-and \code{lengthCol="effective_length"}
-
-For Cufflinks \code{isoforms.fpkm_tracking} files,
-specify \code{level="tx"}, \code{geneIdCol="gene_id"},
-\code{lengthCol="length"}, and \code{abundanceCol="FPKM"}.
-
-For Sailfish output files, one can write an \code{importer}
-function which attaches a column \code{gene_id} based on Transcript ID,
-and then specify \code{level="tx"}, \code{geneIdCol="gene_id"},
-\code{lengthCol="Length"} and \code{abundanceCol="RPKM"}.
-
-Along with the normalization matrix which is stored in \code{normalizationFactors(object)},
-the resulting gene length matrix is stored in \code{assays(dds)[["avgTxLength"]]},
-and will take precedence in calls to \code{\link{fpkm}}.
-}
-\examples{
-
-n <- 10
-files <- c("sample1","sample2")
-gene_id <- rep(paste0("gene",seq_len(n)),each=3)
-set.seed(1)
-sample1 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-sample2 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-importer <- get
-dds <- makeExampleDESeqDataSet(n=n, m=2)
-dds <- normalizeGeneLength(dds, files=files, level="tx",
-  geneIdCol="gene_id", lengthCol="length", abundanceCol="FPKM",
-  dropGenes=TRUE, importer=importer)
-
+This function is deprecated; its functionality has moved to a new
+general-purpose package, tximport, which will be added to Bioconductor.
 }
 
diff --git a/man/plotMA.Rd b/man/plotMA.Rd
index 30a22cb..a642e6d 100644
--- a/man/plotMA.Rd
+++ b/man/plotMA.Rd
@@ -7,10 +7,11 @@
 \alias{plotMA,DESeqResults-method}
 \title{MA-plot from base means and log fold changes}
 \usage{
-\S4method{plotMA}{DESeqDataSet}(object, alpha = 0.1, main = "", ylim, ...)
+\S4method{plotMA}{DESeqDataSet}(object, alpha = 0.1, main = "",
+  xlab = "mean of normalized counts", ylim, MLE = FALSE, ...)
 
-\S4method{plotMA}{DESeqResults}(object, alpha, main = "", ylim, MLE = FALSE,
-  ...)
+\S4method{plotMA}{DESeqResults}(object, alpha, main = "",
+  xlab = "mean of normalized counts", ylim, MLE = FALSE, ...)
 }
 \arguments{
 \item{object}{a \code{DESeqResults} object produced by \code{\link{results}};
@@ -21,16 +22,18 @@ individual functions \code{\link{nbinomWaldTest}} or \code{\link{nbinomLRT}}}
 
 \item{main}{optional title for the plot}
 
-\item{ylim}{optional y limits}
+\item{xlab}{optional, defaults to "mean of normalized counts"}
 
-\item{...}{further arguments passed to \code{plotMA} if object
-is \code{DESeqResults} or to \code{\link{results}} if object is
-\code{DESeqDataSet}}
+\item{ylim}{optional y limits}
 
 \item{MLE}{whether to plot the MLE (unshrunken estimates), defaults to FALSE.
 Requires that \code{\link{results}} was run with \code{addMLE=TRUE}.
 Note that the MLE will be plotted regardless of this argument, if DESeq() was run
 with \code{betaPrior=FALSE}.}
+
+\item{...}{further arguments passed to \code{plotMA} if object
+is \code{DESeqResults} or to \code{\link{results}} if object is
+\code{DESeqDataSet}}
 }
 \description{
 A simple helper function that makes a so-called "MA-plot", i.e. a
diff --git a/man/results.Rd b/man/results.Rd
index 53b5c07..a973dd8 100644
--- a/man/results.Rd
+++ b/man/results.Rd
@@ -11,7 +11,7 @@ results(object, contrast, name, lfcThreshold = 0,
   listValues = c(1, -1), cooksCutoff, independentFiltering = TRUE,
   alpha = 0.1, filter, theta, pAdjustMethod = "BH", filterFun,
   format = c("DataFrame", "GRanges", "GRangesList"), test, addMLE = FALSE,
-  tidy = FALSE, parallel = FALSE, BPPARAM = bpparam())
+  tidy = FALSE, parallel = FALSE, BPPARAM = bpparam(), ...)
 
 resultsNames(object)
 
@@ -47,12 +47,14 @@ building a results table. Use this argument rather than \code{contrast}
 for continuous variables, individual effects or for individual interaction terms.
 The value provided to \code{name} must be an element of \code{resultsNames(object)}.}
 
-\item{lfcThreshold}{a non-negative value, which specifies the test which should
-be applied to the log2 fold changes. The standard is a test that the log2 fold
-changes are not equal to zero. However, log2 fold changes greater or less than
-\code{lfcThreshold} can also be tested. Specify the alternative hypothesis
-using the \code{altHypothesis} argument. If \code{lfcThreshold} is specified,
-the results are Wald tests, and LRT p-values will be overwritten.}
+\item{lfcThreshold}{a non-negative value which specifies a log2 fold change
+threshold. The default value is 0, corresponding to a test that
+the log2 fold changes are equal to zero. The user can
+specify the alternative hypothesis using the \code{altHypothesis} argument,
+which defaults to testing
+for log2 fold changes greater in absolute value than a given threshold.
+If \code{lfcThreshold} is specified,
+the results are for Wald tests, and LRT p-values will be overwritten.}
 
 \item{altHypothesis}{character which specifies the alternative hypothesis,
 i.e. those values of log2 fold change which the user is interested in
@@ -78,10 +80,9 @@ by default this is \code{c(1,-1)}}
 
 \item{cooksCutoff}{threshold on Cook's distance, such that if one or more
 samples for a row have a distance higher, the p-value for the row is
-set to NA.
-The default cutoff is the .99 quantile of the F(p, m-p) distribution,
+set to NA. The default cutoff is the .99 quantile of the F(p, m-p) distribution,
 where p is the number of coefficients being fitted and m is the number of samples.
-Set to Inf or FALSE to disable the resetting of p-values to NA.
+Set to \code{Inf} or \code{FALSE} to disable the resetting of p-values to NA.
 Note: this test excludes the Cook's distance of samples belonging to experimental
 groups with only 2 samples.}
 
@@ -100,9 +101,12 @@ from independent filtering}
 
 \item{pAdjustMethod}{the method to use for adjusting p-values, see \code{?p.adjust}}
 
-\item{filterFun}{an optional custom function for independent filtering,
-with arguments \code{alpha}, \code{filter}, \code{test}, \code{theta}, and \code{method}
-similar to \code{genefilter::filtered_R}, and which returns \code{padj}}
+\item{filterFun}{an optional custom function for performing independent filtering
+and p-value adjustment, with arguments \code{res} (a DESeqResults object),
+\code{filter} (the quantity used for filtering tests),
+\code{alpha} (the target FDR), and
+\code{pAdjustMethod} (the p-value adjustment method).
+This function should return a DESeqResults object
+with a \code{padj} column.}
 
 \item{format}{character, either \code{"DataFrame"}, \code{"GRanges"}, or \code{"GRangesList"},
 whether the results should be printed as a \code{\link{DESeqResults}} DataFrame,
@@ -111,7 +115,7 @@ the \code{GRanges} or \code{GRangesList} \code{rowRanges} of the \code{DESeqData
 If the \code{rowRanges} is a \code{GRangesList}, and \code{GRanges} is requested, 
 the range of each gene will be returned}
 
-\item{test}{this is typically automatically detected internally.
+\item{test}{this is automatically detected internally if not provided.
 The one exception is after \code{nbinomLRT} has been run; then \code{test="Wald"}
 will generate Wald statistics and Wald test p-values.}
 
@@ -130,6 +134,8 @@ execution using \code{BiocParallel}, see next argument \code{BPPARAM}}
 to \code{\link{bplapply}} when \code{parallel=TRUE}.
 If not specified, the parameters last registered with
 \code{\link{register}} will be used.}
+
+\item{...}{optional arguments passed to \code{filterFun}}
 }
 \value{
 For \code{results}: a \code{\link{DESeqResults}} object, which is
@@ -196,19 +202,19 @@ On p-values:
 By default, independent filtering is performed to select a set of genes
 for multiple test correction which maximizes the number of adjusted
 p-values less than a given critical value \code{alpha} (by default 0.1).
+See the reference in this man page for details on independent filtering.
 The filter used for maximizing the number of rejections is the mean
 of normalized counts for all samples in the dataset.
-In version >= 1.10, the threshold chosen is
+Several arguments from the \code{\link[genefilter]{filtered_p}} function of
+the genefilter package (used within the \code{results} function)
+are provided here to control the independent filtering behavior.
+In DESeq2 version >= 1.10, the threshold that is chosen is
 the lowest quantile of the filter for which the
 number of rejections is close to the peak of a curve fit
 to the number of rejections over the filter quantiles.
 'Close to' is defined as within 1 residual standard deviation.
-
 The adjusted p-values for the genes which do not pass the filter threshold
-are set to \code{NA}. By default, the mean of normalized counts
-is used to perform this filtering, though other statistics can be provided.
-Several arguments from the \code{filtered_p} function of genefilter
-are provided here to control or turn off the independent filtering behavior.
+are set to \code{NA}.
 
 By default, \code{results} assigns a p-value of \code{NA}
 to genes containing count outliers, as identified using Cook's distance.
@@ -219,19 +225,26 @@ observed counts might not fit to a Negative Binomial distribution.
 
 For analyses using the likelihood ratio test (using \code{\link{nbinomLRT}}),
 the p-values are determined solely by the difference in deviance between
-the full and reduced model formula. A log2 fold change is included,
-which can be controlled using the \code{name} argument, or by default this will
-be the estimated coefficient for the last element of \code{resultsNames(object)}.
+the full and reduced model formula. A single log2 fold change is printed
+in the results table for consistency with other results table outputs;
+however, the test statistic and p-values may involve
+the testing of one or more log2 fold changes.
+Which log2 fold change is printed in the results table can be controlled
+using the \code{name} argument, or by default this will be the estimated
+coefficient for the last element of \code{resultsNames(object)}.
 }
 \examples{
 
-## Example 1: simple two-group comparison
+## Example 1: two-group comparison
 
 dds <- makeExampleDESeqDataSet(m=4)
 
 dds <- DESeq(dds)
-res <- results(dds)
-res[ order(res$padj), ]
+res <- results(dds, contrast=c("condition","B","A"))
+
+# with more than two groups, the call would look similar, e.g.:
+# results(dds, contrast=c("condition","C","A"))
+# etc.
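+
+# a sketch of testing against a log2 fold change threshold, rather than zero:
+# results(dds, lfcThreshold=1, altHypothesis="greaterAbs")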
 
 ## Example 2: two conditions, two genotypes, with an interaction term
 
@@ -304,6 +317,6 @@ filtering increases detection power for high-throughput experiments.
 PNAS (2010), \url{http://dx.doi.org/10.1073/pnas.0914005107}
 }
 \seealso{
-\code{\link{DESeq}}
+\code{\link{DESeq}}, \code{\link[genefilter]{filtered_R}}
 }
 
diff --git a/man/vst.Rd b/man/vst.Rd
new file mode 100644
index 0000000..1a98fd2
--- /dev/null
+++ b/man/vst.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/vst.R
+\name{vst}
+\alias{vst}
+\title{Quickly estimate dispersion trend and apply a variance stabilizing transformation}
+\usage{
+vst(object, blind = TRUE, nsub = 1000, fitType = "parametric")
+}
+\arguments{
+\item{object}{a DESeqDataSet or a matrix of counts}
+
+\item{blind}{logical, whether to blind the transformation to the experimental
+design (see \code{\link{varianceStabilizingTransformation}})}
+
+\item{nsub}{the number of genes to subset to (default 1000)}
+
+\item{fitType}{for estimation of dispersions: this parameter
+is passed on to \code{\link{estimateDispersions}} (options described there)}
+}
+\value{
+a DESeqTransform object or a matrix of transformed, normalized counts
+}
+\description{
+This is a wrapper for the \code{\link{varianceStabilizingTransformation}} (VST)
+that provides much faster estimation of the dispersion trend used to determine
+the formula for the VST. The speed-up is accomplished by
+subsetting to a smaller number of genes in order to estimate this dispersion trend.
+The subset of genes is chosen deterministically, to span the range
+of genes' mean normalized count.
+This wrapper for the VST is not blind to the experimental design:
+the sample covariate information is used to estimate the global trend
+of genes' dispersion values over the genes' mean normalized count.
+It can be made strictly blind to experimental design by first
+assigning a \code{\link{design}} of \code{~1} before running this function,
+or by avoiding subsetting and using \code{\link{varianceStabilizingTransformation}}.
+}
+\examples{
+
+dds <- makeExampleDESeqDataSet(n=20000, m=20)
+vsd <- vst(dds)
+
+}
+
diff --git a/src/DESeq2.cpp b/src/DESeq2.cpp
index 7dc6560..0efa29a 100644
--- a/src/DESeq2.cpp
+++ b/src/DESeq2.cpp
@@ -1,3 +1,17 @@
+/*
+ * DESeq2 C++ functions
+ * 
+ * Author: Michael I. Love
+ * Last modified: September 30, 2015
+ * License: LGPL (>= 3)
+ *
+ * Note: The canonical, up-to-date DESeq2.cpp lives in 
+ * the DESeq2 library, the development branch of which 
+ * can be viewed here: 
+ *
+ * https://github.com/Bioconductor-mirror/DESeq2/blob/master/src/DESeq2.cpp
+ */
+
 // include RcppArmadillo and Rcpp
 #include "RcppArmadillo.h"
 
@@ -241,7 +255,7 @@ Rcpp::List fitBeta(SEXP ySEXP, SEXP xSEXP, SEXP nfSEXP, SEXP alpha_hatSEXP, SEXP
   Rcpp::NumericVector iter(y_n);
   Rcpp::NumericVector deviance(y_n);
   // bound the estimated count, as weights include 1/mu
-  double minmu = 0.1;
+  double minmu = 0.5;
   for (int i = 0; i < y_n; i++) {
     Rcpp::checkUserInterrupt();
     nfrow = nf.row(i).t();
diff --git a/tests/testthat/test_construction_errors.R b/tests/testthat/test_construction_errors.R
index 28bfe2f..3434869 100644
--- a/tests/testthat/test_construction_errors.R
+++ b/tests/testthat/test_construction_errors.R
@@ -19,6 +19,7 @@ expect_error(DESeqDataSetFromMatrix(counts, coldata, ~ ident))
 expect_message(DESeqDataSetFromMatrix(counts, coldata, ~ num))
 expect_message(DESeqDataSetFromMatrix(counts, coldata, ~ missinglevels))
 expect_message(DESeqDataSetFromMatrix(counts, coldata, ~ notref))
+expect_error(DESeqDataSetFromMatrix(counts, coldata, ~ident + x), "design contains")
 
 # same colnames but in different order:
 expect_error(DESeqDataSetFromMatrix(matrix(1:16, ncol=4, dimnames=list(1:4, 4:1)), coldata, ~ x))
diff --git a/tests/testthat/test_counts_input.R b/tests/testthat/test_counts_input.R
index f4b2d17..dbb0a90 100644
--- a/tests/testthat/test_counts_input.R
+++ b/tests/testthat/test_counts_input.R
@@ -1,14 +1,13 @@
 # count matrix input
 cnts <- matrix(rnbinom(40,mu=100,size=2),ncol=4)
-colnames(cnts) <- letters[1:4]
 mode(cnts) <- "integer"
 coldata <- data.frame(cond=factor(c("A","A","B","B")))
 dds <- DESeqDataSetFromMatrix(cnts, coldata, ~cond)
 
 # tidy data frame input
 gene.names <- paste0("gene",1:10)
+rownames(coldata) <- colnames(cnts) <- letters[1:4]
 tidy.counts <- cbind(gene.names, as.data.frame(cnts))
 dds <- DESeqDataSetFromMatrix(tidy.counts, coldata, ~cond, tidy=TRUE)
 expect_true(all(rownames(dds) == gene.names))
 expect_true(all(counts(dds) == cnts))
-
diff --git a/tests/testthat/test_custom_filt.R b/tests/testthat/test_custom_filt.R
index ed66e7b..cab2026 100644
--- a/tests/testthat/test_custom_filt.R
+++ b/tests/testthat/test_custom_filt.R
@@ -3,23 +3,23 @@ set.seed(1)
 dds <- makeExampleDESeqDataSet(n=200, m=4, betaSD=rep(c(0,2),c(150,50)))
 dds <- DESeq(dds)
 res <- results(dds)
-
-filter <- mcols(dds)$baseMean
-test <- res$pvalue
-theta <- seq(mean(filter == 0), 1, length=20)
 method <- "BH"
 alpha <- 0.1
 
-customFilt <- function(alpha, filter, test, theta, method) {
+customFilt <- function(res, filter, alpha, method) {
+  if (missing(filter)) {
+    filter <- res$baseMean
+  }
+  theta <- 0:10/10
   cutoff <- quantile(filter, theta)
-  numRej <- sapply(cutoff, function(x) sum(p.adjust(test[filter > x]) < alpha, na.rm=TRUE))
+  numRej <- sapply(cutoff, function(x) sum(p.adjust(res$pvalue[filter > x]) < alpha, na.rm=TRUE))
   threshold <- theta[which(numRej == max(numRej))[1]]
-  padj <- numeric(length(test))
-  padj <- NA
+  res$padj <- numeric(nrow(res))
   idx <- filter > quantile(filter, threshold)
-  padj[idx] <- p.adjust(test[idx], method=method)
-  return(padj)
+  res$padj[!idx] <- NA
+  res$padj[idx] <- p.adjust(res$pvalue[idx], method=method)
+  res
 }
 
 resCustom <- results(dds, filterFun=customFilt)
-#plot(res$padj, resCustom$padj);abline(0,1)
+plot(res$padj, resCustom$padj);abline(0,1)
diff --git a/tests/testthat/test_edge_case.R b/tests/testthat/test_edge_case.R
index f8c77ea..cdd93b8 100644
--- a/tests/testthat/test_edge_case.R
+++ b/tests/testthat/test_edge_case.R
@@ -26,7 +26,7 @@ dds2 <- DESeq(dds2)
 results(dds2)
 expect_true(class(mcols(mcols(dds2))$type) == "character")
 
-dds3 <- DESeqDataSetFromMatrix( counts(dds), DataFrame(row.names=1:ncol(dds)), ~ 1 )
+dds3 <- DESeqDataSetFromMatrix( counts(dds), DataFrame(row.names=colnames(dds)), ~ 1 )
 dds3$test <- 1:ncol(dds3)
 dds3 <- estimateSizeFactors(dds3)
 expect_true(class(mcols(colData(dds3))$type) == "character")
diff --git a/tests/testthat/test_gene_length.R b/tests/testthat/test_gene_length.R
deleted file mode 100644
index 479eae9..0000000
--- a/tests/testthat/test_gene_length.R
+++ /dev/null
@@ -1,29 +0,0 @@
-n <- 10
-files <- c("sample1","sample2","sample3","sample4")
-
-envir <- environment()
-
-# Cufflinks-like
-gene_id <- rep(paste0("gene",seq_len(n)),each=3)
-set.seed(1)
-sample1 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-sample2 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-sample3 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-sample4 <- data.frame(gene_id=gene_id,length=rpois(3*n,2000),FPKM=round(rnorm(3*n,10,1),2))
-importer <- get
-dds <- makeExampleDESeqDataSet(n=n + 2, m=4)
-dds <- normalizeGeneLength(dds, files=files, level="tx",
-                           geneIdCol="gene_id", lengthCol="length", abundanceCol="FPKM",
-                           dropGenes=TRUE, importer=importer, envir=envir)
-
-# RSEM-like
-gene_id <- paste0("gene",seq_len(n))
-sample1 <- data.frame(gene_id=gene_id,effective_length=rpois(n,2000))
-sample2 <- data.frame(gene_id=gene_id,effective_length=rpois(n,2000))
-sample3 <- data.frame(gene_id=gene_id,effective_length=rpois(n,2000))
-sample4 <- data.frame(gene_id=gene_id,effective_length=rpois(n,2000))
-dds <- makeExampleDESeqDataSet(n=n + 2, m=4)
-dds <- normalizeGeneLength(dds, files=files, level="gene",
-                           geneIdCol="gene_id", lengthCol="effective_length",
-                           dropGenes=TRUE, importer=importer, envir=envir)
-
diff --git a/tests/testthat/test_model_matrix.R b/tests/testthat/test_model_matrix.R
index 4b49ddc..29af080 100644
--- a/tests/testthat/test_model_matrix.R
+++ b/tests/testthat/test_model_matrix.R
@@ -16,3 +16,13 @@ dds <- removeResults(dds)
 dds <- DESeq(dds, full=m1, test="Wald", betaPrior=FALSE)
 results(dds)[1,]
 
+# test better error than "error: inv(): matrix seems singular"
+coldata <- data.frame(group=factor(rep(1:3,each=6)),
+                      group2=factor(rep(1:3,each=6)),
+                      condition=factor(rep(1:6,3)))
+counts <- matrix(rpois(180, 100), ncol=18)
+m1 <- model.matrix(~ group + group2, coldata)
+m2 <- model.matrix(~ condition + group, coldata)
+dds <- DESeqDataSetFromMatrix(counts, coldata, ~group)
+expect_error(dds <- DESeq(dds, full=m1, fitType="mean"), "full rank")
+expect_error(dds <- DESeq(dds, full=m2, reduced=m1, test="LRT", fitType="mean"), "full rank")
diff --git a/tests/testthat/test_nbinomWald.R b/tests/testthat/test_nbinomWald.R
index 3385a12..1578f0c 100644
--- a/tests/testthat/test_nbinomWald.R
+++ b/tests/testthat/test_nbinomWald.R
@@ -13,3 +13,10 @@ dds2 <- estimateMLEForBetaPriorVar(dds)
 estimateBetaPriorVar(dds2, betaPriorMethod="quantile")
 dds <- nbinomWaldTest(dds, modelMatrixType="standard")
 covarianceMatrix(dds, 1)
+
+# try nbinom after no fitted dispersions
+dds <- makeExampleDESeqDataSet(n=100, m=4)
+dds <- estimateSizeFactors(dds)
+dds <- estimateDispersionsGeneEst(dds)
+dispersions(dds) <- mcols(dds)$dispGeneEst
+dds <- nbinomWaldTest(dds)
diff --git a/tests/testthat/test_optim.R b/tests/testthat/test_optim.R
index b5d1ff8..bd94fc8 100644
--- a/tests/testthat/test_optim.R
+++ b/tests/testthat/test_optim.R
@@ -1,5 +1,5 @@
 set.seed(1)
-dds <- makeExampleDESeqDataSet(n=100)
+dds <- makeExampleDESeqDataSet(n=100,interceptMean=10,interceptSD=3)
 dds <- estimateSizeFactors(dds)
 dds <- estimateDispersions(dds)
 # make a large predictor to test scaling
@@ -19,6 +19,8 @@ fitOptim <- DESeq2:::fitNbinomGLMs(dds, modelMatrix=modelMatrix,
                                    renameCols=TRUE, betaTol=1e-8,
                                    maxit=100, useOptim=TRUE,
                                    useQR=TRUE, forceOptim=TRUE)
+#plot(fit$betaMatrix[,2], fitOptim$betaMatrix[,2])
+#abline(0,1,col="red")
 expect_equal(fit$betaMatrix, fitOptim$betaMatrix,tolerance=1e-6)
 expect_equal(fit$betaSE, fitOptim$betaSE,tolerance=1e-6)
 
@@ -27,6 +29,8 @@ set.seed(1)
 dds <- makeExampleDESeqDataSet(n=100, m=10)
 counts(dds)[1,] <- c(rep(0L,5),c(1000L,1000L,0L,0L,0L))
 dds <- DESeq(dds, betaPrior=FALSE)
+# beta iter = 100 implies optim used for fitting
+expect_equal(mcols(dds)$betaIter[1], 100)
 res1 <- results(dds, contrast=c("condition","B","A"))
 res2 <- results(dds, contrast=c(0,1))
 expect_true(all.equal(res1$lfcSE, res2$lfcSE))
diff --git a/tests/testthat/test_outlier.R b/tests/testthat/test_outlier.R
index 66d86a6..4e1898d 100644
--- a/tests/testthat/test_outlier.R
+++ b/tests/testthat/test_outlier.R
@@ -63,3 +63,9 @@ dds <- makeExampleDESeqDataSet(n=100,m=4)
 expect_error(replaceOutliers(dds))
 dds <- DESeq(dds)
 expect_error(replaceOutliers(dds, minReplicates=2))
+
+# check model matrix standard bug
+set.seed(1)
+dds <- makeExampleDESeqDataSet(n=100, m=20)
+counts(dds)[1,] <- c(100000L, rep(0L, 19))
+dds <- DESeq(dds, modelMatrixType="standard")
diff --git a/tests/testthat/test_results.R b/tests/testthat/test_results.R
index 93f18e7..6fa1507 100644
--- a/tests/testthat/test_results.R
+++ b/tests/testthat/test_results.R
@@ -135,7 +135,7 @@ expect_warning(results(dds, format="GRangesList"))
 
 rowRanges(dds) <- as(rowRanges(dds), "GRangesList")
 dds <- DESeq(dds)
-expect_warning(results(dds, format="GRanges"))
+expect_message(results(dds, format="GRanges"))
 
 # check tidy-ness
 res <- results(dds, tidy=TRUE)
diff --git a/tests/testthat/test_rlog.R b/tests/testthat/test_rlog.R
index c6a2a6e..8356ab5 100644
--- a/tests/testthat/test_rlog.R
+++ b/tests/testthat/test_rlog.R
@@ -22,3 +22,5 @@ expect_error(rlogData(dds, intercept=rep(1,10)))
 dds <- makeExampleDESeqDataSet(n=50, m=10)
 nt <- normTransform(dds)
 plotPCA(nt)
+
+rld <- rlog(counts(dds))
diff --git a/tests/testthat/test_size_factor.R b/tests/testthat/test_size_factor.R
index 46a1964..64f2566 100644
--- a/tests/testthat/test_size_factor.R
+++ b/tests/testthat/test_size_factor.R
@@ -7,6 +7,7 @@ estimateSizeFactorsForMatrix(m, geoMeans=1:4)
 estimateSizeFactorsForMatrix(m, controlGenes=1:2)
 
 # iterate method
+set.seed(1)
 true.sf <- 2^(rep(c(-2,-1,0,0,1,2),each=2))
 dds <- makeExampleDESeqDataSet(sizeFactors=true.sf, n=100)
 cts <- counts(dds)
diff --git a/tests/testthat/test_tximport.R b/tests/testthat/test_tximport.R
new file mode 100644
index 0000000..fc1e973
--- /dev/null
+++ b/tests/testthat/test_tximport.R
@@ -0,0 +1,22 @@
+library("tximport")
+library("tximportData")
+library("readr")
+dir <- system.file("extdata", package="tximportData")
+samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
+files <- file.path(dir,"salmon", samples$run, "quant.sf")
+names(files) <- paste0("sample",1:6)
+tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv)
+dds <- DESeqDataSetFromTximport(txi, samples, ~1)
+
+# test fpkm
+
+exprs <- fpm(dds)
+exprs <- fpkm(dds)
+
+# test length of 0
+
+txi2 <- txi
+txi2$length[1,1] <- 0
+expect_error(dds2 <- DESeqDataSetFromTximport(txi2, samples, ~1), "lengths")
+
diff --git a/tests/testthat/test_vst.R b/tests/testthat/test_vst.R
index e5e5ffa..d5e4286 100644
--- a/tests/testthat/test_vst.R
+++ b/tests/testthat/test_vst.R
@@ -14,5 +14,18 @@ dds <- makeExampleDESeqDataSet(n=20, m=4)
 colnames(dds) <- NULL
 varianceStabilizingTransformation(dds)
 head(varianceStabilizingTransformation(assay(dds)))
-
 expect_error(getVarianceStabilizedData(dds))
+
+# test just matrix
+vsd <- varianceStabilizingTransformation(counts(dds))
+
+# test fast VST based on subsampling
+dds <- makeExampleDESeqDataSet(n=20000, m=10)
+vsd <- vst(dds)
+vsd <- vst(counts(dds))
+
+# test VST and normalization factors
+dds <- makeExampleDESeqDataSet(n=100, m=10, betaSD=1.5)
+nf <- matrix(exp(rnorm(1000,0,.2)),ncol=10)
+normalizationFactors(dds) <- nf
+vsd <- varianceStabilizingTransformation(dds, fitType="local")
diff --git a/vignettes/DESeq2.Rnw b/vignettes/DESeq2.Rnw
index 78543b6..3aeb276 100644
--- a/vignettes/DESeq2.Rnw
+++ b/vignettes/DESeq2.Rnw
@@ -124,26 +124,27 @@ for more information about how to construct an informative post.
 
 \subsection{Input data} \label{sec:prep}
 
-\subsubsection{Why raw counts?}
+\subsubsection{Why un-normalized counts?}
 
 As input, the \deseqtwo{} package expects count data as obtained, e.\,g.,
 from RNA-seq or another high-throughput sequencing experiment, in the form of a
 matrix of integer values. The value in the $i$-th row and the $j$-th column of
-the matrix tells how many reads have been mapped to gene $i$ in sample $j$.
+the matrix tells how many reads can be assigned to gene $i$ in sample $j$.
 Analogously, for other types of assays, the rows of the matrix might correspond
 e.\,g.\ to binding regions (with ChIP-Seq) or peptide sequences (with
-quantitative mass spectrometry). We will list method for obtaining count tables
-in a section below.
+quantitative mass spectrometry). We will list methods for obtaining count matrices
+in sections below.
 
-The count values must be raw counts of sequencing reads (for
+The values in the matrix should be un-normalized counts of sequencing reads (for
 single-end RNA-seq) or fragments (for paired-end RNA-seq). 
 The \href{http://www.bioconductor.org/help/workflows/rnaseqGene/}{RNA-seq workflow}
 describes multiple techniques for preparing such count matrices.
 It is important to provide count matrices as input for \deseqtwo{}'s
 statistical model \cite{Love2014} to hold, as only the  
 count values allow assessing the measurement precision correctly. The \deseqtwo{}
-model internally corrects for library size, so transformed values such as counts
-scaled by library size should never be used as input.
+model internally corrects for library size, so transformed or
+normalized values such as counts scaled by library size should not
+be used as input. 
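+
+As an illustration only (not a chunk from the original workflow), a
+minimal sketch of a valid input: an integer matrix of un-normalized
+counts, with hypothetical object names.
+
+<<rawCountsSketch, eval=FALSE>>=
+cts <- matrix(rpois(600, lambda=10), ncol=6)  # raw integer counts, one row per gene
+storage.mode(cts)  # "integer"; do not scale by library size beforehand
+@ 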
 
 \subsubsection{\Rclass{SummarizedExperiment} input} \label{sec:sumExpInput}
 
@@ -252,6 +253,83 @@ featureData <- data.frame(gene=rownames(pasillaGenes))
 (mcols(dds) <- DataFrame(mcols(dds), featureData))
 @ 
 
+\subsubsection{tximport: transcript abundance summarized to gene-level}
+
+Users can create gene-level count matrices for use with \deseqtwo{}
+by importing information using the \Biocpkg{tximport} package.
+This workflow allows users to import transcript abundance estimates
+from a variety of external software, including the following methods:
+
+\begin{itemize}
+\item \href{http://www.cs.cmu.edu/~ckingsf/software/sailfish/}{Sailfish} 
+  \cite{Patro2014Sailfish}
+\item \href{http://combine-lab.github.io/salmon/}{Salmon} 
+  \cite{Patro2015Salmon}
+\item
+  \href{https://pachterlab.github.io/kallisto/about.html}{kallisto} 
+  \cite{Bray2015Near}
+\item \href{http://deweylab.github.io/RSEM/}{RSEM} 
+  \cite{Li2011RSEM}
+\end{itemize}
+
+Some advantages of using the above methods for transcript abundance
+estimation are: (i) this approach corrects for potential changes
+in gene length across samples 
+(e.g. from differential isoform usage) \cite{Trapnell2013Differential},
+(ii) some of these methods (\textit{Sailfish, Salmon, kallisto}) 
+are substantially faster and require less memory
+and disk usage compared to alignment-based methods that require
+creation and storage of BAM files, and
+(iii) it is possible to avoid discarding those fragments that can
+align to multiple genes with homologous sequence, thus increasing
+sensitivity \cite{Robert2015Errors}.
+
+Full details on the motivation and methods for importing transcript
+level abundance and count estimates, summarizing to gene-level count matrices 
+and producing an offset which corrects for potential changes in average
+transcript length across samples are described in \cite{Soneson2015}.
+The \textit{tximport}$\rightarrow$\deseqtwo{} approach uses rounded estimated
+gene counts (but not normalized) instead of the raw count of fragments
+which can be unambiguously assigned to a gene.
+
+Here, we demonstrate how to import transcript abundances
+and construct a gene-level \Rclass{DESeqDataSet} object
+from \textit{Salmon} \texttt{quant.sf} files, which are
+stored in the \Biocexptpkg{tximportData} package.
+Note that, instead of locating \Robject{dir} using \Rfunction{system.file},
+a user would typically just provide a path, e.g. \texttt{/path/to/quant/files}.
+For further details on use of \Rfunction{tximport}, 
+including the construction of the \Robject{tx2gene} table for linking
+transcripts to genes, please refer to the \Biocpkg{tximport} package vignette. 
+
+<<tximport>>=
+library("tximport")
+library("readr")
+library("tximportData")
+dir <- system.file("extdata", package="tximportData")
+samples <- read.table(file.path(dir,"samples.txt"), header=TRUE)
+files <- file.path(dir,"salmon", samples$run, "quant.sf")
+names(files) <- paste0("sample",1:6)
+tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
+txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv)
+@ 
+
+Next we create a condition vector to demonstrate building a
+\Robject{DESeqDataSet}. For a typical use, this information would already
+be present as a column of the \Robject{samples} table.
+The best practice is to read \Robject{colData} from a CSV or TSV file, 
+and to construct \Robject{files} 
+from a column of \Robject{colData}, as shown in the code chunk above.
+
+<<txi2dds>>=
+coldata <- data.frame(condition=factor(rep(c("A","B"),each=3)))
+rownames(coldata) <- colnames(txi$counts)
+ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition)
+@
+
+The \Robject{ddsTxi} object can then be used as \Robject{dds} in the
+following analysis steps.
+
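+For example, a minimal sketch of the standard next steps (covered in
+the following sections):
+
+<<txiNextSteps, eval=FALSE>>=
+ddsTxi <- DESeq(ddsTxi)
+resTxi <- results(ddsTxi)
+@ 
+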
 \subsubsection{\textit{HTSeq} input}
 
 You can use the function \Rfunction{DESeqDataSetFromHTSeqCount} if you
@@ -435,6 +513,20 @@ summary(res05)
 sum(res05$padj < 0.05, na.rm=TRUE)
 @ 
 
+A generalization of the idea of $p$ value filtering is to \textit{weight} hypotheses
+to optimize power. A new Bioconductor package, \Biocpkg{IHW}, is now available
+that implements the method of \textit{Independent Hypothesis Weighting} \cite{Ignatiadis2015}.
+Here we show the use of \textit{IHW} for $p$ value adjustment of \deseqtwo{} results.
+For more details, please see the vignette of the \Biocpkg{IHW} package.
+Note that the \textit{IHW} result object is stored in the metadata.
+
+<<IHW>>=
+library("IHW")
+resIHW <- results(dds, filterFun=ihw)
+summary(resIHW)
+sum(resIHW$padj < 0.1, na.rm=TRUE)
+metadata(resIHW)$ihwResult
+@ 
 
 If a multi-factor design is used, or if the variable in the design
 formula has more than two levels, the \Robject{contrast} argument of
@@ -586,14 +678,33 @@ can be set to \Robject{NA} for one of the following reasons:
     described in Section~\ref{sec:autoFilt}.
 \end{enumerate}
 
-\subsubsection{Exporting results to HTML or CSV files}
-An HTML report of the results with plots and sortable/filterable columns
-can be exported using the \Biocpkg{ReportingTools} package
+\subsubsection{Rich visualization and reporting of results}
+
+\textbf{ReportingTools.} An HTML report of the results with plots and sortable/filterable columns
+can be generated using the \Biocpkg{ReportingTools} package
 on a \Rclass{DESeqDataSet} that has been processed by the \Rfunction{DESeq} function.
 For a code example, see the ``RNA-seq differential expression'' vignette at
 the \Biocpkg{ReportingTools} page, or the manual page for the 
 \Rfunction{publish} method for the \Rclass{DESeqDataSet} class.
 
+\textbf{regionReport.} An HTML and PDF summary of the results with plots
+can also be generated using the \Biocpkg{regionReport} package.
+The \Rfunction{DESeq2Report} function should be run on a 
+\Rclass{DESeqDataSet} that has been processed by the \Rfunction{DESeq} function.
+For more details see the manual page for \Rfunction{DESeq2Report} 
+and an example vignette in the \Biocpkg{regionReport} package.
+
+\textbf{Glimma.} Interactive visualization of \deseqtwo{} output, 
+including MA-plots (also called MD-plots), can be generated using the
+\Biocpkg{Glimma} package. See the manual page for \Rfunction{glMDPlot.DESeqResults}.
+
+\textbf{pcaExplorer.} Interactive visualization of \deseqtwo{} output,
+including PCA plots, boxplots of counts and other useful summaries can be
+generated using the \Biocpkg{pcaExplorer} package.
+See the ``Launching the application'' section of the package vignette.
+
+\subsubsection{Exporting results to CSV files}
+
 A plain-text file of the results can be exported using the 
 base \R{} functions \Rfunction{write.csv} or \Rfunction{write.table}. 
 We suggest using a descriptive file name indicating the variable
@@ -726,12 +837,15 @@ variance above the trend which will allow us to cluster samples into
 interesting groups.
 
 \textbf{Note on running time:} if you have many samples (e.g. 100s),
-the \Rfunction{rlog} function might take too long, and the variance
-stabilizing transformation might be a better choice.  The rlog and VST
-have similar properties, but the rlog requires fitting a shrinkage
+the \Rfunction{rlog} function might take too long, and the 
+\Rfunction{varianceStabilizingTransformation} is a faster choice.  
+The rlog and VST have similar properties, but the rlog requires fitting a shrinkage
 term for each sample and each gene which takes time.  See the
 \deseqtwo{} paper for more discussion on the differences
-\cite{Love2014}.
+\cite{Love2014}. In addition, a new function \Rfunction{vst} provides
+an even faster version of the \Rfunction{varianceStabilizingTransformation}
+by calculating the global dispersion trend on a subset of the genes
+(default 1000). \Rfunction{vst} may be attractive for interactive EDA.
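+
+For example, a minimal sketch (\Rcode{nsub} sets how many genes are
+used to fit the dispersion trend; see \Rfunction{?vst} for details):
+
+<<vstSketch, eval=FALSE>>=
+vsd <- vst(dds, nsub=1000)
+@ 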
 
 \subsubsection{Blind dispersion estimation}
 
@@ -749,7 +863,7 @@ assurance) as demonstrated below.
 However, blind dispersion estimation is not the appropriate choice if
 one expects that many or the majority of genes (rows) will have large
 differences in counts which are explainable by the experimental design,
-and one wishes to tranform the data for downstream analysis. In this
+and one wishes to transform the data for downstream analysis. In this
 case, using blind dispersion estimation will lead to large estimates
 of dispersion, as it attributes differences due to experimental design
 as unwanted ``noise'', and will result in overly shrinking the transformed
@@ -761,18 +875,27 @@ that only the fitted dispersion estimates from mean-dispersion trend
 line are used in the transformation (the global dependence of
 dispersion on mean for the entire experiment).
 So setting \Robject{blind} to \Robject{FALSE} is still for the most
-part unbiased by the information about which samples were in which
-experimental group. 
+part not using the information about which samples were in which
+experimental group when applying the transformation.
 
 \subsubsection{Extracting transformed values}
 
-The two functions return an object of class \Rclass{DESeqTransform}
+These functions return an object of class \Rclass{DESeqTransform}
 which is a subclass of \Rclass{RangedSummarizedExperiment}. 
-The \Rfunction{assay} function is used to extract the matrix of normalized values:
+For $\sim 20$ samples, running on a newly created \Robject{DESeqDataSet},
+\Rfunction{rlog} may take 30 seconds, 
+\Rfunction{varianceStabilizingTransformation} may take 5 seconds, and
+\Rfunction{vst} less than 1 second (by subsetting to 1000 genes for
+calculating the global dispersion trend).
+However, the running times are shorter and more similar with \Rcode{blind=FALSE} and
+if the function \Rfunction{DESeq} has already been run, because then
+it is not necessary to re-estimate the dispersion values.
+The \Rfunction{assay} function is used to extract the matrix of normalized values.
 
 <<rlogAndVST>>=
-rld <- rlog(dds)
-vsd <- varianceStabilizingTransformation(dds)
+rld <- rlog(dds, blind=FALSE)
+vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
+vsd.fast <- vst(dds, blind=FALSE)
 head(assay(rld), 3)
 @
 
@@ -1320,7 +1443,7 @@ Plotting the dispersion estimates is a useful diagnostic. The dispersion
 plot in Figure \ref{figure/dispFit-1} is typical, with the final estimates shrunk
 from the gene-wise estimates towards the fitted estimates. Some gene-wise
 estimates are flagged as outliers and not shrunk towards the fitted value
-(this outlier detection is described in the man page for \Rfunction{estimateDispersionsMAP}).
+(this outlier detection is described in the manual page for \Rfunction{estimateDispersionsMAP}).
 The amount of shrinkage can be more or less than seen here, depending 
 on the sample size, the number of coefficients, the row mean
 and the variability of the gene-wise estimates.
@@ -1554,6 +1677,14 @@ dispersionFunction(dds)
 attr(dispersionFunction(dds), "dispPriorVar")
 @ 
 
+The version of \deseqtwo{} which was used to construct the
+\Rclass{DESeqDataSet} object, or the version used when
+\Rfunction{DESeq} was run, is stored here:
+
+<<versionNum>>=
+metadata(dds)[["version"]]
+@ 
+
 \subsection{Sample-/gene-dependent normalization factors} \label{sec:normfactors}
 
 In some experiments, there might be gene-dependent dependencies
@@ -1770,7 +1901,16 @@ proportional to the expected true concentration of fragments for sample $j$.
 The coefficients $\beta_i$ give the log2 fold changes for gene $i$ for each 
 column of the model matrix $X$. 
 
-By default these log2 fold changes are the maximum \emph{a posteriori} estimates after incorporating a
+The dispersion parameter $\alpha_i$ defines the relationship between
+the variance of the observed count and its mean value. In other
+words, it describes how far we expect the observed count to be from the
+mean value, which depends both on the size factor $s_j$ and the
+covariate-dependent part $q_{ij}$ as defined above.
+
+$$ \textrm{Var}(K_{ij}) = E[ (K_{ij} - \mu_{ij})^2 ] = \mu_{ij} + \alpha_i \mu_{ij}^2 $$
+
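+For instance, with $\mu_{ij} = 100$ and $\alpha_i = 0.1$, the expected
+variance is $100 + 0.1 \cdot 100^2 = 1100$, much larger than the
+Poisson variance of $100$.
+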
+The log2 fold changes in $\beta_i$ are the maximum \emph{a posteriori}
+estimates after incorporating a 
+zero-centered Normal prior -- in the software referred to as a $\beta$-prior -- hence \deseqtwo{}
 provides ``moderated'' log2 fold change estimates.  Dispersions are estimated using expected mean
 values from the maximum likelihood estimate of log2 fold changes, and optimizing the Cox-Reid
@@ -1823,7 +1963,7 @@ version \Biocpkg{DESeq}, are as follows:
     matching the empirical quantile to the quantile of a Normal
     distribution, \deseqtwo() now uses the weighted quantile function
     of the \CRANpkg{Hmisc} package. The weighting is described in the
-    man page for \Rfunction{nbinomWaldTest}.  The weights are the
+    manual page for \Rfunction{nbinomWaldTest}.  The weights are the
     inverse of the expected variance of log counts (as used in the
     diagonals of the matrix $W$ in the GLM). The effect of the change
     is that the estimated prior variance is robust against noisy
@@ -2065,6 +2205,7 @@ genes with all counts equal to zero.
   testing we recommend the \Rfunction{DESeq} function applied to raw
   counts as outlined in Section~\ref{sec:de}.
       
+  
 \subsection{Can I use DESeq2 to analyze paired samples?}
 
 Yes, you should use a multi-factor design which includes the sample
@@ -2073,6 +2214,61 @@ differences between the samples while estimating the effect due to
 the condition. The condition of interest should go at the end of the 
 design formula. See Section~\ref{sec:multifactor}.
 
+\subsection{If I have multiple groups, should I run all together or split into pairs of groups?}
+
+Typically, we recommend that users run samples from all groups together, and then
+use the \Rcode{contrast} argument of the \Rfunction{results} function
+to extract comparisons of interest after fitting the model using \Rfunction{DESeq}.
+
+The model fit by \Rfunction{DESeq} estimates a single dispersion
+parameter for each gene, which defines how far we expect the observed
+count for a sample to be from the mean value predicted by the model 
+given its size factor and its condition group. See Section~\ref{sec:glm} 
+and the \deseqtwo{} paper for full details.
+Having a single dispersion parameter for each gene is usually
+sufficient for analyzing multi-group data, as the final dispersion value will
+incorporate the within-group variability across all groups. 
+
+However, for some datasets, exploratory data analysis (EDA) plots as outlined
+in Section~\ref{sec:pca} could reveal that one or more groups has much
+higher within-group variability than the others. A simulated example
+of such a set of samples is shown in Figure~\ref{figure/varGroup-1}.
+This is case where, by comparing groups A and B separately --
+subsetting a \Rclass{DESeqDataSet} to only samples from those two
+groups and then running \Rfunction{DESeq} on this subset -- will be
+more sensitive than a model including all samples together.
+It should be noted that such an extreme range of within-group
+variability is not common, although it could arise if certain
+treatments produce an extreme reaction (e.g. cell death).
+Again, this can be easily detected from the EDA plots such as PCA
+described in this vignette.
+
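+A minimal sketch of such a subset analysis (object and level names
+hypothetical):
+
+<<subsetSketch, eval=FALSE>>=
+ddsAB <- dds[, dds$condition %in% c("A","B")]
+ddsAB$condition <- droplevels(ddsAB$condition)
+ddsAB <- DESeq(ddsAB)
+@ 
+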
+<<varGroup, echo=FALSE, fig.width=5, fig.height=5>>=
+set.seed(3)
+dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01)
+dds2 <- makeExampleDESeqDataSet(n=1000,m=12,
+                                betaSD=.3,
+                                interceptMean=mcols(dds1)$trueIntercept,
+                                interceptSD=0,
+                                dispMeanRel=function(x) 0.2)
+dds2 <- dds2[,7:12]
+dds2$condition <- rep("C",6)
+mcols(dds2) <- NULL
+dds <- cbind(dds1, dds2)
+rld <- rlog(dds, blind=FALSE, fitType="mean")
+plotPCA(rld)
+@ 
+
+\incfig{figure/varGroup-1}{.5\textwidth}{Extreme range of within-group variability.}{
+  Typically, it is recommended to run \Rfunction{DESeq} across samples
+  from all groups, for datasets with multiple groups. 
+  However, this simulated dataset shows a case where
+  it would be preferable to compare groups A and B by creating a
+  smaller dataset without the C samples. Group C has much higher
+  within-group variability, which would inflate the per-gene dispersion
+  estimate for groups A and B as well. 
+}
+
 \subsection{Can I run DESeq2 to contrast the levels of 100 groups?}
 
 \deseqtwo{} will work with any kind of design specified using the R
@@ -2091,9 +2287,13 @@ fitting procedure.
 \subsection{Can I use DESeq2 to analyze a dataset without replicates?}
 
 If a \Rclass{DESeqDataSet} is provided with an experimental design without replicates,
-a message is printed, that the samples are treated as replicates
-for estimation of dispersion. More details can be found in the 
-manual page for \Rfunction{?DESeq}.
+a warning is printed that the samples are treated as replicates
+for estimation of dispersion. This kind of analysis is
+only useful for exploring the data, but cannot provide
+proper statistical inference on differences between groups.
+Without biological replicates, it is not possible to estimate the biological
+variability of each gene. 
+More details can be found in the manual page for \Rfunction{DESeq}.
 
 \subsection{How can I include a continuous covariate in the design formula?}
 
@@ -2108,6 +2308,98 @@ the trend over the continuous covariates.  In R, \Rclass{numeric}
 vectors can be converted into \Rclass{factors} using the function
 \Rfunction{cut}.
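+
+For example, a minimal sketch (\Rcode{age} is a hypothetical numeric
+covariate in \Robject{colData}):
+
+<<cutSketch, eval=FALSE>>=
+dds$ageBin <- cut(dds$age, breaks=3)
+@ 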
 
+\subsection{Will the log fold change shrinkage ``overshrink'' large differences?}
+
+For most datasets, the application of a prior to the log fold changes
+is a good choice, providing log fold change estimates that are
+more stable across the entire range of mean counts than the maximum
+likelihood estimates (see Figure~\ref{fig:MA} and the \deseqtwo{} paper).
+One situation in which the prior on log fold changes might
+``overshrink'' the estimates is 
+if nearly all genes show no difference across condition, a very
+small set of genes have extremely large differences, and no genes in between.
+A simulated example of such a dataset is shown in Figure~\ref{figure/overShrink-1}.
+This is not likely to be the case for most experiments, where typically
+there is a range of differences by size: some genes with medium-to-large
+differences across treatment, and some with small differences.
+
+<<overShrink, echo=FALSE, fig.width=5, fig.height=5>>=
+plot(c(10^rnorm(1000, 3, 2),300,2000,5000), 
+     c(rnorm(1000, 0, .15), -5.5, -8.5, 7.5),
+     ylim=c(-10,10), log="x", cex=.4,
+     xlab="mean of normalized counts", 
+     ylab="log2 fold change")
+abline(h=0, col=rgb(1,0,0,.7))
+@ 
+
+\incfig{figure/overShrink-1}{.5\textwidth}{Example of a dataset
+  where the log fold change prior should be turned off.}{
+  Here we show a simulated MA-plot, where nearly all of the log fold changes
+  fall near the x-axis, with three genes that have very large log
+  fold changes (note the y-axis is from -10 to 10 on the log2 scale).
+  This would indicate a dataset where the log fold change prior would
+  ``overshrink'' the large fold changes, and so \Robject{betaPrior=FALSE}
+  should be used.
+}
+
+There could be experiments in which only a few genes have
+very large log fold changes, and the rest of the genes are
+nearly constant across treatment.
+Or, there could be artificially constructed libraries fitting this description,
+e.g. technical replicates where the only difference across libraries 
+is the concentration of a few spiked-in genes.
+``Overshrinking'' of a few large log fold changes
+can be assessed by running \Rfunction{results} with \Rcode{addMLE=TRUE},
+which will print a results table with columns for the shrunken and
+unshrunken (MLE) log fold changes.
+The two estimates can be visually compared by running \Rfunction{plotMA} with
+\Rcode{MLE=TRUE} and \Rcode{MLE=FALSE}. 
+If ``overshrinking'' very large log fold changes is a concern,
+it is better to turn off the log fold change prior by
+running \Rfunction{DESeq} with \Rcode{betaPrior=FALSE}.
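+
+A minimal sketch of this check (assuming \Robject{dds} has already
+been run through \Rfunction{DESeq}):
+
+<<shrinkCheckSketch, eval=FALSE>>=
+res <- results(dds, addMLE=TRUE)  # adds the unshrunken lfcMLE column
+plotMA(res, MLE=TRUE)             # plot unshrunken estimates
+plotMA(res, MLE=FALSE)            # plot shrunken estimates
+ddsNP <- DESeq(dds, betaPrior=FALSE)  # refit without the LFC prior
+@ 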
+
+Even more detail: how do we avoid overshrinking on typical datasets?
+The answer is that we estimate the width of the log fold change prior in a
+robust way to accommodate the very largest log fold changes, and so to
+avoid overshrinking. 
+The details of the prior estimation are described in the manual page for
+\Rfunction{nbinomWaldTest}. Briefly, a weighted upper quantile
+is used to match the width of the log fold change prior to the upper
+5\% of the MLE log fold changes, weighting by the expected sampling
+variability of the estimated log fold changes given the mean count for
+each gene. Note that this does not equal an assumption that 5\% of genes are
+differentially expressed, but that a reasonable width of a log fold
+change distribution can be obtained from the upper 5\% of MLE log fold
+changes. 
+
+\subsection{I ran a likelihood ratio test, but \texttt{results()} only gives me one comparison.}
+
+``\dots How do I get the $p$ values for all of the variables/levels 
+that were removed in the reduced design?''
+
+This is explained in the help page for \texttt{?results} in the
+section about likelihood ratio test $p$ values, but we will restate the
+answer here. When one performs a likelihood ratio test, the $p$ values and
+the test statistic (the \Robject{stat} column) are values for the test
+that removes all of the variables which are present in the full
+design and not in the reduced design. This tests the null hypothesis
+that all the coefficients from these variables and levels of these factors
+are equal to zero.
+
+The likelihood ratio test $p$ values therefore
+represent a test of \textit{all the variables and all the levels of factors}
+which are among these variables. However, the results table only has space for
+one column of log fold change, so a single variable and a single
+comparison is shown (among the potentially multiple log fold changes
+which were tested in the likelihood ratio test). 
+This is indicated at the top of the results table
+with the text, e.g.: ``log2 fold change (MLE): condition C vs A'' followed
+by ``LRT p-value: '\lowtilde{} batch + condition' vs '\lowtilde{} batch' ''.
+This indicates that the $p$ value is for the likelihood ratio test of
+\textit{all the variables and all the levels}, while the log fold change is a single
+comparison from among those variables and levels.
+See the help page for \Rfunction{results} for more details.
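+
+A minimal sketch (design and object names hypothetical):
+
+<<lrtSketch, eval=FALSE>>=
+ddsLRT <- DESeq(dds, test="LRT", reduced=~ batch)
+res <- results(ddsLRT)  # LRT p-values; LFC shown for one comparison
+resCA <- results(ddsLRT, contrast=c("condition","C","A"))  # same p-values
+@ 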
+
 \subsection{What are the exact steps performed by \Rfunction{DESeq()}?}
 
 See the manual page for \Rfunction{DESeq}, which links to the 
@@ -2120,13 +2412,34 @@ Yes. The repository for the \deseqtwo{} tool is
 and a link to its location in the Tool Shed is 
 \url{https://toolshed.g2.bx.psu.edu/view/iuc/deseq2/d983d19fbbab}.
 
+\subsection{I want to benchmark DESeq2 comparing to other DE tools.}
+
+One aspect which can cause problems for comparison is that, by default,
+\deseqtwo{} outputs \Rcode{NA} values for adjusted $p$ values based on 
+independent filtering of genes which have low counts.
+This is a way for \deseqtwo{} to give extra
+information on why the adjusted $p$ value for a given gene is not small.
+Additionally, $p$ values can be set to \Rcode{NA} based on extreme 
+count outlier detection (see Section~\ref{sec:moreInfo} for full details). 
+These \Rcode{NA} values should be considered
+negatives for purposes of estimating sensitivity and specificity. The
+easiest way to work with the adjusted $p$ values in a benchmarking
+context is probably to convert these \Rcode{NA} values to 1:
+
+<<convertNA, eval=FALSE>>=
+res$padj <- ifelse(is.na(res$padj), 1, res$padj)
+@ 
+
 \section{Acknowledgments}
 
 We have benefited in the development of \deseqtwo{} from the help and
 feedback of many individuals, including but not limited to: 
 The Bioconductor Core Team,
 Alejandro Reyes, Andrzej Ole\'s, Aleksandra Pekowska, Felix Klein,
+Nikolaos Ignatiadis,
 Vince Carey,
+Owen Solberg,
+Ruping Sun,
 Devon Ryan, 
 Steve Lianoglou, Jessica Larson, Christina Chaivorapol, Pan Du, Richard Bourgon,
 Willem Talloen, 
@@ -2145,7 +2458,10 @@ Sonali Arora,
 Jordan Ramilowski,
 Ian Dworkin,
 Bj\"orn Gr\"uning,
-Ryan McMinds.
+Ryan McMinds,
+Paul Gordon,
+Leonardo Collado Torres,
+Enrico Ferrero.
 \section{Session Info}
 
 <<sessInfo, results="asis", echo=FALSE>>=
diff --git a/vignettes/library.bib b/vignettes/library.bib
index 5b9bea0..6d01a57 100644
--- a/vignettes/library.bib
+++ b/vignettes/library.bib
@@ -1,3 +1,11 @@
+@article{Ignatiadis2015,
+  author = {Ignatiadis, Nikolaos and Klaus, Bernd and Zaugg, Judith and Huber, Wolfgang},
+  journal = {bioRxiv},
+  title = {Data-driven hypothesis weighting increases detection power in big data analytics},
+  url = {http://dx.doi.org/10.1101/034330},
+  year = 2015
+}
+
 @article{Love2014,
   url =          {http://dx.doi.org/10.1186/s13059-014-0550-8},
   author =       {Love, Michael I. and Huber, Wolfgang and Anders, Simon},
@@ -10,6 +18,16 @@
   Pages =        550,
 }
 
+@article{Soneson2015,
+  url =          {http://dx.doi.org/10.12688/f1000research.7563.1},
+  author =       {Soneson, Charlotte and Love, Michael I. and Robinson, Mark},
+  title =        {{Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences}},
+  journal =      {F1000Research},
+  year =         2015,
+  Volume =       4,
+  Issue =        1521
+}
+
 @article{Anders:2010:GB,
   url =		 {http://genomebiology.com/2010/11/10/R106},
   author =	 {Anders, Simon and Huber, Wolfgang},
@@ -241,3 +259,58 @@ journal = {Bioinformatics}
    Year={2013},
    Month={Nov}
 }
+
+@article{Li2011RSEM,
+   author = {Li, Bo and Dewey, Colin N.},
+   doi = {10.1186/1471-2105-12-323},
+   journal = {BMC Bioinformatics},
+   pages = {323+},
+   title = {{RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome}}, 
+   url = {http://dx.doi.org/10.1186/1471-2105-12-323},
+   volume = {12},
+   year = {2011}
+}
+
+@article{Patro2014Sailfish,
+  author = {Patro, Rob and Mount, Stephen M. and Kingsford, Carl},
+  journal = {Nature Biotechnology},
+  pages = {462--464},
+  title = {{Sailfish enables alignment-free isoform quantification from RNA-seq reads using lightweight algorithms}},
+  url = {http://dx.doi.org/10.1038/nbt.2862},
+  volume = {32},
+  year = {2014}
+}
+
+@article{Patro2015Salmon,
+  author = {Patro, Rob and Duggal, Geet and Kingsford, Carl},
+  journal = {bioRxiv},
+  title = {Salmon: Accurate, Versatile and Ultrafast Quantification from RNA-seq Data using Lightweight-Alignment},
+  url = {http://biorxiv.org/content/early/2015/06/27/021592},
+  year = 2015
+}
+
+@article{Bray2015Near,
+  author = {Bray, Nicolas and Pimentel, Harold and Melsted, Pall and Pachter, Lior},
+  journal = {arXiv},
+  title = {Near-optimal RNA-Seq quantification},
+  url = {http://arxiv.org/abs/1505.02710},
+  year = 2015
+}
+
+@article{Robert2015Errors,
+   author = {Robert, Christelle and Watson, Mick},
+   doi = {10.1186/s13059-015-0734-x},
+   journal = {Genome Biology},
+   title = {{Errors in RNA-Seq quantification affect genes of relevance to human disease}},
+   url = {http://dx.doi.org/10.1186/s13059-015-0734-x},
+   year = {2015}
+}
+
+@article{Trapnell2013Differential,
+   author = {Trapnell, Cole and Hendrickson, David G and Sauvageau, Martin and Goff, Loyal and Rinn, John L and Pachter, Lior},
+   doi = {10.1038/nbt.2450},
+   journal = {Nature Biotechnology},
+   title = {{Differential analysis of gene regulation at transcript resolution with RNA-seq}},
+   url = {http://dx.doi.org/10.1038/nbt.2450},
+   year = {2013}
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/r-bioc-deseq2.git


