[med-svn] [python-pbcore] 01/06: Imported Upstream version 1.2.10+dfsg

Afif Elghraoui afif at moszumanska.debian.org
Sun Oct 9 18:38:06 UTC 2016


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository python-pbcore.

commit a06a1f09e29c192af60fc19fce873086e8343c43
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Sat Oct 8 14:54:03 2016 -0700

    Imported Upstream version 1.2.10+dfsg
---
 CHANGELOG.org                          |   5 +
 pbcore/__init__.py                     |   2 +-
 pbcore/chemistry/resources/mapping.xml |  71 ++++++-
 pbcore/io/align/BamAlignment.py        |   2 +-
 pbcore/io/align/PacBioBamIndex.py      |  17 ++
 pbcore/io/dataset/DataSetErrors.py     |  17 ++
 pbcore/io/dataset/DataSetIO.py         | 313 +++++++++++++++++++++--------
 pbcore/io/dataset/DataSetMembers.py    | 185 ++++++++++++------
 pbcore/io/dataset/DataSetReader.py     |  44 ++++-
 pbcore/io/dataset/utils.py             |  81 +++++++-
 requirements.txt                       |   2 +-
 tests/test_pbcore_io_AlnFileReaders.py |  19 ++
 tests/test_pbdataset.py                | 348 ++++++++++++++++++++++++++++++++-
 tests/test_pbdataset_subtypes.py       | 129 +++++++++++-
 14 files changed, 1073 insertions(+), 162 deletions(-)

diff --git a/CHANGELOG.org b/CHANGELOG.org
index bf187aa..daccd48 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,8 @@
+* Version 1.2.10 (> SMRTanalysis 3.1)
+  - Update to pysam 0.9.0
+  - Recognize new and prospective future part numbers for the 3.1 timeframe
+  - Misc fixes for speeding up dataset and cmp.h5 access
+
 * Version 1.2.9
   - Rename pulseFeature accessors to "baseFeature...", since they
     reflect features accompanying basecalls as opposed to pulse calls;
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index bfe805f..7375688 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
 
-__VERSION__ = "1.2.9"
+__VERSION__ = "1.2.10"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index c40b10a..0f7e95b 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -182,9 +182,78 @@
     <SoftwareVersion>3.0</SoftwareVersion>
   </Mapping>
   <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
     <SequencingKit>100-620-000</SequencingKit>
     <SoftwareVersion>3.1</SoftwareVersion>
   </Mapping>
+
+  <!-- 1.0/1.1: a 1/1 compatible new SK condition -->
+  <Mapping>
+    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
+    <SequencingKit>100-867-300</SequencingKit>
+    <SoftwareVersion>3.1</SoftwareVersion>
+  </Mapping>
+
+  <!-- 1.1/1.1: a 1/1 compatible new BK and SK condition -->
+  <Mapping>
+    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <BindingKit>100-867-500</BindingKit>
+    <SequencingKit>100-867-300</SequencingKit>
+    <SoftwareVersion>3.1</SoftwareVersion>
+  </Mapping>
+
+  <!-- 2.0/2.0: new kits... 1/1 compatible... -->
+  <Mapping>
+    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+    <BindingKit>100-864-400</BindingKit>
+    <SequencingKit>100-864-100</SequencingKit>
+    <SoftwareVersion>3.1</SoftwareVersion>
+  </Mapping>
+
+  <!-- 2.1/2.1, ibid -->
+  <Mapping>
+    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+    <BindingKit>100-864-600</BindingKit>
+    <SequencingKit>100-864-200</SequencingKit>
+    <SoftwareVersion>3.1</SoftwareVersion>
+  </Mapping>
+
+  <!-- 3.2 basecaller support -->
+  <Mapping>
+    <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
+    <SequencingKit>100-620-000</SequencingKit>
+    <SoftwareVersion>3.2</SoftwareVersion>
+  </Mapping>
+
+  <Mapping>
+    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
+    <SequencingKit>100-867-300</SequencingKit>
+    <SoftwareVersion>3.2</SoftwareVersion>
+  </Mapping>
+
+  <Mapping>
+    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <BindingKit>100-867-500</BindingKit>
+    <SequencingKit>100-867-300</SequencingKit>
+    <SoftwareVersion>3.2</SoftwareVersion>
+  </Mapping>
+
+  <Mapping>
+    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+    <BindingKit>100-864-400</BindingKit>
+    <SequencingKit>100-864-100</SequencingKit>
+    <SoftwareVersion>3.2</SoftwareVersion>
+  </Mapping>
+
+  <Mapping>
+    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+    <BindingKit>100-864-600</BindingKit>
+    <SequencingKit>100-864-200</SequencingKit>
+    <SoftwareVersion>3.2</SoftwareVersion>
+  </Mapping>
+
 </MappingTable>
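
The new <Mapping> entries above tie binding-kit and sequencing-kit part numbers (plus a basecaller version) to a chemistry name. A minimal lookup sketch, assuming pbcore's decodeTriple helper is what consults this table; the triple below is one of the 3.1 conditions added above:

    from pbcore.chemistry import decodeTriple

    # (BindingKit, SequencingKit, SoftwareVersion) -> chemistry name
    print(decodeTriple("100-867-500", "100-867-300", "3.1"))  # expected: S/P1-C1
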
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 8428d0f..104e0b8 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -302,7 +302,7 @@ class BamAlignment(AlignmentRecordMixin):
     @property
     @requiresMapping
     def identity(self):
-        if self.hasPbi is not None:
+        if self.hasPbi:
             # Fast (has pbi)
             if self.readLength == 0:
                 return 0.
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index ea7d724..89ae316 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -74,6 +74,10 @@ class PacBioBamIndex(object):
         return (self.pbiFlags & PBI_FLAGS_MAPPED)
 
     @property
+    def hasCoordinateSortedInfo(self):
+        return (self.pbiFlags & PBI_FLAGS_COORDINATE_SORTED)
+
+    @property
     def hasBarcodeInfo(self):
         return (self.pbiFlags & PBI_FLAGS_BARCODE)
 
@@ -99,6 +103,11 @@ class PacBioBamIndex(object):
             ("nMM"               , "u4"),
             ("mapQV"             , "u1") ]
 
+        COORDINATE_SORTED_DTYPE = [
+            ("tId"               , "u4"),
+            ("beginRow"          , "u4"),
+            ("endRow"            , "u4")]
+
         BARCODE_INDEX_DTYPE = [
             ("bcForward"         , "i2"),
             ("bcReverse"         , "i2"),
@@ -135,6 +144,14 @@ class PacBioBamIndex(object):
             tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM
             tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM
 
+        # TODO: do something with these:
+        # TODO: remove nReads check when the rest of this code can handle empty
+        # mapped bam files (columns are missing, flags don't reflect that)
+        if self.hasCoordinateSortedInfo and self.nReads:
+            ntId = int(peek("u4", 1))
+            for columnName, columnType in COORDINATE_SORTED_DTYPE:
+                peek(columnType, ntId)
+
         if self.hasBarcodeInfo:
             for columnName, columnType in BARCODE_INDEX_DTYPE:
                 tbl[columnName] = peek(columnType, self.nReads)
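
The index reader above now recognizes a coordinate-sorted section in the .pbi (tId/beginRow/endRow per reference), although the parsed columns are not yet stored anywhere. A minimal sketch of checking for that section, assuming the existing PacBioBamIndex constructor; the .pbi path is a placeholder:

    from pbcore.io.align.PacBioBamIndex import PacBioBamIndex

    pbi = PacBioBamIndex("aligned.subreads.bam.pbi")  # placeholder path
    if pbi.hasCoordinateSortedInfo:
        print("pbi carries coordinate-sorted reference ranges")
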
diff --git a/pbcore/io/dataset/DataSetErrors.py b/pbcore/io/dataset/DataSetErrors.py
new file mode 100644
index 0000000..4695df9
--- /dev/null
+++ b/pbcore/io/dataset/DataSetErrors.py
@@ -0,0 +1,17 @@
+
+class InvalidDataSetIOError(Exception):
+    """The base class for all DataSetIO related custom exceptions
+    """
+
+
+class ResourceMismatchError(InvalidDataSetIOError):
+
+    def __init__(self, responses):
+        super(ResourceMismatchError, self).__init__()
+        self.responses = responses
+
+    def __str__(self):
+        return "Resources responded differently: " + ', '.join(
+            map(str, self.responses))
+
+
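
These exceptions now live in their own module so DataSetIO and DataSetReader can share them. A minimal sketch of catching ResourceMismatchError when pooled resources give conflicting answers (e.g. mixed chemistries); the XML path is a placeholder:

    from pbcore.io import AlignmentSet
    from pbcore.io.dataset.DataSetErrors import ResourceMismatchError

    aln = AlignmentSet("mixed.alignmentset.xml")  # placeholder path
    try:
        print(aln.sequencingChemistry)  # polls every resource and unifies
    except ResourceMismatchError as e:
        print("resources disagree: {e}".format(e=e))
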
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index a5edaef..cbde652 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -8,6 +8,7 @@ import copy
 import os, sys
 import re
 import errno
+import uuid
 import logging
 import itertools
 import xml.dom.minidom
@@ -26,7 +27,8 @@ from pbcore.io import (BaxH5Reader, FastaReader, IndexedFastaReader,
 from pbcore.io.align._BamSupport import UnavailableFeature
 from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet,
                                              resolveLocation, xmlRootType,
-                                             wrapNewResource, openFofnFile)
+                                             wrapNewResource, openFofnFile,
+                                             parseMetadata)
 from pbcore.io.dataset.DataSetWriter import toXml
 from pbcore.io.dataset.DataSetValidator import validateString
 from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
@@ -36,7 +38,10 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
                                               ExternalResources,
                                               ExternalResource, Filters)
 from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
-                                     _indexBam, _indexFasta)
+                                     _indexBam, _indexFasta, _fileCopy,
+                                     _swapPath, which, consolidateXml)
+from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,
+                                             ResourceMismatchError)
 
 
 log = logging.getLogger(__name__)
@@ -111,13 +116,38 @@ def isDataSet(xmlfile):
     except Exception:
         return False
 
-def openDataSet(*files, **kwargs):
-    """Factory function for DataSet types as suggested by the first file"""
+def getDataSetUuid(xmlfile):
+    """
+    Quickly retrieve the uuid from the root element of a dataset XML file,
+    using a streaming parser to avoid loading the entire dataset into memory.
+    Returns None if the parsing fails.
+    """
+    try:
+        import xml.etree.cElementTree as ET
+        for event, element in ET.iterparse(xmlfile, events=("start",)):
+            return element.get("UniqueId")
+    except Exception:
+        return None
+
+
+def getDataSetMetaType(xmlfile):
+    """
+    Quickly retrieve the MetaType from the root element of a dataset XML file,
+    using a streaming parser to avoid loading the entire dataset into memory.
+    Returns None if the parsing fails.
+    """
     try:
-        tbrType = _typeDataSet(files[0])
-        return tbrType(*files, **kwargs)
+        import xml.etree.cElementTree as ET
+        for event, element in ET.iterparse(xmlfile, events=("start",)):
+            return element.get("MetaType")
     except Exception:
-        raise TypeError("openDataSet requires that the first file is a DS")
+        return None
+
+
+def openDataSet(*files, **kwargs):
+    """Factory function for DataSet types as suggested by the first file"""
+    tbrType = _typeDataSet(files[0])
+    return tbrType(*files, **kwargs)
 
 def openDataFile(*files, **kwargs):
     """Factory function for DataSet types determined by the first data file"""
@@ -194,6 +224,12 @@ def _flatten(lol, times=1):
         lol = np.concatenate(lol)
     return lol
 
+def _ranges_in_list(alist):
+    """Takes a sorted list, finds the boundaries of runs of each value"""
+    unique, indices, counts = np.unique(np.array(alist), return_index=True,
+                                        return_counts=True)
+    return {u: (i, i + c) for u, i, c in zip(unique, indices, counts)}
+
 def divideKeys(keys, chunks):
     if chunks < 1:
         return []
@@ -281,6 +317,24 @@ def _pathChanger(osPathFunc, metaTypeFunc, resource):
         resource.resourceId = currentPath
         metaTypeFunc(resource)
 
+def _copier(dest, resource, subfolder=None):
+    """Apply these two functions to the resource or ResourceId"""
+    if subfolder is None:
+        subfolder = [uuid.uuid4()]
+    resId = resource.resourceId
+    currentPath = urlparse(resId)
+    if currentPath.scheme == 'file' or not currentPath.scheme:
+        currentPath = currentPath.path
+        try:
+            currentPath = _fileCopy(dest, currentPath, uuid=resource.uniqueId)
+            subfolder[0] = resource.uniqueId
+        except AttributeError:
+            if subfolder:
+                currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
+            else:
+                raise
+        resource.resourceId = currentPath
+
 
 class DataSet(object):
     """The record containing the DataSet information, with possible type
@@ -425,10 +479,13 @@ class DataSet(object):
                         break
                 if not found:
                     allowed = self._metaTypeMapping().keys()
+                    extension = fname.split('.')[-1]
                     raise IOError(errno.EIO,
                                   "Cannot create {c} with resource of type "
-                                  "{t}, only {a}".format(c=dsType, t=fname,
-                                                         a=allowed))
+                                  "'{t}' ({f}), only {a}".format(c=dsType,
+                                                           t=extension,
+                                                           f=fname,
+                                                           a=allowed))
 
         # State tracking:
         self._cachedFilters = []
@@ -520,27 +577,28 @@ class DataSet(object):
             # dataset for the first time
             firstIn = True if len(self.externalResources) == 0 else False
 
+            if copyOnMerge:
+                result = self.copy()
+            else:
+                result = self
+
             # Block on filters?
             if (not firstIn and
                     not self.filters.testCompatibility(other.filters)):
                 log.warning("Filter incompatibility has blocked the merging "
                             "of two datasets")
                 return None
-            else:
-                self.addFilters(other.filters, underConstruction=True)
+            elif firstIn:
+                result.addFilters(other.filters, underConstruction=True)
 
             # reset the filters, just in case
-            self._cachedFilters = []
+            result._cachedFilters = []
 
             # block on object metadata?
-            self._checkObjMetadata(other.objMetadata)
+            result._checkObjMetadata(other.objMetadata)
 
             # There is probably a cleaner way to do this:
-            self.objMetadata.update(other.objMetadata)
-            if copyOnMerge:
-                result = self.copy()
-            else:
-                result = self
+            result.objMetadata.update(other.objMetadata)
 
             # If this dataset has no subsets representing it, add self as a
             # subdataset to the result
@@ -683,6 +741,22 @@ class DataSet(object):
             self.objMetadata['UniqueId'] = newId
         return newId
 
+    def copyTo(self, dest, relative=False):
+        """Doesn't resolve resource name collisions"""
+        ofn = None
+        dest = os.path.abspath(dest)
+        if not os.path.isdir(dest):
+            ofn = dest
+            dest = os.path.split(dest)[0]
+        # unfortunately file indices must have the same name as the file they
+        # index, so we carry around some state to store the most recent uuid
+        # seen. Good thing we do a depth first traversal!
+        state = [self.uuid]
+        resFunc = partial(_copier, dest, subfolder=state)
+        self._modResources(resFunc)
+        if not ofn is None:
+            self.write(ofn, relPaths=relative)
+
     def copy(self, asType=None):
         """Deep copy the representation of this DataSet
 
@@ -958,15 +1032,16 @@ class DataSet(object):
             results.append(newCopy)
         return results
 
-    def write(self, outFile, validate=True, modPaths=False,
-              relPaths=False, pretty=True):
+    def write(self, outFile, validate=True, modPaths=None,
+              relPaths=None, pretty=True):
         """Write to disk as an XML file
 
         Args:
             :outFile: The filename of the xml file to be created
             :validate: T/F (True) validate the ExternalResource ResourceIds
-            :relPaths: T/F (False) make the ExternalResource ResourceIds
-                       relative instead of absolute filenames
+            :relPaths: T/F (None/no change) make the ExternalResource
+                       ResourceIds relative instead of absolute filenames
+            :modPaths: DEPRECATED (T/F) allow paths to be modified
 
         Doctest:
             >>> import pbcore.data.datasets as data
@@ -980,8 +1055,17 @@ class DataSet(object):
             >>> ds1 == ds2
             True
         """
+        if not modPaths is None:
+            log.info("modPaths as a write argument is deprecated. Paths "
+                     "aren't modified unless relPaths is explicitly set "
+                     "to True or False. Will be removed in future versions.")
+            # make sure we keep the same effect for now, in case someone has
+            # something odd like modPaths=False, relPaths=True
+            if not modPaths:
+                relPaths = None
+
         # fix paths if validate:
-        if validate and modPaths:
+        if validate and not relPaths is None:
             if relPaths:
                 self.makePathsRelative(os.path.dirname(outFile))
             else:
@@ -1052,6 +1136,25 @@ class DataSet(object):
         else:
             self.metadata.append(statsMetadata)
 
+    def readsByName(self, query):
+        reads = _flatten([rr.readsByName(query)
+                          for rr in self.resourceReaders()])
+        return sorted(reads, key=lambda a: a.readStart)
+
+    def loadMetadata(self, filename):
+        """Load pipeline metadata from a <moviename>.run.metadata.xml file.
+
+        Args:
+            :filename: the filename of a <moviename>.run.metadata.xml file
+
+        """
+        if isinstance(filename, basestring):
+            metadata = parseMetadata(str(filename))
+        else:
+            metadata = filename
+        self.addMetadata(metadata)
+        self.updateCounts()
+
     def processFilters(self):
         """Generate a list of functions to apply to a read, all of which return
         T/F. Each function is an OR filter, so any() true passes the read.
@@ -1303,7 +1406,6 @@ class DataSet(object):
         """
         self.metadata.totalLength = -1
         self.metadata.numRecords = -1
-        self._countsUpdated = True
 
     def addExternalResources(self, newExtResources, updateCount=True):
         """Add additional ExternalResource objects, ensuring no duplicate
@@ -1685,6 +1787,15 @@ class DataSet(object):
         responses = self._pollResources(lambda x: x.baseFeaturesAvailable())
         return self._unifyResponses(responses)
 
+    def hasPulseFeature(self, featureName):
+        responses = self._pollResources(
+            lambda x: x.hasPulseFeature(featureName))
+        return self._unifyResponses(responses)
+
+    def pulseFeaturesAvailable(self):
+        responses = self._pollResources(lambda x: x.pulseFeaturesAvailable())
+        return self._unifyResponses(responses)
+
     @property
     def sequencingChemistry(self):
         responses = self._pollResources(lambda x: x.sequencingChemistry)
@@ -1753,20 +1864,26 @@ class DataSet(object):
     def __getitem__(self, index):
         """Should respect filters for free, as _indexMap should only be
         populated by filtered reads. Only pbi filters considered, however."""
+        if self._indexMap is None:
+            _ = self.index
         if isinstance(index, int):
-            if self._indexMap is None:
-                _ = self.index
             # support negatives
             if index < 0:
                 index = len(self.index) + index
             rrNo, recNo = self._indexMap[index]
             return self.resourceReaders()[rrNo][recNo]
         elif isinstance(index, slice):
-            raise NotImplementedError()
+            indexTuples = self._indexMap[index]
+            return [self.resourceReaders()[ind[0]][ind[1]] for ind in
+                    indexTuples]
         elif isinstance(index, list):
             indexTuples = [self._indexMap[ind] for ind in index]
             return [self.resourceReaders()[ind[0]][ind[1]] for ind in
                     indexTuples]
+        elif isinstance(index, np.ndarray):
+            indexTuples = self._indexMap[index]
+            return [self.resourceReaders()[ind[0]][ind[1]] for ind in
+                    indexTuples]
         elif isinstance(index, str):
             if 'id' in self.index.dtype.names:
                 row = np.nonzero(self.index.id == index)[0][0]
@@ -1775,22 +1892,6 @@ class DataSet(object):
                 raise NotImplementedError()
 
 
-class InvalidDataSetIOError(Exception):
-    """The base class for all DataSetIO related custom exceptions
-    """
-
-
-class ResourceMismatchError(InvalidDataSetIOError):
-
-    def __init__(self, responses):
-        super(ResourceMismatchError, self).__init__()
-        self.responses = responses
-
-    def __str__(self):
-        return "Resources responded differently: " + ', '.join(
-            map(str, self.responses))
-
-
 class ReadSet(DataSet):
     """Base type for read sets, should probably never be used as a concrete
     class"""
@@ -1809,7 +1910,8 @@ class ReadSet(DataSet):
             if not res.bai:
                 newInds.append(_indexBam(fname))
                 self.close()
-            res.addIndices(newInds)
+            if newInds:
+                res.addIndices(newInds)
         self._populateMetaTypes()
         self.updateCounts()
 
@@ -1852,7 +1954,15 @@ class ReadSet(DataSet):
             if refFile:
                 if not refFile in sharedRefs:
                     log.debug("Using reference: {r}".format(r=refFile))
-                    sharedRefs[refFile] = IndexedFastaReader(refFile)
+                    try:
+                        sharedRefs[refFile] = IndexedFastaReader(refFile)
+                    except IOError:
+                        if not self._strict:
+                            log.warn("Problem opening reference with "
+                                     "IndexedFastaReader")
+                            sharedRefs[refFile] = None
+                        else:
+                            raise
             location = urlparse(extRes.resourceId).path
             resource = None
             try:
@@ -2230,30 +2340,61 @@ class ReadSet(DataSet):
             :numFiles: The number of data files to be produced.
 
         """
-        if numFiles > 1:
-            assert len(self.resourceReaders()) == len(self.toExternalFiles())
-            resSizes = [[i, size[0], size[1]]
-                        for i, size in enumerate(self._resourceSizes())]
-            chunks = self._chunkList(resSizes, numFiles, lambda x: x[1])
-            resLists = []
-            for chunk in chunks:
-                thisResList = []
-                for i in chunk:
-                    thisResList.append(self.toExternalFiles()[i[0]])
-                resLists.append(thisResList)
-            fnames = [_infixFname(dataFile, str(i)) for i in range(numFiles)]
-            for resList, fname in zip(resLists, fnames):
-                consolidateBams(resList, fname, filterDset=self, useTmp=useTmp)
+        references = [er.reference for er in self.externalResources if
+                      er.reference]
+        if which('pbmerge'):
+            log.debug("Using pbmerge to consolidate")
+            dsets = self.split(zmws=True, chunks=numFiles)
+            if numFiles > 1:
+                fnames = [_infixFname(dataFile, str(i))
+                          for i in range(numFiles)]
+            else:
+                fnames = [dataFile]
+            for chunk, fname in zip(dsets, fnames):
+                consolidateXml(chunk, fname, useTmp=useTmp)
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
             self.addExternalResources(fnames)
+            self.induceIndices()
         else:
-            consolidateBams(self.toExternalFiles(), dataFile, filterDset=self,
-                            useTmp=useTmp)
-            # TODO: add as subdatasets
-            log.debug("Replacing resources")
-            self.externalResources = ExternalResources()
-            self.addExternalResources([dataFile])
+            if numFiles > 1:
+                assert (len(self.resourceReaders()) ==
+                        len(self.toExternalFiles()))
+                resSizes = [[i, size[0], size[1]]
+                            for i, size in enumerate(self._resourceSizes())]
+                chunks = self._chunkList(resSizes, numFiles, lambda x: x[1])
+                resLists = []
+                for chunk in chunks:
+                    thisResList = []
+                    for i in chunk:
+                        thisResList.append(self.toExternalFiles()[i[0]])
+                    resLists.append(thisResList)
+                fnames = [_infixFname(dataFile, str(i))
+                          for i in range(numFiles)]
+                for resList, fname in zip(resLists, fnames):
+                    consolidateBams(resList, fname, filterDset=self,
+                                    useTmp=useTmp)
+                log.debug("Replacing resources")
+                self.externalResources = ExternalResources()
+                self.addExternalResources(fnames)
+            else:
+                consolidateBams(self.toExternalFiles(), dataFile,
+                                filterDset=self, useTmp=useTmp)
+                # TODO: remove subdatasets?
+                log.debug("Replacing resources")
+                self.externalResources = ExternalResources()
+                self.addExternalResources([dataFile])
+        # make sure reference gets passed through:
+        if references:
+            refCounts = dict(Counter(references))
+            if len(refCounts) > 1:
+                log.warn("Consolidating AlignmentSets with "
+                         "different references, but BamReaders "
+                         "can only have one. References will be "
+                         "lost")
+            else:
+                for extres in self.externalResources:
+                    extres.reference = refCounts.keys()[0]
         # reset the indexmap especially, as it is out of date:
         self._index = None
         self._indexMap = None
@@ -2261,7 +2402,6 @@ class ReadSet(DataSet):
         self._populateMetaTypes()
 
     def updateCounts(self):
-        self._countsUpdated = True
         if self._skipCounts:
             log.debug("SkipCounts is true, skipping updateCounts()")
             self.metadata.totalLength = -1
@@ -2273,6 +2413,7 @@ class ReadSet(DataSet):
             numRecords, totalLength = self._length
             self.metadata.totalLength = totalLength
             self.metadata.numRecords = numRecords
+            self._countsUpdated = True
         except (IOError, UnavailableFeature):
             if not self._strict:
                 log.debug("File problem, metadata not populated")
@@ -2331,7 +2472,6 @@ class HdfSubreadSet(ReadSet):
 
     def updateCounts(self):
         """Overriding here so we don't have to assertIndexed"""
-        self._countsUpdated = True
         if self._skipCounts:
             log.debug("SkipCounts is true, skipping updateCounts()")
             self.metadata.totalLength = -1
@@ -2342,6 +2482,7 @@ class HdfSubreadSet(ReadSet):
             numRecords, totalLength = self._length
             self.metadata.totalLength = totalLength
             self.metadata.numRecords = numRecords
+            self._countsUpdated = True
         except (IOError, UnavailableFeature):
             if not self._strict:
                 log.debug("File problem, metadata not populated")
@@ -2516,7 +2657,7 @@ class AlignmentSet(ReadSet):
 
         """
         recArrays = []
-        log.debug("Processing resource pbis")
+        log.debug("Processing resource indices")
         _indexMap = []
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.index
@@ -2560,11 +2701,13 @@ class AlignmentSet(ReadSet):
                                                 'tEnd',))
             tbr = tbr[sort_order]
             self._indexMap = self._indexMap[sort_order]
+            ranges = _ranges_in_list(tbr.RefGroupID)
             for ref in self.referenceInfoTable:
-                hits = np.flatnonzero(tbr.RefGroupID == ref.ID)
-                if len(hits) != 0:
-                    ref.StartRow = hits[0]
-                    ref.EndRow = hits[-1]
+                bounds = ranges.get(ref.ID)
+                if bounds:
+                    ref.StartRow = bounds[0]
+                    # we want the ranges to be inclusive:
+                    ref.EndRow = bounds[1] - 1
                 # and fix the naming scheme while we're at it
                 ref.Name = self._cleanCmpName(ref.FullName)
         return tbr
@@ -3721,11 +3864,21 @@ class ContigSet(DataSet):
             self._populateContigMetadata()
 
     def _populateContigMetadata(self):
+        log.debug("Adding contig metadata...")
+        numrec = 0
+        totlen = 0
         for contig in self.contigs:
             self._metadata.addContig(contig)
+            numrec += 1
+            totlen += len(contig)
+        if not self._countsUpdated:
+            log.debug("Counts updated: numrec={n} totlen={l}".format(n=numrec,
+                                                                     l=totlen))
+            self.numRecords = numrec
+            self.totalLength = totlen
+            self._countsUpdated = True
 
     def updateCounts(self):
-        self._countsUpdated = True
         if self._skipCounts:
             if not self.metadata.totalLength:
                 self.metadata.totalLength = -1
@@ -3733,14 +3886,17 @@ class ContigSet(DataSet):
                 self.metadata.numRecords = -1
             return
         if not self.isIndexed:
-            log.info("Cannot updateCounts without an index file")
-            self.metadata.totalLength = 0
-            self.metadata.numRecords = 0
+            if (not self.totalLength and not self.numRecords and not
+                    self._countsUpdated):
+                log.info("Cannot updateCounts without an index file")
+                self.metadata.totalLength = 0
+                self.metadata.numRecords = 0
             return
         try:
             log.debug('Updating counts')
             self.metadata.totalLength = sum(self.index.length)
             self.metadata.numRecords = len(self.index)
+            self._countsUpdated = True
         except (IOError, UnavailableFeature, TypeError):
             # IOError for missing files
             # UnavailableFeature for files without companion files
@@ -3906,8 +4062,9 @@ class ContigSet(DataSet):
         # index file
         names = []
         for contig in self.contigs:
-            if (self.noFiltering
-                    or self._filters.testParam('id', contig.id, str)):
+            if self.noFiltering:
+                names.append(contig.id)
+            elif self._filters.testParam('id', contig.id, str):
                 names.append(contig.id)
         return sorted(list(set(names)))
 
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 05303a7..99090ae 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -205,20 +205,32 @@ class RecordWrapper(object):
             **repr_d)
         return rep
 
+    def __eq__(self, other):
+        """Does not take child order into account!!!!"""
+        if (sorted([c.record['tag'] for c in self]) !=
+                sorted([c.record['tag'] for c in other])):
+            return False
+        if self.__class__.__name__ != other.__class__.__name__:
+            return False
+        for attrib in ['metaname', 'namespace', 'metavalue', 'metadata']:
+            if getattr(self, attrib) != getattr(other, attrib):
+                return False
+        return True
+
     def pop(self, index):
         return self.record['children'].pop(index)
 
     def merge(self, other):
         pass
 
-    def getMemberV(self, tag, container='text'):
+    def getMemberV(self, tag, container='text', default=None, asType=str):
         """Generic accessor for the contents of the children of this element,
         without having to interface with them directly"""
         try:
-            return self.record['children'][self.index(str(tag))][
-                str(container)]
-        except KeyError:
-            return None
+            return asType(self.record['children'][self.index(str(tag))][
+                str(container)])
+        except (KeyError, ValueError):
+            return default
 
     def setMemberV(self, tag, value, container='text'):
         """Generic accessor for the contents of the children of this element,
@@ -420,8 +432,8 @@ class Filters(RecordWrapper):
         if len(self) == 0:
             return True
         return all([sFilt == oFilt for sFilt, oFilt in zip(
-            sorted(list(self)),
-            sorted(list(other)))])
+            sorted(list(self), key=lambda x: x.metaname),
+            sorted(list(other), key=lambda x: x.metaname))])
 
     def __nonzero__(self):
         for filt in self:
@@ -557,6 +569,8 @@ class Filters(RecordWrapper):
     def _bamTypeMap(self):
         return {'rname': str,
                 'length': int,
+                'qstart': int,
+                'qend': int,
                 'qname': str,
                 'movie': str,
                 'zm': int,
@@ -622,6 +636,7 @@ class Filters(RecordWrapper):
                 # renamed:
                 accMap['movie'] = (lambda x: x.MovieID)
                 accMap['qname'] = (lambda x: x.MovieID)
+                accMap['zm'] = (lambda x: x.HoleNumber)
         elif readType == 'fasta':
             accMap = {'id': (lambda x: x.id),
                       'length': (lambda x: x.length.astype(int)),
@@ -654,6 +669,29 @@ class Filters(RecordWrapper):
                         operator = mapOp(req.operator)
                         reqResultsForRecords &= operator(
                             accMap[param](indexRecords), value)
+                    elif param == 'qname':
+                        movie, hole, span = value.split('/')
+                        operator = mapOp(req.operator)
+
+                        value = movieMap[movie]
+                        reqResultsForRecords = operator(
+                            accMap[param](indexRecords), value)
+
+                        param = 'zm'
+                        value = typeMap[param](hole)
+                        reqResultsForRecords &= operator(
+                            accMap[param](indexRecords), value)
+
+                        param = 'qstart'
+                        value = typeMap[param](span.split('_')[0])
+                        reqResultsForRecords &= operator(
+                            accMap[param](indexRecords), value)
+
+                        param = 'qend'
+                        value = typeMap[param](span.split('_')[1])
+                        reqResultsForRecords &= operator(
+                            accMap[param](indexRecords), value)
+
                     else:
                         operator = mapOp(req.operator)
                         reqResultsForRecords = operator(
@@ -977,6 +1015,10 @@ class ExternalResource(RecordWrapper):
             return True
         return False
 
+    @property
+    def uniqueId(self):
+        return self.getV('attrib', 'UniqueId')
+
     def merge(self, other):
         if self.metaType:
             if self.metaType != other.metaType:
@@ -1052,6 +1094,14 @@ class ExternalResource(RecordWrapper):
         self._setSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile', value)
 
     @property
+    def control(self):
+        return self._getSubResByMetaType('PacBio.SubreadFile.Control.SubreadBamFile')
+
+    @control.setter
+    def control(self, value):
+        self._setSubResByMetaType('PacBio.SubreadFile.Control.SubreadBamFile', value)
+
+    @property
     def barcodes(self):
         return self._getSubResByMetaType("PacBio.DataSet.BarcodeSet")
 
@@ -1076,8 +1126,11 @@ class ExternalResource(RecordWrapper):
                 return res.resourceId
 
     def _setSubResByMetaType(self, mType, value):
-        tmp = ExternalResource()
-        tmp.resourceId = value
+        if not isinstance(value, ExternalResource):
+            tmp = ExternalResource()
+            tmp.resourceId = value
+        else:
+            tmp = value
         tmp.metaType = mType
         tmp.timeStampedName = getTimeStampedName(mType)
         resources = self.externalResources
@@ -1122,11 +1175,11 @@ class ExternalResource(RecordWrapper):
             fileIndices = FileIndices(fileIndices[0])
         else:
             fileIndices = FileIndices()
+            self.append(fileIndices)
         for index in list(indices):
             temp = FileIndex()
             temp.resourceId = index
             fileIndices.append(temp)
-        self.append(fileIndices)
 
 class FileIndices(RecordWrapper):
 
@@ -1203,10 +1256,7 @@ class DataSetMetadata(RecordWrapper):
     def numRecords(self):
         """Return the number of records in a DataSet using helper functions
         defined in the base class"""
-        try:
-            return int(self.getMemberV('NumRecords'))
-        except ValueError:
-            return 0
+        return self.getMemberV('NumRecords', default=0, asType=int)
 
     @numRecords.setter
     def numRecords(self, value):
@@ -1218,10 +1268,7 @@ class DataSetMetadata(RecordWrapper):
         """Return the TotalLength property of this dataset.
         TODO: update the value from the actual external reference on
         ValueError"""
-        try:
-            return int(self.getMemberV('TotalLength'))
-        except ValueError:
-            return 0
+        return self.getMemberV('TotalLength', default=0, asType=int)
 
     @totalLength.setter
     def totalLength(self, value):
@@ -1320,10 +1367,7 @@ class ContigSetMetadata(DataSetMetadata):
 
     @property
     def organism(self):
-        try:
-            return self.getMemberV('Organism')
-        except ValueError:
-            return None
+        return self.getMemberV('Organism')
 
     @organism.setter
     def organism(self, value):
@@ -1331,10 +1375,7 @@ class ContigSetMetadata(DataSetMetadata):
 
     @property
     def ploidy(self):
-        try:
-            return self.getMemberV('Ploidy')
-        except ValueError:
-            return None
+        return self.getMemberV('Ploidy')
 
     @ploidy.setter
     def ploidy(self, value):
@@ -1381,10 +1422,7 @@ class BarcodeSetMetadata(DataSetMetadata):
 
     @property
     def barcodeConstruction(self):
-        try:
-            return self.getMemberV('BarcodeConstruction')
-        except ValueError:
-            return None
+        return self.getMemberV('BarcodeConstruction')
 
     @barcodeConstruction.setter
     def barcodeConstruction(self, value):
@@ -1550,32 +1588,35 @@ class StatsMetadata(RecordWrapper):
     """The metadata from the machine sts.xml"""
 
     def merge(self, other):
-        self.shortInsertFraction = (self.shortInsertFraction *
-                                    self.prodDist.bins[1] +
-                                    other.shortInsertFraction *
-                                    other.prodDist.bins[1])/(
-                                        self.prodDist.bins[1]
-                                        + other.prodDist.bins[1])
-        self.adapterDimerFraction = (self.adapterDimerFraction *
-                                     self.prodDist.bins[1] +
-                                     other.adapterDimerFraction *
-                                     other.prodDist.bins[1])/(
-                                         self.prodDist.bins[1]
-                                         + other.prodDist.bins[1])
+        if other.shortInsertFraction and other.prodDist:
+            self.shortInsertFraction = (self.shortInsertFraction *
+                                        self.prodDist.bins[1] +
+                                        other.shortInsertFraction *
+                                        other.prodDist.bins[1])/(
+                                            self.prodDist.bins[1]
+                                            + other.prodDist.bins[1])
+        if other.adapterDimerFraction and other.prodDist:
+            self.adapterDimerFraction = (self.adapterDimerFraction *
+                                         self.prodDist.bins[1] +
+                                         other.adapterDimerFraction *
+                                         other.prodDist.bins[1])/(
+                                             self.prodDist.bins[1]
+                                             + other.prodDist.bins[1])
         self.numSequencingZmws += other.numSequencingZmws
         for dist in DISTLIST:
             selfDist = getattr(self, dist[0].lower() + dist[1:])
             otherDist = getattr(other, dist[0].lower() + dist[1:])
-            try:
-                selfDist.merge(otherDist)
-            except ZeroBinWidthError as e:
-                removed = self.removeChildren(dist)
-                self.append(otherDist)
-            except BinMismatchError:
-                self.append(otherDist)
-            except ValueError:
+            if not selfDist:
                 if otherDist:
                     self.append(otherDist)
+            else:
+                try:
+                    selfDist.merge(otherDist)
+                except ZeroBinWidthError as e:
+                    removed = self.removeChildren(dist)
+                    self.append(otherDist)
+                except BinMismatchError:
+                    self.append(otherDist)
 
     @property
     def prodDist(self):
@@ -1609,10 +1650,20 @@ class StatsMetadata(RecordWrapper):
                                                 'InsertReadQualDist'))
 
     @property
+    def insertReadLenDists(self):
+        return [ContinuousDistribution(child) for child in
+                self.findChildren('InsertReadLenDist')]
+
+    @property
     def insertReadLenDist(self):
         return ContinuousDistribution(self.getV('children',
                                                 'InsertReadLenDist'))
     @property
+    def insertReadQualDists(self):
+        return [ContinuousDistribution(child) for child in
+                self.findChildren('InsertReadQualDist')]
+
+    @property
     def controlReadQualDist(self):
         return ContinuousDistribution(self.getV('children',
                                                 'ControlReadQualDist'))
@@ -1633,7 +1684,7 @@ class StatsMetadata(RecordWrapper):
 
     @property
     def adapterDimerFraction(self):
-        return float(self.getMemberV('AdapterDimerFraction'))
+        return self.getMemberV('AdapterDimerFraction', asType=float)
 
     @adapterDimerFraction.setter
     def adapterDimerFraction(self, value):
@@ -1641,7 +1692,7 @@ class StatsMetadata(RecordWrapper):
 
     @property
     def numSequencingZmws(self):
-        return float(self.getMemberV('NumSequencingZmws'))
+        return self.getMemberV('NumSequencingZmws', asType=float)
 
     @numSequencingZmws.setter
     def numSequencingZmws(self, value):
@@ -1649,7 +1700,7 @@ class StatsMetadata(RecordWrapper):
 
     @property
     def shortInsertFraction(self):
-        return float(self.getMemberV('ShortInsertFraction'))
+        return self.getMemberV('ShortInsertFraction', asType=float)
 
     @shortInsertFraction.setter
     def shortInsertFraction(self, value):
@@ -1697,7 +1748,7 @@ class ContinuousDistribution(RecordWrapper):
 
     @property
     def numBins(self):
-        return int(self.getMemberV('NumBins'))
+        return self.getMemberV('NumBins', asType=int)
 
     @numBins.setter
     def numBins(self, value):
@@ -1705,27 +1756,27 @@ class ContinuousDistribution(RecordWrapper):
 
     @property
     def sampleSize(self):
-        return int(self.getMemberV('SampleSize'))
+        return self.getMemberV('SampleSize', asType=int)
 
     @property
     def sampleMean(self):
-        return float(self.getMemberV('SampleMean'))
+        return self.getMemberV('SampleMean', asType=float)
 
     @property
     def sampleMed(self):
-        return float(self.getMemberV('SampleMed'))
+        return self.getMemberV('SampleMed', asType=float)
 
     @property
     def sampleStd(self):
-        return float(self.getMemberV('SampleStd'))
+        return self.getMemberV('SampleStd', asType=float)
 
     @property
     def sample95thPct(self):
-        return float(self.getMemberV('Sample95thPct'))
+        return self.getMemberV('Sample95thPct', asType=float)
 
     @property
     def binWidth(self):
-        return float(self.getMemberV('BinWidth'))
+        return self.getMemberV('BinWidth', asType=float)
 
     @binWidth.setter
     def binWidth(self, value):
@@ -1733,15 +1784,15 @@ class ContinuousDistribution(RecordWrapper):
 
     @property
     def minOutlierValue(self):
-        return float(self.getMemberV('MinOutlierValue'))
+        return self.getMemberV('MinOutlierValue', asType=float)
 
     @property
     def maxOutlierValue(self):
-        return float(self.getMemberV('MaxOutlierValue'))
+        return self.getMemberV('MaxOutlierValue', asType=float)
 
     @property
     def minBinValue(self):
-        return float(self.getMemberV('MinBinValue'))
+        return self.getMemberV('MinBinValue', asType=float)
 
     @minBinValue.setter
     def minBinValue(self, value):
@@ -1749,7 +1800,7 @@ class ContinuousDistribution(RecordWrapper):
 
     @property
     def maxBinValue(self):
-        return float(self.getMemberV('MaxBinValue'))
+        return self.getMemberV('MaxBinValue', asType=float)
 
     @maxBinValue.setter
     def maxBinValue(self, value):
@@ -1838,7 +1889,7 @@ class DiscreteDistribution(RecordWrapper):
 
     @property
     def numBins(self):
-        return int(self.getMemberV('NumBins'))
+        return self.getMemberV('NumBins', asType=int)
 
     @property
     def bins(self):
@@ -1876,6 +1927,10 @@ class RunDetailsMetadata(RecordWrapper):
     def name(self):
         return self.getMemberV('Name')
 
+    @name.setter
+    def name(self, value):
+        return self.setMemberV('Name', value)
+
 
 class WellSampleMetadata(RecordWrapper):
 
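The Filters changes above add qstart/qend to the BAM type map and expand a qname requirement into movie, hole and query-span comparisons. A minimal sketch, assuming addRequirement accepts a qname parameter the same way it accepts rq in the tests below; the file name and query name are placeholders:

    from pbcore.io import SubreadSet

    sset = SubreadSet("movie.subreadset.xml")  # placeholder path
    sset.filters.addRequirement(
        qname=[('==', 'm54008_160308_002050/4194415/0_100')])  # movie/hole/qstart_qend
    print(len(sset.index))  # pbi rows surviving the filter
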
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index c33ddb1..1ba4c27 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -11,6 +11,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
                                               Filters, AutomationParameters,
                                               StatsMetadata, DISTLIST)
 from pbcore.io.dataset.DataSetWriter import _eleFromDictList
+from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
 
 log = logging.getLogger(__name__)
 
@@ -98,10 +99,22 @@ def _addUnknownFile(dset, path):
     else:
         return _addGenericFile(dset, path)
 
+SUB_RESOURCES = ['.scraps.bam', '.control.subreads.bam']
+FILE_INDICES = ['.fai', '.pbi', '.bai', '.metadata.xml',
+                '.index', '.contig.index', '.sa']
+
 # TODO needs namespace
 def _addGenericFile(dset, path):
     """Create and populate an Element object, put it in an available members
     dictionary, return"""
+    # filter out resource file types that aren't top level:
+    # if we want to exclude scraps as well:
+    for ext in SUB_RESOURCES + FILE_INDICES:
+        if path.endswith(ext):
+            log.debug('Sub resource file {f} given as regular file, '
+                      'will be treated '
+                      'as a sub resource file instead'.format(f=path))
+            return
     extRes = wrapNewResource(path)
     extRess = ExternalResources()
     extRess.append(extRes)
@@ -110,9 +123,8 @@ def _addGenericFile(dset, path):
 
 # TODO needs namespace
 def wrapNewResource(path):
-    possible_indices = ['.fai', '.pbi', '.bai', '.metadata.xml',
-                        '.index', '.contig.index', '.sa']
-    for ext in possible_indices:
+    # filter out non-resource file types:
+    for ext in FILE_INDICES:
         if path.endswith(ext):
             log.debug('Index file {f} given as regular file, will be treated '
                       ' as an index file instead'.format(f=path))
@@ -120,10 +132,18 @@ def wrapNewResource(path):
     extRes = ExternalResource()
     path = resolveLocation(path)
     extRes.resourceId = path
-    index_files = [path + ext for ext in possible_indices if
+    index_files = [path + ext for ext in FILE_INDICES if
                    os.path.exists(path + ext)]
     if index_files:
         extRes.addIndices(index_files)
+
+    # Check for sub resources:
+    for ext in SUB_RESOURCES:
+        filen = '.'.join(path.split('.')[:-2]) + ext
+        # don't want to add e.g. scraps to scraps:
+        if os.path.exists(filen) and path.endswith('subreads.bam'):
+            subres = wrapNewResource(filen)
+            setattr(extRes, ext.split('.')[1], subres)
     return extRes
 
 
@@ -255,3 +275,19 @@ def parseStats(filename):
     stats.pruneChildrenTo(whitelist)
     return stats
 
+def parseMetadata(filename):
+    url = urlparse(filename)
+    fileLocation = url.path.strip()
+    if url.netloc:
+        fileLocation = url.netloc
+    tree = ET.parse(fileLocation)
+    dsm_tag = (".//{http://pacificbiosciences.com/PacBioDatasets.xsd}"
+               "DataSetMetadata")
+    try:
+        metadata = _parseXmlDataSetMetadata(tree.getroot().find(dsm_tag))
+    except AttributeError:
+        # the tag wasn't found, we're trying to do something with None
+        raise InvalidDataSetIOError("Unable to parse metadata from "
+                                    "{f}".format(f=filename))
+    return metadata
+
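
parseMetadata backs the new DataSet.loadMetadata() added in DataSetIO.py: it pulls the DataSetMetadata element out of a <moviename>.run.metadata.xml file and raises InvalidDataSetIOError if the element is missing. A minimal usage sketch with placeholder paths:

    from pbcore.io import SubreadSet

    sset = SubreadSet("movie.subreadset.xml")    # placeholder path
    sset.loadMetadata("movie.run.metadata.xml")  # parse, merge, updateCounts
    sset.write("movie.annotated.subreadset.xml")
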
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index 6881555..d9b352b 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -11,6 +11,50 @@ from pbcore.util.Process import backticks
 
 log = logging.getLogger(__name__)
 
+def which(exe):
+    if os.path.exists(exe) and os.access(exe, os.X_OK):
+        return exe
+    path = os.getenv('PATH')
+    for this_path in path.split(os.path.pathsep):
+        this_path = os.path.join(this_path, exe)
+        if os.path.exists(this_path) and os.access(this_path, os.X_OK):
+            return this_path
+    return None
+
+def consolidateXml(indset, outbam, useTmp=True):
+    tmpout = tempfile.mkdtemp(suffix="consolidate-xml")
+    tmp_xml = os.path.join(tmpout,
+                           "orig.{t}.xml".format(
+                               t=indset.__class__.__name__.lower()))
+
+    final_free_space = disk_free(os.path.split(outbam)[0])
+    projected_size = sum(file_size(infn)
+                         for infn in indset.toExternalFiles())
+    log.debug("Projected size:            {p}".format(p=projected_size))
+    log.debug("In place free space:       {f}".format(f=final_free_space))
+    # give it a 5% buffer
+    if final_free_space < (projected_size * 1.05):
+        raise RuntimeError("No space available to consolidate")
+    if useTmp:
+        tmp_free_space = disk_free(tmpout)
+        log.debug("Tmp free space (need ~2x): {f}".format(f=tmp_free_space))
+        # need 2x for tmp in and out, plus 10% buffer
+        if tmp_free_space > (projected_size * 2.1):
+            log.debug("Using tmp storage: " + tmpout)
+            indset.copyTo(tmp_xml)
+            origOutBam = outbam
+            outbam = os.path.join(tmpout, "outfile.bam")
+        else:
+            log.debug("Using in place storage")
+            indset.write(tmp_xml)
+            useTmp = False
+    _pbmergeXML(tmp_xml, outbam)
+    if useTmp:
+        shutil.copy(outbam, origOutBam)
+        # cleanup:
+        shutil.rmtree(tmpout)
+    return outbam
+
 def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
     """Take a list of infiles, an outFile to produce, and optionally a dataset
     (filters) to provide the definition and content of filtrations."""
@@ -127,11 +171,11 @@ def _sortBam(fname):
     shutil.move(tmpname, fname)
 
 def _indexBam(fname):
-    pysam.index(fname)
+    pysam.samtools.index(fname, catch_stdout=False)
     return fname + ".bai"
 
 def _indexFasta(fname):
-    pysam.faidx(fname)
+    pysam.samtools.faidx(fname, catch_stdout=False)
     return fname + ".fai"
 
 def _mergeBams(inFiles, outFile):
@@ -146,6 +190,15 @@ def _mergeBams(inFiles, outFile):
     else:
         shutil.copy(inFiles[0], outFile)
 
+def _pbmergeXML(indset, outbam):
+    cmd = "pbmerge -o {o} {i} ".format(i=indset,
+                                             o=outbam)
+    log.info(cmd)
+    o, r, m = backticks(cmd)
+    if r != 0:
+        raise RuntimeError(m)
+    return outbam
+
 def _filterBam(inFile, outFile, filterDset):
     BamtoolsVersion().check()
     tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
@@ -162,6 +215,30 @@ def _infixFname(fname, infix):
     prefix, extension = os.path.splitext(fname)
     return prefix + str(infix) + extension
 
+def _earlyInfixFname(fname, infix):
+    path, name = os.path.split(fname)
+    tokens = name.split('.')
+    tokens.insert(1, str(infix))
+    return os.path.join(path, '.'.join(tokens))
+
+def _swapPath(dest, infile):
+    return os.path.join(dest, os.path.split(infile)[1])
+
+def _fileCopy(dest, infile, uuid=None):
+    fn = _swapPath(dest, infile)
+    if os.path.exists(fn):
+        if uuid is None:
+            raise IOError("File name exists in destination: "
+                          "{f}".format(f=infile))
+        subdir = os.path.join(dest, uuid)
+        if not os.path.exists(subdir):
+            os.mkdir(subdir)
+        fn = _swapPath(subdir, fn)
+    shutil.copy(infile, fn)
+    assert os.path.exists(fn)
+    return fn
+
+
 def _emitFilterScript(filterDset, filtScriptName):
     """Use the filter script feature of bamtools. Use with specific filters if
     all that are needed are available, otherwise filter by readname (easy but
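
The indexing helpers above now call the pysam >= 0.9 API directly. An equivalent standalone sketch (file names are placeholders):

    import pysam

    pysam.samtools.index("aligned.bam", catch_stdout=False)      # writes aligned.bam.bai
    pysam.samtools.faidx("reference.fasta", catch_stdout=False)  # writes reference.fasta.fai
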
diff --git a/requirements.txt b/requirements.txt
index 357619b..0182647 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 cython
 numpy >= 1.7.1
 h5py >= 2.0.1
-pysam == 0.8.4
+pysam >= 0.9.0
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index df0e005..2a128d7 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -7,6 +7,7 @@ from nose.tools import (nottest,
 from nose import SkipTest
 
 import tempfile
+import shutil
 import pysam
 import numpy as np
 import bisect
@@ -426,12 +427,30 @@ class TestIndexedBam(_IndexedAlnFileReaderTests):
         EQ(len(bam), 0)
 
     def test_alignment_identity(self):
+        """
+        Check that the values of the 'identity' property are consistent
+        between IndexedBamReader (numpy array) and BamAlignment (float)
+        """
         fn = data.getBamAndCmpH5()[0]
         with IndexedBamReader(fn) as bam_in:
             i1 = bam_in.identity
             i2 = np.array([ rec.identity for rec in bam_in ])
             EQ((i2 == i1).all(), True)
 
+    def test_alignment_identity_unindexed(self):
+        """
+        Check that the value of the 'identity' property is the same whether
+        or not the .pbi index was used to calculate it.
+        """
+        fn1 = data.getBamAndCmpH5()[0]
+        fn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
+        shutil.copyfile(fn1, fn2)
+        with IndexedBamReader(fn1) as bam_pbi:
+            with BamReader(fn2) as bam_noindex:
+                i1 = np.array([ rec.identity for rec in bam_pbi ])
+                i2 = np.array([ rec.identity for rec in bam_noindex ])
+                EQ((i2 == i1).all(), True)
+
 
 class TestCCSBam(object):
     def __init__(self):
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index 13ea083..dd3577b 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -8,6 +8,7 @@ import tempfile
 import numpy as np
 import unittest
 import shutil
+import copy
 from random import shuffle
 from unittest.case import SkipTest
 
@@ -17,7 +18,8 @@ from pbcore.io.dataset.utils import BamtoolsVersion
 from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
                        openDataSet, DataSetMetaTypes, HdfSubreadSet,
                        ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
-                       divideKeys, keysToRanges)
+                       divideKeys, keysToRanges, getDataSetUuid,
+                       getDataSetMetaType)
 from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
 from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
 from pbcore.io.dataset.DataSetValidator import validateFile
@@ -80,7 +82,6 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(ds1.numExternalResources, 2)
         ds1 = DataSet(data.getFofn())
         self.assertEquals(ds1.numExternalResources, 2)
-        # New! Use the correct constructor:
         self.assertEquals(type(SubreadSet(data.getSubreadSet(),
                                           skipMissing=True)).__name__,
                           'SubreadSet')
@@ -149,6 +150,49 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(merged.subdatasets[1].toExternalFiles(),
                          AlignmentSet(data.getXml(11)).toExternalFiles())
 
+        # No filters, 3 files:
+        ds1 = AlignmentSet(data.getXml(8))
+        self.assertEqual(len(ds1.subdatasets), 0)
+        ds2 = AlignmentSet(data.getXml(11))
+        self.assertEqual(len(ds2.subdatasets), 0)
+        ds3 = AlignmentSet(data.getXml(11))
+        self.assertEqual(len(ds3.subdatasets), 0)
+        ds3.externalResources[0].resourceId = "/blah.bam"
+        ds4 = ds1 + ds2 + ds3
+        self.assertEqual(len(ds4.externalResources), 3)
+        self.assertEqual(len(ds4.subdatasets), 3)
+
+        # Filters, 3 files:
+        ds1 = AlignmentSet(data.getXml(8))
+        self.assertEqual(len(ds1.subdatasets), 0)
+        ds1.filters.addRequirement(rq=[('>', 0.8)])
+        ds2 = AlignmentSet(data.getXml(11))
+        self.assertEqual(len(ds2.subdatasets), 0)
+        ds2.filters.addRequirement(rq=[('>', 0.8)])
+        ds3 = AlignmentSet(data.getXml(11))
+        self.assertEqual(len(ds3.subdatasets), 0)
+        ds3.externalResources[0].resourceId = "/blah.bam"
+        ds3.filters.addRequirement(rq=[('>', 0.8)])
+        ds4 = ds1 + ds2 + ds3
+        self.assertEqual(len(ds4.externalResources), 3)
+        self.assertEqual(len(ds4.subdatasets), 3)
+        self.assertEqual(str(ds4.filters), '( rq > 0.8 )')
+        for sss in ds4.subdatasets:
+            self.assertEqual(str(sss.filters), '( rq > 0.8 )')
+        with self.assertRaises(TypeError):
+            # mismatched Filters, 3 files:
+            ds1 = AlignmentSet(data.getXml(8))
+            self.assertEqual(len(ds1.subdatasets), 0)
+            ds1.filters.addRequirement(rq=[('>', 0.8)])
+            ds2 = AlignmentSet(data.getXml(11))
+            self.assertEqual(len(ds2.subdatasets), 0)
+            ds2.filters.addRequirement(rq=[('>', 0.7)])
+            ds3 = AlignmentSet(data.getXml(11))
+            self.assertEqual(len(ds3.subdatasets), 0)
+            ds3.externalResources[0].resourceId = "/blah.bam"
+            ds3.filters.addRequirement(rq=[('>', 0.8)])
+            ds4 = ds1 + ds2 + ds3
+
     def test_empty_metatype(self):
         inBam = data.getBam()
         d = DataSet(inBam)
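
The filter-aware merging exercised in the added test above boils down to the following sketch (the input paths are placeholders):

    from pbcore.io import AlignmentSet

    ds1 = AlignmentSet("a.alignmentset.xml")   # placeholder inputs
    ds2 = AlignmentSet("b.alignmentset.xml")
    ds1.filters.addRequirement(rq=[('>', 0.8)])
    ds2.filters.addRequirement(rq=[('>', 0.8)])
    merged = ds1 + ds2
    # Matching filters survive the merge, on the merged set and on each
    # subdataset; mismatched filters (e.g. rq > 0.7 vs. rq > 0.8) make
    # the '+' operator raise TypeError.
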
@@ -526,6 +570,42 @@ class TestDataSet(unittest.TestCase):
 
     @unittest.skipIf(not _internal_data(),
                      "Internal data not available")
+    def test_scraps_detection(self):
+        path = ('/pbi/dept/secondary/siv/testdata/SA3-Sequel/'
+                'lambda/315/3150128/r54008_20160308_001811/'
+                '2_B01/m54008_160308_053311.')
+        subreads = path + 'subreads.bam'
+        control = path + 'control.subreads.bam'
+        controlscraps = path + 'control.scraps.bam'
+        scraps = path + 'scraps.bam'
+        subreadspbi = subreads + '.pbi'
+        scrapspbi = scraps + '.pbi'
+
+        filesets = [[subreads],
+                    [subreads, scraps],
+                    [subreads, subreadspbi],
+                    [subreads, scrapspbi]]
+
+        for files in filesets:
+            sset = SubreadSet(*files, strict=True)
+            self.assertEqual(len(sset.externalResources), 1)
+            self.assertEqual(sset.externalResources[0].resourceId, subreads)
+            self.assertEqual(sset.externalResources[0].scraps, scraps)
+            self.assertEqual(sset.externalResources[0].control, control)
+            self.assertEqual(
+                sset.externalResources[0].externalResources[0].resourceId,
+                scraps)
+            self.assertEqual(
+                sset.externalResources[0].externalResources[1].resourceId,
+                control)
+            self.assertEqual(
+                sset.externalResources[0].externalResources[1]
+                    .externalResources[0].resourceId,
+                controlscraps)
+
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
     def test_referenceInfoTableMerging(self):
         log.info("Testing refIds, etc. after merging")
         bam1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
@@ -650,8 +730,9 @@ class TestDataSet(unittest.TestCase):
                 ds1.index.holeNumber < rg[1],
                 ds1.index.holeNumber > rg[0]))[0]), 0)
 
-    @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
-                         "Missing testadata directory")
+    #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+    #                     "Missing testdata directory")
+    @unittest.skip("Too expensive")
     def test_large_split_zmws(self):
         N_RECORDS = 959539
         test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
@@ -727,6 +808,40 @@ class TestDataSet(unittest.TestCase):
 
     @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
                          "Missing testadata directory")
+    def test_multi_movie_readsByName(self):
+        N_RECORDS = 1745161
+        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2372215/0007/Analysis_Results/m150404_101626_42"
+                       "267_c100807920800000001823174110291514_s1_p0.al"
+                       "l.subreadset.xml")
+        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2590980/0008/Analysis_Results/m141115_075238_et"
+                       "han_c100699872550000001823139203261572_s1_p0.al"
+                       "l.subreadset.xml")
+        ds1 = SubreadSet(test_file_1, test_file_2)
+        self.assertEqual(len(ds1), N_RECORDS)
+        queries = [('m150404_101626_42267_c1008079208000'
+                    '00001823174110291514_s1_p0/7/*', 2),
+                   ('m141115_075238_ethan_c1006998725500'
+                    '00001823139203261572_s1_p0/9/*', 39),
+                   ]
+        for query, count in queries:
+            reads = ds1.readsByName(query)
+            self.assertEqual(len(reads), count)
+            parts = query.split('/')
+            movie = parts[0]
+            hn = int(parts[1])
+            if len(parts) > 2:
+                qrange = parts[2]
+            for read in reads:
+                self.assertEqual(read.movieName, movie)
+                self.assertEqual(read.holeNumber, hn)
+                # TODO: test qrange/ccs
+
+
+    #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+    #                     "Missing testdata directory")
+    @unittest.skip("Too expensive")
     def test_large_pbi(self):
         pbiFn = ('/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/simulated'
                  '/100Gb/alnsubreads/pbalchemy_100Gb_Seq_sim1_p0.'
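
The readsByName queries above use the form "<movieName>/<holeNumber>/<range or *>"; a minimal sketch with placeholder inputs:

    from pbcore.io import SubreadSet

    ds = SubreadSet("movie1.subreadset.xml", "movie2.subreadset.xml")  # placeholders
    # "*" matches any query range within the hole:
    reads = ds.readsByName("<movieName>/7/*")  # placeholder movie name
    for read in reads:
        assert read.holeNumber == 7
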
@@ -842,6 +957,85 @@ class TestDataSet(unittest.TestCase):
         ds2._metadata.totalLength += 100000
         self.assertEquals(ds2._metadata.totalLength, 200000)
 
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_loadMetadata(self):
+        aln = AlignmentSet(data.getXml(no=8))
+        self.assertFalse(aln.metadata.collections)
+        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                         'SA3-Sequel/lambda/roche_SAT/'
+                         'm54013_151205_032353.run.metadata.xml')
+        self.assertTrue(aln.metadata.collections)
+        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+                'SA3-Sequel/lambda/roche_SAT/'
+                'm54013_151205_032353.subreadset.xml')
+        sset = SubreadSet(sset_fn)
+        orig_metadata = copy.deepcopy(sset.metadata)
+        sset.metadata.collections = None
+        self.assertFalse(sset.metadata.collections)
+        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                          'SA3-Sequel/lambda/roche_SAT/'
+                          'm54013_151205_032353.run.metadata.xml')
+        stack = zip(sset.metadata, orig_metadata)
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        sset.write(fn)
+        validateFile(fn)
+        validateFile(sset_fn)
+        self.assertEqual(sset.metadata, orig_metadata)
+
+
+        # load the wrong thing...
+        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+                'SA3-Sequel/lambda/roche_SAT/'
+                'm54013_151205_032353.subreadset.xml')
+        sset = SubreadSet(sset_fn)
+        orig_metadata = copy.deepcopy(sset.metadata)
+        sset.metadata.collections = None
+        self.assertFalse(sset.metadata.collections)
+        with self.assertRaises(InvalidDataSetIOError):
+            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                              'SA3-Sequel/lambda/roche_SAT/'
+                              'm54013_151205_032353.sts.xml')
+
+
+    def test_copyTo(self):
+        aln = AlignmentSet(data.getXml(no=8), strict=True)
+        explen = len(aln)
+        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
+        aln.copyTo(fn)
+        aln.close()
+        del aln
+        aln = AlignmentSet(fn, strict=True)
+        self.assertEqual(explen, len(aln))
+
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        aln.copyTo(outdir)
+        fn = os.path.join(outdir, "test.alignmentset.xml")
+        aln.write(fn)
+        aln.close()
+        del aln
+        aln = AlignmentSet(fn, strict=True)
+        self.assertEqual(explen, len(aln))
+
+        # copy to the same dir twice to induce filename collisions
+        aln = AlignmentSet(data.getXml(no=8), strict=True)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        aln.copyTo(outdir)
+        fn = os.path.join(outdir, "test.alignmentset.xml")
+        aln.write(fn)
+
+        aln = AlignmentSet(data.getXml(no=8), strict=True)
+        aln.copyTo(outdir)
+        fn2 = os.path.join(outdir, "test2.alignmentset.xml")
+        aln.write(fn2)
+
+        aln = AlignmentSet(fn, strict=True)
+        aln2 = AlignmentSet(fn2, strict=True)
+        self.assertEqual(explen, len(aln))
+        self.assertEqual(explen, len(aln2))
+        self.assertNotEqual(sorted(aln.toExternalFiles()),
+                            sorted(aln2.toExternalFiles()))
+
     def test_addExternalResources(self):
         ds = DataSet()
         er1 = ExternalResource()
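
The loadMetadata and copyTo conveniences tested above follow the same basic pattern; a minimal sketch with placeholder paths:

    import os
    import tempfile
    from pbcore.io import SubreadSet

    sset = SubreadSet("movie.subreadset.xml")    # placeholder
    # Fills in metadata.collections; a non-metadata file (e.g. an sts.xml)
    # raises InvalidDataSetIOError instead:
    sset.loadMetadata("movie.run.metadata.xml")

    outdir = tempfile.mkdtemp()
    sset.copyTo(outdir)                          # copy the external resources
    sset.write(os.path.join(outdir, "copied.subreadset.xml"))
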
@@ -979,7 +1173,7 @@ class TestDataSet(unittest.TestCase):
     def test_reads_in_range_order(self):
         log.debug("Testing with one file")
         testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
-                    "2372215/0007/Alignment_Results/m150404_101"
+                    "2372215/0007_tiny/Alignment_Results/m150404_101"
                     "626_42267_c1008079208000000018231741102915"
                     "14_s1_p0.1.alignmentset.xml")
         aln = AlignmentSet(testFile)
@@ -991,11 +1185,11 @@ class TestDataSet(unittest.TestCase):
         for r1, r2 in itertools.izip(reads1, reads2):
             self.assertEqual(r1, r2)
             num += 1
-        self.assertTrue(num > 2000)
+        self.assertEqual(num, 28)
 
         log.debug("Testing with three files")
         testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
-                    "2372215/0007/Alignment_Results/m150404_101"
+                    "2372215/0007_tiny/Alignment_Results/m150404_101"
                     "626_42267_c1008079208000000018231741102915"
                     "14_s1_p0.all.alignmentset.xml")
         aln = AlignmentSet(testFile)
@@ -1007,7 +1201,7 @@ class TestDataSet(unittest.TestCase):
         for r1, r2 in itertools.izip(reads1, reads2):
             self.assertEqual(r1, r2)
             num += 1
-        self.assertTrue(num > 2000)
+        self.assertEqual(num, 105)
 
     @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
                          "Missing testadata directory")
@@ -2203,6 +2397,38 @@ class TestDataSet(unittest.TestCase):
 
     @unittest.skipIf(not _internal_data(),
                      "Internal data not available")
+    def test_qname_filter_scaling(self):
+        # unaligned bam
+        bam0 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/"
+                "Analysis_Results/m140913_222218_42240_c10069"
+                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/"
+                "Analysis_Results/m140913_005018_42139_c10071"
+                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+        sset = SubreadSet(bam0, bam1)
+        self.assertEqual(len(sset), 178570)
+        size = 10
+        qn = [r.qName for r in sset[:size]]
+        good_qn = [('=', name) for name in qn]
+        sset.filters.addRequirement(qname=good_qn)
+        self.assertEqual(size, sum(1 for _ in sset))
+        self.assertEqual(size, len(sset))
+
+
+        sset = SubreadSet(data.getXml(10))
+        self.assertEqual(len(sset), 92)
+        size = 10
+        qn = [r.qName for r in sset[:size]]
+        good_qn = [('=', name) for name in qn]
+        sset.filters.addRequirement(qname=good_qn)
+        self.assertEqual(size, sum(1 for _ in sset))
+        self.assertEqual(size, len(sset))
+
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
     def test_movie_filter(self):
         # unaligned bam
         bam0 = ("/pbi/dept/secondary/siv/testdata/"
@@ -2390,6 +2616,95 @@ class TestDataSet(unittest.TestCase):
                           zip(ss.metadata.summaryStats.readLenDist.bins,
                               ss2.metadata.summaryStats.readLenDist.bins)])
 
+        # smoke tests
+        ss3.metadata.summaryStats.insertReadLenDists
+        ss3.metadata.summaryStats.insertReadQualDists
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_reduced_sts_merging(self):
+        # we don't have a subreadset.xml with a loaded sts.xml in testdata,
+        # so fabricate one here:
+
+        full = ('/pbi/dept/secondary/siv/testdata/'
+                'pbreports-unittest/data/sts_xml/'
+                '3120134-r54009_20160323_173308-'
+                '1_A01-Bug30772/m54009_160323_'
+                '173323.sts.xml')
+        partial = ('/pbi/dept/secondary/siv/testdata/'
+                   'pbreports-unittest/data/sts_xml/'
+                   '32246/m54026_160402_062929.sts.xml')
+
+        # two partial
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = partial
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        outXml2 = os.path.join(outdir, 'tempfile2.xml')
+        ss.write(outXml)
+        ss.write(outXml2)
+        ss = SubreadSet(outXml)
+        ss2 = SubreadSet(outXml2)
+        ss3 = ss + ss2
+        self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+                         [b1 + b2 for b1, b2 in
+                          zip(ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins)])
+        ss4 = SubreadSet(outXml, outXml2)
+
+        # one partial one full
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        outXml2 = os.path.join(outdir, 'tempfile2.xml')
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = partial
+        ss.write(outXml)
+        ss.externalResources[0].sts = full
+        ss.write(outXml2)
+        ss = SubreadSet(outXml)
+        ss2 = SubreadSet(outXml2)
+        ss3 = ss + ss2
+        self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+                         [b1 + b2 for b1, b2 in
+                          zip(ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins)])
+        ss4 = SubreadSet(outXml, outXml2)
+
+        # one full one partial
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        outXml2 = os.path.join(outdir, 'tempfile2.xml')
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = full
+        ss.write(outXml)
+        ss.externalResources[0].sts = partial
+        ss.write(outXml2)
+        ss = SubreadSet(outXml)
+        ss2 = SubreadSet(outXml2)
+        ss3 = ss + ss2
+        self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+                         [b1 + b2 for b1, b2 in
+                          zip(ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins)])
+        ss4 = SubreadSet(outXml, outXml2)
+
+        # two full
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        outXml2 = os.path.join(outdir, 'tempfile2.xml')
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = full
+        ss.write(outXml)
+        ss.write(outXml2)
+        ss = SubreadSet(outXml)
+        ss2 = SubreadSet(outXml2)
+        ss3 = ss + ss2
+        self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+                         [b1 + b2 for b1, b2 in
+                          zip(ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins)])
+        ss4 = SubreadSet(outXml, outXml2)
+
     @unittest.skipIf(not _internal_data(),
                      "Internal data not available")
     def test_missing_extres(self):
@@ -2445,3 +2760,20 @@ class TestDataSet(unittest.TestCase):
         sset = SubreadSet(human)
         ssets = sset.split(zmws=True, maxChunks=5)
 
+    def test_get_dataset_uuid(self):
+        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        ds.write(ds_file)
+        uuid = getDataSetUuid(ds_file)
+        self.assertEqual(uuid, ds.uuid)
+        with open(ds_file, "w") as out:
+            out.write("hello world!")
+        uuid = getDataSetUuid(ds_file)
+        self.assertEqual(uuid, None)
+
+    def test_get_dataset_metatype(self):
+        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        ds.write(ds_file)
+        meta_type = getDataSetMetaType(ds_file)
+        self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
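
The two inspection helpers tested above can be used directly on a dataset XML path; a minimal sketch with a placeholder file:

    from pbcore.io import getDataSetUuid, getDataSetMetaType

    uuid = getDataSetUuid("movie.subreadset.xml")       # placeholder path
    mtype = getDataSetMetaType("movie.subreadset.xml")  # e.g. "PacBio.DataSet.SubreadSet"
    # getDataSetUuid returns None when the file cannot be parsed as a dataset XML.
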
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 7741b36..71282a7 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -16,7 +16,7 @@ from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
                        ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
                        FastaReader, FastaWriter, IndexedFastaReader,
                        HdfSubreadSet, ConsensusAlignmentSet,
-                       openDataFile, FastaWriter, FastqReader)
+                       openDataFile, FastaWriter, FastqReader, openDataSet)
 import pbcore.data as upstreamData
 import pbcore.data.datasets as data
 from pbcore.io.dataset.DataSetValidator import validateXml
@@ -362,6 +362,43 @@ class TestDataSet(unittest.TestCase):
             self.assertEqual(read1, read2)
         self.assertEqual(len(list(aln)), len(list(nonCons)))
 
+        log.debug("Test with one reference")
+        aln = AlignmentSet(data.getXml(12))
+        reference = upstreamData.getFasta()
+        aln.externalResources[0].reference = reference
+        nonCons = aln.copy()
+        self.assertEqual(len(aln.toExternalFiles()), 2)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, 'merged.bam')
+        aln.consolidate(outfn)
+        self.assertTrue(os.path.exists(outfn))
+        self.assertEqual(len(aln.toExternalFiles()), 1)
+        #nonCons = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(nonCons.toExternalFiles()), 2)
+        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
+            self.assertEqual(read1, read2)
+        self.assertEqual(len(aln), len(nonCons))
+        self.assertEqual(aln.externalResources[0].reference, reference)
+
+        log.debug("Test with two references")
+        aln = AlignmentSet(data.getXml(12))
+        reference = upstreamData.getFasta()
+        for extRes in aln.externalResources:
+            extRes.reference = reference
+        self.assertEqual(len(aln.toExternalFiles()), 2)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, 'merged.bam')
+        aln.consolidate(outfn)
+        self.assertTrue(os.path.exists(outfn))
+        self.assertEqual(len(aln.toExternalFiles()), 1)
+        #nonCons = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(nonCons.toExternalFiles()), 2)
+        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
+            self.assertEqual(read1, read2)
+        self.assertEqual(len(aln), len(nonCons))
+        self.assertEqual(aln.externalResources[0].reference, reference)
+
+
     def test_accuracy_filter(self):
         aln = AlignmentSet(data.getXml(12))
         self.assertEqual(len(list(aln)), 177)
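
The reference-preserving consolidation tested above reduces to the following sketch (placeholder inputs; consolidation relies on the external merge tooling checked earlier in this patch):

    import os
    import tempfile
    from pbcore.io import AlignmentSet

    aln = AlignmentSet("two_files.alignmentset.xml")        # placeholder, two BAMs
    aln.externalResources[0].reference = "reference.fasta"  # placeholder
    outfn = os.path.join(tempfile.mkdtemp(), "merged.bam")
    aln.consolidate(outfn)
    assert len(aln.toExternalFiles()) == 1
    assert aln.externalResources[0].reference == "reference.fasta"
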
@@ -517,6 +554,7 @@ class TestDataSet(unittest.TestCase):
         for name, seq in zip(exp_names, exp_double_seqs):
             self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
 
+
     def test_contigset_consolidate_genomic_consensus(self):
         """
         Verify that the contigs output by GenomicConsensus (e.g. quiver) can
@@ -602,6 +640,86 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(cset_l, sum(1 for _ in cfq))
 
 
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_empty_fastq_consolidate(self):
+        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+              'lambda/2590980/0008/Analysis_Results/'
+              'm141115_075238_ethan_c100699872550000001'
+              '823139203261572_s1_p0.1.subreads.fastq')
+        fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
+        fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
+        cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+
+        # Two full
+        with open(fq1_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 240):
+                    fqh.write(line)
+        with open(fq2_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 240, 480):
+                    fqh.write(line)
+        cset = ContigSet(fq1_out, fq2_out)
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 120)
+        cset.consolidate(cfq_out)
+        cset_l = sum(1 for _ in cset)
+        cfq = FastqReader(cfq_out)
+        self.assertEqual(cset_l, 120)
+        self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+        # one full one empty
+        with open(fq1_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 240):
+                    fqh.write(line)
+        with open(fq2_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                fqh.write("")
+        cset = ContigSet(fq1_out, fq2_out)
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 60)
+        cset.consolidate(cfq_out)
+        cset_l = sum(1 for _ in cset)
+        cfq = FastqReader(cfq_out)
+        self.assertEqual(cset_l, 60)
+        self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+        # one empty one full
+        with open(fq1_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                fqh.write("")
+        with open(fq2_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 240):
+                    fqh.write(line)
+        cset = ContigSet(fq1_out, fq2_out)
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 60)
+        cset.consolidate(cfq_out)
+        cset_l = sum(1 for _ in cset)
+        cfq = FastqReader(cfq_out)
+        self.assertEqual(cset_l, 60)
+        self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+        # both empty
+        with open(fq1_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                fqh.write("")
+        with open(fq2_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                fqh.write("")
+        cset = ContigSet(fq1_out, fq2_out)
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 0)
+        cset.consolidate(cfq_out)
+        cset_l = sum(1 for _ in cset)
+        cfq = FastqReader(cfq_out)
+        self.assertEqual(cset_l, 0)
+        self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+
     def test_len(self):
         # AlignmentSet
         aln = AlignmentSet(data.getXml(8), strict=True)
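
The FASTQ consolidation cases above, including the empty inputs, all follow the same call sequence; a minimal sketch with placeholder chunks:

    from pbcore.io import ContigSet, FastqReader

    cset = ContigSet("chunk1.fastq", "chunk2.fastq")   # placeholders; either may be empty
    n_before = sum(1 for _ in cset)
    cset.consolidate("merged.fastq")
    assert sum(1 for _ in cset) == n_before
    assert sum(1 for _ in FastqReader("merged.fastq")) == n_before
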
@@ -862,6 +980,15 @@ class TestDataSet(unittest.TestCase):
             self.assertEqual(len(list(cset)), 49)
             self.assertEqual(len(cset), 49)
 
+    def test_getitem(self):
+        types = [AlignmentSet(data.getXml(8)),
+                 ReferenceSet(data.getXml(9)),
+                 SubreadSet(data.getXml(10)),
+                ]
+        for ds in types:
+            self.assertTrue(ds[0])
+
+
     def test_incorrect_len_getitem(self):
         types = [AlignmentSet(data.getXml(8)),
                  ReferenceSet(data.getXml(9)),

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git


